A while ago I spent about a month putting together a news app. Its function is simple: a crawler periodically scrapes the news items from the web edition into a backend database, and the app displays them.
1. Client
I used the DCloud framework. I am pretty much a beginner at JavaScript, have never written any serious code, and am even newer to HTML5, so I simply went with a ready-made front-end framework. I tried APPcan and APICloud and finally settled on DCloud; its HBuilder editor really is quite good.
Here is some of the key code: it uses mui's pull-to-refresh, and fetches the JSON list returned by the backend via ajax.
<!DOCTYPE html>
<html>

<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=1,user-scalable=no" />
    <title></title>
    <script src="js/mui.min.js"></script>
    <link href="css/mui.min.css" rel="stylesheet" />
    <script type="text/javascript" charset="utf-8">
        var t; // cache the latest JSON list so the tap handler can read it
        mui.init({
            pullRefresh: {
                container: "#pullMine", // pull-to-refresh container; any selector querySelector can resolve (id, .class, ...)
                down: {
                    contentdown: "下拉可以刷新",   // optional: caption while pulling down
                    contentover: "释放立即刷新",   // optional: caption once pulled far enough to release
                    contentrefresh: "正在刷新...", // optional: caption while refreshing
                    callback: pulldownRefresh      // required: refresh callback, e.g. fetch new data via ajax
                }
            }
        });

        mui.plusReady(function() {
            console.log("current page URL: " + plus.webview.currentWebview().getURL());
            mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
                dataType: 'json',
                type: 'get',
                timeout: 10000,
                success: function(data) {
                    t = data;
                    var list = document.getElementById("list");
                    var finallist = '';
                    for (var i = data.length - 1; i >= 0; i--) {
                        finallist += '<li data-id="' + i + '" class="mui-table-view-cell"><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
                    }
                    list.innerHTML = finallist;
                    console.log("no1" + finallist);
                    mui('#list').on('tap', 'li', function() {
                        mui.openWindow({
                            url: 'detail_sd.html',
                            id: 'detail_sd',
                            extras: {
                                title: t[this.getAttribute('data-id')].title,
                                author: t[this.getAttribute('data-id')].author,
                                pubtime: t[this.getAttribute('data-id')].pubtime,
                                content: t[this.getAttribute('data-id')].content
                            }
                        });
                    });
                },
                error: function() {}
            });
        });

        /**
         * Pull-to-refresh implementation
         */
        function pulldownRefresh() {
            setTimeout(function() {
                console.log("refreshing....");
                mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
                    dataType: 'json',
                    type: 'get',
                    timeout: 10000,
                    success: function(data) {
                        t = data;
                        var list = document.getElementById("list");
                        var finallist = '';
                        for (var i = data.length - 1; i >= 0; i--) {
                            finallist += '<li data-id="' + i + '" class="mui-table-view-cell"><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
                        }
                        list.innerHTML = finallist;
                    },
                    error: function() {}
                });
                mui('#pullMine').pullRefresh().endPulldownToRefresh(); // refresh completed
            }, 1500);
        }
    </script>
</head>

<body>
    <div id="pullMine" class="mui-content mui-scroll-wrapper">
        <div class="mui-scroll">
            <ul class="mui-table-view" id="list">
            </ul>
        </div>
    </div>
</body>

</html>
2. Backend PHP API
It uses the ThinkPHP framework.
<?php
namespace Home\Controller;
use Think\Controller;
class NewsController extends Controller {
    public function getlist(){
        $newsList = M('news')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }
    public function getlist_sd(){
        $newsList = M('newssd')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }
}
?>
3. Backend crawler
It uses Scrapy to crawl the news content and write it into the DB.
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
from scrapy import log          # old-style Scrapy logging, used in _handle_error
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors


class JsonWithEncodingtutorialPipeline(object):
    """Dump every item as one JSON line into qdnews.json."""
    def __init__(self):
        self.file = codecs.open('qdnews.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()


class MySQLStoretutorialPipeline(object):
    """Upsert crawled news into MySQL through a twisted adbapi connection pool."""
    def __init__(self, dbpool):
        self.dbpool = dbpool
        print("-----------init sql proc---")

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # called by Scrapy for every item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    # update or insert one row per article
    def _do_upinsert(self, conn, item, spider):
        print(item['link'][0])
        linkmd5id = self._get_linkmd5id(item)
        print(linkmd5id)
        now = datetime.now().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from tp_news where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()
        print('ret=', ret)

        if ret:
            conn.execute("""
                update tp_news set title = %s, content = %s, author = %s, pubtime = %s, pubtime2 = %s, link = %s, updated = %s where linkmd5id = %s
            """, (item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4], item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4], item['link'][0], now, linkmd5id))
        else:
            conn.execute("""
                insert into tp_news(linkmd5id, title, content, author, link, updated, pubtime, pubtime2)
                values(%s, %s, %s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4], item['link'][0], now, item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4]))

    # md5 of the article URL, used as the dedup key to avoid re-collecting
    def _get_linkmd5id(self, item):
        return md5(item['link'][0]).hexdigest()

    # error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)
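These pipelines only run if they are registered and if the MySQL settings read in from_settings() exist. A minimal settings.py sketch, assuming the Scrapy project is named tutorial (as the imports in spiders.py suggest); all values are placeholders:

# settings.py (sketch) -- adjust names and credentials to your environment
BOT_NAME = 'tutorial'

ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWithEncodingtutorialPipeline': 300,
    'tutorial.pipelines.MySQLStoretutorialPipeline': 800,
}

# read by MySQLStoretutorialPipeline.from_settings()
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'newssystem'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'yourpassword'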
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DmozItem(scrapy.Item):
    # fields filled by the spiders and consumed by the pipelines
    pubtime = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    content = scrapy.Field()
    id = scrapy.Field()
    # used only by the earlier experimental spiders below
    date = scrapy.Field()
    detail = scrapy.Field()
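For reference, the MySQL pipeline maps these item fields onto a tp_news table keyed by linkmd5id. A possible schema sketch, inferred from the column names used in _do_upinsert(); the column types, sizes and credentials are assumptions:

# create_table.py (sketch) -- column names come from the pipeline, types are guesses
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS tp_news (
    linkmd5id CHAR(32) NOT NULL PRIMARY KEY,  -- md5 of the article URL, used for dedup
    title     VARCHAR(255),
    content   TEXT,
    author    VARCHAR(64),
    link      VARCHAR(255),
    updated   DATETIME,
    pubtime   VARCHAR(64),
    pubtime2  VARCHAR(64)
) DEFAULT CHARSET=utf8;
"""

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='yourpassword',
                       db='newssystem', charset='utf8')
conn.cursor().execute(DDL)
conn.commit()
conn.close()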
spiders.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
from scrapy.linkextractors.sgml import SgmlLinkExtractor

import scrapy


# First try: the spider from the official tutorial, list pages only.
class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


# Second try: scrape the intranet news index page, list items only.
class DmozSpider2(BaseSpider):
    name = "dmoz2"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        "http://10.60.32.179/Site/Site1/myindex.shtml",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


# Experiment: follow each list entry into its detail page, passing data via meta.
class MySpider(BaseSpider):
    name = "myspider"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        base_url = get_base_url(response)
        items = []
        for item_url in item_urls:
            yield Request(url=response.url, callback=self.parse_item, meta={'items': items})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')

        item = DmozItem()
        items = response.meta['items']
        item['date'] = item_urls.select('span/text()').extract()
        item['title'] = item_urls.select('a/text()').extract()
        item['link'] = item_urls.select('a/@href').extract()
        item['desc'] = item_urls.select('text()').extract()

        # populate `item` fields, then follow the detail link
        relative_url = item_urls.select('a/@href').extract()
        print(relative_url[0])
        base_url = get_base_url(response)
        item_details_url = urljoin_rfc(base_url, relative_url[0])
        yield Request(url=item_details_url, callback=self.parse_details, dont_filter=True,
                      meta={'item': item, 'items': items})

    def parse_details(self, response):
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        item = response.meta['item']
        item['detail'] = item_detail
        items = response.meta['items']
        items.append(item)
        return items


# Experiment: fetch the detail page inside parse() -- abandoned, because
# constructing scrapy.http.Response(url) does not actually download anything.
class DmozSpider3(BaseSpider):
    name = "dmoz3"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()

            print(item['link'][0])
            base_url = get_base_url(response)
            relative_url = item['link'][0]
            item_details_url = urljoin_rfc(base_url, relative_url)
            print("*********************", item_details_url)
            # response2 = BeautifulSoup(urlopen(item_details_url).read())
            response2 = scrapy.http.Response(item_details_url)
            hxs2 = HtmlXPathSelector(response2)
            item['detail'] = hxs2.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()

            items.append(item)
        return items


# Experiment: share one item via class attributes -- the items overwrite each other.
class MySpider5(BaseSpider):
    name = "myspider5"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    items = []
    item = DmozItem()

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        base_url = get_base_url(response)

        for item_url in item_urls:
            MySpider5.item['date'] = item_url.select('span/text()').extract()
            MySpider5.item['title'] = item_url.select('a/text()').extract()
            MySpider5.item['link'] = item_url.select('a/@href').extract()
            MySpider5.item['desc'] = item_url.select('text()').extract()

            relative_url = MySpider5.item['link']
            print(relative_url[0])
            base_url = get_base_url(response)
            item_details_url = urljoin_rfc(base_url, relative_url[0])
            print('detail url =', str(item_details_url))

            yield Request(url=item_details_url, callback=self.parse_details)

    def parse_details(self, response):
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        MySpider5.item['detail'] = item_detail
        MySpider5.items.append(MySpider5.item)
        return MySpider5.item

    # leftover draft, not wired to any request
    def parse_details2(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('title', item['title'])
        abc = {'detail': '/html/body/center/div/div[4]/div[1]/p[1]'}
        bbsItem_loader.add_xpath('detail', abc['detail'])
        return bbsItem_loader.load_item()


# Working version: a link extractor for article pages plus an ItemLoader.
class MySpider6(CrawlSpider):
    name = "myspider6"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/Article/\w+\/\w+\.shtml$'),
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        'title2': '/html/body/center/div/div[4]/h2'
    }
    _y_query = {
        'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why not useful?
        return bbsItem_loader.load_item()


# Same approach pointed at the second (sd) news site.
class MySpider6SD(CrawlSpider):
    name = "myspider6sd"
    allowed_domains = ["10.60.7.45"]
    start_urls = [
        'http://10.60.7.45/SITE_sdyc_WEB/Site1219/index.shtml',
    ]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/Article/\w+\/\w+\.shtml$'),
        # http://10.60.32.179/Site/Col411/Article/201510/35770_2015_10_29_8058797.shtml
        # http://10.60.7.45/SITE_sdyc_WEB/Col1527/Article/201510/sdnw_2110280_2015_10_29_91353216.shtml
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        # 'title2': '/html/body/center/div/div[4]/h2'
        'title2': '/html/body/div[4]/div[1]/div[2]/div[1]/h1[2]/font'
        # 'author': '/html/body/div[4]/div[1]/div[2]/div[1]/div/span[1]'
        # 'pubtime2': '/html/body/div[4]/div[1]/div[2]/div[1]/div/span[2]'
    }
    _y_query = {
        # 'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
        'detail': '//*[@id="Zoom"]'
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why not useful?
        return bbsItem_loader.load_item()
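The news items are supposed to be crawled into the database on a schedule. One simple way to do that, assuming the project runs on the server under cron, is a small wrapper that invokes scrapy crawl for the production spider (myspider6sd above); the file name and schedule here are only an illustration:

# run_crawl.py (sketch) -- call it from cron, e.g. "0 * * * * cd /path/to/tutorial && python run_crawl.py"
import subprocess

# myspider6sd is the spider defined above; the working directory must be the Scrapy project root
subprocess.call(['scrapy', 'crawl', 'myspider6sd'])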