本文实现详情页数据的采集功能
首先修改 items.py 文件,新增 ArticleItem 类:

import scrapy
class ScrapytutorialItem(scrapy.Item):
    """Scrapy item with two fields: a post's title and its link."""

    # Title text of the post.
    title = scrapy.Field()
    # URL (href) of the post.
    link = scrapy.Field()
class ArticleItem(scrapy.Item):
    """Scrapy item describing one article scraped from its detail page."""

    # Title text of the article.
    title = scrapy.Field()
    # URL (href) of the article.
    link = scrapy.Field()
    # Markup of the article body.
    content = scrapy.Field()
    # Publication/update date text shown on the page.
    update = scrapy.Field()
    # Category name of the article.
    cate = scrapy.Field()
修改 blog_spider.py 文件:

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from ..items import ScrapytutorialItem
from ..items import ArticleItem
class BlogSpider(scrapy.Spider):
    """Crawl a cnblogs blog and scrape the detail page of every post.

    ``parse`` handles listing pages and schedules one request per post;
    ``parse_details`` turns a post's detail page into an ``ArticleItem``.
    """

    name = 'blog'
    # A list (the type Scrapy documents for start_urls) instead of the
    # original set literal; iteration behaviour is the same.
    start_urls = [
        'https://www.cnblogs.com/chenying99',
    ]

    def parse(self, response):
        """Schedule a detail-page request for each post on a listing page.

        Yields:
            One ``scrapy.Request`` per post link, handled by
            ``parse_details``, followed by a request for the next listing
            page (handled by this method) when a next-page link is found.
        """
        containers = response.css('#main')
        if containers:
            posts = containers[0].css('div.post')
        else:
            # Previously an IndexError here aborted the callback before the
            # pagination link could be followed.
            self.logger.warning('No #main container found on %s', response.url)
            posts = []

        for article in posts:
            link = article.css('a.postTitle2::attr(href)').get()
            if not link:
                # A post without a title link cannot be followed. Skip it
                # rather than raising, which would drop every remaining
                # post and the next-page request of this listing.
                continue
            yield scrapy.Request(
                url=response.urljoin(link),
                callback=self.parse_details,
            )

        # NOTE(review): this selector relies on the next-page link being the
        # 8th child of the pager; confirm it matches on every listing page.
        next_page = response.css(
            '#homepage_bottom_pager > div > a:nth-child(8)::attr(href)'
        ).get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_details(self, response):
        """Build an ``ArticleItem`` from a post's detail page.

        Missing elements no longer raise: title and category keep their
        ``'not-found'`` default, the link falls back to the URL that was
        fetched, and content and date fall back to an empty string.

        Yields:
            A single populated ``ArticleItem``.
        """
        item = ArticleItem()
        item['title'] = response.css(
            '#cb_post_title_url::text'
        ).get(default='not-found').strip()
        # The URL actually fetched is the best substitute for a missing href.
        item['link'] = response.css(
            '#cb_post_title_url::attr(href)'
        ).get(default=response.url).strip()
        item['content'] = response.css(
            '#cnblogs_post_body'
        ).get(default='').strip()
        item['cate'] = response.css(
            '#BlogPostCategory > a::text'
        ).get(default='not-found')
        item['update'] = response.css(
            '#post-date::text'
        ).get(default='').strip()
        yield item
修改 pipelines.py 文件:

import pymongo
from .items import ArticleItem
class ScrapytutorialPipeline(object):
    """Item pipeline that stores ``ArticleItem`` objects in MongoDB.

    Connects to the MongoDB server at 127.0.0.1:27017 and writes to the
    ``article`` collection of the ``scrapy`` database.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target collection."""
        self.conn = pymongo.MongoClient(
            '127.0.0.1',
            27017,
        )
        db = self.conn['scrapy']
        self.collection = db['article']

    def process_item(self, item, spider):
        """Persist ``ArticleItem`` instances and pass every item along.

        Args:
            item: The item produced by the spider.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipeline stages still receive
            it. The original returned the insert result for articles.
        """
        if isinstance(item, ArticleItem):
            # insert_one replaces Collection.insert, which was deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.conn.close()