Python Development: Scrapy Crawler (Part 3)

The previous post implemented basic article scraping; this post improves on it.

Edit the items.py file in the scrapytutorial directory:

import scrapy


class ScrapytutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
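
A scrapy.Item works much like a Python dict, except that only declared fields can be assigned. A minimal sketch of that behavior (assuming the project package is named scrapytutorial, as generated by scrapy startproject):

from scrapytutorial.items import ScrapytutorialItem

item = ScrapytutorialItem()
item['title'] = 'Hello'                # fine: 'title' is a declared field
item['link'] = 'https://example.com'   # fine: 'link' is a declared field
print(dict(item))                      # {'title': 'Hello', 'link': 'https://example.com'}
# item['author'] = 'me'                # raises KeyError: 'author' is not declared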

Then edit the blog_spider.py file in the spiders directory:

import scrapy

from ..items import ScrapytutorialItem


class BlogSpider(scrapy.Spider):
    name = 'blog'
    start_urls = [
        'https://www.cnblogs.com/chenying99/',
    ]

    def parse(self, response):
        # Every post on the index page sits in a div.post inside #main.
        container = response.css('#main')[0]
        posts = container.css('div.post')
        for article in posts:
            # Create a fresh item per post; reusing a single item object
            # across iterations would yield the same mutated instance.
            item = ScrapytutorialItem()
            item['title'] = article.css('a.postTitle2::text').extract_first().strip()
            item['link'] = article.css('a.postTitle2::attr(href)').extract_first()
            yield item
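
Before running the full crawl, the CSS selectors can be verified interactively with scrapy shell. Note that the selectors #main, div.post and a.postTitle2 come from this particular cnblogs theme and may need adjusting if the site layout changes:

scrapy shell 'https://www.cnblogs.com/chenying99/'
>>> response.css('#main div.post a.postTitle2::text').extract_first()
>>> response.css('#main div.post a.postTitle2::attr(href)').extract_first()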

Re-run scrapy crawl blog. Each scraped item now appears in the crawl log as a dict-like object with title and link fields.
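
To persist the results instead of only logging them, Scrapy's built-in feed exports can write the yielded items straight to a file, for example:

scrapy crawl blog -o posts.json

(posts.json is just an example filename; a .csv or .jl extension works the same way, with the format inferred from the extension.)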

I am committed to sharing original technical content; your support is what keeps me going!