本文实现 Scrapy 的分页采集功能。
分页方式一:跟随页面上的"下一页"链接进行翻页。
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from ..items import ScrapytutorialItem


class BlogSpider(scrapy.Spider):
    """Scrape post titles/links from a cnblogs blog, paginating by
    following the "next page" link found in the bottom pager.
    """

    name = 'blog'
    # List instead of a set literal: deterministic order, the conventional
    # type for start_urls in Scrapy examples.
    start_urls = [
        'https://www.cnblogs.com/chenying99/',
    ]

    def parse(self, response):
        """Yield one item per post on the page, then follow the next page.

        :param response: HtmlResponse for a blog listing page.
        :yields: ScrapytutorialItem with 'title' and 'link' fields, then
            optionally a follow-up Request for the next page.
        """
        containers = response.css('#main')
        if not containers:
            # Unexpected page layout — nothing to extract.
            return
        for article in containers[0].css('div.post'):
            title = article.css('a.postTitle2::text').extract_first()
            link = article.css('a.postTitle2::attr(href)').extract_first()
            if title is None or link is None:
                # Skip malformed entries instead of crashing on .strip().
                continue
            # A fresh item per post: reusing one mutable item across yields
            # lets later iterations clobber items still queued in pipelines.
            item = ScrapytutorialItem()
            item['title'] = title.strip()
            item['link'] = link
            yield item

        # nth-child(8) targets the "next page" anchor in the pager;
        # fragile if the pager layout changes — TODO confirm selector.
        next_page = response.css(
            '#homepage_bottom_pager > div > a:nth-child(8)::attr(href)'
        ).get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
分页方式二:通过递增 URL 中的 page 参数手动构造下一页地址。
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from ..items import ScrapytutorialItem


class BlogSpider(scrapy.Spider):
    """Scrape post titles/links from a cnblogs blog, paginating by
    incrementing the ?page=N query parameter explicitly.
    """

    name = 'blog'
    # Next page number to request; page 1 is already in start_urls.
    page_number = 2
    # Exclusive upper bound on page_number (named instead of a magic 10);
    # preserves the original behavior of crawling pages 1 through 9.
    max_page = 10
    start_urls = [
        'https://www.cnblogs.com/chenying99/default.html?page=1',
    ]

    def parse(self, response):
        """Yield one item per post on the page, then request the next page.

        :param response: HtmlResponse for a blog listing page.
        :yields: ScrapytutorialItem with 'title' and 'link' fields, then
            optionally a follow-up Request for the next page number.
        """
        containers = response.css('#main')
        if not containers:
            # Unexpected page layout — nothing to extract.
            return
        for article in containers[0].css('div.post'):
            title = article.css('a.postTitle2::text').extract_first()
            link = article.css('a.postTitle2::attr(href)').extract_first()
            if title is None or link is None:
                # Skip malformed entries instead of crashing on .strip().
                continue
            # A fresh item per post: reusing one mutable item across yields
            # lets later iterations clobber items still queued in pipelines.
            item = ScrapytutorialItem()
            item['title'] = title.strip()
            item['link'] = link
            yield item

        # The counter lives on the class (shared across callbacks), matching
        # the original design; build the URL only when it will be used.
        if BlogSpider.page_number < BlogSpider.max_page:
            next_page = (
                'https://www.cnblogs.com/chenying99/default.html?page='
                + str(BlogSpider.page_number)
            )
            BlogSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)