From 1f87d0de06f5f41fb0c337a3f375a4d80344f93c Mon Sep 17 00:00:00 2001
From: JLUVicent <17390955615@163.com>
Date: Thu, 16 Sep 2021 22:36:21 +0800
Subject: [PATCH] Note the multi-page crawling approach: use scrapy.Request to issue the follow-up page requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scrapy_dangdang_040/spiders/dang.py      | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/scrapy_dangdang_040/scrapy_dangdang_040/spiders/dang.py b/scrapy_dangdang_040/scrapy_dangdang_040/spiders/dang.py
index 55cf4e8..0b037eb 100644
--- a/scrapy_dangdang_040/scrapy_dangdang_040/spiders/dang.py
+++ b/scrapy_dangdang_040/scrapy_dangdang_040/spiders/dang.py
@@ -4,9 +4,14 @@
 
 class DangSpider(scrapy.Spider):
     name = 'dang'
-    allowed_domains = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
+
+    # For a multi-page crawl the scope of allowed_domains must be adjusted; normally only the bare domain is listed
+    allowed_domains = ['category.dangdang.com']
     start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
 
+    base_url = 'http://category.dangdang.com/pg'
+    page = 1
+
     def parse(self, response):
         print("-----------------------------")
 
@@ -36,4 +41,18 @@ def parse(self, response):
 
             # As soon as one book is extracted, hand it over to the pipelines
             yield book
-        pass
+
+
+        # Multi-page crawling
+        # The parsing logic is the same for every page, so just issue the request for the next page and call the parse method again
+        # http://category.dangdang.com/pg2-cp01.01.02.00.00.00.html
+        # http://category.dangdang.com/pg3-cp01.01.02.00.00.00.html
+
+        if self.page < 100:
+            self.page = self.page + 1
+            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
+
+            # How to call the parse method:
+            # scrapy.Request is scrapy's GET request
+            # url is the request address; callback is the function to run, passed without parentheses
+            yield scrapy.Request(url=url, callback=self.parse)
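
Note: the pagination pattern this patch introduces can be reduced to the self-contained sketch below. Only the allowed_domains / base_url / page counter / scrapy.Request(callback=...) structure mirrors dang.py; the spider name is made up here and the item extraction is stubbed out, so treat it as an illustration rather than the repository code.

# Minimal sketch of the multi-page crawl pattern, assuming the same
# Dangdang category URLs as the patch; selectors and items are omitted.
import scrapy


class PaginationSketchSpider(scrapy.Spider):
    name = 'pagination_sketch'  # placeholder name, not the real spider
    # Only the bare domain goes here; a full URL in allowed_domains would
    # cause the follow-up page requests to be filtered as off-site.
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        # ... extract and yield items from the current page here ...

        # Every page shares the same parsing logic, so schedule the next
        # page and point its callback back at parse (no parentheses).
        if self.page < 100:
            self.page += 1
            next_url = f'{self.base_url}{self.page}-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=next_url, callback=self.parse)

A more idiomatic alternative (not what the patch does) is to read the "next page" link out of the response and yield response.follow(next_href, callback=self.parse); that avoids keeping a mutable page counter on the spider instance, which can be fragile when requests are processed concurrently.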