
Python: Scraping All Housing Listings from Lianjia with Scrapy

Date: 2023-06-19 12:18:21


Use Scrapy to crawl Lianjia's nationwide listings across all of its main categories: second-hand homes (二手房), new homes (新房), rentals (租房), commercial offices (商业办公), and residential communities (小区).
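If you want to recreate the project from scratch, the standard Scrapy CLI scaffolds it. These are ordinary Scrapy commands; the spider and domain names are chosen to match the code in this post:

```
scrapy startproject lian
cd lian
scrapy genspider lian_spider lianjia.com
```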

Project structure:
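The original directory screenshot was not preserved; a standard Scrapy layout matching the file names used below would look like this (reconstructed, so treat it as illustrative):

```
lian/
├── scrapy.cfg
└── lian/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── lian_spider.py
```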

items.py

```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class LianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ErShouFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # total price
    total_price = scrapy.Field()
    # price per square meter
    single_price = scrapy.Field()
    # floor
    room_info = scrapy.Field()
    # location
    region = scrapy.Field()
    # orientation and renovation status
    direction = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # building type
    house_struct = scrapy.Field()
    # layout
    huxing = scrapy.Field()
    # purchase date
    buy_time = scrapy.Field()
    # detail-page url
    ershou_detail_url = scrapy.Field()


class NewHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # title
    title = scrapy.Field()
    # location
    region = scrapy.Field()
    # room info
    room_info = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # price
    price = scrapy.Field()
    # detail-page url
    newHouse_detail_url = scrapy.Field()


class RentHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # title
    title = scrapy.Field()
    # price
    price = scrapy.Field()
    # room info (layout, orientation, area, lease type)
    house_info = scrapy.Field()
    # publish date
    pub_time = scrapy.Field()
    # move-in date
    in_time = scrapy.Field()
    # lease term
    lease = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # elevator
    lift = scrapy.Field()
    # parking
    carport = scrapy.Field()
    # water
    use_water = scrapy.Field()
    # electricity
    use_electricity = scrapy.Field()
    # gas
    use_gas = scrapy.Field()
    # detail-page url
    rent_detail_url = scrapy.Field()


class OfficeHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # title
    title = scrapy.Field()
    # price
    price = scrapy.Field()
    # number of units
    num = scrapy.Field()
    # area
    area = scrapy.Field()
    # detail-page url
    office_detail_url = scrapy.Field()


class XiaoquHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # title
    title = scrapy.Field()
    # location
    region = scrapy.Field()
    # price per square meter
    single_price = scrapy.Field()
    # year built
    build_time = scrapy.Field()
    # building type
    house_struct = scrapy.Field()
    # property management fee
    service_fees = scrapy.Field()
    # property management company
    service_company = scrapy.Field()
    # developer
    build_company = scrapy.Field()
    # number of buildings
    building_nums = scrapy.Field()
    # total number of units
    house_nums = scrapy.Field()
    # detail-page url
    xiaoqu_detail_url = scrapy.Field()
```
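As a quick sanity check, Scrapy Items behave like dicts restricted to their declared fields. A minimal sketch (the values here are made up purely for illustration):

```python
from lian.items import ErShouFangItem

# illustrative values only
item = ErShouFangItem(city='绵阳', total_price='100万')
item['single_price'] = '8000元/平米'

print(dict(item))
# Assigning to an undeclared field raises KeyError, which catches typos early:
# item['totl_price'] = '100万'  # KeyError: 'ErShouFangItem does not support field: totl_price'
```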


pipelines.py

```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter

# the import resolves at runtime; ignore any IDE warning
from lian.items import ErShouFangItem, NewHouseItem, RentHouseItem, OfficeHouseItem, XiaoquHouseItem


class LianPipeline(object):
    def __init__(self):
        # one JSON Lines file (and exporter) per item type
        self.ershoufang_fp = open('ershoufang.json', 'wb')
        self.ershoufang_exporter = JsonLinesItemExporter(self.ershoufang_fp, ensure_ascii=False)
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.renthouse_fp = open('renthouse.json', 'wb')
        self.renthouse_exporter = JsonLinesItemExporter(self.renthouse_fp, ensure_ascii=False)
        self.officehouse_fp = open('officehouse.json', 'wb')
        self.officehouse_exporter = JsonLinesItemExporter(self.officehouse_fp, ensure_ascii=False)
        self.xiaoquhouse_fp = open('xiaoquhouse.json', 'wb')
        self.xiaoquhouse_exporter = JsonLinesItemExporter(self.xiaoquhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to the exporter matching its type
        if isinstance(item, ErShouFangItem):
            self.ershoufang_exporter.export_item(item)
        elif isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, RentHouseItem):
            self.renthouse_exporter.export_item(item)
        elif isinstance(item, OfficeHouseItem):
            self.officehouse_exporter.export_item(item)
        else:
            self.xiaoquhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.ershoufang_fp.close()
        self.newhouse_fp.close()
        self.renthouse_fp.close()
        self.officehouse_fp.close()
        self.xiaoquhouse_fp.close()
```
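Because JsonLinesItemExporter writes one JSON object per line, the five output files are easy to load back for analysis. A minimal sketch, assuming the crawl has already produced ershoufang.json in the working directory:

```python
import json

# load the exported second-hand listings back into dicts, one per line
with open('ershoufang.json', encoding='utf-8') as f:
    listings = [json.loads(line) for line in f if line.strip()]

print(len(listings))
if listings:
    print(listings[0].get('city'), listings[0].get('total_price'))
```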


lian_spider.py

```python
# -*- coding: utf-8 -*-
import re

import scrapy

# the import resolves at runtime; ignore any IDE warning
from lian.items import ErShouFangItem, NewHouseItem, RentHouseItem, OfficeHouseItem, XiaoquHouseItem


class LianSpiderSpider(scrapy.Spider):
    name = 'lian_spider'
    # NOTE: the URLs were stripped from the original post by the aggregator;
    # lianjia.com and its national city list page are restored here.
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # session cookie captured from a browser; replace with your own
        'Cookie': 'select_city=510700; lianjia_uuid=8bd3d017-2c99-49a5-826e-986f56ce99b9; _smt_uid=5cd3cd13.44c49764; UM_distinctid=16a9b59145a158-0442ba7704d667-3b654406-c0000-16a9b59146011e; _jzqckmp=1; _ga=GA1.2.822868133.1557384475; _gid=GA1.2.801531476.1557384475; all-lj=ed5a77c9e9ec3809d0c1321ec78803ae; lianjia_ssid=50fd11a7-d48c-4dde-b281-287224c40487; TY_SESSION_ID=ae45e1a4-b6d9-46bb-81c8-7cff32931953; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1557384618,1557389971,1557392984,1557446598; _jzqc=1; _jzqy=1.1557384468.1557446599.1.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6.-; _qzjc=1; sensorsdatajssdkcross=%7B%22distinct_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22%24device_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _jzqa=1.1500973956232310800.1557384468.1557451920.1557454945.6; _jzqx=1.1557451920.1557454945.2.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/pag1/.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/; CNZZDATA1255604082=609852050-1557381958-https%253A%252F%%252F%7C1557455869; CNZZDATA1254525948=1645681089-1557382543-https%253A%252F%%252F%7C1557458144; CNZZDATA1255633284=262578687-1557381275-https%253A%252F%%252F%7C1557458627; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1557459240; _qzja=1.677427564.1557384472885.155745198.1557454945305.155745951.1557459240226.0.0.0.62.6; _qzjb=1.1557454945305.13.0.0.0; _qzjto=33.3.0; _jzqb=1.13.10.1557454945.1'
    }

    # parse the national city list: one request per city
    def parse(self, response):
        lis = response.xpath('//div[@class="city_list_section"]/ul/li')
        city_links = []
        for li in lis:
            province = li.xpath('.//div[@class="city_list_tit c_b"]/text()').extract_first()
            # print(province)
            lis2 = li.xpath('.//div[@class="city_province"]/ul/li')
            for city_li in lis2:
                # a fresh dict per city; the original reused one dict, so
                # city_links ended up holding copies of the last city only
                city_info = {}
                city_info['city'] = city_li.xpath('./a/text()').extract_first()
                city_info['city_link'] = city_li.xpath('./a/@href').extract_first()
                city_links.append(city_info)
                # print(city_info)
                yield scrapy.Request(
                    url=city_info['city_link'],
                    headers=self.headers,
                    callback=self.parse_rent_type,
                    meta={'city_name': (province, city_info['city'])})
        # Overseas listings: the overseas section is identical across all city
        # pages, so it would only need to be crawled once.
        # yield scrapy.Request(
        #     url='/us',  # URL stripped from the original post
        #     headers=self.headers,
        #     callback=self.parse_haiwai
        # )

    # listing categories: ershoufang (second-hand), xinfang (new homes),
    # zufang (rentals), commercial office, xiaoqu (communities)
    def parse_rent_type(self, response):
        province, city_name = response.meta.get('city_name')
        lis = response.xpath('//div[@class="nav typeUserInfo"]/ul/li')
        for li in lis:
            house_type = li.xpath('./a/text()').extract_first()
            if house_type == '二手房':
                ershoufang_link = li.xpath('./a/@href').extract_first()
                # the total page count is hard to scrape, so request pages 1-100 blindly
                next_urls = [ershoufang_link + '/pg{}/'.format(i) for i in range(1, 101)]
                for i, url in enumerate(next_urls, start=1):
                    yield scrapy.Request(
                        url=url,
                        headers=self.headers,
                        callback=self.parse_ershoufang,
                        meta={'city_name': (province, city_name, i)})
            elif house_type == '新房':
                xinfang_link = li.xpath('./a/@href').extract_first()
                xinfang_link = xinfang_link + '/loupan/'
                yield scrapy.Request(
                    url=xinfang_link,
                    headers=self.headers,
                    callback=self.parse_xinfang,
                    meta={'city_name': (province, city_name)})
            elif house_type == '租房':
                zufang_link = li.xpath('./a/@href').extract_first()
                # same pagination workaround as for second-hand homes
                next_urls = [zufang_link + '/pg{}/'.format(i) for i in range(1, 101)]
                for i, url in enumerate(next_urls, start=1):
                    yield scrapy.Request(
                        url=url,
                        headers=self.headers,
                        callback=self.parse_zufang,
                        meta={'city_name': (url, province, city_name, i)})
            elif house_type == '商业办公':
                # TODO: there is a redirect here, so only one page gets crawled
                shangyebangong_link = li.xpath('./a/@href').extract_first()
                # check the raw href; the original checked after concatenating a
                # suffix, so the None test could never trigger
                if shangyebangong_link is None:
                    continue
                shangyebangong_link = str(shangyebangong_link) + "/xzl/rent/mlist"
                yield scrapy.Request(
                    url=shangyebangong_link,
                    headers=self.headers,
                    callback=self.parse_shangyebangong,
                    meta={'city_name': (province, city_name)})
            elif house_type == '小区':
                xiaoqu_link = li.xpath('./a/@href').extract_first()
                yield scrapy.Request(
                    url=xiaoqu_link,
                    headers=self.headers,
                    callback=self.parse_xiaoqu,
                    meta={'city_name': (province, city_name)})

    # second-hand index page
    def parse_ershoufang(self, response):
        province, city_name, i = response.meta.get('city_name')
        lis = response.xpath('//ul[@class="sellListContent"]/li')
        for li in lis:
            ershou_detail_link = li.xpath('.//div[@class="title"]/a/@href').extract_first()
            # some listings have no detail link
            if ershou_detail_link is None:
                continue
            yield scrapy.Request(
                url=ershou_detail_link,
                headers=self.headers,
                callback=self.parse_ershoufang_detail,
                meta={'city_name': (ershou_detail_link, province, city_name, i)})

    # second-hand detail page
    def parse_ershoufang_detail(self, response):
        ershou_detail_link, province, city_name, i = response.meta.get('city_name')
        title = response.xpath('//div[@class="sellDetailHeader"]//div[@class="title"]/h1/text()').extract_first()
        total_price = response.xpath('//div[@class="price "]/span[@class="total"]/text()').extract_first() \
            + str(response.xpath('//div[@class="price "]/span[@class="unit"]/span/text()').extract_first()).strip()
        single_price = response.xpath('//span[@class="unitPriceValue"]/text()').extract_first() \
            + str(response.xpath('//span[@class="unitPriceValue"]/i/text()').extract_first())
        room_info = response.xpath('//div[@class="room"]/div[1]/text()').extract_first() + '-' \
            + response.xpath('//div[@class="room"]/div[2]/text()').extract_first()
        region = response.xpath('//div[@class="areaName"]/span[@class="info"]/a[1]/text()').extract_first() + '-' \
            + response.xpath('//div[@class="areaName"]/span[@class="info"]/a[2]/text()').extract_first()
        direction = response.xpath('//div[@class="type"]/div[1]/text()').extract_first() + '-' \
            + response.xpath('//div[@class="type"]/div[2]/text()').extract_first()
        area = response.xpath('//div[@class="area"]/div[1]/text()').extract_first()
        house_struct = response.xpath('//div[@class="area"]/div[2]/text()').extract_first()
        huxing = response.xpath('//div[@class="introContent"]/div[1]/div[2]/ul/li[1]/text()').extract_first()
        buy_time = response.xpath('//div[@class="transaction"]/div[2]/ul/li[3]/span[2]/text()').extract_first()
        print("page {} | city: {} | ershoufang | title: {} total: {} unit: {} floor: {} "
              "location: {} orientation: {} area: {} type: {} layout: {} purchased: {}".format(
                  i, city_name, title, total_price, single_price, room_info, region,
                  direction, area, house_struct, huxing, buy_time))
        item = ErShouFangItem(
            province=province, city=city_name, total_price=total_price,
            single_price=single_price, room_info=room_info, region=region,
            direction=direction, area=area, house_struct=house_struct,
            huxing=huxing, buy_time=buy_time, ershou_detail_url=ershou_detail_link)
        yield item

    # new-home (loupan) index page
    def parse_xinfang(self, response):
        province, city_name = response.meta.get('city_name')
        lis = response.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for li in lis:
            title = li.xpath('./a[@class="resblock-img-wrapper "]/@title').extract_first()
            region_infos = li.xpath('.//div[@class="resblock-location"]//text()').extract()
            region = ''
            for info in region_infos:
                region = region + info.replace('\n', '').strip(' ')
            room_infos = li.xpath('.//a[@class="resblock-room"]/span//text()').extract()
            room_info = ''
            for info in room_infos:
                room_info = room_info + info.strip(' ')
            area_infos = li.xpath('.//div[@class="main-price"]/span//text()').extract()
            area = ''
            for info in area_infos:
                area = area + info.strip(' ')
            # append the unit and strip surrounding whitespace
            price = li.xpath('.//div[@class="main-price"]/span[1]/text()').extract_first() \
                + str(li.xpath('.//div[@class="main-price"]/span[2]/text()').extract_first()).strip()
            # the host was stripped from the original post; new-home hrefs are
            # relative, so the Beijing subdomain is restored here as in the original
            newhouse_detail_url = 'https://bj.lianjia.com' + str(
                li.xpath('./a[@class="resblock-img-wrapper "]/@href').extract_first())
            print("city: {} | new home | {} {}".format(city_name, title, newhouse_detail_url))
            item = NewHouseItem(
                province=province, city=city_name, title=title, region=region,
                room_info=room_info, area=area, price=price,
                newHouse_detail_url=newhouse_detail_url)
            yield item

    # rental index page
    def parse_zufang(self, response):
        zufang_link, province, city_name, i = response.meta.get('city_name')
        # strip the /pgN/ page suffix off the link
        # print("before stripping: {}".format(zufang_link))
        zufang_link = re.findall(r'(.*?)/zufang//pg\d+/', zufang_link)[0]
        items = response.xpath('//div[@class="content__list"]/div')
        for zu in items:
            href = zu.xpath('./a[@class="content__list--item--aside"]/@href').extract_first()
            # some entries carry no link; check the raw href (the original checked
            # after concatenation, which can never be None)
            if href is None:
                continue
            zufang_detail_link = zufang_link + str(href)
            # print("{} -- {}".format(city_name, zufang_detail_link))
            yield scrapy.Request(
                url=zufang_detail_link,
                headers=self.headers,
                callback=self.parse_zufang_detail,
                meta={'city_name': (zufang_detail_link, province, city_name, i)})

    # rental detail page
    def parse_zufang_detail(self, response):
        zufang_detail_link, province, city_name, i = response.meta.get('city_name')
        title = response.xpath('//div[@class="content clear w1150"]/p/text()').extract_first()
        price = response.xpath('//div[@class="content__aside fr"]/p/span/text()').extract_first()
        house_infos = response.xpath('//ul[@class="content__aside__list"]/p//text()').extract()
        house_info = ''
        # loop variable renamed from `i` (the original clobbered the page number
        # unpacked from meta, corrupting the page shown in the print below)
        for info in house_infos:
            house_info = house_info + info.replace('\n', '/').strip(' ')
        # publish date
        pub_time = str(response.xpath('string(//div[@class="content__subtitle"])').extract_first())
        pub_time = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', pub_time)
        if pub_time:
            pub_time = pub_time[0]
        else:
            pub_time = None
        # move-in date
        in_time = response.xpath('//div[@class="content__article__info"]/ul/li[3]/text()').extract_first()
        # lease term
        lease = response.xpath('//div[@class="content__article__info"]/ul/li[5]/text()').extract_first()
        # floor
        floor = response.xpath('//div[@class="content__article__info"]/ul/li[8]/text()').extract_first()
        # elevator
        lift = response.xpath('//div[@class="content__article__info"]/ul/li[9]/text()').extract_first()
        # parking
        carport = response.xpath('//div[@class="content__article__info"]/ul/li[11]/text()').extract_first()
        use_water = response.xpath('//div[@class="content__article__info"]/ul/li[12]/text()').extract_first()
        use_electricity = response.xpath('//div[@class="content__article__info"]/ul/li[14]/text()').extract_first()
        use_gas = response.xpath('//div[@class="content__article__info"]/ul/li[15]/text()').extract_first()
        # print("city: {} | rental | {} {} {} {} {} {} {}".format(
        #     city_name, lease, floor, lift, carport, use_water, use_electricity, use_gas))
        item = RentHouseItem(
            province=province, city=city_name, title=title, price=price,
            house_info=house_info, pub_time=pub_time, in_time=in_time, lease=lease,
            floor=floor, lift=lift, carport=carport, use_water=use_water,
            use_electricity=use_electricity, use_gas=use_gas,
            rent_detail_url=zufang_detail_link)
        yield item
        print("page {} | city: {} | rental | {} {}".format(i, city_name, title, price))

    # overseas listings
    # def parse_haiwai(self, response):
    #     items = response.xpath('//*[@id="env"]/div[4]/div/div[2]')
    #     for i in items:
    #         title = i.xpath('.//div[class="titles"]/a/div/text()').extract_first()
    #         price = i.xpath('.//span[@class="fr"]/text()').extract_first()
    #         print("city: USA | title: {} price: {}".format(title, price))

    # commercial-office index page
    def parse_shangyebangong(self, response):
        province, city_name = response.meta.get('city_name')
        items = response.xpath('//div[@class="result__ul"]/a')
        for i in items:
            # read the href off the current result node (the original queried
            # `response` here, which returned the wrong value)
            office_detail_url = i.xpath('./@href').extract_first()
            title = i.xpath('./div/p[@class="result__li-title"]/text()').extract_first()
            area = i.xpath('./div/p[@class="result__li-features"]/text()').extract_first()
            nums = i.xpath('./div/p[@class="result__li-other"]/text()').extract_first()
            price = i.xpath('./div/p[@class="result__li-price"]/span/text()').extract_first()
            item = OfficeHouseItem(
                province=province, city=city_name, title=title, price=price,
                num=nums, area=area, office_detail_url=office_detail_url)
            yield item
            print("city: {} | office | title: {} area: {} count: {} price: {} url: {}".format(
                city_name, title, area, nums, price, office_detail_url))

    # xiaoqu index page
    def parse_xiaoqu(self, response):
        province, city_name = response.meta.get('city_name')
        ul = response.xpath('//ul[@class="listContent"]/li')
        for li in ul:
            xiaoqu_detail_link = li.xpath('.//a[@class="img"]/@href').extract_first()
            if xiaoqu_detail_link is None:
                continue
            yield scrapy.Request(
                url=xiaoqu_detail_link,
                headers=self.headers,
                callback=self.parse_xiaoqu_detail,
                meta={'city_name': (xiaoqu_detail_link, province, city_name)})

    # xiaoqu detail page
    def parse_xiaoqu_detail(self, response):
        xiaoqu_detail_link, province, city_name = response.meta.get('city_name')
        title = response.xpath('//h1[@class="detailTitle"]/text()').extract_first()
        region = response.xpath('//div[@class="detailDesc"]/text()').extract_first()
        single_price = response.xpath('//span[@class="xiaoquUnitPrice"]/text()').extract_first()
        # Some communities have no "year built" row, which shifts every later row
        # up by one, so check whether the first row is numeric before reading the rest.
        build_time = str(response.xpath('//div[@class="xiaoquInfo"]/div[1]/span[2]/text()').extract_first()).strip()
        house_struct = None
        service_fees = None
        pattern = re.compile(r'[0-9]+')
        if pattern.findall(build_time):
            house_struct = response.xpath('//div[@class="xiaoquInfo"]/div[2]/span[2]/text()').extract_first()
            service_fees = response.xpath('//div[@class="xiaoquInfo"]/div[3]/span[2]/text()').extract_first()
            service_company = response.xpath('//div[@class="xiaoquInfo"]/div[4]/span[2]/text()').extract_first()
            build_company = response.xpath('//div[@class="xiaoquInfo"]/div[5]/span[2]/text()').extract_first()
            building_nums = response.xpath('//div[@class="xiaoquInfo"]/div[6]/span[2]/text()').extract_first()
            house_nums = response.xpath('//div[@class="xiaoquInfo"]/div[7]/span[2]/text()').extract_first()
        else:
            build_time = None
            house_struct = response.xpath('//div[@class="xiaoquInfo"]/div[1]/span[2]/text()').extract_first()
            service_fees = response.xpath('//div[@class="xiaoquInfo"]/div[2]/span[2]/text()').extract_first()
            service_company = response.xpath('//div[@class="xiaoquInfo"]/div[3]/span[2]/text()').extract_first()
            build_company = response.xpath('//div[@class="xiaoquInfo"]/div[4]/span[2]/text()').extract_first()
            building_nums = response.xpath('//div[@class="xiaoquInfo"]/div[5]/span[2]/text()').extract_first()
            house_nums = response.xpath('//div[@class="xiaoquInfo"]/div[6]/span[2]/text()').extract_first()
        item = XiaoquHouseItem(
            province=province, city=city_name, title=title, region=region,
            single_price=single_price, build_time=build_time, house_struct=house_struct,
            service_fees=service_fees, service_company=service_company,
            build_company=build_company, building_nums=building_nums,
            house_nums=house_nums, xiaoqu_detail_url=xiaoqu_detail_link)
        yield item
        print("province: {} city: {} | xiaoqu | {} {} {} {} {} {} {}".format(
            province, city_name, build_time, house_struct, service_fees,
            service_company, build_company, building_nums, house_nums))
```
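With all four files in place, the crawl runs via the standard Scrapy CLI from the project root; scrapy shell is also handy for testing the XPath expressions above against a live page before committing them (the URL shown is the start URL restored earlier):

```
scrapy crawl lian_spider

# interactively test XPath expressions against a live page
scrapy shell "https://www.lianjia.com/city/"
```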


settings.py

```python
# -*- coding: utf-8 -*-

# Scrapy settings for lian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lian'

SPIDER_MODULES = ['lian.spiders']
NEWSPIDER_MODULE = 'lian.spiders'

# only show warnings and errors so the spider's print output stays readable
LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# treat truncated responses as usable instead of failing the request
DOWNLOAD_FAIL_ON_DATALOSS = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lian.middlewares.LianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lian.middlewares.LianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lian.pipelines.LianPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
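One caveat on these settings: no download delay is configured, so a full nationwide crawl hits Lianjia fairly hard and is likely to get throttled or redirected. Enabling a delay and AutoThrottle is a reasonable tweak; these are standard Scrapy settings, suggested here rather than part of the original post:

```python
# suggested politeness settings (not in the original post)
DOWNLOAD_DELAY = 1             # pause between requests to the same domain
AUTOTHROTTLE_ENABLED = True    # adapt the delay based on server latency
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
```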


Results: (the original post showed a screenshot of the scraped output here, which was not preserved in this copy)

