
Python-Scrapy: Scraping Lianjia Second-Hand Housing Listings


Continuing the Scrapy refresher, this time we scrape Lianjia's second-hand housing listings. The parts of Scrapy involved are mainly:

CrawlSpider

Rule

LinkExtractor

MySQL data storage

Downloading the listing photos

A quick look at Lianjia's second-hand listings site:

The detail page

Pagination (at most 100 pages can be crawled)

The approach:

Grab the detail-page links from the listing pages

Follow each detail page and extract the key fields

Collect the image URLs

Store the data in the database

Download the images to local disk

Move on to the next listing page

All in all, Lianjia has little in the way of anti-scraping measures; adding request headers is enough.

Straight to the code.

The spider file

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lianjia.items import LianjiaItem


class LjCrwalerSpider(CrawlSpider):
    name = 'lj_crawler'
    allowed_domains = ['lianjia.com']
    # the domain was stripped in this repost; the post scrapes Qingdao listings,
    # so the Qingdao Lianjia listing page is assumed as the start URL
    start_urls = ['https://qd.lianjia.com/ershoufang/']

    # crawl rules
    rules = (
        # detail-page links
        Rule(LinkExtractor(restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a"),
             follow=True, callback="process_item"),
        # pagination links
        Rule(LinkExtractor(restrict_xpaths="//div[@class='pagination_group_a']/a"), follow=True),
    )

    def process_item(self, response):
        item = LianjiaItem()
        # extract the key fields
        item['title'] = response.css('title::text').extract_first()
        item['price'] = response.css('div.overview div.content > div.price > span.total::text').extract_first()
        item['unit_price'] = response.css('div.overview div.content > div.price span.unitPriceValue::text').extract_first()
        item['community_name'] = response.css('div.overview div.content > div.aroundInfo > div.communityName > a::text').extract_first()
        item['region'] = response.css('div.areaName span.info a::text').extract()
        item['linkman'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="brokerName"]/a/text()').extract_first()
        item['linktel'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="phone"]/text()').extract()
        item['type'] = response.css('#introduction div.base ul > li:first-child::text').extract_first()
        item['construction_area'] = response.css('#introduction div.base ul > li:nth-child(3)::text').extract_first()
        item['actual_area'] = response.css('#introduction div.base ul > li:nth-child(5)::text').extract_first()
        item['orientation'] = response.css('#introduction div.base ul > li:nth-child(7)::text').extract_first()
        item['decoration'] = response.css('#introduction div.base ul > li:nth-child(9)::text').extract_first()
        item['floor'] = response.css('#introduction div.base ul > li:nth-child(2)::text').extract_first()
        item['elevator'] = response.css('#introduction div.base ul > li:nth-child(12)::text').extract_first()
        item['property'] = response.css('#introduction div.base ul > li:nth-child(13)::text').extract_first()
        item['house_years'] = response.css('#introduction div.transaction li:nth-child(5) span:nth-child(2)::text').extract_first()
        item['mortgage'] = (response.css('#introduction div.transaction li:nth-child(7) span:nth-child(2)::text').extract_first() or '').strip()
        item['purposes'] = response.css('#introduction div.transaction ul > li:nth-child(4) span:nth-child(2)::text').extract_first()
        item['release_date'] = response.css('#introduction div.transaction ul > li:first-child span:nth-child(2)::text').extract_first()
        item['image_urls'] = response.css('div.content-wrapper img::attr(src)').extract()
        item['from_url'] = response.url
        yield item
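The detail-page selectors above lean heavily on positional nth-child rules, so it is worth checking them interactively before launching a full crawl, for example in scrapy shell (the URL below is only a placeholder for any listing detail page):

scrapy shell "<a listing detail-page URL>"
>>> response.css('div.price span.total::text').extract_first()
>>> response.css('#introduction div.base ul > li:nth-child(3)::text').extract_first()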

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title
    title = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit_price = scrapy.Field()
    # community (residential compound) name
    community_name = scrapy.Field()
    # district / area
    region = scrapy.Field()
    # contact person
    linkman = scrapy.Field()
    # contact phone
    linktel = scrapy.Field()
    # house layout
    type = scrapy.Field()
    # construction area
    construction_area = scrapy.Field()
    # actual (inner) area
    actual_area = scrapy.Field()
    # orientation
    orientation = scrapy.Field()
    # decoration / fit-out
    decoration = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # elevator
    elevator = scrapy.Field()
    # property ownership term
    property = scrapy.Field()
    # years of ownership
    house_years = scrapy.Field()
    # mortgage status
    mortgage = scrapy.Field()
    # intended use
    purposes = scrapy.Field()
    # listing date
    release_date = scrapy.Field()
    # photo URLs
    image_urls = scrapy.Field()
    # listing URL
    from_url = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import hashlib
import os
from urllib.request import urlretrieve

import pymysql
from scrapy.exceptions import DropItem
from scrapy.utils.python import to_bytes


class LianjiaPipeline(object):
    def __init__(self, settings):
        self.host = settings.get('HOST')
        self.port = settings.get('PORT')
        self.user = settings.get('USER')
        self.passwd = settings.get('PASSWD')
        self.db = settings.get('DB')
        self.charset = settings.get('CHARSET')
        self.table = settings.get('TABLE')
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                    passwd=self.passwd, db=self.db, charset=self.charset)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def save_data(self, item):
        """Insert one item into MySQL."""
        keys = ', '.join(item.keys())
        values = ', '.join(['%s'] * len(item.keys()))
        insert_sql = "insert into `{}`({}) values({})".format(self.table, keys, values)
        try:
            self.cursor.execute(insert_sql, tuple(item.values()))
            self.conn.commit()
        except Exception as e:
            print(e.args)
            self.conn.rollback()

    def select_data(self, item):
        """Check whether this listing URL has already been stored (de-duplication)."""
        value = item.get('from_url')
        select_sql = "select * from `{}` where from_url=%s".format(self.table)
        try:
            self.cursor.execute(select_sql, (value,))
            res = self.cursor.fetchall()
            return bool(res)
        except Exception as e:
            print(e.args)
            return False

    def process_item(self, item, spider):
        item['linktel'] = '-'.join(item['linktel'])
        item['region'] = '/'.join(item['region'])
        item['image_urls'] = ','.join(item['image_urls'])
        if not self.select_data(item):
            self.save_data(item)
        return item


class ImageDownloadPipeline(object):
    def __init__(self, settings):
        self.imagepath = settings.get('IMAGES_STORE')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Download the listing photos into a folder named after the property id."""
        for image in item['image_urls'].split(','):
            # name each file after the sha1 hash of its URL
            image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
            image_name = '%s.jpg' % image_guid
            house_id = item['from_url'].split('/')[-1].replace('.html', '')
            file_path = '%s/%s' % (self.imagepath, house_id)
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_path = '%s/%s/%s' % (self.imagepath, house_id, image_name)
            if not os.path.exists(image_path):
                urlretrieve(image, image_path)
            else:
                raise DropItem('It exists!')
        return item
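LianjiaPipeline assumes a MySQL table named `lianjia` whose columns match the item field names. The post does not show the schema, so the sketch below is only a guess at what it could look like (the TEXT column types are assumptions):

# minimal sketch of the table LianjiaPipeline writes to; the schema is not part of
# the original post, so the TEXT column types here are assumptions
import pymysql

FIELDS = [
    'title', 'price', 'unit_price', 'community_name', 'region', 'linkman',
    'linktel', 'type', 'construction_area', 'actual_area', 'orientation',
    'decoration', 'floor', 'elevator', 'property', 'house_years', 'mortgage',
    'purposes', 'release_date', 'image_urls', 'from_url',
]

create_sql = "create table if not exists `lianjia` (id int auto_increment primary key, {}) default charset=utf8".format(
    ', '.join('`{}` text'.format(field) for field in FIELDS))

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='mycrawler', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(create_sql)
    conn.commit()
finally:
    conn.close()

Since the de-duplication query filters on from_url, a prefix index such as KEY idx_from_url (from_url(255)) would keep that lookup fast once the table grows.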

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

HOST = '127.0.0.1'
PORT = 3306
USER = 'root'
PASSWD = '123456'
DB = 'mycrawler'
CHARSET = 'UTF8'
TABLE = 'lianjia'

IMAGES_STORE = 'C:/Users/wang/Desktop/lianjia/lianjia/images'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjia (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
    'lianjia.pipelines.ImageDownloadPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

run.py

# -*- coding: utf-8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl lj_crawler".split())
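run.py is only a convenience for starting the crawl from an IDE; running "scrapy crawl lj_crawler" from the project root does exactly the same thing.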

Results

Summary:

For the image download I originally wanted to use Scrapy's built-in ImagesPipeline, but I couldn't work out how to store the images in separate directories: IMAGES_STORE seems to accept only one fixed path, so how do you create a folder per property id dynamically? If anyone knows which methods should be overridden, please point me in the right direction. The spider above scrapes Qingdao listings; for the whole country, start_urls can be extended accordingly.
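On that question: the built-in ImagesPipeline decides where each file goes through its file_path() method, which returns a path relative to IMAGES_STORE, so per-listing folders can be had by overriding it. Below is a rough sketch, assuming item['image_urls'] is still a list when it reaches this pipeline (i.e. the ','.join in LianjiaPipeline is dropped or moved) and that from_url carries the property id as in the spider above:

# rough sketch: Scrapy's built-in ImagesPipeline with one sub-folder per listing;
# assumes item['image_urls'] is a list of URLs and item['from_url'] is the
# detail-page URL, as produced by the spider above
import hashlib

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes


class HouseImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # carry the property id along with each image request
        house_id = item['from_url'].split('/')[-1].replace('.html', '')
        for url in item['image_urls']:
            yield Request(url, meta={'house_id': house_id})

    def file_path(self, request, response=None, info=None, *, item=None):
        # the returned path is relative to IMAGES_STORE, e.g. <house_id>/<sha1>.jpg
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return '{}/{}.jpg'.format(request.meta['house_id'], image_guid)

ImagesPipeline needs Pillow installed and already skips files it has downloaded recently, so the manual os.path.exists check and urlretrieve call are no longer necessary; registering this class in ITEM_PIPELINES in place of ImageDownloadPipeline is enough.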
