失眠网,内容丰富有趣,生活中的好帮手!
失眠网 > python-安居客-郑州二手房销售信息抓取

python-安居客-郑州二手房销售信息抓取

时间:2018-09-01 19:20:30

相关推荐

python-安居客-郑州二手房销售信息抓取

python版本: 3.7。功能描述: 抓取安居客-郑州各区域内二手房销售信息。代码如下:

# -*- coding: utf-8 -*-
"""Anjuke (安居客) Zhengzhou second-hand housing scraper.

Fetches listing-index pages, follows every listing link in a worker thread,
parses the detail fields with BeautifulSoup and stores one row per listing
in a local SQLite database.

@site: http://www.wangxiaofeng.site
"""
import urllib3

urllib3.disable_warnings()

import sqlite3
import random
import threading
from bs4 import BeautifulSoup

# Pool of User-Agent headers; one is picked at random per request so the
# traffic looks less uniform to the server.
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/1201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
       {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/0101 Firefox/34.0'},
       {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/0101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/0101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
       {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
       {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]

lock = threading.Lock()

# Detail-page fields, in the on-page order of the 'houseInfo-content' divs.
# This order must match the column order of the anjuhouse table.
FIELD_NAMES = [u'小区名称', u'房屋户型', u'单价', u'位置', u'面积', u'首付',
               u'年代', u'朝向', u'月供', u'房屋类型', u'楼层', u'装修程度',
               u'产权年限', u'电梯', u'房本年限', u'产权性质', u'唯一住房']


class SQLiteWraper(object):
    """Small sqlite3 wrapper that serialises writes from multiple threads."""

    def __init__(self, path, command='', *args, **kwargs):
        self.lock = threading.RLock()  # guards every execute() call
        self.path = path               # database file path
        if command != '':
            # Optional bootstrap statement (e.g. CREATE TABLE IF NOT EXISTS).
            conn = self.get_conn()
            cu = conn.cursor()
            cu.execute(command)

    def get_conn(self):
        """Open and return a fresh connection to the database file."""
        conn = sqlite3.connect(self.path)  # ,check_same_thread=False)
        conn.text_factory = str
        return conn

    def conn_close(self, conn=None):
        """Close the given connection."""
        conn.close()

    def conn_trans(func):
        """Decorator: take the lock, open a connection, run, then clean up.

        BUG FIX: the original released the lock and closed the connection
        only on the success path; a raising func would leak both. The
        cleanup now runs in a finally block.
        """
        def connection(self, *args, **kwargs):
            self.lock.acquire()
            conn = self.get_conn()
            kwargs['conn'] = conn
            try:
                rs = func(self, *args, **kwargs)
            finally:
                self.conn_close(conn)
                self.lock.release()
            return rs
        return connection

    @conn_trans
    def execute(self, command, method_flag=0, conn=None):
        """Run one SQL command.

        command: a plain SQL string when method_flag is 0, otherwise a
                 (sql, params) pair for a parameterised statement.
        Returns 0 on success, -1 on an integrity error (e.g. duplicate),
        -2 on any other failure.
        """
        cu = conn.cursor()
        try:
            if not method_flag:
                cu.execute(command)
            else:
                cu.execute(command[0], command[1])
            # BUG FIX: the original had a bare "mit()" (a mangled
            # "conn.commit()") — without the commit no row was persisted.
            conn.commit()
        except sqlite3.IntegrityError:
            return -1
        except Exception as e:
            print(e)  # BUG FIX: original "printe" raised NameError
            return -2
        return 0


def gen_ershoufang_insert_command(info_dict):
    """Build a parameterised INSERT for one listing.

    Missing fields default to '' so the value count always matches the
    17-column anjuhouse table.
    """
    values = tuple(info_dict.get(field, '') for field in FIELD_NAMES)
    commands = (r"insert into anjuhouse values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", values)
    return commands


def ershoufang_spider(db_ershoufang, url_page):
    """Fetch one listing detail page and insert its fields into the DB.

    Exits the process with -1 on connection errors and -2 on anything
    else (parse failures, short pages, etc.).
    """
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url_page, headers=random.choice(hds))
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        cj_list = soup.findAll('div', {'class': 'houseInfo-content'})
        # First cell (小区名称) is taken raw, as in the original; the rest
        # have layout whitespace stripped out.
        info_dict = {FIELD_NAMES[0]: cj_list[0].get_text()}
        for field, cell in zip(FIELD_NAMES[1:], cj_list[1:]):
            info_dict[field] = (cell.text.replace(" ", "")
                                .replace("\t", "")
                                .replace("\n", "")
                                .strip())
        commands = gen_ershoufang_insert_command(info_dict)
        db_ershoufang.execute(commands, 1)
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-1)
    except Exception as e:
        print(e)
        exit(-2)


def db_ershoufang_spider(db_ershoufang, page=1):
    """Fetch one listing-index page and scrape every listing on it.

    Each listing link is handled in its own thread; all threads are
    joined before returning.
    """
    # NOTE(review): the host part of this URL was lost when the code was
    # published — this is a site-relative path and needs the Anjuke
    # Zhengzhou domain prepended before it will work. TODO confirm.
    url = u"/sale/p%d/" % page
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url, headers=random.choice(hds))
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        # Renamed from "list" to avoid shadowing the builtin.
        links = [a.get('href')
                 for a in soup.findAll('a', {'class': 'houseListTitle'})]
        print(links)
        threads = [threading.Thread(target=ershoufang_spider,
                                    args=(db_ershoufang, href))
                   for href in links]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-3)
    except Exception as e:
        print(e)
        exit(-4)


if __name__ == "__main__":
    # One TEXT column per scraped field, in FIELD_NAMES order.
    command = "create table if not exists anjuhouse (xiaoqu TEXT, huxing TEXT, danjia TEXT, weizhi TEXT, mianji TEXT, shoufu TEXT, niandai TEXT, chaoxiang TEXT, yuegong TEXT, leixing TEXT, louceng TEXT, zhuangxiu TEXT, chanquan TEXT, dianti TEXT, nianxian TEXT, xingzhi TEXT, weiyi TEXT)"
    db_ershoufang = SQLiteWraper('anjuke-ershoufang.db', command)
    # Scrapes pages 2..50 inclusive (preserves the original page + 1 over
    # range(1, 50)).
    for page in range(1, 50):
        db_ershoufang_spider(db_ershoufang, page + 1)

结果

版权声明：文章涉及内容以及代码仅供个人学习使用，请勿用于商业用途。因个人使用不当给其他人员造成的损失以及产生的各类法律纠纷，作者概不承担。

如果觉得《python-安居客-郑州二手房销售信息抓取》对你有帮助,请点赞、收藏,并留下你的观点哦!

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。