下面的代码用到了
1. Python 多线程
2. 网页分析库:BeautifulSoup。这个库比之前分享的 Python SGMLParser 网页分析库要强大很多,大家有兴趣可以去了解一下。
#encoding=utf-8#@description:蜘蛛抓取内容。import Queueimport threadingimport urllib,urllib2import timefrom BeautifulSoup import BeautifulSouphosts = ["",""]#要抓取的网页queue = Queue.Queue()out_queue = Queue.Queue()class ThreadUrl(threading.Thread):"""Threaded Url Grab"""def __init__(self, queue, out_queue):threading.Thread.__init__(self)self.queue = queueself.out_queue = out_queuedef run(self):while True:#grabs host from queuehost = self.queue.get()proxy_support = urllib2.ProxyHandler({'http':'http://xxx.xxx.xxx.xxxx'})#代理IPopener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)urllib2.install_opener(opener)#grabs urls of hosts and then grabs chunk of webpageurl = urllib.urlopen(host)chunk = url.read()#place chunk into out queueself.out_queue.put(chunk)#signals to queue job is doneself.queue.task_done()class DatamineThread(threading.Thread):"""Threaded Url Grab"""def __init__(self, out_queue):threading.Thread.__init__(self)self.out_queue = out_queuedef run(self):while True:#grabs host from queuechunk = self.out_queue.get()#parse the chunksoup = BeautifulSoup(chunk)print soup.findAll(['title']))#signals to queue job is doneself.out_queue.task_done()start = time.time()def main():#spawn a pool of threads, and pass them queue instancet = ThreadUrl(queue, out_queue)t.setDaemon(True)t.start()#populate queue with datafor host in hosts:queue.put(host)dt = DatamineThread(out_queue)dt.setDaemon(True)dt.start()#wait on the queue until everything has been processedqueue.join()out_queue.join()main()print "Elapsed Time: %s" % (time.time() - start)
运行上面的程序需要先安装 BeautifulSoup;大家可以查阅 BeautifulSoup 的官方文档了解更多用法。
如果觉得《python beautifulsoup多线程分析抓取网页》对你有帮助,请点赞、收藏,并留下你的观点哦!