
Python crawler: how to collect city rental listings?

Approach: start with a single-threaded crawler, verify that it scrapes successfully, then optimize it into a multi-threaded version, and finally store the results in a database.
The example below crawls rental listings for Zhengzhou.
Note: this project is for learning purposes only. To avoid putting too much load on the website, please set num in the code to a small value and keep the thread counts low, as sketched right below.
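For example, the main loop of the single-threaded crawler in section 1 below could be throttled like this (a sketch reusing gethtml, getnum and getlink from that script; MAX_PAGES and REQUEST_DELAY are illustrative values of my own, not part of the original code):

MAX_PAGES = 3          # hypothetical cap on the number of list pages to crawl
REQUEST_DELAY = 1.0    # hypothetical pause, in seconds, between list-page requests

num = min(getnum(gethtml(init_url)), MAX_PAGES)
for i in range(num):
    url = f'https://zz.zu.fang.com/house/i3{i + 1}/'
    getlink(gethtml(url))
    time.sleep(REQUEST_DELAY)    # be polite to the server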
1. Single-threaded crawler

# Use a requests Session instead of bare requests calls
# Parsing: bs4
# Concurrency (added in later sections): concurrent.futures
import requests
# from lxml import etree  # alternative: XPath parsing
from bs4 import BeautifulSoup
from urllib import parse
import re
import time

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; city=zz; integratecover=1; __utma=147393320.427795962.1613371106.1613371106.1613371106.1; __utmc=147393320; __utmz=147393320.1613371106.1.1.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; asp.net_sessionid=aamzdnhzct4i5mx3ak4cyoyp; rent_statlog=23d82b94-13d6-4601-9019-ce0225c092f6; captcha=61584f355169576f3355317957376e4f6f7552365351342b7574693561766e63785a70522f56557370586e3376585853346651565256574f37694b7074576b2b34536c5747715856516a4d3d; g_sourcepage=zf_fy%5elb_pc; unique_cookie=u_ffzvt3kztwck05jm6twso2wjw18kl67hqft*6; __utmb=147393320.12.10.1613371106'
}
data = {'agentbid': ''}

session = requests.Session()
session.headers = headers


# Fetch a page
def gethtml(url):
    try:
        res = session.get(url)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as e:
        print(e)


# Get the total number of list pages
def getnum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the pager text "共**页" ("** pages in total")
    num = int(re.search(r'\d+', txt).group(0))
    return num


# Collect detail-page links from one list page
def getlink(text):
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)


# Parse one detail page
def parsepage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_c02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_c02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getphone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)


# Fetch the agent's virtual phone number
def getphone(buserid):
    url = 'https://zz.zu.fang.com/rentdetails/ajax/getagentvirtualmobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return


if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getnum(gethtml(init_url))
    for i in range(0, num):
        url = f'https://zz.zu.fang.com/house/i3{i + 1}/'
        text = gethtml(url)
        getlink(text)
    print(hrefs)
    for href in hrefs:
        parsepage(href)
    print("Collected %d records in total" % len(info))
    print("Total time: {}".format(time.time() - start_time))
    session.close()
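getnum relies on the pager element whose text reads 共N页 ("N pages in total") and casts the extracted digits to int so the result can be fed to range(). A quick standalone check of that extraction (the sample pager text is made up):

import re

txt = '共12页'                               # made-up pager text
num = int(re.search(r'\d+', txt).group(0))   # -> 12
print(num)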
2. Optimizing into a multi-threaded crawler

# Use a requests Session instead of bare requests calls
# Parsing: bs4
# Concurrency: concurrent.futures
import requests
# from lxml import etree  # alternative: XPath parsing
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyword_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; asp.net_sessionid=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5elb_pc; captcha=4937566532507336644d6557347143746b5a6a6b4a7a48445a422f2f6a51746c67516f31357446573052634562725162316152533247514250736f72775566574a2b33514357304b6976343d; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=u_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4'
}
data = {'agentbid': ''}

session = requests.Session()
session.headers = headers


# Fetch a page
def gethtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)


# Get the total number of list pages
def getnum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the pager text "共**页" ("** pages in total")
    num = int(re.search(r'\d+', txt).group(0))
    return num


# Collect detail-page links from one list page
def getlink(url):
    text = gethtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)


# Parse one detail page
def parsepage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_c02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_c02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getphone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)


# Fetch the agent's virtual phone number
def getphone(buserid):
    url = 'https://zz.zu.fang.com/rentdetails/ajax/getagentvirtualmobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return


if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getnum(gethtml(init_url))
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i + 1}/'
            t.submit(getlink, url)
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            t.submit(parsepage, href)
    print("Collected %d records in total" % len(info))
    print("Time elapsed: {}".format(time.time() - start_time))
    session.close()
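Both getlink and parsepage append to the shared hrefs and info lists from several worker threads. In CPython, list.append is effectively atomic under the GIL, so this works as written; if you want the synchronization to be explicit, getlink could be guarded with a lock (a sketch of mine, not part of the original script):

from threading import Lock

hrefs_lock = Lock()   # hypothetical guard for the shared hrefs list

def getlink(url):
    text = gethtml(url)
    soup = BeautifulSoup(text, 'lxml')
    for link in soup.select('.title a'):
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        with hrefs_lock:          # serialize writes from worker threads
            hrefs.append(href)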
3. Further optimization with asyncio

# Use a requests Session instead of bare requests calls
# Parsing: bs4
# Concurrency: concurrent.futures
import requests
# from lxml import etree  # alternative: XPath parsing
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time
import asyncio

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyword_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; asp.net_sessionid=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5elb_pc; captcha=4937566532507336644d6557347143746b5a6a6b4a7a48445a422f2f6a51746c67516f31357446573052634562725162316152533247514250736f72775566574a2b33514357304b6976343d; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=u_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4'
}
data = {'agentbid': ''}

session = requests.Session()
session.headers = headers


# Fetch a page
def gethtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)


# Get the total number of list pages
def getnum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the pager text "共**页" ("** pages in total")
    num = int(re.search(r'\d+', txt).group(0))
    return num


# Collect detail-page links from one list page
def getlink(url):
    text = gethtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)


# Parse one detail page
def parsepage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_c02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_c02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getphone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)


# Fetch the agent's virtual phone number
def getphone(buserid):
    url = 'https://zz.zu.fang.com/rentdetails/ajax/getagentvirtualmobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return


# Thread pool that collects detail-page links
async def pool1(num):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i + 1}/'
            task.append(loop.run_in_executor(t, getlink, url))


# Thread pool that parses detail pages
async def pool2(hrefs):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            task.append(loop.run_in_executor(t, parsepage, href))


if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    task = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getnum(gethtml(init_url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(pool1(num))
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    loop.run_until_complete(pool2(hrefs))
    loop.close()
    print("Collected %d records in total" % len(info))
    print("Time elapsed: {}".format(time.time() - start_time))
    session.close()
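Note that pool1 and pool2 never await the futures collected in task; they only return after the with ThreadPoolExecutor block has waited for all submitted work to finish. If you prefer to await the futures explicitly, pool1 could be rewritten as follows (a sketch of mine using asyncio.gather; the original relies on the executor shutdown instead):

async def pool1(num):
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=5) as t:
        tasks = [
            loop.run_in_executor(t, getlink, f'https://zz.zu.fang.com/house/i3{i + 1}/')
            for i in range(num)
        ]
        await asyncio.gather(*tasks)   # wait for every list page before returning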
4. Saving to a MySQL database

(1) Creating the table

from sqlalchemy import create_engine
from sqlalchemy import String, Integer, Column, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session  # avoids thread-safety problems in the multi-threaded crawler
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()  # instantiate the declarative base

engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/pytest?charset=utf8",
    max_overflow=300,  # connections that may be created beyond the pool size
    pool_size=100,     # connection pool size
    echo=False,        # no debug output
)


class House(Base):
    __tablename__ = 'house'
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(200))
    price = Column(String(200))
    block = Column(String(200))
    building = Column(String(200))
    address = Column(String(200))
    detail = Column(Text())
    name = Column(String(20))
    phone = Column(String(20))


Base.metadata.create_all(engine)
Session = sessionmaker(engine)
sess = scoped_session(Session)
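Assuming the file above is saved as mysqldb.py (the module name imported in the next part), the scoped session can be used like an ordinary SQLAlchemy session, for example to check what has already been stored (a usage sketch, not part of the original code):

from mysqldb import sess, House

total = sess.query(House).count()                             # rows stored so far
latest = sess.query(House).order_by(House.id.desc()).first()  # most recently inserted row
if latest:
    print(total, latest.title, latest.price)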
(2) Writing the data to the database

# Use a requests Session instead of bare requests calls
# Parsing: bs4
# Concurrency: concurrent.futures
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
from mysqldb import sess, House
import re
import time
import asyncio

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; __utmc=147393320; asp.net_sessionid=vhrhxr1tdatcc1xyoxwybuwv; __utma=147393320.427795962.1613371106.1613575774.1613580597.6; __utmz=147393320.1613580597.6.5.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; rent_statlog=c158b2a7-4622-45a9-9e69-dcf6f42cf577; keyword_recenthousezz=%5b%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e7%bb%8f%e5%bc%80%22%2c%22detailname%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014871%2f%22%2c%22sort%22%3a1%7d%5d; g_sourcepage=zf_fy%5elb_pc; captcha=6b65716a41454739794d666864397178613772676c75447a4e746c657144775a347a6d42554f446532357649643062344f6976756e563450554e59594b7833712b413579506c4b684958343d; unique_cookie=u_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*14; __utmb=147393320.21.10.1613580597'
}
data = {'agentbid': ''}

session = requests.Session()
session.headers = headers


# Fetch a page
def gethtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)


# Get the total number of list pages
def getnum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the pager text "共**页" ("** pages in total")
    num = int(re.search(r'\d+', txt).group(0))
    return num


# Collect detail-page links from one list page
def getlink(url):
    text = gethtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)


# Parse one detail page and write the record to the database
def parsepage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_c02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_c02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getphone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
            try:
                house_data = House(
                    title=title,
                    price=price,
                    block=block,
                    building=building,
                    address=address,
                    detail=detail,
                    name=name,
                    phone=phone
                )
                sess.add(house_data)
                sess.commit()
            except Exception as e:
                print(e)          # print the error
                sess.rollback()   # roll back the failed transaction
        except:
            pass
    else:
        print(res.status_code, res.text)


# Fetch the agent's virtual phone number
def getphone(buserid):
    url = 'https://zz.zu.fang.com/rentdetails/ajax/getagentvirtualmobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return


# Thread pool that collects detail-page links
async def pool1(num):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i + 1}/'
            task.append(loop.run_in_executor(t, getlink, url))


# Thread pool that parses detail pages
async def pool2(hrefs):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            task.append(loop.run_in_executor(t, parsepage, href))


if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    task = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getnum(gethtml(init_url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(pool1(num))
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    loop.run_until_complete(pool2(hrefs))
    loop.close()
    print("Collected %d records in total" % len(info))
    print("Time elapsed: {}".format(time.time() - start_time))
    session.close()
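One small optional addition: scoped_session keeps a separate SQLAlchemy session per worker thread, so at the very end of the __main__ block, after session.close(), those thread-local sessions could also be discarded (sess.remove() is the standard scoped_session call; adding it here is my suggestion, not part of the original script):

sess.remove()   # dispose of the thread-local SQLAlchemy sessions created by the worker threads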
5. Final result screenshots (sensitive data redacted)
That concludes the detailed walkthrough of "Python crawler: how to collect city rental listings?".