您好,欢迎访问一九零五行业门户网

python抓取新浪微博,求教!!?

python抓取新浪微博,被挡,用了代理,有10个帐号,10个代理,爬的很慢,大家有什么好的办法,谢谢!!!
回复内容:http://github.com/zhu327/rss 既然你也用python就直接看代码吧
爬这里:http://service.weibo.com/widget/widget_blog.php?uid={uid} ,把 {uid} 替换成目标用户的 uid 即可,无需登录,不会被挡。
爬手机端:http://weibo.cn
可以参考下面的代码,来自极客学院,侵删
# -*- coding: utf8 -*-
"""Poll a Weibo mobile page and e-mail every post that has not been seen yet.

The script logs in through the weibo.cn login form, extracts one post from
the target page, and compares it with the posts already recorded in
``weibo.txt``.  Unseen posts are mailed to the configured recipients and then
appended to that file.

The code targets Python 2 (it relies on ``sys.setdefaultencoding``); the
syntax used here is also accepted by Python 3.
"""
import os
import smtplib
import sys
import time
from email.mime.text import MIMEText

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8 so
    # that Chinese text can be concatenated and written to files directly.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
    text_type = unicode  # noqa: F821 - builtin on Python 2
else:
    text_type = str


class mailhelper(object):
    """Send plain-text notification e-mails through an SMTP server."""

    def __init__(self):
        self.mail_host = 'smtp.xxxx.com'  # SMTP server host
        self.mail_user = 'xxxx'  # account user name
        self.mail_pass = 'xxxx'  # account password
        self.mail_postfix = 'xxxx.com'  # domain part of the sender address

    def send_mail(self, to_list, sub, content):
        """Send ``content`` with subject ``sub`` to every address in ``to_list``.

        Args:
            to_list: list of recipient e-mail addresses.
            sub: subject line of the message.
            content: plain-text body, sent as UTF-8.

        Returns:
            True when the message was handed to the server, False when any
            step failed (the error text is printed).
        """
        # Display name followed by the full sender address in angle brackets.
        me = 'xxoohelper' + '<' + self.mail_user + '@' + self.mail_postfix + '>'
        msg = MIMEText(content, _subtype='plain', _charset='utf-8')
        msg['subject'] = sub
        msg['from'] = me
        msg['to'] = ';'.join(to_list)
        try:
            server = smtplib.SMTP()
            try:
                server.connect(self.mail_host)
                server.login(self.mail_user, self.mail_pass)
                server.sendmail(me, to_list, msg.as_string())
            finally:
                # Release the socket even when connect/login/sendmail fails.
                server.close()
            return True
        except Exception as e:
            # Best effort: report the failure and let the caller decide.
            print(str(e))
            return False


class xxoohelper(object):
    """Log in to weibo.cn and fetch one post from the configured page."""

    def __init__(self):
        self.url = 'http://weibo.cn/u/xxxxxxx'  # Weibo page to watch
        self.url_login = 'https://login.weibo.cn/login/'
        # Replaced by getdata() with the form's real submit URL.
        self.new_url = self.url_login

    def getsource(self):
        """Return the raw response body of ``self.url``."""
        return requests.get(self.url).content

    def getdata(self, html):
        """Build the login form payload from the login page ``html``.

        Reads the password field name, the ``vk`` token and the form action
        from the page, stores the submit URL in ``self.new_url`` and returns
        the dict to POST.

        Raises:
            IndexError: when one of the expected form elements is missing.
        """
        selector = etree.HTML(html)
        # The password input's name is read from the page, not hard-coded.
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        self.new_url = self.url_login + action
        # NOTE(review): field names are kept exactly as found in this file;
        # confirm their letter case against the live login form.
        data = {
            'mobile': 'xxxxx@xxx.com',
            password: 'xxxxxx',
            'remember': 'on',
            'backurl': 'http://weibo.cn/u/xxxxxx',  # set to the Weibo page URL
            'backtitle': u'微博',
            'trycount': '',
            'vk': vk,
            'submit': u'登录',
        }
        return data

    def getcontent(self, data):
        """POST the login form and return post text plus its timestamp.

        Args:
            data: payload produced by :meth:`getdata`.

        Returns:
            The text of the third ``span.ctt`` element with ``http://``
            removed, followed by the text of the first ``span.ct`` element.

        Raises:
            IndexError: when the page has fewer matching elements.
        """
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath('//span[@class="ctt"]')
        # NOTE(review): index 2 presumably skips profile/header spans that
        # precede the first post - confirm against the current page layout.
        newcontent = text_type(content[2].xpath('string(.)')).replace('http://', '')
        sendtime = new_selector.xpath('//span[@class="ct"]/text()')[0]
        return newcontent + sendtime

    def tosave(self, text):
        """Append ``text`` as one line to ``weibo.txt``."""
        with open('weibo.txt', 'a') as f:
            f.write(text + '\n')

    def tocheck(self, data):
        """Return True when ``data`` is not yet recorded in ``weibo.txt``.

        A missing file counts as "nothing recorded yet".
        """
        if not os.path.exists('weibo.txt'):
            return True
        with open('weibo.txt', 'r') as f:
            existweibo = f.readlines()
        return data + '\n' not in existweibo


if __name__ == '__main__':
    mailto_list = ['xxxxx@qq.com']  # recipients of the notification mail
    helper = xxoohelper()
    while True:
        source = helper.getsource()
        data = helper.getdata(source)
        content = helper.getcontent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u'女神更新啦', content):
                print(u'发送成功')
            else:
                print(u'发送失败')
            # The post is recorded even when mailing failed, so it is not
            # retried on the next poll.
            helper.tosave(content)
            print(content)
        else:
            print(u'pass')
        time.sleep(30)
据说爬手机版会有奇效。我以前爬过,不知道现在可行不
爬他的移动端页面,当时限制比网页端少。
爬虫程序部署在 Google App Engine 的多个节点上跑。
新浪有开发者平台,有专门的 API 接口,用爬虫会被屏蔽。
其它类似信息

推荐信息