This article shares methods for setting a proxy IP and impersonating a browser in a Python crawler. It is offered as a practical reference for anyone who needs these techniques.
1. Impersonating a browser in a Python crawler
# import the urllib.request module
import urllib.request
# set the request header as a ("header-name", "value") tuple
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
# create an opener
opener = urllib.request.build_opener()
# attach the header to the opener
opener.addheaders = [headers]
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
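The snippet assumes a url variable already exists. A minimal end-to-end sketch, where the target URL below is a placeholder of my own rather than from the original:
url = "http://www.baidu.com"  # hypothetical target page
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
print(len(data))  # quick sanity check: size of the fetched HTML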
2. Setting a proxy
# define the proxy IP
proxy_addr = "122.241.72.191:808"
# set up the proxy handler
proxy = urllib.request.ProxyHandler({'http': proxy_addr})
# create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
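To check that traffic really goes through the proxy, one option (my addition; the sample proxy above may well be dead by now) is to fetch a service that echoes the caller's IP, such as httpbin.org:
# sketch: ask an IP-echo service which address the server sees
try:
    seen_ip = urllib.request.urlopen("http://httpbin.org/ip", timeout=10).read().decode('utf-8')
    print(seen_ip)  # should report the proxy address, not your own IP
except Exception as e:
    print("proxy check failed:", e)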
3. Setting a proxy and impersonating a browser at the same time
# define the proxy IP
proxy_addr = "122.241.72.191:808"
# create a request
req = urllib.request.Request(url)
# add the User-Agent header
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
# set up the proxy handler
proxy = urllib.request.ProxyHandler({"http": proxy_addr})
# create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
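install_opener changes every later urlopen call in the process. If that global side effect is unwanted, the opener can also be used directly; a small sketch of that variant (my addition, not part of the original walkthrough):
# use the opener locally instead of installing it globally
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
data = opener.open(req).read().decode('utf-8', 'ignore')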
4. Adding multiple fields to the request headers
import urllib.request
page_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
                "Host": "www.baidu.com",
                "Cookie": "xxxxxxxx"
                }
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
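If you want to confirm what the server returned beyond the body, the object urlopen returns also exposes the status code and response headers (Python 3); a brief sketch:
resp = urllib.request.urlopen(req)
print(resp.status)                      # HTTP status code, e.g. 200
print(resp.getheader('Content-Type'))   # Content-Type reported by the server
data = resp.read().decode('utf-8', 'ignore')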
5. Adding POST request parameters
import urllib.request
import urllib.parse
# set the POST parameters (page_num and keywords are your own variables)
page_data = urllib.parse.urlencode([
    ('pn', page_num),
    ('kd', keywords)
])
# set the headers
page_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Connection': 'keep-alive',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Cookie': 'JSESSIONID=abaaabaabeeaaja8f28c00a88dc4d771796bb5c6ffa2dda; user_trace_token=20170715131136-d58c1f22f6434e9992fc0b35819a572b',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.lagou.com/jobs/list_%e6%95%b0%e6%8d%ae%e6%8c%96%e6%8e%98?labelWords=&fromSearch=true&suginput=',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
# open the page
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req, data=page_data.encode('utf-8')).read().decode('utf-8')
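Because the Accept header above asks for JSON, this endpoint normally answers with a JSON document, which can be parsed with the standard json module. A sketch, assuming the response is valid JSON (the field name printed below is hypothetical; inspect the real payload first):
import json
result = json.loads(data)     # parse the JSON body of the POST response
print(result.get('success'))  # hypothetical field; check the actual structure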
6. Using PhantomJS to simulate browser requests
# 1. Download PhantomJS, install it locally, and add it to your PATH environment variable
from selenium import webdriver
bs = webdriver.PhantomJS()
# open the URL
bs.get(url)
# get the rendered page source
url_data = bs.page_source
# save the rendered page as an image
bs.get_screenshot_as_file(filename)
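Note that PhantomJS has since been discontinued and Selenium 4 removed support for it, so the snippet above needs a Selenium 3.x release. It is also worth shutting the browser process down when finished; a usage sketch with placeholder values of my own:
url = "http://www.baidu.com"  # placeholder target
filename = "page.png"         # placeholder screenshot file
bs = webdriver.PhantomJS()
bs.get(url)
print(len(bs.page_source))    # size of the rendered source
bs.get_screenshot_as_file(filename)
bs.quit()                     # shut down the PhantomJS process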
7. Setting the User-Agent and cookies in PhantomJS
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
bs = webdriver.PhantomJS(desired_capabilities=dcap)
bs.get(url)
# delete all cookies
bs.delete_all_cookies()
# set a cookie
# cookie format: inspect the cookies in your browser; a cookie needs the following fields: domain, name, value, path
cookie = {
    'domain': '.www.baidu.com',  # note the leading dot
    'name': 'xxxx',
    'value': 'xxxx',
    'path': 'xxxx'
}
# add the cookie to the PhantomJS session
bs.add_cookie(cookie)
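To verify the cookie actually landed in the session, Selenium's get_cookies() can be used; a short sketch:
# confirm the cookie is present in the PhantomJS session
for c in bs.get_cookies():
    print(c['name'], c['value'])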
8. Using a WebDriver executable
# 1. Download the WebDriver executable (e.g. chromedriver.exe) matching your browser version
# 2. Put chromedriver.exe somewhere on disk, e.g. C:\chromedriver.exe
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
# open the URL
driver.get(url)
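On current Selenium releases the same idea is usually written with ChromeOptions, often headless; a sketch assuming chromedriver is reachable on the PATH (newer Selenium locates the driver itself and may no longer accept the executable_path argument shown above):
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)
driver.get("http://www.baidu.com")  # placeholder URL
print(driver.title)
driver.quit()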
That concludes this walkthrough of setting a proxy IP and impersonating a browser in a Python crawler.