本篇文章主要介绍了java selenium处理极验滑动验证码示例,小编觉得挺不错的,现在分享给大家,也给大家做个参考。一起跟随小编过来看看吧
要爬取一个网站时遇到了极验的验证码,这周一直在想怎么破解它。在网上搜了很多资料,看到知乎上有人问过这个问题,我就按照其中的思路大概实现了一下。
1.使用HtmlUnit(这种方式我没有成功:模拟鼠标拖拽后没有生成轨迹,可以跳过)
我用的是Java,首先想到的是直接用HtmlUnit,于是先做了一些初始化:
/**
 * Lazily creates the shared HtmlUnit {@code WebClient}; returns immediately when one already
 * exists.
 *
 * <p>The new client is routed through the proxy at 127.0.0.1:8888, has JavaScript and CSS
 * enabled, does not throw on script errors or failing HTTP status codes, and is seeded with
 * every cookie returned by {@code client.getcookies()}.
 *
 * <p>JDK and HtmlUnit identifiers use their real capitalisation. Names declared elsewhere in
 * this listing ({@code webclient}, {@code client}, {@code getcookies}) are spelled exactly as
 * the listing spells them.
 */
private void initwebclient() {
    if (webclient != null) {
        return;
    }
    webclient = new WebClient(BrowserVersion.FIREFOX_24);
    webclient.getOptions().setProxyConfig(new ProxyConfig("127.0.0.1", 8888));
    webclient.getOptions().setActiveXNative(true);
    webclient.getOptions().setUseInsecureSSL(true); // certificate configuration
    webclient.getOptions().setJavaScriptEnabled(true);
    webclient.getOptions().setCssEnabled(true);
    webclient.setCssErrorHandler(new SilentCssErrorHandler());
    webclient.getOptions().setThrowExceptionOnScriptError(false);
    webclient.getOptions().setThrowExceptionOnFailingStatusCode(false);

    // Copy the cookies already held by the other HTTP client into HtmlUnit's cookie store.
    CookieManager cookieManager = new CookieManager();
    List<org.apache.http.cookie.Cookie> httpCookies = client.getcookies();
    for (org.apache.http.cookie.Cookie cookie : httpCookies) {
        cookieManager.addCookie(new com.gargoylesoftware.htmlunit.util.Cookie(cookie));
    }
    webclient.setCookieManager(cookieManager);
}
初始化好代理和cookie之后,就能正常调用了:
// Load the Qixinbao login page and hand it to the captcha routine.
HtmlPage page = webclient.getPage("http://www.qixin.com/login");
gepageinfor(page);
下面是获取图片、还原图片并模拟拖拽的代码(这里我觉得有些问题:可能是我模拟的拖拽不对,导致触发的JS没有生成正确的轨迹,还请大家帮忙看看哪里错了):
/**
 * Tries to solve the slider captcha on {@code page}.
 *
 * <p>Steps, in order: download the two scrambled captcha images referenced by the
 * {@code gt_cut_fullbg_slice} and {@code gt_cut_bg_slice} elements, reassemble both with
 * {@code imgtest.getgeetestimg}, ask {@code imgtest.decaptcha} for the horizontal offset, then
 * replay mouse-over, mouse-down, mouse-move, mouse-over and mouse-up on the slider knob while
 * dumping the resulting pages to standard output.
 *
 * <p>The tag name in the attribute lookups is {@code "div"}: every looked-up element is cast to
 * {@code HtmlDivision}, which cannot hold for {@code <p>} elements.
 *
 * <p>If the download or decoding fails the exception is printed and the drag still runs with
 * {@code decaptcha == 0}.
 *
 * @param page the login page containing the element with id {@code captcha}
 */
private void gepageinfor(HtmlPage page) {
    String[] img_slice = {"div", "class", "gt_cut_fullbg_slice"};
    String[] img_bg_slice = {"div", "class", "gt_cut_bg_slice"};
    HtmlDivision captchaDiv = (HtmlDivision) page.getElementById("captcha");
    int decaptcha = 0;
    try {
        // Raw bytes of the two scrambled images.
        byte[] img_slice_binary = client.get(getimgurl(img_slice, captchaDiv, true)).getbinary();
        byte[] img_bg_slice_binary = client.get(getimgurl(img_bg_slice, captchaDiv, false)).getbinary();
        // Reassembled images.
        BufferedImage geetestimg = imgtest.getgeetestimg(img_slice_binary, imgtest.imgarray);
        BufferedImage geetestimg2 = imgtest.getgeetestimg(img_bg_slice_binary, imgtest.imgarray);
        // Offset to drag the slider by (still unreliable; a third-party image matcher is needed).
        decaptcha = imgtest.decaptcha(geetestimg, geetestimg2);
        System.out.println(decaptcha);
    } catch (IOException | fetchexception e) {
        // Best effort: keep going with decaptcha == 0.
        e.printStackTrace();
    }

    // The knob element that has to be dragged.
    HtmlDivision sliderKnob = get_p_slider_knob(page, "gt_slider_knob gt_show");
    sliderKnob.mouseOver();
    HtmlPage mousedownpage = (HtmlPage) sliderKnob.mouseDown();
    // After mouse-down the knob is looked up again under its "moving" class.
    sliderKnob = get_p_slider_knob(mousedownpage, "gt_slider_knob gt_show moving");
    mousemovex(decaptcha, sliderKnob, mousedownpage);
    HtmlPage newpage = (HtmlPage) sliderKnob.mouseOver();
    // newpage = (HtmlPage) sliderKnob.mouseDown();
    System.out.println(newpage.asXml());

    captchaDiv = (HtmlDivision) newpage.getElementById("captcha");
    HtmlElement movingSlice = captchaDiv.getElementsByAttribute("div", "class", "gt_slice gt_show moving").get(0);
    System.out.println(movingSlice);

    // Releasing the button triggers the page's JS; no drag trajectory was produced in practice.
    newpage = (HtmlPage) sliderKnob.mouseUp();
    System.out.println("---------------");
    System.out.println(newpage.asXml());
    if (newpage.getElementById("captcha") != null) { // captcha still present: retry hook
        // gepageinfor(newpage);
    }
}
/**
 * Fires a single mouse-move event whose client X coordinate is shifted to the right.
 *
 * @param decaptcha     horizontal shift in pixels; {@code 0} falls back to 99
 * @param p_slider_knob node the event is created for
 * @param mousedown     page whose document element receives the event
 */
private void mousemovex(int decaptcha, HtmlDivision p_slider_knob, HtmlPage mousedown) {
    MouseEvent moveEvent = new MouseEvent(p_slider_knob, MouseEvent.TYPE_MOUSE_MOVE,
            false, false, false, MouseEvent.BUTTON_LEFT);
    int shiftX = (decaptcha != 0) ? decaptcha : 99;
    moveEvent.setClientX(moveEvent.getClientX() + shiftX); // move along the X axis
    // The script result was never read by the original listing, so it is not kept.
    mousedown.getDocumentElement().fireEvent(moveEvent);
}
/**
 * Returns the first {@code div} inside the element with id {@code captcha} whose
 * {@code class} attribute equals {@code classstring}.
 *
 * <p>The tag name is {@code "div"} because the result is cast to {@code HtmlDivision}.
 *
 * @param page        page containing the captcha element
 * @param classstring exact value of the {@code class} attribute to match
 * @return the first matching division
 * @throws IndexOutOfBoundsException if no element matches
 */
private HtmlDivision get_p_slider_knob(HtmlPage page, String classstring) {
    HtmlDivision captchaDiv = (HtmlDivision) page.getElementById("captcha");
    return (HtmlDivision) captchaDiv.getElementsByAttribute("div", "class", classstring).get(0);
}
/**
 * Extracts the background-image URL shared by the captcha slice elements and, optionally,
 * reconciles the first slice's offset with {@code imgtest.imgarray}.
 *
 * <p>Each matching element's {@code style} attribute is split on {@code ';'}. The first
 * declaration supplies the URL: everything up to the last {@code '('} and every {@code ')'} is
 * stripped. The second declaration supplies the offset, e.g.
 * {@code background-position: -157px -58px} yields {@code 157} and {@code 58}.
 *
 * @param img_slice          tag name, attribute name and attribute value, in that order, used to
 *                           find the slice elements
 * @param p                  element searched for slices
 * @param isneedcheckpostion whether to compare the parsed offset with {@code imgtest.imgarray};
 *                           both images are cut the same way, so one check is enough
 * @return the image URL, or an empty string when no slice element is found
 */
private String getimgurl(String[] img_slice, HtmlDivision p, boolean isneedcheckpostion) {
    String url = "";
    // One lookup is enough; an empty result simply skips the loop.
    List<HtmlElement> slices = p.getElementsByAttribute(img_slice[0], img_slice[1], img_slice[2]);
    for (int i = 0; i < slices.size(); i++) {
        HtmlDivision slice = (HtmlDivision) slices.get(i);
        String[] declarations = slice.getAttribute("style").split(";");
        if (StringUtils.isBlank(url)) { // settle the URL once
            url = StringUtils.replacePattern(declarations[0], ".*\\(", "").replace(")", "");
        }
        if (isneedcheckpostion) {
            String[] positions = StringUtils.split(
                    StringUtils.removePattern(declarations[1], "[^\\d+ \\s]"), null);
            // A fresh array per update, so table rows never share one mutable instance.
            int[] postion = {Integer.parseInt(positions[0]), Integer.parseInt(positions[1])};
            int[] known = imgtest.imgarray[i];
            if (known[0] != postion[0] || known[1] != postion[1]) {
                logger.debug("更新分割postion");
                imgtest.imgarray[i] = postion;
            }
            // Print the table contents; printing the array itself only shows its identity.
            System.out.println(java.util.Arrays.deepToString(imgtest.imgarray));
            // NOTE(review): clearing the flag here means only the first slice is ever checked,
            // as in the original listing. Confirm that is intended.
            isneedcheckpostion = false;
        }
    }
    return url;
}
对比图片获取位移的方法(decaptcha)是错的,我就不放代码了。下面是还原图片用的方法:其实审查元素后你就明白怎么还原这个图片了,这里每次截取的是宽10px、高58px的小块。
/**
 * Reassembles a scrambled captcha image.
 *
 * <p>For every entry of {@code imgarray} a 10x58 pixel tile is cut from the decoded image at
 * that entry's {@code (x, y)} offset. The first half of the tiles is joined left to right into
 * the top row, the second half into the bottom row, and the bottom row is placed under the top
 * row.
 *
 * @param binary   encoded bytes of the scrambled image
 * @param imgarray tile offsets in display order, one {@code {x, y}} pair per tile
 * @return the reassembled image
 * @throws IOException              if the bytes cannot be read or decoded as an image
 * @throws IllegalArgumentException if {@code imgarray} is {@code null} or has fewer than two
 *                                  entries, so that one of the two rows would be empty
 */
public static BufferedImage getgeetestimg(byte[] binary, int[][] imgarray) throws IOException {
    if (imgarray == null || imgarray.length < 2) {
        throw new IllegalArgumentException("imgarray must contain at least two tile offsets");
    }
    BufferedImage scrambled = ImageIO.read(new ByteArrayInputStream(binary));
    if (scrambled == null) {
        // ImageIO.read returns null when no registered reader understands the data.
        throw new IOException("captcha bytes could not be decoded as an image");
    }

    final int tileWidth = 10;
    final int tileHeight = 58;
    List<BufferedImage> tiles = new ArrayList<>(imgarray.length);
    for (int[] offset : imgarray) {
        tiles.add(scrambled.getSubimage(offset[0], offset[1], tileWidth, tileHeight));
    }

    int mid = tiles.size() >>> 1;
    // Seed each row with its first tile so that a one-tile row is valid too.
    BufferedImage topRow = tiles.get(0);
    for (int i = 1; i < mid; i++) {
        topRow = mergeimage(topRow, tiles.get(i), true);
    }
    BufferedImage bottomRow = tiles.get(mid);
    for (int i = mid + 1; i < tiles.size(); i++) {
        bottomRow = mergeimage(bottomRow, tiles.get(i), true);
    }
    return mergeimage(topRow, bottomRow, false);
}
/**
 * Joins two images into a new {@code TYPE_INT_RGB} image.
 *
 * <p>Horizontally, {@code img2} is placed to the right of {@code img1}; vertically, it is placed
 * below. The canvas is as large as the bigger of the two images along the other axis, so images
 * of different sizes can be joined; any area neither image covers stays black. For images of
 * equal size the result is the same as a plain side-by-side or stacked copy.
 *
 * @param img1         left (or top) image
 * @param img2         right (or bottom) image
 * @param ishorizontal {@code true} to join side by side, {@code false} to stack
 * @return the combined image
 * @throws IOException never thrown by this implementation; kept for existing callers
 */
public static BufferedImage mergeimage(BufferedImage img1,
        BufferedImage img2, boolean ishorizontal) throws IOException {
    int w1 = img1.getWidth();
    int h1 = img1.getHeight();
    int w2 = img2.getWidth();
    int h2 = img2.getHeight();

    // Row-major RGB pixels of both inputs.
    int[] firstPixels = img1.getRGB(0, 0, w1, h1, null, 0, w1);
    int[] secondPixels = img2.getRGB(0, 0, w2, h2, null, 0, w2);

    // Size the canvas from BOTH inputs; sizing it from img1 alone made setRGB fail
    // whenever img2 was taller (horizontal join) or wider (vertical join).
    int destWidth = ishorizontal ? w1 + w2 : Math.max(w1, w2);
    int destHeight = ishorizontal ? Math.max(h1, h2) : h1 + h2;
    BufferedImage destImage = new BufferedImage(destWidth, destHeight, BufferedImage.TYPE_INT_RGB);

    destImage.setRGB(0, 0, w1, h1, firstPixels, 0, w1); // left or top part
    if (ishorizontal) {
        destImage.setRGB(w1, 0, w2, h2, secondPixels, 0, w2); // right part
    } else {
        destImage.setRGB(0, h1, w2, h2, secondPixels, 0, w2); // bottom part
    }
    return destImage;
}
2.使用selenium
后来我想可能是我模拟鼠标动作的方式有问题,于是又找到了Selenium(2.42.2),它也能驱动HtmlUnit,关键是它对鼠标动作的封装比较完整。
但是尝试以后发现,HtmlUnitMouse并没有实现这个动作:
// Excerpt quoted from Selenium's HtmlUnitMouse: moving the pointer by an offset is rejected,
// so a drag cannot be simulated through the HtmlUnit driver.
// Library identifiers are shown with their real capitalisation; the message text is as quoted.
public void mouseMove(Coordinates where, long xOffset, long yOffset) {
    throw new UnsupportedOperationException("moving to arbitrary x,y coordinates not supported.");
}
好吧,那就改用Chrome:
// Drive a real Chrome instance through Selenium and drag the slider knob.
System.setProperty("webdriver.chrome.driver", "c:\\chromedriver.exe");
Proxy proxy = new Proxy();
// Address of the HTTP proxy the browser should use.
proxy.setHttpProxy("127.0.0.1:8888");
// DesiredCapabilities capabilities = DesiredCapabilities.htmlUnitWithJs();
DesiredCapabilities capabilities = DesiredCapabilities.chrome();
capabilities.setCapability(CapabilityType.PROXY, proxy);
// final WebDriver driver = new HtmlUnitDriver(capabilities);
WebDriver driver = new ChromeDriver(capabilities);
driver.get("http://www.qixin.com/login");
driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
checkpage(driver, "return $('.gt_cut_fullbg_slice');");

// Page title and source, printed for inspection.
System.out.println("1 page title is: " + driver.getTitle());
String pagesource = driver.getPageSource();
System.out.println(pagesource);

org.openqa.selenium.JavascriptExecutor executor = (org.openqa.selenium.JavascriptExecutor) driver;
// The DOM property is spelled readyState; the all-lowercase spelling is undefined in JS and
// made the following equals() call fail on a null result.
boolean pageLoaded = "complete".equals(executor.executeScript("return document.readyState"));

int movex = 99; // horizontal drag distance in pixels (fixed placeholder value)
if (pageLoaded) {
    WebElement element = driver.findElement(By.className("gt_slider_knob"));
    Actions action = new Actions(driver);
    // Alternatives that were tried:
    // action.clickAndHold(element).perform();
    // action.clickAndHold(element).moveByOffset(movex, 0).release().perform();
    // dragAndDropBy takes offsets relative to the element, not absolute page coordinates,
    // so the knob's own location must not be added to the distance.
    action.dragAndDropBy(element, movex, 0).perform();
    // action.dragAndDrop(element, newelement).perform();
    pagesource = driver.getPageSource();
}

// Refresh the cookies from the browser session.
Set<org.openqa.selenium.Cookie> cookies = driver.manage().getCookies();
// NOTE(review): the unqualified Cookie is presumably HtmlUnit's util.Cookie; confirm the import.
Set<Cookie> cookies2 = new HashSet<>();
for (org.openqa.selenium.Cookie cookie : cookies) {
    cookies2.add(new Cookie(cookie.getDomain(), cookie.getName(), cookie.getValue(),
            cookie.getPath(), cookie.getExpiry(), true));
}
for (Cookie cookie : cookies2) {
    // NOTE(review): the converted cookie is not stored anywhere yet, as in the original listing.
    org.apache.http.cookie.Cookie httpclient = cookie.toHttpClient();
}
System.out.println(pagesource);
这样提交的表单确实带有轨迹。这里的移动位置我先写了个固定值,实际可以通过上面还原出的图片,再配合一些开源的图片识别工具识别出来。这样应该就能解决这个滑动验证码了。
以上就是java中关于selenium处理极验滑动验证码的示例的详细内容。