您好,欢迎访问一九零五行业门户网

自定义HTTP抓包和过滤

定义一个 HTTP 抓包类,发送数据到一个自定义的接收脚本时,可以发送成功并收到数据;但是发送到外网却不行。分析过在浏览器下发送 HTTP 请求时的 request header 信息,并模拟了同样的请求,但仍然超时...
//定义一个http抓包类,其实也可以用curl。。。。。hostinfo=parse_url($url); $this->setrequestheader(array('host' => $this->hostinfo['host'])); $this->setrequestheader(array('connection' => 'keep-alive')); } //设置http请求行信息,例如: get /resources http/1.1 //但为了避免漏掉url中?开始的查询信息,有必要进行判断 public function setrequestline($method) { //如果是post请求,则自动添加content-type头信息 if(strtolower($method)=='post') { $this->setrequestheader(array('content-type' => 'application/x-www-form-urlencoded')); } if(!empty($this->hostinfo['query'])) { $this->requestline=strtoupper($method). .$this->hostinfo['path'].?.$this->hostinfo['query']. http/1.1 \r\n; } else { $this->requestline=strtoupper($method). .$this->hostinfo['path']. http/1.1 \r\n; } } //设置http请求头。 //接收参数是数组类型,通过迭代拼接key:value,并换行 public function setrequestheader($header) { foreach($header as $key => $value) { $this->requestheader .=$key.:.$value.\r\n; } } //设置http请求体 //接收参数是数组类型,通过迭代拼接key=value,因为最后一席拼接会有一个多余的&,所以有必要去掉 public function setrequestbody($body) { foreach($body as $key => $value) { $this->requestbody .=$key.'='.$value.'&'; } $offset=strrpos($this->requestbody, '&'); $this->requestbody=substr($this->requestbody, 0, $offset); } //组装 请求行+请求头+请求体,并根据请求体的长度,自动填充请求头的content-length字段 public function setrequestentity() { if(!empty($this->requestbody)) { $contentlength=strlen($this->requestbody); $this->setrequestheader(array('content-length' => $contentlength)); $this->requestentity=$this->requestline.$this->requestheader.\r\n.$this->requestbody; } else { $this->requestentity=$this->requestline.$this->requestheader.\r\n; } } //解析主机名的函数,暂时没有用上....... public function parsehost($url) { $pat='#http://([^/]+)#i'; if(preg_match($pat, $url, $match)) { return $match[1]; } else { echo '匹配主机信息失败
'; } } //创建到主机的连接 public function createconnect() { $this->connect=fsockopen($this->hostinfo['host'], 80, $this->errno, $this->errstr) or die('连接主机失败'.$this->errstr); } //发送请求 public function sendrequest() { $this->setrequestentity(); echo $this->requestentity; exit(); $this->createconnect(); $entitylength=strlen($this->requestentity); if($entitylength != fwrite($this->connect, $this->requestentity, $entitylength)) { die('写入数据失败
'); } else { $this->receiveresponse(); } } //接受请求,并依次拼接响应体 public function receiveresponse() { while(!feof($this->connect)) { $this->responseentity .= fread($this->connect, 1024); } } //计算响应头与响应体之间的空行的位置 public function calculateemptylinepos() { $this->emptylinepos=strpos($this->responseentity,\r\n\r\n,0); } //接受响应体的头部.... public function receiveresponseheader() { $this->calculateemptylinepos(); $this->responseheader=substr($this->responseentity, 0, $this->emptylinepos); echo $this->responseheader; } //接收响应体的body部分 public function receiveresponsebody() { $this->calculateemptylinepos(); $this->responsebody=substr($this->responseentity, $this->emptylinepos); } //返回请求结果 public function getresponse() { return $this->responseentity; } public function parseresponse() {} public function __destruct() { //fclose($this->connect); } }set_time_limit(60);$http=new httpwrap(http://www.mmkao.com/beautyleg/);//设置http请求行$http->setrequestline(get);//设置http头$http->setrequestheader(array(accept => text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8));$http->setrequestheader(array(accept-language => zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3));$http->setrequestheader(array(accept-encoding => gzip, deflate));$http->setrequestheader(array(user-agent => mozilla/5.0 (windows nt 6.1) applewebkit/537.36 (khtml, like gecko) chrome/38.0.2125.101 safari/537.36));//$http->setrequestheader(array(cookie => baidu_dup_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputt=2363; safedog-flow-item=8471ba510da33350ed344ac374d3044a; bdshare_firstime=1415165097782; cscpvrich_fidx=6; ajstat_ok_pages=2; ajstat_ok_times=2; 
cnzzdata3811623=cnzz_eid%3d253823549-1415164312-http%253a%252f%252fwww.baidu.com%252f%26ntime%3d1415169712));//发送数据$http->sendrequest();//$http->receiveresponseheader();?>通过这个类给另一个自定义的脚本,可以发送和接收数据,另一个脚本如下:但是给这个网站发送请求时,却超时:网站是:http://www.mmkao.com/beautyleg/通过chrome给这个网站首页发送请求时的header头信息:accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8accept-encoding:gzip,deflate,sdchaccept-language:zh,en;q=0.8,zh-tw;q=0.6,zh-cn;q=0.4,ja;q=0.2cache-control:max-age=0connection:keep-alivecookie:baidu_dup_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputt=2363; safedog-flow-item=8471ba510da33350ed344ac374d3044a; bdshare_firstime=1415165097782; cscpvrich_fidx=7; ajstat_ok_pages=3; ajstat_ok_times=2; cnzzdata3811623=cnzz_eid%3d253823549-1415164312-http%253a%252f%252fwww.baidu.com%252f%26ntime%3d1415169712dnt:1host:www.mmkao.comuser-agent:mozilla/5.0 (windows nt 6.1) applewebkit/537.36 (khtml, like gecko) chrome/38.0.2125.101 safari/537.36response headersview source//通过相同的包装,并调用httpwrap发送请求时,却提示超时,实在不知道哪里出问题........针对这个网站写了一个过滤出图片链接的类:responsebody=$body; } //匹配图片src开始的链接地址 public function feedimage() { $pat='#imglink[]=$link; } } else { echo '匹配失败图片链接地址失败'.
; } } //提取head部分 public function filterheader($body) { $pat='#[\s\s]+#im'; if(preg_match($pat, $body, $match)) { $this->header=$match[0]; } else { echo '匹配head部分失败'.
; } } //提取body部分 public function filterbody($body) { $pat='#[\s\s]+#im'; if(preg_match($pat, $body, $match)) { $this->body=$match[0]; } else { echo '匹配body部分失败'.
; } } //提取分页信息,这个只能针对性的匹配,不能通用 public function rollpage($body) { $pat='#[\x{4e00}-\x{9fa5}]+\s*\d\s+?/\s+?\d+\s*[\x{4e00}-\x{9fa5}]*#ui'; if(preg_match($pat, $body, $match)) { $patnum='#/\s*(\d\d*)#'; if(preg_match($patnum, $match[0], $num)) { $this->pagenum=$num[1]; } else { echo '提取分页具体值失败'.
; } } else { echo '提取分页统计失败'.
; } }?>附注: 这两个类都通过了内网的测试,并成功过滤出图片的链接,但是给 http://www.mmkao.com/beautyleg/ 发送请求时,却提示超时,不知道哪里出了问题。
回复讨论(解决方案) 在 Windows 的命令行下提交请求,是可以收到数据的......
可以收到数据
$url = 'http://www.mmkao.com/beautyleg/';print_r(get_headers($url));


array( [0] => http/1.1 200 ok [1] => connection: close [2] => date: wed, 05 nov 2014 08:53:09 gmt [3] => content-length: 13889 [4] => content-type: text/html [5] => content-location: http://www.mmkao.com/beautyleg/index.html [6] => last-modified: wed, 05 nov 2014 05:39:09 gmt [7] => accept-ranges: bytes [8] => etag: e8939ad2baf8cf1:693 [9] => server: iis [10] => x-powered-by: waf/2.0 [11] => set-cookie: safedog-flow-item=8471ba510da33350ed344ac374d3044a; expires=sat, 12-dec-2150 10:26:25 gmt; domain=mmkao.com; path=/)


$url = 'http://www.mmkao.com/beautyleg/';print_r(get_headers($url));


array( [0] => http/1.1 200 ok [1] => connection: close [2] => date: wed, 05 nov 2014 08:53:09 gmt [3] => content-length: 13889 [4] => content-type: text/html [5] => content-location: http://www.mmkao.com/beautyleg/index.html [6] => last-modified: wed, 05 nov 2014 05:39:09 gmt [7] => accept-ranges: bytes [8] => etag: e8939ad2baf8cf1:693 [9] => server: iis [10] => x-powered-by: waf/2.0 [11] => set-cookie: safedog-flow-item=8471ba510da33350ed344ac374d3044a; expires=sat, 12-dec-2150 10:26:25 gmt; domain=mmkao.com; path=/)


这样确实可以。蛋疼,我再看看
get /beautyleg/ http/1.1
host:www.mmkao.com
connection:keep-alive
accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
accept-language:zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
accept-encoding:gzip, deflate
http/1.1 200 okdate: wed, 05 nov 2014 09:34:02 gmtcontent-length: 13889content-type: text/htmlcontent-location: http://www.mmkao.com/beautyleg/index.htmllast-modified: wed, 05 nov 2014 05:39:09 gmtaccept-ranges: bytesetag: e8939ad2baf8cf1:693server: iisx-powered-by: waf/2.0set-cookie: safedog-flow-item=bfc86ea38c3e0337fb45dce403130335; expires=sat, 12-dec-2150 11:07:18 gmt; domain=mmkao.com; path=/beautyleg_咪咪图秀............................ 共 1035 组,每页 30 组,当前 1 / 35 页。首页 上一页 下一页 尾页。 转到第
咪咪图秀 www.mmkao.com

这个截图是我通过 httpwrap 发送的请求头信息,复制到 telnet 上也能完整获取网页
$url = 'http://www.mmkao.com/beautyleg/';print_r(get_headers($url));


array( [0] => http/1.1 200 ok [1] => connection: close [2] => date: wed, 05 nov 2014 08:53:09 gmt [3] => content-length: 13889 [4] => content-type: text/html [5] => content-location: http://www.mmkao.com/beautyleg/index.html [6] => last-modified: wed, 05 nov 2014 05:39:09 gmt [7] => accept-ranges: bytes [8] => etag: e8939ad2baf8cf1:693 [9] => server: iis [10] => x-powered-by: waf/2.0 [11] => set-cookie: safedog-flow-item=8471ba510da33350ed344ac374d3044a; expires=sat, 12-dec-2150 10:26:25 gmt; domain=mmkao.com; path=/)


这个问题遗留了好久,今天花时间解决了,而且可以整站采集。
http://blog.csdn.net/free_program_1314/article/details/41780835
其它类似信息

推荐信息