您好,欢迎访问一九零五行业门户网

php文章内容抓取

求大神帮忙抓取这个网页 http://sports.sohu.com/zhongchao.shtml 的排行榜部分的数据(包括积分榜和射手榜)。
回复讨论(解决方案) 抓取 研究研究 phpquery
$url = 'http://sports.sohu.com/zhongchao.shtml';$s = file_get_contents($url);preg_match_all('/(? 名次球队场次积分
01 广州恒大 20 45
02 北京国安 ......

接下来自己做
可以使用preg_match去抓取对应的html代码然后再正则过滤你想要的数据即可。
给你推荐个类 simple_html_dom
// Scrape the ranking widgets from the Sohu CSL page with the third-party
// simple_html_dom parser and print their raw HTML.

// The parser class is mandatory for everything below, so fail immediately
// if the file is missing instead of continuing past a failed include.
require_once 'simple_html_dom.class.php';

$url = 'http://sports.sohu.com/zhongchao.shtml';

// file_get_contents() returns false on failure; do not feed that to the parser.
$pageSource = file_get_contents($url);
if ($pageSource === false) {
    throw new RuntimeException("Unable to download {$url}");
}

// NOTE(review): the markup is echoed in the page's own encoding; a later reply
// in this thread says the page is GB2312, so convert it if UTF-8 output is needed.
$dom = new simple_html_dom();

// Query the parser object itself rather than the return value of load(),
// which is not guaranteed to be the parser in every release of the library.
$dom->load($pageSource);
$res = $dom->find('div#turnidb div.turn');

// Standings table (first matched block).
if (isset($res[0])) {
    echo $res[0]->outertext;
}

// Top scorers table (second matched block).
if (isset($res[1])) {
    echo $res[1]->outertext;
}
结果
$str=file_get_contents(http://sports.sohu.com/zhongchao.shtml);preg_match_all('/\s*(.+?)\s*(.+?)\s*(\d+)\s*(.+?)\s*/i',$str,$match1);foreach($match1 as $k=>$v){ if($k!=0){ foreach($v as $k1=>$v1){ if($k1 array ( [0] => 01 [1] => 02 [2] => 03 [3] => 04 [4] => 05 [5] => 06 [6] => 07 [7] => 08 [8] => 09 [9] => 10 [10] => 11 [11] => 12 [12] => 13 [13] => 14 [14] => 15 [15] => 16 ) [2] => array ( [0] => 广州恒大 [1] => 北京国安 [2] => 广州富力 [3] => 上海东亚 [4] => 贵州茅台 [5] => 山东鲁能 [6] => 天津泰达 [7] => 江苏舜天 [8] => 上海绿地 [9] => 长春亚泰 [10] => 杭州绿城 [11] => 大连阿尔滨 [12] => 上海申鑫 [13] => 河南建业 [14] => 辽宁宏运 [15] => 哈尔滨毅腾 ) [3] => array ( [0] => 20 [1] => 19 [2] => 19 [3] => 19 [4] => 19 [5] => 19 [6] => 19 [7] => 18 [8] => 20 [9] => 19 [10] => 19 [11] => 19 [12] => 19 [13] => 19 [14] => 19 [15] => 18 ) [4] => array ( [0] => 45 [1] => 41 [2] => 34 [3] => 31 [4] => 30 [5] => 28 [6] => 27 [7] => 25 [8] => 23 [9] => 21 [10] => 21 [11] => 20 [12] => 19 [13] => 17 [14] => 16 [15] => 12 ))array( [1] => array ( [0] => 01 [1] => 02 [2] => 03 [3] => 04 [4] => 04 [5] => 04 [6] => 04 [7] => 08 [8] => 09 [9] => 09 [10] => 09 [11] => 09 [12] => 09 [13] => 09 [14] => 15 [15] => 15 ) [2] => array ( [0] => 埃尔克森 [1] => 哈默德 [2] => 海森 [3] => 达维 [4] => 多利 [5] => 洛维 [6] => 拉蒙 [7] => 德扬 [8] => 巴塔拉 [9] => 布鲁诺 [10] => 里卡多 [11] => 武磊 [12] => 埃尼奥 [13] => 尤里 [14] => 莫雷诺 [15] => 雷内 ) [3] => array ( [0] => 17 [1] => 16 [2] => 13 [3] => 9 [4] => 9 [5] => 9 [6] => 9 [7] => 8 [8] => 7 [9] => 7 [10] => 7 [11] => 7 [12] => 7 [13] => 7 [14] => 6 [15] => 6 ) [4] => array ( [0] => 广州恒大 [1] => 广州富力 [2] => 上海东亚 [3] => 广州富力 [4] => 哈尔滨毅腾 [5] => 山东鲁能 [6] => 杭州绿城 [7] => 北京国安 [8] => 北京国安 [9] => 大连阿尔滨 [10] => 哈尔滨毅腾 [11] => 上海东亚 [12] => 长春亚泰 [13] => 贵州茅台 [14] => 上海绿地 [15] => 广州恒大 ))*/
后面的自己处理吧
$url = 'http://sports.sohu.com/zhongchao.shtml';$s = file_get_contents($url);preg_match_all('/(? 名次球队场次积分
01 广州恒大 20 45
02 北京国安 ......

接下来自己做
我输出出来的怎么是一个空数组?
Sohu 的页面编码是 GB2312,采集后需要转换为 UTF-8,否则会出现乱码。
echo '';$url = 'http://sports.sohu.com/zhongchao.shtml';$s = file_get_contents($url);$s = iconv('gbk','utf8', $s); // gb2312转utf8preg_match_all('/(?<=)\s/isu', $s, $m);// 获取积分榜preg_match_all('/\s*(.+?)\s*(.+?)\s*(\d+)\s*(.+?)\s*/i',$m[0][2],$scores);$scoreboard = array();for($i=0,$len=count($scores[1]); $i 01 [1] => 广州恒大 [2] => 20 [3] => 45 ) [1] => array ( [0] => 02 [1] => 北京国安 [2] => 19 [3] => 41 ) [2] => array ( [0] => 03 [1] => 广州富力 [2] => 19 [3] => 34 ) [3] => array ( [0] => 04 [1] => 上海东亚 [2] => 19 [3] => 31 ) [4] => array ( [0] => 05 [1] => 贵州茅台 [2] => 19 [3] => 30 ) [5] => array ( [0] => 06 [1] => 山东鲁能 [2] => 19 [3] => 28 ) [6] => array ( [0] => 07 [1] => 天津泰达 [2] => 19 [3] => 27 ) [7] => array ( [0] => 08 [1] => 江苏舜天 [2] => 18 [3] => 25 ) [8] => array ( [0] => 09 [1] => 上海绿地 [2] => 20 [3] => 23 ) [9] => array ( [0] => 10 [1] => 长春亚泰 [2] => 19 [3] => 21 ) [10] => array ( [0] => 11 [1] => 杭州绿城 [2] => 19 [3] => 21 ) [11] => array ( [0] => 12 [1] => 大连阿尔滨 [2] => 19 [3] => 20 ) [12] => array ( [0] => 13 [1] => 上海申鑫 [2] => 19 [3] => 19 ) [13] => array ( [0] => 14 [1] => 河南建业 [2] => 19 [3] => 17 ) [14] => array ( [0] => 15 [1] => 辽宁宏运 [2] => 19 [3] => 16 ) [15] => array ( [0] => 16 [1] => 哈尔滨毅腾 [2] => 18 [3] => 12 ))
射手榜
array( [0] => array ( [0] => 01 [1] => 埃尔克森 [2] => 17 [3] => 广州恒大 ) [1] => array ( [0] => 02 [1] => 哈默德 [2] => 16 [3] => 广州富力 ) [2] => array ( [0] => 03 [1] => 海森 [2] => 13 [3] => 上海东亚 ) [3] => array ( [0] => 04 [1] => 达维 [2] => 9 [3] => 广州富力 ) [4] => array ( [0] => 04 [1] => 多利 [2] => 9 [3] => 哈尔滨毅腾 ) [5] => array ( [0] => 04 [1] => 洛维 [2] => 9 [3] => 山东鲁能 ) [6] => array ( [0] => 04 [1] => 拉蒙 [2] => 9 [3] => 杭州绿城 ) [7] => array ( [0] => 08 [1] => 德扬 [2] => 8 [3] => 北京国安 ) [8] => array ( [0] => 09 [1] => 巴塔拉 [2] => 7 [3] => 北京国安 ) [9] => array ( [0] => 09 [1] => 布鲁诺 [2] => 7 [3] => 大连阿尔滨 ) [10] => array ( [0] => 09 [1] => 里卡多 [2] => 7 [3] => 哈尔滨毅腾 ) [11] => array ( [0] => 09 [1] => 武磊 [2] => 7 [3] => 上海东亚 ) [12] => array ( [0] => 09 [1] => 埃尼奥 [2] => 7 [3] => 长春亚泰 ) [13] => array ( [0] => 09 [1] => 尤里 [2] => 7 [3] => 贵州茅台 ) [14] => array ( [0] => 15 [1] => 莫雷诺 [2] => 6 [3] => 上海绿地 ) [15] => array ( [0] => 15 [1] => 雷内 [2] => 6 [3] => 广州恒大 ))
其它类似信息

推荐信息