新浪新闻采集程序

利用正则表达式提取内容信息
<?php
// Sina finance news collector, step 1: download today's "top" list and
// reduce it to a flat array of article URLs in $article.
// error_reporting(0);

/**
 * Turn one raw `"url":"..."` fragment of the list response into a bare URL.
 *
 * Backslash escaping (e.g. `\/`) is removed first, then the `"url":"` key prefix.
 *
 * @param string $string Raw fragment as captured from the list response.
 * @return string The URL without key prefix or backslash escapes.
 */
function onlyurl($string)
{
    $string = stripslashes($string);
    $string = str_replace('"url":"', "", $string);
    return $string;
}

// NOTE: gettitle() is deliberately not declared in this block. The helper
// block later in this file declares it, and declaring it twice is a fatal
// "Cannot redeclare" error.

// NOTE(review): "ymd" yields a two-digit year (e.g. 240131). Confirm that the
// endpoint does not expect a four-digit year ("Ymd") in top_time.
$date = date("ymd");
$url = "http://top.finance.sina.com.cn/ws/gettopdatalist.php?top_type=day&top_cat=all&top_time=" . $date . "&top_show_num=20&top_order=asc";
$doc = file_get_contents($url);

// Serve the page as GB2312 (original note: unicode -> gb2312).
header("content-type:text/html;charset=gb2312");

// Collect every `"url":"...` fragment. `[^"]*` stops at the closing quote, so
// each article produces its own match even when the whole response is on one
// line. No /u flag: the payload is not guaranteed to be valid UTF-8, and PCRE
// refuses to match invalid UTF-8 in /u mode.
$article = array();
if ($doc !== false && preg_match_all('/"url":"[^"]*/', $doc, $matches)) {
    $article = $matches[0];
}

// Strip the key prefix and escaping from every fragment, in place.
foreach ($article as &$value) {
    $value = onlyurl($value);
}
// Break the reference so a later write to $value cannot clobber the last URL.
unset($value);
?>
<?php
// Declared conditionally: another block in this file may already declare
// gettitle(), and a second unconditional declaration is a fatal error.
if (!function_exists('gettitle')) {
    /**
     * Fetch an article page and return the text of its second <h1 ...> element.
     *
     * Only headings written as `<h1 ` followed by attributes and closed on the
     * same line are considered. The second such heading is used.
     * NOTE(review): presumably the first <h1> on the target pages is not the
     * article title — confirm against a live page.
     *
     * @param string $url Address (or path) of the article page.
     * @return string|null Heading text, or null if the page could not be read
     *                     or has no second matching heading.
     */
    function gettitle($url)
    {
        // Load the page; without this there is nothing to search.
        $doc = file_get_contents($url);
        if ($doc === false) {
            return null;
        }

        // Lazy quantifier so two headings on one line stay two separate matches.
        if (!preg_match_all('/<h1 (.*?)<\/h1>/', $doc, $result) || !isset($result[0][1])) {
            return null;
        }

        // Cut away the surrounding <h1 ...> and </h1> tags.
        if (!preg_match('/>(.*)</', $result[0][1], $title)) {
            return null;
        }

        return $title[1];
    }
}

/**
 * Fetch an article page and echo its body text.
 *
 * The body is the span from the first `publish_helper` marker to the last
 * `publish_helper_end` marker. Inside that span, everything from the start up
 * to the last `</blockquote>` (the link section) is removed before output.
 * Nothing is echoed when the page cannot be read or the markers are absent.
 *
 * @param string $url Address (or path) of the article page.
 * @return void
 */
function getcontent($url)
{
    // Open the page.
    $doc = file_get_contents($url);
    if ($doc === false) {
        return;
    }

    // Extract the article body between the publish_helper markers.
    if (!preg_match('/publish_helper(.*)publish_helper_end/xs', $doc, $result)) {
        return;
    }
    $content = $result[0];

    // Remove the leading link section, if the body has one.
    if (preg_match('/publish_helper(.*)<\/blockquote>/xs', $content, $out)) {
        $content = str_replace($out[0], "", $content);
    }

    echo $content;
}
?>

新浪新闻采集程序

推荐信息