您好,欢迎访问一九零五行业门户网

抓取网页内容类

网页
url );
return $pagecontent;
  }
/**
  * Extract the list section of a page using the configured markers.
  *
  * The regex-based getcontent() is tried first; when it yields nothing the
  * method falls back to cut() on the same page source.
  *
  * @param string $content Raw page source.
  * @return mixed The text between $this->startflag and $this->endflag, or
  *               whatever cut() returns when the regex finds nothing.
  */
function getcontentpiece ( $content )
  {
   $piece = $this->getcontent( $content, $this->startflag, $this->endflag );
   if ( !$piece )
   {
    // Fall back on the ORIGINAL page source. The previous code overwrote
    // $content with the empty regex result before calling cut(), so the
    // fallback could never find anything.
    $piece = $this->cut( $content, $this->startflag, $this->endflag );
   }
   return $piece;
  }
/**
  * Return the part(s) of a string that lie between two markers.
  *
  * Both markers are passed through decode() (defined outside this class),
  * regex-escaped, and then loosened so that a space in a marker matches any
  * whitespace character and a CRLF matches any control character. Every
  * match in the source is collected and the matches are concatenated.
  *
  * @param string $sourcestr Source text to search.
  * @param string $startstr  Start marker, in the form decode() expects.
  * @param string $endstart  End marker, in the form decode() expects.
  * @return string Concatenation of all matched parts; '' when nothing matches.
  */
function getcontent ( $sourcestr, $startstr, $endstart )
  {
   // '@' is the pattern delimiter used below, so preg_quote() must escape it
   // too; otherwise a marker containing '@' produces an invalid pattern.
   $s = preg_quote( decode( $startstr ), '@' );
   $e = preg_quote( decode( $endstart ), '@' );

   // NOTE(review): the quote characters of every string literal in this
   // method were lost in the copy this was restored from. The first search
   // string is assumed to be a single space — confirm against the original.
   $s = str_replace( ' ', '[[:space:]]', $s );
   $e = str_replace( ' ', '[[:space:]]', $e );
   $s = str_replace( "\r\n", '[[:cntrl:]]', $s );
   $e = str_replace( "\r\n", '[[:cntrl:]]', $e );

   // preg_match_all() returns false on a pattern error and 0 on no match.
   if ( !preg_match_all( '@' . $s . '(.*?)' . $e . '@is', $sourcestr, $tpl ) )
   {
    return '';
   }
   return implode( '', $tpl[1] );
  }
/**
  * Delegate to the global cut() helper after running both markers through
  * decode(). Both helpers are defined outside this class.
  *
  * @param string $sourcestr Text to cut from.
  * @param string $startstr  Start marker, in the form decode() expects.
  * @param string $endstr    End marker, in the form decode() expects.
  * @return mixed Whatever the global cut() returns (presumably the text
  *               between the two markers — confirm against its definition).
  */
function cut ( $sourcestr, $startstr, $endstr )
  {
   $from = decode( $startstr );
   $to = decode( $endstr );

   return cut( $sourcestr, $from, $to );
  }
/**
  * Reduce the list section of a page to its link entries.
  *
  * Collects the anchor elements found in $slist and hands them to getlist().
  * When no anchor is found, getsourcelistextend() is used instead.
  *
  * @param string $slist Source of the list section.
  * @return array Result of getlist() or getsourcelistextend().
  */
function getsourcelist ( $slist )
  {
   // NOTE(review): the pattern literal was lost in the copy this was restored
   // from (only the delimiters and the "i" flag survived). It is rebuilt here
   // as "one complete <a ...>...</a> element", which is what getlist()
   // consumes (it reads href= and the text before the closing tag) — confirm
   // against the original.
   preg_match_all( '/<a\b[^>]*>.*?<\/a>/i', $slist, $found );
   $anchors = $found[0];

   if ( !$anchors || !is_array( $anchors ) )
   {
    return $this->getsourcelistextend( $slist );
   }
   return $this->getlist( $anchors );
  }
/**
  * Fallback for getsourcelist(): split the list section into anchor
  * fragments by plain string splitting and pass them to getlistextend().
  *
  * @param string $slist Source of the list section.
  * @return array Result of getlistextend().
  */
function getsourcelistextend($slist)
  {
   // NOTE(review): the loop header and both explode() delimiters were lost in
   // the copy this was restored from. They are assumed to be the closing and
   // opening anchor tags ('</a>' and '<a') — confirm against the original.
   $list = array();
   foreach ( explode( '</a>', $slist ) as $chunk )
   {
    $parts = explode( '<a', $chunk );
    // A chunk without an opening tag has no second part; skip it instead of
    // pushing an undefined index into the list.
    if ( isset( $parts[1] ) )
    {
     $list[] = $parts[1];
    }
   }
   return $this->getlistextend( $list );
  }
/**
  * Turn a list of anchor snippets into title/link pairs.
  *
  * For each snippet the title is the text in front of the closing tag and
  * the link is the href value. A link that starts with "/" is prefixed with
  * the scheme and host of $this->url. When no pair can be extracted at all,
  * the work is handed to getlistextend().
  *
  * @param array $list Anchor snippets.
  * @return array Entries of the form array( 0 => title, 1 => link ), keyed by
  *               the index of the snippet they came from; or the result of
  *               getlistextend() when nothing was extracted.
  */
function getlist ( $list )
  {
   $listcontent = array();
   // scheme://host[:port] of the page, resolved on first use
   $origin = null;

   foreach ( $list as $i => $item )
   {
    if ( !is_string( $item ) )
    {
     continue;
    }
    // title
    // NOTE(review): the tail of this pattern was lost in the copy this was
    // restored from; the closing anchor tag is assumed.
    if ( !preg_match( '/>(.*?)<\/a>/i', $item, $templ ) )
    {
     continue;
    }
    // link
    // The previous pattern ended in a lazy group followed by an optional,
    // possibly empty quote alternative, so the captured link was always ''.
    // The value is now read up to the matching quote, whitespace or '>'.
    if ( !preg_match( '/href\s*=\s*(["\']?)([^"\'\s>]+)\1/i', $item, $tempc ) )
    {
     continue;
    }
    $title = $templ[1];
    $link = $tempc[2];
    // only keep complete entries
    if ( empty( $title ) || empty( $link ) )
    {
     continue;
    }
    // strpos() returns false when there is no "/", and false == 0, so the
    // comparison must be strict to single out root-relative links.
    if ( strpos( $link, '/' ) === 0 )
    {
     if ( $origin === null )
     {
      $origin = '';
      $parts = parse_url( $this->url );
      if ( is_array( $parts ) && isset( $parts['scheme'], $parts['host'] ) )
      {
       $origin = $parts['scheme'] . '://' . $parts['host'];
       if ( isset( $parts['port'] ) )
       {
        $origin .= ':' . $parts['port'];
       }
      }
     }
     $link = $origin . $link;
    }
    $listcontent[$i] = array( $title, $link );
   }

   if ( !$listcontent )
   {
    return $this->getlistextend( $list );
   }
   return $listcontent;
  }
  function getlistextend ( $list )
  {                 
                                                     $list=str_replace(\,,$list);
                                                     $list=str_replace(',,$list);
                                                     $list=str_replace(=,,$list);
   for ( $i = 0; $i    {
    //content
    $temp_link=$this->cut($list[$i],href, );
                                                                       echo $temp_link.
;
    //title
                                                                       if(eregi(>,$list[$i])){
                                                                              $temp_title=substr(strrchr($list[$i], >), 1 );
                                                                              $temp_title=preg_replace( @\@is,,$temp_title);
                                                                              $temp_title=str_replace( >,,$temp_title);
                                                                              $temp_title=str_replace(                                                                               if(!$temp_title)   $temp_title=$list[$i] ;
                                                                               $temp_title=preg_replace( @\@is,,$temp_title);
                                                                               $temp_title=str_replace( >,,$temp_title);
                                                                               $temp_title=str_replace(                                                                                 echo $temp_title.
;
                                                                       }else{
                                                                             $temp_title=$list[$i];      
                                                                             $temp_title=preg_replace( @\@is,,$temp_title);
                                                                              $temp_title=str_replace( >,,$temp_title);
                                                                              $temp_title=str_replace(                                                                               echo $temp_title.
;
                                                                       }
    //获取的数据正确
    if( !empty( $temp_link ) && !empty( $temp_title) )
    {
     if( 0 == strpos( $tempc[2][0], / ) )
     {
      preg_match( @http://(.*?)/@i, $this->url, $url );
      $temp_link = substr( $url[0], 0, strlen( $url[0] ) - 1 ) . $temp_link;
     }
$listcontent[$i][0] = trim($temp_title);
     $listcontent[$i][1] = $temp_link;
    }
                                                     }
   return $listcontent;
                                   }
/**
  * Collect the image paths referenced in a piece of HTML.
  *
  * Every quoted src="..." / src='...' value is collected; duplicates are
  * dropped and the result is re-indexed from 0 so that it can be walked with
  * a plain counting loop.
  *
  * @param string $content HTML to scan.
  * @return array List of distinct src values in order of first appearance.
  */
function getimagelist ( $content )
  {
   // \1 ties the closing quote to the opening one, so a value such as
   // src="it's.jpg" is not cut short at the apostrophe.
   if ( !preg_match_all( '/src=(["\'])(.*?)\1/i', $content, $temp ) )
   {
    return array();
   }
   // array_unique() keeps the original keys and so leaves gaps; re-index.
   return array_values( array_unique( $temp[2] ) );
  }
/**
  * Rewrite image paths in the content after the images were downloaded.
  *
  * Each entry of $this->imagelist is replaced by $this->imageurl followed by
  * the stored file name with the same index in $this->filename. Entries
  * without a stored file name are replaced by the site's placeholder image.
  *
  * @param string $content Page content whose image paths are to be replaced.
  * @return string Content with the paths replaced.
  */
function replaceimageparh ( $content )
  {
   foreach ( (array) $this->imagelist as $i => $original )
   {
    if ( !empty( $this->filename[$i] ) )
    {
     $replacement = $this->imageurl . $this->filename[$i];
    }
    else
    {
     // The superglobal is case-sensitive: the previous "$globals" was an
     // undefined local variable, so the placeholder path had no web path.
     $replacement = $GLOBALS['set']['webpath'] . 'images/nopic.gif';
    }
    $content = str_replace( $original, $replacement, $content );
   }
   return $content;
  }
/**
  * Download a list of images through savefile().
  *
  * @param array $imageurl Image paths to download.
  * @return array Stored file names keyed by the index of the image they
  *               belong to; images that could not be stored have no entry.
  */
function saveimage ( $imageurl )
  {
   // Always return an array, even when nothing could be stored.
   $filename = array();
   foreach ( (array) $imageurl as $i => $source )
   {
    $fname = $this->savefile( $source );
    if ( !empty( $fname ) )
    {
     $filename[$i] = $fname;
    }
   }
   return $filename;
  }
/**
  * Download one image and store it under $this->saveimagepath.
  *
  * Only .jpg, .gif and .swf files are accepted. Paths without a scheme are
  * resolved against $this->url. Downloads of 3072 bytes or less are
  * discarded, as before.
  *
  * @param string $filename Image path as found in the page.
  * @return string|null Generated file name on success, null otherwise.
  */
function savefile( $filename )
  {
   $s_filename = basename( $filename );
   $ext_name = strtolower( (string) strrchr( $s_filename, '.' ) );
   // The previous test compared ( '.jpg' && '.gif' && '.swf' ), i.e. true,
   // with the extension, so it never rejected anything and a remote .php
   // file could be written into the image directory.
   // NOTE(review): .png / .jpeg used to slip through; add them here if they
   // should remain downloadable.
   if ( !in_array( $ext_name, array( '.jpg', '.gif', '.swf' ), true ) )
   {
    return;
   }

   if ( preg_match( '@^https?://@i', $filename ) )
   {
    $source = $filename;
   }
   else
   {
    $page = parse_url( $this->url );
    if ( !is_array( $page ) || empty( $page['scheme'] ) || empty( $page['host'] ) )
    {
     return;
    }
    $origin = $page['scheme'] . '://' . $page['host'];
    if ( isset( $page['port'] ) )
    {
     $origin .= ':' . $page['port'];
    }
    if ( strpos( $filename, '/' ) === 0 )
    {
     // root-relative path
     $source = $origin . $filename;
    }
    else
    {
     // Relative path: resolve against the directory of the page. The old
     // code cut $this->url at an offset taken from $filename, and passed
     // bare relative paths to file_get_contents() as local files.
     $path = isset( $page['path'] ) ? $page['path'] : '/';
     $slash = strrpos( $path, '/' );
     $dir = ( $slash === false ) ? '/' : substr( $path, 0, $slash + 1 );
     $source = $origin . $dir . $filename;
    }
   }

   $contents = @file_get_contents( $source );
   // Same size threshold as before, checked before anything is written.
   if ( $contents === false || strlen( $contents ) <= 3072 )
   {
    return;
   }

   $s_filename = time() . rand( 1000, 9999 ) . $ext_name;
   $target = $this->saveimagepath . $s_filename;
   $handle = @fopen( $target, 'wb' );
   if ( !$handle )
   {
    return;
   }
   $written = fwrite( $handle, $contents );
   fclose( $handle );
   if ( $written !== strlen( $contents ) )
   {
    // incomplete write: do not leave a truncated file behind
    @unlink( $target );
    return;
   }
   return $s_filename;
  }
/**
  * Turn image paths into absolute URLs (used when images are not downloaded).
  *
  * Paths are resolved against $this->url: "/x" against the host, "./x",
  * "../x" and bare relative paths against the directory of the page. Paths
  * that already carry a scheme are kept unchanged. Dot segments that appear
  * after the leading "./" or "../" run are not normalised.
  *
  * @param array $imageurl Image paths as found in the page.
  * @return array Absolute URLs with the same keys as $imageurl.
  */
function topath( $imageurl )
  {
   $filename = array();
   $patharray = parse_url( $this->url );
   if ( !is_array( $patharray ) || !isset( $patharray['scheme'], $patharray['host'] ) )
   {
    return $filename;
   }
   $scheme = $patharray['scheme'];
   $webpath = $scheme . '://' . $patharray['host'];
   if ( isset( $patharray['port'] ) )
   {
    $webpath .= ':' . $patharray['port'];
   }
   // Directory of the page, always ending in "/". The previous code used the
   // full page path (file name included) as the base for "./" paths.
   $filepath = isset( $patharray['path'] ) ? $patharray['path'] : '/';
   $slash = strrpos( $filepath, '/' );
   $basedir = ( $slash === false ) ? '/' : substr( $filepath, 0, $slash + 1 );

   foreach ( (array) $imageurl as $i => $src )
   {
    $src = (string) $src;
    if ( preg_match( '@^[a-z][a-z0-9+.\-]*:@i', $src ) )
    {
     // already absolute (http:, https:, data:, ...)
     $filename[$i] = $src;
     continue;
    }
    if ( substr( $src, 0, 2 ) == '//' )
    {
     // protocol-relative
     $filename[$i] = $scheme . ':' . $src;
     continue;
    }
    if ( substr( $src, 0, 1 ) == '/' )
    {
     $filename[$i] = $webpath . $src;
     continue;
    }
    // Relative path: consume leading "./" and "../" segments. The previous
    // code used strrchr(), which returns a string, where an offset was
    // needed, and silently dropped bare relative paths.
    $dir = $basedir;
    while ( true )
    {
     if ( substr( $src, 0, 2 ) == './' )
     {
      $src = (string) substr( $src, 2 );
     }
     elseif ( substr( $src, 0, 3 ) == '../' )
     {
      $src = (string) substr( $src, 3 );
      $trimmed = rtrim( $dir, '/' );
      $cut = strrpos( $trimmed, '/' );
      $dir = ( $cut === false ) ? '/' : substr( $trimmed, 0, $cut + 1 );
     }
     else
     {
      break;
     }
    }
    $filename[$i] = $webpath . $dir . $src;
   }
   return $filename;
  }
  /**
  * 不下载图片时将页面中的路径替换成新的路径
  * @param $content  需要替换路径的页面内容
  * @return string   替换后的页面内容
  */
                                  function imgpathreplace( $content )
                                  {
   for ( $i = 0; $i imagelist ); $i++ )
   {
    $content = str_replace( $this->imagelist[$i], $this->filename[$i], $content );
   }
return $content;               
                                  }
/**
  * Set the URL of the page to work on (stored in $this->url).
  *
  * @param mixed $u Page URL.
  * @return bool Always true.
  */
function seturl ( $u )
  {
   $this->url = $u;

   return true;
  }
/**
  * Set the marker that opens the section to extract
  * (stored in $this->startflag).
  *
  * @param mixed $s Start marker.
  * @return bool Always true.
  */
function setstartflag ( $s )
  {
   $this->startflag = $s;

   return true;
  }
/**
  * Set the marker that closes the section to extract
  * (stored in $this->endflag).
  *
  * @param mixed $e End marker.
  * @return bool Always true.
  */
function setendflag ( $e )
  {
   $this->endflag = $e;

   return true;
  }
/**
  * Set the path prefix under which downloaded images are written
  * (stored in $this->saveimagepath).
  *
  * @param mixed $p Path prefix; the generated file name is appended to it.
  * @return bool Always true.
  */
function setsaveimagepath ( $p )
  {
   $this->saveimagepath = $p;

   return true;
  }
/**
  * Set the URL prefix put in front of stored image file names when paths
  * are rewritten (stored in $this->imageurl).
  *
  * @param mixed $i URL prefix.
  * @return bool Always true.
  */
function setimageurl ( $i )
  {
   $this->imageurl = $i;

   return true;
  }
}
?>
其它类似信息

推荐信息