content , '>' , $istart )+1 ;
}
/**
 * Download a page and decide whether it looks like a question page.
 *
 * Stores the raw body in $this->content, its byte length in
 * $this->contentlen, and rewinds $this->curposition so the sequential
 * extractors of this class start from the top of the new page.
 *
 * NOTE(review): this listing lost its HTML-like tokens (the title needles were
 * reduced to empty strings and the comparison after strpos() was cut off).
 * The '<title' / '</title>' needles and the "marker missing => false" check
 * are reconstructed from the variable name $title and from the caller, which
 * parses the page only when this method returns true — confirm against the
 * original source.
 *
 * @param string $url Address of the page to download.
 * @return bool true when the page was fetched and its title contains
 *              '_百度知道' at byte offset 1 or later; false otherwise.
 */
function getcontent ( $url )
{
    $body = file_get_contents( $url ) ;
    if ( $body === false )
    {
        // Download failed: leave an empty, consistent state behind.
        $this->content = '' ;
        $this->contentlen = 0 ;
        $this->curposition = 0 ;
        return false ;
    }

    $this->content = $body ;
    $this->contentlen = strlen( $body ) ;
    // A cursor left over from the previous page could point past the end of this one.
    $this->curposition = 0 ;

    $found = strpos( $this->content , '<title' ) ;
    if ( $found === false )
    {
        return false ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = strpos( $this->content , '</title>' , $start ) ;
    if ( $end === false )
    {
        return false ;
    }

    // The original read $this->$end (a variable-property lookup); the local $end is meant.
    $title = substr( $this->content , $start , $end - $start ) ;

    // Offset 1 is kept from the original: the suffix must be preceded by at
    // least one byte. An empty title is rejected first because strpos() throws
    // on PHP 8 when the offset exceeds the haystack length.
    if ( $title === '' || strpos( $title , '_百度知道' , 1 ) === false )
    {
        return false ;
    }
    return true ;
}
/**
 * Extract the text of the page's <title> element and move the read cursor
 * ($this->curposition) to the start of its closing tag.
 *
 * NOTE(review): both needles were empty strings in this listing (the tags
 * were stripped); '<title' / '</title>' are reconstructed from the method
 * name — confirm against the original source.
 *
 * @return string|null The title text, or null when no complete title element
 *                     is found (the cursor is left untouched in that case).
 */
function gettitle()
{
    $found = strpos( $this->content , '<title' ) ;
    // strpos() reports "not found" as false; a match at offset 0 is still a match.
    if ( $found === false )
    {
        return null ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = strpos( $this->content , '</title>' , $start ) ;
    if ( $end === false )
    {
        return null ;
    }

    $this->curposition = $end ;
    return substr( $this->content , $start , $end - $start ) ;
}
/**
 * Extract the question title: the text of the first element at or after the
 * read cursor whose opening tag contains 'span class=question-title'.
 * On success the cursor ($this->curposition) moves to the closing tag.
 *
 * NOTE(review): the end needle was an empty string in this listing (the tag
 * was stripped); '</span>' is reconstructed from the 'span' in the start
 * marker — confirm against the original source.
 *
 * @return string|null The title text, or null when the element is missing or
 *                     unterminated (the cursor is left untouched then).
 */
function getqtitle()
{
    $found = strpos( $this->content , 'span class=question-title' , $this->curposition ) ;
    // strpos() reports "not found" as false, so compare against false explicitly.
    if ( $found === false )
    {
        return null ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = strpos( $this->content , '</span>' , $start ) ;
    if ( $end === false )
    {
        return null ;
    }

    $this->curposition = $end ;
    return substr( $this->content , $start , $end - $start ) ;
}
/**
 * Stub with no behaviour: it performs no work and yields null.
 *
 * NOTE(review): presumably reserved for extracting the question's
 * classification — TODO confirm the intent or remove.
 */
function getclassfly()
{
    // Intentionally left empty.
}
/**
 * Extract the question body: the text of the first element at or after the
 * read cursor whose opening tag contains 'pre id=question-content'.
 * On success the cursor ($this->curposition) moves to the closing tag.
 *
 * NOTE(review): the end needle was an empty string in this listing (the tag
 * was stripped); '</pre>' is reconstructed from the 'pre' in the start
 * marker — confirm against the original source.
 *
 * @return string|null The body text, or null when the element is missing or
 *                     unterminated (the cursor is left untouched then).
 */
function getqcontent()
{
    $found = strpos( $this->content , 'pre id=question-content' , $this->curposition ) ;
    // strpos() reports "not found" as false, so compare against false explicitly.
    if ( $found === false )
    {
        return null ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = strpos( $this->content , '</pre>' , $start ) ;
    if ( $end === false )
    {
        return null ;
    }

    $this->curposition = $end ;
    return substr( $this->content , $start , $end - $start ) ;
}
/**
 * Extract the question's supplementary note: the text of the first element at
 * or after the read cursor whose opening tag contains 'id=question-suply'.
 * On success the cursor ($this->curposition) moves to the closing tag.
 *
 * NOTE(review): the end needle was an empty string in this listing (the tag
 * was stripped) and the start marker does not name the tag. The closing tag
 * is therefore derived from the element that carries the marker rather than
 * guessed — confirm against the original source.
 *
 * @return string|null The note text, or null when the element is missing or
 *                     unterminated (the cursor is left untouched then).
 */
function getqsuply()
{
    $found = strpos( $this->content , 'id=question-suply' , $this->curposition ) ;
    // strpos() reports "not found" as false, so compare against false explicitly.
    if ( $found === false )
    {
        return null ;
    }

    // Walk back to the '<' that opens the element carrying the marker and
    // read its tag name, so the matching closing tag can be searched for.
    $open = strrpos( substr( $this->content , 0 , $found ) , '<' ) ;
    if ( $open === false )
    {
        return null ;
    }
    $tag = substr( $this->content , $open + 1 , strcspn( $this->content , " \t\r\n/>" , $open + 1 ) ) ;
    if ( $tag === '' )
    {
        return null ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = stripos( $this->content , '</' . $tag , $start ) ;
    if ( $end === false )
    {
        return null ;
    }

    $this->curposition = $end ;
    return substr( $this->content , $start , $end - $start ) ;
}
/**
 * Extract the next answer: the text of the first element at or after the read
 * cursor whose opening tag contains 'class=reply-text mb10'. On success the
 * cursor ($this->curposition) moves to the closing tag, so repeated calls walk
 * through the answers one by one until null is returned.
 *
 * NOTE(review): the end needle was an empty string in this listing (the tag
 * was stripped) and the start marker does not name the tag. The closing tag
 * is therefore derived from the element that carries the marker rather than
 * guessed — confirm against the original source.
 * NOTE(review): the marker holds a space but no quotes; verify it matches the
 * page markup literally.
 *
 * @return string|null The answer text, or null when no further complete
 *                     answer element exists (the cursor is left untouched then).
 */
function getanswer()
{
    $found = strpos( $this->content , 'class=reply-text mb10' , $this->curposition ) ;
    // strpos() reports "not found" as false, so compare against false explicitly.
    if ( $found === false )
    {
        return null ;
    }

    // Walk back to the '<' that opens the element carrying the marker and
    // read its tag name, so the matching closing tag can be searched for.
    $open = strrpos( substr( $this->content , 0 , $found ) , '<' ) ;
    if ( $open === false )
    {
        return null ;
    }
    $tag = substr( $this->content , $open + 1 , strcspn( $this->content , " \t\r\n/>" , $open + 1 ) ) ;
    if ( $tag === '' )
    {
        return null ;
    }

    // getstart() (only partly visible in this chunk) appears to return the
    // offset just past the next '>' — i.e. the first byte of the element text.
    $start = $this->getstart( $found ) ;
    $end = stripos( $this->content , '</' . $tag , $start ) ;
    if ( $end === false )
    {
        // Unterminated element: stop here instead of letting callers loop on it.
        return null ;
    }

    $this->curposition = $end ;
    return substr( $this->content , $start , $end - $start ) ;
}
}
// Crawl configuration and shared state for the script below.

// A value of 0 removes PHP's execution time limit for this run.
ini_set( 'max_execution_time' , '0' ) ;

$testspider = new spider() ;

// Range of question ids to visit.
$startqid = 1000001 ;
$sndqid = 1000051 ;

// A page address is built as $standurl . <question id> . $html
$standurl = 'http://zhidao.baidu.com/question/' ;
$html = '.html' ;

// Address of the page currently being crawled. Initialised explicitly: a bare
// `$url ;` statement only reads an undefined variable and defines nothing.
$url = '' ;

// Number of pages skipped because they were rejected.
$nouse = 0 ;
/**
 * Current Unix timestamp as a float with microsecond resolution.
 *
 * The original split the "usec sec" string returned by microtime() and summed
 * both parts, but the separator argument of explode() was missing (a syntax
 * error). microtime(true) returns the same sum directly.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function microtime_float()
{
    return microtime( true ) ;
}
// Crawl every question id in the configured range, print what was extracted
// from each accepted page, then report timing and how many pages were skipped.
$time_start = microtime_float() ;
$answer = null ;

// Every message below originally embedded a raw line break inside its string
// literal; it is emitted unchanged through this variable.
// NOTE(review): tags were stripped from this listing, so these breaks may have
// been '<br>' markup originally — confirm before relying on browser output.
$eol = "\n" ;

// NOTE(review): the loop header was truncated after `$i`; `< $sndqid ; $i++`
// is reconstructed from the variable names and the final page count below —
// confirm whether the upper bound was meant to be inclusive.
for ( $i = $startqid ; $i < $sndqid ; $i++ )
{
    $url = $standurl . $i . $html ;
    if ( $testspider->getcontent( $url ) )
    {
        echo $eol . '正在爬取编号为' . $i . '的网页' . $eol ;

        // Page title: not displayed, but the call still moves the spider's read cursor.
        $testspider->gettitle() ;
        // Question title.
        echo '问题: ' . $testspider->getqtitle() . $eol ;
        // Question body; may be absent.
        echo '问题具体内容:' . $testspider->getqcontent() . $eol ;
        // Supplementary note; may be absent.
        echo '问题补充说明:' . $testspider->getqsuply() . $eol ;

        // Answers; there may be none. Compare strictly so that an empty answer
        // does not end the loop early.
        while ( ( $answer = $testspider->getanswer() ) !== null )
        {
            echo '问题答案:' . $answer . $eol ;
        }

        // Push the output produced so far to the client. ob_flush() raises a
        // notice when no output buffer is active, so check first.
        if ( ob_get_level() > 0 )
        {
            ob_flush() ;
        }
        flush() ;
    }
    else
    {
        echo '错误了' . $url . $eol ;
        $nouse++ ;
    }
}

$time_end = microtime_float() ;
$time = $time_end - $time_start ;

// After the loop $i is one past the last id visited, so this is the page count.
$i = $i - $startqid ;
echo '爬取' . $i . '个网页用时' . $time . '秒' . $eol . '其中跳过' . $nouse . '个无效网页!' ;
?>
