<?php
/**
 * Reverse maximum matching (RMM) word segmenter.
 *
 * The class works on raw bytes and treats every byte above 0x80 as the lead
 * byte of a two-byte character, so it is meant for double-byte encoded text
 * (NOTE(review): presumably GBK/GB2312 - confirm against the dictionary file).
 *
 * The dictionary is a text file with one entry per line, fields separated by a
 * single space: "word tag rank".
 */
class splitword
{
    /** Dictionary: word => tag (second column of the dictionary file). */
    public $tagdic = array();

    /** Dictionary: byte length of the word => array(word => rank). */
    public $rankdic = array();

    /** Normalised source string that is currently being segmented. */
    public $sourcestr = '';

    /** Accumulated segmentation result. */
    public $resultstr = '';

    /**
     * Separator written between tokens.
     * NOTE(review): splitrmm() splits the normalised text on a literal space,
     * so values other than ' ' are not handled consistently - confirm before
     * changing this.
     */
    public $splitchar = ' ';

    /** Double-byte runs of at most this many bytes are not sent to the dictionary matcher. */
    public $splitlen = 4;

    /** Largest byte index of a dictionary word, i.e. longest word is maxlen + 1 bytes. */
    public $maxlen = 7;

    /**
     * Largest byte index of the "minimum" word handled at the head of a run.
     * NOTE(review): this declaration was missing from the damaged source; the
     * value 3 is inferred from runrmm(), which treats minlen + 1 bytes as two
     * two-byte characters.
     */
    public $minlen = 3;

    /** Optional stream handle released by clear(); nothing in this class opens it. */
    public $quickdic = null;

    /**
     * Legacy (PHP 4 style) entry point, kept so existing callers keep working.
     */
    public function splitword()
    {
        $this->__construct();
    }

    /**
     * Loads the dictionary into memory so that lookups are plain array accesses.
     *
     * @param string|null $dicfile Path of the dictionary file; defaults to
     *                             "ppldic.csv" next to this source file.
     */
    public function __construct($dicfile = null)
    {
        if ($dicfile === null) {
            $dicfile = dirname(__FILE__) . '/ppldic.csv';
        }

        $fp = (is_file($dicfile) && is_readable($dicfile)) ? fopen($dicfile, 'r') : false;
        if ($fp === false) {
            // Keep going with an empty dictionary instead of failing later on
            // fgets(false), which is a fatal TypeError on PHP 8.
            trigger_error('splitword: cannot read dictionary file ' . $dicfile, E_USER_WARNING);
            return;
        }

        // Compare against false explicitly: a line containing only "0" is falsy
        // and would otherwise end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            $ws = explode(' ', rtrim($line, "\r\n"));
            if (count($ws) < 3 || $ws[0] === '') {
                continue; // malformed entry
            }
            $this->tagdic[$ws[0]] = $ws[1];
            $this->rankdic[strlen($ws[0])][$ws[0]] = $ws[2];
        }
        fclose($fp);
    }

    /**
     * Releases the optional $quickdic stream if one is open.
     */
    public function clear()
    {
        if (is_resource($this->quickdic)) {
            fclose($this->quickdic);
        }
        $this->quickdic = null;
    }

    /**
     * Sets the string to segment and resets the accumulated result.
     *
     * @param string $str Raw input text.
     */
    public function setsource($str)
    {
        $this->sourcestr = $this->updatestr($str);
        $this->resultstr = '';
    }

    /**
     * Tells whether a token does NOT start with a double-byte character.
     *
     * @param string $str Token to inspect.
     * @return bool|string true when the first byte is <= 0x80, false otherwise;
     *                     the empty string (falsy) for empty input.
     */
    public function notgbk($str)
    {
        if ($str == '') {
            return '';
        }
        return !(ord($str[0]) > 0x80);
    }

    /**
     * Segments a string with the RMM algorithm.
     *
     * Tokens are processed from last to first and prepended to the result:
     *  - single-byte tokens containing anything other than digits, '.', '+'
     *    or '-' are copied through;
     *  - single-byte tokens made only of those characters are skipped;
     *  - double-byte runs longer than $splitlen bytes go through runrmm();
     *  - shorter double-byte runs are skipped.
     *
     * NOTE(review): the two "skipped" cases had empty bodies in the damaged
     * source; confirm that dropping these tokens is intended.
     *
     * @param string $str Text to segment; when empty, the string previously
     *                    given to setsource() is used.
     * @return string Segmented text, or '' when there is nothing to segment.
     */
    public function splitrmm($str = '')
    {
        if ($str != '') {
            $this->setsource($str);
        }
        if ($this->sourcestr == '') {
            return '';
        }

        $this->sourcestr = $this->updatestr($this->sourcestr);
        $spwords = explode(' ', $this->sourcestr);
        $spc = $this->splitchar;

        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $token = $spwords[$i];
            if ($token == '') {
                continue;
            }

            if ($this->notgbk($token)) {
                // preg_match() replaces ereg(), which was removed in PHP 7.
                if (preg_match('/[^0-9.+\-]/', $token)) {
                    $this->resultstr = $token . $spc . $this->resultstr;
                }
                continue;
            }

            // NOTE(review): the comparison operator was lost in the damaged
            // source; "<=" is assumed, so only longer runs are segmented.
            if (strlen($token) > $this->splitlen) {
                $this->resultstr = $this->runrmm($token) . $spc . $this->resultstr;
            }
        }

        return $this->resultstr;
    }

    /**
     * Splits a run of double-byte characters by matching dictionary words
     * from the end of the run towards its start.
     *
     * @param string $str Run of two-byte characters.
     * @return string The words formatted by otherword().
     */
    public function runrmm($str)
    {
        $wordarray = array(); // collected back to front

        for ($i = strlen($str) - 1; $i >= 0;) {
            // Only the head of the run is left (at most minlen + 1 bytes).
            // NOTE(review): "<=" is reconstructed; the operator was lost.
            if ($i <= $this->minlen) {
                if ($i == 1) {
                    $wordarray[] = substr($str, 0, 2);
                } else {
                    $w = substr($str, 0, $this->minlen + 1);
                    if ($this->isword($w)) {
                        $wordarray[] = $w;
                    } else {
                        $wordarray[] = substr($str, 2, 2);
                        $wordarray[] = substr($str, 0, 2);
                    }
                }
                break;
            }

            // Try the longest candidate ending at byte $i first, then shrink it
            // by one two-byte character at a time.
            $maxpos = ($i >= $this->maxlen) ? $this->maxlen : $i;
            $ismatch = false;
            for ($j = $maxpos; $j >= 0; $j -= 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->isword($w)) {
                    $wordarray[] = $w;
                    $i = $i - $j - 1;
                    $ismatch = true;
                    break;
                }
            }

            if (!$ismatch) {
                // No dictionary word ends here: emit the last two-byte character
                // on its own and step back. Without this the loop never advances.
                $wordarray[] = substr($str, $i - 1, 2);
                $i -= 2;
            }
        }

        return $this->otherword($wordarray);
    }

    /**
     * Formats the words collected by runrmm(), restoring reading order.
     *
     * Every word is written as separator + word + "、"; a separator at the very
     * start of the result is then replaced by "、". The "、" literal is emitted
     * in whatever encoding this source file is saved in.
     *
     * @param array $wordarray Words in reverse reading order.
     * @return string Formatted segment.
     */
    public function otherword($wordarray)
    {
        $spc = $this->splitchar;
        $rsstr = '';
        for ($i = count($wordarray) - 1; $i >= 0; $i--) {
            $rsstr .= $spc . $wordarray[$i] . '、';
        }
        // preg_quote() keeps a separator such as "/" or "|" from breaking the pattern.
        return preg_replace('/^' . preg_quote($spc, '/') . '/', '、', $rsstr);
    }

    /**
     * Tells whether a word exists in the dictionary.
     *
     * @param string $okword Candidate word (raw bytes).
     * @return bool
     */
    public function isword($okword)
    {
        $slen = strlen($okword);
        // $maxlen is a byte index, so the longest word is $maxlen + 1 bytes.
        // Comparing the length with $maxlen itself would reject the longest
        // candidate that runrmm() builds.
        if ($slen > $this->maxlen + 1) {
            return false;
        }
        return isset($this->rankdic[$slen][$okword]);
    }

    /**
     * Normalises a string before segmentation: inserts the separator between
     * runs of different character classes (single-byte word characters,
     * double-byte characters, symbols) and collapses whitespace.
     *
     * @param string $str Raw input.
     * @return string Normalised text.
     */
    public function updatestr($str)
    {
        $spc = $this->splitchar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }

        $okstr = '';
        $prechar = 0; // class of the previous character: 0 blank, 1 single-byte word, 2 double-byte, 3 symbol

        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            $code = ord($ch);

            // Single-byte character (same boundary as notgbk()).
            if ($code < 0x81) {
                // NOTE(review): the bound was lost in the damaged source; 33 is
                // assumed, i.e. space and control characters count as blank.
                if ($code < 33) {
                    // CR and LF are removed without leaving a separator behind.
                    if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                    continue;
                }

                // preg_match() replaces ereg() (removed in PHP 7). The backslash
                // stays in the class because POSIX bracket expressions treat it
                // literally. NOTE(review): "A-Z" restored from "a-za-z".
                if (preg_match('/[^0-9a-zA-Z@.%#:\/\\\\&_\-]/', $ch)) {
                    // Single-byte symbol: always its own token.
                    $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                    $prechar = 3;
                } else {
                    // Word character: start a new token after double-byte text
                    // or a symbol. (The original extra test ereg("@#%:", $ch)
                    // could never match a single character and was dropped.)
                    $okstr .= ($prechar == 2 || $prechar == 3) ? $spc . $ch : $ch;
                    $prechar = 1;
                }
                continue;
            }

            // Double-byte character: separate it from a preceding single-byte
            // word or symbol.
            if ($prechar != 0 && $prechar != 2) {
                $okstr .= $spc;
            }
            if (isset($str[$i + 1])) {
                $c = $ch . $str[$i + 1];
                $n = hexdec(bin2hex($c));
                // NOTE(review): most of this condition was lost in the damaged
                // source (only "0xaa40" survived). The window below is assumed
                // to be the double-byte symbol/punctuation area - confirm.
                if ($n > 0xA13F && $n < 0xAA40) {
                    $okstr .= ($prechar != 0) ? $spc . $c : $c;
                    $prechar = 3;
                } else {
                    $okstr .= $c;
                    $prechar = 2;
                }
                $i++; // the trail byte has been consumed
            }
        }

        return $okstr;
    }
}
?>