RMM分词算法 PHP实现

<?php
/**
 * Reverse maximum matching (RMM) word segmenter.
 *
 * The algorithm works on raw bytes and treats every byte >= 0x81 as the
 * first half of a two-byte character, i.e. it expects GBK/GB2312 encoded
 * input and a dictionary in the same encoding. Do not switch the string
 * functions to mb_*: all lengths and offsets below are byte counts.
 *
 * NOTE(review): this class was reconstructed from a copy in which quotes,
 * comparison operators and the class header had been stripped. Places where
 * the original text could not be recovered with certainty are marked below.
 */
class splitword
{
    /** @var array Dictionary: word => second field of its dictionary line. */
    public $tagdic = array();

    /** @var array Dictionary: byte length => (word => third field of its dictionary line). */
    public $rankdic = array();

    /** @var string Normalised text waiting to be segmented. */
    public $sourcestr = '';

    /** @var string Segmentation result of the last splitrmm() call. */
    public $resultstr = '';

    /** @var string Separator placed between tokens. */
    public $splitchar = ' ';

    /** @var int Fragments of at most this many bytes are kept whole instead of being segmented. */
    public $splitlen = 4;

    /** @var int Index of the last byte of the longest word candidate (7 => 8 bytes => 4 characters). */
    public $maxlen = 7;

    /**
     * @var int Index of the last byte of the shortest multi-character word (3 => 4 bytes).
     *          NOTE(review): the declaration was missing from the damaged source; 3 is
     *          what the "remaining head" branch of runrmm() implies - confirm.
     */
    public $minlen = 3;

    /** @var resource|null Optional dictionary handle released by clear(); never opened by this class. */
    public $quickdic = null;

    /**
     * Loads the dictionary into memory so that lookups are plain array accesses.
     *
     * Each dictionary line holds space-separated fields: word, tag and a third
     * value that is stored in $rankdic.
     *
     * @param string|null $dicfile Dictionary path; defaults to ppldic.csv next to this file.
     */
    public function __construct($dicfile = null)
    {
        if ($dicfile === null) {
            $dicfile = dirname(__FILE__) . '/ppldic.csv';
        }

        $fp = is_readable($dicfile) ? fopen($dicfile, 'r') : false;
        if ($fp === false) {
            // Keep the object usable (with an empty dictionary) instead of
            // failing later inside fgets()/fclose() on an invalid handle.
            trigger_error('splitword: cannot open dictionary file ' . $dicfile, E_USER_WARNING);
            return;
        }

        // Compare against false explicitly: a line such as "0" is falsy but valid.
        while (($line = fgets($fp)) !== false) {
            // Strip the line ending so it does not end up inside the last field.
            $ws = explode(' ', rtrim($line, "\r\n"));
            if ($ws[0] === '') {
                continue;
            }
            if (isset($ws[1])) {
                $this->tagdic[$ws[0]] = $ws[1];
            }
            if (isset($ws[2])) {
                $this->rankdic[strlen($ws[0])][$ws[0]] = $ws[2];
            }
        }
        fclose($fp);
    }

    /**
     * Releases the optional dictionary handle, if one was attached.
     *
     * @return void
     */
    public function clear()
    {
        // fclose() on a non-resource throws on PHP 8, so check first.
        if (is_resource($this->quickdic)) {
            fclose($this->quickdic);
        }
        $this->quickdic = null;
    }

    /**
     * Sets the text to segment and clears the previous result.
     *
     * @param string $str Raw input text.
     * @return void
     */
    public function setsource($str)
    {
        $this->sourcestr = $this->updatestr($str);
        $this->resultstr = '';
    }

    /**
     * Tells whether a string starts with a single-byte (non double-byte) character.
     *
     * @param string $str
     * @return bool|string True when the first byte is <= 0x80, false otherwise;
     *                     '' for empty input (kept from the original contract).
     */
    public function notgbk($str)
    {
        $str = (string) $str;
        if ($str === '') {
            return '';
        }
        return ord($str[0]) <= 0x80;
    }

    /**
     * Segments a text with the reverse maximum matching algorithm.
     *
     * @param string $str Text to segment; when empty, the text given to setsource() is used.
     * @return string Tokens joined (and terminated) by $splitchar; '' when there is no text.
     */
    public function splitrmm($str = '')
    {
        if ((string) $str !== '') {
            $this->setsource($str);
        }
        if ($this->sourcestr == '') {
            return '';
        }

        // Start from a clean result so repeated calls do not accumulate output.
        $this->resultstr = '';
        $this->sourcestr = $this->updatestr($this->sourcestr);

        // updatestr() separates fragments with $splitchar, so split on the same string.
        $spc = $this->splitchar;
        $spwords = ($spc === '') ? array($this->sourcestr) : explode($spc, $this->sourcestr);

        // Walk the fragments right to left, prepending each piece to the result.
        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $token = $spwords[$i];
            if ($token === '') {
                continue;
            }

            if ($this->notgbk($token)) {
                // Single-byte fragment (English word, symbol or number): kept as is.
                // NOTE(review): the source singled out purely numeric fragments
                // (only digits . + -) and then discarded them after computing an
                // unused look-ahead; they are kept here like any other fragment.
                $this->resultstr = $token . $spc . $this->resultstr;
                continue;
            }

            // NOTE(review): the comparison operator was lost in the source and the
            // branch body was empty (fragment silently dropped). "<=" with the
            // fragment kept whole matches the "reserved word length" description
            // of $splitlen - confirm.
            if (strlen($token) <= $this->splitlen) {
                $this->resultstr = $token . $spc . $this->resultstr;
            } else {
                $this->resultstr = $this->runrmm($token) . $spc . $this->resultstr;
            }
        }

        return $this->resultstr;
    }

    /**
     * Splits a run of double-byte characters by reverse dictionary matching.
     *
     * @param string $str Fragment consisting of two-byte characters.
     * @return string The words in reading order, formatted by otherword().
     */
    public function runrmm($str)
    {
        $wordarray = array();

        // $i is the index of the last byte that has not been consumed yet.
        $i = strlen($str) - 1;
        while ($i >= 0) {
            // Only the shortest possible word (or less) is left: finish here.
            if ($i <= $this->minlen) {
                $head = substr($str, 0, $i + 1);
                if (strlen($head) <= 2 || $this->isword($head)) {
                    $wordarray[] = $head;
                } else {
                    // Not a word: emit its characters one by one (last one first,
                    // because the list is reversed again in otherword()).
                    $wordarray[] = substr($head, 2);
                    $wordarray[] = substr($head, 0, 2);
                }
                break;
            }

            // Try the longest candidate first, then shorten it one character at a time.
            $maxpos = min($i, $this->maxlen);
            $ismatch = false;
            for ($j = $maxpos; $j >= 0; $j -= 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->isword($w)) {
                    $wordarray[] = $w;
                    $i -= $j + 1;
                    $ismatch = true;
                    break;
                }
            }

            if (!$ismatch) {
                // No dictionary word ends here: emit the last character on its own.
                // Without this step $i never changes and the loop never ends.
                $start = max(0, $i - 1);
                $wordarray[] = substr($str, $start, $i - $start + 1);
                $i = $start - 1;
            }
        }

        return $this->otherword($wordarray);
    }

    /**
     * Joins the words collected by runrmm() (stored last word first) into a string.
     *
     * Every word is preceded by $splitchar and followed by "、"; the leading
     * separator is then replaced by "、". This output format is kept exactly
     * as in the original.
     *
     * @param array $wordarray Words in reverse reading order.
     * @return string
     */
    public function otherword($wordarray)
    {
        $spc = $this->splitchar;
        $rsstr = '';
        for ($i = count($wordarray) - 1; $i >= 0; $i--) {
            $rsstr .= $spc . $wordarray[$i] . '、';
        }
        // Quote the separator so a regex meta character cannot break the pattern.
        return preg_replace('/^' . preg_quote($spc, '/') . '/', '、', $rsstr);
    }

    /**
     * Tells whether the dictionary contains a word.
     *
     * @param string $okword Candidate word.
     * @return bool
     */
    public function isword($okword)
    {
        $slen = strlen($okword);
        // $maxlen is the index of the last byte, so the longest candidate built
        // by runrmm() has $maxlen + 1 bytes and must still be looked up.
        if ($slen > $this->maxlen + 1) {
            return false;
        }
        return isset($this->rankdic[$slen][$okword]);
    }

    /**
     * Normalises a text before segmentation: whitespace is collapsed into
     * $splitchar and a separator is inserted wherever the character class
     * changes (single-byte text, symbols, double-byte text).
     *
     * @param string $str Raw text.
     * @return string Fragments separated by $splitchar.
     */
    public function updatestr($str)
    {
        $str = (string) $str;
        $spc = $this->splitchar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }

        $okstr = '';
        // Class of the previous character:
        // 0 = whitespace, 1 = single-byte text, 2 = double-byte text, 3 = symbol.
        $prechar = 0;

        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            $code = ord($ch);

            if ($code < 0x81) {
                if ($code < 33) {
                    // Whitespace / control character: becomes a separator,
                    // except that line breaks add nothing.
                    if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                } elseif (preg_match('~[^0-9a-zA-Z@\\\\.%#:/&_\-]~', $ch)) {
                    // Single-byte symbol: always a fragment of its own.
                    $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                    $prechar = 3;
                } else {
                    // Letter, digit or one of @ \ . % # : / & _ - (kept together
                    // so that e-mail addresses and URLs stay in one fragment).
                    // The original also tested these characters against the
                    // literal "@#%:", which a single character can never match,
                    // so they were always classified as text; that is preserved.
                    if ($prechar == 2 || $prechar == 3) {
                        $okstr .= $spc . $ch;
                    } else {
                        $okstr .= $ch;
                    }
                    $prechar = 1;
                }
                continue;
            }

            // Lead byte of a two-byte character: separate it from preceding
            // single-byte text or symbols.
            if ($prechar != 0 && $prechar != 2) {
                $okstr .= $spc;
            }
            if (isset($str[$i + 1])) {
                $c = $ch . $str[$i + 1];
                $n = hexdec(bin2hex($c));
                // NOTE(review): the source condition was garbled and, as far as it
                // can be read, could never be true. The range below (0xA13F..0xAA40,
                // presumably the double-byte punctuation area) is the assumed
                // intent - confirm.
                if ($n > 0xA13F && $n < 0xAA40) {
                    $okstr .= ($prechar != 0) ? $spc . $c : $c;
                    $prechar = 3;
                } else {
                    $okstr .= $c;
                    $prechar = 2;
                }
                $i++; // the second byte has been consumed as well
            }
        }

        return $okstr;
    }
}
?>

RMM分词算法 PHP实现

推荐信息