License: Free for non-commercial use PHP Classes: http://www.phpclasses.org/browse/package/2508.html Description: This class can segment Chinese using maximum probability approach. So some ambiguity mistakes can be avoided. Furthermore, it can segment English in a simple way. ***********************************************************************/

/**
 * Dictionary-driven word segmenter for mixed double-byte / ASCII text.
 *
 * Bytes with a value of 129 (0x81) or more are treated as the first byte of a
 * two-byte character; everything else is handled as "English" text.
 * NOTE(review): this looks like a GBK-style double-byte encoding - confirm
 * against the dictionary files actually in use.
 *
 * Runs of double-byte characters are split into the word sequence with the
 * lowest total cost, where a word costs -log((frequency + 1) / dict_size).
 * Runs of single-byte text are split on punctuation and capital letters.
 */
class Segmentation
{
    /**
     * Single-letter switches:
     *  'e' - lines without any byte in 0x81-0xFE are handed to the English
     *        segmenter as a whole instead of being scanned byte by byte;
     *  'l' - lower-case the English output;
     *  's' - keep punctuation as separate tokens (otherwise it is replaced
     *        by a space);
     *  'c' - split words in front of capital letters (e.g. "HelloWorld").
     */
    var $options = array('e' => FALSE, 'l' => TRUE, 's' => TRUE, 'c' => TRUE);

    /** Denominator used to turn a word frequency into a probability. */
    var $dict_size = 1;

    /** Name taken from the header line of the loaded dictionary. */
    var $dict_name = 'Unknown';

    /** Map of word => frequency. */
    var $dict_words = array();

    /**
     * Switches options on or off.
     *
     * Each option is a string whose first character names the switch; a
     * leading '-' turns the switch named by the second character off.
     *
     * @param string|array $options Space-separated string or list of options.
     * @return bool TRUE on success, FALSE when $options has an unusable type.
     */
    function setOptions($options)
    {
        if (is_string($options)) {
            $options = explode(' ', $options);
        }
        if (!is_array($options)) {
            return FALSE;
        }

        foreach ($options as $option) {
            // Empty pieces show up when the string contains repeated spaces.
            if (!is_string($option) || $option === '') {
                continue;
            }
            if ($option[0] !== '-') {
                $this->options[$option[0]] = TRUE;
            } elseif (strlen($option) > 1) {
                $this->options[$option[1]] = FALSE;
            }
        }

        return TRUE;
    }

    /**
     * Loads a dictionary file.
     *
     * The first line must read "DICT_WORD_WF<TAB>size<TAB>name" with a
     * positive numeric size. Every following non-blank line is
     * "word<TAB>frequency". Entries are added to the words already loaded.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool TRUE when the dictionary was read, FALSE otherwise.
     */
    function loadDict($dict_file)
    {
        if (!file_exists($dict_file)) {
            return FALSE;
        }
        $fp = fopen($dict_file, 'r');
        if ($fp === FALSE) {
            return FALSE;
        }

        $header = fgets($fp, 1024);
        $fields = ($header === FALSE) ? array() : explode("\t", trim($header));
        $header_ok = count($fields) >= 2
            && $fields[0] === 'DICT_WORD_WF'
            && is_numeric($fields[1])
            && $fields[1] > 0;
        if (!$header_ok) {
            // Close the handle on every failure path.
            fclose($fp);
            return FALSE;
        }

        $this->dict_size = $fields[1] + 0;
        $this->dict_name = isset($fields[2]) ? $fields[2] : 'Unknown';

        // Read whole lines: a fixed small buffer would cut long entries in two.
        while (($line = fgets($fp)) !== FALSE) {
            $line = rtrim($line);
            if ($line === '') {
                continue;
            }
            $parts = explode("\t", $line);
            $freq = (isset($parts[1]) && is_numeric($parts[1])) ? $parts[1] + 0 : 0;
            $this->dict_words[$parts[0]] = $freq;
        }

        fclose($fp);
        return TRUE;
    }

    /**
     * @return string Name of the most recently loaded dictionary.
     */
    function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segments a string, one "\n"-separated line at a time.
     *
     * @param string $str Text to segment.
     * @return string|bool Segmented text, or FALSE when no dictionary is loaded.
     */
    function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segments the contents of a file.
     *
     * @param string $filename Path of the file to read.
     * @return string|bool Segmented text, or FALSE when no dictionary is
     *                     loaded or the file cannot be read.
     */
    function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        if (!is_file($filename) || !is_readable($filename)) {
            return FALSE;
        }
        $lines = file($filename);
        if ($lines === FALSE) {
            return FALSE;
        }
        return $this->_segmentLines($lines);
    }

    /**
     * Segments a list of lines and joins the results.
     *
     * Every output line ends with " \n"; runs of spaces are reduced to a
     * single space in the final text.
     *
     * @param array $lines Lines of text, with or without line terminators.
     * @return string Segmented text.
     */
    function _segmentLines($lines)
    {
        $contents_segmented = '';
        foreach ($lines as $line) {
            // Strip the terminator in both branches so file input does not
            // end up with doubled newlines.
            $line = rtrim($line);
            if ($this->options['e'] && !preg_match("/[\x81-\xFE]/", $line)) {
                $contents_segmented .= ' ' . $this->_segmentEnglish($line) . " \n";
            } else {
                $contents_segmented .= $this->_segmentLine($line) . " \n";
            }
        }

        return preg_replace('/ {2,}/', ' ', $contents_segmented);
    }

    /**
     * Splits one line into double-byte and single-byte runs and segments
     * each run with the matching routine.
     *
     * @param string $str A single line without its terminator.
     * @return string Segmented line, starting with a space.
     */
    function _segmentLine($str)
    {
        $str_final = ' ';
        $str_length = strlen($str);
        // Padding byte so a lead byte at the very end still has a partner.
        $str .= ' ';

        $i = 0;
        while ($i < $str_length) {
            $j = $i;
            if (ord($str[$i]) >= 129) {
                // Collect consecutive two-byte characters.
                $run = '';
                while ($j < $str_length && ord($str[$j]) >= 129) {
                    $run .= $str[$j] . $str[$j + 1];
                    $j += 2;
                }
                $str_final .= $this->_segmentChinese($run);
            } else {
                // Collect consecutive single-byte characters.
                while ($j < $str_length && ord($str[$j]) < 129) {
                    $j++;
                }
                $str_final .= $this->_segmentEnglish(substr($str, $i, $j - $i)) . ' ';
            }
            $i = $j;
        }

        return $str_final;
    }

    /**
     * Segments a run of two-byte characters into the word sequence with the
     * lowest total cost.
     *
     * Words are 1 to 4 characters (2 to 8 bytes) long. A word found in the
     * dictionary costs -log((frequency + 1) / dict_size); a single character
     * missing from the dictionary is costed with frequency 0; longer unknown
     * strings are not considered. On equal cost the longer word wins.
     *
     * @param string $str Run of two-byte characters.
     * @return string Words, each followed by one space ('' for empty input).
     */
    function _segmentChinese($str)
    {
        $len = strlen($str);
        // Only whole two-byte characters take part in the search.
        $usable = $len - ($len % 2);

        // $best[$p]: lowest cost of segmenting the first $p bytes.
        // $last[$p]: byte length of the final word in that segmentation.
        $best = array(0 => 0.0);
        $last = array(0 => 0);

        for ($end = 2; $end <= $usable; $end += 2) {
            $best[$end] = null;
            for ($l = 2; $l <= 8 && $l <= $end; $l += 2) {
                $start = $end - $l;
                $w = substr($str, $start, $l);
                if (array_key_exists($w, $this->dict_words)) {
                    $f = $this->dict_words[$w];
                    if (!is_numeric($f)) {
                        $f = 0;
                    }
                } elseif ($l > 2) {
                    continue;
                } else {
                    $f = 0;
                }

                $cost = -log(($f + 1) / $this->dict_size) + $best[$start];
                // "<=" with growing $l makes the longer word win ties.
                if ($best[$end] === null || $cost <= $best[$end]) {
                    $best[$end] = $cost;
                    $last[$end] = $l;
                }
            }
        }

        // Walk back from the end to recover the chosen words.
        $str_final = '';
        for ($pos = $usable; $pos > 0; $pos -= $last[$pos]) {
            $str_final = substr($str, $pos - $last[$pos], $last[$pos]) . ' ' . $str_final;
        }

        // An unpaired trailing byte is kept as its own token, not dropped.
        if ($usable < $len) {
            $str_final .= substr($str, $usable) . ' ';
        }

        return $str_final;
    }

    /**
     * Segments single-byte text according to the 's', 'c' and 'l' options.
     *
     * @param string $str Run of single-byte text.
     * @return string Segmented text; it may contain repeated spaces.
     */
    function _segmentEnglish($str)
    {
        // ASCII punctuation plus tab and form feed, as a regex character class.
        $punct = "[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~\t\f]";

        if ($this->options['s']) {
            // Set punctuation runs apart, then separate adjacent marks.
            $str = preg_replace('/(' . $punct . '+)/', ' $1 ', $str);
            $str = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $str);
        } else {
            $str = preg_replace('/(' . $punct . '+)/', ' ', $str);
        }

        if ($this->options['c']) {
            $str = preg_replace('/ {2,}/', ' ', $str);
            $words = explode(' ', $str);
            foreach ($words as $k => $word) {
                // Words made only of capitals and dots (acronyms) stay whole.
                if (!preg_match('/^[A-Z\.]+$/', $word)) {
                    $words[$k] = preg_replace('/([A-Z])([^A-Z]+)/', ' $1$2 ', $word);
                }
            }
            $str = implode(' ', $words);
        }

        if ($this->options['l']) {
            $str = strtolower($str);
        }

        return $str;
    }
}
?>