基于规则的自动分词算法
原理
(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。
正向最大匹配法
算法描述
设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>1,则将Len减2(此处按字节计,一个汉字占2个字节;若按字符计则减1),从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到长度为2字节(即1个汉字)的单字词,指向待切分语料的指针向前移1个汉字,返回(1)。
算法代码
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class Nlp {
private String m_sResult = ""; // 切分后的结果串
private int m_nPosIndex; // 指向待切分语料的指针的具体位置
private int m_MaxLen; // 最大取词长
private int totalMaxLen; //总最大取词长
private Set<String> dictionary; // 分词字典
public Nlp(int maxLen){
this.m_MaxLen = maxLen;
this.m_nPosIndex = 0;
this.totalMaxLen = maxLen;
try {
this.dictionary = this.loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Nlp(){
this.m_MaxLen = 3;
this.totalMaxLen = 3;
this.m_nPosIndex = 0;
try {
this.dictionary = this.loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Set<String> loadFile() throws FileNotFoundException, IOException{
//读取字典
Set<String> dictionary = new HashSet<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while( ( tmp = () )!=null){
String[] token = (",");
String word = token[0];
(word);
}
return dictionary;
}
public String MMSegment(String source){
int len = totalMaxLen;
int frompos = 0;
MM(source, len, frompos);
return m_sResult;
}
public String getSubString(String source, int m_nPosIndex, int len){
int endIndex = m_nPosIndex + len;
int length = ();
//需要判断是否超出句子边界
while(endIndex > length){
endIndex -= 1;
}
String sub = (m_nPosIndex, endIndex);
return sub;
}
public void MM(String source, int len , int frompos){
//递归匹配
if (m_nPosIndex >= ()) return;
String sub = getSubString(source, m_nPosIndex,len);
if((sub)){
//匹配
m_sResult += sub + "/ ";
m_nPosIndex = m_nPosIndex + m_MaxLen;
m_MaxLen = totalMaxLen;
MM(source, m_MaxLen, m_nPosIndex);
}
else{
//不匹配
if(m_MaxLen > 1){
m_MaxLen = m_MaxLen - 1;
MM(source, m_MaxLen, m_nPosIndex);
}
else{
m_sResult += sub+ "/ ";
m_nPosIndex += 1;
m_MaxLen = totalMaxLen;
MM(source, m_MaxLen, m_nPosIndex);
}
}
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// TODO code application logic here
Nlp nlp = new Nlp();
String source = "今天天气不错!";
String result = (source);
(result);
}
}
逆向最大匹配法
算法描述
与正向最大匹配法原理一样,只是匹配的开始为句尾
代码实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class RMM {
private String m_sResult = ""; //切分后的结果串
private int m_nPosIndex; //游标指针
private int m_MaxLen; //最大取词长
private int totalMaxlen; //总最大取词长
private Set<String> dictionary; //分词字典
public RMM(int maxLen){
this.m_MaxLen = maxLen;
this.totalMaxlen = maxLen;
try {
this.dictionary = loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public RMM(){
this.m_MaxLen = 3;
this.totalMaxlen = 3;
try {
this.dictionary = loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Set<String> loadFile() throws IOException{
//读取字典
Set<String> dictionary = new HashSet<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while((tmp=())!= null){
String[] token = (",");
String word = token[0];
(word);
}
return dictionary;
}
public String RMMSegment(String source){
int len= totalMaxlen;
this.m_nPosIndex = ();
int frompos = this.m_nPosIndex;
rmm(source, m_MaxLen, m_nPosIndex);
//将结果按顺序输出
String[] token = m_sResult.split("/");
String result = "";
for(int i = -1; i > 0 ; i--){
result += token[i] + "/ ";
}
return result;
}
public String getSubString(String source, int m_nPosIndex, int len){
int startIndex = m_nPosIndex - len;
//判断越界条件
while(startIndex < 0){
startIndex += 1;
}
String sub = (startIndex, m_nPosIndex);
return sub;
}
public void rmm(String source, int len, int frompos){
if(m_nPosIndex < 0) return;
String sub = getSubString(source, m_nPosIndex, len);
if((sub)){
//匹配成功
m_sResult += "/" + sub ;
m_nPosIndex = m_nPosIndex - m_MaxLen;
m_MaxLen = totalMaxlen;
rmm(source, m_MaxLen, m_nPosIndex);
}
else{
//不匹配
if(m_MaxLen > 1){
m_MaxLen = m_MaxLen - 1;
rmm(source, m_MaxLen, m_nPosIndex);
}
else{
m_sResult += "/" + sub ;
m_nPosIndex -= 1;
m_MaxLen = totalMaxlen;
rmm(source, m_MaxLen, m_nPosIndex);
}
}
}
public static void main(String[] args) {
// TODO code application logic here
RMM myRMM = new RMM();
String source = "记录最佳前候选词列表";
String result = (source);
(result);
}
}
基于统计的中文分词算法
基本思想
选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词w i 的累计概率等于它的左相邻词w i-1 的累积概率乘以w i 自身的概率
具体算法
(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词w 1 ,w 2 ,…,w i ,w n ;
(2)计算每个候选词的概率值P(w i ),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词w n 是字串的尾词,且累计概率P’(w n )最大,则w n 是S的终点词;
(4)从w n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.
字典树
又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
字典树的代码实现
主要参考:blog.csdn.net/sadfishsc/article/details/9152647
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
/**
*
* @author quincy1994
*/
public class TireNode {
private String character; // 单个汉字
private int frequency = -1; // 词频, -1来区别某条路径上的字串是否是一个词组
private double antilog = -1; // 对数化的词频
private Map<String, TireNode> children; //下一个节点
public String getCharacter(){
return character;
}
public void setCharacter(String character){
this.character = character;
}
public int getFrequency(){
return frequency;
}
public void setFrequency(int frequency){
this.frequency = frequency;
}
public double getAntilog(){
return antilog;
}
public void setAntilog(double antilog){
this.antilog = antilog;
}
public void addChild(TireNode node){
if (children == null){
children = new HashMap<String, TireNode>();
}
if (!(())){
((), node);
}
}
public TireNode getChild(String ch){
if (children == null || ! (ch)){
return null;
}
return (ch);
}
public void removeChildren(String ch){
if (children == null || !(ch)){
return;
}
(ch);
}
}
算法实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class ChnSeq {
private TireNode tire = null;
public List<String> loadFile() throws FileNotFoundException, IOException {
//读取字典
List<String> lines = new ArrayList<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while ((tmp = ()) != null) {
(tmp);
}
();
return lines;
}
public void init() throws IOException {
List<String> lines = loadFile();
tire = new TireNode();
for (String line : lines) {
String[] tokens = (",");
String word = tokens[0];
int freq = (tokens[1]);
double antilog = (1+0.01/(tokens[2].replace("%", ""))) ;
//构建词典树
TireNode root = tire;
for (int i = 0; i < (); i++) {
String c = "" + (i);
TireNode node = (c);
if (node == null) {
node = new TireNode();
(c);
(node);
}
root = node;
}
(freq); //为每个词设立词频
(antilog); //为每个词设立逆文档频率
}
}
public TireNode getTire() {
return tire;
}
public TireNode getNodeByWord(String word) {
TireNode node = tire;
for (int i = 0; i < (); i++) {
String ch = (i) + "";
if (node == null) {
break;
} else {
node = (ch);
}
}
return node;
}
private class Segment {
public String word; //词
public String endChar; //结束词
public String lastChar; //前缀词
public double cost;
public final static String START_SIGN = "<< STARTING >>";
public final static String END_SIGN = "<< ENDING >>";
}
//寻找候选词
public List<Segment> preSegment(String sentence) {
List<Segment> segs = new ArrayList<Segment>();
//设置句子的开始标志
Segment terminal = new Segment();
= Segment.START_SIGN;
= Segment.START_SIGN;
= null;
(terminal);
for (int i = 0; i < (); i++) {
for (int j = i + 1; j <= (); j++) {
String word = (i, j);
TireNode tnode = this.getNodeByWord(word);
if (tnode == null) {
break;
}
if (() <= 0) {
continue;
}
Segment seg = new Segment();
= word;
= (() - 1, ());
if (i == 0) {
= Segment.START_SIGN;
} else {
= (i - 1, i);
}
= ();
(word + " " + +" " + ());
(seg);
}
}
//设置句子的结束标志
terminal = new Segment();
= Segment.END_SIGN;
= Segment.END_SIGN;
= (() - 1, ());
(terminal);
return segs;
}
public String dynamicSegment(List<Segment> segs) {
//基于动态规划的概率统计分词
final double INFINITE = 9999999;
if (segs == null || () == 0) {
("找不到候选词");
return null;
}
int n = (); //候选词的个数
//单个词
double[][] costs = new double[n][n];
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n; j++) {
String endChar = (i).endChar;
if (j == i && ((j).word)) {
costs[i][j] = (j).cost; //候选词j的概率
continue;
}
costs[i][j] = INFINITE;
}
}
//寻找前一个候选词
for (int i = 0; i < n - 1; i++) {
String endChar = (i).endChar;
for (int j = i + 1; j < n; j++) {
String lastChar = (j).lastChar;
if (lastChar != null && (endChar) &&( j- i < 4)) { //j前缀词不为空,同时j的前缀词等于i的后缀词,且j和i之间的间隔不超过4个候选词
costs[i][j] = (j).cost; //候选词j的概率
}
}
}
int sp = 0; //开始点
int fp = n - 1; //结束点
double[] dist = new double[n]; // 记录累计概率, n为候选词的个数
List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
List<Integer> list = new ArrayList<Integer>();
for (int i = 0; i < n; i++) {
dist[i] = costs[sp][i]; //i的累计概率的初始值为索引sp到索引i的词的概率
if (sp != i) {
(i); //记录候选词的索引位置
}
if (dist[i] < INFINITE) {
List<Integer> spa = new ArrayList<Integer>(); //如果索引sp到索引i构成一个词,则开启一条划分路径
(spa);
} else {
(null);
}
}
while (!()) {
//选切分点
Integer minIdx = (0);
(minIdx);
//判断minIdx是否为开头的候选词
if(dist[minIdx] == INFINITE){
continue;
}
//动态规划
for (int i = minIdx+1; i < n; i++) {
if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
dist[i] = dist[minIdx] + costs[minIdx][i];
List<Integer> tmp = new ArrayList<Integer>((minIdx));
(minIdx);
(i, tmp); //记录最佳前候选词列表
}
}
}
String result = "";
for (int i = 0; i < (fp).size(); i++) {
result += ((fp).get(i)).word + "/ ";
}
return result;
}
public String segment(String sentences) {
return dynamicSegment(preSegment(sentences));
}
public static void main(String[] args) throws ClassNotFoundException, IOException {
ChnSeq cs = new ChnSeq();
();
String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
String segs = (sentence);
(segs);
}
}
具体的代码和字典,可以访问:
github.com/Quincy1994/Segment