基于规则的自动分词算法
原理
(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。
正向最大匹配法
算法描述
设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>1,则将Len减2(此处按字节计,一个汉字占2个字节;若按字符计则减1),从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到长度为2字节(即1个汉字)的单字词,指向待切分语料的指针向前移1个汉字,返回(1)。
算法代码
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class Nlp {
private String m_sResult = ""; // 切分后的结果串
private int m_nPosIndex; // 指向待切分语料的指针的具体位置
private int m_MaxLen; // 最大取词长
private int totalMaxLen; //总最大取词长
private Set<String> dictionary; // 分词字典
public Nlp(int maxLen){
this.m_MaxLen = maxLen;
this.m_nPosIndex = 0;
this.totalMaxLen = maxLen;
try {
this.dictionary = this.loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Nlp(){
this.m_MaxLen = 3;
this.totalMaxLen = 3;
this.m_nPosIndex = 0;
try {
this.dictionary = this.loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Set<String> loadFile() throws FileNotFoundException, IOException{
//读取字典
Set<String> dictionary = new HashSet<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while( ( tmp = () )!=null){
String[] token = (",");
String word = token[0];
(word);
}
return dictionary;
}
public String MMSegment(String source){
int len = totalMaxLen;
int frompos = 0;
MM(source, len, frompos);
return m_sResult;
}
public String getSubString(String source, int m_nPosIndex, int len){
int endIndex = m_nPosIndex + len;
int length = ();
//需要判断是否超出句子边界
while(endIndex > length){
endIndex -= 1;
}
String sub = (m_nPosIndex, endIndex);
return sub;
}
public void MM(String source, int len , int frompos){
//递归匹配
if (m_nPosIndex >= ()) return;
String sub = getSubString(source, m_nPosIndex,len);
if((sub)){
//匹配
m_sResult += sub + "/ ";
m_nPosIndex = m_nPosIndex + m_MaxLen;
m_MaxLen = totalMaxLen;
MM(source, m_MaxLen, m_nPosIndex);
}
else{
//不匹配
if(m_MaxLen > 1){
m_MaxLen = m_MaxLen - 1;
MM(source, m_MaxLen, m_nPosIndex);
}
else{
m_sResult += sub+ "/ ";
m_nPosIndex += 1;
m_MaxLen = totalMaxLen;
MM(source, m_MaxLen, m_nPosIndex);
}
}
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// TODO code application logic here
Nlp nlp = new Nlp();
String source = "今天天气不错!";
String result = (source);
(result);
}
}
逆向最大匹配法
算法描述
与正向最大匹配法原理一样,只是匹配的开始为句尾
代码实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class RMM {
private String m_sResult = ""; //切分后的结果串
private int m_nPosIndex; //游标指针
private int m_MaxLen; //最大取词长
private int totalMaxlen; //总最大取词长
private Set<String> dictionary; //分词字典
public RMM(int maxLen){
this.m_MaxLen = maxLen;
this.totalMaxlen = maxLen;
try {
this.dictionary = loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public RMM(){
this.m_MaxLen = 3;
this.totalMaxlen = 3;
try {
this.dictionary = loadFile();
} catch (IOException ex) {
(()).log(, null, ex);
}
}
public Set<String> loadFile() throws IOException{
//读取字典
Set<String> dictionary = new HashSet<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while((tmp=())!= null){
String[] token = (",");
String word = token[0];
(word);
}
return dictionary;
}
public String RMMSegment(String source){
int len= totalMaxlen;
this.m_nPosIndex = ();
int frompos = this.m_nPosIndex;
rmm(source, m_MaxLen, m_nPosIndex);
//将结果按顺序输出
String[] token = m_sResult.split("/");
String result = "";
for(int i = -1; i > 0 ; i--){
result += token[i] + "/ ";
}
return result;
}
public String getSubString(String source, int m_nPosIndex, int len){
int startIndex = m_nPosIndex - len;
//判断越界条件
while(startIndex < 0){
startIndex += 1;
}
String sub = (startIndex, m_nPosIndex);
return sub;
}
public void rmm(String source, int len, int frompos){
if(m_nPosIndex < 0) return;
String sub = getSubString(source, m_nPosIndex, len);
if((sub)){
//匹配成功
m_sResult += "/" + sub ;
m_nPosIndex = m_nPosIndex - m_MaxLen;
m_MaxLen = totalMaxlen;
rmm(source, m_MaxLen, m_nPosIndex);
}
else{
//不匹配
if(m_MaxLen > 1){
m_MaxLen = m_MaxLen - 1;
rmm(source, m_MaxLen, m_nPosIndex);
}
else{
m_sResult += "/" + sub ;
m_nPosIndex -= 1;
m_MaxLen = totalMaxlen;
rmm(source, m_MaxLen, m_nPosIndex);
}
}
}
public static void main(String[] args) {
// TODO code application logic here
RMM myRMM = new RMM();
String source = "记录最佳前候选词列表";
String result = (source);
(result);
}
}
基于统计的中文分词算法
基本思想
选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词w i 的累计概率等于它的左相邻词w i-1 的累积概率乘以w i 自身的概率
具体算法
(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词w 1 ,w 2 ,…,w i ,w n ;
(2)计算每个候选词的概率值P(w i ),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词w n 是字串的尾词,且累计概率P’(w n )最大,则w n 是S的终点词;
(4)从w n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.
字典树
又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
字典树的代码实现
主要参考:blog.csdn.net/sadfishsc/article/details/9152647
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
/**
*
* @author quincy1994
*/
public class TireNode {
private String character; // 单个汉字
private int frequency = -1; // 词频, -1来区别某条路径上的字串是否是一个词组
private double antilog = -1; // 对数化的词频
private Map<String, TireNode> children; //下一个节点
public String getCharacter(){
return character;
}
public void setCharacter(String character){
this.character = character;
}
public int getFrequency(){
return frequency;
}
public void setFrequency(int frequency){
this.frequency = frequency;
}
public double getAntilog(){
return antilog;
}
public void setAntilog(double antilog){
this.antilog = antilog;
}
public void addChild(TireNode node){
if (children == null){
children = new HashMap<String, TireNode>();
}
if (!(())){
((), node);
}
}
public TireNode getChild(String ch){
if (children == null || ! (ch)){
return null;
}
return (ch);
}
public void removeChildren(String ch){
if (children == null || !(ch)){
return;
}
(ch);
}
}
算法实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import ;
import ;
import ;
import ;
import ;
import ;
/**
*
* @author quincy1994
*/
public class ChnSeq {
private TireNode tire = null;
public List<String> loadFile() throws FileNotFoundException, IOException {
//读取字典
List<String> lines = new ArrayList<String>();
String filename = "";
BufferedReader br = new BufferedReader(new FileReader(filename));
String tmp;
while ((tmp = ()) != null) {
(tmp);
}
();
return lines;
}
public void init() throws IOException {
List<String> lines = loadFile();
tire = new TireNode();
for (String line : lines) {
String[] tokens = (",");
String word = tokens[0];
int freq = (tokens[1]);
double antilog = (1+0.01/(tokens[2].replace("%", ""))) ;
//构建词典树
TireNode root = tire;
for (int i = 0; i < (); i++) {
String c = "" + (i);
TireNode node = (c);
if (node == null) {
node = new TireNode();
(c);
(node);
}
root = node;
}
(freq); //为每个词设立词频
(antilog); //为每个词设立逆文档频率
}
}
public TireNode getTire() {
return tire;
}
public TireNode getNodeByWord(String word) {
TireNode node = tire;
for (int i = 0; i < (); i++) {
String ch = (i) + "";
if (node == null) {
break;
} else {
node = (ch);
}
}
return node;
}
private class Segment {
public String word; //词
public String endChar; //结束词
public String lastChar; //前缀词
public double cost;
public final static String START_SIGN = "<< STARTING >>";
public final static String END_SIGN = "<< ENDING >>";
}
//寻找候选词
public List<Segment> preSegment(String sentence) {
List<Segment> segs = new ArrayList<Segment>();
//设置句子的开始标志
Segment terminal = new Segment();
= Segment.START_SIGN;
= Segment.START_SIGN;
= null;
(terminal);
for (int i = 0; i < (); i++) {
for (int j = i + 1; j <= (); j++) {
String word = (i, j);
TireNode tnode = this.getNodeByWord(word);
if (tnode == null) {
break;
}
if (() <= 0) {
continue;
}
Segment seg = new Segment();
= word;
= (() - 1, ());
if (i == 0) {
= Segment.START_SIGN;
} else {
= (i - 1, i);
}
= ();
(word + " " + +" " + ());
(seg);
}
}
//设置句子的结束标志
terminal = new Segment();
= Segment.END_SIGN;
= Segment.END_SIGN;
= (() - 1, ());
(terminal);
return segs;
}
public String dynamicSegment(List<Segment> segs) {
//基于动态规划的概率统计分词
final double INFINITE = 9999999;
if (segs == null || () == 0) {
("找不到候选词");
return null;
}
int n = (); //候选词的个数
//单个词
double[][] costs = new double[n][n];
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n; j++) {
String endChar = (i).endChar;
if (j == i && ((j).word)) {
costs[i][j] = (j).cost; //候选词j的概率
continue;
}
costs[i][j] = INFINITE;
}
}
//寻找前一个候选词
for (int i = 0; i < n - 1; i++) {
String endChar = (i).endChar;
for (int j = i + 1; j < n; j++) {
String lastChar = (j).lastChar;
if (lastChar != null && (endChar) &&( j- i < 4)) { //j前缀词不为空,同时j的前缀词等于i的后缀词,且j和i之间的间隔不超过4个候选词
costs[i][j] = (j).cost; //候选词j的概率
}
}
}
int sp = 0; //开始点
int fp = n - 1; //结束点
double[] dist = new double[n]; // 记录累计概率, n为候选词的个数
List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
List<Integer> list = new ArrayList<Integer>();
for (int i = 0; i < n; i++) {
dist[i] = costs[sp][i]; //i的累计概率的初始值为索引sp到索引i的词的概率
if (sp != i) {
(i); //记录候选词的索引位置
}
if (dist[i] < INFINITE) {
List<Integer> spa = new ArrayList<Integer>(); //如果索引sp到索引i构成一个词,则开启一条划分路径
(spa);
} else {
(null);
}
}
while (!()) {
//选切分点
Integer minIdx = (0);
(minIdx);
//判断minIdx是否为开头的候选词
if(dist[minIdx] == INFINITE){
continue;
}
//动态规划
for (int i = minIdx+1; i < n; i++) {
if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
dist[i] = dist[minIdx] + costs[minIdx][i];
List<Integer> tmp = new ArrayList<Integer>((minIdx));
(minIdx);
(i, tmp); //记录最佳前候选词列表
}
}
}
String result = "";
for (int i = 0; i < (fp).size(); i++) {
result += ((fp).get(i)).word + "/ ";
}
return result;
}
public String segment(String sentences) {
return dynamicSegment(preSegment(sentences));
}
public static void main(String[] args) throws ClassNotFoundException, IOException {
ChnSeq cs = new ChnSeq();
();
String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
String segs = (sentence);
(segs);
}
}
具体的代码和字典,可以访问:
github.com/Quincy1994/Segment