最近参与了一个小项目,需要编写一个针对英文单词的词干提取(stemming)算法。
1. 最为常见的词干提取算法是 The English (Porter2) stemming algorithm:http://snowball.tartarus.org/algorithms/english/stemmer.html
// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.Among; /**
* This class was automatically generated by a Snowball to Java compiler
* It implements the stemming algorithm defined by a snowball script.
*/ public class englishStemmer extends org.tartarus.snowball.SnowballStemmer { private static final long serialVersionUID = 1L; private final static englishStemmer methodObject = new englishStemmer (); private final static Among a_0[] = {
new Among ( "arsen", -1, -1, "", methodObject ),
new Among ( "commun", -1, -1, "", methodObject ),
new Among ( "gener", -1, -1, "", methodObject )
}; private final static Among a_1[] = {
new Among ( "'", -1, 1, "", methodObject ),
new Among ( "'s'", 0, 1, "", methodObject ),
new Among ( "'s", -1, 1, "", methodObject )
}; private final static Among a_2[] = {
new Among ( "ied", -1, 2, "", methodObject ),
new Among ( "s", -1, 3, "", methodObject ),
new Among ( "ies", 1, 2, "", methodObject ),
new Among ( "sses", 1, 1, "", methodObject ),
new Among ( "ss", 1, -1, "", methodObject ),
new Among ( "us", 1, -1, "", methodObject )
}; private final static Among a_3[] = {
new Among ( "", -1, 3, "", methodObject ),
new Among ( "bb", 0, 2, "", methodObject ),
new Among ( "dd", 0, 2, "", methodObject ),
new Among ( "ff", 0, 2, "", methodObject ),
new Among ( "gg", 0, 2, "", methodObject ),
new Among ( "bl", 0, 1, "", methodObject ),
new Among ( "mm", 0, 2, "", methodObject ),
new Among ( "nn", 0, 2, "", methodObject ),
new Among ( "pp", 0, 2, "", methodObject ),
new Among ( "rr", 0, 2, "", methodObject ),
new Among ( "at", 0, 1, "", methodObject ),
new Among ( "tt", 0, 2, "", methodObject ),
new Among ( "iz", 0, 1, "", methodObject )
}; private final static Among a_4[] = {
new Among ( "ed", -1, 2, "", methodObject ),
new Among ( "eed", 0, 1, "", methodObject ),
new Among ( "ing", -1, 2, "", methodObject ),
new Among ( "edly", -1, 2, "", methodObject ),
new Among ( "eedly", 3, 1, "", methodObject ),
new Among ( "ingly", -1, 2, "", methodObject )
}; private final static Among a_5[] = {
new Among ( "anci", -1, 3, "", methodObject ),
new Among ( "enci", -1, 2, "", methodObject ),
new Among ( "ogi", -1, 13, "", methodObject ),
new Among ( "li", -1, 16, "", methodObject ),
new Among ( "bli", 3, 12, "", methodObject ),
new Among ( "abli", 4, 4, "", methodObject ),
new Among ( "alli", 3, 8, "", methodObject ),
new Among ( "fulli", 3, 14, "", methodObject ),
new Among ( "lessli", 3, 15, "", methodObject ),
new Among ( "ousli", 3, 10, "", methodObject ),
new Among ( "entli", 3, 5, "", methodObject ),
new Among ( "aliti", -1, 8, "", methodObject ),
new Among ( "biliti", -1, 12, "", methodObject ),
new Among ( "iviti", -1, 11, "", methodObject ),
new Among ( "tional", -1, 1, "", methodObject ),
new Among ( "ational", 14, 7, "", methodObject ),
new Among ( "alism", -1, 8, "", methodObject ),
new Among ( "ation", -1, 7, "", methodObject ),
new Among ( "ization", 17, 6, "", methodObject ),
new Among ( "izer", -1, 6, "", methodObject ),
new Among ( "ator", -1, 7, "", methodObject ),
new Among ( "iveness", -1, 11, "", methodObject ),
new Among ( "fulness", -1, 9, "", methodObject ),
new Among ( "ousness", -1, 10, "", methodObject )
}; private final static Among a_6[] = {
new Among ( "icate", -1, 4, "", methodObject ),
new Among ( "ative", -1, 6, "", methodObject ),
new Among ( "alize", -1, 3, "", methodObject ),
new Among ( "iciti", -1, 4, "", methodObject ),
new Among ( "ical", -1, 4, "", methodObject ),
new Among ( "tional", -1, 1, "", methodObject ),
new Among ( "ational", 5, 2, "", methodObject ),
new Among ( "ful", -1, 5, "", methodObject ),
new Among ( "ness", -1, 5, "", methodObject )
}; private final static Among a_7[] = {
new Among ( "ic", -1, 1, "", methodObject ),
new Among ( "ance", -1, 1, "", methodObject ),
new Among ( "ence", -1, 1, "", methodObject ),
new Among ( "able", -1, 1, "", methodObject ),
new Among ( "ible", -1, 1, "", methodObject ),
new Among ( "ate", -1, 1, "", methodObject ),
new Among ( "ive", -1, 1, "", methodObject ),
new Among ( "ize", -1, 1, "", methodObject ),
new Among ( "iti", -1, 1, "", methodObject ),
new Among ( "al", -1, 1, "", methodObject ),
new Among ( "ism", -1, 1, "", methodObject ),
new Among ( "ion", -1, 2, "", methodObject ),
new Among ( "er", -1, 1, "", methodObject ),
new Among ( "ous", -1, 1, "", methodObject ),
new Among ( "ant", -1, 1, "", methodObject ),
new Among ( "ent", -1, 1, "", methodObject ),
new Among ( "ment", 15, 1, "", methodObject ),
new Among ( "ement", 16, 1, "", methodObject )
}; private final static Among a_8[] = {
new Among ( "e", -1, 1, "", methodObject ),
new Among ( "l", -1, 2, "", methodObject )
}; private final static Among a_9[] = {
new Among ( "succeed", -1, -1, "", methodObject ),
new Among ( "proceed", -1, -1, "", methodObject ),
new Among ( "exceed", -1, -1, "", methodObject ),
new Among ( "canning", -1, -1, "", methodObject ),
new Among ( "inning", -1, -1, "", methodObject ),
new Among ( "earring", -1, -1, "", methodObject ),
new Among ( "herring", -1, -1, "", methodObject ),
new Among ( "outing", -1, -1, "", methodObject )
}; private final static Among a_10[] = {
new Among ( "andes", -1, -1, "", methodObject ),
new Among ( "atlas", -1, -1, "", methodObject ),
new Among ( "bias", -1, -1, "", methodObject ),
new Among ( "cosmos", -1, -1, "", methodObject ),
new Among ( "dying", -1, 3, "", methodObject ),
new Among ( "early", -1, 9, "", methodObject ),
new Among ( "gently", -1, 7, "", methodObject ),
new Among ( "howe", -1, -1, "", methodObject ),
new Among ( "idly", -1, 6, "", methodObject ),
new Among ( "lying", -1, 4, "", methodObject ),
new Among ( "news", -1, -1, "", methodObject ),
new Among ( "only", -1, 10, "", methodObject ),
new Among ( "singly", -1, 11, "", methodObject ),
new Among ( "skies", -1, 2, "", methodObject ),
new Among ( "skis", -1, 1, "", methodObject ),
new Among ( "sky", -1, -1, "", methodObject ),
new Among ( "tying", -1, 5, "", methodObject ),
new Among ( "ugly", -1, 8, "", methodObject )
}; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WXY[] = {1, 17, 65, 208, 1 }; private static final char g_valid_LI[] = {55, 141, 2 }; private boolean B_Y_found;
private int I_p2;
private int I_p1; private void copy_from(englishStemmer other) {
B_Y_found = other.B_Y_found;
I_p2 = other.I_p2;
I_p1 = other.I_p1;
super.copy_from(other);
} private boolean r_prelude() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
// (, line 25
// unset Y_found, line 26
B_Y_found = false;
// do, line 27
v_1 = cursor;
lab0: do {
// (, line 27
// [, line 27
bra = cursor;
// literal, line 27
if (!(eq_s(1, "'")))
{
break lab0;
}
// ], line 27
ket = cursor;
// delete, line 27
slice_del();
} while (false);
cursor = v_1;
// do, line 28
v_2 = cursor;
lab1: do {
// (, line 28
// [, line 28
bra = cursor;
// literal, line 28
if (!(eq_s(1, "y")))
{
break lab1;
}
// ], line 28
ket = cursor;
// <-, line 28
slice_from("Y");
// set Y_found, line 28
B_Y_found = true;
} while (false);
cursor = v_2;
// do, line 29
v_3 = cursor;
lab2: do {
// repeat, line 29
replab3: while(true)
{
v_4 = cursor;
lab4: do {
// (, line 29
// goto, line 29
golab5: while(true)
{
v_5 = cursor;
lab6: do {
// (, line 29
if (!(in_grouping(g_v, 97, 121)))
{
break lab6;
}
// [, line 29
bra = cursor;
// literal, line 29
if (!(eq_s(1, "y")))
{
break lab6;
}
// ], line 29
ket = cursor;
cursor = v_5;
break golab5;
} while (false);
cursor = v_5;
if (cursor >= limit)
{
break lab4;
}
cursor++;
}
// <-, line 29
slice_from("Y");
// set Y_found, line 29
B_Y_found = true;
continue replab3;
} while (false);
cursor = v_4;
break replab3;
}
} while (false);
cursor = v_3;
return true;
} private boolean r_mark_regions() {
int v_1;
int v_2;
// (, line 32
I_p1 = limit;
I_p2 = limit;
// do, line 35
v_1 = cursor;
lab0: do {
// (, line 35
// or, line 41
lab1: do {
v_2 = cursor;
lab2: do {
// among, line 36
if (find_among(a_0, 3) == 0)
{
break lab2;
}
break lab1;
} while (false);
cursor = v_2;
// (, line 41
// gopast, line 41
golab3: while(true)
{
lab4: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab4;
}
break golab3;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 41
golab5: while(true)
{
lab6: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
} while (false);
// setmark p1, line 42
I_p1 = cursor;
// gopast, line 43
golab7: while(true)
{
lab8: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 43
golab9: while(true)
{
lab10: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab10;
}
break golab9;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// setmark p2, line 43
I_p2 = cursor;
} while (false);
cursor = v_1;
return true;
} private boolean r_shortv() {
int v_1;
// (, line 49
// or, line 51
lab0: do {
v_1 = limit - cursor;
lab1: do {
// (, line 50
if (!(out_grouping_b(g_v_WXY, 89, 121)))
{
break lab1;
}
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab1;
}
if (!(out_grouping_b(g_v, 97, 121)))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// (, line 52
if (!(out_grouping_b(g_v, 97, 121)))
{
return false;
}
if (!(in_grouping_b(g_v, 97, 121)))
{
return false;
}
// atlimit, line 52
if (cursor > limit_backward)
{
return false;
}
} while (false);
return true;
} private boolean r_R1() {
if (!(I_p1 <= cursor))
{
return false;
}
return true;
} private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
} private boolean r_Step_1a() {
int among_var;
int v_1;
int v_2;
// (, line 58
// try, line 59
v_1 = limit - cursor;
lab0: do {
// (, line 59
// [, line 60
ket = cursor;
// substring, line 60
among_var = find_among_b(a_1, 3);
if (among_var == 0)
{
cursor = limit - v_1;
break lab0;
}
// ], line 60
bra = cursor;
switch(among_var) {
case 0:
cursor = limit - v_1;
break lab0;
case 1:
// (, line 62
// delete, line 62
slice_del();
break;
}
} while (false);
// [, line 65
ket = cursor;
// substring, line 65
among_var = find_among_b(a_2, 6);
if (among_var == 0)
{
return false;
}
// ], line 65
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 66
// <-, line 66
slice_from("ss");
break;
case 2:
// (, line 68
// or, line 68
lab1: do {
v_2 = limit - cursor;
lab2: do {
// (, line 68
// hop, line 68
{
int c = cursor - 2;
if (limit_backward > c || c > limit)
{
break lab2;
}
cursor = c;
}
// <-, line 68
slice_from("i");
break lab1;
} while (false);
cursor = limit - v_2;
// <-, line 68
slice_from("ie");
} while (false);
break;
case 3:
// (, line 69
// next, line 69
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// gopast, line 69
golab3: while(true)
{
lab4: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab4;
}
break golab3;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
// delete, line 69
slice_del();
break;
}
return true;
} private boolean r_Step_1b() {
int among_var;
int v_1;
int v_3;
int v_4;
// (, line 74
// [, line 75
ket = cursor;
// substring, line 75
among_var = find_among_b(a_4, 6);
if (among_var == 0)
{
return false;
}
// ], line 75
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 77
// call R1, line 77
if (!r_R1())
{
return false;
}
// <-, line 77
slice_from("ee");
break;
case 2:
// (, line 79
// test, line 80
v_1 = limit - cursor;
// gopast, line 80
golab0: while(true)
{
lab1: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
cursor = limit - v_1;
// delete, line 80
slice_del();
// test, line 81
v_3 = limit - cursor;
// substring, line 81
among_var = find_among_b(a_3, 13);
if (among_var == 0)
{
return false;
}
cursor = limit - v_3;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 83
// <+, line 83
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
case 2:
// (, line 86
// [, line 86
ket = cursor;
// next, line 86
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 86
bra = cursor;
// delete, line 86
slice_del();
break;
case 3:
// (, line 87
// atmark, line 87
if (cursor != I_p1)
{
return false;
}
// test, line 87
v_4 = limit - cursor;
// call shortv, line 87
if (!r_shortv())
{
return false;
}
cursor = limit - v_4;
// <+, line 87
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
}
break;
}
return true;
} private boolean r_Step_1c() {
int v_1;
int v_2;
// (, line 93
// [, line 94
ket = cursor;
// or, line 94
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 94
if (!(eq_s_b(1, "y")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 94
if (!(eq_s_b(1, "Y")))
{
return false;
}
} while (false);
// ], line 94
bra = cursor;
if (!(out_grouping_b(g_v, 97, 121)))
{
return false;
}
// not, line 95
{
v_2 = limit - cursor;
lab2: do {
// atlimit, line 95
if (cursor > limit_backward)
{
break lab2;
}
return false;
} while (false);
cursor = limit - v_2;
}
// <-, line 96
slice_from("i");
return true;
} private boolean r_Step_2() {
int among_var;
// (, line 99
// [, line 100
ket = cursor;
// substring, line 100
among_var = find_among_b(a_5, 24);
if (among_var == 0)
{
return false;
}
// ], line 100
bra = cursor;
// call R1, line 100
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 101
// <-, line 101
slice_from("tion");
break;
case 2:
// (, line 102
// <-, line 102
slice_from("ence");
break;
case 3:
// (, line 103
// <-, line 103
slice_from("ance");
break;
case 4:
// (, line 104
// <-, line 104
slice_from("able");
break;
case 5:
// (, line 105
// <-, line 105
slice_from("ent");
break;
case 6:
// (, line 107
// <-, line 107
slice_from("ize");
break;
case 7:
// (, line 109
// <-, line 109
slice_from("ate");
break;
case 8:
// (, line 111
// <-, line 111
slice_from("al");
break;
case 9:
// (, line 112
// <-, line 112
slice_from("ful");
break;
case 10:
// (, line 114
// <-, line 114
slice_from("ous");
break;
case 11:
// (, line 116
// <-, line 116
slice_from("ive");
break;
case 12:
// (, line 118
// <-, line 118
slice_from("ble");
break;
case 13:
// (, line 119
// literal, line 119
if (!(eq_s_b(1, "l")))
{
return false;
}
// <-, line 119
slice_from("og");
break;
case 14:
// (, line 120
// <-, line 120
slice_from("ful");
break;
case 15:
// (, line 121
// <-, line 121
slice_from("less");
break;
case 16:
// (, line 122
if (!(in_grouping_b(g_valid_LI, 99, 116)))
{
return false;
}
// delete, line 122
slice_del();
break;
}
return true;
} private boolean r_Step_3() {
int among_var;
// (, line 126
// [, line 127
ket = cursor;
// substring, line 127
among_var = find_among_b(a_6, 9);
if (among_var == 0)
{
return false;
}
// ], line 127
bra = cursor;
// call R1, line 127
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 128
// <-, line 128
slice_from("tion");
break;
case 2:
// (, line 129
// <-, line 129
slice_from("ate");
break;
case 3:
// (, line 130
// <-, line 130
slice_from("al");
break;
case 4:
// (, line 132
// <-, line 132
slice_from("ic");
break;
case 5:
// (, line 134
// delete, line 134
slice_del();
break;
case 6:
// (, line 136
// call R2, line 136
if (!r_R2())
{
return false;
}
// delete, line 136
slice_del();
break;
}
return true;
} private boolean r_Step_4() {
int among_var;
int v_1;
// (, line 140
// [, line 141
ket = cursor;
// substring, line 141
among_var = find_among_b(a_7, 18);
if (among_var == 0)
{
return false;
}
// ], line 141
bra = cursor;
// call R2, line 141
if (!r_R2())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 144
// delete, line 144
slice_del();
break;
case 2:
// (, line 145
// or, line 145
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 145
if (!(eq_s_b(1, "s")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 145
if (!(eq_s_b(1, "t")))
{
return false;
}
} while (false);
// delete, line 145
slice_del();
break;
}
return true;
} private boolean r_Step_5() {
int among_var;
int v_1;
int v_2;
// (, line 149
// [, line 150
ket = cursor;
// substring, line 150
among_var = find_among_b(a_8, 2);
if (among_var == 0)
{
return false;
}
// ], line 150
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 151
// or, line 151
lab0: do {
v_1 = limit - cursor;
lab1: do {
// call R2, line 151
if (!r_R2())
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// (, line 151
// call R1, line 151
if (!r_R1())
{
return false;
}
// not, line 151
{
v_2 = limit - cursor;
lab2: do {
// call shortv, line 151
if (!r_shortv())
{
break lab2;
}
return false;
} while (false);
cursor = limit - v_2;
}
} while (false);
// delete, line 151
slice_del();
break;
case 2:
// (, line 152
// call R2, line 152
if (!r_R2())
{
return false;
}
// literal, line 152
if (!(eq_s_b(1, "l")))
{
return false;
}
// delete, line 152
slice_del();
break;
}
return true;
} private boolean r_exception2() {
// (, line 156
// [, line 158
ket = cursor;
// substring, line 158
if (find_among_b(a_9, 8) == 0)
{
return false;
}
// ], line 158
bra = cursor;
// atlimit, line 158
if (cursor > limit_backward)
{
return false;
}
return true;
} private boolean r_exception1() {
int among_var;
// (, line 168
// [, line 170
bra = cursor;
// substring, line 170
among_var = find_among(a_10, 18);
if (among_var == 0)
{
return false;
}
// ], line 170
ket = cursor;
// atlimit, line 170
if (cursor < limit)
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 174
// <-, line 174
slice_from("ski");
break;
case 2:
// (, line 175
// <-, line 175
slice_from("sky");
break;
case 3:
// (, line 176
// <-, line 176
slice_from("die");
break;
case 4:
// (, line 177
// <-, line 177
slice_from("lie");
break;
case 5:
// (, line 178
// <-, line 178
slice_from("tie");
break;
case 6:
// (, line 182
// <-, line 182
slice_from("idl");
break;
case 7:
// (, line 183
// <-, line 183
slice_from("gentl");
break;
case 8:
// (, line 184
// <-, line 184
slice_from("ugli");
break;
case 9:
// (, line 185
// <-, line 185
slice_from("earli");
break;
case 10:
// (, line 186
// <-, line 186
slice_from("onli");
break;
case 11:
// (, line 187
// <-, line 187
slice_from("singl");
break;
}
return true;
} private boolean r_postlude() {
int v_1;
int v_2;
// (, line 203
// Boolean test Y_found, line 203
if (!(B_Y_found))
{
return false;
}
// repeat, line 203
replab0: while(true)
{
v_1 = cursor;
lab1: do {
// (, line 203
// goto, line 203
golab2: while(true)
{
v_2 = cursor;
lab3: do {
// (, line 203
// [, line 203
bra = cursor;
// literal, line 203
if (!(eq_s(1, "Y")))
{
break lab3;
}
// ], line 203
ket = cursor;
cursor = v_2;
break golab2;
} while (false);
cursor = v_2;
if (cursor >= limit)
{
break lab1;
}
cursor++;
}
// <-, line 203
slice_from("y");
continue replab0;
} while (false);
cursor = v_1;
break replab0;
}
return true;
} public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
int v_10;
int v_11;
int v_12;
int v_13;
// (, line 205
// or, line 207
lab0: do {
v_1 = cursor;
lab1: do {
// call exception1, line 207
if (!r_exception1())
{
break lab1;
}
break lab0;
} while (false);
cursor = v_1;
lab2: do {
// not, line 208
{
v_2 = cursor;
lab3: do {
// hop, line 208
{
int c = cursor + 3;
if (0 > c || c > limit)
{
break lab3;
}
cursor = c;
}
break lab2;
} while (false);
cursor = v_2;
}
break lab0;
} while (false);
cursor = v_1;
// (, line 208
// do, line 209
v_3 = cursor;
lab4: do {
// call prelude, line 209
if (!r_prelude())
{
break lab4;
}
} while (false);
cursor = v_3;
// do, line 210
v_4 = cursor;
lab5: do {
// call mark_regions, line 210
if (!r_mark_regions())
{
break lab5;
}
} while (false);
cursor = v_4;
// backwards, line 211
limit_backward = cursor; cursor = limit;
// (, line 211
// do, line 213
v_5 = limit - cursor;
lab6: do {
// call Step_1a, line 213
if (!r_Step_1a())
{
break lab6;
}
} while (false);
cursor = limit - v_5;
// or, line 215
lab7: do {
v_6 = limit - cursor;
lab8: do {
// call exception2, line 215
if (!r_exception2())
{
break lab8;
}
break lab7;
} while (false);
cursor = limit - v_6;
// (, line 215
// do, line 217
v_7 = limit - cursor;
lab9: do {
// call Step_1b, line 217
if (!r_Step_1b())
{
break lab9;
}
} while (false);
cursor = limit - v_7;
// do, line 218
v_8 = limit - cursor;
lab10: do {
// call Step_1c, line 218
if (!r_Step_1c())
{
break lab10;
}
} while (false);
cursor = limit - v_8;
// do, line 220
v_9 = limit - cursor;
lab11: do {
// call Step_2, line 220
if (!r_Step_2())
{
break lab11;
}
} while (false);
cursor = limit - v_9;
// do, line 221
v_10 = limit - cursor;
lab12: do {
// call Step_3, line 221
if (!r_Step_3())
{
break lab12;
}
} while (false);
cursor = limit - v_10;
// do, line 222
v_11 = limit - cursor;
lab13: do {
// call Step_4, line 222
if (!r_Step_4())
{
break lab13;
}
} while (false);
cursor = limit - v_11;
// do, line 224
v_12 = limit - cursor;
lab14: do {
// call Step_5, line 224
if (!r_Step_5())
{
break lab14;
}
} while (false);
cursor = limit - v_12;
} while (false);
cursor = limit_backward; // do, line 227
v_13 = cursor;
lab15: do {
// call postlude, line 227
if (!r_postlude())
{
break lab15;
}
} while (false);
cursor = v_13;
} while (false);
return true;
} public boolean equals( Object o ) {
return o instanceof englishStemmer;
} public int hashCode() {
return englishStemmer.class.getName().hashCode();
} }
porter2 stemming algorithm
然而,Porter stemming 仅仅是一种基于后缀的词干提取技术,它只定义了一些基本的后缀规则,能识别出 "books"->"book" 等规则变化;而对于 "bought"->"buy"、"brought"->"bring" 等不规则形式,它并不能识别出来。
2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)
后来发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是它的 stem 类,能提取出单词的词干。
它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。
// Dragon Toolkit usage: build an EngLemmatiser over the "lemmatiser"
// dictionary path and print the lemma of one inflected word.
// NOTE(review): the meaning of the two boolean constructor flags is not
// visible here -- check the EngLemmatiser documentation.
String dictionaryDir = "lemmatiser";
EngLemmatiser engLemmatiser = new EngLemmatiser(dictionaryDir, false, true);

String inflected = "brought";
System.out.println(engLemmatiser.lemmatize(inflected));
然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。
3. Stanford CoreNLP
后来发现了斯坦福大学的 NLP 工具 Stanford CoreNLP,它提取词干的方法是:对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能很好地解决词干提取问题,准确率很高;虽然对地名、人名等专有名词识别不出来,但已能满足基本需求。
// Stanford CoreNLP usage: stem a single word with Morphology and print it.
Morphology morphology = new Morphology();
String input = "magnificus";
String stemmed = morphology.stem(input);
System.out.println(stemmed);