1.自定义Analyzer:
@Test public void t01() throws Exception { ArrayList<String> strings = new ArrayList<String>() { { this.add("小鬼子"); this.add("美国佬"); } }; Analyzer analyzer = new CustomStandardAnalyzer(strings); String content = "小鬼子 and 美国佬 are playing together!"; TokenStream tokenStream = analyzer.tokenStream("myfield", content); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { // 已经过滤掉自定义停用词 // 输出:playing together System.out.println(charTermAttribute.toString()); } tokenStream.end(); tokenStream.close(); analyzer.close(); } @Test public void t02() throws Exception { Analyzer analyzer = new SameWordAnalyzer(); String content = "这花美丽"; TokenStream tokenStream = analyzer.tokenStream("myfield", content); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { System.out.println(charTermAttribute.toString()); } tokenStream.end(); tokenStream.close(); analyzer.close(); }
2.自定义TokenFilter
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Stack; public class SameWordTokenFilter extends TokenFilter { private CharTermAttribute charTermAttribute; private PositionIncrementAttribute positionIncrementAttribute; private State state; private Stack<String> stack; public SameWordTokenFilter(TokenStream input) { super(input); this.stack = new Stack<>(); this.charTermAttribute = this.addAttribute(CharTermAttribute.class); this.positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class); this.stack = new Stack<>(); } @Override public final boolean incrementToken() throws IOException { while (this.stack.size() > 0) { this.restoreState(this.state); this.charTermAttribute.setEmpty(); this.charTermAttribute.append(this.stack.pop()); this.positionIncrementAttribute.setPositionIncrement(0); return true; } if (!this.input.incrementToken()) { return false; } String term = this.charTermAttribute.toString(); if (this.getSameWords(term)) { this.state = this.captureState(); } return true; } private boolean getSameWords(String name) { Map<String, String[]> map = new HashMap<>(); map.put("美", new String[]{"美丽", "好看"}); map.put("花", new String[]{"鲜花", "花朵"}); String[] words = map.get(name); if (words != null) { for (String word : words) { this.stack.push(word); } return true; } return false; } }
3.使用自定义Analyzer和自定义TokenFilter
ArrayList<String> strings = new ArrayList<String>() {{ this.add("小鬼子"); this.add("美国佬"); }}; Analyzer analyzer = new CustomStandardAnalyzer(strings); String content = "小鬼子 and 美国佬 are playing together!"; TokenStream tokenStream = analyzer.tokenStream("myfield", content); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { // 已经过滤掉自定义停用词 // 输出:playing together System.out.println(charTermAttribute.toString()); } tokenStream.end(); tokenStream.close(); analyzer.close();
4.代码解释,具体Analyzer和 TokenFilter之间的关联,用Eclipse的DEBUG功能,跟踪理解。