Token (语汇单元): the position increment is the only token metadata that is recorded in the index by default.

Porter stemming algorithm implementation, PorterStemmer.java:

/**
 * Stemmer, implementing the Porter Stemming Algorithm
 *
 * The Stemmer class transforms a word into its root form. The input
 * word can be provided a character at time (by calling add()), or at once
 * by calling one of the various stem(something) methods.
 */

Lucene 3.6.0, obtaining tokens:

// Consume the stream through the attribute API; "analyzer" is any Analyzer instance.
TokenStream tokenStream = analyzer.tokenStream("context", new StringReader("旧水泥袋"));
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    System.out.println(term + " [" + startOffset + "," + endOffset + "]");
}
tokenStream.end();
tokenStream.close();

Built-in analyzers:

WhitespaceAnalyzer: splits tokens at whitespace.
SimpleAnalyzer: splits text at non-letter characters and lowercases the tokens.
StopAnalyzer: splits text at non-letter characters, lowercases the tokens, then removes stop words.
StandardAnalyzer: tokenizes with a sophisticated grammar that recognizes e-mail addresses, acronyms, CJK (Chinese/Japanese/Korean) characters and alphanumerics; lowercases; removes stop words.

StopAnalyzer stop-word list: a custom stop-word set can be passed in through the constructor.

public final class StopAnalyzer extends StopwordAnalyzerBase {

  /** An unmodifiable set containing some common English words that are not usually useful for searching. */
  public static final Set<?> ENGLISH_STOP_WORDS_SET;

  static {
    final List<String> stopWords = Arrays.asList(
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    );
    final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, stopWords.size(), false);
    stopSet.addAll(stopWords);
    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
  }

  /** Builds an analyzer with the stop words from the given set.
   * @param matchVersion See <a href="#version">above</a>
   * @param stopWords Set of stop words */
  public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
  }

  // ... remaining constructors and createComponents() omitted
}

StandardAnalyzer: its tokenizer is generated from a grammar (JavaCC in early Lucene releases, JFlex in 3.x); a stop-word set can likewise be passed through the constructor, so it can provide the same stop-word filtering as StopAnalyzer.

PerFieldAnalyzerWrapper.java applies a different analyzer to specific fields:

Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
analyzerPerField.put("firstname", new KeywordAnalyzer());
analyzerPerField.put("lastname", new KeywordAnalyzer());

PerFieldAnalyzerWrapper aWrapper =
    new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_36), analyzerPerField);
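To show where the wrapper plugs in, here is a minimal indexing sketch against the Lucene 3.6 API. The RAMDirectory and the document field names and values are illustrative assumptions, not part of the notes above; the point is that the wrapper is handed to IndexWriterConfig exactly like a single analyzer would be.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class PerFieldIndexingSketch {
    public static void main(String[] args) throws Exception {
        // Name fields are indexed verbatim as single tokens; all other fields fall back to StandardAnalyzer.
        Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
        analyzerPerField.put("firstname", new KeywordAnalyzer());
        analyzerPerField.put("lastname", new KeywordAnalyzer());
        Analyzer wrapper = new PerFieldAnalyzerWrapper(
                new StandardAnalyzer(Version.LUCENE_36), analyzerPerField);

        // The wrapper is used wherever a plain Analyzer is expected.
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_36, wrapper));

        Document doc = new Document();   // hypothetical sample document
        doc.add(new Field("firstname", "Ann-Marie", Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("lastname", "O'Brien", Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("body", "an old cement bag", Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
    }
}

Because KeywordAnalyzer emits the whole field value as a single token, the name fields can be matched exactly with a TermQuery, while the body field is searched on the terms StandardAnalyzer produces.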
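As a quick way to see the differences described in the analyzer list above, the sketch below (again against the Lucene 3.6 API, with an arbitrary sample sentence of my own) runs the same text through the four built-in analyzers and prints the terms each one produces, reusing the attribute loop shown earlier.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AnalyzerComparison {
    public static void main(String[] args) throws Exception {
        String text = "The Quick Brown FOX jumped over 2 lazy dogs.";  // arbitrary sample text
        Analyzer[] analyzers = {
                new WhitespaceAnalyzer(Version.LUCENE_36),
                new SimpleAnalyzer(Version.LUCENE_36),
                new StopAnalyzer(Version.LUCENE_36),
                new StandardAnalyzer(Version.LUCENE_36)
        };
        for (Analyzer analyzer : analyzers) {
            System.out.print(analyzer.getClass().getSimpleName() + ": ");
            TokenStream ts = analyzer.tokenStream("f", new StringReader(text));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print("[" + term.toString() + "] ");
            }
            ts.end();
            ts.close();
            System.out.println();
        }
    }
}

The output should show WhitespaceAnalyzer keeping the original case and trailing punctuation, SimpleAnalyzer lowercasing and dropping the digit, and StopAnalyzer additionally removing stop words such as "the".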