Lucene: defining a custom tokenizer that splits text into single characters

Date: 2024-01-16 12:27:08

Problem description: split a sentence into individual characters and discard the spaces.

package com.mylucene;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

public class SpiltChar extends Tokenizer {

    public SpiltChar(AttributeFactory factory, Reader input) {
        super(factory, input);
    }

    public SpiltChar(Reader input) {
        super(input);
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private static final int MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;
    private int start;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    // Append one character to the current token buffer.
    private final void push(char c) {
        if (length == 0) start = offset - 1;          // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it
    }

    // Emit the buffered token, if there is one.
    private final boolean flush() {
        if (length > 0) {
            termAtt.copyBuffer(buffer, 0, length);
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
            return true;
        } else {
            return false;
        }
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        length = 0;
        start = offset;
        while (true) {
            final char c;
            offset++;
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            if (dataLen == -1) {
                offset--;
                return flush();
            } else {
                c = ioBuffer[bufferIndex++];
            }
            switch (Character.getType(c)) {
                // Note: only digits and letters are kept; all of these cases fall
                // through to the single-character handling below.
                case Character.DECIMAL_DIGIT_NUMBER:
                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                case Character.OTHER_LETTER:
                    if (length > 0) {
                        // A token is already buffered: push this char back and emit it.
                        bufferIndex--;
                        offset--;
                        return flush();
                    }
                    push(c);
                    return flush(); // emit every character as its own token
                default:
                    // Whitespace and punctuation end a token and are discarded.
                    if (length > 0) return flush();
                    break;
            }
        }
    }

    @Override
    public final void end() {
        // set final offset
        final int finalOffset = correctOffset(offset);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = bufferIndex = dataLen = 0;
    }
}

Define the custom Analyzer that wraps this tokenizer:

package com.mylucene;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Single-character analyzer built on the SpiltChar tokenizer.
 **/
public class SpiltCharAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer token = new SpiltChar(reader);
        return new TokenStreamComponents(token);
    }
}
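
With the Analyzer in place, it can be plugged into an IndexWriter like any other analyzer, so every kept character is indexed as its own term. The following is a minimal sketch under the assumption of Lucene 4.x; Version.LUCENE_40, RAMDirectory, the class name and the field name are illustrative choices, not part of the original code.

package com.mylucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// Hypothetical indexing demo; adjust Version to the Lucene 4.x release you actually use.
public class SpiltCharIndexDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new SpiltCharAnalyzer();
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        doc.add(new TextField("content", "hello 世界", Field.Store.YES));
        writer.addDocument(doc);   // indexed as the terms h / e / l / l / o / 世 / 界
        writer.close();
        dir.close();
    }
}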