Lucene.Net和盘古分词应用

时间:2021-07-23 03:14:17

Lucene.Net.dll:用做全文索引

PanGu.dll(盘古分词):作为中文分词的组件

大致原理:

1.Lucene先根据PanGu将需要搜索的内容分隔、分词,然后根据分词的结果,做一个索引页。

2.搜索的时候,直接从索引页里面进行查找。

 

直接上代码:

分词演示代码:

Lucene.Net和盘古分词应用Lucene.Net和盘古分词应用
 /// <summary>
/// Tokenizer demo: splits the text entered in <c>txtString</c> with the PanGu
/// analyzer and lists every resulting word in <c>ListBox1</c>.
/// </summary>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    ListBox1.Items.Clear();

    // Per the original author: the standard analyzer only works for English, not Chinese.
    // Analyzer analyzer = new StandardAnalyzer();

    // PanGu word segmentation.
    Analyzer panGu = new PanGuAnalyzer();
    StringReader input = new StringReader(txtString.Text);
    TokenStream words = panGu.TokenStream("", input);

    // Next() hands back the following token; the loop stops once it returns null.
    for (Lucene.Net.Analysis.Token current = words.Next(); current != null; current = words.Next())
    {
        // TermText() is the segmented word itself.
        ListBox1.Items.Add(current.TermText());
    }
}
View Code

 

 

新建索引代码:演示了两种读取数据的方式

一:文本文件的查找

Lucene.Net和盘古分词应用Lucene.Net和盘古分词应用
/// <summary>
/// Builds the index under <c>C:\index</c> from the text files 1000.txt through
/// 1099.txt, storing each file as one document with a <c>number</c> and a
/// <c>body</c> field, then registers a client-side alert.
/// </summary>
/// <remarks>
/// The writer and the directory are closed in <c>finally</c> blocks so that a
/// failure while reading a file or adding a document does not leave them open.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // The path must match the casing of the folder on disk, otherwise an error is reported.
    string indexPath = @"C:\index";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool isUpdate = IndexReader.IndexExists(directory);

        // Working assumption: only one piece of code touches the index at a time.
        // If the index directory is still locked (for example the program crashed
        // while indexing), unlock it first.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // IndexWriter writes data into the index; "create" is true only when no index exists yet.
        // NOTE(review): when the index already exists documents are appended, so running
        // this handler twice presumably indexes the same files twice - confirm intent.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            for (int i = 1000; i < 1100; i++)
            {
                string txt = System.IO.File.ReadAllText(@"D:\net\net\代码\搜索及分词\文章\" + i + ".txt");

                // A Document is comparable to one row of a table.
                Document document = new Document();
                document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("body", txt, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document);
            }
        }
        finally
        {
            // Do not forget Close, otherwise the indexed content cannot be found by searches.
            writer.Close();
        }
    }
    finally
    {
        directory.Close();
    }

    this.ClientScript.RegisterStartupScript(typeof(indexPage),
        "alert", "alert('创建索引完成')", true);
}
View Code

二:数据库里面查找数据

Lucene.Net和盘古分词应用Lucene.Net和盘古分词应用
 /// <summary>
/// Builds the index under <c>index1</c> from the rows returned by <c>GetData()</c>,
/// storing each row as one document with <c>ID</c>, <c>Title</c> and <c>Contents</c>
/// fields, then registers a client-side alert.
/// </summary>
/// <remarks>
/// Rows are loaded before the index is opened so a database failure never happens
/// while the writer is open. The writer and the directory are closed in
/// <c>finally</c> blocks so an exception while indexing does not leave them open.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button3_Click(object sender, EventArgs e)
{
    List<Writings> list = GetData();

    // The path must match the casing of the folder on disk, otherwise an error is reported.
    string indexPath = @"D:\net\net\代码\搜索及分词\index1";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool isUpdate = IndexReader.IndexExists(directory);

        // Working assumption: only one piece of code touches the index at a time.
        // If the index directory is still locked (for example the program crashed
        // while indexing), unlock it first.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // IndexWriter writes data into the index; "create" is true only when no index exists yet.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            foreach (Writings item in list)
            {
                // A Document is comparable to one row of a table.
                Document document = new Document();
                document.Add(new Field("ID", item.ID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                // A null column value is indexed as an empty string instead of
                // being passed to the Field constructor as null.
                document.Add(new Field("Title", item.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("Contents", item.Contents ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document);
            }
        }
        finally
        {
            // Do not forget Close, otherwise the indexed content cannot be found by searches.
            writer.Close();
        }
    }
    finally
    {
        directory.Close();
    }

    this.ClientScript.RegisterStartupScript(typeof(indexPage),
        "alert", "alert('创建索引完成')", true);
}

/// <summary>
/// Loads every row of <c>dbo.Writings</c> and converts the rows to
/// <see cref="Writings"/> objects by serializing the filled table to JSON and
/// deserializing that JSON into a list.
/// </summary>
/// <returns>The deserialized list of <see cref="Writings"/>.</returns>
private List<Writings> GetData()
{
    // NOTE(review): credentials are hard-coded in source; consider moving the
    // connection string to configuration.
    string conn = "server=.;user id=sa; pwd=123; database=SharesTradeNew";
    string sql = "SELECT * FROM dbo.Writings";

    // Both the adapter and the table are disposable; release them deterministically.
    using (SqlDataAdapter da = new SqlDataAdapter(sql, conn))
    using (DataTable dt = new DataTable())
    {
        da.Fill(dt);
        string json = Newtonsoft.Json.JsonConvert.SerializeObject(dt);
        return Newtonsoft.Json.JsonConvert.DeserializeObject<List<Writings>>(json);
    }
}
}

/// <summary>
/// Plain data holder with the three values that are indexed and displayed:
/// an identifier, a title and the contents.
/// </summary>
public class Writings
{
    /// <summary>Identifier; indexed as the "ID" field.</summary>
    public int ID { get; set; }

    /// <summary>Title text; indexed as the "Title" field.</summary>
    public string Title { get; set; }

    /// <summary>Body text; indexed as the "Contents" field.</summary>
    public string Contents { get; set; }
}
View Code

 

通过索引查找数据:

对应一:

Lucene.Net和盘古分词应用Lucene.Net和盘古分词应用
/// <summary>
/// Searches the index under <c>c:\index</c> for the space-separated keywords typed
/// into <c>TextBox1</c> (for example "计算机 专业") and binds the matching documents
/// to <c>Repeater1</c>.
/// </summary>
/// <remarks>
/// The searcher, reader and directory are closed in a <c>finally</c> block so they
/// are released even when the search throws. Repeated or leading/trailing spaces
/// in the input are ignored so they do not add empty terms to the phrase query.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    string kw = TextBox1.Text;
    List<SearchResult> list = new List<SearchResult>();

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"c:\index"), new NoLockFactory());
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(directory, true);
        searcher = new IndexSearcher(reader);

        // Query condition. For now the user does the word segmentation:
        // every space-separated piece is treated as one term.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            query.Add(new Term("body", word));
        }

        // Roughly: where Contains("body", "计算机") and Contains("body", "专业").
        query.SetSlop(100);

        // Container that receives the search results.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);

        // collector.GetTotalHits() is the total number of hits.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Document number assigned internally by Lucene.Net; unrelated to "number".
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);

            // A field value can only be read back when it was stored with Field.Store.YES.
            SearchResult sr = new SearchResult();
            sr.Body = doc.Get("body");
            sr.Number = doc.Get("number");
            list.Add(sr);
        }
    }
    finally
    {
        if (searcher != null)
        {
            searcher.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        directory.Close();
    }

    Repeater1.DataSource = list;
    Repeater1.DataBind();
}
View Code

对应二:

Lucene.Net和盘古分词应用Lucene.Net和盘古分词应用
/// <summary>
/// Searches the <c>Contents</c> field of the index under <c>index1</c> for the
/// space-separated keywords typed into <c>TextBox3</c> (for example "计算机 专业")
/// and binds the matching documents to <c>Repeater3</c>.
/// </summary>
/// <remarks>
/// The searcher, reader and directory are closed in a <c>finally</c> block so they
/// are released even when the search throws. Repeated or leading/trailing spaces
/// in the input are ignored so they do not add empty terms to the phrase query.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button3_Click(object sender, EventArgs e)
{
    string kw = TextBox3.Text;
    List<Writings> list = new List<Writings>();

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"D:\net\net\代码\搜索及分词\index1"), new NoLockFactory());
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(directory, true);
        searcher = new IndexSearcher(reader);

        // Query condition. For now the user does the word segmentation:
        // every space-separated piece is treated as one term.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            query.Add(new Term("Contents", word));
            //query.Add(new Term("Title", word));
        }

        // Roughly: where Contains("Contents", "计算机") and Contains("Contents", "专业").
        query.SetSlop(100);

        // Container that receives the search results.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);

        // collector.GetTotalHits() is the total number of hits.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Document number assigned internally by Lucene.Net; unrelated to the "ID" field.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);

            // A field value can only be read back when it was stored with Field.Store.YES.
            Writings sr = new Writings();
            sr.ID = int.Parse(doc.Get("ID"));
            sr.Title = doc.Get("Title");
            sr.Contents = doc.Get("Contents");
            list.Add(sr);
        }
    }
    finally
    {
        if (searcher != null)
        {
            searcher.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        directory.Close();
    }

    Repeater3.DataSource = list;
    Repeater3.DataBind();
}
View Code