【文件属性】:
文件名称:word detection
文件大小:39KB
文件格式:RAR
更新时间:2016-04-23 10:10:26
分词 文本 语义
基于词在大数据文本的出现频率来进行分词
using System;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace WordDetection
{
/// <summary>
/// Command-line entry point: runs a <see cref="WordDetector"/> over the input
/// file named by the first argument and writes each detected word with its
/// frequency to the output file named by the second argument.
/// </summary>
class Program
{
    static WordDetector wordDetector = null;
    static StreamWriter sw = null;

    /// <summary>
    /// Writes one tab-separated "word, frequency" line to the output writer for
    /// every entry of <c>wordDetector.FinalWords</c>. Does nothing when no
    /// output writer is open.
    /// </summary>
    private static void PrintResults()
    {
        if (sw == null) return;
        foreach (string word in wordDetector.FinalWords)
        {
            sw.WriteLine("{0}\t{1}", word, wordDetector.Freq[word]);
        }
    }

    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">
    /// args[0] is the path of the text file to analyse; args[1] is the path of
    /// the file the results are written to.
    /// </param>
    static void Main(string[] args)
    {
        if (args.Length < 2)
        {
            Console.WriteLine("Usage: worddetector <input file> <output file>");
            return;
        }

        wordDetector = new WordDetector(args[0]);

        // Assign the static field rather than declaring a local: PrintResults
        // reads the field, and a shadowing local left it null so nothing was
        // ever written to the output file.
        sw = new StreamWriter(args[1]);
        try
        {
            // Results are printed synchronously here instead of through the
            // WordDetector.ProcessOver callback.
            wordDetector.Process();
            PrintResults();
            sw.Flush();
        }
        finally
        {
            // Close the file even when processing throws.
            sw.Close();
            sw = null;
        }
    }
}
public class WordDetector {
public Action ProcessOver = null;
/// <summary>
/// A character paired with a flag telling on which side of a word it was seen
/// (<c>PositionOnRight</c> is true for the right side, false for the left).
/// Implements value equality explicitly so it can be used as a hash key
/// without falling back on the reflection-based <see cref="ValueType"/> defaults.
/// </summary>
internal struct CharPos : IEquatable<CharPos> {
    public char ThisChar;
    public bool PositionOnRight;

    /// <summary>Creates a character/side pair.</summary>
    /// <param name="value">The character.</param>
    /// <param name="positionOnRight">True when the character is on the right side.</param>
    public CharPos(char value, bool positionOnRight) {
        this.ThisChar = value;
        this.PositionOnRight = positionOnRight;
    }

    /// <summary>Two values are equal when both the character and the side match.</summary>
    public bool Equals(CharPos other) {
        return ThisChar == other.ThisChar && PositionOnRight == other.PositionOnRight;
    }

    /// <summary>Boxed overload; equal only to another <see cref="CharPos"/> with the same fields.</summary>
    public override bool Equals(object obj) {
        return obj is CharPos && Equals((CharPos)obj);
    }

    /// <summary>Hash built from both fields: the character code shifted left, with the side in the low bit.</summary>
    public override int GetHashCode() {
        return (ThisChar << 1) | (PositionOnRight ? 1 : 0);
    }
}
public const int MaxWordLength = 5, // maximum length (in characters) of a word/phrase to detect
    MinFreq = 10; // minimum number of occurrences required for a word
public const double PSvPThreshold = 100, // theta_c
    EntropyThreshold = 1.3; // theta_f

// The generic type arguments below were lost from the published listing and
// are restored from how each field is used in this class.

// Words accepted by Process.
HashSet<string> finalWords = new HashSet<string>();
// Candidate word -> occurrence count of each adjacent character, keyed by character and side.
Dictionary<string, Dictionary<CharPos, int>> words = new Dictionary<string, Dictionary<CharPos, int>>();
// String -> occurrence count.
Dictionary<string, int> freq = new Dictionary<string, int>();
// String -> value compared against PSvPThreshold in Process.
// NOTE(review): value type reconstructed as double (presumably a probability) — confirm where ps is filled.
Dictionary<string, double> ps = new Dictionary<string, double>();
// Splits text on runs of non-word characters and on runs of ASCII letters/digits.
Regex regSplit = new Regex(@"\W+|[a-zA-Z0-9]+", RegexOptions.Compiled | RegexOptions.Multiline);
StreamReader sr = null;
// Running sum of the values returned by addParagraph.
int total = 0;
string _filename = "";
/// <summary>
/// The set of words accepted by <c>Process</c>. Returns the live internal set,
/// not a copy. (The <c>string</c> type argument was missing from the
/// published listing and is restored here.)
/// </summary>
public HashSet<string> FinalWords {
    get {
        return finalWords;
    }
}
/// <summary>
/// Occurrence counts keyed by string. Returns the live internal dictionary,
/// not a copy. (The type arguments were missing from the published listing
/// and are restored here.)
/// </summary>
public Dictionary<string, int> Freq {
    get {
        return freq;
    }
}
/// <summary>
/// Creates a detector bound to <paramref name="filename"/> and opens a reader
/// on that file straight away.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
public WordDetector (string filename)
{
    // Remember the path first: renewStreamReader opens whatever _filename holds.
    this._filename = filename;
    this.renewStreamReader();
}
/// <summary>
/// (Re)opens the input file, replacing the reader held in <c>sr</c>.
/// </summary>
private void renewStreamReader () {
    // Release any reader still held so replacing it cannot leak a file handle;
    // disposing a reader that was already closed is harmless.
    if (sr != null) {
        sr.Dispose();
    }
    sr = new StreamReader(_filename);
}
/// <summary>
/// Starts <c>Process</c> on a newly created thread and returns without
/// waiting for it to finish.
/// </summary>
public void StartProcess ()
{
    var worker = new System.Threading.Thread(this.Process);
    worker.Start();
}
/// <summary>
/// Computes the entropy (-sum of p * ln p) of the characters recorded on the
/// left and on the right of <paramref name="word"/> in <c>words</c>.
/// </summary>
/// <param name="word">A key of <c>words</c>; a missing key makes the dictionary lookup throw.</param>
/// <param name="leftEntropy">
/// Entropy of the left-side character counts, or <c>double.MaxValue</c> when
/// no left-side occurrences were recorded.
/// </param>
/// <param name="rightEntropy">
/// Entropy of the right-side character counts, or <c>double.MaxValue</c> when
/// no right-side occurrences were recorded.
/// </param>
private void wordInfoEntropy (string word, out double leftEntropy, out double rightEntropy)
{
    // Look the neighbour table up once instead of once per loop.
    Dictionary<CharPos, int> neighbours = words[word];

    // First pass: total occurrences per side.
    double totalL = 0, totalR = 0;
    foreach (KeyValuePair<CharPos, int> pair in neighbours) {
        if (pair.Key.PositionOnRight) totalR += pair.Value; else totalL += pair.Value;
    }

    // A side with no recorded occurrences gets the largest possible value,
    // so it compares as "above" any threshold.
    leftEntropy = totalL <= 0 ? double.MaxValue : 0;
    rightEntropy = totalR <= 0 ? double.MaxValue : 0;

    // Second pass: accumulate -p * ln(p) per side.
    foreach (KeyValuePair<CharPos, int> pair in neighbours) {
        // A zero count contributes nothing; evaluating 0 * ln(0) would yield NaN.
        if (pair.Value <= 0) continue;

        if (pair.Key.PositionOnRight) {
            double p = (double)pair.Value / totalR;
            rightEntropy -= p * Math.Log(p);
        } else {
            double p = (double)pair.Value / totalL;
            leftEntropy -= p * Math.Log(p);
        }
    }
}
// Runs the whole detection pipeline: reads the input, builds the candidate
// word list in `words`, collects the characters adjacent to each candidate,
// filters candidates by left/right entropy into `finalWords`, then invokes
// ProcessOver if a handler is attached. Progress messages go to the console.
public void Process ()
{
Console.WriteLine("Reading input...");
string line = "";
// Feed every input line to addParagraph and accumulate its return value in total.
while ((line = sr.ReadLine()) != null) {
total += addParagraph (line);
}
// finalizeParagraph is defined outside this view.
finalizeParagraph ();
sr.Close ();
Console.WriteLine("Building candidate word list...");
// NOTE(review): the type arguments of KeyValuePair (and of Dictionary below) are
// missing from this listing; the code does not compile as shown.
foreach (KeyValuePair pair in ps) {
// Only keys of 2..MaxWordLength characters are considered as candidates.
if (pair.Key.Length < 2 || pair.Key.Length > MaxWordLength)
continue;
double p = 0;
// NOTE(review): the next line is garbled in this listing — the loop bound, the
// loop body and the start of the following `if` were lost. What remains shows
// that a candidate is added to `words` only when a comparison against MinFreq
// holds and pair.Value / p exceeds PSvPThreshold. Recover the original
// before changing this method.
for (int i=1; i= MinFreq && pair.Value / p > PSvPThreshold)
words.Add (pair.Key, new Dictionary());
}
// NOTE(review): this reopens the input file, but the new reader is neither read
// nor closed in the rest of this method — confirm whether it is needed.
renewStreamReader ();
Console.WriteLine("Preparing word/adjacent character list...");
foreach(string cword in freq.Keys) {
// wl: cword without its first character; wr: cword without its last character.
// NOTE(review): Substring(1, Length - 1) yields the same string as wl; if wc is
// meant to drop both end characters the length would be Length - 2 — confirm.
string wl = cword.Length > 1 ? cword.Substring(1) : "",
wr = cword.Length > 1 ? cword.Substring(0, cword.Length - 1) : "",
wc = cword.Length > 2 ? cword.Substring(1, cword.Length - 1) : "";
CharPos c = new CharPos('a', false); int frq = freq[cword];
// cword's first character is a left-side neighbour of wl.
if (words.ContainsKey(wl)) {
c = new CharPos(cword[0], false);
if (words[wl].ContainsKey(c)) words[wl][c] += frq; else words[wl].Add(c, frq);
}
// cword's last character is a right-side neighbour of wr.
if (words.ContainsKey(wr)) {
c = new CharPos(cword[cword.Length - 1], true);
if (words[wr].ContainsKey(c)) words[wr][c] += frq; else words[wr].Add(c, frq);
}
// For wc, both the first and the last character of cword are recorded.
if (words.ContainsKey(wc)) {
c = new CharPos(cword[0], false);
if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq);
c = new CharPos(cword[cword.Length - 1], true);
if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq);
}
}
Console.WriteLine("Calculating word information entropy...");
foreach (string word in words.Keys) {
double leftEntropy = 0, rightEntropy = 0;
wordInfoEntropy(word, out leftEntropy, out rightEntropy);
// Reject the candidate when the entropy on either side is below the threshold.
if (leftEntropy < EntropyThreshold || rightEntropy < EntropyThreshold)
continue;
finalWords.Add(word);
}
Console.WriteLine("Done. Writing results.");
// Notify the subscriber, if any, that processing has finished.
if (ProcessOver != null)
ProcessOver.Invoke();
}
private int addParagraph (string paragraph)
{
int incr_total = 0;
foreach (string sentence in regSplit.Split(paragraph)) {
if (sentence.Length < 2) continue;
for (int i = 0; i
【文件预览】:
WordDetection.sln
bin
----Debug()
--------WordDetection.exe(11KB)
--------WordDetection.pdb(22KB)
--------WordDetection.vshost.exe.manifest(490B)
--------WordDetection.vshost.exe(22KB)
obj
----Debug()
--------WordDetection.exe(11KB)
--------WordDetection.pdb(22KB)
--------WordDetection.csprojResolveAssemblyReference.cache(2KB)
--------WordDetection.csproj.FileListAbsolute.txt(453B)
--------DesignTimeResolveAssemblyReferencesInput.cache(6KB)
--------TempPE()
Properties
----AssemblyInfo.cs(1KB)
Program.cs
WordDetection.v11.suo
WordDetection.csproj