word detection

时间:2016-04-23 10:10:26
【文件属性】:

文件名称:word detection

文件大小:39KB

文件格式:RAR

更新时间:2016-04-23 10:10:26

分词 文本 语义

基于词在大数据文本的出现频率来进行分词
using System; using System.Text.RegularExpressions; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO;
namespace WordDetection {
    /// <summary>
    /// Command-line entry point: runs a <see cref="WordDetector"/> over an input
    /// text file and writes every detected word with its frequency to an output file.
    /// </summary>
    class Program {
        // Detector for the current run; created in Main from the first argument.
        static WordDetector wordDetector = null;
        // Destination for the results; PrintResults is a no-op while this is null.
        static StreamWriter sw = null;

        /// <summary>
        /// Writes one "word&lt;TAB&gt;frequency" line per entry of
        /// <c>wordDetector.FinalWords</c> to the shared writer.
        /// Does nothing when no writer has been opened.
        /// </summary>
        private static void PrintResults() {
            if (sw == null) return;
            foreach (string word in wordDetector.FinalWords) {
                sw.WriteLine("{0}\t{1}", word, wordDetector.Freq[word]);
            }
        }

        /// <summary>
        /// Expects two arguments: the input text file and the output file.
        /// Prints a usage line and returns when fewer than two are supplied.
        /// </summary>
        /// <param name="args">args[0] = input file path, args[1] = output file path.</param>
        static void Main(string[] args) {
            if (args.Length < 2) {
                Console.WriteLine("Usage: worddetector <input-file> <output-file>");
                return;
            }

            wordDetector = new WordDetector(args[0]);

            // Assign the static field rather than declaring a local: a local named
            // "sw" hides the field, leaves it null, and makes PrintResults return
            // without writing anything. The using statement flushes and closes the
            // writer even if Process() throws.
            // Results are printed explicitly after Process() returns, so the
            // ProcessOver callback is intentionally not hooked up here (hooking it
            // as well would write every result twice).
            using (sw = new StreamWriter(args[1])) {
                wordDetector.Process();
                PrintResults();
            }
            // The writer is disposed now; reset the field so a later PrintResults is a no-op.
            sw = null;
        }
    }
public class WordDetector { public Action ProcessOver = null; internal struct CharPos { public char ThisChar; public bool PositionOnRight; public CharPos(char value, bool positionOnRight) { this.ThisChar = value; this.PositionOnRight = positionOnRight; } } public const int MaxWordLength = 5, // 要检测的最长的词组长度 MinFreq = 10; // 词语出现的最小频数 public const double PSvPThreshold = 100, // theta_c EntropyThreshold = 1.3; // theta_f HashSet finalWords = new HashSet(); Dictionary> words = new Dictionary>(); Dictionary freq = new Dictionary(); Dictionary ps = new Dictionary(); Regex regSplit = new Regex(@"\W+|[a-zA-Z0-9]+", RegexOptions.Compiled | RegexOptions.Multiline); StreamReader sr = null; int total = 0; string _filename = ""; public HashSet FinalWords { get { return finalWords; } } public Dictionary Freq { get { return freq; } } public WordDetector (string filename) { _filename = filename; renewStreamReader(); } private void renewStreamReader () { sr = new StreamReader(_filename); } public void StartProcess () { System.Threading.Thread thr = new System.Threading.Thread(new System.Threading.ThreadStart(Process)); thr.Start(); } private void wordInfoEntropy (string word, out double leftEntropy, out double rightEntropy) { leftEntropy = 
rightEntropy = 0; double totalL = 0, totalR = 0; foreach (KeyValuePair pair in words[word]) { if (pair.Key.PositionOnRight) totalR += pair.Value; else totalL += pair.Value; } if (totalL <= 0) leftEntropy = double.MaxValue; if (totalR <= 0) rightEntropy = double.MaxValue; foreach (KeyValuePair pair in words[word]) { double p; if (pair.Key.PositionOnRight) { p = (double)pair.Value / totalR; rightEntropy -= p * Math.Log(p); } else { p = (double)pair.Value / totalL; leftEntropy -= p * Math.Log(p); } } } public void Process () { Console.WriteLine("Reading input..."); string line = ""; while ((line = sr.ReadLine()) != null) { total += addParagraph (line); } finalizeParagraph (); sr.Close (); Console.WriteLine("Building candidate word list..."); foreach (KeyValuePair pair in ps) { if (pair.Key.Length < 2 || pair.Key.Length > MaxWordLength) continue; double p = 0; for (int i=1; i= MinFreq && pair.Value / p > PSvPThreshold) words.Add (pair.Key, new Dictionary()); } renewStreamReader (); Console.WriteLine("Preparing word/adjacent character list..."); foreach(string cword in freq.Keys) { string wl = cword.Length > 1 ? cword.Substring(1) : "", wr = cword.Length > 1 ? cword.Substring(0, cword.Length - 1) : "", wc = cword.Length > 2 ? 
cword.Substring(1, cword.Length - 1) : ""; CharPos c = new CharPos('a', false); int frq = freq[cword]; if (words.ContainsKey(wl)) { c = new CharPos(cword[0], false); if (words[wl].ContainsKey(c)) words[wl][c] += frq; else words[wl].Add(c, frq); } if (words.ContainsKey(wr)) { c = new CharPos(cword[cword.Length - 1], true); if (words[wr].ContainsKey(c)) words[wr][c] += frq; else words[wr].Add(c, frq); } if (words.ContainsKey(wc)) { c = new CharPos(cword[0], false); if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq); c = new CharPos(cword[cword.Length - 1], true); if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq); } } Console.WriteLine("Calculating word information entropy..."); foreach (string word in words.Keys) { double leftEntropy = 0, rightEntropy = 0; wordInfoEntropy(word, out leftEntropy, out rightEntropy); if (leftEntropy < EntropyThreshold || rightEntropy < EntropyThreshold) continue; finalWords.Add(word); } Console.WriteLine("Done. Writing results."); if (ProcessOver != null) ProcessOver.Invoke(); } private int addParagraph (string paragraph) { int incr_total = 0; foreach (string sentence in regSplit.Split(paragraph)) { if (sentence.Length < 2) continue; for (int i = 0; i


【文件预览】:
WordDetection.sln
bin
----Debug()
--------WordDetection.exe(11KB)
--------WordDetection.pdb(22KB)
--------WordDetection.vshost.exe.manifest(490B)
--------WordDetection.vshost.exe(22KB)
obj
----Debug()
--------WordDetection.exe(11KB)
--------WordDetection.pdb(22KB)
--------WordDetection.csprojResolveAssemblyReference.cache(2KB)
--------WordDetection.csproj.FileListAbsolute.txt(453B)
--------DesignTimeResolveAssemblyReferencesInput.cache(6KB)
--------TempPE()
Properties
----AssemblyInfo.cs(1KB)
Program.cs
WordDetection.v11.suo
WordDetection.csproj

网友评论

  • 习惯性好评, 共享是个好习惯
  • 能用,只是得研究一下。
  • 下载了 可是不知道怎么用 好郁闷啊