朴素贝叶斯属于监督学习的分类算法。
package algorithm.machine; /** * 问题:求先验概率 词汇表不存在的单词概率为0,怎么处理 */ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 3、Bayes:机器学习算法 * * @author baolibin 朴素贝叶斯是分类算法、属于监督学习: * 路径下有25封正常邮件以及25封垃圾邮件,随机选取20封正常邮件以及20封垃圾邮件作为训练数据,剩下作为测试数据; * E:\machinedata\bayes\email */ public class _03_Bayes { List<String> wordli = new ArrayList<String>(); // 存词汇表 int[] wordVector1; //正常邮件的词向量 int[] wordVector2; //垃圾邮件的词向量 public static void main(String[] args) throws IOException { _03_Bayes _03_Bayes = new _03_Bayes(); String pathHam = "E:\\machinedata\\bayes\\email\\ham"; // 正常邮件根路径 String pathSpam = "E:\\machinedata\\bayes\\email\\spam"; // 垃圾邮件根路径 algorithm.machine._03_Bayes _03_Bayes2 = _03_Bayes; _03_Bayes2.wordList(pathHam, pathSpam); // 生成词汇表 _03_Bayes2.wordVector(pathHam, pathSpam); //构建词向量 /** * 贝叶斯模型已生成,进行测试 * 测试数据:随机选取的测试数据 */ DecimalFormat df = new DecimalFormat("##0.00"); //double保留小数点后6位 String tmpPath1=null; int accuracy=0; for (int i = 1; i <=25; i++) { tmpPath1="E:\\machinedata\\bayes\\email\\ham\\"+i+".txt"; String result=_03_Bayes2.classify(tmpPath1); if ("正常邮件".equals(result)) { accuracy++; } } System.out.println("\n\n正常邮件判断正确率为:"+(Double.parseDouble(df.format((double)accuracy/(double)25)))); accuracy=0; for (int i = 1; i <=25; i++) { tmpPath1="E:\\machinedata\\bayes\\email\\spam\\"+i+".txt"; String result=_03_Bayes2.classify(tmpPath1); if ("垃圾邮件".equals(result)) { accuracy++; } } System.out.println("垃圾邮件判断正确率为:"+(Double.parseDouble(df.format((double)accuracy/(double)25)))); } /** * 判断一个字符串里是否包含字母 * * @param cardNum * @return */ public boolean judgeContainsStr(String cardNum) { String regex = ".*[a-zA-Z]+.*"; Matcher m = Pattern.compile(regex).matcher(cardNum); return m.matches(); } /** * 1、 将所有训练数据转换为词汇表 * * @param pathHam * 正常邮件根路径 * @param pathSpam * 垃圾邮件根路径 * @throws IOException */ public void wordList(String pathHam, String pathSpam) throws IOException { File file1 = new File(pathHam); File file2 = new File(pathSpam); String[] fileName1 = file1.list(); // 获取正常邮件目录下的所有文件 String[] fileName2 = file2.list(); String[] split = null; BufferedReader reader1 = null; BufferedReader reader2 = null; String tmpStrLine = null; // 存临时读取每一封邮件的每一行 String tmpStr = null; // 存切分出来的单词字符串 /** * 读取训练数据邮件信息 */ for (int i = 0; i < 20; i++) { // 读取20个正常邮件和垃圾邮件 /** * 读取正常邮件 */ File tmpfile = new File((pathHam + "\\" + fileName1[i])); // 对每一封邮件内容进行切分 reader1 = new BufferedReader(new FileReader(tmpfile)); while ((tmpStrLine = reader1.readLine()) != null) { // 读取一封邮件的每一行 split = tmpStrLine.split("\\s+"); // 进行切分 if (split.length > 0) { for (String sp : split) { // 把切分的每一个单词去重加入词汇表 tmpStr = sp.trim().toUpperCase(); // 全部转换为大写 if (judgeContainsStr(tmpStr)) { // 如果这个单词里包含字母 tmpStr=tmpStr.replaceAll("[^a-z^A-Z^0-9]", ""); if (!wordli.contains(tmpStr)) { // 添加进词汇表里 wordli.add(tmpStr); // 把字符串里非字母和数字都去掉 } } } } } /** * 读取垃圾邮件 */ File tmpfile2 = new File((pathSpam + "\\" + fileName2[i])); // 对每一封邮件内容进行切分 reader2 = new BufferedReader(new FileReader(tmpfile2)); while ((tmpStrLine = reader2.readLine()) != null) { // 读取一封邮件的每一行 split = tmpStrLine.split("\\s+"); // 进行切分 if (split.length > 0) { for (String sp : split) { // 把切分的每一个单词去重加入词汇表 tmpStr = sp.trim().toUpperCase(); // 全部转换为大写 if (judgeContainsStr(tmpStr)) { // 如果这个单词里包含字母 tmpStr=tmpStr.replaceAll("[^a-z^A-Z^0-9]", ""); if (!wordli.contains(tmpStr)) { // 添加进词汇表里 wordli.add(tmpStr); // 把字符串里非字母和数字都去掉 } } } } } } reader1.close(); reader2.close(); /** * 测试输出词汇表内容 */ System.out.println("词汇表长度为:" + wordli.size()); System.out.println("词汇表内容为:"); int i=1; for (String spl : wordli) { if (i==10) { System.out.println(); i=1; } System.out.print(spl + " "); i++; } } /** * 2、构建词向量 * 词集模型:每个词的出现与否作为一个特征 * 词袋模型:每个词在文档中出现不止一次,要计算次数,不能计算是否出现 * @param pathHam * @param pathSpam * @throws IOException */ public void wordVector(String pathHam, String pathSpam) throws IOException{ wordVector1=new int[wordli.size()]; //正常邮件的词向量 wordVector2=new int[wordli.size()]; //垃圾邮件的词向量 File file1 = new File(pathHam); File file2 = new File(pathSpam); String[] fileName1 = file1.list(); // 获取正常邮件目录下的所有文件 String[] fileName2 = file2.list(); String[] split = null; BufferedReader reader1 = null; BufferedReader reader2 = null; String tmpStrLine = null; // 存临时读取每一封邮件的每一行 String tmpStr = null; // 存切分出来的单词字符串 /** * 构建词向量 * 采用词袋模型 */ for (int i = 0; i < 20; i++) { // 读取20个正常邮件和垃圾邮件 /** * 构建正常邮件的词向量 */ File tmpfile = new File((pathHam + "\\" + fileName1[i])); // 对每一封邮件内容进行切分 reader1 = new BufferedReader(new FileReader(tmpfile)); while ((tmpStrLine = reader1.readLine()) != null) { // 读取一封邮件的每一行 split = tmpStrLine.split("\\s+"); // 进行切分 if (split.length > 0) { for (String sp : split) { // 切分的每一个单词 tmpStr = sp.trim().toUpperCase(); // 全部转换为大写 if (judgeContainsStr(tmpStr)) { // 如果这个单词里包含字母 tmpStr=tmpStr.replaceAll("[^a-z^A-Z^0-9]", ""); int tmpindex=wordli.indexOf(tmpStr); wordVector1[tmpindex]++; //对应的出现该元素次数加加 } } } } /** * 构建垃圾邮件的词向量 */ File tmpfile2 = new File((pathSpam + "\\" + fileName2[i])); // 对每一封邮件内容进行切分 reader2 = new BufferedReader(new FileReader(tmpfile2)); while ((tmpStrLine = reader2.readLine()) != null) { // 读取一封邮件的每一行 split = tmpStrLine.split("\\s+"); // 进行切分 if (split.length > 0) { for (String sp : split) { // 把切分的每一个单词去重加入词汇表 tmpStr = sp.trim().toUpperCase(); // 全部转换为大写 if (judgeContainsStr(tmpStr)) { // 如果这个单词里包含字母 tmpStr=tmpStr.replaceAll("[^a-z^A-Z^0-9]", ""); int tmpindex=wordli.indexOf(tmpStr); wordVector2[tmpindex]++; //对应的出现该元素次数加加 } } } } } reader1.close(); reader2.close(); /** * 输出词向量 */ System.out.println("\n正常邮件词向量为:"); for (Integer in : wordVector1) { System.out.print(in+" "); } System.out.println("\n垃圾邮件词向量为:"); for (Integer in : wordVector2) { System.out.print(in+" "); } } /** * 对指定邮件进行分类:正常邮件还是垃圾邮件 * p(A|B)=P(B|A)*P(A)/P(B) * * 本例子公式: * P1:P(正常邮件|待求邮件)=P(待求邮件|正常邮件)*P(正常邮件)/P(待求邮件) * P2:P(垃圾邮件|待求邮件)=P(待求邮件|垃圾邮件)*P(垃圾邮件)/P(待求邮件) * 若P1>P2,则该邮件为正常邮件,否则为垃圾邮件 * * 分母一样,一次只需要比较分子大小 * P(待求邮件|正常邮件)*P(正常邮件) 与 P(待求邮件|垃圾邮件)*P(垃圾邮件) * 训练数据一共40封邮件,20封正常邮件,20封垃圾邮件,因此 P(正常邮件)=P(垃圾邮件)=0.5 * * 最后只需要求 P(待求邮件|正常邮件) 与 P(待求邮件|垃圾邮件) * @throws IOException */ public String classify(String filePath) throws IOException{ /** * 依次为:后验概率、先验概率、类条件概率、总体概率密度 * p(c|w)=p(w|c)p(c)/p(w) * * 假设所有词都相互独立 * p(w|c)=p(w0,w1,w2...wn|c)=p(w0|c)p(w1|c)p(w2|c)...p(wn|c) */ File file=new File(filePath); BufferedReader reader=null; String tmpStrLine=null; String tmpStr=null; String[] spl=null; reader=new BufferedReader(new FileReader(file)); double p1=1; //正常邮件先验概率 double p2=1; //垃圾邮件先验概率 int word1=0; //训练数据正常邮件单词个数 int word2=0; //训练数据垃圾邮件单词个数 for (Integer in1 : wordVector1) { //计算训练数据正常邮件单词个数 word1+=in1; } for (Integer in2 : wordVector2) { //计算训练数据垃圾邮件单词个数 word2+=in2; } /** * 计算先验概率 */ // System.out.println("\n训练数据正常邮件单词个数为:"+word1); // System.out.println("训练数据垃圾邮件单词个数为:"+word2); // System.out.println("\n"); DecimalFormat df = new DecimalFormat("##0.00"); //double保留小数点后6位 while ((tmpStrLine=reader.readLine())!=null) { spl=tmpStrLine.split("\\s+"); for (String str : spl) { tmpStr = str.trim().toUpperCase(); if (judgeContainsStr(tmpStr)) { // 如果这个单词里包含字母 tmpStr=tmpStr.replaceAll("[^a-z^A-Z^0-9]", ""); /** * 分子是0情况 * 无限相乘趋近于0 */ if (wordli.contains(tmpStr)) { //词汇表中存在该单词 // System.out.println("单词存在"); /** * 正常邮件先验概率 */ int index=wordVector1[wordli.indexOf(tmpStr)]; //该单词在训练数据正常邮件出现的个数 if(index>0){ p1=Double.parseDouble(df.format((p1*Math.log((double)index/(double)word1)))); // System.out.println("p1="+p1); }else { p1=Double.parseDouble(df.format(p1*(1.0/(double)(word1+2)))); } /** * 垃圾邮件先验概率 */ int index2=wordVector2[wordli.indexOf(tmpStr)]; //该单词在训练数据正常邮件出现的个数 if(index2>0){ p2=Double.parseDouble(df.format((p2*Math.log((double)index2/(double)word2)))); // System.out.println("p2="+p2); }else { p2=Double.parseDouble(df.format(p2*(1.0/(double)(word2+2)))); } }else { // System.out.println("单词不存在!"); } } } } /** * 打印两个所求的先验概率 * 类条件概率、总体概率密度 两者相等,所以只需比较先验概率就可以了。 */ // System.out.println("正常邮件的先验概率为:"+p1); // System.out.println("垃圾邮件的先验概率为:"+p2); reader.close(); // System.out.print("\n该邮件为:"); // System.out.println((p1>p2)?"正常邮件":"垃圾邮件"); if (p1>p2) { return "正常邮件"; }else{ return "垃圾邮件"; } } }
输入的正常邮件:
输入的垃圾邮件:
输出结果:
词汇表长度为:556 词汇表内容为: HI PETER WITH JOSE OUT OF TOWN DO YOU WANT TO MEET ONCE IN A WHILE KEEP THINGS GOING AND SOME INTERESTING STUFF LET ME KNOW EUGENE CODEINE 15MG FOR VISA ONLY METHYLMORPHINE IS NARCOTIC OPIOID PAIN RELIEVER WE HAVE 30MG PILLS 3015MG 6015MG 9015MG RYAN WHYBREW COMMENTED ON YOUR STATUS WROTE TURD FERGUSON OR BUTT HORN ORDERCIALIZVIAGRA ONLINE SAVE 0NLINE PHARMACY NOPRESCRIPTION REQUIRED BUY CANADIAN DRUGS AT WHOLESALE PRICES FDAAPPROVED SUPERB QUALITY ACCEPT ALL MAJOR CREDIT CARDS ARVIND THIRUMALAI REPLY THIS EMAIL COMMENT EVERYTHING GAIN INCREDIB1E GAINS LENGTH INCHES YOURPENIS PERMANANTLY AMAZING INCREASE THICKNESS UP BETTEREJACU1ATION CONTROL EXPERIENCE ROCKHARDERECETIONS EXPLOSIVE INTENSEORGASNS VOLUME OFEJACU1ATE DOCTOR DESIGNED ENDORSED HERBAL NATURAL SAFE THE PROVEN NATURALPENISENHANCEMENT THAT WORKS MONEYBACK GUARANTEEED THANKS ILL DEFINITELY CHECK HOW BOOK I HEARD CHAPTER CAME IT WAS GOOD SHAPE HOPE ARE DOING WELL CHEERS TROY AMBIEM ZOLPIDEM 5MG10MG PILL X MG JAY STEPP SEE THREAD FOLLOW LINK BELOW ORDER TODAY FROM LINKEDIN KERRY HALONEY REQUESTED ADD AS CONNECTION ID LIKE MY PROFESSIONAL NETWORK BUYVIAGRA 25MG 50MG 100MG BRANDVIAGRA FEMALEVIAGRA PER VIAGRANOPRESCRIPTION NEEDED CERTIFIED HERE AMEX ECHECK WORLDWIDE DELIVERY HOTELS ONES RENT TENT THEY LINED HOTEL GROUNDS SO MUCH BEING ONE NATURE MORE COUPLE DOZEN TOUR GROUPS ABOUT 100M PICTURES TRIP CAN GO THROUGH THEM GET JPGS FAVORITE SCENIC WHERE JOCELYN NOW NEW YORK WILL COME TOKYO CHINESE YEAR PERHAPS TWO THEN THAILAND WINTER HOLIDAY MOM TAKE CARE D YEAH AM READY MAY NOT BE BECAUSE JAR HAS PLANE TICKETS GERMANY BENOIT MANDELBROT WILMOTT TEAM MATHEMATICIAN FATHER FRACTAL MATHEMATICS ADVOCATE SOPHISTICATED MODELLING QUANTITATIVE FINANCE DIED 14TH OCTOBER AGED MAGAZINE OFTEN FEATURED HIS IDEAS WORK OTHERS INSPIRED BY FUNDAMENTAL INSIGHTS MUST LOGGED VIEW THESE ARTICLES PAST ISSUES HOME BASED BUSINESS OPPORTUNITY KNOCKING DOOR DONT RUDE CHANCE EARN GREAT INCOME FIND FINANCIAL LIFE TRANSFORMED LEARN SUCCESS FINDER EXPERTS SURE THING SOUNDS WHAT TIME WOULD PREPARED THERE REGARDS VIVEK MOST COMPETITIVE PRICE NET WILSON FREEVIAGRA JULIUS O LOOKING FORWARD INVITATION OFF WATCHESSTORE DISCOUNT WATCHES FAMOUS BRANDS AROLEXBVLGARI DIOR HERMES ORIS CARTIER AP LOUIS VUITTON BAGS WALLETS GUCCI TIFFANY CO JEWERLY ENJOY FULL WARRANTY SHIPMENT VIA REPUTABLE COURIER FEDEX UPS DHL EMS SPEEDPOST RECIEVE YAY BOTH FINE IM WORKING AN MBA DESIGN STRATEGY CCA TOP ART SCHOOL ITS PROGRAM FOCUSING RIGHTBRAINED CREATIVE STRATEGIC APPROACH MANAGEMENT WAY DONE HYDROCODONEVICODIN ESBRAND WATSON VICODIN ES BRAND FREE EXPRESS DAYS OVER IVE THOUGHT THINK POSSIBLE SHOULD ANOTHER LUNCH CAR COULD PICK DOES WEDNESDAY SIGNED COPY SAW COASTTHOUGHT U MIGHT HANGZHOU HUGE DAY WASNT ENOUGH BUT GOT GLIMPSE WENT INSIDE CHINA PAVILION EXPO PRETTY EACH PROVINCE EXHIBIT PERCOCET WITHOUTPRESCRIPTION TABS ANALGESIC USED TREAT MODERATE MODERATELY SEVEREPAIN SHIPPING DISCREET PRIVATE CHEAP HOMMIES JUST PHONE CALL ROOFER SPAYING FOAMING DUSTY PLS CLOSE DOORS WINDOWS HELP BATHROOM WINDOW CAT SLIDING BEHIND TV THOSE CATS SURVIVE SORRY ANY INCONVENIENCE SCIFINANCE AUTOMATICALLY GENERATES GPUENABLED PRICING RISK MODEL SOURCE CODE RUNS 50300X FASTER THAN SERIAL USING NVIDIA FERMICLASS TESLA 20SERIES GPU DERIVATIVES DEVELOPMENT TOOL CC CONCISE HIGHLEVEL SPECIFICATIONS NO PARALLEL COMPUTING CUDA PROGRAMMING EXPERTISE SCIFINANCES AUTOMATIC MONTE CARLO GENERATION CAPABILITIES BEEN SIGNIFICANTLY EXTENDED LATEST RELEASE INCLUDES OK COLD RETIREMENT PARTY LEAVES CHANGING COLOR BIGGERPENIS GROW 3INCHES SAFEST EFFECTIVE METHODS OFPENISEN1ARGEMENT MONEY BETTERERECTIONS MA1EENHANCEMENT PRODUCTS SUPPLEMENT TRUSTED MILLIONS TALKED JOHN COMPUTER THATS BIKE RIDING RAIN MUSEUM SF YESTERDAY HAD FOOD SAME GIANTS GAME WHEN TRAIN FANS DRUNK YO RUNNING WEBSITE JQUERY JQPLOT PLUGIN TOO FAR AWAY HAVING PROTOTYPE LAUNCH RIGHT IF 正常邮件词向量为: 5 6 5 1 2 11 1 2 18 1 30 1 1 10 15 1 1 1 4 18 3 2 2 2 4 4 1 0 0 6 0 0 0 8 0 0 0 0 9 4 0 0 0 0 0 2 1 3 18 4 5 3 1 1 2 1 1 0 0 0 0 0 0 1 0 0 0 4 0 0 0 0 0 1 3 0 0 0 2 1 2 10 3 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 0 0 6 0 0 0 1 1 1 1 2 3 15 1 1 1 8 4 3 1 1 6 2 1 1 1 0 0 0 0 0 0 2 1 3 1 1 1 1 0 2 7 5 2 2 2 3 2 2 1 4 5 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 4 1 1 1 1 1 2 3 2 3 1 1 1 1 3 1 2 1 4 3 1 1 3 1 1 1 1 1 3 4 1 6 4 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 4 6 1 2 3 1 1 1 3 4 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 2 1 1 1 1 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 3 2 1 4 1 1 0 0 0 0 0 0 2 2 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 4 2 3 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 0 0 0 0 0 0 1 0 0 0 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 2 4 3 4 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 垃圾邮件词向量为: 0 0 2 0 0 12 0 0 10 0 27 0 0 12 8 0 0 0 0 17 0 0 0 1 0 0 0 7 2 8 3 4 1 4 3 1 1 1 2 7 5 9 1 1 1 0 0 0 1 7 0 0 0 0 0 0 0 2 7 6 2 3 3 3 7 3 6 6 2 2 2 2 5 3 5 3 3 3 0 0 0 1 0 0 6 6 6 6 6 6 12 6 6 12 6 10 6 6 7 6 6 6 6 6 6 6 6 6 6 8 4 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 9 10 0 0 0 0 0 0 0 5 3 4 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 1 1 0 0 3 0 0 0 0 0 0 0 0 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 2 1 1 1 4 3 0 0 0 0 0 4 3 3 7 3 6 3 3 3 3 3 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 1 1 2 1 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 1 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 正常邮件判断正确率为:0.56 垃圾邮件判断正确率为:0.76