1、问题描述
现在要处理一个2G的大文件:抽取第二列含有16个特定字符串(品牌名)之一的记录,并将这些记录写到两个文件中
2、具体实现
(1)java代码
package naifen;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Scanner;
/**
 * Extracts the lines of a large tab-separated file whose second column
 * mentions one of the wanted brands and writes them to two output files.
 *
 * <p>The first {@code FIRST_FILE_LIMIT} matching lines go to the first output
 * file; every later match goes to the second one.
 */
public class ParseNaifen {
    /** Number of matching lines written to the first output file before switching to the second. */
    private static final long FIRST_FILE_LIMIT = 4036540L;

    /**
     * Runs the extraction against the hard-coded input and output paths and
     * prints the elapsed time in seconds.
     *
     * @param args unused
     * @throws FileNotFoundException kept for interface compatibility; failures
     *         are reported on standard output instead of being propagated
     */
    public static void main(String[] args) throws FileNotFoundException
    {
        String[] wantedBrand = new String[]{"爱他美","诺优能","惠氏","雅培","美素佳儿","合生元","美赞臣","贝因美","飞鹤","喜宝","雀巢","可瑞康","金领冠","圣元","伊利","多美滋"};
        File dest1 = new File("D:/jd_naifen/jd_naifen_16brand_1.txt");
        File dest2 = new File("D:/jd_naifen/jd_naifen_16brand_2.txt");
        long starttime;
        // try-with-resources closes the input and both writers even when
        // parsing fails midway, so no handle leaks and buffered output is flushed.
        // FileWriter creates the destination files itself, so no explicit
        // createNewFile() call is needed.
        try (FileInputStream input = new FileInputStream(new File("D:/jd_naifen/jd_naifen_all"));
             Scanner in = new Scanner(input, "GBK");
             FileWriter outer1 = new FileWriter(dest1);
             FileWriter outer2 = new FileWriter(dest2)) {
            long count = 0L;
            System.out.println("解析中,请等待...............");
            starttime = System.currentTimeMillis();
            while (in.hasNextLine()) {
                String str = in.nextLine();
                String[] temp = str.split("\t");
                // Lines without a second column cannot match; skipping them
                // keeps one malformed line from aborting the whole run.
                if (temp.length < 2) {
                    continue;
                }
                // Keep the line if its second column contains any wanted brand.
                if (containsAny(temp[1], wantedBrand)) {
                    count++;
                    FileWriter target = count > FIRST_FILE_LIMIT ? outer2 : outer1;
                    target.write(str + "\n");
                }
            }
        } catch (Exception e) {
            System.out.println("解析异常:" + e.getMessage());
            return;
        }
        // Measured after the resources are closed, so flush time is included.
        long endTime = System.currentTimeMillis();
        System.out.println("!!!!!!!!!!!!!提取完毕!!!!!!!!!!花费:" + (endTime - starttime) / 1000 + "[s]");
    }

    /** Returns true when {@code text} contains at least one of {@code needles}. */
    private static boolean containsAny(String text, String[] needles)
    {
        for (String needle : needles) {
            if (text.contains(needle)) {
                return true;
            }
        }
        return false;
    }
}
运行结果:
正在解析,请稍候.......
花费时间:62s
(2)python代码
#!/bin/python
#encoding=utf8
'''
Created on 2015年8月26日
@author: zhangchangchang
'''
import sys
import time
def parseNaifen():
    """Extract the rows whose second column mentions a wanted brand.

    Reads the tab-separated file ``D:/jd_naifen/jd_naifen_all`` line by line,
    re-encodes each line from the file-system encoding to UTF-8 and keeps the
    lines whose second column contains any of the brand names. The first
    4036540 matching lines are written to ``jd_naifen_info_python1``; later
    matches go to ``jd_naifen_info_python2``. A progress message and the
    elapsed time are printed.

    The function relies on Python 2 byte-string semantics (``str.decode``).
    Errors from opening the files or decoding a line propagate to the caller.
    """
    destbrand = ["爱他美", "诺优能", "惠氏", "雅培", "美素佳儿", "合生元", "美赞臣", "贝因美", "飞鹤", "喜宝", "雀巢", "可瑞康", "金领冠", "圣元", "伊利", "多美滋"]
    # Matches beyond this count are written to the second output file.
    first_file_limit = 4036540
    # Loop-invariant, so looked up once instead of once per line.
    # NOTE(review): presumably the input is GBK (the Java version reads it as
    # GBK) and this matches the file-system encoding on the author's machine;
    # confirm before running elsewhere.
    source_encoding = sys.getfilesystemencoding()
    count = 0
    # ``with`` closes every file that was opened, and does not trip over
    # unbound names when one of the ``open`` calls itself fails.
    with open("D:/jd_naifen/jd_naifen_all", 'r') as source, \
            open('D:/jd_naifen/jd_naifen_info_python1', 'w') as file_dest1, \
            open('D:/jd_naifen/jd_naifen_info_python2', 'w') as file_dest2:
        print('正在解析,请稍候.........\n')
        starttime = time.clock()
        for raw_line in source:
            # Re-encode to UTF-8 so the brand literals of this UTF-8 source
            # file can be matched byte-for-byte.
            line = raw_line.decode(source_encoding).encode('utf-8')
            # The line read from the file keeps its line terminator; strip it
            # so that writing ``line + '\n'`` does not emit a blank line after
            # every record.
            line = line.rstrip('\r\n')
            columns = line.split('\t')
            # Lines without a second column cannot match; skip them instead
            # of failing with IndexError.
            if len(columns) < 2:
                continue
            if any(brand in columns[1] for brand in destbrand):
                count += 1
                target = file_dest2 if count > first_file_limit else file_dest1
                target.write(line + '\n')
        elapsed = time.clock() - starttime
        # Build a single string: printing a parenthesised pair would show a
        # tuple repr under Python 2.
        print('花费的时间:' + str(elapsed))
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    parseNaifen()
运行结果:
正在解析,请稍候.........
花费时间:94.19s
3、总结
(1)在读取文件操作及字符串匹配上,python比java的写法更简洁;
(2)在运行效率上,特别是大量IO的情况下,java比python性能更佳。