这学期参加了服务外包大赛,具体要实现对非结构化数据的分析处理,所以在这里把这个过程一点点记录一下。
首先使用 Python 的爬虫框架,从网页上抓取了中文文本。
但是由于我不太会处理中文数据,摸索了很久,最后简单地通过 Java 的 substring 把每一行数据按冒号拆开,代码如下:
package se;
import java.io.File;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileWriter;
/**
 * Reformats a "key:value" text file into a space-padded two-column layout.
 *
 * <p>Reads {@code info.txt} from the working directory and writes
 * {@code output1.txt}. The first input line is copied verbatim as a header,
 * followed by a line containing a single space. Every later line is split at
 * its LAST ASCII colon; the key is padded with spaces up to
 * {@link #KEY_COLUMN_WIDTH} characters and the value is appended. Lines with
 * no colon, with the only colon in first position, or with the last colon as
 * the final character are skipped. Output lines end with CRLF, and each
 * formatted row is also echoed to standard output.
 */
public class sdf {

    /** Keys shorter than this many characters are right-padded with spaces to this width. */
    private static final int KEY_COLUMN_WIDTH = 20;

    /**
     * Program entry point; takes no arguments.
     *
     * <p>Any exception raised while reading or writing is printed to standard
     * error and the program returns normally.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        File input = new File("info.txt");
        // Relative path; FileWriter creates the file if it does not exist yet.
        File output = new File("output1.txt");

        // try-with-resources closes (and therefore flushes) both streams on every
        // exit path, so buffered output is never lost.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(input)));
             BufferedWriter out = new BufferedWriter(new FileWriter(output))) {

            String line = br.readLine();
            if (line == null) {
                // Empty input: nothing to copy, leave the output file empty.
                return;
            }
            out.write(line);
            out.write("\r\n");
            out.write(" \r\n");

            // Read first, then test for end of file, so a null line is never dereferenced.
            while ((line = br.readLine()) != null) {
                String row = formatRow(line);
                if (row == null) {
                    continue;
                }
                System.out.print(row);
                System.out.print("\n");
                out.write(row);
                out.write("\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the padded "key value" row for one input line.
     *
     * @param line a non-null input line without its line terminator
     * @return the key, padding spaces and value concatenated, or {@code null}
     *         when the line has no usable separator and must be skipped
     */
    private static String formatRow(String line) {
        int sep = line.lastIndexOf(':');
        // sep == -1: no colon; sep == 0: empty key; sep == length-1: empty value.
        if (sep <= 0 || sep == line.length() - 1) {
            return null;
        }
        StringBuilder row = new StringBuilder(line.length() + KEY_COLUMN_WIDTH);
        row.append(line, 0, sep);
        // Pad with (KEY_COLUMN_WIDTH - sep) spaces; longer keys get no padding.
        for (int i = sep; i < KEY_COLUMN_WIDTH; i++) {
            row.append(' ');
        }
        row.append(line, sep + 1, line.length());
        return row.toString();
    }
}
这样每一行的键和值之间就用空格隔开了,接下来再按空格把数据分列,导入到 Excel 中。