【文件属性】:
文件名称:java解析给定url
文件大小:4KB
文件格式:RAR
更新时间:2016-10-13 10:06:56
url解析 java 正则表达式
HtmlParse:抓取给定 URL 的页面内容,用正则表达式提取其中的中文字符,去重后输出到 UTF-8 文本文件中:
url:可配置多个(config.properties 中所有以 url 开头的键)
输出路径:可配置(savePath);匹配用的正则表达式也可配置(reg)
package com.lhs;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads each configured URL, extracts every substring that matches a
 * regular expression (by default, runs of characters in the range
 * U+4E00..U+9FA5) and writes the distinct matches, in first-seen order, to a
 * UTF-8 text file.
 *
 * <p>Settings are read from {@code /config.properties} on the classpath:
 * every key starting with {@code url} contributes one URL, {@code savePath}
 * overrides the output file and {@code reg} overrides the regular expression.
 *
 * @author lihsh
 * @version 1.0
 */
public class HtmlParse {
    /** Classpath location of the configuration file. */
    private static final String CONFIG_RESOURCE = "/config.properties";
    /** Charset used both for decoding pages and for writing the result file. */
    private static final String ENCODING = "UTF-8";

    /** URLs to fetch, collected from the {@code url*} configuration keys. */
    List<String> configList = new ArrayList<String>();
    /** Output file; overridden by the {@code savePath} configuration key. */
    private String savePath = "d:\\htmlParse.txt";
    /** Extraction regex; overridden by the {@code reg} configuration key. */
    private String reg = "[\u4E00-\u9FA5]+";
    /** Compiled form of {@link #reg}; built on first use, reset when {@code reg} changes. */
    private Pattern pattern;
    /** Distinct matches over all pages, kept in the order they were first seen. */
    Set<String> resultSet = new LinkedHashSet<String>();

    /**
     * Loads the configuration, fetches and parses every URL, then writes the
     * collected matches to the output file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HtmlParse hp = new HtmlParse();
        hp.getConfig();
        hp.start();
        hp.write2file();
    }

    /**
     * Reads {@code /config.properties} from the classpath and applies it.
     *
     * <p>A missing or unreadable file is reported on stdout and leaves the
     * built-in defaults (no URLs, default path and regex) in place.
     */
    private void getConfig() {
        Properties props = new Properties();
        // getResourceAsStream returns null when the resource is absent;
        // try-with-resources tolerates a null resource.
        try (InputStream in = getClass().getResourceAsStream(CONFIG_RESOURCE)) {
            if (in == null) {
                System.out.println("读取配置文件/config.properties出错");
                return;
            }
            props.load(in);
            System.out.print("读取配置文件:");
            // Properties is hash-based, so keys (and therefore URLs) are
            // visited in an unspecified order.
            for (String key : props.stringPropertyNames()) {
                String value = props.getProperty(key);
                if (key.startsWith("url")) {
                    configList.add(value);
                } else if (key.equals("savePath")) {
                    savePath = value;
                } else if (key.equals("reg")) {
                    reg = value;
                    pattern = null; // force recompilation with the new expression
                }
                System.out.print(key + ":" + value + "; ");
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("读取配置文件/config.properties出错");
        }
    }

    /**
     * Fetches and parses every configured URL in turn, logging each one once
     * it has been processed.
     */
    private void start() {
        for (String url : configList) {
            readContent(getConnection(url));
            System.out.println("读取:" + url + " 结束");
        }
    }

    /**
     * Opens a connection to the given address.
     *
     * @param _url address to connect to
     * @return the connection, or {@code null} if the address is malformed or
     *         the connection could not be opened (the error is printed)
     */
    private URLConnection getConnection(String _url) {
        try {
            return new URL(_url).openConnection();
        } catch (IOException e) {
            // MalformedURLException is an IOException and is handled here too.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Reads the response body line by line as UTF-8 and adds every regex
     * match to {@link #resultSet}.
     *
     * @param con connection to read from; {@code null} (a failed
     *            {@link #getConnection(String)}) is skipped
     */
    private void readContent(URLConnection con) {
        if (con == null) {
            return;
        }
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(con.getInputStream(), ENCODING))) {
            String line;
            while ((line = br.readLine()) != null) {
                resultSet.addAll(parse(line));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts all non-overlapping matches of the configured regex from one line.
     *
     * @param line text to scan
     * @return the distinct matches in order of first occurrence; empty if none
     */
    private Set<String> parse(String line) {
        if (pattern == null) {
            // Compile once instead of once per line.
            pattern = Pattern.compile(reg);
        }
        Set<String> matches = new LinkedHashSet<String>();
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /**
     * Writes the collected matches to {@link #savePath}, one per line, as
     * UTF-8, replacing any existing file. I/O errors are printed, not thrown.
     */
    private void write2file() {
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(savePath), ENCODING))) {
            for (String match : resultSet) {
                bw.write(match);
                bw.newLine();
            }
            // Flush before announcing success so a write failure is not reported as saved.
            bw.flush();
            System.out.println("解析结果保存至:" + savePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
【文件预览】:
HtmlParse
----run.bat(30B)
----HtmlParse.jar(4KB)