Accessing a Site with Java and Implementing a Simple Web Crawler

Posted: 2021-12-29 16:56:25

First, a minimal example that opens an HTTP connection to a page, prints the response headers, and dumps the response body:

```java
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;

public class HttpConnTest {

    public static void main(String[] args) throws Exception {
        // URL url = new URL("http://javaeye.com");
        URL url = new URL("http://blog.sina.com.cn/buptaa");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.connect();

        // Print the response headers
        Map<String, List<String>> header = conn.getHeaderFields();
        for (String key : header.keySet()) {
            System.out.println(key + ":" + header.get(key));
        }

        // Print the response body
        BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String str = null;
        while ((str = br.readLine()) != null) {
            System.out.println(str);
        }
        conn.disconnect();
    }
}
```

Building on the code above, here is a simple crawler based on breadth-first traversal. Each fetched page is scanned for URLs with a regular expression; newly found URLs are appended to the tail of `urlList`, and the next page to fetch is taken from its head, which gives first-in-first-out (breadth-first) order:

```java
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.*;

public class HttpConnTest {

    private List<URL> urlList = new ArrayList<URL>();
    private int count = 0;

    private void doHttpConn() throws Exception {
        count++;
        // Start from the seed page; afterwards take the next URL from the head of the queue
        URL url = new URL("http://blog.sina.com.cn/buptaa");
        if (!urlList.isEmpty()) {
            url = urlList.remove(0);
        }

        // A (rather unwieldy) pattern for absolute URLs
        String urlRegx = "(http|www|ftp)(://)?(\\w+(-\\w+)*)"
                + "(\\.(\\w+(-\\w+)*))*((:\\d+)?)(/(\\w+(-\\w+)*))"
                + "*(\\.?(\\w)*)(\\?)?(((\\w*%)*(\\w*\\?)*(\\w*:)"
                + "*(\\w*\\+)*(\\w*\\.)*(\\w*&)*(\\w*-)*(\\w*=)*"
                + "(\\w*%)*(\\w*\\?)*(\\w*:)*(\\w*\\+)*(\\w*\\.)*"
                + "(\\w*&)*(\\w*-)*(\\w*=)*)*(\\w*)*)";
        Pattern p = Pattern.compile(urlRegx, Pattern.CASE_INSENSITIVE);

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.connect();

        // Print the response headers
        Map<String, List<String>> header = conn.getHeaderFields();
        for (String key : header.keySet()) {
            System.out.println(key + ":" + header.get(key));
        }

        // Print the response body and collect every URL found in it
        BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String str = null;
        while ((str = br.readLine()) != null) {
            System.out.println(str);
            Matcher m = p.matcher(str);
            while (m.find()) {
                try {
                    urlList.add(new URL(m.group(0)));
                } catch (MalformedURLException e) {
                    // skip matches without a scheme, e.g. bare "www..." hosts
                }
            }
        }
        conn.disconnect();

        System.out.println("-----------------------");
        System.out.println(urlList.size());
        for (URL aurl : urlList) {
            System.out.println(aurl.toString());
        }
    }

    public static void main(String[] args) throws Exception {
        HttpConnTest hct = new HttpConnTest();
        while (hct.count <= 3) { // fetch at most four pages
            hct.doHttpConn();
        }
        System.out.println("---DONE---" + hct.count);
    }
}
```
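A note on the link-extraction step: the long hand-written URL regex above is brittle and, with its nested `(\w*)*`-style groups, prone to catastrophic backtracking. When the input is known to be HTML, a pattern anchored on `href` attributes is a common, simpler alternative. A minimal sketch; the pattern and class name below are illustrative choices of mine, not from the original post:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HrefExtractor {

    // Matches the quoted value of an href attribute, e.g. href="http://example.com/a"
    private static final Pattern HREF =
            Pattern.compile("href\\s*=\\s*\"(http[^\"]+)\"", Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) {
        String html = "<a href=\"http://blog.sina.com.cn/buptaa\">blog</a>";
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            System.out.println(m.group(1)); // the URL inside the quotes
        }
    }
}
```

Group 1 already excludes the surrounding quotes, so the match can be passed straight to `new URL(...)` inside the same try/catch as in the crawler above.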
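The crawler dequeues from the head of `urlList`, which gives first-in-first-out order, but it never records which pages it has already fetched, so duplicate links get re-crawled. The textbook breadth-first shape pairs a FIFO queue with a visited set. Below is a minimal self-contained sketch of that structure; the class name, the four-page cap, and the reuse of the href pattern from the previous sketch are my own choices, not the original author's:

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BfsCrawler {

    private static final Pattern HREF =
            Pattern.compile("href\\s*=\\s*\"(http[^\"]+)\"", Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) throws Exception {
        Queue<URL> frontier = new ArrayDeque<>(); // FIFO queue drives breadth-first order
        Set<String> visited = new HashSet<>();    // remembers fetched URLs, prevents cycles
        frontier.add(new URL("http://blog.sina.com.cn/buptaa"));

        int fetched = 0;
        while (!frontier.isEmpty() && fetched < 4) { // same four-page cap as the original loop
            URL url = frontier.poll();
            if (!visited.add(url.toString())) {
                continue; // already crawled, skip it
            }
            fetched++;
            System.out.println("Fetching " + url);

            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"));
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                page.append(line).append('\n');
            }
            conn.disconnect();

            // Enqueue every link found on the page; BFS then visits them level by level
            for (URL next : extractUrls(page.toString())) {
                if (!visited.contains(next.toString())) {
                    frontier.add(next);
                }
            }
        }
        System.out.println("---DONE--- fetched " + fetched + " pages");
    }

    private static List<URL> extractUrls(String html) {
        List<URL> out = new ArrayList<>();
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            try {
                out.add(new URL(m.group(1)));
            } catch (MalformedURLException e) {
                // skip links that are not valid absolute URLs
            }
        }
        return out;
    }
}
```

Polling from the queue and checking the visited set at the same point guarantees each page is downloaded at most once, even when several pages link to it.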
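One more detail that applies to both listings: they hard-code UTF-8, but a page's actual encoding is announced in the `Content-Type` response header, and Chinese sites of that era frequently served GBK instead. A hedged sketch of honoring that header before decoding; the UTF-8 fallback and class name are my assumptions:

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class CharsetAwareFetch {

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://blog.sina.com.cn/buptaa");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();

        // e.g. "text/html; charset=GBK" -> "GBK"; fall back to UTF-8 when absent
        String contentType = conn.getContentType();
        String charset = "UTF-8";
        if (contentType != null) {
            int i = contentType.toLowerCase().indexOf("charset=");
            if (i >= 0) {
                charset = contentType.substring(i + "charset=".length()).trim();
            }
        }

        BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset));
        String line;
        while ((line = br.readLine()) != null) {
            System.out.println(line);
        }
        conn.disconnect();
    }
}
```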