crawler_基础之_java.net.HttpURLConnection 访问网络资源

Java 访问网络资源的方式，由底层到高层封装依次为：socket ==> java.net.HttpURLConnection ==> HttpClient
这次先阐述 java.net.HttpURLConnection 的方式，好处是不用额外导包，JDK 原生自带。
HtmlUtil 包含尝试重连（3次），编码识别，保存文件到磁盘
package com.cph.crawler.core.utils;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.UnsupportedEncodingException;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLEncoder;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

/**

 * 类说明：html有关的操作 <br>

 * --22下午08::20创建<br>

 *

 * @author cphmvp

 */

public final class HtmlUtil {

    public final static Log LOG = LogFactory.getLog(HtmlUtil.class);

    static String defaultEncoding = "utf-8";

    static HttpURLConnection httpURLConnection = null;

    static URL urlModel = null;

    // 链接超时时间

    static int connectTimeout = ;

    // 读取响应超时时间

    static int readTimeout = ;

    /**

     * 下载图片<br>

     *

     * @param url

     *            图片的下载地址<br>

     * @param savePath

     *            保存路径<br>

     * @throws IOException

     */

    @SuppressWarnings("resource")

    public static void downloadAndSavePictureToDisk(String url, String savePath)

            throws IOException {

        urlModel = new URL(url);

        httpURLConnection = (HttpURLConnection) urlModel.openConnection();

        httpURLConnection.setConnectTimeout(connectTimeout);

        httpURLConnection.setReadTimeout(readTimeout);

        httpURLConnection.setDoOutput(true);

        InputStream is = httpURLConnection.getInputStream();

        BufferedReader rd = new BufferedReader(new InputStreamReader(is));

        FileOutputStream fw = null;

        File f = new File(savePath.substring(, savePath.lastIndexOf("/")));

        if (!f.exists()) {

            f.mkdirs();

        }

        File eixtsFile = new File(savePath);

        if (eixtsFile.exists()) {

            return;

        }

        fw = new FileOutputStream(savePath, true);

        int num = -;

        while ((num = is.read()) != (-))// 是否读完所有数据

        {

            fw.write(num);// 将数据写往文件

        }

        rd.close();

        is.close();

        if (httpURLConnection != null) {

            httpURLConnection.disconnect();

        }

    }

    /**

     * 讲url后面的参数进行编码

     *

     * @param url

     * @return

     * @throws UnsupportedEncodingException

     */

    private static String encodParamters(String url)

            throws UnsupportedEncodingException {

        String returnStr = new String(url);

        String regex = "=([^&]+)";

        Pattern p = Pattern.compile(regex);

        Matcher m = p.matcher(url);

        while (m.find()) {

            String replaceStr = m.group();

            returnStr = returnStr.replaceFirst(replaceStr,

                    URLEncoder.encode(replaceStr, "utf-8"));

        }

        return returnStr;

    }

    /**

     * 获取会话的JSESSIONID

     *

     * @param url

     * @return

     */

    public static String getSession(String url) {

        String sessionId = "";

        try {

            urlModel = new URL(url);

            httpURLConnection = (HttpURLConnection) urlModel.openConnection();

            httpURLConnection.setConnectTimeout(connectTimeout);

            httpURLConnection.setReadTimeout(readTimeout);

            String cookieVal = null;

            String key = null;

            for (int i = ; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) {

                if (key.equalsIgnoreCase("set-cookie")) {

                    cookieVal = httpURLConnection.getHeaderField(i);

                    cookieVal = cookieVal.substring(, cookieVal.indexOf(";"));

                    sessionId = sessionId + cookieVal + ";";

                }

            }

        } catch (MalformedURLException e) {

            LOG.error(e);

        } catch (IOException e) {

            LOG.error(e);

        }

        return sessionId;

    }

    /**

     * 下载页面</br>

     *

     * @param page

     *            </br>

     * @return 页面源码

     * @throws IOException

     * @throws UnsupportedEncodingException

     */

    public static StringBuffer downloadHtml(String url,String encoding) {

        StringBuffer sb = new StringBuffer();

        BufferedReader in = null;

        int tryNum = ;

        while (true) {

            try {

                if (tryNum > ) {

                    String ecodingUrl = encodParamters(url);

                    urlModel = new URL(ecodingUrl);

                } else {

                    urlModel = new URL(url);

                }

                httpURLConnection = (HttpURLConnection) urlModel

                        .openConnection();

                httpURLConnection.setConnectTimeout(connectTimeout);

                httpURLConnection.setReadTimeout(readTimeout);

                httpURLConnection

                        .setRequestProperty("User-Agent",

                                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");

                String redirectUrl = httpURLConnection.getURL().toString();

                if (!redirectUrl.equals(url)) {

                    LOG.info(url + "重定向后为" + redirectUrl);

                }

                String charSetHeader = httpURLConnection

                        .getHeaderField("Content-Type");

                String charSet = null;

                if (charSetHeader != null) {

                    Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");

                    Matcher m = p.matcher(charSetHeader);

                    if (m.find())

                        charSet = m.group().trim();

                    if (null == charSet) {

                        charSet = encoding;

                    }

                }

                charSet = (charSet == null ? encoding : charSet);

                in = new BufferedReader(new InputStreamReader(

                        httpURLConnection.getInputStream(), charSet));

                String inputLine;

                while ((inputLine = in.readLine()) != null) {

                    sb.append(inputLine + "\n");

                    inputLine = null;

                }

                if (in != null)

                    try {

                        in.close();

                    } catch (IOException e) {

                        LOG.error(e);

                    }

                if (httpURLConnection != null)

                    httpURLConnection.disconnect();

                break;

            } catch (Exception e) {

                if (tryNum++ == ) {

                    LOG.error("download page error [ " + urlModel + " ] ");

                    return null;

                }

                LOG.warn(tryNum + "次下载失败", e);

            }

        }

        return sb;

    }

    /**

     * 下载页面</br>

     *

     * @param page

     *            </br>

     * @return 页面源码

     * @throws IOException

     * @throws UnsupportedEncodingException

     */

    public static StringBuffer downloadHtml(String url) {

        StringBuffer sb = new StringBuffer();

        BufferedReader in = null;

        int tryNum = ;

        while (true) {

            try {

                if (tryNum > ) {

                    String ecodingUrl = encodParamters(url);

                    urlModel = new URL(ecodingUrl);

                } else {

                    urlModel = new URL(url);

                }

                httpURLConnection = (HttpURLConnection) urlModel

                        .openConnection();

                httpURLConnection.setConnectTimeout(connectTimeout);

                httpURLConnection.setReadTimeout(readTimeout);

                httpURLConnection

                        .setRequestProperty("User-Agent",

                                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");

                String redirectUrl = httpURLConnection.getURL().toString();

                if (!redirectUrl.equals(url)) {

                    LOG.info(url + "重定向后为" + redirectUrl);

                }

                String charSetHeader = httpURLConnection

                        .getHeaderField("Content-Type");

                String charSet = null;

                if (charSetHeader != null) {

                    Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");

                    Matcher m = p.matcher(charSetHeader);

                    if (m.find())

                        charSet = m.group().trim();

                    if (null == charSet) {

                        charSet = defaultEncoding;

                    }

                }

                charSet = (charSet == null ? defaultEncoding : charSet);

                in = new BufferedReader(new InputStreamReader(

                        httpURLConnection.getInputStream(), charSet));

                String inputLine;

                while ((inputLine = in.readLine()) != null) {

                    sb.append(inputLine + "\n");

                    inputLine = null;

                }

                if (in != null)

                    try {

                        in.close();

                    } catch (IOException e) {

                        LOG.error(e);

                    }

                if (httpURLConnection != null)

                    httpURLConnection.disconnect();

                break;

            } catch (Exception e) {

                if (tryNum++ == ) {

                    LOG.error("download page error [ " + urlModel + " ] ");

                    return null;

                }

                LOG.warn(tryNum + "次下载失败", e);

            }

        }

        return sb;

    }

}
秒客网

crawler_基础之_java.net.HttpURLConnection 访问网络资源

相关文章