Some notes and takeaways on Java web crawlers

Date: 2021-10-23 13:58:58

I've been working on a lot of crawler projects lately, so here are some notes for future reference.

1. Request an HTTP connection, save the content, and catch the various exceptions to handle anti-crawling

int countUrl=0;

    public String getOneHtml(String htmlurl, String encoding, String cookie) throws IOException, InterruptedException
    {
        // Retry at most 5 times; this caps the anti-crawling recovery loop
        if(countUrl==5){
            countUrl=0;
            return "0";
        }

        String temp;
        final StringBuffer sb = new StringBuffer();
        HttpURLConnection httpConn = null;
        try
        {
            URL url = new URL(htmlurl);

            httpConn = (HttpURLConnection) url.openConnection();
            // Header setup, GET method
            HttpURLConnection.setFollowRedirects(true);
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
            httpConn.setRequestProperty("Connection","keep-alive");
            httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
            httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            httpConn.setRequestProperty("cookie",cookie);
            httpConn.setRequestProperty("Cache-control","no-cache, no-store");
            httpConn.setRequestProperty("Host","www.linkedin.com");
            httpConn.setConnectTimeout(20000);
            httpConn.setReadTimeout(20000);
            // Open the connection and get the content
            BufferedReader in = new BufferedReader(new InputStreamReader(httpConn.getInputStream(), encoding));
            if(httpConn.getResponseCode()!=200){
                // Session expired or redirected: wait, log in again, then retry
                httpConn.disconnect();
                Thread.sleep(30000);

                cookie=login();
                return getOneHtml(htmlurl, encoding, cookie);
            }
            // Strip some useless symbols; the doubled backslashes match the
            // literal escape sequences (\u002d, \u0026, \n, \t, \r) that
            // LinkedIn leaves in its embedded JSON
            while ((temp = in.readLine()) != null)
            {
                temp=temp.replaceAll("   ","");
                temp=temp.replaceAll("\\\\u002d","-");
                temp=temp.replaceAll("\\\\u0026","&");
                temp=temp.replaceAll("\\\\n","");
                temp=temp.replaceAll("\\\\t","");
                temp=temp.replaceAll("\\\\r","");
                sb.append(temp);
            }
            in.close();
            httpConn.disconnect();
        }
        catch (final MalformedURLException me)
        {
            System.out.println("URL does not exist!");
            throw me;
        }
        catch (final FileNotFoundException me)
        {
            System.out.println(htmlurl+" anti-crawling triggered");
            return "0";
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            System.out.println("Anti-crawling triggered: "+htmlurl+" attempt: "+countUrl++);
            if(httpConn!=null) httpConn.disconnect();
            Thread.sleep(20000);
            return this.getOneHtml(htmlurl, encoding, cookie);
        }

        countUrl=0;

        return sb.toString();
    }
2. Simulated login to obtain the cookie:
public String login() throws MalformedURLException, InterruptedException{
        String htmlurl="https://www.linkedin.com/uas/login-submit";
        HttpURLConnection httpConn = null;
        String cookie="";
        try
        {
            URL url = new URL(htmlurl);

            httpConn = (HttpURLConnection) url.openConnection();

            HttpURLConnection.setFollowRedirects(true);
            httpConn.setRequestMethod("POST");
            httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
            httpConn.setRequestProperty("Connection","keep-alive");
            httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
            httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            httpConn.setRequestProperty("Cache-control","no-cache, no-store");
            httpConn.setRequestProperty("Host","www.linkedin.com");
            //httpConn.setRequestProperty("Referer","https://www.linkedin.com/uas/login?session_redirect=http://www.linkedin.com/profile/view?id=222323610&authType=name&authToken=fcEe");
            // POST method and redirect settings
            httpConn.setDoOutput(true);
            httpConn.setDoInput(true);
            httpConn.setUseCaches(false);
            httpConn.setInstanceFollowRedirects(true);
            httpConn.setConnectTimeout(20000);
            httpConn.setReadTimeout(20000);
            // POST data must be written to the connection's output stream
            // ("email" and "gmail" are placeholders for the real credentials)
            StringBuffer str_buf = new StringBuffer(4096);
            OutputStream os = httpConn.getOutputStream();
            str_buf.append("session_key").append("=").append("email").append("&");
            str_buf.append("session_password").append("=").append("gmail").append("&");
            //str_buf.append("session_redirect").append("=").append(redictURL);
            os.write(str_buf.toString().getBytes());
            os.flush();
            os.close();
            // Read the redirect and the cookie from the response headers
            //String redictURL = httpConn.getHeaderField("Location");
            //System.out.println("redirect location of the first request = " + redictURL);

            // Extract the cookie
            Map<String, List<String>> map = httpConn.getHeaderFields();
            Set<String> set = map.keySet();
            for (Iterator<String> iterator = set.iterator(); iterator.hasNext();) {
                String key = iterator.next();
                if(key!=null){
                    if (key.equals("Set-Cookie")) {
                        System.out.println("key=" + key + ", extracting cookie");
                        List<String> list = map.get(key);
                        for (String str : list) {
                            String temp=str.split("=")[0];
                            // The cookie carries a lot of fields; debugging showed
                            // that only li_at is needed to stay logged in
                            if(temp.equals("li_at")){
                                cookie=str;
                                return cookie;
                            }
                        }
                    }
                }
            }

            httpConn.disconnect();
        }
        catch (final MalformedURLException me)
        {
            System.out.println("URL does not exist!");
            throw me;
        }
        catch (final FileNotFoundException me)
        {
            System.out.println(htmlurl+" anti-crawling triggered");
            return "0";
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            System.out.println("Anti-crawling triggered: "+htmlurl+" attempt: "+countUrl++);
            if(httpConn!=null) httpConn.disconnect();
            Thread.sleep(20000);
            return login();
        }

        return cookie;
        //return redictURL;
    }
That covers the HTTP handling; applying the POST and GET methods flexibly is enough to fetch the HTML content.
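To tie the two together, a minimal hypothetical driver might look like this, assuming both methods live in the same crawler class (the profile URL below is a placeholder, not a real target):

String cookie = login();   // log in once and reuse the li_at cookie
String html = getOneHtml("https://www.linkedin.com/in/some-profile", "UTF-8", cookie);
if (!"0".equals(html)) {   // "0" is the sentinel the methods return when they give up
    System.out.println("fetched " + html.length() + " characters");
}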

Different sites use different anti-crawling strategies, though. Some ban your IP; some require login and ban the account. Mine is the simplest kind, a dropped connection, so I just put the thread to sleep. If you need to switch IPs, go through a proxy, or rotate cookies, analyze the site yourself; it mostly comes down to setting a few more values on httpConn.
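For the proxy case specifically, the JDK lets you route an HttpURLConnection through a proxy at openConnection time. A minimal sketch, where proxy.example.com:8080 is a placeholder for a real proxy:

import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;

// Route the request through an HTTP proxy instead of connecting directly
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("proxy.example.com", 8080));
URL url = new URL("https://www.linkedin.com/");
HttpURLConnection conn = (HttpURLConnection) url.openConnection(proxy);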

3. Data extraction:

I usually use regex matching. It works well when there isn't much data to crawl and the site returns nothing but very irregular HTML. LinkedIn, for example, keeps all the data in a JSON blob inside an HTML comment, riddled with links and odd symbols, which is hard to parse with standard tools.
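The parsing below repeats one pattern over and over: match a "key":value, fragment, then strip the key name, quotes, and trailing comma. Factored out, that pattern is roughly the following sketch (extractField is my name for it, not part of the original code):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Extract the value of "key" from a JSON-ish fragment; return defaultVal if absent
static String extractField(String text, String key, String defaultVal) {
    Pattern p = Pattern.compile("\"" + key + "\":(.*?),", Pattern.DOTALL);
    Matcher m = p.matcher(text);
    if (m.find()) {
        return m.group(1).replaceAll("\"", "").trim();
    }
    return defaultVal;
}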

//Education entries: each block sits between "fosList": and schoolLogo
        String edu="null";
        ArrayList<EduInfor> listEdu=new ArrayList<EduInfor>();
        String regex1 = "\"fosList\":.*?schoolLogo";
        Pattern pa1 = Pattern.compile(regex1, Pattern.DOTALL);
        Matcher ma1 = pa1.matcher(s);
        while(ma1.find()){
            EduInfor ei=new EduInfor(ui.getCv_id());
            edu=ma1.group();
            //School
            String school="null";
            String regex = "\"schoolName\":.*?,";
            Pattern pa= Pattern.compile(regex, Pattern.DOTALL);
            Matcher ma = pa.matcher(edu);
            if(ma.find()){
                school=ma.group();
                school=school.replaceAll("\"schoolName\":", "");
                school=school.replaceAll("\"", "");
                school=school.replaceAll(",", "");
                if(!school.equals("")){
                    ei.setCollege(school);
                }
            }
            //Degree
            String degree="null";
            regex = "\"fmt__degree_highlight\":.*?,";
            pa= Pattern.compile(regex, Pattern.DOTALL);
            ma = pa.matcher(edu);
            if(ma.find()){
                degree=ma.group();
                degree=degree.replaceAll("\"fmt__degree_highlight\":", "");
                degree=degree.replaceAll("\"", "");
                degree=degree.replaceAll(",", "");
                // drop the literal \u0027s ("'s") suffix, e.g. Bachelor's
                degree=degree.replaceAll("\\\\u0027s", "");
                if(!degree.equals("")){
                    ei.setDegree_name(degree);
                }
            }
            //Major
            String major="null";
            regex = "\"fmt__fos_highlight\":.*?,";
            pa= Pattern.compile(regex, Pattern.DOTALL);
            ma = pa.matcher(edu);
            if(ma.find()){
                major=ma.group();
                major=major.replaceAll("\"fmt__fos_highlight\":", "");
                major=major.replaceAll("\"", "");
                major=major.replaceAll(",", "");
                if(!major.equals("")){
                    ei.setMajor(major);
                }
            }
            //Academic level, e.g. "grade":"1st"
            String academic="null";
            regex = "\"grade\":.*?,";
            pa= Pattern.compile(regex, Pattern.DOTALL);
            ma = pa.matcher(edu);
            if(ma.find()){
                academic=ma.group();
                academic=academic.replaceAll("\"grade\":", "");
                academic=academic.replaceAll("\"", "");
                academic=academic.replaceAll(",", "");
                if(!academic.equals("")){
                    ei.setAcademic_name(academic);
                }
            }
            //Dates, e.g. "enddate_my":"2005","startdate_my":"2002"
            String s_time="null";
            regex = "\"startdate_my\":.*?,";
            pa= Pattern.compile(regex, Pattern.DOTALL);
            ma = pa.matcher(edu);
            if(ma.find()){
                s_time=ma.group();
                s_time=s_time.replaceAll("\"startdate_my\":", "");
                s_time=s_time.replaceAll("\"", "");
                s_time=s_time.replaceAll(",", "");
                s_time=s_time.replaceAll(" ", "");
                if(!s_time.equals("")){
                    ei.setStart_time(s_time);
                }
            }

            String e_time="null";
            regex = "\"enddate_my\":.*?,";
            pa= Pattern.compile(regex, Pattern.DOTALL);
            ma = pa.matcher(edu);
            if(ma.find()){
                e_time=ma.group();
                e_time=e_time.replaceAll("\"enddate_my\":", "");
                e_time=e_time.replaceAll("\"", "");
                e_time=e_time.replaceAll(",", "");
                e_time=e_time.replaceAll(" ", "");
                if(!e_time.equals("")){
                    ei.setEnd_time(e_time);
                }
            }else{
                // no end date means the person is still enrolled
                ei.setEnd_time("present");
            }
            listEdu.add(ei);

        }
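The EduInfor bean isn't shown in the post; judging from the constructor and setters used above, it is presumably a plain holder along these lines (the field names and String types are my guesses, not the original class):

// Assumed shape of the EduInfor bean, inferred from the calls above
public class EduInfor {
    private final String cv_id;
    private String college, degree_name, major, academic_name, start_time, end_time;

    public EduInfor(String cv_id) { this.cv_id = cv_id; }
    public void setCollege(String college) { this.college = college; }
    public void setDegree_name(String degree_name) { this.degree_name = degree_name; }
    public void setMajor(String major) { this.major = major; }
    public void setAcademic_name(String academic_name) { this.academic_name = academic_name; }
    public void setStart_time(String start_time) { this.start_time = start_time; }
    public void setEnd_time(String end_time) { this.end_time = end_time; }
}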