爬虫代理ip设置

时间:2021-01-02 16:59:03

在编写爬虫的过程中,可能会涉及到 IP 代理的问题。网上有很多可用的 IP 代理信息,例如:

http://pachong.org/

爬虫代理ip设置

下面就是从中选择一个最合适的ip。实现思路:

1. 获取该页面内容,提取每一行ip信息。

2. 新建一个 JavaBean,封装每一个 IP 的属性。

3. 将所有的 JavaBean 添加到一个 ArrayList 中。

4. 对整个 ArrayList 排序。

代码如下:

/**
 * Downloads the currently published crawler proxies from http://pachong.org/
 * and ranks them so that the most suitable proxy comes first.
 */
public class CrawlProxyIp5Net {

    /** A usable row must expose cells at indexes 1..5, i.e. at least six cells. */
    private static final int MIN_CELLS_PER_ROW = 6;

    /**
     * Fetches the proxy table, converts every usable row into a
     * {@link ProxyConfigBean} and sorts the result by descending priority.
     *
     * <p>This is a best-effort call: HTTP and I/O failures are printed and the
     * proxies collected so far (possibly none) are returned.
     *
     * @return the proxies found, highest priority first; never {@code null}
     */
    public static ArrayList<ProxyConfigBean> getProxyConfigs() {
        ArrayList<ProxyConfigBean> list = new ArrayList<ProxyConfigBean>();
        WebClient client = new WebClient(BrowserVersion.CHROME);
        try {
            client.getOptions().setJavaScriptEnabled(false);
            client.getOptions().setCssEnabled(false);

            HtmlPage page = client.getPage("http://pachong.org/");

            // The page layout may change; do not assume the table is present.
            List<?> bodies = page.getByXPath("//table[@class='tb']/tbody");
            if (!bodies.isEmpty()) {
                HtmlTableBody tableBody = (HtmlTableBody) bodies.get(0);
                List<HtmlTableRow> tableRows = tableBody.getRows();
                if (tableRows != null) {
                    for (HtmlTableRow tableRow : tableRows) {
                        ProxyConfigBean bean = toBean(tableRow);
                        if (bean != null) {
                            list.add(bean);
                        }
                    }
                }
            }
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the client even when the download or parsing failed.
            client.closeAllWindows();
        }

        // Sort outside the try block so a partially filled list is still ordered.
        Collections.sort(list, new Comparator<ProxyConfigBean>() {
            @Override
            public int compare(ProxyConfigBean bean1, ProxyConfigBean bean2) {
                // Descending order; Integer.compare avoids subtraction overflow.
                return Integer.compare(bean2.getPriority(), bean1.getPriority());
            }
        });

        return list;
    }

    /**
     * Converts one table row into a bean, or returns {@code null} when the row
     * has too few cells (for example a header or spacer row).
     */
    private static ProxyConfigBean toBean(HtmlTableRow tableRow) {
        if (tableRow == null || tableRow.getCells().size() < MIN_CELLS_PER_ROW) {
            return null;
        }

        ProxyConfigBean bean = new ProxyConfigBean();
        bean.setIp(MyStringUtils.pureString(tableRow.getCell(1).asText()));

        String portValue = MyStringUtils.pureString(tableRow.getCell(2).asText());
        // Only parse purely numeric text; otherwise the port keeps its default value.
        if (portValue != null && portValue.matches("[0-9]+")) {
            try {
                bean.setPort(Integer.parseInt(portValue));
            } catch (NumberFormatException e) {
                // Digit string too large for an int: keep the default port.
            }
        }

        bean.setCountry(MyStringUtils.pureString(tableRow.getCell(3).asText()));
        bean.setType(MyStringUtils.pureString(tableRow.getCell(4).asText()));
        bean.setStatu(MyStringUtils.pureString(tableRow.getCell(5).asText()));

        // The priority is derived from the other properties, so set it last.
        bean.setPriority();
        return bean;
    }

    /**
     * Returns the highest-priority proxy as a {@code ProxyConfig}.
     *
     * @return the best proxy, or {@code null} when no proxy could be obtained
     */
    public static ProxyConfig getProxyConfig() {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        if (list == null || list.isEmpty()) {
            return null;
        }
        ProxyConfigBean bean = list.get(0);
        ProxyConfig proxyConfig = new ProxyConfig();
        proxyConfig.setProxyHost(bean.getIp());
        proxyConfig.setProxyPort(bean.getPort());
        return proxyConfig;
    }

    /** Prints country, port and IP of every proxy found, best first. */
    public static void main(String[] args) {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        for (ProxyConfigBean bean : list) {
            System.out.println(bean.getCountry() + "  " + bean.getPort() + "  " + bean.getIp());
        }
        System.out.println("done.......");
    }
}

其中 JavaBean 的代码如下:

/**
 * Holds the properties of one proxy entry together with a priority score
 * computed from them by {@link #setPriority()}.
 */
public class ProxyConfigBean {

    private String ip;
    private int port;
    private String country; // "中国" (China) or another country
    private String type;    // anonymity level: high, anonymous, elite, transparent
    private String statu;   // load status: "空闲" (idle), "繁忙" (busy), "较忙" (fairly busy)
    private int priority;   // ranking score; larger is better

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getStatu() {
        return statu;
    }

    public void setStatu(String statu) {
        this.statu = statu;
    }

    public int getPriority() {
        return priority;
    }

    /**
     * Recomputes the priority from country, status, type and port.
     *
     * <p>The score is rebuilt from zero on every call, so calling this method
     * repeatedly yields the same value. Properties that are still {@code null}
     * contribute nothing. Call it after the other properties have been set.
     */
    public void setPriority() {
        int score = 0;

        // Country: proxies located in China get +50.
        if (has(this.country, "中国")) {
            score += 50;
        }

        // Status: idle +20, fairly busy +10, anything else (e.g. busy) +0.
        if (has(this.statu, "空闲")) {
            score += 20;
        } else if (has(this.statu, "较忙")) {
            score += 10;
        }

        // Type: anonymous +5, high +4, transparent +3, elite +2.
        if (has(this.type, "anonymous")) {
            score += 5;
        } else if (has(this.type, "high")) {
            score += 4;
        } else if (has(this.type, "transparent")) {
            score += 3;
        } else if (has(this.type, "elite")) {
            score += 2;
        }

        // Port: port 80 is preferred, +1.
        if (this.port == 80) {
            score += 1;
        }

        this.priority = score;
    }

    /** Null-safe {@code value.contains(token)}. */
    private static boolean has(String value, String token) {
        return value != null && value.contains(token);
    }
}