爬虫运行过程中经常需要使用 IP 代理。网上有很多站点提供可用的 IP 代理信息(例如下文代码中抓取的 http://pachong.org/)。
下面介绍如何从中选出一个最合适的 IP。实现思路:
1. 获取该页面内容,提取每一行 IP 信息。
2. 新建一个 JavaBean,封装每个 IP 的属性。
3. 将所有的 JavaBean 添加到一个 ArrayList 中。
4. 对整个 ArrayList 按优先级从高到低排序。
代码如下:
/**
 * Downloads the list of currently available crawler proxies from a public
 * listing page and ranks them by {@link ProxyConfigBean#getPriority()}.
 */
public class CrawlProxyIp5Net {

    /** Page that publishes the proxy table. */
    private static final String PROXY_LIST_URL = "http://pachong.org/";

    /** XPath of the table body whose rows each describe one proxy. */
    private static final String PROXY_TABLE_XPATH = "//table[@class='tb']/tbody";

    /** A row must have cells 0..5, because cells 1-5 are read below. */
    private static final int MIN_CELL_COUNT = 6;

    /** A port is accepted only if it is made of 1 to 5 decimal digits. */
    private static final String PORT_PATTERN = "[0-9]{1,5}";

    /** Orders beans from the highest priority to the lowest. */
    private static final Comparator<ProxyConfigBean> BY_PRIORITY_DESC =
            new Comparator<ProxyConfigBean>() {
                @Override
                public int compare(ProxyConfigBean bean1, ProxyConfigBean bean2) {
                    int scores1 = bean1.getPriority();
                    int scores2 = bean2.getPriority();
                    if (scores1 == scores2) {
                        return 0;
                    }
                    // Explicit comparison instead of subtraction: cannot overflow.
                    return scores1 < scores2 ? 1 : -1;
                }
            };

    /**
     * Fetches the proxy listing page and converts every usable table row into a
     * {@link ProxyConfigBean}.
     *
     * <p>Failures while fetching or parsing are reported on standard error and
     * do not propagate; whatever was collected up to that point is returned.
     *
     * @return the proxies sorted by descending priority; never {@code null},
     *         empty when nothing could be read
     */
    public static ArrayList<ProxyConfigBean> getProxyConfigs() {
        ArrayList<ProxyConfigBean> list = new ArrayList<ProxyConfigBean>();
        WebClient client = new WebClient(BrowserVersion.CHROME);
        try {
            // Only the static HTML table is needed, so skip script and CSS processing.
            client.getOptions().setJavaScriptEnabled(false);
            client.getOptions().setCssEnabled(false);
            HtmlPage page = client.getPage(PROXY_LIST_URL);
            List<?> bodies = page.getByXPath(PROXY_TABLE_XPATH);
            // The page layout may change: tolerate a missing table instead of failing on get(0).
            if (!bodies.isEmpty() && bodies.get(0) instanceof HtmlTableBody) {
                List<HtmlTableRow> tableRows = ((HtmlTableBody) bodies.get(0)).getRows();
                if (tableRows != null) {
                    for (HtmlTableRow tableRow : tableRows) {
                        ProxyConfigBean bean = toBean(tableRow);
                        if (bean != null) {
                            list.add(bean);
                        }
                    }
                }
            }
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        } finally {
            // Always close the client's windows, including when fetching or parsing failed.
            client.closeAllWindows();
        }
        // Sort outside the try block so a partially collected list is ranked as well.
        Collections.sort(list, BY_PRIORITY_DESC);
        return list;
    }

    /**
     * Builds a bean from one table row. Cells 1 to 5 are read as IP, port,
     * country, type and status; cell 0 is not used.
     *
     * @param tableRow the row to convert
     * @return the populated bean, or {@code null} when the row has too few cells
     */
    private static ProxyConfigBean toBean(HtmlTableRow tableRow) {
        if (tableRow == null || tableRow.getCells().size() < MIN_CELL_COUNT) {
            return null;
        }
        ProxyConfigBean bean = new ProxyConfigBean();
        bean.setIp(MyStringUtils.pureString(tableRow.getCell(1).asText()));
        String portValue = MyStringUtils.pureString(tableRow.getCell(2).asText());
        // Parse only purely numeric text; otherwise the port keeps its default value.
        if (portValue != null && portValue.matches(PORT_PATTERN)) {
            bean.setPort(Integer.parseInt(portValue));
        }
        bean.setCountry(MyStringUtils.pureString(tableRow.getCell(3).asText()));
        bean.setType(MyStringUtils.pureString(tableRow.getCell(4).asText()));
        bean.setStatu(MyStringUtils.pureString(tableRow.getCell(5).asText()));
        // The priority is derived from the other properties, so it must be set last.
        bean.setPriority();
        return bean;
    }

    /**
     * Returns a proxy configuration for the highest ranked proxy.
     *
     * @return the configuration of the best proxy, or {@code null} when no
     *         proxy could be obtained
     */
    public static ProxyConfig getProxyConfig() {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        if (list == null || list.isEmpty()) {
            return null;
        }
        // The list is sorted by descending priority, so the first entry is the best one.
        ProxyConfigBean bean = list.get(0);
        ProxyConfig proxyConfig = new ProxyConfig();
        proxyConfig.setProxyHost(bean.getIp());
        proxyConfig.setProxyPort(bean.getPort());
        return proxyConfig;
    }

    /**
     * Prints country, port and IP of every proxy found, one per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        for (ProxyConfigBean bean : list) {
            System.out.println(bean.getCountry() +" "+bean.getPort()+ " "+bean.getIp());
        }
        System.out.println("done.......");
    }
}
其中 JavaBean 的代码如下:
/**
 * Describes one proxy entry and a priority score derived from its properties.
 * A higher priority means a more desirable proxy.
 */
public class ProxyConfigBean {
    private String ip;
    private int port;
    private String country; // China or another country
    private String type;    // anonymity level: high, anonymous, elite, transparent
    private String statu;   // load status: idle, busy, fairly busy
    private int priority;   // score computed by setPriority()

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getStatu() {
        return statu;
    }

    public void setStatu(String statu) {
        this.statu = statu;
    }

    public int getPriority() {
        return priority;
    }

    /**
     * Recomputes the priority from the current country, status, type and port.
     *
     * <p>The score is rebuilt from zero on every call, so calling this method
     * repeatedly (for example after changing a property) never accumulates.
     * Properties that are {@code null} simply contribute nothing.
     */
    public void setPriority() {
        int score = 0;
        // Country: proxies located in China are strongly preferred.
        if (this.country != null && this.country.contains("中国")) {
            score += 50;
        }
        // Status: idle (+20), fairly busy (+10), anything else such as busy (+0).
        if (this.statu != null) {
            if (this.statu.contains("空闲")) {
                score += 20;
            } else if (this.statu.contains("较忙")) {
                score += 10;
            }
        }
        // Type: anonymous (+5), high (+4), transparent (+3), elite (+2); first match wins.
        if (this.type != null) {
            if (this.type.contains("anonymous")) {
                score += 5;
            } else if (this.type.contains("high")) {
                score += 4;
            } else if (this.type.contains("transparent")) {
                score += 3;
            } else if (this.type.contains("elite")) {
                score += 2;
            }
        }
        // Port: port 80 is considered the best choice.
        if (this.port == 80) {
            score += 1;
        }
        this.priority = score;
    }
}