使用代理(WebProxy)爬虫

时间:2021-08-17 09:11:07

关键代码:

 private Hashtable hash;// Stores the proxy IPs
private WebProxy currentdaili; // Proxy currently used for outgoing requests
private int dailiExecMaxCount; // Maximum number of requests to run through one proxy
private int currentDailiExecCount; // Number of requests run through the current proxy so far
public Handler2() // Constructor: initialises the counters and selects the first proxy
{
// NOTE(review): the numeric literals of the next two assignments are missing in this
// listing; restore the intended values before compiling.
dailiExecMaxCount = ;
currentDailiExecCount = ;
//hash = GetDailiList();
// GetOneDaili() fills the proxy table itself when it is still empty.
currentdaili = GetOneDaili();
} //http://www.xici.net.co
/// <summary>
/// Downloads the proxy listing page and collects the rows whose protocol cell reads "HTTP"
/// into a Hashtable. Each entry uses the same string as both key and value.
/// KK 2015-04-22
/// </summary>
/// <returns>The proxy table, or null when the page returned no content or parsing failed.</returns>
private Hashtable GetDailiList()
{
Hashtable result = new Hashtable();
string strUrl = string.Format("http://www.xici.net.co");
string detailContext = GetHtmlByUrl(strUrl);
if (!string.IsNullOrEmpty(detailContext))
{
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
string strkeyvalue = string.Empty;
try
{
doc.LoadHtml(detailContext);
HtmlNode node = doc.DocumentNode;
// Rows of the table with id "ip_list" whose class is "odd" or empty.
// NOTE(review): if no row matches, trlist may be null and the foreach would throw;
// the catch below would then discard the whole result. TODO confirm.
HtmlNodeCollection trlist = node.SelectNodes("//table[@id='ip_list']//tr[@class='odd' or @class='']");
foreach (HtmlNode item in trlist)
{
// NOTE(review): the column indexes inside [] are missing in this listing; presumably they
// select the protocol, IP and port cells. Confirm against the page layout.
if (item.SelectNodes("td")[].InnerText.ToUpper() == "HTTP")
{
strkeyvalue = item.SelectNodes("td")[].InnerText + ":" + item.SelectNodes("td")[].InnerText;
// Hashtable.Add throws on a duplicate key, which would end up in the catch below.
result.Add(strkeyvalue, strkeyvalue);
}
}
}
catch (Exception ex)
{
webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + ex.Message);
// Any failure discards everything collected so far.
result = null;
} }
else
{
result = null;
}
return result;
} /// <summary>
/// Picks a random proxy from the cached proxy table and makes it the current proxy.
/// The proxy that was current before the call is removed from the table so it is not
/// handed out again, and the table is re-fetched through GetDailiList() when it is
/// missing or empty. Selecting a proxy resets the per-proxy request counter.
/// </summary>
/// <returns>The newly selected proxy, or null when no proxy could be obtained.</returns>
private WebProxy GetOneDaili()
{
    try
    {
        // Table keys have the form "host:port". Uri.Authority already contains the port
        // whenever it is not the scheme default, so the key is built from Host and Port;
        // Authority + ":" + Port would yield "host:port:port" and never match.
        string usedKey = currentdaili == null
            ? null
            : currentdaili.Address.Host + ":" + currentdaili.Address.Port;

        if (hash == null || hash.Count == 0)
        {
            hash = GetDailiList();
        }

        if (hash != null && usedKey != null)
        {
            // Hashtable.Remove is a no-op when the key is absent.
            hash.Remove(usedKey);
        }

        if (hash == null || hash.Count == 0)
        {
            // Nothing left to choose from; the next call will try to re-fetch the list.
            currentdaili = null;
        }
        else
        {
            // Pick the entry at a zero-based random position. Comparing before the
            // increment makes every position, including 0, selectable.
            int target = new Random().Next(hash.Count);
            int index = 0;
            foreach (System.Collections.DictionaryEntry entry in hash)
            {
                if (index == target)
                {
                    currentdaili = new WebProxy(entry.Key.ToString(), true);
                    // The counter tracks requests through the current proxy only.
                    currentDailiExecCount = 0;
                    break;
                }
                index++;
            }
        }
    }
    catch (Exception ex)
    {
        webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==" + ex.Message);
        currentdaili = null;
    }

    // currentdaili may be null here, so guard the log line against a NullReferenceException.
    logclass.Debug("======当前代理======" + (currentdaili == null
        ? "null"
        : currentdaili.Address.Host + ":" + currentdaili.Address.Port));
    return currentdaili;
}

使用:

 /// <summary>
/// Sends a GET request through a proxy and returns the response read via HttpString(Encoding.UTF8).
/// If the request throws, it is retried once with a proxy freshly obtained from GetOneDaili().
/// </summary>
/// <param name="strUrl">URL to request.</param>
/// <param name="isRetry">True when this call is already the retry; a failure is then not retried again.</param>
/// <param name="daili">Proxy to use for this call; when null the current proxy is used, or a new one once the use counter exceeds the maximum.</param>
/// <returns>The response content as a string.</returns>
private string GetHtmlByUrl(string strUrl, bool isRetry = false, WebProxy daili = null)
{
// Counts every call, including retries and calls that pass an explicit proxy.
currentDailiExecCount++;
if (currentDailiExecCount > dailiExecMaxCount)
{
// NOTE(review): currentdaili is dereferenced here without a null check; GetOneDaili() sets it to null in its catch block.
logclass.Debug("======当前代理======" + currentdaili.Address.Authority + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString()+")");
}
try
{
HttpWebResponse response = new webframework.common.HttpHelper()
{
URL = string.Format("{0}", strUrl),
//Proxy = daili == null ? currentdaili : daili,
//Proxy = new WebProxy("218.204.140.97:8118", true),
// An explicit proxy wins; otherwise switch to a new proxy once the counter exceeds the maximum.
Proxy = daili == null ? (currentDailiExecCount > dailiExecMaxCount ? GetOneDaili() : currentdaili) : daili,
// NOTE(review): the operands of this expression are missing in this listing; restore the intended timeout.
Timeout = * ,
}.CreateGetHttpResponse(); return response.HttpString(Encoding.UTF8);
}
catch (Exception)
{
// Retry the request once with a different proxy.
if (!isRetry)
return GetHtmlByUrl(strUrl, true, GetOneDaili());
else
// NOTE(review): `throw null` raises a NullReferenceException and discards the original
// failure; a bare `throw;` would preserve it.
throw null;
} } /// <summary>
/// Sends a POST request through a proxy and returns the response read via HttpString(Encoding.UTF8).
/// If the request throws, it is retried once with a proxy freshly obtained from GetOneDaili().
/// </summary>
/// <param name="strUrl">URL to post to.</param>
/// <param name="strPostString">Request body; sent with PostEncoding set to UTF-8.</param>
/// <param name="isRetry">True when this call is already the retry; a failure is then not retried again.</param>
/// <param name="daili">Proxy to use for this call; when null the current proxy is used, or a new one once the use counter exceeds the maximum.</param>
/// <returns>The response content as a string.</returns>
private string PostHtmlByUrl(string strUrl, string strPostString, bool isRetry = false, WebProxy daili = null)
{
// Counts every call, including retries and calls that pass an explicit proxy.
currentDailiExecCount++;
if (currentDailiExecCount > dailiExecMaxCount)
{
// NOTE(review): currentdaili is dereferenced here without a null check; GetOneDaili() sets it to null in its catch block.
logclass.Debug("======当前代理======" + currentdaili.Address.Authority + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString()+")");
}
try
{
HttpWebResponse response = new HttpHelper()
{
URL = strUrl,
PostString = strPostString,
//Proxy = new WebProxy("218.204.140.97:8118", true),
// An explicit proxy wins; otherwise switch to a new proxy once the counter exceeds the maximum.
Proxy = daili == null ? (currentDailiExecCount>dailiExecMaxCount?GetOneDaili(): currentdaili) : daili,
//Proxy = daili == null ? currentdaili : daili,
PostEncoding = Encoding.UTF8,
// NOTE(review): the operands of this expression are missing in this listing; restore the intended timeout.
Timeout = * ,
}.CreatePostHttpResponse(); return response.HttpString(Encoding.UTF8);
}
catch (Exception)
{
// Retry the request once with a different proxy.
if (!isRetry)
return PostHtmlByUrl(strUrl, strPostString, true, GetOneDaili());
else
// NOTE(review): `throw null` raises a NullReferenceException and discards the original
// failure; a bare `throw;` would preserve it.
throw null;
} }

参考资料:

http://www.haolizi.net/example/view_199.html