使用代理(WebProxy)爬虫

时间:2022-06-28 14:00:48

关键代码:

使用代理(WebProxy)爬虫
 /// <summary>Proxy addresses available for rotation; loaded on demand by GetOneDaili.</summary>
 private Hashtable hash;

 /// <summary>Proxy currently used for outgoing requests.</summary>
 private WebProxy currentdaili;

 /// <summary>Maximum number of requests a single proxy may serve.</summary>
 private int dailiExecMaxCount;

 /// <summary>Number of requests issued through the current proxy.</summary>
 private int currentDailiExecCount;

 /// <summary>
 /// Zeroes the request counter, sets the per-proxy request limit to 100 and
 /// selects the first proxy through GetOneDaili.
 /// </summary>
 public Handler2()
 {
     currentDailiExecCount = 0;
     dailiExecMaxCount = 100;

     // The proxy table is not loaded here; GetOneDaili fetches it when it
     // finds the table null or empty.
     currentdaili = GetOneDaili();
 }
12
13
14 //http://www.xici.net.co
/// <summary>
/// Downloads the proxy listing page at http://www.xici.net.co and collects
/// every table row whose protocol column reads HTTP.
/// KK 2015-04-22
/// </summary>
/// <returns>
/// A Hashtable whose keys and values are both "ip:port" strings, or null when
/// the page could not be downloaded or the proxy table could not be found.
/// </returns>
private Hashtable GetDailiList()
{
    const string strUrl = "http://www.xici.net.co";

    try
    {
        // Request with isRetry = true and the current proxy passed explicitly:
        // the retry path of GetHtmlByUrl calls GetOneDaili, which calls back
        // into this method while the proxy table is still empty, so allowing
        // a retry here can recurse without bound when the site is unreachable.
        string detailContext = GetHtmlByUrl(strUrl, true, currentdaili);
        if (string.IsNullOrEmpty(detailContext))
        {
            return null;
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(detailContext);

        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlNodeCollection trlist = doc.DocumentNode.SelectNodes("//table[@id='ip_list']//tr[@class='odd' or @class='']");
        if (trlist == null)
        {
            webframework.common.logclass.Debug("======取代理ip出错====GetDaili==ip_list rows not found");
            return null;
        }

        Hashtable result = new Hashtable();
        foreach (HtmlNode item in trlist)
        {
            HtmlNodeCollection cells = item.SelectNodes("td");

            // Skip header rows and malformed rows instead of failing the whole list.
            if (cells == null || cells.Count < 6)
            {
                continue;
            }

            if (!string.Equals(cells[5].InnerText.Trim(), "HTTP", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string strkeyvalue = cells[1].InnerText.Trim() + ":" + cells[2].InnerText.Trim();

            // Indexer assignment tolerates a proxy that is listed twice;
            // Hashtable.Add would throw on the duplicate key.
            result[strkeyvalue] = strkeyvalue;
        }

        return result;
    }
    catch (Exception ex)
    {
        webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + ex.Message);
        return null;
    }
}
57
// One generator for all selections; a new Random per call can repeat the
// same value when calls land on the same clock tick.
private static readonly Random dailiRandom = new Random();

// True while the proxy table is being downloaded, so a nested call made by
// the download itself does not start another download.
private bool dailiListLoading;

/// <summary>
/// Picks a random proxy from the proxy table, stores it in currentdaili and
/// resets the per-proxy request counter. The proxy that was in use is removed
/// from the table first; the table is reloaded through GetDailiList when it
/// is null or empty.
/// </summary>
/// <returns>
/// The newly selected proxy. When no replacement is available the previous
/// proxy (possibly null) is returned unchanged; null when selection throws.
/// </returns>
private WebProxy GetOneDaili()
{
    // Re-entered from the request made while loading the table: keep the
    // current proxy rather than recursing into another load.
    if (dailiListLoading)
    {
        return currentdaili;
    }

    try
    {
        // Build the key from Host and Port. Uri.Authority already contains a
        // non-default port, so Authority + ":" + Port does not match the
        // "ip:port" keys stored in the table.
        string currentKey = null;
        if (currentdaili != null && currentdaili.Address != null)
        {
            currentKey = currentdaili.Address.Host + ":" + currentdaili.Address.Port;
        }

        if (hash != null && currentKey != null)
        {
            hash.Remove(currentKey);
        }

        if (hash == null || hash.Count == 0)
        {
            dailiListLoading = true;
            try
            {
                hash = GetDailiList();
            }
            finally
            {
                dailiListLoading = false;
            }

            // The fresh list may contain the proxy being replaced.
            if (hash != null && currentKey != null)
            {
                hash.Remove(currentKey);
            }
        }

        if (hash != null && hash.Count > 0)
        {
            // Random.Next(n) returns 0..n-1, which indexes the key array directly.
            object[] keys = new object[hash.Count];
            hash.Keys.CopyTo(keys, 0);
            currentdaili = new WebProxy(keys[dailiRandom.Next(keys.Length)].ToString(), true);

            // A new proxy starts with a fresh request count.
            currentDailiExecCount = 0;
        }
        else
        {
            webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==no replacement proxy available");
        }
    }
    catch (Exception ex)
    {
        webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==" + ex.Message);
        currentdaili = null;
    }

    // currentdaili may be null here, so describe it without dereferencing it.
    string description = (currentdaili == null || currentdaili.Address == null)
        ? "(none)"
        : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
    logclass.Debug("======当前代理======" + description);
    return currentdaili;
}
View Code

使用:

使用代理(WebProxy)爬虫
 /// <summary>
 /// Sends a GET request through a proxy and returns the response body
 /// decoded as UTF-8. When the current proxy has served more requests than
 /// dailiExecMaxCount and no explicit proxy is given, a new proxy is
 /// selected first. A failed first attempt is repeated once with another
 /// proxy from GetOneDaili.
 /// </summary>
 /// <param name="strUrl">Address to request.</param>
 /// <param name="isRetry">True when this call is already the retry; a failure is then rethrown.</param>
 /// <param name="daili">Proxy to use for this request; null selects the current proxy.</param>
 /// <returns>The response body as a string.</returns>
 private string GetHtmlByUrl(string strUrl, bool isRetry = false, WebProxy daili = null)
 {
     currentDailiExecCount++;
     bool overLimit = currentDailiExecCount > dailiExecMaxCount;
     if (overLimit)
     {
         // currentdaili can be null (no proxy selected), so do not dereference it blindly.
         string description = (currentdaili == null || currentdaili.Address == null)
             ? "(none)"
             : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
         logclass.Debug("======当前代理======" + description + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");
     }

     WebProxy proxy = daili;
     if (proxy == null)
     {
         if (overLimit)
         {
             proxy = GetOneDaili();

             // Start counting for the new proxy; without this every later
             // request would rotate the proxy again.
             currentDailiExecCount = 1;
         }
         else
         {
             proxy = currentdaili;
         }
     }

     try
     {
         // Dispose the response so the underlying connection is released.
         using (HttpWebResponse response = new webframework.common.HttpHelper()
         {
             URL = string.Format("{0}", strUrl),
             Proxy = proxy,
             Timeout = 5 * 1000,
         }.CreateGetHttpResponse())
         {
             return response.HttpString(Encoding.UTF8);
         }
     }
     catch (Exception)
     {
         if (isRetry)
         {
             // Rethrow the real failure; "throw null" raised a
             // NullReferenceException that hid the cause.
             throw;
         }

         // Retry once with a different proxy.
         return GetHtmlByUrl(strUrl, true, GetOneDaili());
     }
 }
37
38
/// <summary>
/// Sends a POST request through a proxy and returns the response body
/// decoded as UTF-8. When the current proxy has served more requests than
/// dailiExecMaxCount and no explicit proxy is given, a new proxy is selected
/// first. A failed first attempt is repeated once with another proxy from
/// GetOneDaili.
/// </summary>
/// <param name="strUrl">Address to post to.</param>
/// <param name="strPostString">Request body, sent UTF-8 encoded.</param>
/// <param name="isRetry">True when this call is already the retry; a failure is then rethrown.</param>
/// <param name="daili">Proxy to use for this request; null selects the current proxy.</param>
/// <returns>The response body as a string.</returns>
private string PostHtmlByUrl(string strUrl, string strPostString, bool isRetry = false, WebProxy daili = null)
{
    currentDailiExecCount++;
    bool overLimit = currentDailiExecCount > dailiExecMaxCount;
    if (overLimit)
    {
        // currentdaili can be null (no proxy selected), so do not dereference it blindly.
        string description = (currentdaili == null || currentdaili.Address == null)
            ? "(none)"
            : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
        logclass.Debug("======当前代理======" + description + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");
    }

    WebProxy proxy = daili;
    if (proxy == null)
    {
        if (overLimit)
        {
            proxy = GetOneDaili();

            // Start counting for the new proxy; without this every later
            // request would rotate the proxy again.
            currentDailiExecCount = 1;
        }
        else
        {
            proxy = currentdaili;
        }
    }

    try
    {
        // Dispose the response so the underlying connection is released.
        using (HttpWebResponse response = new HttpHelper()
        {
            URL = strUrl,
            PostString = strPostString,
            Proxy = proxy,
            PostEncoding = Encoding.UTF8,
            Timeout = 5 * 1000,
        }.CreatePostHttpResponse())
        {
            return response.HttpString(Encoding.UTF8);
        }
    }
    catch (Exception)
    {
        if (isRetry)
        {
            // Rethrow the real failure; "throw null" raised a
            // NullReferenceException that hid the cause.
            throw;
        }

        // Retry once with a different proxy.
        return PostHtmlByUrl(strUrl, strPostString, true, GetOneDaili());
    }
}
View Code

 

 

 

参考资料:

http://www.haolizi.net/example/view_199.html