公司编辑妹子需要爬取网页内容,让我帮忙做了一个简单的爬取工具。
爬取网页内容这件事对大家来说应该都不难,但这里有一些小改动,代码献上,供大家参考。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
/// <summary>
/// Downloads the HTML of the given URL as a string.
/// Tries to decode the body as UTF-8 first; if the request or decoding
/// fails, retries once decoding as GB2312 (common on Chinese sites).
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The page HTML, or string.Empty when both attempts fail.</returns>
private string gethttpwebrequest(string url)
{
    string strhtml = string.Empty;
    try
    {
        strhtml = fetchhtml(url, Encoding.UTF8);
    }
    catch
    {
        try
        {
            // Retry with the legacy Chinese encoding, mirroring the
            // original fallback branch.
            strhtml = fetchhtml(url, Encoding.GetEncoding("gb2312"));
        }
        catch
        {
            // Both attempts failed; return the empty string so callers
            // keep the original best-effort behavior.
        }
    }
    return strhtml;
}

/// <summary>
/// Performs a single GET request and reads the body with the given encoding.
/// A WebException that still carries a response (e.g. a 403/404 page served
/// to scrapers) is read instead of rethrown — the original anti-scraping
/// workaround.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>The decoded response body.</returns>
private static string fetchhtml(string url, Encoding encoding)
{
    // Configure the request BEFORE sending it. The original called
    // GetResponse() on a bare WebRequest first, firing an extra request
    // whose response was never closed.
    HttpWebRequest myreq = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // Header VALUE only — the original wrongly embedded the
    // "user-agent:" header name inside the value.
    myreq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    myreq.Accept = "*/*";
    myreq.KeepAlive = true;
    myreq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    HttpWebResponse result;
    try
    {
        result = (HttpWebResponse)myreq.GetResponse();
    }
    catch (WebException ex)
    {
        // Some servers answer with an error status that still contains
        // usable HTML; use that response when one is attached.
        if (ex.Response == null)
        {
            throw;
        }
        result = (HttpWebResponse)ex.Response;
    }

    // using guarantees the response, stream and reader are released even
    // when reading throws (the original leaked them on failure).
    using (result)
    using (Stream recevicestream = result.GetResponseStream())
    using (StreamReader readerofstream = new StreamReader(recevicestream, encoding))
    {
        return readerofstream.ReadToEnd();
    }
}
|
这是根据 URL 爬取网页源码的方法,有一些小改动:很多网页有不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过简单改动也能爬取。
以下是爬取网页中所有网址链接的方法。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
/// <summary>
/// Extracts all hyperlink URLs from a page's HTML.
/// Root-relative hrefs (href="/x") are resolved against the site root and
/// document-relative hrefs against the URL's directory, then every
/// scheme://… token inside an &lt;a … href=…&gt; tag is collected, with
/// surrounding quotes stripped and duplicates removed.
/// </summary>
/// <param name="htmlcode">Raw HTML of the page.</param>
/// <param name="url">URL the HTML was fetched from; used to absolutize relative links.</param>
/// <returns>Distinct absolute link URLs, in document order.</returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    List<string> weburllistzx = new List<string>();

    // Site root, e.g. "http://example.com/" — used for root-relative hrefs.
    Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
    string wangzhanyuming = reg.Match(url, 0).Value;
    // Current directory, e.g. "http://example.com/news/" — used for
    // document-relative hrefs.
    string wangzhanxiangduilujin = url.Substring(0, url.LastIndexOf("/") + 1);

    // Rewrite root-relative href forms to absolute before matching.
    string productioncontent = htmlcode
        .Replace("href=\"/", "href=\"" + wangzhanyuming)
        .Replace("href='/", "href='" + wangzhanyuming)
        .Replace("href=/", "href=" + wangzhanyuming)
        .Replace("href=\"./", "href=\"" + wangzhanyuming);

    MatchCollection mc = Regex.Matches(productioncontent, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
    foreach (Match m in mc)
    {
        MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        if (mc1.Count > 0)
        {
            // href already absolute.
            addlinks(mc1, weburllistzx);
        }
        else if (m.Value.IndexOf("javascript") == -1)
        {
            // href was document-relative: prefix the current directory
            // and re-scan for an absolute URL. javascript: links are skipped.
            string amstr = m.Value
                .Replace("href=\"", "href=\"" + wangzhanxiangduilujin)
                .Replace("href='", "href='" + wangzhanxiangduilujin);
            addlinks(Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline), weburllistzx);
        }
    }
    return weburllistzx;
}

/// <summary>
/// Strips quote/angle/semicolon residue from each matched URL token and
/// appends it to the list if not already present (preserves first-seen order).
/// </summary>
private static void addlinks(MatchCollection matches, List<string> links)
{
    foreach (Match m1 in matches)
    {
        string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
        if (!links.Contains(linkurlstr))
        {
            links.Add(linkurlstr);
        }
    }
}
|
这块的技术其实就是简单地使用了正则去匹配。接下来献上获取标题,以及存储到 XML 文件的方法。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
/// <summary>
/// Writes the extracted hyperlinks to an XML file. Each link becomes an
/// element named after its domain suffix (see getdomain: "com", "net", …,
/// or "other") whose text is the URL itself.
/// </summary>
/// <param name="strurl">URL the links were extracted from (recorded in an XML comment).</param>
/// <param name="alhyperlinks">Links to persist.</param>
/// <param name="path">Target file; defaults to the original hard-coded location.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks, string path = @"d:\hyperlinks.xml")
{
    // using guarantees the file handle is released even if a write throws
    // (the original leaked the writer on failure).
    using (XmlTextWriter writer = new XmlTextWriter(path, Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strurl + "的超链接");
        // Outer <hyperlinks> plus an inner, timestamped <hyperlinks> —
        // kept as in the original output format.
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());
        foreach (string str in alhyperlinks)
        {
            // Element name = domain suffix, element text = the URL.
            writer.WriteElementString(getdomain(str), null, str);
        }
        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
/// <summary>
/// Returns the domain suffix of a URL: "com", "net", "cn", "org" or "gov"
/// (case preserved as found), or "other" when none of these appears
/// followed by a slash.
/// </summary>
/// <param name="strurl">URL to inspect.</param>
/// <returns>The bare suffix without dot or slash, or "other".</returns>
private static string getdomain(string strurl)
{
    // Grab e.g. ".com/" anywhere in the URL, case-insensitively.
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);
    // Drop the leading dot and the trailing slash, leaving the bare suffix.
    string retval = Regex.Replace(suffix.ToString(), @"\.|/$", "");
    return retval == "" ? "other" : retval;
}
/// <summary>
/// Extracts a page title from HTML. Starts with the &lt;title&gt; text,
/// then prefers the &lt;h1&gt; text when the title begins with it —
/// the h1 is usually cleaner (no site-name suffix).
/// </summary>
/// <param name="html">Raw HTML to search.</param>
/// <returns>The best title found, or the empty string.</returns>
private static string gettitle(string html)
{
    const string clearfilter = @"<.*?>"; // strips any inner markup

    string title = "";
    Match titlematch = Regex.Match(html, @"<title>[\s\S]*?</title>", RegexOptions.IgnoreCase);
    if (titlematch.Success)
    {
        title = Regex.Replace(titlematch.Value, clearfilter, "");
    }

    Match h1match = Regex.Match(html, @"<h1.*?>.*?</h1>", RegexOptions.IgnoreCase);
    if (h1match.Success)
    {
        string h1 = Regex.Replace(h1match.Value, clearfilter, "");
        // Only trust the h1 when the full title starts with it.
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }

    return title;
}
|
这就是所用的全部方法,还是有很多需要改进之处!大家如果有发现不足之处还请指出,谢谢!
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。