这是我面试的题目,借用了很多网上同仁的代码,如有冒犯,请海涵!
现在工作真难找,我应聘的单位是http://www.027dns.net/,希望公司经理能给我上班的机会,我会很努力的,因为软件行业才是我的世界!我一个大学本科生当保安都成了同事们的笑话了,呵呵。
这是我第一次求职软件行业,第一次做面试题目,第一次自己这么认真写博客文章,写的不好,大家请指正,我会进步的!
张素丰,转载请注明出处http://www.cnblogs.com/zhangsufeng/archive/2009/02/28/1400224.html
屁话少说,正文开始:
假如我们采集网址:http://info.laser.hc360.com/list/z_news_yw.shtml 上的新闻,要求采集标题、时间、内容、单篇文章如果有翻页则采集完全。
这种类型的采集就是从指定网页获得新闻列表(即url),然后通过其url获得新闻详情,这是一种很常见的采集方式,有可能到很多页面上去采集,所以我们可以采用接口来构造基类。
首先定义 IGatherInfo.cs

Code
1
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ClassLibrary
{
    /// <summary>
    /// Contract for a news-gathering (web-scraping) class: it records
    /// when a run happened, extracts article URLs from a listing page,
    /// and downloads the full article behind each URL.
    /// </summary>
    interface IGatherInfo
    {
        /// <summary>
        /// Timestamp of the gathering run (stored as a formatted string).
        /// </summary>
        string gatherTime
        {
            get;
            set;
        }

        /// <summary>
        /// Extracts the news-article URLs from a listing page.
        /// </summary>
        /// <param name="NewsListUrl">Address of the listing page.</param>
        /// <param name="RegexString">Regular expression describing the extraction logic.</param>
        /// <returns>The list of article URLs found on the page.</returns>
        List<string> GatherUrlList(string NewsListUrl, string RegexString);

        /// <summary>
        /// Downloads title, time and body for every URL in the list.
        /// </summary>
        List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString);
    }
}
31
接口定义了三个成员:gatherTime采集时间,GatherUrlList()从指定网址抽取新闻Url,GatherNewsDetail()读取新闻详细内容。
下面我们分析一下该程序中可能要用的公共方法,定义在 GatherInfoBase.cs
1.时间转换函数string DateToString()

Code
/// <summary>
/// Formats the current local time as a 14-digit "yyyyMMddHHmmss"
/// string (e.g. "20090228153012"); used as the gather timestamp.
/// </summary>
/// <returns>The current time as a zero-padded 14-character digit string.</returns>
public string DateToString()
{
    // A single custom format string replaces the original field-by-field
    // manual zero-padding; InvariantCulture keeps the digits and the
    // Gregorian calendar independent of the machine's culture settings.
    return DateTime.Now.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);
}
2.获取远程文件源代码 string GetRemoteHtmlCode(string url)

Code
1
/// <summary>
/// Downloads the raw HTML source of a remote page.
/// </summary>
/// <param name="Url">Address of the remote page.</param>
/// <returns>
/// The page source decoded with the system default encoding (kept from
/// the original MSXML-based version, since the target site serves
/// Chinese-encoded pages), or null when the download fails — callers
/// treat null as "fetch failed".
/// </returns>
public string GetRemoteHtmlCode(string Url)
{
    try
    {
        // WebClient replaces the original MSXML2.XMLHTTP COM object,
        // so no reference to X:\windows\system32\msxml2.dll is needed.
        using (var client = new System.Net.WebClient())
        {
            byte[] body = client.DownloadData(Url);
            // Decode ourselves (like the original responseBody handling)
            // instead of DownloadString, to keep the same byte->string step.
            return System.Text.Encoding.Default.GetString(body);
        }
    }
    catch (Exception)
    {
        // Preserve the original best-effort contract: any failure
        // (bad URL, network error, HTTP error) yields null.
        // TODO: add logging so failures are at least visible.
        return null;
    }
}
22
3.从HtmlCode截取字符串 string SniffwebCode(string code, string wordsBegin, string wordsEnd),用于抽取标题,时间,正文

Code
1
#region Extract the text between two markers (title / time / body)
/// <summary>
/// Returns the text found between <paramref name="wordsBegin"/> and
/// <paramref name="wordsEnd"/> inside <paramref name="code"/>.
/// When the pattern occurs several times the LAST occurrence wins
/// (this preserves the original loop, which overwrote the result on
/// every match); an empty string is returned when nothing matches.
/// </summary>
/// <param name="code">HTML source to search.</param>
/// <param name="wordsBegin">Literal text that opens the wanted span.</param>
/// <param name="wordsEnd">Literal text that closes the wanted span.</param>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    string result = "";
    // Regex.Escape guards against delimiters containing regex
    // metacharacters ('(', '+', '?', ...); the plain HTML tags the
    // callers pass are unaffected, so behaviour is unchanged for them.
    Regex regex1 = new Regex(
        Regex.Escape(wordsBegin) + @"(?<title>[\s\S]+?)" + Regex.Escape(wordsEnd),
        RegexOptions.Compiled | RegexOptions.IgnoreCase);
    foreach (Match match1 in regex1.Matches(code))
    {
        result = match1.Groups["title"].Value; // keep overwriting -> last match
    }
    return result;
}
#endregion
13
4.替换HTML源代码 string RemoveHTML(string HtmlCode),用于将抽取到的正文内容去Html

Code
1
/// <summary>
/// Strips every HTML tag ("&lt;...&gt;" on a single line) from the
/// source and returns the remaining text.
/// </summary>
/// <param name="HtmlCode">HTML fragment to clean.</param>
/// <returns>The input with all tag matches removed.</returns>
public string RemoveHTML(string HtmlCode)
{
    // One Regex.Replace pass removes every match directly, instead of
    // the original match-then-String.Replace loop (which rescanned the
    // whole string once per match). Same pattern, same result.
    return Regex.Replace(HtmlCode, "<.+?>", "");
}
10
11
12
/// <summary>
/// A more thorough HTML cleaner than <c>RemoveHTML</c>: strips script
/// blocks and tags, collapses whitespace after line breaks, and decodes
/// the common HTML entities (&amp;quot; &amp;amp; &amp;lt; &amp;gt;
/// &amp;nbsp; and numeric references).
/// </summary>
/// <param name="strHtml">HTML fragment to clean.</param>
/// <returns>The plain-text residue of the fragment.</returns>
public string RemoveHTML2(string strHtml)
{
    string[] aryReg =
    {
        @"<script[^>]*?>.*?</script>",
        // NOTE(review): this tag pattern was garbled by the blog engine
        // in the original post ("file://" fragments, '\' turned into '/');
        // restored here to the widely circulated canonical form of this
        // snippet — confirm against the author's original source.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };

    // Replacement for the pattern at the same index above.
    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1", // chr(161)
        "\xa2", // chr(162)
        "\xa3", // chr(163)
        "\xa9", // chr(169)
        "",
        "\r\n",
        ""
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        strOutput = regex.Replace(strOutput, aryRep[i]);
    }

    // BUG FIX: String.Replace returns a NEW string; the original code
    // discarded these three results, so stray '<', '>' and CR/LF
    // sequences survived in the output.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}
67
5.更改文件名方法string changFileName(string filename, string addStr),利用其分页规律定义其增加的字符,

Code
1
/// <summary>
/// Inserts <paramref name="addStr"/> between a file name and its
/// extension, e.g. ("a.php", "-1") -> "a-1.php". Used to build the
/// URLs of an article's follow-up pages from its first-page URL.
/// </summary>
/// <param name="filename">File name or URL, normally carrying an extension.</param>
/// <param name="addStr">Suffix to splice in before the extension.</param>
/// <returns>
/// The renamed file. A name without an extension gets the suffix
/// appended at the end (the original returned "" in that case, which
/// no visible caller relied on — they all pass ".shtml" URLs).
/// </returns>
public string changFileName(string filename, string addStr)
{
    // LastIndexOf replaces the original Split('.')/rejoin round trip;
    // for any name with a dot past position 0 the result is identical.
    int dot = filename.LastIndexOf('.');
    if (dot > 0)
    {
        return filename.Substring(0, dot) + addStr + filename.Substring(dot);
    }
    return filename + addStr;
}
19
6.获取页面连接

Code
1
/// <summary>
/// Collects every href target in the given HTML fragment and returns
/// the links lower-cased and prefixed with <paramref name="urlHead"/>.
/// </summary>
/// <param name="HtmlCode">HTML fragment (a listing-page region) to scan.</param>
/// <param name="urlHead">Site prefix, e.g. "http://info.laser.hc360.com".</param>
/// <returns>List of absolute URLs; may contain duplicates.</returns>
public List<string> getHrefList(string HtmlCode, string urlHead)
{
    List<string> MatchVale = new List<string>();
    string Reg = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)('|""| *|>)?";
    foreach (Match m in Regex.Matches(HtmlCode, Reg))
    {
        // BUG FIX: read the URL straight from capture group 6 (the URL
        // alternation in the pattern) instead of the original string
        // surgery, which only stripped a lower-case href=" prefix and
        // double quotes — single-quoted links kept "href='" attached.
        string url = m.Groups[6].Value.ToLower().Trim();
        MatchVale.Add(urlHead + url);
    }
    return MatchVale;
}
15
接下来我们定义一个NewsDeatil.cs

Code
1
/// <summary>
/// One gathered news article: the URL it was scraped from, its
/// headline, its publication time and its (pagination-merged) body.
/// Plain public fields are kept so existing callers compile unchanged.
/// </summary>
public class NewsDetail
{
    public string strUrl;    // article URL the data was scraped from
    public string title;     // headline text
    public string upTime;    // publication time as shown on the page
    public string contents;  // article body, all follow-up pages concatenated
}
因为程序比较小,所以我采用access来存取数据,创建GatherInfo_laser_hc360.db,添加两个表
GatherUrls:strUrl 备注,strGahterTime 文本
GatherInfos:strUrl 备注,upTime 文本,title 文本,content 备注
用数据集实现数据连接,代码中可见。
最后我们来实现对所给网址的采集,直接给出代码

Code
1
/// <summary>
/// Gatherer for the hc360.com laser-channel news listing:
/// http://info.laser.hc360.com/list/z_news_yw.shtml
/// Pulls the article URLs from the listing page, downloads each
/// article (following its "-2", "-3", ... pagination pages) and
/// stores everything through the Access table adapters.
/// </summary>
public class laser_hc360 : GatherInfoBase, IGatherInfo
{
    public void Dispose()
    {
        GC.SuppressFinalize(this);
    }

    #region IGatherInfo members

    private string _gatherTime;
    private string _newsListUrl = @"http://info.laser.hc360.com/list/z_news_yw.shtml";
    private string _regexString = "";

    /// <summary>Timestamp of the current gathering run.</summary>
    public string gatherTime
    {
        get { return _gatherTime; }
        set { _gatherTime = value; }
    }

    /// <summary>
    /// Extracts the news URLs from the listing page and writes them to
    /// the GatherUrls table.
    /// </summary>
    /// <returns>Always true once the run has finished.</returns>
    public bool aGatherUrlsList()
    {
        List<string> urlsList = this.GatherUrlList();
        gatherTime = this.DateToString();
        using (ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter())
        {
            foreach (string str in urlsList)
            {
                try
                {
                    ta.InsertNewsUrl(str, gatherTime);
                }
                catch
                {
                    // Best effort: a duplicate URL violates the key and the
                    // row is skipped. TODO(review): log other failures
                    // instead of swallowing them silently.
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Downloads every gathered article and writes it to the
    /// GatherInfos table.
    /// </summary>
    /// <returns>Always true once the run has finished.</returns>
    public bool aGatherNewsDetails()
    {
        List<NewsDetail> newsDtl = this.GatherNewsDetail();
        using (ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter())
        {
            foreach (NewsDetail nd in newsDtl)
            {
                try
                {
                    ta.InsertNewsDetail(nd.strUrl, nd.title, nd.upTime, nd.contents);
                }
                catch
                {
                    // Best effort, same reasoning as aGatherUrlsList.
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Extracts the article URLs from the configured listing page.
    /// </summary>
    /// <returns>List of article URLs.</returns>
    public List<string> GatherUrlList()
    {
        return GatherUrlList(_newsListUrl, _regexString);
    }

    public List<string> GatherUrlList(string NewsListUrl, string RegexString)
    {
        string HtmlCode = GetRemoteHtmlCode(NewsListUrl);
        // Magic offsets 2858/3830 locate the news table inside the page
        // source — brittle; breaks as soon as the site changes layout.
        int i = HtmlCode.IndexOf("<tr>", 2858);
        int j = HtmlCode.IndexOf("</tr>", 3830);
        // BUG FIX: Substring's second argument is a LENGTH; the original
        // passed "j + 6" (an end offset plus slack), which either grabbed
        // far too much text or threw ArgumentOutOfRangeException.
        HtmlCode = HtmlCode.Substring(i, j - i + "</tr>".Length);
        string urlHead = @"http://info.laser.hc360.com";
        List<string> returnList = getHrefList(HtmlCode, urlHead);
        return returnList;
    }

    public List<NewsDetail> GatherNewsDetail()
    {
        return GatherNewsDetail(GatherUrlList(), _regexString);
    }

    public List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString)
    {
        List<NewsDetail> newsdetail = new List<NewsDetail>();
        foreach (string str in NewsUrlList)
        {
            string HtmlCode = GetRemoteHtmlCode(str);
            NewsDetail nd = new NewsDetail();
            nd.strUrl = str;
            nd.title = SniffwebCode(HtmlCode, "<h1>", "</h1>");
            nd.upTime = SniffwebCode(HtmlCode, "<span id=\"endData\">", "</span>");
            nd.contents = SniffwebCode(HtmlCode, "<div id=\"artical\">", "</div>");

            // Follow the article's extra pages (a.shtml -> a-2.shtml, -3, ...)
            // until the site answers with its "page not found" notice.
            // Replaces the original inverted isMore flag juggling.
            for (int page = 2; ; page++)
            {
                string pageUrl = changFileName(str, "-" + page.ToString());
                string htmlcode = GetRemoteHtmlCode(pageUrl);
                // ROBUSTNESS: a failed fetch returns null; treat it like
                // "no more pages" instead of throwing NullReferenceException.
                if (htmlcode == null || htmlcode.Contains("对不起,您查找的页面不存在!5秒钟后将自动跳转。"))
                {
                    break;
                }
                nd.contents += SniffwebCode(htmlcode, "<div id=\"artical\">", "</div>");
            }
            // NOTE(review): StripHTML is not defined anywhere in this
            // listing — presumably it is RemoveHTML/RemoveHTML2 from
            // GatherInfoBase; confirm against the full project source.
            nd.contents = StripHTML(nd.contents);
            newsdetail.Add(nd);
        }
        return newsdetail;
    }

    #endregion
}
131