这是我面试的题目,借用了很多网上同仁的代码,如有冒犯,请海涵!
现在工作真难找,我应聘的单位是http://www.027dns.net/,希望公司经理能给我上班的机会,我会很努力的,因为软件行业才是我的世界!我一个大学本科生当保安都成了同事们的笑话了,呵呵。
这是我第一次求职软件行业,第一次做面试题目,第一次自己这么认真写博客文章,写的不好,大家请指正,我会进步的!
张素丰,转载请注明出处http://www.cnblogs.com/zhangsufeng/archive/2009/02/28/1400224.html
屁话少说,正文开始:
假如我们采集网址:http://info.laser.hc360.com/list/z_news_yw.shtml 上的新闻,要求采集标题、时间、内容、单篇文章如果有翻页则采集完全。
这种类型的采集就是从指定网页获得新闻列表(即url),然后通过其url获得新闻详情,这是一种很常见的采集方式,有可能到很多页面上去采集,所以我们可以采用接口来构造基类。
首先定义 IGatherInfo.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ClassLibrary
{
    /// <summary>
    /// News-gathering (web scraping) interface: one implementation per target site.
    /// </summary>
    interface IGatherInfo
    {
        /// <summary>
        /// Timestamp of the gathering run (formatted string, see DateToString()).
        /// </summary>
        string gatherTime
        {
            get;
            set;
        }

        /// <summary>
        /// Extracts the news-article URLs from a listing page.
        /// NewsListUrl: address of the listing page.
        /// RegexString: regular expression / extraction rule.
        /// Returns the list of news-article URLs found.
        /// </summary>
        List<string> GatherUrlList(string NewsListUrl, string RegexString);

        // Downloads each news URL and extracts title / time / body.
        List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString);
    }
}
接口定义了三个成员:gatherTime采集时间,GatherUrlList()从指定网址抽取新闻Url,GatherNewsDetail()读取新闻详细内容。
下面我们分析一下该程序中可能要用的公共方法,定义在 GatherInfoBase.cs
1.时间转换函数string DateToString()
/// <summary>
/// Returns the current local date/time as a compact 14-character
/// timestamp in the form yyyyMMddHHmmss (e.g. "20090228153045").
/// Used as the gatherTime value when storing gathered URLs.
/// </summary>
/// <returns>Zero-padded timestamp string.</returns>
public string DateToString()
{
    // The format string does all the zero-padding the original did by hand
    // (six ToString() calls plus Length checks).
    return DateTime.Now.ToString("yyyyMMddHHmmss");
}
2.获取远程文件源代码 string GetRemoteHtmlCode(string url)
/// <summary>
/// Downloads the raw source of a remote page.
/// </summary>
/// <param name="Url">Absolute URL of the page to fetch.</param>
/// <returns>The page source decoded with the system default encoding,
/// or null if the request fails (original best-effort contract).</returns>
public string GetRemoteHtmlCode(string Url)
{
    string s = null;
    try
    {
        // System.Net.WebClient replaces the MSXML2 COM interop dependency
        // (no reference to X:\windows\system32\msxml2.dll needed).
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            byte[] data = client.DownloadData(Url);
            s = System.Text.Encoding.Default.GetString(data);
        }
    }
    catch (Exception)
    {
        // Best effort, matching the original: any failure (bad URL, network
        // error) yields null; callers must handle a null result.
    }
    return s;
}
3.从HtmlCode截取字符串 string SniffwebCode(string code, string wordsBegin, string wordsEnd),用于抽取标题,时间,正文
#region SniffwebCode — extract the text between two delimiters (title, time, body)
/// <summary>
/// Extracts the text found between two delimiters (regex fragments),
/// e.g. a page title between "&lt;h1&gt;" and "&lt;/h1&gt;".
/// </summary>
/// <param name="code">HTML source to search.</param>
/// <param name="wordsBegin">Opening delimiter (treated as a regex fragment).</param>
/// <param name="wordsEnd">Closing delimiter (treated as a regex fragment).</param>
/// <returns>The LAST match found (original behavior), or "" if none.</returns>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    string result = "";
    // RegexOptions.Compiled was dropped: compiling a regex that is rebuilt
    // on every call costs more than it saves.
    Regex regex = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd,
                            RegexOptions.IgnoreCase);
    for (Match match = regex.Match(code); match.Success; match = match.NextMatch())
    {
        result = match.Groups["title"].Value; // keep overwriting → last match wins
    }
    return result;
}
#endregion
4.替换HTML源代码 string RemoveHTML(string HtmlCode),用于将抽取到的正文内容去Html
/// <summary>
/// Strips all HTML tags from a fragment, leaving only the text.
/// Used to clean the extracted article body.
/// </summary>
/// <param name="HtmlCode">HTML fragment.</param>
/// <returns>The fragment with every &lt;...&gt; tag removed.</returns>
public string RemoveHTML(string HtmlCode)
{
    // One regex replace instead of the original match-then-String.Replace
    // loop, which rescanned the whole string once per tag (O(n²)).
    // Same pattern, so single-line tags only — identical match behavior.
    return Regex.Replace(HtmlCode, "<.+?>", "");
}
// Alternative stripper that also decodes common HTML entities.
// NOTE(review): the original comment claimed image links are preserved, but
// the pasted tag-stripping pattern (index 1) was corrupted beyond recovery;
// a general <...> pattern is used here — confirm whether <img> should be kept.
public string RemoveHTML2(string strHtml)
{
    // Patterns and replacements are parallel arrays: aryReg[i] -> aryRep[i].
    string[] aryReg =
    {
        @"<script[^>]*?>.*?</script>", // whole <script> blocks
        @"<[^>]*>",                    // any remaining tag (repaired pattern)
        @"([\r\n])[\s]+",              // collapse whitespace runs after newlines
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1", // chr(161)
        "\xa2", // chr(162)
        "\xa3", // chr(163)
        "\xa9", // chr(169)
        "",
        "\r\n",
        ""
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        strOutput = regex.Replace(strOutput, aryRep[i]);
    }

    // BUG FIX: String.Replace returns a NEW string; the original discarded
    // these return values, so the final cleanup never took effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}
5.更改文件名方法string changFileName(string filename, string addStr),利用其分页规律定义其增加的字符,
/// <summary>
/// Inserts a suffix before the file extension, used to build pagination
/// URLs: ("a.php", "-1") -> "a-1.php".
/// BUG FIX: a name without an extension used to come back as "" (the input
/// was lost); now the suffix is simply appended ("page" -> "page-1").
/// </summary>
/// <param name="filename">File name or URL ending in a file name.</param>
/// <param name="addStr">Suffix to insert before the extension.</param>
public string changFileName(string filename, string addStr)
{
    // LastIndexOf replaces the original Split('.')/rejoin loop.
    int dot = filename.LastIndexOf('.');
    if (dot <= 0)
    {
        // No extension (or only a leading dot): append the suffix directly.
        return filename + addStr;
    }
    return filename.Substring(0, dot) + addStr + filename.Substring(dot);
}
6.获取页面连接
/// <summary>
/// Collects the href targets in an HTML fragment and prefixes each with
/// urlHead, returning the absolute URLs.
/// </summary>
/// <param name="HtmlCode">HTML fragment to scan.</param>
/// <param name="urlHead">Prefix (site root) prepended to every href value.</param>
/// <returns>List of urlHead + href for every href found.</returns>
public List<string> getHrefList(string HtmlCode, string urlHead)
{
    List<string> matches = new List<string>();
    // Capture the href value in a named group instead of lower-casing the
    // whole match and peeling off 'href="' by string surgery — URL paths
    // can be case-sensitive, so the original ToLower() corrupted them.
    Regex reg = new Regex(@"href\s*=\s*[""']?(?<url>[\w\\/.:_-]+)",
                          RegexOptions.IgnoreCase);
    foreach (Match m in reg.Matches(HtmlCode))
    {
        matches.Add(urlHead + m.Groups["url"].Value.Trim());
    }
    return matches;
}
接下来我们定义一个NewsDetail.cs
/// <summary>
/// One gathered news article (URL, title, publish time, body text).
/// </summary>
public class NewsDetail
{
    /// <summary>Source URL of the article.</summary>
    public string strUrl { get; set; }
    /// <summary>Article title.</summary>
    public string title { get; set; }
    /// <summary>Publish/update time as scraped from the page.</summary>
    public string upTime { get; set; }
    /// <summary>Article body (all pages concatenated, HTML stripped).</summary>
    public string contents { get; set; }
}
因为程序比较小,所以我采用access来存取数据,创建GatherInfo_laser_hc360.db,添加两个表
GatherUrls:strUrl 备注,strGahterTime 文本
GatherInfos:strUrl 备注,upTime 文本,title 文本,content 备注
用数据集实现数据连接,代码中可见。
最后我们来实现对所给网址的采集,直接给出代码
/// <summary>
/// Gatherer for the HC360 laser-channel news listing:
/// http://info.laser.hc360.com/list/z_news_yw.shtml
/// Collects article URLs from the listing page, then downloads each
/// article (following its pagination) and stores the results.
/// </summary>
public class laser_hc360 : GatherInfoBase, IGatherInfo
{
    public void Dispose()
    {
        // No unmanaged resources held; kept for the original contract.
        GC.SuppressFinalize(this);
    }

    #region IGatherInfo members

    private string _gatherTime;
    private string _newsListUrl = @"http://info.laser.hc360.com/list/z_news_yw.shtml";
    private string _regexString = "";

    /// <summary>Timestamp of the current gathering run.</summary>
    public string gatherTime
    {
        get { return _gatherTime; }
        set { _gatherTime = value; }
    }

    /// <summary>
    /// Gathers the news URLs from the listing page and writes them to the
    /// GatherUrls table.
    /// </summary>
    /// <returns>true when the run completes.</returns>
    public bool aGatherUrlsList()
    {
        List<string> urlsList = this.GatherUrlList();
        gatherTime = this.DateToString();
        using (ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter())
        {
            foreach (string str in urlsList)
            {
                try
                {
                    ta.InsertNewsUrl(str, gatherTime);
                }
                catch
                {
                    // Best effort: a failed insert (e.g. duplicate URL from a
                    // previous run) is skipped so the rest still gets stored.
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Gathers every article's details and writes them to the GatherInfos table.
    /// </summary>
    /// <returns>true when the run completes.</returns>
    public bool aGatherNewsDetails()
    {
        List<NewsDetail> newsDtl = this.GatherNewsDetail();
        using (ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter())
        {
            foreach (NewsDetail nd in newsDtl)
            {
                try
                {
                    ta.InsertNewsDetail(nd.strUrl, nd.title, nd.upTime, nd.contents);
                }
                catch
                {
                    // Best effort: skip rows that fail to insert and continue.
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Extracts the article URLs from the default listing page.
    /// </summary>
    /// <returns>List of article URLs.</returns>
    public List<string> GatherUrlList()
    {
        return GatherUrlList(_newsListUrl, _regexString);
    }

    public List<string> GatherUrlList(string NewsListUrl, string RegexString)
    {
        string HtmlCode = GetRemoteHtmlCode(NewsListUrl);
        if (string.IsNullOrEmpty(HtmlCode))
        {
            // Download failed: return an empty list instead of throwing on IndexOf.
            return new List<string>();
        }
        // The listing table sits between these character offsets.
        // NOTE(review): hard-coded offsets are brittle — a marker-based
        // search would survive page-layout changes.
        int i = HtmlCode.IndexOf("<tr>", 2858);
        int j = HtmlCode.IndexOf("</tr>", 3830);
        // BUG FIX: Substring takes (startIndex, LENGTH); the original passed
        // the absolute end position (+6) as the length, overshooting the slice.
        HtmlCode = HtmlCode.Substring(i, j - i + "</tr>".Length);
        string urlHead = @"http://info.laser.hc360.com";
        return getHrefList(HtmlCode, urlHead);
    }

    public List<NewsDetail> GatherNewsDetail()
    {
        return GatherNewsDetail(GatherUrlList(), _regexString);
    }

    public List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString)
    {
        List<NewsDetail> newsdetail = new List<NewsDetail>();
        foreach (string str in NewsUrlList)
        {
            string HtmlCode = GetRemoteHtmlCode(str);
            NewsDetail nd = new NewsDetail();
            nd.strUrl = str;
            nd.title = SniffwebCode(HtmlCode, "<h1>", "</h1>");
            nd.upTime = SniffwebCode(HtmlCode, "<span id=\"endData\">", "</span>");
            nd.contents = SniffwebCode(HtmlCode, "<div id=\"artical\">", "</div>");

            // Follow pagination (a.shtml -> a-2.shtml, a-3.shtml, ...) until
            // the site serves its "page not found" notice. Replaces the
            // original isMore flag that was reset to true every iteration.
            for (int page = 2; ; page++)
            {
                string pageUrl = changFileName(str, "-" + page.ToString());
                string htmlcode = GetRemoteHtmlCode(pageUrl);
                if (htmlcode == null || htmlcode.Contains("对不起,您查找的页面不存在!5秒钟后将自动跳转。"))
                {
                    break;
                }
                nd.contents += SniffwebCode(htmlcode, "<div id=\"artical\">", "</div>");
            }
            // NOTE(review): StripHTML is not defined in the visible base
            // class — confirm it exists (RemoveHTML looks like the intended helper).
            nd.contents = StripHTML(nd.contents);
            newsdetail.Add(nd);
        }
        return newsdetail;
    }

    #endregion
}