毕业设计准备做搜索方面,所以开始写爬虫程序,想法是这样,从一个网站开始,抓内容,分析页面,获取页面所有链接,将链接放到UrlList列表,然后索引,一直不断循环。这星期一直在学习多线程,下面是抓取页面内容的代码,先做个备忘先。
开始事件,以及线程函数
private
void
Start_Click(
object
sender, EventArgs e)
{
startUrl = StartUrl.Text.Trim();
// 获取网页内容的线程数
if ( ! string .IsNullOrEmpty(ThreadCount.Text.Trim()))
{
threadCount = int .Parse(ThreadCount.Text.Trim());
}
else
{
threadCount = 1 ;
}
// 获取链接线程数
if ( ! string .IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
{
getUrlThreadCount = int .Parse(GetUrlThreadCount.Text.Trim());
}
else
{
getUrlThreadCount = 1 ;
}
if (startUrl == null )
{
MessageBox.Show( " 请输入链接地址 " );
return ;
}
else
{
Regex re = new Regex( @" http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? " );
if ( ! re.Match(startUrl).Success)
{
MessageBox.Show( " 链接格式错误 " );
return ;
}
else
{
urllist.Url.Add(startUrl);
urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl), 0 );
}
}
urllist.Url.Add( " http://www.hao123.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.hao123.com " ), 0 );
urllist.Url.Add( " http://www.zhku.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.com " ), 0 );
urllist.Url.Add( " http://www.sina.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.sina.com " ), 0 );
urllist.Url.Add( " http://www.zhku.edu.cn " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.edu.cn " ), 0 );
urllist.Url.Add( " http://www.39.net " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.39.net " ), 0 );
urllist.Url.Add( " http://www.cnblogs.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.cnblogs.com " ), 0 );
urllist.Url.Add( " http://www.google.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.google.com " ), 0 );
Thread[] threadPool = new Thread[threadCount];
int count = 3 ;
for ( int i = 0 ; i < count; i ++ )
{
threadPool[i] = new Thread( new ParameterizedThreadStart(GetPageContent));
threadPool[i].Start(i);
}
while ( true )
{
if ( ! threadPool[ 0 ].IsAlive && ! threadPool[ 1 ].IsAlive && ! threadPool[ 2 ].IsAlive)
{
listBox1.DataSource = PageContentList.PageContents;
threadPool[ 0 ].Abort();
threadPool[ 1 ].Abort();
threadPool[ 2 ].Abort();
break ;
}
}
}
public void GetPageContent( object startindex)
{
int start = ( int )startindex;
lock (urllist)
{
Monitor.Enter(urllist);
int n = 0 ;
if (urllist.IsDownload.Count < threadCount)
{
n = urllist.IsDownload.Count;
}
else
{
n = threadCount;
}
for ( int i = start; i < urllist.IsDownload.Count; i = i + n)
{
if (i > urllist.IsDownload.Count)
{
break ;
}
string key = Encrypt.MD5EncryptStr(urllist.Url[i]);
if (( int )urllist.IsDownload[key] == 0 )
{
urllist.IsDownload[key] = 1 ;
string sb = null ;
WebClient client = new WebClient();
Byte[] read = new Byte[ 1024 ];
read = client.DownloadData(urllist.Url[i].Trim().ToString());
System.Text.Encoding encoder = System.Text.Encoding.GetEncoding( " GB2312 " );
sb += encoder.GetString(read, 0 , read.Length);
lock (PageContentList)
{
Monitor.Enter(PageContentList);
PageContentList.PageContents.Add(sb);
PageContentList.IsAnalyse.Add(Encrypt.MD5EncryptStr(sb), 0 );
PageContentList.IsIndexed.Add(Encrypt.MD5EncryptStr(sb), 0 );
Monitor.Pulse(PageContentList);
Monitor.Exit(PageContentList);
}
}
}
Monitor.Pulse(urllist);
Monitor.Exit(urllist);
}
Thread.Sleep( 500 );
}
{
startUrl = StartUrl.Text.Trim();
// 获取网页内容的线程数
if ( ! string .IsNullOrEmpty(ThreadCount.Text.Trim()))
{
threadCount = int .Parse(ThreadCount.Text.Trim());
}
else
{
threadCount = 1 ;
}
// 获取链接线程数
if ( ! string .IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
{
getUrlThreadCount = int .Parse(GetUrlThreadCount.Text.Trim());
}
else
{
getUrlThreadCount = 1 ;
}
if (startUrl == null )
{
MessageBox.Show( " 请输入链接地址 " );
return ;
}
else
{
Regex re = new Regex( @" http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? " );
if ( ! re.Match(startUrl).Success)
{
MessageBox.Show( " 链接格式错误 " );
return ;
}
else
{
urllist.Url.Add(startUrl);
urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl), 0 );
}
}
urllist.Url.Add( " http://www.hao123.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.hao123.com " ), 0 );
urllist.Url.Add( " http://www.zhku.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.com " ), 0 );
urllist.Url.Add( " http://www.sina.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.sina.com " ), 0 );
urllist.Url.Add( " http://www.zhku.edu.cn " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.edu.cn " ), 0 );
urllist.Url.Add( " http://www.39.net " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.39.net " ), 0 );
urllist.Url.Add( " http://www.cnblogs.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.cnblogs.com " ), 0 );
urllist.Url.Add( " http://www.google.com " );
urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.google.com " ), 0 );
Thread[] threadPool = new Thread[threadCount];
int count = 3 ;
for ( int i = 0 ; i < count; i ++ )
{
threadPool[i] = new Thread( new ParameterizedThreadStart(GetPageContent));
threadPool[i].Start(i);
}
while ( true )
{
if ( ! threadPool[ 0 ].IsAlive && ! threadPool[ 1 ].IsAlive && ! threadPool[ 2 ].IsAlive)
{
listBox1.DataSource = PageContentList.PageContents;
threadPool[ 0 ].Abort();
threadPool[ 1 ].Abort();
threadPool[ 2 ].Abort();
break ;
}
}
}
public void GetPageContent( object startindex)
{
int start = ( int )startindex;
lock (urllist)
{
Monitor.Enter(urllist);
int n = 0 ;
if (urllist.IsDownload.Count < threadCount)
{
n = urllist.IsDownload.Count;
}
else
{
n = threadCount;
}
for ( int i = start; i < urllist.IsDownload.Count; i = i + n)
{
if (i > urllist.IsDownload.Count)
{
break ;
}
string key = Encrypt.MD5EncryptStr(urllist.Url[i]);
if (( int )urllist.IsDownload[key] == 0 )
{
urllist.IsDownload[key] = 1 ;
string sb = null ;
WebClient client = new WebClient();
Byte[] read = new Byte[ 1024 ];
read = client.DownloadData(urllist.Url[i].Trim().ToString());
System.Text.Encoding encoder = System.Text.Encoding.GetEncoding( " GB2312 " );
sb += encoder.GetString(read, 0 , read.Length);
lock (PageContentList)
{
Monitor.Enter(PageContentList);
PageContentList.PageContents.Add(sb);
PageContentList.IsAnalyse.Add(Encrypt.MD5EncryptStr(sb), 0 );
PageContentList.IsIndexed.Add(Encrypt.MD5EncryptStr(sb), 0 );
Monitor.Pulse(PageContentList);
Monitor.Exit(PageContentList);
}
}
}
Monitor.Pulse(urllist);
Monitor.Exit(urllist);
}
Thread.Sleep( 500 );
}
链接列表类,页面内容类
public
class
UrlList
{
private List < string > url = new List < string > ();
private Hashtable isDownload = new Hashtable();
/// <summary>
/// 下载链接
/// </summary>
public List < string > Url
{
get { return url; }
set { url = value; }
}
/// <summary>
/// 是否为已下载链接0为否,1为是
/// </summary>
public Hashtable IsDownload
{
get { return isDownload; }
set { isDownload = value; }
}
}
public class PageContent
{
private List < string > pageContents = new List < string > ();
private Hashtable isAnalyse = new Hashtable();
private Hashtable isIndexed = new Hashtable();
/// <summary>
/// 页面内容列表
/// </summary>
public List < string > PageContents
{
get { return pageContents; }
set { pageContents = value; }
}
/// <summary>
/// 是否分析了页面,0为否,1为是
/// </summary>
public Hashtable IsAnalyse
{
get { return isAnalyse; }
set { isAnalyse = value; }
}
/// <summary>
/// 是否对页面进行索引,0为否,1为是
/// </summary>
public Hashtable IsIndexed
{
get { return isIndexed; }
set { isIndexed = value; }
}
}
{
private List < string > url = new List < string > ();
private Hashtable isDownload = new Hashtable();
/// <summary>
/// 下载链接
/// </summary>
public List < string > Url
{
get { return url; }
set { url = value; }
}
/// <summary>
/// 是否为已下载链接0为否,1为是
/// </summary>
public Hashtable IsDownload
{
get { return isDownload; }
set { isDownload = value; }
}
}
public class PageContent
{
private List < string > pageContents = new List < string > ();
private Hashtable isAnalyse = new Hashtable();
private Hashtable isIndexed = new Hashtable();
/// <summary>
/// 页面内容列表
/// </summary>
public List < string > PageContents
{
get { return pageContents; }
set { pageContents = value; }
}
/// <summary>
/// 是否分析了页面,0为否,1为是
/// </summary>
public Hashtable IsAnalyse
{
get { return isAnalyse; }
set { isAnalyse = value; }
}
/// <summary>
/// 是否对页面进行索引,0为否,1为是
/// </summary>
public Hashtable IsIndexed
{
get { return isIndexed; }
set { isIndexed = value; }
}
}
上面的页面都可以抓取到,可是点击开始后就好卡,不知道什么原因,各路高人帮忙看看,指点一下。小弟感激不尽!