多线程的一点东西……

时间:2023-02-09 04:06:27

毕业设计准备做搜索方面的课题,所以开始写爬虫程序。想法是这样:从一个网站开始,抓取内容,分析页面,获取页面中的所有链接,将链接放到 UrlList 列表,然后建立索引,如此不断循环。这星期一直在学习多线程,下面是抓取页面内容的代码,先做个备忘。

开始事件,以及线程函数

         private   void  Start_Click( object  sender, EventArgs e)
        {
            startUrl 
=  StartUrl.Text.Trim();
            
// 获取网页内容的线程数
             if  ( ! string .IsNullOrEmpty(ThreadCount.Text.Trim()))
            {
                threadCount 
=   int .Parse(ThreadCount.Text.Trim());
            }
            
else
            {
                threadCount 
=   1 ;
            }
            
// 获取链接线程数
             if  ( ! string .IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
            {
                getUrlThreadCount 
=   int .Parse(GetUrlThreadCount.Text.Trim());
            }
            
else
            {
                getUrlThreadCount 
=   1 ;
            }

            
if  (startUrl  ==   null )
            {
                MessageBox.Show(
" 请输入链接地址 " );
                
return ;
            }
            
else
            {
                Regex re 
=   new  Regex( @" http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? " );
                
if  ( ! re.Match(startUrl).Success)
                {
                    MessageBox.Show(
" 链接格式错误 " );
                    
return ;
                }
                
else
                {
                    urllist.Url.Add(startUrl);
                    urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl),
0 );
                }
            }


            urllist.Url.Add(
" http://www.hao123.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.hao123.com " ),  0 );

            urllist.Url.Add(
" http://www.zhku.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.zhku.com " ),  0 );

            urllist.Url.Add(
" http://www.sina.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.sina.com " ),  0 );

            urllist.Url.Add(
" http://www.zhku.edu.cn " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.zhku.edu.cn " ),  0 );

            urllist.Url.Add(
" http://www.39.net " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.39.net " ),  0 );

            urllist.Url.Add(
" http://www.cnblogs.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.cnblogs.com " ),  0 );

            urllist.Url.Add(
" http://www.google.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
" http://www.google.com " ),  0 );

            Thread[] threadPool 
=   new  Thread[threadCount];
            
int  count = 3 ;

            
for  ( int  i  =   0 ; i  <  count; i ++ )
            {
                threadPool[i] 
=   new  Thread( new  ParameterizedThreadStart(GetPageContent));
                threadPool[i].Start(i);
            }

                       
while  ( true )
            {
                
if  ( ! threadPool[ 0 ].IsAlive  &&   ! threadPool[ 1 ].IsAlive  &&   ! threadPool[ 2 ].IsAlive)
                {
                    listBox1.DataSource 
=  PageContentList.PageContents;
                    threadPool[
0 ].Abort();
                    threadPool[
1 ].Abort();
                    threadPool[
2 ].Abort();
                    
break ;
                }
            }
        }

        
/// <summary>
        /// Worker thread body: downloads every not-yet-downloaded URL assigned to
        /// this worker and appends the decoded page text to PageContentList.
        /// </summary>
        /// <remarks>
        /// Worker k handles indices k, k + n, k + 2n, ... where n is the smaller of
        /// the URL count and threadCount. The urllist lock is held only while a URL
        /// is being claimed, never during the download, so workers run in parallel.
        /// </remarks>
        /// <param name="startindex">Boxed int: the first URL index this worker handles.</param>
        public void GetPageContent(object startindex)
        {
            int start = (int)startindex;

            int stride;
            lock (urllist)
            {
                stride = Math.Min(urllist.IsDownload.Count, threadCount);
            }

            // Nothing to do, and a zero stride would never advance the loop below.
            if (stride < 1)
            {
                return;
            }

            // Encoding name without surrounding spaces; a padded name is not a
            // registered encoding name.
            System.Text.Encoding encoder = System.Text.Encoding.GetEncoding("GB2312");

            for (int i = start; ; i += stride)
            {
                string url = null;

                // Claim the URL atomically so no two workers download the same one.
                lock (urllist)
                {
                    if (i >= urllist.IsDownload.Count || i >= urllist.Url.Count)
                    {
                        break;
                    }

                    string candidate = urllist.Url[i];
                    string key = Encrypt.MD5EncryptStr(candidate);
                    object state = urllist.IsDownload[key];

                    // A missing flag is skipped instead of failing on the unboxing cast.
                    if (state != null && (int)state == 0)
                    {
                        urllist.IsDownload[key] = 1;
                        url = candidate;
                    }
                }

                if (url == null)
                {
                    continue;
                }

                byte[] read;
                try
                {
                    using (WebClient client = new WebClient())
                    {
                        read = client.DownloadData(url.Trim());
                    }
                }
                catch (WebException)
                {
                    // One unreachable site must not take the worker down. The URL
                    // stays flagged 1 so it is not retried by another worker.
                    continue;
                }

                string content = encoder.GetString(read, 0, read.Length);
                string contentKey = Encrypt.MD5EncryptStr(content);

                lock (PageContentList)
                {
                    PageContentList.PageContents.Add(content);

                    // Two URLs can return identical content; Hashtable.Add would
                    // throw on the repeated key, so register the flags only once.
                    if (!PageContentList.IsAnalyse.ContainsKey(contentKey))
                    {
                        PageContentList.IsAnalyse.Add(contentKey, 0);
                    }
                    if (!PageContentList.IsIndexed.ContainsKey(contentKey))
                    {
                        PageContentList.IsIndexed.Add(contentKey, 0);
                    }
                }
            }
        }

 

链接列表类,页面内容类

 

     public   class  UrlList
    {
        
private  List < string >  url  =   new  List < string > ();
        
private  Hashtable isDownload  =   new  Hashtable();

        
///   <summary>
        
///  下载链接
        
///   </summary>
         public  List < string >  Url
        {
            
get  {  return  url; }
            
set  { url  =  value; }
        }

        
///   <summary>
        
///  是否为已下载链接0为否,1为是
        
///   </summary>
         public  Hashtable IsDownload
        {
            
get  {  return  isDownload; }
            
set  { isDownload  =  value; }
        }
    }

    
/// <summary>
    /// Holds downloaded page texts plus two flag tables tracking whether each
    /// page has been analysed and indexed.
    /// </summary>
    public class PageContent
    {
        private List<string> _pages;
        private Hashtable _analysedFlags;
        private Hashtable _indexedFlags;

        /// <summary>
        /// Creates an instance with an empty page list and empty flag tables.
        /// </summary>
        public PageContent()
        {
            _pages = new List<string>();
            _analysedFlags = new Hashtable();
            _indexedFlags = new Hashtable();
        }

        /// <summary>
        /// List of page contents.
        /// </summary>
        public List<string> PageContents
        {
            get
            {
                return _pages;
            }
            set
            {
                _pages = value;
            }
        }

        /// <summary>
        /// Whether a page has been analysed: 0 = no, 1 = yes.
        /// </summary>
        public Hashtable IsAnalyse
        {
            get
            {
                return _analysedFlags;
            }
            set
            {
                _analysedFlags = value;
            }
        }

        /// <summary>
        /// Whether a page has been indexed: 0 = no, 1 = yes.
        /// </summary>
        public Hashtable IsIndexed
        {
            get
            {
                return _indexedFlags;
            }
            set
            {
                _indexedFlags = value;
            }
        }
    }

 

上面列出的页面都可以抓取到,可是点击"开始"后界面就很卡,不知道是什么原因。请各路高人帮忙看看,指点一下,小弟感激不尽!