多线程的一点东西。。

毕业设计准备做搜索方面，所以开始写爬虫程序，想法是这样，从一个网站开始，抓内容，分析页面，获取页面所有链接，将链接放到UrlList列表，然后索引，一直不断循环。这星期一直在学习多线程，下面是抓取页面内容的代码，先做个备忘先。

开始事件，以及线程函数

         private void Start_Click( object sender, EventArgs e)
        {
            startUrl = StartUrl.Text.Trim();
             // 获取网页内容的线程数
             if ( ! string .IsNullOrEmpty(ThreadCount.Text.Trim()))
            {
                threadCount = int .Parse(ThreadCount.Text.Trim());
            }
             else
            {
                threadCount = 1 ;
            }
             // 获取链接线程数
             if ( ! string .IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
            {
                getUrlThreadCount = int .Parse(GetUrlThreadCount.Text.Trim());
            }
             else
            {
                getUrlThreadCount = 1 ;
            }

             if (startUrl == null )
            {
                MessageBox.Show( " 请输入链接地址 " );
                 return ;
            }
             else
            {
                Regex re = new Regex( @" http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? " );
                 if ( ! re.Match(startUrl).Success)
                {
                    MessageBox.Show( " 链接格式错误 " );
                     return ;
                }
                 else
                {
                    urllist.Url.Add(startUrl);
                    urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl), 0 );
                }
            }

            urllist.Url.Add( " http://www.hao123.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.hao123.com " ), 0 );

            urllist.Url.Add( " http://www.zhku.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.com " ), 0 );

            urllist.Url.Add( " http://www.sina.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.sina.com " ), 0 );

            urllist.Url.Add( " http://www.zhku.edu.cn " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.zhku.edu.cn " ), 0 );

            urllist.Url.Add( " http://www.39.net " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.39.net " ), 0 );

            urllist.Url.Add( " http://www.cnblogs.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.cnblogs.com " ), 0 );

            urllist.Url.Add( " http://www.google.com " );
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr( " http://www.google.com " ), 0 );

            Thread[] threadPool = new Thread[threadCount];
             int count = 3 ;

             for ( int i = 0 ; i < count; i ++ )
            {
                threadPool[i] = new Thread( new ParameterizedThreadStart(GetPageContent));
                threadPool[i].Start(i);
            }

                        while ( true )
            {
                 if ( ! threadPool[ 0 ].IsAlive && ! threadPool[ 1 ].IsAlive && ! threadPool[ 2 ].IsAlive)
                {
                    listBox1.DataSource = PageContentList.PageContents;
                    threadPool[ 0 ].Abort();
                    threadPool[ 1 ].Abort();
                    threadPool[ 2 ].Abort();
                     break ;
                }
            }
        }

         public void GetPageContent( object startindex)
        {
             int start = ( int )startindex;
             lock (urllist)
            {
                Monitor.Enter(urllist);
                 int n = 0 ;
                 if (urllist.IsDownload.Count < threadCount)
                {
                    n = urllist.IsDownload.Count;
                }
                 else
                {
                    n = threadCount;
                }
                 for ( int i = start; i < urllist.IsDownload.Count; i = i + n)
                {
                     if (i > urllist.IsDownload.Count)
                    {
                         break ;
                    }
                     string key = Encrypt.MD5EncryptStr(urllist.Url[i]);

                     if (( int )urllist.IsDownload[key] == 0 )
                    {
                        urllist.IsDownload[key] = 1 ;
                         string sb = null ;

                        WebClient client = new WebClient();
                        Byte[] read = new Byte[ 1024 ];
                        read = client.DownloadData(urllist.Url[i].Trim().ToString());

                        System.Text.Encoding encoder = System.Text.Encoding.GetEncoding( " GB2312 " );
                        sb += encoder.GetString(read, 0 , read.Length);

                         lock (PageContentList)
                        {
                            Monitor.Enter(PageContentList);
                            PageContentList.PageContents.Add(sb);
                            PageContentList.IsAnalyse.Add(Encrypt.MD5EncryptStr(sb), 0 );
                            PageContentList.IsIndexed.Add(Encrypt.MD5EncryptStr(sb), 0 );
                            Monitor.Pulse(PageContentList);
                            Monitor.Exit(PageContentList);
                        }
                    }
                }
                Monitor.Pulse(urllist);
                Monitor.Exit(urllist);
            }
            Thread.Sleep( 500 );
        }

链接列表类，页面内容类

     public class UrlList
    {
         private List < string > url = new List < string > ();
         private Hashtable isDownload = new Hashtable();

         /// <summary>
         /// 下载链接
         /// </summary>
         public List < string > Url
        {
             get { return url; }
             set { url = value; }
        }

         /// <summary>
         /// 是否为已下载链接0为否，1为是
         /// </summary>
         public Hashtable IsDownload
        {
             get { return isDownload; }
             set { isDownload = value; }
        }
    }

     public class PageContent
    {
         private List < string > pageContents = new List < string > ();
         private Hashtable isAnalyse = new Hashtable();
         private Hashtable isIndexed = new Hashtable();

         /// <summary>
         /// 页面内容列表
         /// </summary>
         public List < string > PageContents
        {
             get { return pageContents; }
             set { pageContents = value; }
        }
         /// <summary>
         /// 是否分析了页面，0为否，1为是
         /// </summary>
         public Hashtable IsAnalyse
        {
             get { return isAnalyse; }
             set { isAnalyse = value; }
        }
         /// <summary>
         /// 是否对页面进行索引，0为否，1为是
         /// </summary>
         public Hashtable IsIndexed
        {
             get { return isIndexed; }
             set { isIndexed = value; }
        }
    }

上面的页面都可以抓取到，可是点击开始后就好卡，不知道什么原因，各路高人帮忙看看，指点一下。小弟感激不尽！

秒客网

多线程的一点东西。。

相关文章