简简单单C#爬虫小计

时间:2023-03-10 02:36:38
简简单单C#爬虫小计
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks; namespace 正则
{
/// <summary>
/// Console crawler: downloads a listing page, follows every link whose href
/// contains "article", extracts the page title and the first
/// <c>&lt;div class="content"&gt;</c> block from each linked page, and writes
/// them to numbered text files in a "data" directory under the current
/// working directory.
/// </summary>
class Program
{
    // The patterns never change, so build them once instead of once per link.
    private static readonly Regex HrefRegex = new Regex("(?<=href=\").*?(?=\")");
    private static readonly Regex TitleRegex = new Regex("(?<=title>).*?(?=</title>)");
    private static readonly Regex ContentRegex = new Regex("<div class=\"content\">[\\s\\S]*?</div>");

    /// <summary>
    /// Entry point. Crawls the listing page and saves each article it links to.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Uri listingUri = new Uri("http://www.admin5.com/browse/177/");
        string html = GetHtml(listingUri.AbsoluteUri, Encoding.UTF8);

        // CreateDirectory is a no-op when the directory already exists,
        // so it is safe to call once up front without an Exists check.
        string dataDirectory = Path.Combine(Directory.GetCurrentDirectory(), "data");
        Directory.CreateDirectory(dataDirectory);

        int fileIndex = 1;
        foreach (Match link in HrefRegex.Matches(html))
        {
            if (!link.Value.Contains("article"))
            {
                continue;
            }

            // The href may be relative; resolve it against the listing page so
            // the request below always receives an absolute http(s) URL.
            Uri articleUri;
            if (!Uri.TryCreate(listingUri, link.Value, out articleUri)
                || (articleUri.Scheme != Uri.UriSchemeHttp && articleUri.Scheme != Uri.UriSchemeHttps))
            {
                continue;
            }

            Console.WriteLine(articleUri.AbsoluteUri);
            Console.WriteLine("抓取内容");

            string content;
            try
            {
                content = GetHtml(articleUri.AbsoluteUri, Encoding.UTF8);
            }
            catch (WebException ex)
            {
                // One unreachable article should not abort the whole crawl.
                Console.WriteLine("抓取失败: " + ex.Message);
                continue;
            }

            Match titleMatch = TitleRegex.Match(content);
            Match contentMatch = ContentRegex.Match(content);
            if (!titleMatch.Success || !contentMatch.Success)
            {
                // Page does not have the expected structure; nothing to save.
                Console.WriteLine("未找到标题或正文，跳过");
                continue;
            }

            Console.WriteLine("保存数据");
            string filePath = Path.Combine(dataDirectory, fileIndex + ".txt");
            File.WriteAllText(filePath, titleMatch.Value + "\r\n" + contentMatch.Value);
            fileIndex++;
            Console.WriteLine("保存成功");
        }

        Console.WriteLine("ok");
        Console.ReadKey();
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <param name="encoding">
    /// Encoding used to decode the response body; UTF-8 is used when null.
    /// </param>
    /// <returns>The full response body as a string.</returns>
    /// <exception cref="WebException">The request failed.</exception>
    private static string GetHtml(string url, Encoding encoding)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Dispose the response, stream and reader so the underlying connection
        // is released even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding ?? Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}
}

  简简单单C#爬虫小计

简简单单C#爬虫小计