c# http请求ajax页面

　　我们在用Http请求的时候，某些页面是ajax加载的，所以请求过来的页面数据不完整。也就是说ajax局部加载数据的地方，我们请求不到，这时候该怎么办呢？

　　WebDriver+phantomjs 这两个组合在一起使用，可以完成此任务。分别简单介绍下：WebDriver是一个前端的自动化测试框架，phantomjs是一个无界面的浏览器，基于webkit。WebDriver调用phantomjs.exe工作。下面是WebDriver提供的API，看来它能驱动各种浏览器工作。

　　 c# http请求ajax页面

　　使用前准备：

在Nuget上，下载 Selenium.WebDriver和Selenium.PhantomJS.WebDriver两个包，在项目中引用 WebDriver.dll，在输出目录下要有phantomjs.exe。

　　我们看一个完整的例子：

using OpenQA.Selenium;

using OpenQA.Selenium.PhantomJS;

using OpenQA.Selenium.Support.UI;

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Threading;

using System.Threading.Tasks;

namespace ConsoleApplication1

{

    /// <summary>
    /// Contract for a crawler that loads a single page and reports its progress
    /// through start / completed / error events.
    /// </summary>
    public interface ICrawler

    {

        /// <summary>Event whose payload carries the URI being crawled.</summary>
        event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>
        /// Event whose payload carries the URI, page source, elapsed time,
        /// thread id and the driver instance.
        /// </summary>
        event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Event whose payload carries the URI and the exception.</summary>
        event EventHandler<OnErrorEventArgs> OnError;

        /// <summary>Starts crawling <paramref name="uri"/>.</summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Script (code plus arguments) associated with the crawl.</param>
        /// <param name="opreation">Action, wait condition and timeout associated with the crawl.</param>
        /// <returns>A task; in HighCrawler it represents the background crawl.</returns>
        // NOTE(review): "opreation" looks like a misspelling of "operation"
        // (HighCrawler.Start uses the latter). Renaming it would change the
        // parameter name seen by named-argument callers, so it is left as is.
        Task Start(Uri uri, Script script, Operation opreation);

    }

    /// <summary>
    /// Bundles what a crawler should do with a page once it has been opened:
    /// an optional action, an optional wait condition and the wait timeout.
    /// </summary>
    public class Operation
    {
        /// <summary>
        /// Upper bound for waiting on <see cref="Condition"/>. HighCrawler passes
        /// this value to <c>TimeSpan.FromMilliseconds</c>, so it is in milliseconds.
        /// </summary>
        public int timeout { get; set; }

        /// <summary>
        /// Predicate handed to <c>WebDriverWait.Until</c> by HighCrawler.
        /// May be left null, in which case no wait is performed.
        /// </summary>
        public Func<IWebDriver, bool> Condition;

        /// <summary>
        /// Callback invoked with the driver by HighCrawler before it waits on
        /// <see cref="Condition"/>. May be left null.
        /// </summary>
        public Action<PhantomJSDriver> Action;
    }

    /// <summary>
    /// A script to run in the loaded page, together with its arguments.
    /// HighCrawler forwards both values to <c>ExecuteScript</c>.
    /// </summary>
    public class Script
    {
        /// <summary>Source text of the script.</summary>
        public string Code { get; set; }

        /// <summary>Arguments passed along with <see cref="Code"/>.</summary>
        public object[] Args { get; set; }
    }

    /// <summary>
    /// Payload of the crawler's start event.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so that it follows the standard .NET
    /// event pattern and satisfies the <c>EventHandler&lt;TEventArgs&gt;</c>
    /// constraint that exists on .NET Framework 4.0.
    /// </remarks>
    public class OnStartEventArgs : EventArgs
    {
        /// <summary>Address of the page that is about to be crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Creates the payload for the given address.</summary>
        /// <param name="uri">Address of the page that is about to be crawled.</param>
        public OnStartEventArgs(Uri uri)
        {
            Uri = uri;
        }
    }

    /// <summary>
    /// Payload of the crawler's error event.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so that it follows the standard .NET
    /// event pattern and satisfies the <c>EventHandler&lt;TEventArgs&gt;</c>
    /// constraint that exists on .NET Framework 4.0.
    /// </remarks>
    public class OnErrorEventArgs : EventArgs
    {
        /// <summary>Address of the page whose crawl failed.</summary>
        public Uri Uri { get; set; }

        /// <summary>The exception that was caught while crawling.</summary>
        public Exception Exception { get; set; }

        /// <summary>Creates the payload for a failed crawl.</summary>
        /// <param name="uri">Address of the page whose crawl failed.</param>
        /// <param name="ex">The exception that was caught.</param>
        public OnErrorEventArgs(Uri uri, Exception ex)
        {
            Uri = uri;
            Exception = ex;
        }
    }

    /// <summary>
    /// Payload of the crawler's completed event.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so that it follows the standard .NET
    /// event pattern and satisfies the <c>EventHandler&lt;TEventArgs&gt;</c>
    /// constraint that exists on .NET Framework 4.0. The type name is kept
    /// unchanged for compatibility with existing subscribers.
    /// </remarks>
    public class OnCompletedEvent : EventArgs
    {
        /// <summary>Address of the page that was crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Managed thread id supplied by the crawler.</summary>
        public int ThreadId { get; set; }

        /// <summary>Page source captured by the crawler.</summary>
        public string PageSource { get; private set; }

        /// <summary>Elapsed time reported by the crawler, in milliseconds.</summary>
        public long Milliseconds { get; private set; }

        /// <summary>
        /// The driver that loaded the page. HighCrawler shuts the driver down as
        /// soon as the completed handlers return, so it must only be used inside
        /// the handler.
        /// </summary>
        public PhantomJSDriver Driver { get; private set; }

        /// <summary>Creates the payload for a finished crawl.</summary>
        /// <param name="uri">Address of the page that was crawled.</param>
        /// <param name="threadId">Managed thread id of the crawling thread.</param>
        /// <param name="pageSource">Page source captured by the crawler.</param>
        /// <param name="milliseconds">Elapsed time in milliseconds.</param>
        /// <param name="driver">The driver that loaded the page.</param>
        public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
        {
            Uri = uri;
            ThreadId = threadId;
            PageSource = pageSource;
            Milliseconds = milliseconds;
            Driver = driver;
        }
    }

    /// <summary>
    /// Crawler that loads a page in PhantomJS, optionally runs a script and an
    /// action, waits for a caller-supplied condition, and then publishes the
    /// resulting page source through <see cref="OnCompleted"/>.
    /// </summary>
    public class HighCrawler : ICrawler
    {
        /// <summary>Raised on the worker thread before the driver is created.</summary>
        public event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>Raised on the worker thread once the page source has been captured.</summary>
        public event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Raised on the worker thread when driver start-up, the crawl or a completed handler throws.</summary>
        public event EventHandler<OnErrorEventArgs> OnError;

        // Assigned once in the static constructor and passed to every driver.
        private static readonly PhantomJSOptions _options;

        private static readonly PhantomJSDriverService _service;

        /// <summary>Configures the driver service and options used by every crawl.</summary>
        static HighCrawler()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            service.DiskCache = true;
            service.IgnoreSslErrors = true;
            service.HideCommandPromptWindow = true;
            service.LoadImages = false;
            service.LocalToRemoteUrlAccess = true;

            _service = service;
            _options = new PhantomJSOptions();
        }

        /// <summary>
        /// Crawls <paramref name="uri"/> on a background task.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Optional script executed after navigation; null to skip.</param>
        /// <param name="operation">
        /// Optional action, wait condition and timeout (milliseconds); null to
        /// capture the page right after navigation and script execution.
        /// </param>
        /// <returns>
        /// A task that completes when the crawl has finished and the driver has
        /// been shut down. Crawl failures are reported through
        /// <see cref="OnError"/> rather than by faulting the task.
        /// </returns>
        public Task Start(Uri uri, Script script, Operation operation)
        {
            return Task.Factory.StartNew(() =>
            {
                // Copy each event to a local first so a concurrent unsubscribe
                // cannot null the delegate between the check and the call.
                var startHandler = OnStart;
                if (startHandler != null)
                {
                    startHandler(this, new OnStartEventArgs(uri));
                }

                PhantomJSDriver driver = null;
                try
                {
                    // Created inside the try block so that a failure to start
                    // phantomjs is reported through OnError as well.
                    driver = new PhantomJSDriver(_service, _options);

                    var watch = System.Diagnostics.Stopwatch.StartNew();

                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                    {
                        driver.ExecuteScript(script.Code, script.Args);
                    }

                    if (operation != null)
                    {
                        if (operation.Action != null)
                        {
                            operation.Action.Invoke(driver);
                        }

                        if (operation.Condition != null)
                        {
                            // Wait at most operation.timeout milliseconds for the condition.
                            var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                            driverWait.Until(operation.Condition);
                        }
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;

                    // Total elapsed time. TimeSpan.Milliseconds would only give
                    // the 0-999 component and under-report anything over a second.
                    var milliseconds = watch.ElapsedMilliseconds;

                    var pageSource = driver.PageSource;

                    var completedHandler = OnCompleted;
                    if (completedHandler != null)
                    {
                        completedHandler(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                    }
                }
                catch (Exception ex)
                {
                    var errorHandler = OnError;
                    if (errorHandler != null)
                    {
                        errorHandler(this, new OnErrorEventArgs(uri, ex));
                    }
                }
                finally
                {
                    // Quit alone closes every window and ends the session. A
                    // preceding Close() could throw on a broken session and
                    // skip Quit(), leaving phantomjs running.
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
            });
        }
    }

}

　　这是封装了一个类，方便使用，我们看如何使用：

        /// <summary>
        /// Crawls a page, waits until a marker element is displayed, and prints
        /// the text of every element matched by an XPath expression.
        /// </summary>
        /// <param name="url">Address of the page to parse.</param>
        /// <param name="waitId">Id of the element to wait for, e.g. "search-main".</param>
        /// <param name="xpath">XPath of the elements to print, e.g. "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a".</param>
        /// <param name="timeout">Maximum time to wait for the marker element, in milliseconds.</param>
        /// <remarks>
        /// The crawl runs on a background task that is not awaited here, so the
        /// caller must keep the process alive until it finishes.
        /// </remarks>
        // NOTE(review): the default for "timeout" and the Thread.Sleep argument
        // were missing from the published listing; 10000 ms and 100 ms are
        // reconstructed values - confirm against the original project.
        private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
        {
            var crawler = new HighCrawler();

            crawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址：" + e.Uri.ToString());
            };

            crawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫出现错误：" + e.Uri.ToString() + ",异常信息" + e.Exception.ToString());
            };

            crawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine("接收到的源码长度：" + e.PageSource.Length);
                Thread.Sleep(100);
                Console.WriteLine("爬虫结束,花费时间：" + e.Milliseconds);

                // The driver is still open while this handler runs, so the
                // rendered DOM can be queried directly.
                var items = e.Driver.FindElements(By.XPath(xpath));
                foreach (var item in items)
                {
                    Console.WriteLine(item.Text);
                }
            };

            var operation = new Operation
            {
                // No extra page interaction is needed before waiting.
                Action = (x) =>
                {
                },
                // The AJAX content counts as loaded once the marker element is displayed.
                Condition = (x) =>
                {
                    return x.FindElement(By.Id(waitId)).Displayed;
                },
                timeout = timeout
            };

            crawler.Start(new Uri(url), null, operation);
        }

　　取ajax异步结果的核心原理：WebDriver把页面上的某个元素，作为标识，一旦出现此元素，表明ajax结束，这时候再返回结果，中间有个等待的过程。

秒客网

c# http请求ajax页面

相关文章