Asp 使用 Microsoft.XMLHTTP 抓取网页内容。并过滤须要的内容

Asp 使用 Microsoft.XMLHTTP 抓取网页内容无乱码处理，并过滤须要的内容

演示样例源代码：

<%

 Dim xmlUrl,http,strHTML,strBody

 xmlUrl = Request.QueryString("u")

 REM 异步读取XML源

 Set http = server.CreateObject("Microsoft.XMLHTTP")

 http.Open "POST",xmlUrl,false

 http.setrequestheader "User-Agent", "Mozilla/4.0"

 http.setrequestheader "Connection", "Keep-Alive"

 http.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"

 http.Send()

 strHTML = BytesToBstr(http.ResponseBody)

 set http = nothing

 REM 抓取主要内容

 strBody = GetBody(strHTML,"<div id=""Div_newsContentc"" class=""cnt"">","</div>",0,0)

 strBody =Replace(strBody,"（本文首发于","")

 strBody =Replace(strBody,"財富动力网</a>。转载请注明出处。）","")

 strBody =Replace(strBody,"本文首发于，转载请注明出处。

）","")

 strBody =Replace(strBody,"財富动力网</a>:http://www.927953.com","")

 strBody =Replace(strBody,"本文首发于","") 

 Response.Write RegRemoveHref(strBody)

REM 获取相应网址响应的HTML

Function BytesToBstr(body)

    dim objstream

    set objstream = Server.CreateObject("adodb.stream")

    objstream.Type = 1

    objstream.Mode =3

    objstream.Open

    objstream.Write body

    objstream.Position = 0

    objstream.Type = 2

    objstream.Charset = "UTF-8"

    '转换原来默认的UTF-8编码转换成GB2312编码。否则直接用

    'XMLHTTP调用有中文字符的网页得到的将是乱码

    BytesToBstr = objstream.ReadText

    objstream.Close

    set objstream = nothing

End Function

REM 使用正則表達式。抓取标签之内的的内容

Function GetBody(ConStr,StartStr,OverStr,IncluL,IncluR)

   If ConStr="$False$" or ConStr="" or IsNull(ConStr)=True Or StartStr="" or IsNull(StartStr)=True Or OverStr="" or IsNull(OverStr)=True Then

      GetBody="$False$"

      Exit Function

   End If

   Dim ConStrTemp

   Dim Start,Over

   ConStrTemp=Lcase(ConStr)

   StartStr=Lcase(StartStr)

   OverStr=Lcase(OverStr)

   Start = InStrB(1, ConStrTemp, StartStr, vbBinaryCompare)

   If Start<=0 then

      GetBody="$False$"

      Exit Function

   Else

      If IncluL=False Then

         Start=Start+LenB(StartStr)

      End If

   End If

   Over=InStrB(Start,ConStrTemp,OverStr,vbBinaryCompare)

   If Over<=0 Or Over<=Start then

      GetBody="$False$"

      Exit Function

   Else

      If IncluR=True Then

         Over=Over+LenB(OverStr)

      End If

   End If

   GetBody=MidB(ConStr,Start,Over-Start)

End Function

REM 过滤a超链接

Function RegRemoveHref(HTMLstr)

     Dim ClsTempLoseStr,regEx

     ClsTempLoseStr = Cstr(HTMLstr)

     Set regEx = New RegExp

     regEx.Pattern = "<(\/){0,1}a[^<>]*>"

     regEx.IgnoreCase = True

     regEx.Global = True

     ClsTempLoseStr = regEx.Replace(ClsTempLoseStr,"")

     RegRemoveHref = ClsTempLoseStr

     Set regEx = Nothing

End Function

%>

效果图例如以下：

Asp 使用 Microsoft.XMLHTTP 抓取网页内容无乱码处理，并过滤须要的内容

watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQveWltaXl1YW5nZ3Vhbmc=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast" alt="" />

秒客网

Asp 使用 Microsoft.XMLHTTP 抓取网页内容无乱码处理，并过滤须要的内容

Asp 使用 Microsoft.XMLHTTP 抓取网页内容。并过滤须要的内容

相关文章