Python 爬虫学习

#coding:utf-8

#author:Blood_Zero

'''

    1、获取网页信息

    2、解决编码问题，通过charset库(默认不安装这个库文件)

'''

import re
import urllib
import urllib2

# Page that the examples in this script fetch.
url = "http://192.168.1.135/myself/"

# Open the page and dump its body. The handle is closed in ``finally`` so it
# is released even if reading or printing fails part-way through.
html = urllib.urlopen(url)
try:
    content = html.read()
    print(content)

    # If the page is served in another encoding the dump above is garbled;
    # transcode it first, e.g. content.decode('gbk').encode('utf-8').

    # ---- Basic information about the response ----

    # URL that was actually retrieved.
    print("当前URL："+str(html.geturl()))

    # Status code of the response (html.getcode() is the accessor form).
    print("当前状态码："+str(html.code))

    # Response headers (html.info() is the accessor form).
    print("当前头信息：\n"+str(html.headers))

    # "charset" parameter taken from the response headers.
    print("当前网站使用编码："+str(html.info().getparam("charset")))
finally:
    html.close()

# Save the page source to a local file. This issues a second, separate
# download of ``url``.
urllib.urlretrieve(url,"E:\\Python_Code\\pyTools\\url.txt")

'''

    模拟浏览器访问网址

'''

# Approach 1: create the Request first, then attach headers one by one.
req = urllib2.Request(url)

# NOTE(review): "Get" is not a standard HTTP request header; it is kept
# because the script prints the header list below. Confirm whether it is needed.
req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0")
req.add_header("Get",url)
req.add_header("Host","192.168.1.135")

new_html = urllib2.urlopen(req)
try:
    print(new_html.read())
finally:
    # Release the response even if reading or printing fails.
    new_html.close()

# Show the headers that were attached to the request object.
print(req.headers.items())

# Approach 2: pass all headers as a dict when constructing the Request.
myheader={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Host":"192.168.1.135",
    "Get":url
}

req1 = urllib2.Request(url,headers=myheader)

new_html_1 = urllib2.urlopen(req1)
try:
    print(new_html_1.read())
finally:
    # Release the response even if reading or printing fails.
    new_html_1.close()

print(req1.headers.items())

'''

    在网页中查询指定文件

'''

def get_content(url):
    """Download *url* and return the response body.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.urlopen``.

    Returns:
        The value of ``read()`` on the opened response.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Close the handle even when read() raises, so a failed download
        # does not leak it.
        response.close()

def get_file(self):
    """Print and return the ``.php`` link targets found in an HTML string.

    Args:
        self: HTML text to scan. Despite its name this is an ordinary
            argument of a module-level function, not an instance; the name
            is kept so existing callers are unaffected.

    Returns:
        A list with one captured string per ``a href=....php`` occurrence,
        in document order (empty when nothing matches). The capture starts
        immediately after ``href=``, so a quote character that opens the
        attribute value is part of the captured text.
    """
    # Non-greedy capture of everything after "a href=" up to the first ".php".
    pat = re.compile(r'a href=(.+?\.php)')
    file_code = pat.findall(self)

    # Same console output as before: the list repr followed by a blank line.
    print(str(file_code)+"\n")
    return file_code

# Download the directory page, then list the PHP files it links to.
listing_url = "http://192.168.1.135/myself/SQL_Injection/"
info = get_content(listing_url)
get_file(info)
秒客网

Python 爬虫学习

相关文章