Crawling URLs by keyword with Python

Date: 2023-03-08 16:10:37

Python web crawler: search Baidu with a user-supplied keyword, then scrape the URLs of the search results.

Development environment: Windows 7 + Python 3.6.3

Language: Python

IDE: PyCharm

Third-party package: lxml 4.0 is required. We specifically need the etree module from lxml; a bare or incomplete lxml install will raise errors.
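If lxml is missing, it can be installed with pip. Pinning 4.0.0 matches the version the post names; any later 4.x release also ships etree:

    pip install lxml==4.0.0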

Enough talk, here's the code:

For now the scraped data is saved in TXT format; later I'll try saving to an Excel spreadsheet and a database (a rough sketch of both follows the main script below).

import requests, time
from lxml import etree


def Redirect(url):
    # Follow Baidu's redirect wrapper to recover the real destination URL.
    try:
        res = requests.get(url, timeout=10)
        url = res.url
    except Exception as e:
        print('Redirect failed:', e)
        time.sleep(1)
    return url


def baidu_search(wd, pn_max, sav_file_name):
    # Search Baidu for keyword wd, walk pn_max result pages, and append
    # each resolved result URL to sav_file_name.
    url = 'http://www.baidu.com/s'
    return_set = set()
    for page in range(pn_max):
        pn = page * 10  # Baidu paginates in steps of 10 results
        querystring = {'wd': wd, 'pn': pn}
        headers = {
            'pragma': 'no-cache',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.8',
            'upgrade-insecure-requests': '1',
            'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'cache-control': "no-cache",
            'connection': "keep-alive",
        }
        try:
            response = requests.get(url, headers=headers, params=querystring)
            print('Fetched:', response.url)
            selector = etree.HTML(response.text, parser=etree.HTMLParser(encoding='utf-8'))
        except Exception as e:
            print('Page failed to load:', e)
            continue
        with open(sav_file_name, 'a+') as f:
            # Each organic result on a page carries a sequential numeric id,
            # so result i on this page has id pn + i.
            for i in range(1, 11):
                try:
                    context = selector.xpath('//*[@id="' + str(pn + i) + '"]/h3/a[1]/@href')
                    real_url = Redirect(context[0])
                    print('context=' + context[0])
                    print('real_url=' + real_url)
                    f.write(real_url + '\n')
                    return_set.add(real_url)
                except Exception as e:
                    print(i, return_set)
                    print('Parse failed:', e)
    return return_set


if __name__ == '__main__':
    wd = '网络贷款'  # the search keyword ("online loans")
    pn = 100         # number of result pages to crawl
    save_file_name = 'save_url_soup.txt'
    return_set = baidu_search(wd, pn, save_file_name)
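As promised above, here is a rough sketch of the Excel and database saving I plan to try next. This is an illustration, not part of the original script: the helper names save_to_excel and save_to_sqlite are placeholders of my own, and it assumes the openpyxl package is installed (sqlite3 ships with Python). Both take the return_set produced by baidu_search:

import sqlite3
from openpyxl import Workbook


def save_to_excel(url_set, xlsx_name='save_url.xlsx'):
    # One URL per row in a single-column worksheet.
    wb = Workbook()
    ws = wb.active
    ws.append(['url'])
    for u in sorted(url_set):
        ws.append([u])
    wb.save(xlsx_name)


def save_to_sqlite(url_set, db_name='save_url.db'):
    # PRIMARY KEY on url deduplicates across repeated crawls.
    conn = sqlite3.connect(db_name)
    conn.execute('CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)')
    conn.executemany('INSERT OR IGNORE INTO urls VALUES (?)',
                     [(u,) for u in url_set])
    conn.commit()
    conn.close()


save_to_excel(return_set)
save_to_sqlite(return_set)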