Tutorial: Fetching JS-Rendered HTML Content with CasperJS in Python

Date: 2022-01-03 12:55:02

Abstract: Strictly speaking, CasperJS has no direct tie to Python here; the heavy lifting is done by CasperJS driving the PhantomJS WebKit engine to fetch the rendered HTML into a file. Scraping HTML pages generated by client-side JavaScript has long been notoriously difficult for crawlers. Java has HtmlUnit for this; in Python, we can use the standalone, cross-platform CasperJS.
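
The pipeline in miniature: Python shells out to CasperJS, CasperJS drives PhantomJS's WebKit engine to execute the page's JavaScript, and the rendered HTML lands in a file that Python reads back. A minimal sketch of that round trip (assuming casperjs is on your PATH and that site.js, shown below, sits in the current directory; fetch_rendered_html is just an illustrative name):

import subprocess

# Hypothetical helper: render `url` through CasperJS/PhantomJS and return
# the post-JavaScript HTML. Assumes `casperjs` is on PATH and site.js
# (shown below) is in the current working directory.
def fetch_rendered_html(url, outputfile='temp.html'):
    subprocess.check_call(['casperjs', 'site.js',
                           '--url=%s' % url,
                           '--outputfile=%s' % outputfile])
    with open(outputfile) as f:
        return f.read()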

    Create site.js (the interface script; input: a URL, output: an HTML file)

//USAGE: E:\toolkit\n1k0-casperjs-e3a77d0\bin>python casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile='temp.html'

var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug",  // log level
    verbose: true       // also echo log messages to the console
});

var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');

// Request the page, then dump the rendered HTML to the output file.
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});

casper.run();
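
Note that casper.cli.raw.get returns command-line options as raw, uncast strings, so --url and --outputfile arrive exactly as typed; fs.write(outputfile, this.getHTML(), 'w') then overwrites any existing file of that name with the fully rendered markup.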

    Python code, checkout_proxy.py

import json
import sys
#import requests
#import requests.utils, pickle
from bs4 import BeautifulSoup
import os.path, os
import threading
#from multiprocessing import Process, Manager
from datetime import datetime
import traceback
import logging
import re, random
import subprocess
import shutil
import platform


output_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'proxy.txt')
global_log = 'http_proxy' + datetime.now().strftime('%Y-%m-%d') + '.log'
if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs')):
    os.mkdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs'))
global_log = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs', global_log)

logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s] [%(levelname)s] [%(module)s] [%(funcName)s] [%(lineno)d] %(message)s',
                    filename=global_log, filemode='a')
log = logging.getLogger(__name__)
#manager = Manager()
#PROXY_LIST = manager.list()
mutex = threading.Lock()
PROXY_LIST = []


def isWindows():
    if "Windows" in str(platform.uname()):
        return True
    else:
        return False


def getTagsByAttrs(tagName, pageContent, attrName, attrRegValue):
    soup = BeautifulSoup(pageContent, 'html.parser')
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})


def getTagsByAttrsExt(tagName, filename, attrName, attrRegValue):
    if os.path.isfile(filename):
        f = open(filename, 'r')
        soup = BeautifulSoup(f, 'html.parser')
        f.close()
        return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
    else:
        return None


class Site1Thread(threading.Thread):
    def __init__(self, outputFilePath):
        threading.Thread.__init__(self)
        self.outputFilePath = outputFilePath
        self.fileName = str(random.randint(100, 1000)) + ".html"
        self.setName('Site1Thread')

    def run(self):
        # Make sure site.js sits next to the casperjs executable.
        site1_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'site.js')
        site2_file = os.path.join(self.outputFilePath, 'site.js')
        if not os.path.isfile(site2_file) and os.path.isfile(site1_file):
            shutil.copy(site1_file, site2_file)
        #proc = subprocess.Popen(["bash","-c", "cd %s && ./casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile=%s" % (self.outputFilePath,self.fileName) ],stdout=subprocess.PIPE)
        if isWindows():
            proc = subprocess.Popen(["cmd", "/c", "%s/casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile=%s" % (self.outputFilePath, self.fileName)], stdout=subprocess.PIPE)
        else:
            proc = subprocess.Popen(["bash", "-c", "cd %s && ./casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile=%s" % (self.outputFilePath, self.fileName)], stdout=subprocess.PIPE)
        out = proc.communicate()[0]
        htmlFileName = ''
        # The output path is not deterministic on Windows, so probe every likely location.
        if os.path.isfile(self.fileName):
            htmlFileName = self.fileName
        elif os.path.isfile(os.path.join(self.outputFilePath, self.fileName)):
            htmlFileName = os.path.join(self.outputFilePath, self.fileName)
        elif os.path.isfile(os.path.join(os.path.dirname(os.path.realpath(__file__)), self.fileName)):
            htmlFileName = os.path.join(os.path.dirname(os.path.realpath(__file__)), self.fileName)
        if not os.path.isfile(htmlFileName):
            print 'Failed to get html content from http://spys.ru/free-proxy-list/IE/'
            print out
            sys.exit(3)
        mutex.acquire()
        # Proxy entries on the page are <font class="spy14">ip<...>port</font>.
        PROXYList = getTagsByAttrsExt('font', htmlFileName, 'class', 'spy14$')
        for proxy in PROXYList:
            tdContent = proxy.renderContents()
            lineElems = re.split('[<>]', tdContent)
            # Keep the pair only if the last field looks like a port and the first like an IP.
            if re.compile(r'\d+').search(lineElems[-1]) and re.compile(r'(\d+\.\d+\.\d+)').search(lineElems[0]):
                print lineElems[0], lineElems[-1]
                PROXY_LIST.append("%s:%s" % (lineElems[0], lineElems[-1]))
        mutex.release()
        try:
            if os.path.isfile(htmlFileName):
                os.remove(htmlFileName)
        except:
            pass


if __name__ == '__main__':
    try:
        if len(sys.argv) < 2:
            print "Usage:%s [casperjs path]" % (sys.argv[0])
            sys.exit(1)
        if not os.path.exists(sys.argv[1]):
            print "casperjs path: %s does not exist!" % (sys.argv[1])
            sys.exit(2)
        # Seed PROXY_LIST with previously collected proxies so results accumulate.
        if os.path.isfile(output_file):
            f = open(output_file)
            lines = f.readlines()
            f.close()
            for line in lines:
                PROXY_LIST.append(line.strip())
        thread1 = Site1Thread(sys.argv[1])
        thread1.start()
        thread1.join()

        # Deduplicate and persist.
        f = open(output_file, 'w')
        for proxy in set(PROXY_LIST):
            f.write(proxy + "\n")
        f.close()
        print "Done!"
    except SystemExit:
        pass
    except:
        errMsg = traceback.format_exc()
        print errMsg
        log.error(errMsg)
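
To run the whole pipeline, pass the directory containing the casperjs executable (the same bin directory as in the USAGE comment above), for example:

python checkout_proxy.py E:\toolkit\n1k0-casperjs-e3a77d0\bin

Each run seeds PROXY_LIST from any existing proxy.txt, merges in the freshly scraped entries, and writes the deduplicated set back to proxy.txt next to the script; a dated debug log goes to the logs/ directory.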