爬取以太坊Solidity智能合约代码的简约Python爬虫

因为网络环境的原因，以太坊的区块浏览器网站 https://etherscan.io/ 在部分地区无法直接访问，所以运行前需要先配置好能够访问该网站的网络环境（例如代理）。
此爬虫能将官网上的最新的500个智能合约爬取下来，健壮性还行。
代码直接Copy后，需要把代码中所有写死的文件路径（getsccodecore和updatescurl中的filepath，以及getsccode和main中address.txt的路径）改成自己的路径，即可执行。
运行环境建议使用Python3.6以上。
# -*- coding: utf8 -*-
# SmartContactSpider.py
import requests
from bs4 import BeautifulSoup
import traceback
import os
import time
import datetime
from sys import stdin


def printtime():
    """Print the current local time as a log-line prefix.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS:`` and is followed by
    a single space instead of a newline, so the next ``print`` call continues
    on the same line.

    Returns:
        int: Always 0.
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S:", time.localtime()), end=' ')
    return 0


def getsccodecore(eachLine, filepath="C:\\Users\\15321\\Desktop\\SmartContract\\code\\"):
    """Fetch one contract page and save its source code as ``<name>.sol``.

    Args:
        eachLine: One line of the address file, i.e. a contract page URL.
            Surrounding whitespace (including the trailing newline) is
            ignored; a blank line is skipped.
        filepath: Directory prefix (including the trailing separator) the
            ``.sol`` file is written to. Defaults to the original hard-coded
            location.

    Returns:
        int: 0 if the line was blank, the file already existed, or the file
        was written; 1 if the page could not be fetched after 100 attempts
        or contained no source-code element.
    """
    url = eachLine.strip()
    if not url:
        # Nothing to fetch; do not spend the retry budget on an empty URL.
        return 0

    # Characters 29..70 of the URL are used as the file name.
    # NOTE(review): presumably the 42-character "0x..." address that follows
    # "https://etherscan.io/address/" - verify against the address file.
    filename = url[29:71]
    target = filepath + filename + '.sol'
    if os.path.exists(target):
        printtime()
        print(filename + '已存在！')
        return 0

    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    # Try up to 100 times, pausing 3 seconds after every failure.
    for _ in range(100):
        try:
            printtime()
            print('正在连接的的网址链接是 ' + url)
            response = requests.get(url, headers=headers, timeout=5)
            break
        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError！请等待3秒！')
            time.sleep(3)
        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError！请等待3秒！')
            time.sleep(3)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            printtime()
            print('Unfortunitely,出现未知错误！请等待3秒！')
            time.sleep(3)
    else:
        # Every attempt failed: there is no response to parse.
        printtime()
        print("失败次数过多，请检查网络环境！")
        return 1

    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")
    targetPRE = soup.find_all('pre', 'js-sourcecopyarea editor')

    # Check before opening the file: an empty .sol left behind would make
    # later runs report the contract as already downloaded.
    if not targetPRE:
        printtime()
        print(filename + '未找到合约源码！')
        return 1

    with open(target, "w", encoding="utf-8") as fo:
        fo.write(targetPRE[0].text)
    printtime()
    print(filename + '新建完成！')

    return 0


def getsccode():
    """Download the source code for every URL listed in the address file.

    Reads the hard-coded ``address.txt`` line by line and passes each line to
    ``getsccodecore``.

    Returns:
        int: 0 after all lines have been processed; 1 if the address file
        could not be opened.
    """
    try:
        SCAddress = open("C:\\Users\\15321\\Desktop\\SmartContract\\address\\address.txt", "r")
    except OSError:
        printtime()
        print('打开智能合约URL地址仓库错误！请检查文件目录是否正确！')
        # Without the file there is nothing to iterate over.
        return 1

    # The context manager closes the file even if a download raises.
    with SCAddress:
        for eachLine in SCAddress:
            getsccodecore(eachLine)  # does the actual download of one contract

    return 0


def getSCAddress(eachurl, filepath):
    """Collect contract page URLs from one listing page into ``address.txt``.

    Args:
        eachurl: URL of the listing page to fetch.
        filepath: Directory prefix (including the trailing separator) of the
            ``address.txt`` file the URLs are appended to.

    Returns:
        int: 0 when the page was parsed and its links were appended; 1 when
        the page could not be fetched after 50 attempts or the expected
        table body was not found.
    """
    # Send a browser-like User-Agent so the server does not refuse the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    # Try up to 50 times, pausing 3 seconds after every failure.
    for _ in range(50):
        try:
            print('正在连接的的网址链接是 ' + eachurl)
            response = requests.get(url=eachurl, headers=headers, timeout=5)
            # Reaching this line means the request succeeded.
            break
        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError!请等待3秒！')
            time.sleep(3)
        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError!请等待3秒！')
            time.sleep(3)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            printtime()
            print('出现未知错误！请等待3秒！')
            time.sleep(3)
    else:
        # Every attempt failed: there is no response to parse.
        printtime()
        print("失败次数过多，请检查网络环境！")
        return 1

    # Decode the body using the encoding detected from its content.
    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")

    # The links to the contract pages live in the table inside this div.
    targetDiv = soup.find_all('div', 'table-responsive mb-2 mb-md-0')

    try:
        targetTBody = targetDiv[0].table.tbody
    except (IndexError, AttributeError):
        targetTBody = None
    if targetTBody is None:
        printtime()
        print("targetTBody未成功获取！")
        return 1

    # Gather all links first so the file is only touched once parsing is done.
    links = []
    for targetTR in targetTBody:
        # Children that are not <tr> rows (e.g. whitespace text) are skipped.
        if targetTR.name != 'tr' or targetTR.td is None:
            continue
        anchor = targetTR.td.find('a', 'hash-tag text-truncate')
        if anchor is not None and anchor.get('href'):
            links.append("https://etherscan.io" + anchor['href'] + "\n")

    # Append mode: the file is created if missing, otherwise extended.
    with open(filepath + "address.txt", "a") as fo:
        fo.writelines(links)
    return 0


def updatescurl():
    """Rebuild ``address.txt`` from the five listing pages.

    Deletes any existing ``address.txt`` in the hard-coded directory, then
    calls ``getSCAddress`` for each listing page, giving each page up to 10
    calls while it keeps returning 1.

    Returns:
        int: 0 on completion; 1 if the old address file could not be removed.
    """
    urlList = ["https://etherscan.io/contractsVerified/%d?ps=100" % page
               for page in range(1, 6)]

    # Directory that holds the address file; change it to suit your machine.
    filepath = 'C:\\Users\\15321\\Desktop\\SmartContract\\address\\'
    address_file = filepath + "address.txt"

    # Remove the old address file so getSCAddress appends to a fresh one.
    try:
        if os.path.exists(address_file):
            os.remove(address_file)
            printtime()
            print('已清除%s目录下的旧文件（仓库）！' % filepath)
    except IOError:
        printtime()
        print("出现一个不能处理的错误，终止程序：IOError!")
        # Abnormal termination is reported as 1.
        return 1

    # Collect the contract addresses from every listing page.
    for eachurl in urlList:
        # At most 10 calls per page; stop as soon as one does not return 1.
        for _ in range(10):
            if getSCAddress(eachurl, filepath) != 1:
                break

    # Normal termination is reported as 0.
    return 0


def main():
    """Run the crawler: refresh the address list if wanted, then download.

    If ``address.txt`` already exists the user is asked whether to rebuild
    it; an answer starting with ``Y`` or ``y`` triggers the rebuild. If the
    file does not exist it is always built. Afterwards every listed contract
    is downloaded and the program waits for Enter before returning.
    """
    address_file = "C:\\Users\\15321\\Desktop\\SmartContract\\address\\address.txt"

    if os.path.exists(address_file):
        print('是否更新智能合约地址库？输入Y或者y开头的字符串表示确定更新，其他字符表示不更新')
        input_string = stdin.readline()

        # Slicing instead of indexing: readline() returns '' at end of input,
        # which then simply counts as "do not update".
        if input_string[:1] in ('Y', 'y'):
            print('开始更新智能合约地址库:')
            updatescurl()
    else:
        print('开始新建智能合约地址库:')
        updatescurl()

    # Download the source code for every address in the list.
    getsccode()

    # Wait for Enter before returning so the output can be read; skipped
    # when standard input is already closed.
    try:
        input()
    except EOFError:
        pass


# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
秒客网

爬取以太坊Solidity智能合约代码的简约Python爬虫

相关文章