Java Crawler (Jsoup): Scraping Smart Contract Code from Etherscan
I have recently been working on research into smart contract vulnerabilities and plan to analyze them with deep-learning-based methods, which requires a large collection of contracts. I therefore decided to crawl smart contracts from Etherscan, and this post records how the crawler came together.
1. Tool Preparation
First, the Java crawler needs the Jsoup library for fetching and parsing pages, and writing each record to an Excel file also requires the Apache POI jar.
The Etherscan listing URL is: https://etherscan.io/contractsVerified
The chosen storage path is: /home/jion1/crawlerData/data_contract1.xls
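Before writing the crawler itself, it helps to confirm that both jars are on the classpath. Below is a minimal smoke test, assuming Jsoup and POI's HSSF (.xls) API; the /tmp output path is just an example.
import java.io.FileOutputStream;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SetupCheck {
    public static void main(String[] args) throws Exception {
        // Fetch the listing page once to verify Jsoup and network access.
        Document doc = Jsoup.connect("https://etherscan.io/contractsVerified")
                .userAgent("Mozilla/5.0")
                .timeout(30000)
                .get();
        System.out.println("page title: " + doc.title());

        // Create an empty .xls workbook to verify POI is available.
        HSSFWorkbook wb = new HSSFWorkbook();
        wb.createSheet("sheet1");
        FileOutputStream fos = new FileOutputStream("/tmp/poi_check.xls"); // example path
        wb.write(fos);
        fos.close();
        System.out.println("workbook written");
    }
}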
2. The Contract Entity Class
The data to crawl is the verified-contracts table: a list of records, each split into several fields.
A record contains the fields Address, ContractName, Compiler, Balance, TxCount, Settings, and DateVerified, so we create a contract entity class as follows:
Contract
public class Contract {
    private String address;
    private String name;
    private String compiler;
    private String balance;
    private String txCount;
    private String settings;
    private String dateTime;
    private String code;

    public Contract() {
        super();
    }

    public Contract(String address, String name, String compiler, String balance, String txCount, String settings,
            String dateTime) {
        super();
        this.address = address;
        this.name = name;
        this.compiler = compiler;
        this.balance = balance;
        this.txCount = txCount;
        this.settings = settings;
        this.dateTime = dateTime;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCompiler() {
        return compiler;
    }

    public void setCompiler(String compiler) {
        this.compiler = compiler;
    }

    public String getBalance() {
        return balance;
    }

    public void setBalance(String balance) {
        this.balance = balance;
    }

    public String getTxCount() {
        return txCount;
    }

    public void setTxCount(String txCount) {
        this.txCount = txCount;
    }

    public String getSettings() {
        return settings;
    }

    public void setSettings(String settings) {
        this.settings = settings;
    }

    public String getDateTime() {
        return dateTime;
    }

    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    @Override
    public String toString() {
        return "Contract [address=" + address + ", name=" + name + ", compiler=" + compiler + ", balance=" + balance
                + ", txCount=" + txCount + ", settings=" + settings + ", dateTime=" + dateTime + "]";
    }
}
3. Crawling a Specific Contract's Code
The crawl target is the contract source code. Once every contract address has been collected, for example 0xE42Ef56340bCa5072E3c7bA07df835a65eCd06a6, the verified source for that address is shown on its address page under the Code tab.
So the crawler reads the contract addresses and then fetches the source code of each one.
Crawler
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class crawler_1 {
    public static void main(String[] args) throws IOException {
        String path = "/home/jion1/crawlerData/data_contract1.xls";
        String url1 = "https://etherscan.io/address/";
        List<Contract> list1 = readExcel(path);
        System.out.println(list1.size());
        // Crawl the code of the first 400 contracts (bounded by the list size).
        int limit = Math.min(400, list1.size());
        for (int i = 0; i < limit; i++) {
            // The verified source is served on the address page under the #code anchor.
            String url = url1 + list1.get(i).getAddress() + "#code";
            System.out.println(url);
            String data = "address:" + list1.get(i).getAddress();
            String compiler = "compiler:" + list1.get(i).getCompiler();
            String balance = "balance:" + list1.get(i).getBalance();
            String txCount = "txCount:" + list1.get(i).getTxCount();
            String settings = "settings:" + list1.get(i).getSettings();
            String dateTime = "dateTime:" + list1.get(i).getDateTime();
            String code = "code:" + getData(url);
            String filename = i + ".txt";
            System.out.println(filename);
            // Write one text file per contract.
            File file = new File("/home/jion1/contract/contract103/" + filename);
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
                bw.write(data);
                bw.newLine();
                bw.write(compiler);
                bw.newLine();
                bw.write(code);
                bw.newLine();
                bw.write(balance);
                bw.newLine();
                bw.write(txCount);
                bw.newLine();
                bw.write(settings);
                bw.newLine();
                bw.write(dateTime);
                bw.flush();
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println("done");
            System.out.println();
        }
    }

    // Fetches an address page and extracts the verified source code.
    public static String getData(String url) throws IOException {
        String linkText = null;
        Document doc = Jsoup.connect(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                .timeout(600000).get();
        // The source sits in a <pre class="js-sourcecopyarea"> element.
        Elements links = doc.select("pre[class=js-sourcecopyarea]");
        for (Element link : links) {
            linkText = link.text();
        }
        return linkText;
    }

    // Helper left over from an earlier experiment; not called by the crawler.
    public static void readFile() throws IOException {
        FileReader fr = new FileReader("D://contract.txt");
        BufferedReader br = new BufferedReader(fr);
        String line;
        while ((line = br.readLine()) != null) {
            String[] arrs = line.split(",");
        }
        br.close();
        fr.close();
    }

    // Reads the contract records back out of the Excel file produced in Section 4.
    public static List<Contract> readExcel(String path) throws IOException {
        List<Contract> list = new ArrayList<Contract>();
        InputStream excelFileToRead = new FileInputStream(path);
        HSSFWorkbook wb = new HSSFWorkbook(excelFileToRead);
        HSSFSheet sheet = wb.getSheetAt(0);
        System.out.println(sheet.getLastRowNum());
        HSSFRow row;
        // Start at 1 to skip the header row written in Section 4.
        for (int i = 1; i <= sheet.getLastRowNum(); i++) {
            row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            Contract ct = new Contract();
            int j = row.getFirstCellNum();
            ct.setAddress(row.getCell(j).toString());
            ct.setName(row.getCell(j + 1).toString());
            ct.setCompiler(row.getCell(j + 2).toString());
            ct.setBalance(row.getCell(j + 3).toString());
            ct.setTxCount(row.getCell(j + 4).toString());
            ct.setSettings(row.getCell(j + 5).toString());
            ct.setDateTime(row.getCell(j + 6).toString());
            list.add(ct);
        }
        excelFileToRead.close();
        return list;
    }
}
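One caveat about the loop above: it fires one request per contract with no pause, and Etherscan throttles or blocks aggressive clients. Below is a minimal sketch of a politer fetch; the fetchWithRetry helper and its parameters are hypothetical, not part of the original crawler.
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetch {
    // Hypothetical helper: pause between requests and retry failed ones with backoff.
    public static Document fetchWithRetry(String url, int maxRetries, long pauseMillis)
            throws IOException, InterruptedException {
        IOException last = null;
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                Document doc = Jsoup.connect(url)
                        .userAgent("Mozilla/5.0")
                        .timeout(30000)
                        .get();
                Thread.sleep(pauseMillis); // stay polite between successive requests
                return doc;
            } catch (IOException e) {
                last = e;
                Thread.sleep(pauseMillis * attempt); // back off before retrying
            }
        }
        throw last;
    }
}
Inside getData, the Jsoup.connect(url)...get() call could then be replaced with fetchWithRetry(url, 3, 2000).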
4. Crawling All Contract Records
This step actually comes first: crawl every record on the verified-contracts listing and insert each one as a row in an Excel sheet. The contract addresses are then read back out of the sheet in a loop (as in Section 3) and used to fetch each contract's code.
Crawler_excel
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class crawler {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        // The listing to crawl.
        String url = "https://etherscan.io/contractsVerified/";
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
                .get();
        // Read the total page count from the pager on the listing page.
        int total_page = Integer.parseInt(document.select(
                "body > div.wrapper > div.profile.container > div:nth-child(4) > div:nth-child(2) > p > span > b:nth-child(2)")
                .text());
        System.out.println(total_page);

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFSheet sheet = wb.createSheet("sheet1");
        HSSFRow row = sheet.createRow(0);
        HSSFCellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
        // Header row, one cell per Contract field.
        String[] headers = { "contract", "name", "compiler", "balance", "txCount", "settings", "dateTime" };
        for (int c = 0; c < headers.length; c++) {
            HSSFCell cell = row.createCell(c);
            cell.setCellValue(headers[c]);
            cell.setCellStyle(style);
        }

        for (int current_page = 1; current_page <= total_page; current_page++) {
            // Page 1 is the base URL; later pages append the page number.
            String pageUrl = (current_page == 1) ? url : url + current_page;
            List<Contract> list = getData(pageUrl);
            System.out.println("**************************************");
            for (int i = 0; i < list.size(); i++) {
                // Append each record after the last existing row.
                row = sheet.createRow(sheet.getLastRowNum() + 1);
                row.createCell(0).setCellValue(list.get(i).getAddress());
                row.createCell(1).setCellValue(list.get(i).getName());
                row.createCell(2).setCellValue(list.get(i).getCompiler());
                row.createCell(3).setCellValue(list.get(i).getBalance());
                row.createCell(4).setCellValue(list.get(i).getTxCount());
                row.createCell(5).setCellValue(list.get(i).getSettings());
                row.createCell(6).setCellValue(list.get(i).getDateTime());
            }
            // Rewrite the workbook after every page so progress survives a crash.
            try {
                FileOutputStream fos = new FileOutputStream("/home/jion1/crawler_data/data_contract2.xls");
                wb.write(fos);
                fos.flush();
                fos.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("done");
    }

    // Parses one listing page into Contract records, one per table row.
    public static List<Contract> getData(String url) throws Exception {
        List<Contract> contractList = new ArrayList<Contract>();
        Document doc = Jsoup.connect(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                .timeout(600000).get();
        Elements rows = doc.select("div.table-responsive").select("table").select("tbody").select("tr");
        for (int i = 0; i < rows.size(); i++) {
            Elements tds = rows.get(i).select("td");
            String contract = tds.get(0).text();
            String name = tds.get(1).text();
            String compiler = tds.get(2).text();
            String balance = tds.get(3).text();
            String txCount = tds.get(4).text();
            String settings = tds.get(5).text();
            String dateTime = tds.get(6).text();
            contractList.add(new Contract(contract, name, compiler, balance, txCount, settings, dateTime));
        }
        return contractList;
    }
}
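Both crawlers depend on hard-coded CSS selectors, and Etherscan's markup changes over time. When a selector stops matching, it is easier to debug against a page saved from the browser than against the live site. A small sketch, assuming a locally saved copy of the listing (the file path is just an example):
import java.io.File;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class SelectorCheck {
    public static void main(String[] args) throws Exception {
        // Parse a copy of the listing saved via the browser's "Save Page As...".
        File input = new File("/tmp/contractsVerified.html"); // example path
        Document doc = Jsoup.parse(input, "UTF-8", "https://etherscan.io/");
        // Count how many table rows the listing selector still matches.
        Elements rows = doc.select("div.table-responsive table tbody tr");
        System.out.println("matched rows: " + rows.size());
    }
}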
The two crawlers above retrieve all the verified contract code on Etherscan. The required jars and the full source can be found on GitHub: https://github.com/Messi-Q/Crawler