Java Crawler (Jsoup): Scraping Smart Contract Code from Etherscan
I have recently been working on research into smart contract vulnerabilities and plan to analyze them with deep-learning-based methods, which requires a large collection of contracts. I therefore decided to crawl smart contracts from Etherscan, and this post records how the crawler came together.
1. Tool Preparation
First, the Java crawler needs the Jsoup library for fetching and parsing pages, and writing each record to an Excel file also requires the Apache POI jar.
The Etherscan listing URL is: https://etherscan.io/contractsVerified
The chosen storage path is: /home/jion1/crawlerData/data_contract1.xls
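Before writing the crawler itself, it helps to confirm that both jars are on the classpath. Below is a minimal smoke test, assuming Jsoup and POI's HSSF (.xls) API; the /tmp output path is just an example.
import java.io.FileOutputStream;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SetupCheck {
    public static void main(String[] args) throws Exception {
        // Fetch the listing page once to verify Jsoup and network access.
        Document doc = Jsoup.connect("https://etherscan.io/contractsVerified")
                .userAgent("Mozilla/5.0")
                .timeout(30000)
                .get();
        System.out.println("page title: " + doc.title());

        // Create an empty .xls workbook to verify POI is available.
        HSSFWorkbook wb = new HSSFWorkbook();
        wb.createSheet("sheet1");
        FileOutputStream fos = new FileOutputStream("/tmp/poi_check.xls"); // example path
        wb.write(fos);
        fos.close();
        System.out.println("workbook written");
    }
}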
2. The Contract Entity Class
The data to crawl is the verified-contracts table: a list of records, each split into several fields.
A record contains the fields Address, ContractName, Compiler, Balance, TxCount, Settings, and DateVerified, so we create a contract entity class as follows:
Contract
public class Contract {
    private String address;
    private String name;
    private String compiler;
    private String balance;
    private String txCount;
    private String settings;
    private String dateTime;
    private String code;

    public Contract() {
        super();
    }

    public Contract(String address, String name, String compiler, String balance, String txCount, String settings,
            String dateTime) {
        super();
        this.address = address;
        this.name = name;
        this.compiler = compiler;
        this.balance = balance;
        this.txCount = txCount;
        this.settings = settings;
        this.dateTime = dateTime;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCompiler() {
        return compiler;
    }

    public void setCompiler(String compiler) {
        this.compiler = compiler;
    }

    public String getBalance() {
        return balance;
    }

    public void setBalance(String balance) {
        this.balance = balance;
    }

    public String getTxCount() {
        return txCount;
    }

    public void setTxCount(String txCount) {
        this.txCount = txCount;
    }

    public String getSettings() {
        return settings;
    }

    public void setSettings(String settings) {
        this.settings = settings;
    }

    public String getDateTime() {
        return dateTime;
    }

    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    @Override
    public String toString() {
        return "Contract [address=" + address + ", name=" + name + ", compiler=" + compiler + ", balance=" + balance
                + ", txCount=" + txCount + ", settings=" + settings + ", dateTime=" + dateTime + "]";
    }
}
3. Crawling a Specific Contract's Code
The crawl target is the contract source code. Once every contract address has been collected, for example 0xE42Ef56340bCa5072E3c7bA07df835a65eCd06a6, the verified source for that address is shown on its address page under the Code tab.
So the crawler reads the contract addresses and then fetches the source code of each one.
Crawler
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class crawler_1 {
    public static void main(String[] args) throws IOException {
        String path = "/home/jion1/crawlerData/data_contract1.xls";
        String url1 = "https://etherscan.io/address/";
        List<Contract> list1 = readExcel(path);
        System.out.println(list1.size());
        // Crawl the code of the first 400 contracts (bounded by the list size).
        int limit = Math.min(400, list1.size());
        for (int i = 0; i < limit; i++) {
            // The verified source is served on the address page under the #code anchor.
            String url = url1 + list1.get(i).getAddress() + "#code";
            System.out.println(url);
            String data = "address:" + list1.get(i).getAddress();
            String compiler = "compiler:" + list1.get(i).getCompiler();
            String balance = "balance:" + list1.get(i).getBalance();
            String txCount = "txCount:" + list1.get(i).getTxCount();
            String settings = "settings:" + list1.get(i).getSettings();
            String dateTime = "dateTime:" + list1.get(i).getDateTime();
            String code = "code:" + getData(url);
            String filename = i + ".txt";
            System.out.println(filename);
            // Write one text file per contract.
            File file = new File("/home/jion1/contract/contract103/" + filename);
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
                bw.write(data);
                bw.newLine();
                bw.write(compiler);
                bw.newLine();
                bw.write(code);
                bw.newLine();
                bw.write(balance);
                bw.newLine();
                bw.write(txCount);
                bw.newLine();
                bw.write(settings);
                bw.newLine();
                bw.write(dateTime);
                bw.flush();
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println("done");
            System.out.println();
        }
    }

    // Fetches an address page and extracts the verified source code.
    public static String getData(String url) throws IOException {
        String linkText = null;
        Document doc = Jsoup.connect(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                .timeout(600000).get();
        // The source sits in a <pre class="js-sourcecopyarea"> element.
        Elements links = doc.select("pre[class=js-sourcecopyarea]");
        for (Element link : links) {
            linkText = link.text();
        }
        return linkText;
    }

    // Helper left over from an earlier experiment; not called by the crawler.
    public static void readFile() throws IOException {
        FileReader fr = new FileReader("D://contract.txt");
        BufferedReader br = new BufferedReader(fr);
        String line;
        while ((line = br.readLine()) != null) {
            String[] arrs = line.split(",");
        }
        br.close();
        fr.close();
    }

    // Reads the contract records back out of the Excel file produced in Section 4.
    public static List<Contract> readExcel(String path) throws IOException {
        List<Contract> list = new ArrayList<Contract>();
        InputStream excelFileToRead = new FileInputStream(path);
        HSSFWorkbook wb = new HSSFWorkbook(excelFileToRead);
        HSSFSheet sheet = wb.getSheetAt(0);
        System.out.println(sheet.getLastRowNum());
        HSSFRow row;
        // Start at 1 to skip the header row written in Section 4.
        for (int i = 1; i <= sheet.getLastRowNum(); i++) {
            row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            Contract ct = new Contract();
            int j = row.getFirstCellNum();
            ct.setAddress(row.getCell(j).toString());
            ct.setName(row.getCell(j + 1).toString());
            ct.setCompiler(row.getCell(j + 2).toString());
            ct.setBalance(row.getCell(j + 3).toString());
            ct.setTxCount(row.getCell(j + 4).toString());
            ct.setSettings(row.getCell(j + 5).toString());
            ct.setDateTime(row.getCell(j + 6).toString());
            list.add(ct);
        }
        excelFileToRead.close();
        return list;
    }
}
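One caveat about the loop above: it fires one request per contract with no pause, and Etherscan throttles or blocks aggressive clients. Below is a minimal sketch of a politer fetch; the fetchWithRetry helper and its parameters are hypothetical, not part of the original crawler.
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetch {
    // Hypothetical helper: pause between requests and retry failed ones with backoff.
    public static Document fetchWithRetry(String url, int maxRetries, long pauseMillis)
            throws IOException, InterruptedException {
        IOException last = null;
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                Document doc = Jsoup.connect(url)
                        .userAgent("Mozilla/5.0")
                        .timeout(30000)
                        .get();
                Thread.sleep(pauseMillis); // stay polite between successive requests
                return doc;
            } catch (IOException e) {
                last = e;
                Thread.sleep(pauseMillis * attempt); // back off before retrying
            }
        }
        throw last;
    }
}
Inside getData, the Jsoup.connect(url)...get() call could then be replaced with fetchWithRetry(url, 3, 2000).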
4. Crawling All Contract Records
This step actually comes first: crawl every record on the verified-contracts listing and insert each one as a row in an Excel sheet. The contract addresses are then read back out of the sheet in a loop (as in Section 3) and used to fetch each contract's code.
Crawler_excel
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class crawler {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        // The listing to crawl.
        String url = "https://etherscan.io/contractsVerified/";
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
                .get();
        // Read the total page count from the pager on the listing page.
        int total_page = Integer.parseInt(document.select(
                "body > div.wrapper > div.profile.container > div:nth-child(4) > div:nth-child(2) > p > span > b:nth-child(2)")
                .text());
        System.out.println(total_page);

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFSheet sheet = wb.createSheet("sheet1");
        HSSFRow row = sheet.createRow(0);
        HSSFCellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
        // Header row, one cell per Contract field.
        String[] headers = { "contract", "name", "compiler", "balance", "txCount", "settings", "dateTime" };
        for (int c = 0; c < headers.length; c++) {
            HSSFCell cell = row.createCell(c);
            cell.setCellValue(headers[c]);
            cell.setCellStyle(style);
        }

        for (int current_page = 1; current_page <= total_page; current_page++) {
            // Page 1 is the base URL; later pages append the page number.
            String pageUrl = (current_page == 1) ? url : url + current_page;
            List<Contract> list = getData(pageUrl);
            System.out.println("**************************************");
            for (int i = 0; i < list.size(); i++) {
                // Append each record after the last existing row.
                row = sheet.createRow(sheet.getLastRowNum() + 1);
                row.createCell(0).setCellValue(list.get(i).getAddress());
                row.createCell(1).setCellValue(list.get(i).getName());
                row.createCell(2).setCellValue(list.get(i).getCompiler());
                row.createCell(3).setCellValue(list.get(i).getBalance());
                row.createCell(4).setCellValue(list.get(i).getTxCount());
                row.createCell(5).setCellValue(list.get(i).getSettings());
                row.createCell(6).setCellValue(list.get(i).getDateTime());
            }
            // Rewrite the workbook after every page so progress survives a crash.
            try {
                FileOutputStream fos = new FileOutputStream("/home/jion1/crawler_data/data_contract2.xls");
                wb.write(fos);
                fos.flush();
                fos.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("done");
    }

    // Parses one listing page into Contract records, one per table row.
    public static List<Contract> getData(String url) throws Exception {
        List<Contract> contractList = new ArrayList<Contract>();
        Document doc = Jsoup.connect(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                .timeout(600000).get();
        Elements rows = doc.select("div.table-responsive").select("table").select("tbody").select("tr");
        for (int i = 0; i < rows.size(); i++) {
            Elements tds = rows.get(i).select("td");
            String contract = tds.get(0).text();
            String name = tds.get(1).text();
            String compiler = tds.get(2).text();
            String balance = tds.get(3).text();
            String txCount = tds.get(4).text();
            String settings = tds.get(5).text();
            String dateTime = tds.get(6).text();
            contractList.add(new Contract(contract, name, compiler, balance, txCount, settings, dateTime));
        }
        return contractList;
    }
}
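Both crawlers depend on hard-coded CSS selectors, and Etherscan's markup changes over time. When a selector stops matching, it is easier to debug against a page saved from the browser than against the live site. A small sketch, assuming a locally saved copy of the listing (the file path is just an example):
import java.io.File;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class SelectorCheck {
    public static void main(String[] args) throws Exception {
        // Parse a copy of the listing saved via the browser's "Save Page As...".
        File input = new File("/tmp/contractsVerified.html"); // example path
        Document doc = Jsoup.parse(input, "UTF-8", "https://etherscan.io/");
        // Count how many table rows the listing selector still matches.
        Elements rows = doc.select("div.table-responsive table tbody tr");
        System.out.println("matched rows: " + rows.size());
    }
}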
The two crawlers above retrieve all the verified contract code on Etherscan. The required jars and the full source can be found on GitHub: https://github.com/Messi-Q/Crawler