百万英雄答题辅助 Python 程序（详细版）

思路：利用 adb 对手机截图，用 pytesseract 识别题目与答案，用爬虫的思路获取百度搜索出的答案信息，与识别出的答案对比，标红出现频率最高的答案（即可能的正确答案）
心路历程：最初打算用 webdriver 驱动浏览器搜索答案，发现实在太慢，达不到要求，所以改用爬虫的思路。如果觉得用OCR识别图中汉字比较慢，可以采用抓包的方式获取题目的相关信息。

所需软件：adb、Tesseract OCR 引擎（需安装 chi_sim 简体中文语言包），以及 Python 库 pytesseract、Pillow、requests、beautifulsoup4、lxml
程序：

import subprocess
from PIL import Image
import pytesseract
import requests
from bs4 import BeautifulSoup
import time

#利用adb得到屏幕截图，保存为screenpic.png
def pull_screenshot():
    """Capture the device screen through adb and save it as ``screenpic.png``.

    Runs ``adb shell screencap -p`` and writes the PNG bytes it prints to
    ``screenpic.png`` in the current working directory.

    Raises:
        FileNotFoundError: if the ``adb`` executable cannot be found.
        subprocess.CalledProcessError: if adb exits with a non-zero status.
    """
    # Every valid PNG stream starts with these eight bytes; note that the
    # signature itself contains a '\r\n' pair.
    png_signature = b'\x89PNG\r\n\x1a\n'

    # An argument list (no shell) avoids shell parsing of the command, and
    # run() waits for adb to finish so the child process is always reaped.
    result = subprocess.run(
        ['adb', 'shell', 'screencap', '-p'],
        stdout=subprocess.PIPE,
        check=True,
    )
    screenshot = result.stdout

    # Some adb setups deliver the stream with every '\n' expanded to '\r\n',
    # which is what the replacement below undoes. A stream that already begins
    # with the intact signature was not expanded, and rewriting it would
    # destroy the signature (and any other genuine '\r\n' byte pair).
    if not screenshot.startswith(png_signature):
        screenshot = screenshot.replace(b'\r\n', b'\n')

    with open('screenpic.png', 'wb') as f:
        f.write(screenshot)

#识别图中文字。其中box的值是为了对屏幕截图进行切片，四个值依次为（左，上，右，下）的像素值，本函数以自己的手机1920*1080为例
def getImagecode(question_box=(50, 300, 1000, 600),
                 answer_box=(50, 620, 1000, 1220),
                 image_path='screenpic.png'):
    """OCR the question and the answer options out of the saved screenshot.

    Args:
        question_box: ``(left, upper, right, lower)`` pixel box containing the
            question. The default was chosen for a 1920*1080 phone screen.
        answer_box: ``(left, upper, right, lower)`` pixel box containing the
            answer options. The default was chosen for a 1920*1080 phone
            screen.
        image_path: path of the screenshot to read.

    Returns:
        list: ``[question_text, answers_text]``; both are the stripped OCR
        output of the corresponding crop.
    """
    # The context manager closes the underlying file once OCR is finished;
    # both crops are recognised while the image is still open.
    with Image.open(image_path) as image:
        question_image = image.crop(question_box)
        answer_image = image.crop(answer_box)
        # lang='chi_sim' selects Tesseract's Simplified Chinese model.
        code = pytesseract.image_to_string(question_image, lang='chi_sim').strip()
        code2 = pytesseract.image_to_string(answer_image, lang='chi_sim').strip()

    print('Question : ' + code)
    return [code, code2]

###解析百度页面，传入需要搜索的keyword
def getpageindex(keyword, timeout=10):
    """Search Baidu for *keyword* and return the text of the result snippets.

    Args:
        keyword: the search query (here, the OCR'd question text).
        timeout: seconds to wait for Baidu before giving up.

    Returns:
        str: the stripped text of every element matching the ``.c-abstract``
        CSS selector, joined with newlines; an empty string when nothing
        matches.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
    """
    headers = {
        "Cookie": "",  # fill in the Cookie copied from your browser
        "Host": "www.baidu.com",
        "Upgrade-Insecure-Requests": '1',
        "User-Agent": "",  # fill in the User-Agent copied from your browser
    }
    # Placeholders left blank are dropped rather than sent as empty headers.
    headers = {name: value for name, value in headers.items() if value}

    # Passing the query through ``params`` lets requests URL-encode it, so
    # characters such as '&', '#', '?' or spaces in the question cannot
    # truncate or alter the query string.
    res = requests.get(
        "https://www.baidu.com/s",
        params={"cl": "3", "wd": keyword},
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(res.text, 'lxml')
    return '\n'.join(node.text.strip() for node in soup.select('.c-abstract'))

#红色显示答案文字，将百度出的一段文字，与识别出的三个答案进行匹配，标红出现的答案，并进行count计数，标红出现频率最高的答案（视为可能的答案）
def distinguish(answers, answord01, answord02, answord03):
    """Print the search text with matching characters in red, then a tally.

    The comparison is per character: each character of *answers* is checked
    against the three answer options, and every option containing that
    character has its count incremented. A character found in at least one
    option is printed in red (ANSI escape codes), otherwise unchanged.

    After the text, the three options are listed with their counts. Options
    tied for the highest count are prefixed with ``Your May Choose:`` and
    printed in red.

    Args:
        answers: the text gathered from the search results.
        answord01: first answer option.
        answord02: second answer option.
        answord03: third answer option.
    """
    red = "\033[0;31m%s\033[0m"
    options = [answord01, answord02, answord03]
    counts = [0, 0, 0]

    for char in answers:
        matched = False
        for index, option in enumerate(options):
            if char in option:
                counts[index] += 1
                matched = True
        # A character is echoed exactly once, however many options contain it.
        print(red % char if matched else char, end='')

    print('\n')
    print('--' * 10)

    best = max(counts)
    for option, count in zip(options, counts):
        summary = option + ' : ' + str(count)
        # When nothing matched, every option ties at zero; recommending all
        # three would be meaningless, so none is highlighted in that case.
        if best > 0 and count == best:
            print('Your May Choose:', end='')
            print(red % summary)
        else:
            print(summary)

def main():
    """Run one question round: screenshot, OCR, search, and print the tally.

    Prompts on stdin; entering ``0`` captures a screenshot, recognises the
    question and the three answer options, searches for the question and
    prints the highlighted result text with per-option counts. Any other
    input skips the round. The elapsed time since the prompt was answered is
    printed at the end in either case.
    """
    inputstate = input('Run program input [0]:')
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    if inputstate == '0':
        pull_screenshot()
        keyword, code2 = getImagecode()

        # Keep only the non-blank OCR lines so the options are found whether
        # or not the OCR output has blank lines between them, then pad with
        # empty strings so exactly three options always exist.
        options = [line.strip() for line in code2.splitlines() if line.strip()]
        answord01, answord02, answord03 = (options + ['', '', ''])[:3]
        print('answord01 = ', answord01)
        print('answord02 = ', answord02)
        print('answord03 = ', answord03)

        answers = getpageindex(keyword)
        print('--' * 10)
        distinguish(answers, answord01, answord02, answord03)
        print('--' * 10)
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))

# Start a question round only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()