pyautogui_pdf批量转换为TXT

pyautogui_pdf批量转换为TXT，

用pdf自带无损转换

# -*- coding: utf-8 -*-

"""

Created on Thu May  5 15:39:54 2016

一定要有time.sleep(1)时间控制，否则出错

pdf另存文本，效果可能很差

typewrite("content") 用于输入文字

typewrite(["right","left","up"]) 用于连续输入多个键盘按键

@author: daxiong

"""

import pyautogui,time,os

# Folder holding the files to convert; the same path is typed into the
# save dialog further down as the destination.
dir_file="C:/Users/daxiong/Desktop/test"

# Pause, in seconds, inserted after each GUI action.  The original author
# notes that the automation fails without these pauses.
STEP_PAUSE=1

# Snapshot of the folder's entries, taken before any GUI action.
# NOTE(review): the listing is not filtered, so non-PDF entries are typed
# into the open dialog as well.
fileNames=os.listdir(dir_file)

# Start the PDF application; (50, 50) is the screen coordinate the original
# author gives for it.
pyautogui.doubleClick(50,50)
time.sleep(STEP_PAUSE)

for fileName in fileNames:
    # Ctrl+O brings up the open dialog.
    pyautogui.hotkey("ctrl","o")
    time.sleep(STEP_PAUSE)

    # Type the file name and confirm it.
    pyautogui.typewrite(fileName)
    time.sleep(STEP_PAUSE)
    pyautogui.press("enter")
    time.sleep(STEP_PAUSE)

    # Shift+Ctrl+S: save as plain text.
    pyautogui.hotkey("shift","ctrl","s")
    time.sleep(STEP_PAUSE)
    pyautogui.press("tab")  # move focus to the format selector below
    time.sleep(STEP_PAUSE)
    pyautogui.press("down")
    time.sleep(STEP_PAUSE)
    # Twenty further "down" presses followed by "enter" to confirm.
    pyautogui.typewrite(["down"]*20+["enter"])

    # Choose the destination folder.
    pyautogui.press('f4')  # focus the address bar
    time.sleep(STEP_PAUSE)
    pyautogui.hotkey("ctrl","a")  # select its current content
    time.sleep(STEP_PAUSE)
    pyautogui.press('delete')  # remove the old address
    time.sleep(STEP_PAUSE)
    pyautogui.typewrite(dir_file)
    time.sleep(STEP_PAUSE)

    # Ten "tab" presses reach the save button; "enter" activates it.
    pyautogui.typewrite(["tab"]*10+["enter"])

    # Close the PDF with Ctrl+W after a slightly longer pause.
    time.sleep(2)
    pyautogui.hotkey("ctrl","w")

# Ctrl+Q once every entry has been handled (presumably quits the application).
pyautogui.hotkey("ctrl","q")

办公室电脑测试代码

pdf_to_txt

# -*- coding: utf-8 -*-

"""

Created on Thu May 12 11:22:57 2016

pdf更换为最新版本，尝试提高转换成功率。

txt必须转换为纯文本格式

等待时间必须和pdf页码数匹配

@author: Administrator

"""

import pyautogui,time,os,PyPDF2

# Folder that holds the PDFs on the office machine.
# NOTE(review): not referenced by the code visible here, which lists the
# current working directory instead.
dir_file="C:/Users/Administrator/Desktop/test/pdf/"

# Names of every entry in the current working directory.
fileNames=os.listdir('.')

# Keep only the PDF files.  The extension is compared case-insensitively so
# that names such as "REPORT.PDF" are not skipped.
pdf_fileNames=[i for i in fileNames if os.path.splitext(i)[1].lower()==".pdf"]

def Get_time(filename):
    """Return how many seconds to wait for the text export of one PDF.

    The wait depends on the page count: ``pages + 2`` seconds for documents
    of up to 10 pages and a flat 15 seconds for anything longer.  When the
    file cannot be opened or parsed, a message is printed and a default of
    10 seconds is returned instead.

    Args:
        filename: Path of the PDF file to inspect.

    Returns:
        The number of seconds to sleep.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(filename,'rb') as pdfFileObj:
            pages=PyPDF2.PdfFileReader(pdfFileObj).numPages
    except Exception:
        # Best effort: the original author notes that reading some PDFs
        # fails, so fall back to a fixed wait rather than abort the batch.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        print ("wrong when read pdf:",filename)
        return 10
    if pages<=10:
        return pages+2
    return 15

# Start the PDF application; (50, 50) is the screen coordinate the original
# author gives for it.
pyautogui.doubleClick(50,50)
time.sleep(3)

# Iterate over the PDF entries only.  The unfiltered directory listing also
# contains non-PDF files, which would be typed into the open dialog and
# handed to Get_time as if they were PDFs.
for fileName in pdf_fileNames:
    # Ctrl+O brings up the open dialog.
    pyautogui.hotkey("ctrl","o")
    time.sleep(1)

    # Type the file name and confirm it.
    pyautogui.typewrite(fileName)
    time.sleep(2)
    pyautogui.press("enter")
    time.sleep(1)

    # Shift+Ctrl+S: save as plain text.
    pyautogui.hotkey("shift","ctrl","s")
    time.sleep(1)
    pyautogui.press("tab")  # move focus to the format selector below
    time.sleep(1)
    pyautogui.press("down")
    time.sleep(1)

    # Eighteen further "down" presses, then "enter" twice.
    # The original author marks this key count as "not accurate".
    pyautogui.typewrite(["down"]*18+["enter","enter"])

    # Wait for a time derived from the page count, then close the PDF with
    # Ctrl+W.
    sleepTime=Get_time(fileName)
    time.sleep(sleepTime)
    pyautogui.hotkey("ctrl","w")

# Ctrl+Q once every PDF has been handled (presumably quits the application).
pyautogui.hotkey("ctrl","q")

txt 文件包提取到excel

# -*- coding: utf-8 -*-

"""

Created on Thu May 12 14:05:06 2016

1.先用filenameToExcel.exe程序导入文件名

2.B11写入me_txt

3.批量写入内容

list不能写入cell，str才可以.txt必须是纯文本格式

@author: Administrator

"""

import PyPDF2,os,openpyxl,sys,time,xlrd

from openpyxl.cell import get_column_letter,column_index_from_string

# Start time of the run.
# time.clock() was removed in Python 3.8, so prefer time.perf_counter() and
# fall back to clock() only on interpreters that lack perf_counter().
timeBegin=(getattr(time,"perf_counter",None) or time.clock)()

# Workbook that is read and later written back.
excelFileName="test.xlsx"
wb=openpyxl.load_workbook(excelFileName)
sheet=wb.active

columnIndex="A"
start=1
expandName=".txt"
expandName_upper=expandName.upper()

# The same file opened a second time through xlrd, for reading values.
excelFile = xlrd.open_workbook(excelFileName)
table = excelFile.sheet_by_index(0)  # first sheet, selected by index

# sheet.columns is a generator in newer openpyxl releases and cannot be
# indexed directly; materialising it once works there and on the older
# releases where it already is a sequence.
_sheet_columns=list(sheet.columns)

# Cells of column A.
cells_columnA=_sheet_columns[0]

# Cells of column B.
cells_columnB=_sheet_columns[1]

def Get_col_values(i):
    """Return the values of one column of ``table`` without its first entry.

    Args:
        i: Column index forwarded to ``table.col_values``.

    Returns:
        Everything ``table.col_values(i)`` returns except the first element.
    """
    return table.col_values(i)[1:]

def single_txt_extract(filename,i):
    """Copy the whole text of one file into a cell of column B.

    The text is stored in ``cells_columnB[i+1].value``.  If the file cannot
    be opened or decoded, a message is printed and the worksheet is left
    untouched, so one bad file does not abort the run.

    Args:
        filename: Path of the plain-text file to read.
        i: Index of the entry; the text goes to ``cells_columnB[i+1]``.

    Returns:
        None.
    """
    try:
        # The context manager closes the handle even when read() fails.
        # TODO (from the original author): compare read() with
        # readline()/readlines() and test which works best.
        with open(filename) as txtFileObj:
            content=txtFileObj.read()
    except (IOError,OSError,UnicodeError):
        print ("wrong when read txt:",filename)
        # Nothing was read, so there is nothing to write into the cell.
        return
    # A cell accepts a str but not a list, and the file must be plain text.
    cells_columnB[i+1].value=content

# Values of column 0 of the first sheet, with the first row dropped.
# NOTE(review): this list is not used by the code visible here.
list_pdf_fileNames=Get_col_values(0)

# Copy the text of "1151.txt" into cells_columnB[1].
single_txt_extract("1151.txt",0)

# Write the workbook back to the file it was loaded from.
wb.save(excelFileName)

秒客网

pyautogui_pdf批量转换为TXT

相关文章