I've written a Python script that works well on OS X/Linux, but I'm running into problems on Windows (see title). It uses the Pillow module, and the error originates in the module PIL\Image.py, on line 2274.
My code:
# -*- coding: utf-8 -*-
import os
import sys
import urllib2

from PIL import Image, ImageFile
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
from bs4 import BeautifulSoup

ImageFile.LOAD_TRUNCATED_IMAGES = True

def parser():
    try:
        return sys.argv[1].lower()
    except IndexError:
        print 'no argument specified'

the_url = 'http://www.oldgames.sk'
base_url = the_url + '/mags/'

# Add magazines + relative URLs here
magazines = {
    'score': 'score/',
    'level': 'level/',
    'amiga': 'amiga-magazin/',
    'bit': 'bit/',
    'commodore': 'commodore-amater/',
    'CGW': 'cgw/',
    'excalibur': 'excalibur/',
    'hrac': 'hrac-cz/',
    'joystick': 'joystick-sk/',
    'pocitac-aktivne': 'pocitac-aktivne/',
    'pocitacove-hry': 'pocitacove-hry/',
    'riki': 'riki/',
    'zzap64': 'zzap64/'}

issue_links = []
download_list = {}

def parse_args(arg):
    if arg == '--list':
        items = [i for i in magazines.keys()]
        for item in items:
            print item
        sys.exit()
    elif arg in magazines:
        print "Scraping %s magazine..." % arg.capitalize()
        return base_url + magazines[arg]
    else:
        return sys.exit('invalid magazine name')

def extract_links_to_issue(url):
    soup = BeautifulSoup(urllib2.urlopen(url))
    for div in soup.findAll('div', 'mImage'):
        issue_links.append(the_url + div.a['href'])
    print 'Scraped %d links' % len(issue_links)

def issue_renamer(issue_name):
    char1 = '\\'
    char2 = '/'
    replacement = '-'
    if char1 in issue_name:
        issue_name = issue_name.replace(char1, replacement)
        print 'inv. char (%s): renaming to %s' % (char1, issue_name)
    elif char2 in issue_name:
        issue_name = issue_name.replace(char2, replacement)
        print 'inv. char (%s): renaming to %s' % (char2, issue_name)
    return issue_name

def extract_links_to_images(issue_links):
    for index, link in enumerate(issue_links):
        print 'Scraping issue #%d: %s' % (index + 1, link)
        issue_soup = BeautifulSoup(urllib2.urlopen(link))
        image_list = []
        for image in issue_soup.findAll('div', 'mags_thumb_article'):
            issue_name = issue_renamer(issue_soup.findAll('h1', 'top')[0].text)
            image_list.append(the_url + image.a['href'])
        download_list[issue_name] = image_list

def clean_up(list_of_files, list_of_pdfs):
    num = len(list_of_files) + len(list_of_pdfs)
    for file in list_of_files:
        os.remove(file)
    for pdf in list_of_pdfs:
        os.remove(pdf)
    print 'Cleaned up %d files' % num

def convert_images(list_of_files, issue):
    list_of_pdfs = []
    for index, file in enumerate(list_of_files):
        im = Image.open(file)
        outfile = file + '.pdf'
        im.save(outfile, 'PDF')
        list_of_pdfs.append(outfile)
        print 'converting ...' + str(index + 1) + '/' + str(len(list_of_files))
    final_pdf = PdfFileMerger()
    for pdf in list_of_pdfs:
        final_pdf.append(open(pdf, 'rb'))
    issue_name = issue + '.pdf'
    final_pdf.write(open(issue_name, 'wb'))
    final_pdf.close()
    print '--- PDF completed ---'
    clean_up(list_of_files, list_of_pdfs)

def download_images(download_list):
    for issues, image_list in download_list.items():
        print 'Preparing %s ...' % issues
        list_of_files = []
        for image in image_list:
            image_name = os.path.split(image)[1]
            list_of_files.append(image_name)
            f = open(image_name, 'w')
            f.write(urllib2.urlopen(image).read())
            print 'Downloading image: %s' % image
            f.close()
        convert_images(list_of_files, issues)

arg = parser()
extract_links_to_issue(parse_args(arg))
extract_links_to_images(issue_links)
download_images(download_list)
I'd like to fix this; can anyone help me?
1 Answer
You are copying images into a file opened in text mode:
f = open(image_name, 'w')
f.write(urllib2.urlopen(image).read())
On Windows this means that any 0A (newline) bytes are translated to 0D 0A byte sequences (carriage return, newline), as that is the Windows line separator.
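You can verify this with a tiny experiment (a minimal sketch; demo.bin is a made-up file name, and the bytes are the PNG file signature, which deliberately contains both a CR LF pair and a bare LF):

import os

data = '\x89PNG\r\n\x1a\n'            # the 8-byte PNG signature
with open('demo.bin', 'w') as f:      # text mode, as in your script
    f.write(data)
# On Windows every 0A is expanded to 0D 0A, so the file grows to 10 bytes
# and the signature no longer matches what PIL expects.
print os.path.getsize('demo.bin')     # 8 on OS X/Linux, 10 on Windows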
Open your files in binary mode:
f = open(image_name, 'wb')
f.write(urllib2.urlopen(image).read())
I'd also switch to using the file as a context manager (with the with statement) so you don't have to close it manually, and use shutil.copyfileobj() to stream the data straight to disk in blocks rather than reading the whole image into memory in one go:
import shutil
# ...
with open(image_name, 'wb') as f:
shutil.copyfileobj(urllib2.urlopen(image), f)
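Applied to your download_images() function, that would look roughly like this (an untested sketch; everything outside this function stays as in your script):

def download_images(download_list):
    for issues, image_list in download_list.items():
        print 'Preparing %s ...' % issues
        list_of_files = []
        for image in image_list:
            image_name = os.path.split(image)[1]
            list_of_files.append(image_name)
            print 'Downloading image: %s' % image
            # binary mode plus a streamed copy instead of read()/write()
            with open(image_name, 'wb') as f:
                shutil.copyfileobj(urllib2.urlopen(image), f)
        convert_images(list_of_files, issues)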