关于python爬虫不能显示中文问题，急！

系统：ubuntu　14.04； python：2.7.6； scrapy：0.14.4

用于爬取拉勾网上 Python 招聘职位的相关信息

现运行图片如下（不能显示中文）
关于python爬虫不能显示中文问题，急！

源代码：
pipelines.py

from scrapy import signals
import json
import codecs

class LagouPythonInvitePipeline(object):
    """Write every scraped item to ``PythonInvite.json``, one JSON object per line.

    The file is opened with UTF-8 encoding and items are serialized with
    ``ensure_ascii=False``, so non-ASCII text (e.g. Chinese) is written as
    readable characters instead of backslash escape sequences.
    """

    def __init__(self):
        # codecs.open returns a writer that encodes text to UTF-8 on write.
        self.file = codecs.open('PythonInvite.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* to the output file and hand it on unchanged.

        Args:
            item: The scraped item; anything ``dict()`` accepts.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipeline stages still receive it.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished.

        Scrapy calls ``close_spider`` on item pipelines by itself. The
        previous hook name, ``spider_closed``, only runs when it is connected
        to the signal by hand, which this class never did, so the file was
        left open.
        """
        # Guard so that calling both hooks (or calling twice) is harmless.
        if not self.file.closed:
            self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(spider)

items.py

from scrapy.item import Item, Field

class LagouPythonInviteItem(Item):
    """Placeholder item that declares no fields."""
    # define the fields for your item here like:
    # name = Field()
    pass

class PyInviteItem(Item):
    """One job posting scraped from the listing page."""
    companyName = Field()  # company name
    inAreas = Field()  # business areas the company is involved in
    workLocation = Field()  # work location
    monthlyPay = Field()  # monthly salary
    workExperience = Field()  # required work experience
    lowerEducation = Field()  # minimum education level
    postTempt = Field()  # perks advertised for the position
    publishTime = Field()  # publication time
    dataLink = Field()  # link to the job detail page

PyInvite_spider.py

# coding: UTF-8

import re
import json

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from scrapy.utils.response import get_base_url  
from scrapy.utils.url import urljoin_rfc  
from scrapy.contrib.spiders import CrawlSpider, Rule  
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle 

from lagou_python_invite.items import PyInviteItem
from lagou_python_invite.misc.log import *

class PyInviteSpider(CrawlSpider):
    """Crawl the Python job listing pages and build one item per posting.

    Listing pages are discovered through ``rules`` and handed to
    ``parse_item``. ``parse`` is deliberately NOT overridden: CrawlSpider
    implements its rule handling inside ``parse``, so overriding it (as this
    class used to) disables the rules entirely.

    NOTE(review): the XPaths below encode an assumed page layout
    (``ul.hot_pos`` rows with ``hot_pos_l`` / ``hot_pos_r`` columns); verify
    them against the live markup.
    """

    name = "PyInvite"
    # Was "lagou.con"; with that typo every followed link counts as off-site.
    allowed_domains = ["lagou.com"]
    start_urls = [
        "http://www.lagou.com/zhaopin/Python?labelWords=label"
    ]

    rules = [
        Rule(
            # Raw string so the regex backslashes are taken literally.
            sle(allow=(r"/Python\?labelWords=label&pn=\d{,4}",)),
            follow=True,
            callback='parse_item',
        )
    ]

    # (item field, XPath relative to one listing <li>) pairs. The leading
    # "." keeps each query inside the current row; a bare "//" would search
    # the whole document and give every item the data of all rows.
    _FIELD_XPATHS = (
        # Attribute nodes have no text() children, so @title is selected
        # directly instead of @title/text().
        ('companyName', ".//div[@class='hot_pos_r']/div[@class='mb10']/a/@title"),
        ('inAreas', ".//div[@class='hot_pos_r']/span[1]/text()"),
        ('workLocation', ".//div[@class='hot_pos_l']/div[@class='mb10']/span/text()"),
        ('monthlyPay', ".//div[@class='hot_pos_l']/span[1]/text()"),
        ('workExperience', ".//div[@class='hot_pos_l']/span[2]/text()"),
        ('lowerEducation', ".//div[@class='hot_pos_l']/span[3]/text()"),
        ('postTempt', ".//div[@class='hot_pos_l']/span[4]/text()"),
        ('publishTime', ".//div[@class='hot_pos_l']/span[5]/text()"),
    )
    _LINK_XPATH = ".//div[@class='hot_pos_l']/div[@class='mb10']/a/@href"

    def parse_item(self, response):
        """Extract every job posting found on a listing page.

        Args:
            response: The downloaded listing page.

        Returns:
            A list of ``PyInviteItem`` objects, in document order.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Odd and even rows differ only by CSS class, so select both at once
        # instead of repeating the extraction code for each class.
        rows = hxs.select(
            '//ul[@class="hot_pos reset"]'
            '/li[@class="odd clearfix" or @class="clearfix"]'
        )
        items = [self._build_item(row, base_url) for row in rows]
        info('parsed ' + repr(response))
        return items

    # CrawlSpider passes responses for start_urls to parse_start_url; point it
    # at parse_item so the first listing page is scraped as well.
    parse_start_url = parse_item

    def _build_item(self, site, base_url):
        """Build one ``PyInviteItem`` from a single listing row selector.

        Args:
            site: Selector positioned on one listing ``<li>`` element.
            base_url: Base URL used to resolve the relative detail link.

        Returns:
            The populated item. Text fields hold the list returned by
            ``extract()``; ``dataLink`` is an absolute URL string, or ``''``
            when the row has no link.
        """
        item = PyInviteItem()
        for field, xpath in self._FIELD_XPATHS:
            item[field] = site.select(xpath).extract()
        hrefs = site.select(self._LINK_XPATH).extract()
        # extract() returns a list, while urljoin_rfc needs a single string.
        item['dataLink'] = urljoin_rfc(base_url, hrefs[0]) if hrefs else ''
        return item

    def _process_request(self, request):
        """Log *request* and return it unchanged.

        NOTE(review): no Rule in this class passes ``process_request``, so
        nothing visible here calls this hook.
        """
        info('process ' + str(request))
        return request

不知哪里出错了，求各位大神帮忙看看，非常感谢！

8 个解决方案

#1

统一所有编码，包括ide

#2

引用 1 楼 u013171165 的回复:

统一所有编码，包括ide

不是很明白　　能不能具体点　　我是用vim编辑器写的代码　　　查看了一下　　我系统默认支持'zh_CN', 'UTF-8'这两种编码　　　是不是我要设置为其中一种

#3

统一用utf8吧，除非你有特殊用途。

#4

引用 3 楼 u013171165 的回复:

统一用utf8吧，除非你有特殊用途。

我刚设置了默认编码为 utf-8，可是一运行 python 程序后查看，默认编码又被改回 ascii，怎么会这样？我是不是也可以在每个 python 模块最前面（顶行）加入 # coding: UTF-8 这行代码来达到目的？

#5

1.使用for循环看看是否还是乱码。
2.str.encode('utf-8')转码看看。

#6

先 import sys，再 reload(sys)，然后用 sys.setdefaultencoding('utf-8') 重新设置默认编码为 utf8（仅适用于 Python 2）；具体写法记不太清楚，你自己查下

#7

python2直接打印字典或列表，里面的字串就是转义表示，很正常啊。#print repr(item).decode("unicode-escape")如果不注释的话，打印时看到啥呢？

#8

你最后如何解决的，我也遇到相同问题，求教

#1

统一所有编码，包括ide

#2

引用 1 楼 u013171165 的回复:

统一所有编码，包括ide

#3

统一用utf8吧，除非你有特殊用途。

#4

引用 3 楼 u013171165 的回复:

统一用utf8吧，除非你有特殊用途。

#5

1.使用for循环看看是否还是乱码。
2.str.encode('utf-8')转码看看。

#6

先 import sys，再 reload(sys)，然后用 sys.setdefaultencoding('utf-8') 重新设置默认编码为 utf8（仅适用于 Python 2）；具体写法记不太清楚，你自己查下

#7

python2直接打印字典或列表，里面的字串就是转义表示，很正常啊。#print repr(item).decode("unicode-escape")如果不注释的话，打印时看到啥呢？

#8

你最后如何解决的，我也遇到相同问题，求教

秒客网

关于python爬虫不能显示中文问题，急！

8 个解决方案

#1

#2

#3

#4

#5

#6

#7

#8

#1

#2

#3

#4

#5

#6

#7

#8

相关文章