# -*- coding: utf-8 -*-
"""Study notes: a collection of small, independent web-scraping snippets.

Each section below is a standalone script and is meant to be copied out and
run on its own; the file is not meant to be executed top to bottom.

* Sections 1, 3 and 5 use ``urllib.urlopen`` / ``urllib.urlretrieve`` and
  ``Tkinter``, i.e. they target Python 2.
* Sections 2 and 4 use ``urllib.request``, i.e. they target Python 3.

``print(x)`` with a single argument is used everywhere because it behaves
the same on both interpreters.
"""

# ===========================================================================
# 1. Scraping wallpapers from a Baidu Tieba thread (Python 2)
# ===========================================================================

# ---------------------------------------------------------------------------
# 1.1 Fetch the whole page
# ---------------------------------------------------------------------------
# coding=utf-8
import urllib


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Close the connection even if read() fails.
        page.close()


html = getHtml("http://tieba.baidu.com/p/2738151262")
print(html)

# ---------------------------------------------------------------------------
# 1.2 Extract the wanted data from the page
# ---------------------------------------------------------------------------
import re
import urllib


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getImg(html):
    """Return every ``src="....jpg" `` URL found in *html*."""
    imgre = re.compile(r'src="(.+?\.jpg)" ')
    return re.findall(imgre, html)


html = getHtml("http://tieba.baidu.com/p/2460150866")
print(getImg(html))

# ---------------------------------------------------------------------------
# 1.3 Save the extracted data locally
# ---------------------------------------------------------------------------
# coding=utf-8
import urllib
import re


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getImg(html):
    """Download every .jpg referenced in *html* and return the URL list.

    The n-th match is written to ``'<n>.jpg'`` (0-based) relative to the
    current working directory.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" ')
    imglist = re.findall(imgre, html)
    for x, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
    # Return the list so the caller's print shows what was downloaded
    # (the original returned nothing, so the caller printed None).
    return imglist


html = getHtml("http://tieba.baidu.com/p/2460150866")
print(getImg(html))

# ---------------------------------------------------------------------------
# Scraping images from nipic.com -- modified version
# ---------------------------------------------------------------------------
# coding=utf-8
import urllib
import re


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getImg(html):
    """Download every ``src="..." `` target in *html* and return the URLs.

    The n-th match is written to ``D:\\360\\<n>.jpg`` (0-based).
    """
    imgre = re.compile(r'src="(.*?)" ')
    imglist = re.findall(imgre, html)
    for x, imgurl in enumerate(imglist):
        # 'D:\\360\\...' is an absolute path.  The original 'D:360\\...' lacked
        # the separator after the drive letter, which Windows resolves
        # relative to the current directory on drive D:.
        urllib.urlretrieve(imgurl, 'D:\\360\\%s.jpg' % x)
    return imglist


html = getHtml("http://www.nipic.com/show/17742538.html")
print(getImg(html))

# Explanation:
#   %s is a string placeholder: the value of a variable is substituted into
#   the string, and the value to substitute is written after the '%' that
#   follows the string.
#   In the example above, %s is replaced by the value of x.  For instance,
#   with x = 5 the downloaded image is saved under the file name '5.jpg'.
#   By default files are saved relative to the directory the program runs in.
#   To save into a specific directory instead:
#       urllib.urlretrieve(imgurl, 'D:\\360\\%s.jpg' % x)
#
#   https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word

# ===========================================================================
# 2. Scraping prices (Python 3)
#    The first two variants do not need ``.text``.
# ===========================================================================

# ---------------------------------------------------------------------------
# Variant 1: urllib.request, decode the bytes before parsing
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from lxml import etree
import urllib
import urllib.request

# headers: a dict holding the User-Agent (left disabled in the notes)
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
html = urllib.request.urlopen(url).read()
data = html.decode('utf-8')
selector = etree.HTML(data)
# xpath
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

# ---------------------------------------------------------------------------
# Alternatively -- variant 2: urllib.request, parse the raw bytes directly
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from lxml import etree
import urllib
import urllib.request

# headers: a dict holding the User-Agent (left disabled in the notes)
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
html = urllib.request.urlopen(url).read()
selector = etree.HTML(html)
# xpath
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

# ---------------------------------------------------------------------------
# Alternatively -- variant 3: requests
# Note: this variant DOES need ``.text`` (html.text).
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from lxml import etree
import requests

# headers: a dict holding the User-Agent (left disabled in the notes)
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
html = requests.get(url)
selector = etree.HTML(html.text)
# xpath
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

# ===========================================================================
# 3. Scraping images from nipic.com (Python 2)
# ===========================================================================
# coding=utf-8
import urllib
import re


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getImg(html):
    """Download every ``src="..." `` target in *html* and return the URLs.

    The n-th match is written to ``D:\\360\\<n>.jpg`` (0-based).
    """
    imgre = re.compile(r'src="(.*?)" ')
    imglist = re.findall(imgre, html)
    for x, imgurl in enumerate(imglist):
        # Absolute path; the original 'D:360\\...' was drive-relative.
        urllib.urlretrieve(imgurl, 'D:\\360\\%s.jpg' % x)
    return imglist


html = getHtml("http://www.nipic.com/show/17742538.html")
print(getImg(html))

# ===========================================================================
# 4. Scraping music (Python 3)
# ===========================================================================
# coding:utf-8
import urllib
import urllib.request
import re

url = "http://www.yy8844.cn/ting/ccceo/ceeivi.shtml"
html = urllib.request.urlopen(url).read()
data = html.decode('GBK')
#print(data)
music_id = int(re.findall(r'MusicId=(\d+)', data)[0])
music_name = re.findall(r'<title>(.*?)</title>', data)[0].split('/')[0].strip()
# re.S lets '.' match newlines, so the whole multi-line lyrics <div> is captured.
music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>', data, re.S)[0]
article = 'word'
# Explicit encoding so writing the lyrics does not depend on the locale default.
with open("%s.txt" % article, 'w', encoding='utf-8') as f:
    f.write(music_word)
#print(music_word)
quanurl = "http://96.ierge.cn/%d/%d/%s.mp3" % (music_id // 30000, music_id // 2000, music_id)
#print(quanurl)
bata = urllib.request.urlopen(quanurl).read()
with open("%s.mp3" % music_name, 'wb') as f:
    f.write(bata)

# Things to note:
#   music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]
#   (presumably the point of this note is the re.S flag -- confirm)
#
# Resolving AttributeError in Python
#   [Python script error] How to fix
#   "AttributeError: 'module' has no attribute 'xxx'"
#   http://blog.csdn.net/cn_wk/article/details/50839159
#   The .pyc file of the "int" library
#   Removing .pyc files in Python
#   http://blog.csdn.net/ubuntu64fan/article/details/48241985
# Working with object attributes in Python
#   http://www.jianshu.com/p/c38a81b8cb38
# Python study diary 4 | Summary of common crawler errors and their fixes
#   http://www.jianshu.com/p/17c921639ad0

# ===========================================================================
# 5. Tkinter music search / player (Python 2; untitled in the original notes)
# ===========================================================================
# coding=utf-8
from Tkinter import *
import tkMessageBox
import urllib
import json
import mp3play
import time
import threading
from pinyin import PinYin
import os
import stat

test = PinYin()
test.load_word()
stop = 0


def music():
    """Search for the text in the entry box and list the results.

    Fills ``listbox`` with one ``name(artist)`` row per song and stores the
    matching audio URLs, in the same order, in the global list ``x``.
    """
    global x
    if not entry.get():
        tkMessageBox.showinfo("温馨提示", "搜索内容不能为空")
        return
    name = test.hanzi2pinyin_split(entry.get())
    html = urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9" % name).read()
    js = json.loads(html)
    x = []
    # Clear earlier results so listbox rows stay aligned with the entries of x.
    listbox.delete(0, END)
    for n, i in enumerate(js['result']['songs']):
        listbox.insert(n, '%s(%s)' % (i['name'], i['artists'][0]['name']))
        x.append(i['audio'])


count = 0
#isplaying = None


def play():
    """Download the selected song to ``tmp<count>.mp3`` and play it.

    Runs in a worker thread started by ``excute``; it sleeps for the length
    of the track after starting playback.
    """
    # NOTE(review): this touches Tk widgets from a non-main thread -- confirm
    # that is acceptable for the Tk build in use.
    global count
    count += 1
    index = listbox.curselection()
    if not index:
        # Nothing selected: nothing to play.
        return
    # Older Tkinter builds return the indices as strings.
    row = int(index[0])
    title = listbox.get(row)
    filename = 'tmp%s.mp3' % str(count)
    var1.set(u"正在加载" + title)
    urllib.urlretrieve(x[row], filename)
    var1.set(u"正在播放" + title)
    mp3 = mp3play.load(filename)
    mp3.play()
    time.sleep(mp3.seconds())


import inspect
import ctypes


def _async_raise(tid, exctype):
    """Ask the interpreter to raise *exctype* in the thread with id *tid*.

    Raises:
        ValueError: no thread with that id was found.
        SystemError: more than one thread state was affected; the request
            is revoked before raising.
    """
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # Undo the request: passing None clears the pending exception.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")


def stop_thread(thread):
    """Request termination of *thread* by raising SystemExit inside it."""
    _async_raise(thread.ident, SystemExit)


threads = list()
t = None


def excute(event):
    """Double-click handler: stop earlier playback threads, start a new one."""
    global t
    for worker in threads:
        # A finished thread no longer has a valid id; asking to stop it would
        # raise ValueError("invalid thread id").
        if worker.is_alive():
            try:
                stop_thread(worker)
            except ValueError:
                # The thread ended between the liveness check and the call.
                pass
    # Forget the old threads so the list does not grow without bound.
    del threads[:]
    t = threading.Thread(target=play)
    t.daemon = True
    t.start()
    threads.append(t)


root = Tk()  # create the main window
root.title("云音乐")
root.geometry("500x300+500+200")
entry = Entry(root)  # single-line input box, parented to root
entry.pack()
btn = Button(root, text="搜 索", command=music)
btn.pack()  # all widgets must use the same layout manager
var = StringVar()
listbox = Listbox(root, width=50, listvariable=var)
listbox.bind('<Double-Button-1>', excute)
listbox.pack()
var1 = StringVar()
label = Label(root, text="云音乐播放器", fg="purple", textvariable=var1)
var1.set("云音乐播放器")
label.pack()
root.mainloop()  # show the window and enter the event loop