核心代码
requests.get 下载html网页
bs4.BeautifulSoup 分析html内容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
def Today(style=1):
    """Return the current local date as text.

    Args:
        style: 1 (the default) yields ``YYYY-MM-DD`` with zero-padded month
            and day; any other value yields the Chinese short form
            ``M月D日`` without padding.

    Returns:
        The formatted date string.
    """
    date = dt.today()
    if style != 1:
        return f'{date.month}月{date.day}日'
    return f'{date.year}-{date.month:02}-{date.day:02}'
def SinaNews(style=1):
    """Fetch today's headlines from one channel page of the news site.

    Args:
        style: 1 selects the ``world`` page, 2 the ``china`` page, and any
            other value the military-news page.

    Returns:
        A list of ``(title, url)`` tuples, one for each ``<a>`` tag whose
        markup contains today's date in ``YYYY-MM-DD`` form.
    """
    url1 = 'http://news.***.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.news.sina.com.cn/'
    text = get(url1)
    # The codec name was misspelled 'uft-8', which is not a valid codec.
    text.encoding = 'utf-8'
    soup = bs(text.text, 'html.parser')
    # Compute the date once instead of once per tag.
    today = Today()
    # href=True skips anchors that have no href attribute; indexing
    # t['href'] on such a tag would raise KeyError.
    return [(t.text, t['href'])
            for t in soup.find_all('a', href=True)
            if today in str(t)]
|
爬取标题
1
2
3
4
5
6
7
8
9
|
# Print a numbered list of the titles returned for channel 1.
for i, news in enumerate(SinaNews(1)):
    print(f'No{i+1}:', news[0])
No1: 外媒: * * * * *
No2: 日媒: * * * * * *
......
......
|
内容已马赛克!!!
首次做爬虫,为了方便下手,找了一个不用破解网页的某新闻网站,下载网页就能直接取得内容。以其中的国际、国内和军事新闻三个网页作为内容源,用requests.get下载网页后,分析所得html文本,发现所有带当天日期的<a href=...>标记刚好就是所需要的新闻链接。
爬取正文
然后再根据url下载正文网页,分析可知id='article'的<div>层就是正文所在位置,.get_text()是取得文本的关键函数,然后适当做一些格式处理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
def NewsDownload(url):
    """Download one article page and return its body as plain text.

    Args:
        url: Address of the article page.

    Returns:
        The stripped text of the ``<div id="article">`` element after the
        replacements below, or an empty string when the page contains no
        such element.
    """
    html = get(url)
    # The codec name was misspelled 'uft-8', which is not a valid codec.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    if article is None:
        # find() returns None when nothing matches; avoid AttributeError.
        return ''
    text = article.get_text().strip()
    text = text.replace('点击进入专题:', '相关专题:')
    # NOTE(review): as published, the search string is a single space; the
    # author may have intended a wider whitespace run - confirm.
    text = text.replace(' ', '\n ')
    # Collapse runs of blank lines down to a single blank line.
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text
>>> url = 'https://******/w/2021-09-29/doc-iktzqtyt8811588.shtml'
>>> NewsDownload(url)
'原标题:******************************************************'
>>>
|
界面代码
使用内置图形界面库 tkinter 的控件 Text、Listbox、Scrollbar、Button。设置基本属性、放置位置、绑定命令,然后调试到程序完工!
源代码 News.pyw :其中涉及的网站名称已马赛克!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
from os import path
import tkinter as tk
def Today(style=1):
    """Return the current local date as text.

    Args:
        style: 1 (the default) yields ``YYYY-MM-DD`` with zero-padded month
            and day; any other value yields the Chinese short form
            ``M月D日`` without padding.

    Returns:
        The formatted date string.
    """
    date = dt.today()
    if style != 1:
        return f'{date.month}月{date.day}日'
    return f'{date.year}-{date.month:02}-{date.day:02}'
def SinaNews(style=1):
    """Fetch today's headlines from one channel page of the news site.

    Args:
        style: 1 selects the ``world`` page, 2 the ``china`` page, and any
            other value the military-news page.

    Returns:
        A list of ``(title, url)`` tuples, one for each ``<a>`` tag whose
        markup contains today's date in ``YYYY-MM-DD`` form.
    """
    url1 = 'http://news.****.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.****.com.cn/'
    text = get(url1)
    # The codec name was misspelled 'uft-8', which is not a valid codec.
    text.encoding = 'utf-8'
    soup = bs(text.text, 'html.parser')
    # Compute the date once instead of once per tag.
    today = Today()
    # href=True skips anchors that have no href attribute; indexing
    # t['href'] on such a tag would raise KeyError.
    return [(t.text, t['href'])
            for t in soup.find_all('a', href=True)
            if today in str(t)]
def NewsList(i):
    """Reload the headline list for channel *i* and show its first article.

    Replaces the global ``news`` with the result of ``SinaNews(i)``, refills
    the list box with numbered titles, and clears the text pane.

    Args:
        i: Channel selector, passed unchanged to ``SinaNews``.
    """
    global news
    news = SinaNews(i)
    tList.delete(0, tk.END)
    for idx, item in enumerate(news):
        tList.insert(tk.END, f'{idx+1:03} {item[0]}')
    # The text pane is kept disabled (read-only); enable it only to clear it.
    tText.config(state=tk.NORMAL)
    # Text indices are "line.column" strings and lines start at 1.
    tText.delete('1.0', tk.END)
    tText.config(state=tk.DISABLED)
    # With no headlines, NewsShow(0) would fail on news[0].
    if news:
        NewsShow(0)
def NewsList1():
    """Zero-argument callback that loads channel 1 via ``NewsList``."""
    NewsList(1)
def NewsList2():
    """Zero-argument callback that loads channel 2 via ``NewsList``."""
    NewsList(2)
def NewsList3():
    """Zero-argument callback that loads channel 3 via ``NewsList``."""
    NewsList(3)
def NewsShow(idx):
    """Download one article and display it in the text pane.

    Args:
        idx: ``0`` shows the first entry of the global ``news`` list. Any
            other value (including the event object Tk passes when this
            function runs as the list box's double-click handler) shows
            the entry currently selected in the list box.

    Does nothing when there is no selection or no matching ``news`` entry.
    """
    if idx != 0:
        selection = tList.curselection()
        if not selection:
            # Nothing is selected; indexing [0] would raise IndexError.
            return
        idx = selection[0]
    if idx >= len(news):
        # Empty or shorter headline list: nothing to display.
        return
    title, url = news[idx][0], news[idx][1]
    html = get(url)
    # The codec name was misspelled 'uft-8', which is not a valid codec.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    if article is None:
        # Page has no article container; show the title with an empty body.
        text = ''
    else:
        text = article.get_text().strip()
        text = text.replace('点击进入专题:', '相关专题:')
        # NOTE(review): as published, the search string is a single space;
        # the author may have intended a wider whitespace run - confirm.
        text = text.replace(' ', '\n ')
        # Collapse runs of blank lines down to a single blank line.
        while '\n\n\n' in text:
            text = text.replace('\n\n\n', '\n\n')
    # The text pane is kept disabled (read-only); enable it only to update.
    tText.config(state=tk.NORMAL)
    # Text indices are "line.column" strings and lines start at 1.
    tText.delete('1.0', tk.END)
    tText.insert(tk.END, title + '\n\n' + text)
    tText.config(state=tk.DISABLED)
def InitWindow(self, W, H):
    """Configure the main window, build its widgets and run the event loop.

    Args:
        self: The window object to configure (the script passes its
            ``tk.Tk`` root).
        W: Window width used in the geometry string.
        H: Window height used in the geometry string.
    """
    Y = self.winfo_screenheight()
    # x offset 8; y offset chosen so the window's bottom edge sits 100
    # units above the bottom of the screen.
    self.geometry(f'{W}x{H}+8+{Y - H - 100}')
    icoFile = 'favicon.ico'
    if path.exists(icoFile):
        # Apply the icon to the window passed in rather than the global
        # ``win``, so the function works for any window object.
        self.iconbitmap(icoFile)
    self.resizable(False, False)
    self.wm_attributes('-topmost', True)
    self.title(bTitle[0])
    SetControl()
    self.update()
    self.mainloop()
def SetControl():
    """Create and place every widget inside the global window ``win``.

    Publishes the list box and the text pane as the globals ``tList`` and
    ``tText`` so that the other functions can reach them.
    """
    global tList, tText

    # Headline list box with its vertical scrollbar.
    tScroll = tk.Scrollbar(win, orient=tk.VERTICAL)
    tScroll.place(x=450, y=320, height=300)
    tList = tk.Listbox(win, selectmode=tk.BROWSE, yscrollcommand=tScroll.set)
    tScroll.config(command=tList.yview)
    for idx, item in enumerate(news):
        tList.insert(tk.END, f'{idx+1:03} {item[0]}')
    tList.place(x=15, y=320, width=435, height=300)
    tList.select_set(0)
    tList.focus()

    bW, bH = 70, 35   # button width and height
    bX, bY = 95, 270  # position of the first button
    # Three channel buttons laid out left to right, 100 apart; captions
    # come from bTitle[1..3].
    for n, handler in enumerate((NewsList1, NewsList2, NewsList3)):
        button = tk.Button(win, text=bTitle[n + 1], command=handler)
        button.place(x=bX + 100 * n, y=bY, width=bW, height=bH)

    # Article text pane with its vertical scrollbar.
    tScroll2 = tk.Scrollbar(win, orient=tk.VERTICAL)
    tScroll2.place(x=450, y=10, height=240)
    tText = tk.Text(win, yscrollcommand=tScroll2.set)
    tScroll2.config(command=tText.yview)
    tText.place(x=15, y=10, width=435, height=240)
    tText.config(state=tk.DISABLED, bg='azure', font=('宋体', '14'))

    # With no headlines, NewsShow(0) would fail on news[0].
    if news:
        NewsShow(0)
    tList.bind("<Double-Button-1>", NewsShow)
if __name__ == '__main__':
    win = tk.Tk()
    # bTitle[0] is the window title; bTitle[1..3] are the button captions.
    bTitle = ('今日新闻', '国际新闻', '国内新闻', '军事新闻')
    # Initial headline list, fetched with the default channel.
    news = SinaNews()
    InitWindow(win, 480, 640)
|
奉上全部代码,在此就不作详细分析了,如有需要请留言讨论。我的使用环境 Win7+Python3.8.8 下可以无错运行!文中涉及网站名称已打上马赛克,猜不出名字的可以私下里问我。
软件编译
使用pyinstaller.exe编译成单个运行文件,注意源码文件的后缀名应该用.pyw否则会有cmd黑窗口出现。还有一个小知识点,任意网站的Logo图标icon文件,一般都能在根目录里下载到,即:
http(s)://websiteurl.com(.cn)/favicon.ico
编译命令如下:
D:\>pyinstaller --onefile --noconsole --icon="D:\favicon.ico" News.pyw
编译完成后,在dist文件夹下生成一个News.exe可执行文件,大小约15M还能接受。
反正拿走就能直接用
以上就是Python小程序爬取今日新闻拿走就能用的详细内容,更多关于Python小程序的资料请关注服务器之家其它相关文章!
原文链接:https://blog.csdn.net/boysoft2002/article/details/120549021