Libraries for parsing URLs:
Python 2:
from urlparse import urlparse
import urllib
Python 3:
from urllib.parse import urlparse
import urllib.request
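If one script has to run under both interpreters, the two import styles can be bridged with a try/except; a minimal sketch (only urlparse and unquote are actually needed in the code below):

try:
    # Python 2
    from urlparse import urlparse
    from urllib import unquote
except ImportError:
    # Python 3
    from urllib.parse import urlparse, unquote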
Studying the URL patterns of different sites shows: whenever the search keyword is grafted on with "=", the keyword ends up in the query part of the parsed URL;
if it is not grafted with "=", the keyword ends up in the path part.
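A quick urlparse check illustrates the difference; the two URLs are shortened versions of entries from the url.txt sample further down:

from urlparse import urlparse  # urllib.parse.urlparse in Python 3

# Keyword grafted with "=" (Baidu's wd=): it lands in .query
u1 = urlparse("https://www.baidu.com/s?ie=utf-8&wd=python&rn=")
print(u1.path)    # /s
print(u1.query)   # ie=utf-8&wd=python&rn=

# Keyword not grafted with "=" (Lagou's list_): it lands in .path
u2 = urlparse("https://www.lagou.com/jobs/list_python?labelwords=&fromsearch=true")
print(u2.path)    # /jobs/list_python
print(u2.query)   # labelwords=&fromsearch=true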
The parsing rule is the same for every site; the regular expression used in the code is a combination of six different cases, demonstrated right below.
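The six alternatives cover every way plain \w characters and %xx escapes can combine, ordered so the longer forms are tried first; a small demonstration (the sample strings are cut down from the url.txt entries below):

import re

regular = r'(\w+(%\w\w)+\w+|(%\w\w)+\w+(%\w\w)+|\w+(%\w\w)+|(%\w\w)+\w+|(%\w\w)+|\w+)'

samples = [
    ("wd=", "ie=utf-8&wd=python&rn="),                        # plain word
    ("wd=", "wd=python%e9%87%8c%e7%9a%84%e5%ad%97dict&oq="),  # word + escapes + word
    ("kw=", "ie=utf-8&kw=%e7%ba%a2%e6%b5%b7&fr=search"),      # escapes only
]
for flag, query in samples:
    m = re.search(flag + regular, query, re.I)
    print(m.group(0).split(flag)[1])
# -> python
# -> python%e9%87%8c%e7%9a%84%e5%ad%97dict
# -> %e7%ba%a2%e6%b5%b7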
In addition, URLs whose host is 's.weibo.com' are encoded differently from the others and need separate handling.
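On s.weibo.com the keyword in the path is percent-encoded twice (the % of each escape is itself escaped as %25), so one layer has to be stripped before unquoting; for example (path taken from the url.txt sample below):

import urllib  # Python 2; use urllib.parse.unquote in Python 3

path = "/weibo/%25e5%2594%2590%25e4%25ba%25ba"  # doubly encoded
once = path.replace("%25", "%")                 # -> /weibo/%e5%94%90%e4%ba%ba
print(urllib.unquote(once).split("weibo/")[1])  # -> 唐人 (UTF-8 bytes, rendered by the terminal)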
The code follows. Some sites' rules are still unclear and take a lot of time to work out; the clearer the rule, the cleaner the extracted keyword. The rules below already fit the great majority of sites; adapt them as needed.
# -*- coding:utf-8 -*-
from urlparse import urlparse
import urllib
import re

# File holding the URLs to analyse, one per line
source_txt = "e:\\python_anaconda_code\\url.txt"
# Regex: the six combinations of plain characters and %xx escapes
regular = r'(\w+(%\w\w)+\w+|(%\w\w)+\w+(%\w\w)+|\w+(%\w\w)+|(%\w\w)+\w+|(%\w\w)+|\w+)'
# Collected keywords
kw_list = list()
# key: host of the site to analyse; value: the marker grafted onto the keyword
host_flags = {
    "www.baidu.com": "wd=",
    "news.baidu.com": "word=",
    "www.sogou.com": "query=",
    "tieba.baidu.com": "kw=",
    "wenku.baidu.com": "word=",
    "music.sina.com.cn": "k=",
    "www.haosou.com": "q=",
    "www.lagou.com": "list_",
    "www.chunyuyisheng.com": "query=",
    "s.weibo.com": "weibo/"
}

def main():
    with open(source_txt, 'r') as f_source_txt:
        for url in f_source_txt:
            host = url.split("//")[1].split("/")[0]
            if host in host_flags:
                flag = host_flags[host]
                if flag.find("=") != -1:
                    # Marker with "=": the keyword sits in the query string
                    query = urlparse(url).query.replace('+', '')
                    kw = re.search(flag + regular, query, re.I)
                    if kw:
                        kw = urllib.unquote(kw.group(0).split(flag)[1])
                        kw_list.append(kw)
                        print(kw)
                else:
                    # Marker without "=": the keyword sits in the path;
                    # s.weibo.com is doubly encoded, so turn %25 back into %
                    path = urlparse(url).path.replace('+', '')
                    kw = re.search(flag + regular, path.replace("%25", "%"), re.I)
                    if kw:
                        kw = urllib.unquote(kw.group(0).split(flag)[1])
                        kw_list.append(kw)
                        print(kw)

if __name__ == '__main__':
    main()
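The script above is Python 2. A minimal Python 3 port (a sketch, not run against the original environment; same rules and file path):

# -*- coding:utf-8 -*-
from urllib.parse import urlparse, unquote
import re

source_txt = "e:\\python_anaconda_code\\url.txt"
regular = r'(\w+(%\w\w)+\w+|(%\w\w)+\w+(%\w\w)+|\w+(%\w\w)+|(%\w\w)+\w+|(%\w\w)+|\w+)'
host_flags = {
    "www.baidu.com": "wd=",
    "news.baidu.com": "word=",
    "www.sogou.com": "query=",
    "tieba.baidu.com": "kw=",
    "wenku.baidu.com": "word=",
    "music.sina.com.cn": "k=",
    "www.haosou.com": "q=",
    "www.lagou.com": "list_",
    "www.chunyuyisheng.com": "query=",
    "s.weibo.com": "weibo/",
}

def main():
    with open(source_txt, 'r', encoding='utf-8') as f:
        for url in f:
            flag = host_flags.get(url.split("//")[1].split("/")[0])
            if flag is None:
                continue
            # "=" markers point into the query string, the rest into the path
            if "=" in flag:
                part = urlparse(url).query
            else:
                part = urlparse(url).path.replace("%25", "%")
            kw = re.search(flag + regular, part.replace('+', ''), re.I)
            if kw:
                print(unquote(kw.group(0).split(flag)[1]))

if __name__ == '__main__':
    main()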
The content of url.txt is as follows:
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&ch=&tn=baidu&bar=&wd=python&rn=&oq=&rsv_pq=ece0867c0002c793&rsv_t=edeaqq7ddvznxq%2fzvra5k%2beuanltiuxhgihvutaqdfoecluxr25xkdp%2bi0i&rqlang=cn&rsv_enter=1&inputt=218
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python%e9%87%8c%e7%9a%84%e5%ad%97%e5%85%b8dict&oq=python&rsv_pq=96c160e70003f332&rsv_t=0880nkovmir3tvoddp1t8eblod8qwr4yep6cfpjqihqnnhdexfuwyofmrx0&rqlang=cn&rsv_enter=0&inputt=10411
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python%e9%87%8c%e7%9a%84urlprese&oq=python%25e9%2587%258c%25e7%259a%2584re%25e9%2587%258c%25e7%259a%2584%257c%25e6%2580%258e%25e4%25b9%2588%25e7%2594%25a8&rsv_pq=d1d4e7b90003d391&rsv_t=5ff4vok4eelk1pgj4osk8l0vvkan51%2bl8ns%2fjsubexg7lb7znkctvnvtn8m&rqlang=cn&rsv_enter=1&inputt=2797
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python++wo+%e7%88%b1urlprese&oq=python%25e9%2587%258c%25e7%259a%2584urlprese&rsv_pq=eecf45e900033e87&rsv_t=1c70xayhrvw5joza7lpvgt4pw%2bw1to8hqtejth67jgeqfqagydydd25hamu&rqlang=cn&rsv_enter=0&inputt=10884
http://news.baidu.com/ns?word=%e8%b6%b3%e7%90%83&tn=news&from=news&cl=2&rn=20&ct=1
http://news.baidu.com/ns?ct=1&rn=20&ie=utf-8&bs=%e8%b6%b3%e7%90%83&rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=++++++%e8%b6%b3++%e7%90%83+++++%e4%bd%a0%e5%a5%bd+%e5%98%9b%ef%bc%9f&rsv_sug3=14&rsv_sug4=912&rsv_sug1=4&inputt=8526
http://tieba.baidu.com/f?ie=utf-8&kw=%e7%ba%a2%e6%b5%b7%e8%a1%8c%e5%8a%a8&fr=search&red_tag=q0224393377
https://www.sogou.com/web?query=ni+zai+%e6%88%91+%e5%bf%83li&_asf=www.sogou.com&_ast=1520388441&w=01019900&p=40040100&ie=utf8&from=index-nologin&s_from=index&sut=9493&sst0=1520388440692&lkt=8%2c1520388431200%2c1520388436842&sugsuv=1498714959961744&sugtime=1520388440692
https://www.lagou.com/jobs/list_python%e5%a4%a7%e6%95%b0%e6%8d%aemr?labelwords=&fromsearch=true&suginput=
https://www.chunyuyisheng.com/pc/search/?query=%e6%85%a2%e6%80%a7%e4%b9%99%e8%82%9d%
http://s.weibo.com/weibo/%25e5%2594%2590%25e4%25ba%25ba%25e8%25a1%2597%25e6%258e%25a2%25e6%25a1%25882&refer=index
http://s.weibo.com/weibo/%25e4%25bd%25a0%25e5%25a5%25bd123mm%2520%25e5%2597%25af%2520mm11&refer=stopic_box
The results are as follows:
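Working the first two url.txt lines through by hand (decoded manually, not captured from a run), the script prints:

python
python里的字典dict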
If you want to study other hosts, just add them to the host_flags dictionary.
Note: the code and approach above are for reference only; if you have a better method, please leave a comment!
Original article: https://blog.csdn.net/IBoyMan/article/details/79482827