读取文件中列出的网址，解码网页内容并保存

时间:2022-09-06 15:31:56
 1 import urllib.request
 2 import chardet
 3 
 4 def main():
 5     count = 1
 6     with open('D:\\urls.txt') as f:
 7         while True:    
 8             url = f.readline()
 9             if url == '':
10                 break
11             
12             url_content = urllib.request.urlopen(url).read()
13             
14             #获取网页编码
15             encode = chardet.detect(url_content)['encoding']
16             if encode == 'GB2312':
17                 encode = 'GBK'
18             #解码
19             url_content = url_content.decode(encode)
20 
21             #文件名称
22             file_name = 'D:\\url_%d.txt' % count
23 
24             #写入内容
25             with open(file_name,'a',encoding = encode) as g:                
26                 g.write(url_content)
27                 
28             count += 1
29             
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()