Building a self-hosted yum mirror

Date: 2021-08-28 19:40:35
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import sys, logging, traceback
from bs4 import BeautifulSoup as bsp

# recursion-depth counters and the collected download URLs
num = 0
num_2 = 0
data_url = []
tmp = []

def dg(url, url_head, url_tail, num, centos_ver):
    global tmp, num_3
    r = requests.get("%s%s/%s" % (url, url_head, url_tail))
    html = r.content
    soup = bsp(html, 'html.parser')
    # print(soup.prettify())  # dump the whole page for debugging
    try:
        # soup's own title formatting mangles the text, so strip the tags by
        # hand to recover the directory path of the current page
        dg_url_head = str(soup.title).replace('<title>', '').replace('</title>', '').split()[-1]
        for i in soup.find_all('a'):
            # at the top level, read the CentOS version out of the link name
            try:
                # if the float() cast fails, the entry is not a version number
                if not i.get('href').endswith('../') and num == 0 and float(i.get('href').split('/')[0].split()[0]) < centos_ver:
                    # print(int(i.get('href').split('/')[0]))  # version being skipped
                    continue
            except:
                # checking for '.' alone is not reliable, since some file
                # names in the repo contain dots; this except is the fallback
                pass
            try:
                if not i.get('href').endswith('../') and num == 0:
                    if float(i.get('href').split('/')[0].split()[0]) >= centos_ver:
                        # log which version is currently being crawled
                        error_log = "%s%s/%s" % (url, url_head, i.get('href').split('/')[0])
                        logging.info(error_log)
            except:
                error_log = "%s%s/%s" % (url, url_head, i.get('href').split('/')[0])
                logging.info(error_log)
            if not i.get('href').endswith('../') and i.get('href').endswith('/'):
                # the link is a subdirectory: count one recursion level and
                # descend into it
                num += 1
                dg(url, dg_url_head, i.get('href'), num, centos_ver)
                num_3 = num_2  # snapshot the directory-level counter
                # returning from the call means leaving that directory, so
                # the depth counter goes back down by one
                num -= 1
            elif not i.get('href').endswith('../'):
                # the link is a file: record its full download URL
                data_url.append("%s%s%s" % (url, dg_url_head, i.get('href')))
    except:
        traceback.print_exc()
        sys.exit(0)

def start(file, url, url_head, url_tail, centos_ver):
    dg(url, url_head, url_tail, num, centos_ver)
    output = open(file, 'w')
    output.write('\n'.join(data_url))
    output.close()
    return 'ok'

Save the code above as dg.py.
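For reference, a minimal standalone run of dg.py looks like this, using the same arguments that dg_download.py passes in below; it writes every file URL for CentOS 7 and later on the Aliyun mirror into all_list.txt:

import dg

# crawl https://mirrors.aliyun.com/centos/ starting at version 7 and
# write the collected download URLs, one per line, into all_list.txt
dg.start('all_list.txt', 'https://mirrors.aliyun.com', '/centos', '', 7)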

#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib, sys, json, shutil
import os, requests, re, time
import dg, logging, traceback
from multiprocessing import Process, Pool

date_ymd = time.strftime("%Y-%m-%d", time.localtime())

def date_time():
    return time.strftime("%Y-%m-%dT%H-%M-%S", time.localtime())

# where downloaded files are stored
file_path = '/data/wwwroot/yum/centos'
# where state and log files are stored
file_dir = '.'
file_dir_log = "./log"
if not os.path.exists(file_dir):
    os.makedirs(file_dir)
if not os.path.exists(file_dir_log):
    os.makedirs(file_dir_log)
if not os.path.exists(file_path):
    os.makedirs(file_path)
download_log_name = "%s/download_log_%s.log" % (file_dir_log, date_ymd)
# lock file that marks a download in progress
download_Record_name = "%s/download_Record.lock" % file_dir
# download list produced by dg.py
network_list = "%s/all_list.txt" % file_dir
# number of worker processes
process_num = 6
# mirror that dg crawls
dg_url = 'https://mirrors.aliyun.com'
# top-level directory under dg_url
dg_url_head = '/centos'
# file part of the start URL
dg_url_tail = ''
# first CentOS version to mirror
dg_centos_ver = 7

# full debug log goes to a separate file
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s',
                    datefmt='[%Y-%m-%d %H:%M:%S]',
                    filename="%s_debug" % (download_log_name),
                    filemode='a')
# a StreamHandler that prints INFO-level and higher records to stderr,
# attached to the root logger
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('[%(asctime)s] %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
# and a copy of the same records into the log file
file_handler = logging.FileHandler(download_log_name)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logging.getLogger('').addHandler(file_handler)
def date_def():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

def Schedule(a, b, c):
    '''
    a: number of blocks downloaded so far
    b: block size
    c: total size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    logging.debug('%.2f%%' % per)

def file_add_del(filename, data):
    output = open(filename, 'w')
    output.write(data)
    output.close()

def file_log(filename, data):
    output = open(filename, 'a')
    output.write(data)
    output.close()
def url_down(url_n, num):
    # num is a serial number used as the log key; retries are capped at 3.
    # delete() passes '' for num, so guard the int() cast.
    num = int(num) if num else 0
    error_num = 0
    error_log = ''
    path_size = 0
    if url_n != '':
        url = url_n.replace('\n', '').replace('\r', '')
        r = requests.get(url)
        # remote file size, taken from the response headers
        size = r.headers['Content-Length']
        dir = url.split('/')
        file = dir[-1]
        del dir[0:4]
        del dir[-1]
        dir = '/'.join(dir)
        logging.debug(url)
        while True:
            # if the file already exists, download it again from scratch
            if os.path.exists('%s/%s/%s' % (file_path, dir, file)):
                os.remove('%s/%s/%s' % (file_path, dir, file))
            # make sure the target directory exists
            if not os.path.exists('%s/%s' % (file_path, dir)):
                os.makedirs('%s/%s' % (file_path, dir))
            url_date = date_time()
            # urllib.urlretrieve(url, '%s/%s/%s_%s' % (file_path, dir, file, url_date), Schedule)
            # download to a temporary name at a capped rate; -O writes to the
            # given path and .read() blocks until wget exits
            os.popen("wget --limit-rate=200k -O %s/%s/%s_%s %s" % (file_path, dir, file, url_date, url)).read()
            if os.path.exists('%s/%s/%s_%s' % (file_path, dir, file, url_date)):
                shutil.move('%s/%s/%s_%s' % (file_path, dir, file, url_date), '%s/%s/%s' % (file_path, dir, file))
            # if the file landed, compare its size against Content-Length
            if os.path.exists('%s/%s/%s' % (file_path, dir, file)):
                path_size = os.path.getsize('%s/%s/%s' % (file_path, dir, file))
                if float(path_size) == float(size):
                    error_log = json.dumps({num: {"status": "ok", "url": url, "down_size": path_size, "list_size": size.replace('\n', '').replace('\r', ''), "num": error_num, "time": date_def()}})
                    logging.info(error_log)
                    break
                else:
                    # size mismatch: retry, give up after 3 attempts
                    if error_num > 2:
                        error_log = json.dumps({num: {"status": "error", "url": url, "down_size": path_size, "list_size": size.replace('\n', '').replace('\r', ''), "num": error_num, "time": date_def()}})
                        logging.info(error_log)
                        break
                    error_num += 1
            # the file is missing after the download: retry
            else:
                if error_num > 2:
                    error_log = json.dumps({num: {"status": "error", "url": url, "down_size": path_size, "list_size": size.replace('\n', '').replace('\r', ''), "num": error_num, "time": date_def()}})
                    logging.error(error_log)
                    break
                error_num += 1
        return error_log
# map every file already on disk back to the mirror URL it came from
def dg_Local_files_and_network_files(path):
    file_list = []
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            file_list.append(os.path.join(root, name).replace(path, "https://mirrors.aliyun.com/centos").replace("\\", "/"))
    return file_list

# verify the local tree against the download list and reconcile differences
def delete():
    while True:
        data = dg_Local_files_and_network_files(file_path)
        network = open(network_list).read().split('\n')
        new_network = []
        new_data = []
        for i in network:
            if i != '':
                new_network.append(i)
        for i in data:
            if i != '':
                new_data.append(i)
        delete_data = list(set(new_data) - set(new_network))
        add = list(set(new_network) - set(new_data))
        if not os.listdir(file_path):
            logging.info("removing empty directory %s" % file_path)
        if len(add) == 0 and len(delete_data) == 0:
            logging.info("verification passed: local tree matches the list")
            break
        elif len(add) != 0:
            for i in add:
                if i != '':
                    logging.info("downloading missing file %s" % i)
                    url_down(i, "")
        elif len(delete_data) != 0:
            for i in delete_data:
                if i != '':
                    i = i.replace("https://mirrors.aliyun.com/centos/", "%s/" % file_path)
                    logging.info("removing extra file %s" % i)
                    os.remove(i)
if __name__ == '__main__':
    while True:
        try:
            num = 1
            exit = 0
            if os.path.exists(download_Record_name):
                # the lock file is still present: the previous run did not
                # finish, so resume its download
                logging.info("unfinished download detected, resuming it")
                dg_Local_files_and_network_files(file_path)
                logging.info("starting download")
                ##########################################################
                mainStart = time.time()
                num = 0
                p = Pool(process_num)
                network_urls = open(network_list).read().split('\n')
                load_list = dg_Local_files_and_network_files(file_path)
                for url_n in list(set(network_urls) - set(load_list)):
                    num += 1
                    # hand the URL to a worker process
                    p.apply_async(url_down, args=(url_n, str(num),))
                logging.info('waiting for all worker processes to finish...')
                p.close()
                p.join()
                mainEnd = time.time()
                logging.info('all processes ran for %s seconds.' % (mainEnd - mainStart))
                # write a separator into the log once the batch is done
                file_log(download_log_name, "#" * 100)
                logging.info("download finished")
                logging.info("starting verification")
                delete()
                # the download completed, so clear the progress lock
                os.remove(download_Record_name)
                ##########################################################
            else:
                # first run: generate the download list if it is not there yet
                if not os.path.exists(network_list):
                    logging.info("running dg.py")
                    dg_po = dg.start(network_list, dg_url, dg_url_head, dg_url_tail, dg_centos_ver)
                    if 'ok' not in dg_po:
                        logging.error("dg run failed")
                    else:
                        file_add_del(download_Record_name, '')
                else:
                    logging.info("dg.py has already been run")
                # if the list file already existed this is not the first
                # download, so diff it against the local tree to see whether
                # anything needs updating
                network_urls = open(network_list).read().split('\n')
                load_list = dg_Local_files_and_network_files(file_path)
                if len(list(set(network_urls) - set(load_list))) == 0:
                    logging.info("nothing to update")
                    if os.path.exists(download_Record_name):
                        os.remove(download_Record_name)
                    exit = 1
                    sys.exit(0)
                # start downloading
                if num == 1:
                    logging.info("starting download")
                    file_add_del(download_Record_name, "")
                ##########################################################
                mainStart = time.time()
                num = 0
                p = Pool(process_num)
                network_urls = open(network_list).read().split('\n')
                load_list = dg_Local_files_and_network_files(file_path)
                for url_n in list(set(network_urls) - set(load_list)):
                    num += 1
                    # hand the URL to a worker process
                    p.apply_async(url_down, args=(url_n, str(num),))
                logging.info('waiting for all worker processes to finish...')
                p.close()
                p.join()
                mainEnd = time.time()
                logging.info('all processes ran for %0.2f seconds.' % (mainEnd - mainStart))
                logging.info("download finished")
                logging.info("starting verification")
                delete()
                # the download completed, so clear the progress lock
                os.remove(download_Record_name)
                # write a separator into the log once the batch is done
                file_log(download_log_name, "#" * 100)
                ##########################################################
            # remove the lock if a branch above left it behind, then stop
            if os.path.exists(download_Record_name):
                os.remove(download_Record_name)
            logging.info("finished")
            break
        except Exception:
            # Exception (rather than a bare except) lets the sys.exit(0)
            # above actually terminate the program
            if exit == 0:
                if not os.path.exists(download_Record_name) and os.path.exists(network_list):
                    logging.info("dg.py failed, removing the list file it just generated:")
                    os.remove(network_list)
                    logging.info(network_list)
                logging.error('\n%s' % traceback.format_exc())

Save this as dg_download.py.

Run dg_download.py to start crawling; the CentOS version being mirrored can be changed inside the script.
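For example, to skip CentOS 7 and mirror only newer releases, change the version threshold near the top of dg_download.py; dg() compares each top-level directory name against it and skips anything older:

# only versions >= this value are crawled and mirrored
dg_centos_ver = 8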

dg.py crawls the mirror's directory tree; dg_download.py decides whether an update is needed and downloads the files.
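The update check itself is just a set difference between the crawled list and the files already on disk. A self-contained sketch of the idea, with paths assumed from the scripts above (local_urls is an illustrative stand-in for dg_Local_files_and_network_files):

import os

def local_urls(path, prefix='https://mirrors.aliyun.com/centos'):
    # map every file on disk back to the mirror URL it came from, the same
    # way dg_Local_files_and_network_files() does in dg_download.py
    urls = []
    for root, dirs, files in os.walk(path):
        for name in files:
            urls.append(os.path.join(root, name).replace(path, prefix).replace('\\', '/'))
    return urls

wanted = set(open('all_list.txt').read().split('\n')) - {''}
missing = wanted - set(local_urls('/data/wwwroot/yum/centos'))
# when `missing` is empty the mirror is up to date; otherwise these URLs
# still need to be downloaded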

Once the crawl finishes, all that remains is to put a web service in front of the directory to publish the mirror.
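For a quick test, Python's built-in HTTP server is enough to expose the tree; a minimal sketch, assuming Python 3 (in production nginx or Apache would normally serve this directory instead):

import os
from http.server import HTTPServer, SimpleHTTPRequestHandler

os.chdir('/data/wwwroot/yum')  # parent of the mirrored centos/ tree
HTTPServer(('0.0.0.0', 8080), SimpleHTTPRequestHandler).serve_forever()

Clients can then point the baseurl of a .repo file at the server, for example http://<server>:8080/centos/7/os/x86_64/.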