wget has no built-in way to limit the size of a file it downloads, so a single very large file in the URL list can drag out the whole run. The script therefore uses curl to fetch the HTTP headers first, parses out the Content-Length to learn how big the file will be, and skips any file over a preset threshold.
The current bash version does nothing special for URLs that return no Content-Length header, so large downloads from such URLs still cannot be avoided. One way to extend it: keep polling the file's size while the download is in progress and kill the download process once it crosses the threshold, roughly as sketched below.
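A minimal sketch of that idea, assuming the same url, limitsize and outfolder variables used in the script further down; the temporary file name and the 2-second polling interval are arbitrary choices, not part of the current script:

# Sketch: abort a download that grows past $limitsize while in progress.
tmpfile="$outfolder/partial.tmp"    # hypothetical temporary file name
wget -q -t 3 --connect-timeout=10 --read-timeout=10 -O "$tmpfile" "$url" &
wgetpid=$!
while kill -0 "$wgetpid" 2>/dev/null    # loop while wget is still running
do
    sleep 2
    size=$(stat -c %s "$tmpfile" 2>/dev/null || echo 0)
    if [ "$size" -gt "$limitsize" ]
    then
        echo "$url exceeded $limitsize bytes while downloading, aborting."
        kill "$wgetpid"
        rm -f "$tmpfile"
        break
    fi
done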
So the current version still has room for improvement:
#!/bin/bash
# Download every URL in a list, skipping files whose Content-Length exceeds a limit.
if [ $# -eq 4 ]
then
    echo "start downloading..."
    urllist=$1
    limitsize=$2
    outfolder=$3
    logfolder=$4
    echo "url list file: $urllist"
    echo "limited file size: $limitsize bytes"
    echo "output folder: $outfolder"
    echo "log folder: $logfolder"
else
    echo "usage: ./download.sh <url list> <limited file size> <output folder> <log folder>"
    exit 1
fi

if [ -d "$outfolder" ]
then
    echo "$outfolder exists..."
else
    echo "make $outfolder..."
    mkdir -p "$outfolder"
fi

if [ -d "$logfolder" ]
then
    echo "$logfolder exists..."
else
    echo "make $logfolder..."
    mkdir -p "$logfolder"
fi

while read -r url
do
    echo "downloading: $url"
    # HEAD request only; strip the trailing CR from the header line.
    len=$(curl -I -s "$url" | grep -i '^Content-Length:' | cut -d' ' -f2 | tr -d '\r')
    if [ -n "$len" ]
    then
        echo "length: $len bytes"
        if [ "$len" -gt "$limitsize" ]
        then
            echo "$url is greater than $limitsize bytes, won't be downloaded."
        else
            echo "$url is not greater than $limitsize bytes, can be downloaded."
            # Build a log file name by stripping characters that are unsafe in file names.
            filename=$(echo "$url" | tr -d ':/?\|*<>')
            wget -P "$outfolder" -x -t 3 --save-headers --connect-timeout=10 --read-timeout=10 --level=1 "$url" -o "$logfolder/$filename.txt"
        fi
    else
        # No Content-Length header: size unknown, download anyway (known limitation).
        echo "$url file size is unknown."
        filename=$(echo "$url" | tr -d ':/?\|*<>')
        wget -P "$outfolder" -x -t 3 --save-headers --connect-timeout=10 --read-timeout=10 --level=1 "$url" -o "$logfolder/$filename.txt"
    fi
done < "$urllist"
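For reference, an invocation might look like this (the file and folder names are hypothetical; 10485760 bytes = 10 MB):

./download.sh urls.txt 10485760 ./downloads ./logs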