Python crawler (2. Getting a page's internal and external links)

Date: 2024-10-09 10:52:13
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed the random number generator with the current time
random.seed(datetime.datetime.now().timestamp())

# Get the page's internal links
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Internal links either start with "/" or contain the current domain
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs["href"] is not None:
            if link.attrs["href"] not in internalLinks:
                internalLinks.append(link.attrs["href"])
    return internalLinks

# Get the page's external links: hrefs matching "^(http|www)((?!"+excludeUrl+").)*$",
# i.e. links that start with http/www and do not contain the current URL
def getExternalUrl(bsObj, excludeUrl):
    externalUrl = []
    for link in bsObj.findAll("a", href=re.compile("^(www|http)((?!" + excludeUrl + ").)*$")):
        if link.attrs["href"] is not None:
            if link.attrs["href"] not in externalUrl:
                externalUrl.append(link.attrs["href"])
    return externalUrl

# Pick an external link from the starting page; if there is none, follow a random
# internal link on that page and look for an external link there instead
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalUrl(bsObj, startingPage)
    if len(externalLinks) == 0:
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        print("No external links, looking around the site for one")
        internalLinks = getInternalLinks(bsObj, domain)
        page = internalLinks[random.randint(0, len(internalLinks) - 1)]
        # Relative links ("/path") need the domain prepended before they can be opened
        if page.startswith("/"):
            page = domain + page
        return getRandomExternalLink(page)
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("The random external page is: " + externalLink)
    followExternalOnly(externalLink)
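As a minimal usage sketch (the seed URL below is only a placeholder, not part of the original post), the crawl can be started by calling followExternalOnly with any page. Note that the function recurses without a termination condition, so it keeps hopping between external links until you interrupt it or a request fails.

if __name__ == "__main__":
    # Placeholder starting page; replace with whatever site you want to crawl.
    # Stop the run with Ctrl+C, or add your own depth limit when experimenting.
    followExternalOnly("http://example.com")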