[Pinned] Python 3.x crawler in practice: targeted data scraping from a website

Date: 2020-12-10 18:34:37
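
The script below drives Chrome with Selenium: it searches the target site for a keyword, pages through the result list, opens each supplier's detail page, scrapes the contact information with BeautifulSoup, and saves each record to a local MongoDB. It needs the selenium, beautifulsoup4 and pymongo packages, plus a chromedriver matching the installed Chrome.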

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# requires a chromedriver on PATH
b = webdriver.Chrome()
wait = WebDriverWait(b, 10)

KEY_WORD = "建筑"  # search keyword: "construction"
URL = "https://www.XXXXXXXXX.com"

MONGO_URL = 'localhost'
MONGO_DB = 'albbs'
MONGO_TABLE = 'supplier'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
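
# Overall flow: search() submits the keyword and returns the pagination text,
# get_url() collects the detail-page links on the current results page,
# get_information_url() scrapes each detail page, save_to_mongo() persists it,
# and next_page() walks through the remaining result pages.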

def search():
    """Open the home page, switch the search type, submit the keyword,
    and return the text of the pagination summary."""
    try:
        b.get(URL)
        # open the search-type dropdown and pick its second entry
        ul = b.find_element(By.CSS_SELECTOR,
                            "#masthead > div.ali-search.fd-right > div.searchtypeContainer > ul")
        ul.click()
        b.find_element(By.XPATH, "//*[@id='masthead']/div[2]/div[1]/ul/li[2]").click()
        input_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#alisearch-keywords")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#alisearch-submit")))
        input_box.send_keys(KEY_WORD)
        submit.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sw_mod_pagination_form > div > span")))
        get_url()
        time.sleep(2)
        return total.text
    except TimeoutException:
        # retry the whole search on timeout
        return search()
def next_page(page_number):
    """Jump to the given results page via the page-number input, then scrape it."""
    try:
        input_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jumpto")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#jump-sub")))
        input_box.clear()
        input_box.send_keys(str(page_number))
        submit.click()
        time.sleep(5)
        # confirm the highlighted page number matches the one requested
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#sw_mod_pagination_content > div > span.page-cur"), str(page_number)))
        get_url()
        time.sleep(8)  # crude rate limiting between pages
    except TimeoutException:
        next_page(page_number)
def get_url():
    """Collect every detail-page link on the current results page and visit it."""
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sw_mod_searchlist")))
    html = b.page_source
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all("a", class_="list-item-title-text")
    for link in links:
        try:
            get_information_url(link.attrs['href'])
        except Exception:
            continue
def save_to_mongo(result):
    """Insert one scraped record into MongoDB."""
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('存储成功')  # stored successfully
    except Exception:
        print('存储异常')  # storage error


def get_information_url(url):
    """Open a supplier detail page, click the "联系方式" (contact info) tab,
    and scrape the company name, contact person, mobile number and address."""
    try:
        b.get(url)
        contactinfo = b.find_element(By.LINK_TEXT, "联系方式")
        contactinfo.click()
        time.sleep(1)
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#site_content > div.grid-main > div > div > div > div.m-content")))
        html = b.page_source
        soup = BeautifulSoup(html, 'html.parser')
        company_name = soup.find("h4").text
        contact_name = soup.find("a", class_="membername").text
        mobile_phone = soup.find("dl", class_="m-mobilephone").text
        # keep the number as a string so no digits are lost
        mobile_phone = re.search(r'\d+', mobile_phone).group()
        addr = soup.find("dd", class_="address").text
        print(company_name)
        print(contact_name)
        print(mobile_phone)
        print(addr)
        # keys: company name, contact person, mobile number, address
        contactinfos = {"公司名": company_name, "联系人": contact_name,
                        "手机号码": mobile_phone, "地址": addr}
        save_to_mongo(contactinfos)
        time.sleep(8)
    except TimeoutException:
        get_information_url(url)

def main():
    total = search()
    # extract the page count from the pagination text
    total = int(re.search(r'(\d+)', total).group(1))
    print(total)
    # page 1 was already scraped inside search(); continue from page 2
    for i in range(2, total + 1):
        next_page(i)


if __name__ == '__main__':
    main()
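
To sanity-check what was stored, a minimal sketch like the following works, assuming the same local MongoDB instance and the database/collection names used above:

import pymongo

client = pymongo.MongoClient('localhost')
# print the first few scraped supplier records
for doc in client['albbs']['supplier'].find().limit(5):
    print(doc)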

These are my own study notes.