爬取熊猫TV,javascript,selenium,模拟点击

时间:2023-11-10 12:25:14
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv

# File that save_to_csv() writes and read_from_csv() reads back.
_CSV_PATH = 'live_rooms_number.csv'

# NOTE(review): the original lookup was called without any locator, so the
# real XPath of the "next page" link cannot be recovered from this file.
# The value below is an assumption -- confirm it against the page markup.
_NEXT_PAGE_XPATH = '//div[@class="page-component"]/a[@class="j-page-next"]'


def get_pages_numger(browser):
    """Return the page count shown in the pagination bar.

    Reads the text of the 7th link inside the ``page-component`` div and
    converts it to ``int``.

    Raises:
        NoSuchElementException: if the pagination link is not present.
        ValueError: if the link text is not an integer.
    """
    # find_element (singular) returns one element; the plural form returns
    # a list, which has no ``.text`` attribute.
    link = browser.find_element(
        By.XPATH, '//div[@class="page-component"]/a[7]')
    return int(link.text)


def get_next_page_buttun(browser, xpath=_NEXT_PAGE_XPATH):
    """Return the element that navigates to the next page.

    Args:
        browser: a Selenium WebDriver positioned on the listing page.
        xpath: locator of the "next page" control (see the review note on
            ``_NEXT_PAGE_XPATH``; the default is unverified).
    """
    return browser.find_element(By.XPATH, xpath)


def get_rooms_number_in_a_page(browser):
    """Collect one ``[id, nickname, cate, number]`` row per room on the page.

    A room is any ``<li>`` carrying a ``data-id`` attribute. The three text
    fields are read from spans nested inside that ``<li>``.
    """
    res = []
    for li in browser.find_elements(By.XPATH, '//li[@data-id]'):
        room_id = li.get_attribute("data-id")
        # The leading "." keeps the search inside this <li>; a bare "//"
        # would search the whole document and return the same first match
        # for every row.
        nickname = li.find_element(
            By.XPATH, './/span[@class="video-nickname"]').text
        number = li.find_element(
            By.XPATH, './/span[@class="video-number"]').text  # viewer count
        cate = li.find_element(
            By.XPATH, './/span[@class="video-cate"]').text
        res.append([room_id, nickname, cate, number])
    return res


def get_rooms_number_in_all_pages(browser, pages_number):
    """Scrape ``pages_number`` consecutive pages and return all rows.

    The browser window is closed before returning, including when scraping
    fails part-way through.
    """
    res = []
    try:
        for i in range(pages_number):
            print('第{}页'.format(i + 1))
            # Scrape the page that is currently displayed.
            res.extend(get_rooms_number_in_a_page(browser))
            # Move on to the next page; there is nothing to click after
            # the last one.
            if i + 1 < pages_number:
                get_next_page_buttun(browser).click()
    finally:
        browser.close()
    return res


def save_to_csv(rooms_number):
    """Write the rows to ``live_rooms_number.csv`` below a header line."""
    # newline='' lets the csv module control line endings itself; the
    # explicit encoding keeps non-ASCII nicknames portable across locales.
    with open(_CSV_PATH, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['id', 'nickname', 'cate', 'number'])  # header
        writer.writerows(rooms_number)


def read_from_csv():
    """Return the data rows of ``live_rooms_number.csv`` without the header."""
    with open(_CSV_PATH, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row
        return list(reader)


def get_rooms_number():
    """Open the listing page in Firefox and scrape every page of rooms."""
    browser = webdriver.Firefox()
    try:
        browser.get('http://www.panda.tv/all')
        assert '熊猫TV' in browser.title
        pages_number = get_pages_numger(browser)
    except Exception:
        # Do not leave a browser window behind when setup fails; on the
        # success path get_rooms_number_in_all_pages() closes it.
        browser.close()
        raise
    return get_rooms_number_in_all_pages(browser, pages_number)


if __name__ == '__main__':
    rooms_number = get_rooms_number()
    save_to_csv(rooms_number)
    # To reuse a previous run instead of scraping again:
    # rooms_number = read_from_csv()