R语言爬虫 电影票房-艺恩网

时间:2024-02-16 10:43:42

一、爬取目的:
爬取的数据用于论文《大数据背景下我国电影票房预测研究》
数据来源:http://www.cbooo.cn/

二、思路解析:

  1. 爬取首页 电影名称+ID
  2. 拼接 http://www.cbooo.cn/m/ + ID 获取电影详情页
library(tidyverse)
library(httr)
library(jsonlite)
library(rlist)
library(plyr)
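## tidyverse: a bundle of commonly used R packages (8 core packages at the time of writing)
## httr: HTTP client, comparable to Python's requests library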

# Fetch the overview (listing) pages.
#
# Requests pages 1..n_pages of the getMdata_movie JSON endpoint, converts the
# first element of each decoded response into a data frame, and stacks all
# pages into `yien_data`. plyr::rbind.fill() is used for the final stacking
# because it tolerates data frames whose column sets differ.
overview_base_url <- "http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex="
overview_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
n_pages <- 395L

# Collect one data frame per page and bind once at the end instead of
# re-copying a growing data frame on every iteration.
overview_pages <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  # paste0() joins without a separator. The previous `seq = ""` (a typo for
  # `sep`) was treated as one more value to paste, which put spaces in the URL.
  url <- paste0(overview_base_url, i)

  Sys.sleep(0.5)  # throttle requests

  page <- tryCatch(
    {
      # user_agent() builds a request config object; passing a named
      # `user_agent = "..."` string to GET() does not do that.
      response <- GET(url, user_agent(overview_ua))
      stop_for_status(response)
      result <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
      as.data.frame(result[[1]])
    },
    error = function(e) {
      # Report and skip a failed page rather than aborting the whole crawl.
      warning("Overview page ", i, " skipped: ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )

  # Assigning NULL with `[[<-` would delete the list slot, so guard it.
  if (!is.null(page)) {
    overview_pages[[i]] <- page
  }
  print(paste("已完成", i))
}

# rbind.fill() accepts a list of data frames and drops NULL entries; it returns
# NULL when nothing is left, so fall back to an empty data frame in that case.
yien_data <- rbind.fill(overview_pages)
if (is.null(yien_data)) {
  yien_data <- data.frame()
}
# Fetch the detail pages.
#
# For every row of `yien_data`, downloads http://www.cbooo.cn/m/<ID>, extracts
# the text of the <p> nodes matched by `detail_selector`, splits each line at
# the first ":" into label/value, and stores the values in a fixed 9-column
# layout:
#   1, 2 : text before ":" of the first and second paragraph
#   3..8 : values for the labels listed in `detail_columns`
#   9    : value of any other paragraph (a later one overwrites an earlier one)
# The result is `details_data` (columns V1..V9), which is then column-bound to
# `yien_data` as `yien_newfile`.
detail_base_url <- "http://www.cbooo.cn/m/"
detail_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
detail_selector <- "#top > div:nth-child(3) > div.mainbox.fr > div > div.ziliaoku > div.ziliaofr > div.cont > p"

# Label text -> target column in the output.
detail_columns <- c(
  "类型" = 3L,
  "片长" = 4L,
  "上映时间" = 5L,
  "制式" = 6L,
  "国家及地区" = 7L,
  "发行公司" = 8L
)

# Preallocate one row per movie. Filling an empty data frame cell by cell
# fails when a column index would leave a hole (e.g. column 4 before column 3
# exists), and it leaves rows missing when a page yields nothing, which would
# then break the final cbind().
n_movies <- nrow(yien_data)
details_mat <- matrix(NA_character_, nrow = n_movies, ncol = 9L)

for (i in seq_len(n_movies)) {
  # NOTE(review): assumes the second column of yien_data holds the movie ID;
  # confirm against the JSON returned by the overview endpoint.
  movieID <- yien_data[i, 2]
  details_url <- paste0(detail_base_url, movieID)

  Sys.sleep(0.2)  # throttle requests

  movie_list <- tryCatch(
    {
      # rvest is not attached by library(tidyverse), so qualify its functions.
      session <- rvest::html_session(
        details_url,
        add_headers(`User-Agent` = detail_ua)
      )
      rvest::html_text(rvest::html_nodes(session, detail_selector))
    },
    error = function(e) {
      # Leave this movie's row as NA instead of aborting the whole crawl.
      warning("Detail page for ID ", movieID, " skipped: ",
              conditionMessage(e), call. = FALSE)
      character(0)
    }
  )

  # Strip line breaks and spaces before splitting label from value.
  movie_list <- gsub("\r\n", "", movie_list)
  movie_list <- gsub(" ", "", movie_list)
  # NOTE(review): splits on an ASCII colon; if the page uses a full-width
  # colon after the labels, this separator needs to be adjusted.
  movie_df <- str_split_fixed(movie_list, ":", 2)

  for (j in seq_len(nrow(movie_df))) {
    label <- movie_df[j, 1]
    if (j <= 2) {
      # The first two paragraphs are stored by their leading text.
      details_mat[i, j] <- label
    } else if (label %in% names(detail_columns)) {
      details_mat[i, detail_columns[[label]]] <- movie_df[j, 2]
    } else {
      details_mat[i, 9] <- movie_df[j, 2]
    }
  }
}

details_data <- as.data.frame(details_mat, stringsAsFactors = FALSE)

yien_newfile <- cbind(yien_data, details_data)

最终数据形式如下:
G列为总览页展示的票房数据;H列为详情页票房数据