一、爬取目的:
爬取数据用于论文-大数据背景下我国电影票房预测研究
数据来源:http://www.cbooo.cn/
二、思路解析:
- 先从首页的总览接口逐页爬取电影名称和对应的电影ID
- 再将 http://www.cbooo.cn/m/ 与电影ID拼接,得到每部电影的详情页地址并爬取详情
library(tidyverse)
library(httr)
library(rvest)
library(jsonlite)
library(rlist)
library(plyr)
## tidyverse:R语言8个常用核心包的合集
## httr:相当于Python的requests库,用于发送HTTP请求
# Fetch the overview listing.
# The endpoint is paginated via the trailing `pIndex` query parameter; each page
# is requested in turn, its first JSON element is converted to a data frame, and
# all pages are row-bound into `yien_data`.
overview_base_url <- "http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex="
overview_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
n_pages <- 395L

# Collect pages in a preallocated list and bind once at the end; calling
# rbind.fill() on every iteration re-copies the accumulated data each time.
overview_pages <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  # paste0() glues the page index directly onto the query string. The previous
  # paste(..., seq = "") misspelled `sep`, so the default " " separator put
  # spaces into the URL.
  url <- paste0(overview_base_url, i)
  Sys.sleep(0.5)
  overview_pages[[i]] <- tryCatch(
    {
      # user_agent() builds a request config object; a named string argument
      # is not treated as request configuration by GET().
      response <- GET(url, user_agent(overview_ua))
      result <- fromJSON(content(response, as = "text"))
      as.data.frame(result[[1]])
    },
    error = function(e) {
      # Skip a failing page instead of aborting the whole run.
      warning(
        "Overview page ", i, " failed: ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    }
  )
  print(paste("已完成", i, sep = " "))
}
# rbind.fill() accepts a list of data frames, pads columns that are missing in
# some pages with NA, and drops NULL entries left by failed pages.
yien_data <- rbind.fill(overview_pages)
if (is.null(yien_data)) {
  # Every page failed or was empty: keep the empty data frame callers expect.
  yien_data <- data.frame()
}
# Fetch the detail page of every movie listed in `yien_data`.
# For each movie the "label:value" paragraphs of the info box are parsed into
# one row of `details_data` (columns V1..V9), which is finally column-bound to
# `yien_data` to form `yien_newfile`.
detail_base_url <- "http://www.cbooo.cn/m/"
detail_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
detail_selector <- "#top > div:nth-child(3) > div.mainbox.fr > div > div.ziliaoku > div.ziliaofr > div.cont > p"

# Field label -> destination column. Labels not listed here go to column 9.
detail_columns <- c(
  "类型" = 3L,
  "片长" = 4L,
  "上映时间" = 5L,
  "制式" = 6L,
  "国家及地区" = 7L,
  "发行公司" = 8L
)

# Preallocate one row per movie so that (a) the result always has exactly
# nrow(yien_data) rows for the final cbind(), and (b) fields can be filled in
# any order (growing an empty data frame by index fails when a column is
# skipped).
n_movies <- nrow(yien_data)
details_mat <- matrix(NA_character_, nrow = n_movies, ncol = 9L)

for (i in seq_len(n_movies)) {
  # Column 2 of the overview data is used as the movie ID
  # (positional; assumes the overview column order is stable — TODO confirm).
  movieID <- yien_data[i, 2]
  details_url <- paste0(detail_base_url, movieID)
  Sys.sleep(0.2)
  movie_list <- tryCatch(
    {
      session <- html_session(details_url, add_headers(`User-Agent` = detail_ua))
      html_text(html_nodes(session, detail_selector))
    },
    error = function(e) {
      # Leave this movie's row as NA rather than aborting the whole crawl.
      warning(
        "Detail page ", details_url, " failed: ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      character(0)
    }
  )
  # Strip line breaks and spaces before splitting each paragraph at its first
  # colon into label / value.
  movie_list <- gsub("\r\n", "", movie_list)
  movie_list <- gsub(" ", "", movie_list)
  movie_df <- str_split_fixed(movie_list, ":", 2)

  # seq_len() yields an empty sequence when the page has no matching
  # paragraphs; 1:nrow() would iterate over c(1, 0) and fail.
  for (j in seq_len(nrow(movie_df))) {
    label <- movie_df[j, 1]
    if (j <= 2L) {
      # The first two paragraphs are stored as-is (text before any colon).
      # NOTE(review): presumably these are the titles; a title that itself
      # contains ":" is truncated at the colon — confirm whether that is wanted.
      details_mat[i, j] <- label
    } else if (label %in% names(detail_columns)) {
      details_mat[i, detail_columns[[label]]] <- movie_df[j, 2]
    } else {
      # Any other field lands in column 9; a later one overwrites an earlier one.
      details_mat[i, 9L] <- movie_df[j, 2]
    }
  }
}

details_data <- as.data.frame(details_mat, stringsAsFactors = FALSE)
yien_newfile <- cbind(yien_data, details_data)
最终数据形式如下: