# 使用 R 语言的 rvest 库编写的通用视频爬虫代码
# 加载必要库
library(rvest) # 网页抓取
library(httr) # 处理HTTP请求
library(tools) # 处理文件扩展名
#' Generic video crawler: download every video referenced by a web page
#'
#' Fetches `url`, selects elements with `video_selector`, reads each element's
#' `src` attribute (falling back to `href` for link-style elements), resolves
#' the result against the page URL and saves every distinct target into a
#' local `downloads/` directory as `<file_prefix>_<index>.<ext>`.
#'
#' @param url Address of the page to scan (a single string).
#' @param video_selector CSS selector matching the elements that carry the
#'   video address (e.g. `"video source"`, `".video-link"`).
#' @param referer Value of the `Referer` header sent with the page request;
#'   when `NULL` the page URL itself is sent.
#' @param file_prefix Prefix of the saved file names.
#' @param delay Seconds to wait before each download, to avoid being blocked.
#' @return Invisibly, a character vector with the paths of the files that were
#'   downloaded successfully.
video_crawler <- function(
    url,
    video_selector,
    referer = NULL,
    file_prefix = "video",
    delay = 1
) {
  stopifnot(
    is.character(url), length(url) == 1L,
    is.character(video_selector), length(video_selector) == 1L
  )

  # Browser identification. Deliberately not called `user_agent`, so that the
  # httr::user_agent() helper is not masked by a local string.
  ua_string <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

  # Plain if/else: `referer` is a scalar or NULL, never a vector.
  page_referer <- if (is.null(referer)) url else referer

  # Request the page.
  response <- GET(
    url,
    user_agent(ua_string),
    add_headers(Referer = page_referer)
  )
  if (status_code(response) != 200) {
    stop(paste("请求失败,状态码:", status_code(response)), call. = FALSE)
  }

  # Parse the page and collect candidate addresses.
  page <- content(response, as = "parsed")
  video_elements <- html_nodes(page, video_selector)
  video_links <- html_attr(video_elements, "src")

  # html_attr() yields NA when the attribute is absent; link-style elements
  # (e.g. <a class="video-link">) carry the address in `href` instead.
  href_links <- html_attr(video_elements, "href")
  use_href <- is.na(video_links) | !nzchar(video_links)
  video_links[use_href] <- href_links[use_href]

  # Drop elements that provided no address at all.
  video_links <- video_links[!is.na(video_links) & nzchar(video_links)]
  if (length(video_links) == 0) {
    stop("未找到视频链接,请检查选择器设置", call. = FALSE)
  }

  # Resolve relative addresses against the page URL and de-duplicate.
  video_links <- unique(url_absolute(video_links, url))

  dir.create("downloads", showWarnings = FALSE)

  # One slot per link; stays NA when that download fails.
  saved_files <- rep(NA_character_, length(video_links))

  for (i in seq_along(video_links)) {
    video_url <- video_links[i]

    # Derive the extension from the URL path only: strip query/fragment so
    # "clip.webm?token=1" yields "webm", and strip scheme + host so a
    # path-less URL such as "https://cdn.example.com" does not yield "com".
    url_path <- sub("[?#].*$", "", video_url)
    url_path <- sub("^[A-Za-z][A-Za-z0-9+.-]*://[^/]*", "", url_path)
    ext <- tools::file_ext(url_path)
    if (!nzchar(ext)) {
      ext <- "mp4"
    }

    filename <- file.path(
      "downloads",
      sprintf("%s_%03d.%s", file_prefix, i, ext)
    )

    # Throttle requests.
    Sys.sleep(delay)
    message(sprintf("正在下载第 %d 个视频:%s", i, video_url))

    downloaded <- tryCatch(
      {
        download <- GET(
          video_url,
          user_agent(ua_string),
          # The embedding page is the referrer of the media request.
          add_headers(Referer = url),
          write_disk(filename, overwrite = TRUE)
        )
        # write_disk() also saves HTTP error pages; turn them into R errors
        # so they are not counted (or kept) as videos.
        stop_for_status(download)
        TRUE
      },
      error = function(e) {
        message(sprintf("下载失败:%s", conditionMessage(e)))
        FALSE
      }
    )

    if (downloaded) {
      saved_files[i] <- filename
    } else if (file.exists(filename)) {
      # Remove the partial file / saved error page of a failed download.
      unlink(filename)
    }
  }

  saved_files <- saved_files[!is.na(saved_files)]

  # Count tracked successes directly: file.exists() does not expand
  # wildcards, so probing "name.*" would always report zero.
  message(sprintf(
    "\n成功下载 %d/%d 个视频",
    length(saved_files),
    length(video_links)
  ))

  invisible(saved_files)
}
# 使用示例 (需要替换实际参数)
# video_crawler(
# url = "https://example.com/videos",
# video_selector = "video source", # 根据目标网站结构调整
# referer = "https://example.com/", # 有些网站需要验证来源
# delay = 2 # 更长的延迟更安全
# )