用R语言中的rvest库编写一个通用的视频爬虫代码

时间:2025-04-10 10:57:01
# Load required libraries
library(rvest) # HTML parsing and CSS-selector based node extraction
library(httr)  # HTTP requests
library(tools) # file extension helper (file_ext)

#' Generic video crawler
#'
#' Requests `url`, selects elements with `video_selector`, reads each
#' element's `src` attribute (falling back to `href` when `src` is absent),
#' resolves the links against `url`, and downloads every unique link into
#' `output_dir` as `<file_prefix>_<NNN>.<ext>`.
#'
#' @param url Target page URL (single string).
#' @param video_selector CSS selector for the elements carrying the video
#'   link, e.g. `"video source"` or `".video-link"`.
#' @param referer Referer header sent with the page request. When `NULL`,
#'   `url` itself is used.
#' @param file_prefix Prefix of the saved file names.
#' @param delay Seconds to sleep before each download request.
#' @param output_dir Directory the files are written to; created if missing.
#' @return Invisibly, a character vector with the paths of the files that
#'   were downloaded successfully.
#' @details Stops with an error when the page request does not return status
#'   200 or when the selector yields no usable link. A failure of a single
#'   download is reported with `message()` and does not abort the loop.
video_crawler <- function(url,
                          video_selector,
                          referer = NULL,
                          file_prefix = "video",
                          delay = 1,
                          output_dir = "downloads") {
  stopifnot(
    is.character(url), length(url) == 1L, !is.na(url),
    is.character(video_selector), length(video_selector) == 1L,
    is.numeric(delay), length(delay) == 1L, !is.na(delay), delay >= 0
  )

  # Browser identification string. Stored under a name that does not shadow
  # httr::user_agent().
  ua_string <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
  page_referer <- if (is.null(referer)) url else referer

  # Request the page
  response <- GET(
    url,
    user_agent(ua_string),
    add_headers(Referer = page_referer)
  )

  # Abort unless the page request succeeded
  if (status_code(response) != 200) {
    stop(paste("请求失败,状态码:", status_code(response)))
  }

  # Parse the page and collect candidate links
  page <- content(response, as = "parsed")
  video_elements <- html_nodes(page, video_selector)
  video_links <- html_attr(video_elements, "src")

  # Elements such as <a class="video-link"> carry the link in `href`, not
  # `src`; use it where `src` is missing.
  href_links <- html_attr(video_elements, "href")
  missing_src <- is.na(video_links)
  video_links[missing_src] <- href_links[missing_src]

  # Drop elements that provided no link at all
  video_links <- video_links[!is.na(video_links) & nzchar(video_links)]

  if (length(video_links) == 0) {
    stop("未找到视频链接,请检查选择器设置")
  }

  # Resolve relative links against the page URL, then de-duplicate
  video_links <- unique(url_absolute(video_links, url))

  # Create the output directory
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

  targets <- character(length(video_links))
  downloaded <- logical(length(video_links))

  for (i in seq_along(video_links)) {
    video_url <- video_links[i]

    # Strip query string / fragment before reading the extension, otherwise
    # "clip.webm?token=1" would not be recognised as "webm".
    ext <- file_ext(sub("[?#].*$", "", video_url))
    if (!nzchar(ext)) {
      ext <- "mp4"
    }
    filename <- file.path(
      output_dir,
      sprintf("%s_%03d.%s", file_prefix, i, ext)
    )
    targets[i] <- filename

    downloaded[i] <- tryCatch(
      {
        # Throttle requests
        Sys.sleep(delay)

        message(sprintf("正在下载第 %d 个视频:%s", i, video_url))
        download <- GET(
          video_url,
          add_headers(Referer = url),
          write_disk(filename, overwrite = TRUE),
          user_agent(ua_string)
        )
        # GET() does not raise on 4xx/5xx; without this check an error page
        # would be kept on disk as if it were a video.
        stop_for_status(download)
        TRUE
      },
      error = function(e) {
        # Remove whatever was written for the failed request
        if (file.exists(filename)) {
          unlink(filename)
        }
        message(sprintf("下载失败:%s", conditionMessage(e)))
        FALSE
      }
    )
  }

  # Count tracked successes; file.exists() does not expand "*" wildcards, so
  # it cannot be used with a pattern to count the results.
  message(sprintf(
    "\n成功下载 %d/%d 个视频",
    sum(downloaded),
    length(video_links)
  ))

  invisible(targets[downloaded])
}

# Usage example (replace the arguments with real values)
# video_crawler(
#   url = "https://example.com/videos",
#   video_selector = "video source", # adjust to the target site's structure
#   referer = "https://example.com/", # some sites validate the request origin
#   delay = 2 # a longer delay is safer
# )