使用golang+代理IP+goquery开发爬虫(爬取国外电影网站)

时间:2022-01-14 12:56:53
package main

import (
        "fmt"
        "github.com/PuerkitoBio/goquery"
        "net/http"
        "net/url"
        "time"
        "strconv"
        "strings"
        "log"
)

// main walks the hard-coded movie ID range [206044, 300000) in ascending
// order and scrapes one page per ID, strictly one after another.
func main() {
	const (
		firstID = 206044
		endID   = 300000 // exclusive upper bound
	)

	id := firstID
	for id < endID {
		Bluray(id)
		id++
	}
}


// Between returns the text of str that sits between the first occurrence of
// starting and the first occurrence of ending that follows it. It returns the
// empty string when either delimiter cannot be found.
func Between(str, starting, ending string) string {
	begin := strings.Index(str, starting)
	if begin == -1 {
		return ""
	}

	// Only look for the closing delimiter after the opening one.
	rest := str[begin+len(starting):]
	if stop := strings.Index(rest, ending); stop != -1 {
		return rest[:stop]
	}
	return ""
}

func Bluray(i int){
		
	        req_url := "https://www.blu-ray.com/movies/The-Meg-4K-Blu-ray/" + strconv.Itoa(i)
	LABEL1:        
			fmt.Println("start id:",i)
	
			ipAddress := httpGet()//这里获取代理IP 返回eg:127.0.0.1:6666
			
	        proxy := func(_ *http.Request) (*url.URL, error) {
	                return url.Parse("http://"+ipAddress)
	        }
	        transport := &http.Transport{Proxy: proxy}
	
	        c := &http.Client{Transport: transport,Timeout:30*time.Second}
	
	        req, err := http.NewRequest("GET", req_url, nil)
	        if err != nil {//这里处理异常方式有些不恰当,暂时对golang error没有详细了解,所以直接goto了
	        	goto LABEL1
	        },
	
	        res, err := c.Do(req)
	        if err != nil {
	        	goto LABEL1
	        }

	        doc, err := goquery.NewDocumentFromReader(res.Body)
	        if err != nil {
	        	goto LABEL1
	        }    
	        
	        res.Body.Close()
	        
	        amazonAddress,al := doc.Find("#movie_buylink").Attr("href")
	        if al{
	        	request, err := http.NewRequest("GET", amazonAddress, nil)
	        	if err != nil {
		        	log.Fatal(err)
		        }
	        	resp, err := c.Do(request)
	        	if err != nil {
		        	log.Fatal(err)
		        }
			
				baseURI := resp.Request.URL.Path//这里被坑了一下,开始使用resp.Request.URL获取网页baseuri死活转不了string,后来看了下源码发现path是返回string
				baseARR := strings.Split(baseURI, "/")
				asin := baseARR[len(baseARR)-1]
				
				
				fmt.Println(asin)
				
		        resp.Body.Close()
	        }
	        
	        
	        
	        _,fl := doc.Find("div[itemprop=review][itemscope][itemtype]").Attr("itemtype")//网页结构不一,这里使用两条路径判断
	        
	        var xpath = "td[width='728'][style='padding-top: 3px'][bgcolor='#ffffff']>span.subheading"
	        
	        var xpath2 = "td[width='728'][style='padding-top: 3px'][bgcolor='#ffffff']>table"
	        
	        if fl{
	        	
		        xpath = "div[itemprop=review][itemscope][itemtype]>span.subheading"
		        
		        xpath2 = "div[itemprop=review][itemscope][itemtype]>table"
	        }
	        
	        
	        
			doc.Find(xpath).Each(func(i int, selection *goquery.Selection) {
				runtime := selection.Find("#runtime").Text()
				fmt.Println(runtime)
				
				selection.Find("a").Each(func(i int, se *goquery.Selection) {
					a,_ := se.Attr("href")
					if strings.Contains(a,"https://www.blu-ray.com/movies/movies.php?studioid="){
						studio := se.Text()
						fmt.Println(studio)
					}
					if strings.Contains(a,"https://www.blu-ray.com/movies/movies.php?year="){
						years := se.Text()
						fmt.Println(years)
					}
					if strings.Contains(a,"https://www.blu-ray.com/movies/releasedates.php?year="){
						uptime := se.Text()
						fmt.Println(uptime)
					}
				})
			})
			
			doc.Find(xpath2).Each(func(i int, selection *goquery.Selection) {
				
				title := selection.Find("h1[itemprop=itemReviewed]").Text()
				fmt.Println(title)
				
				state,_ := selection.Find("img[src][width][height][title][alt][style]").Attr("title")
				fmt.Println(state)
				
				
				
				style,_ := selection.Attr("style")
				if strings.Contains(style,"margin-bottom: 10px; "){
					
					
					subtitles := selection.Find("#longsubs").Text()
					newstr := string([]rune(subtitles)[:])
					arr := strings.Split(newstr, "(less)")
					fmt.Println(arr[0])
					
					
					
					
					
					audio := selection.Find("#longaudio").Text()
					newstr2 := string([]rune(audio)[:])
					arr2 := strings.Split(newstr2, "(less)")
					fmt.Println(arr2[0])
					//fmt.Println(strings.TrimSpace(audio))
					
					
					
					

					html,_ := selection.Find("td[width='228px']").Html()
					
					fi := strings.Split(html, "<br/>")//这里是为了获取<br>之间的内容,所以直接切分了
					
					for j:=0;j<len(fi);j++{
						if strings.Contains(fi[j],"Codec"){
							codec := string([]rune(fi[j])[7:])//这里也遇坑了,golang中文字符处理必须的rune转换一下,不然无效
							fmt.Println(strings.TrimSpace(codec))
						}
						if strings.Contains(fi[j],"Resolution"){
							resolution := string([]rune(fi[j])[11:])
							fmt.Println(strings.TrimSpace(resolution))
						}
						if strings.Contains(fi[j],"Region"){
							playback := string([]rune(fi[j])[7:])
							num := strings.Index(playback, "<")
							if num != -1{
								playback = string([]rune(fi[j])[7:7+num])
							}
							fmt.Println(strings.TrimSpace(playback))
						
						}
						
					}	
				
				}

			})
			fmt.Println("--------------------") 
			return //这里可以根据实际情况返回一个结构体 ,暂时没写返回值
	        
	}

  

这是我写的第一个爬虫,踩的坑比较多;由于对 golang 了解不足,代码写得比较粗糙,也比较耗资源,欢迎大家指正!

使用golang+代理IP+goquery开发爬虫(爬取国外电影网站)