// Command main walks a range of blu-ray.com movie IDs and prints selected
// fields scraped from each movie page to standard output.
package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

const (
	// firstID (inclusive) and lastID (exclusive) bound the IDs visited by main.
	firstID = 206044
	lastID  = 300000

	// pageURLPrefix is the fixed part of every movie page URL; the numeric
	// ID is appended to it.
	pageURLPrefix = "https://www.blu-ray.com/movies/The-Meg-4K-Blu-ray/"

	// requestTimeout caps every HTTP request made through a proxy.
	requestTimeout = 30 * time.Second

	// Page layouts differ, so the interesting nodes live under one of two
	// roots: a <div itemprop=review> when present, otherwise a fixed <td>.
	reviewRoot = "div[itemprop=review][itemscope][itemtype]"
	tableRoot  = "td[width='728'][style='padding-top: 3px'][bgcolor='#ffffff']"
)

func main() {
	for id := firstID; id < lastID; id++ {
		Bluray(id)
	}
}

// Between returns the part of str that lies between the first occurrence of
// starting and the first occurrence of ending that follows it. It returns ""
// when either delimiter cannot be found.
func Between(str, starting, ending string) string {
	s := strings.Index(str, starting)
	if s < 0 {
		return ""
	}
	s += len(starting)
	e := strings.Index(str[s:], ending)
	if e < 0 {
		return ""
	}
	return str[s : s+e]
}

// Bluray downloads the movie page with the given ID through a proxy and
// prints the scraped fields (ASIN, runtime, studio, year, release date,
// title, subtitles, audio and video details) to standard output, followed by
// a separator line.
//
// A failed download is retried with a freshly obtained proxy until it
// succeeds; there is deliberately no attempt limit, matching the original
// retry-until-success behaviour.
//
// TODO: return the scraped fields as a struct instead of printing them.
func Bluray(i int) {
	pageURL := pageURLPrefix + strconv.Itoa(i)

	// Building the request cannot succeed on a retry if it failed once, so
	// report the error and give up on this ID instead of looping forever.
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		log.Printf("id %d: building request for %q: %v", i, pageURL, err)
		return
	}

	var (
		client *http.Client
		doc    *goquery.Document
	)
	for {
		fmt.Println("start id:", i)
		// httpGet is defined elsewhere in this package; per the original
		// author's note it returns a proxy address such as "127.0.0.1:6666".
		client, err = newProxyClient(httpGet())
		if err == nil {
			doc, err = fetchDocument(client, req)
		}
		if err == nil {
			break
		}
		log.Printf("id %d: %v; retrying with a new proxy", i, err)
	}

	if link, ok := doc.Find("#movie_buylink").Attr("href"); ok {
		// A failed shop lookup only loses the ASIN; it must not abort the
		// whole crawl.
		asin, asinErr := resolveASIN(client, link)
		if asinErr != nil {
			log.Printf("id %d: resolving buy link %q: %v", i, link, asinErr)
		} else {
			fmt.Println(asin)
		}
	}

	root := tableRoot
	if _, ok := doc.Find(reviewRoot).Attr("itemtype"); ok {
		root = reviewRoot
	}

	printSummary(doc, root+">span.subheading")
	printDetails(doc, root+">table")

	fmt.Println("--------------------")
}

// newProxyClient returns an HTTP client that sends every request through the
// HTTP proxy at addr ("host:port").
func newProxyClient(addr string) (*http.Client, error) {
	proxyURL, err := url.Parse("http://" + addr)
	if err != nil {
		return nil, fmt.Errorf("parsing proxy address %q: %v", addr, err)
	}
	transport := &http.Transport{
		Proxy: http.ProxyURL(proxyURL),
		// Each client is used for at most two requests and then dropped, so
		// keeping connections alive would only leak them.
		DisableKeepAlives: true,
	}
	return &http.Client{Transport: transport, Timeout: requestTimeout}, nil
}

// fetchDocument performs req with client and parses the response body as
// HTML. The body is closed on every path.
func fetchDocument(client *http.Client, req *http.Request) (*goquery.Document, error) {
	res, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetching %s: %v", req.URL, err)
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s: %v", req.URL, err)
	}
	return doc, nil
}

// resolveASIN requests link and returns the last path segment of the URL the
// client finally ended up at (after any redirects), which the caller treats
// as the product's ASIN.
func resolveASIN(client *http.Client, link string) (string, error) {
	req, err := http.NewRequest("GET", link, nil)
	if err != nil {
		return "", err
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// resp.Request.URL is a *url.URL; its Path field is already a string.
	p := resp.Request.URL.Path
	return p[strings.LastIndex(p, "/")+1:], nil
}

// printSummary prints, for every node matching selector, the text of its
// #runtime element and the text of each link whose href points at a studio,
// year or release-date listing.
func printSummary(doc *goquery.Document, selector string) {
	listingURLs := []string{
		"https://www.blu-ray.com/movies/movies.php?studioid=",   // studio
		"https://www.blu-ray.com/movies/movies.php?year=",       // production year
		"https://www.blu-ray.com/movies/releasedates.php?year=", // release date
	}

	doc.Find(selector).Each(func(_ int, heading *goquery.Selection) {
		fmt.Println(heading.Find("#runtime").Text())

		heading.Find("a").Each(func(_ int, link *goquery.Selection) {
			// A missing href yields "", which matches none of the listings.
			href, _ := link.Attr("href")
			for _, listing := range listingURLs {
				if strings.Contains(href, listing) {
					fmt.Println(link.Text())
				}
			}
		})
	})
}

// printDetails prints, for every table matching selector, the reviewed title
// and the title attribute of its status image. Tables whose style contains
// "margin-bottom: 10px; " additionally get their subtitle list, audio list
// and video fields printed.
func printDetails(doc *goquery.Document, selector string) {
	doc.Find(selector).Each(func(_ int, table *goquery.Selection) {
		fmt.Println(table.Find("h1[itemprop=itemReviewed]").Text())

		// A missing attribute prints an empty line, as before.
		state, _ := table.Find("img[src][width][height][title][alt][style]").Attr("title")
		fmt.Println(state)

		style, _ := table.Attr("style")
		if !strings.Contains(style, "margin-bottom: 10px; ") {
			return
		}

		fmt.Println(beforeLess(table.Find("#longsubs").Text()))
		fmt.Println(beforeLess(table.Find("#longaudio").Text()))

		html, err := table.Find("td[width='228px']").Html()
		if err != nil {
			log.Printf("rendering video cell: %v", err)
			return
		}
		// The cell holds one field per line, separated by <br/>; splitting on
		// the tag yields the individual "Label: value" fragments.
		for _, fragment := range strings.Split(html, "<br/>") {
			printVideoField(fragment)
		}
	})
}

// printVideoField prints the value part of fragment when it contains one of
// the labels "Codec", "Resolution" or "Region".
//
// The rune offsets (7, 11, 7) are taken from the original code; presumably
// they skip the label text at the start of the fragment — confirm against
// live pages.
func printVideoField(fragment string) {
	if strings.Contains(fragment, "Codec") {
		fmt.Println(strings.TrimSpace(dropRunes(fragment, 7)))
	}
	if strings.Contains(fragment, "Resolution") {
		fmt.Println(strings.TrimSpace(dropRunes(fragment, 11)))
	}
	if strings.Contains(fragment, "Region") {
		playback := dropRunes(fragment, 7)
		// Cut at the first tag. The index is a byte offset, so slice the
		// string itself rather than its rune form.
		if tag := strings.Index(playback, "<"); tag != -1 {
			playback = playback[:tag]
		}
		fmt.Println(strings.TrimSpace(playback))
	}
}

// beforeLess returns the part of s that precedes the first "(less)" marker,
// or all of s when the marker is absent.
func beforeLess(s string) string {
	if i := strings.Index(s, "(less)"); i >= 0 {
		return s[:i]
	}
	return s
}

// dropRunes returns s without its first n runes, or "" when s has n runes or
// fewer. Counting runes rather than bytes keeps multi-byte characters intact.
func dropRunes(s string, n int) string {
	runes := []rune(s)
	if len(runes) <= n {
		return ""
	}
	return string(runes[n:])
}
这是我写的第一个爬虫,踩了不少坑;由于对 Go 语言了解不足,代码写得比较粗糙,也比较耗资源,欢迎大家指正!