Golang实战【IP池-自动切换IP爬虫】

时间:2021-02-22 19:23:24
继上次爬boss直聘数据IP被封,虽然后面发现boss直聘 只要稍微没个请求停一秒钟就可以一直爬下去,但是任然心有不甘。欺负我ip少???
于是这两天抽空写了个IP池加上可以自动切换IP 千千万万个IP 让你封···········哈哈

我这边用的是西刺的代理 还是比较稳的········拿着西刺的免费IP 爬着西刺的免费IP [:P]

实现效果如下
Golang实战【IP池-自动切换IP爬虫】
········································································································································
Golang实战【IP池-自动切换IP爬虫】
感兴趣的同学可以玩玩:不过需要redis可以先安装一个

package main

import (
    "github.com/PuerkitoBio/goquery"
    "log"
    "fmt"
    "github.com/garyburd/redigo/redis"
    "net/url"
    "net/http"
    "encoding/json"
    "os"
    "strconv"
    "math/rand"
    "time"
    "strings"

)

const(
        PAGE int = 40
)
var (
    xici string = "http://www.xicidaili.com/wn/"
)

func main() {
    //getIp("local")
    getIp("http://171.104.132.87:9999")
}

func getIp(ip string){
    var count int

    for i := 1; i <= PAGE; i++ {

        response := getRep(xici + strconv.Itoa(i),ip)

        if (response.StatusCode == 200) {
            dom, err := goquery.NewDocumentFromResponse(response)
            if err != nil {
                log.Fatalf("失败原因", response.StatusCode)
            }
            dom.Find("#ip_list tbody tr").Each(func(i int, context *goquery.Selection) {
                ipInfo := make(map[string][]string)
                //地址
                ip := context.Find("td").Eq(1).Text()
                //端口
                port := context.Find("td").Eq(2).Text()
                //地址
                address := context.Find("td").Eq(3).Find("a").Text()
                //匿名
                anonymous := context.Find("td").Eq(4).Text()
                //协议
                protocol := context.Find("td").Eq(5).Text()
                //存活时间
                survivalTime := context.Find("td").Eq(8).Text()
                //验证时间
                checkTime := context.Find("td").Eq(9).Text()
                ipInfo[ip] = append(ipInfo[ip], ip, port, address, anonymous, protocol, survivalTime, checkTime)
                fmt.Println(ipInfo)
                hBody, _ := json.Marshal(ipInfo[ip])

                //存入redis
                saveRedis(ip+":"+port,string(hBody))
                fmt.Println(ipInfo)
                count++
            })
        }
    }

}

/** * 返回response */
func getRep(urls string,ip string) *http.Response {

    request, _ := http.NewRequest("GET", urls, nil)
    //随机返回User-Agent 信息
    request.Header.Set("User-Agent", getAgent())
    request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    request.Header.Set("Connection", "keep-alive")
    proxy, err := url.Parse(ip)
    //设置超时时间
    timeout := time.Duration(20* time.Second)
    fmt.Printf("使用代理:%s\n",proxy)
    client := &http.Client{}
    if ip != "local"{
        client = &http.Client{
            Transport: &http.Transport{
                Proxy: http.ProxyURL(proxy),
            },
            Timeout: timeout,
        }
    }

    response, err := client.Do(request)
    if err != nil || response.StatusCode != 200{

        fmt.Printf("line-99:遇到了错误-并切换ip %s\n",err)
        getIp(returnIp())

    }


    return response
}

/** * 随机返回一个User-Agent */
func getAgent() string {
    agent  := [...]string{
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "User-Agent,Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "User-Agent,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    }

    r := rand.New(rand.NewSource(time.Now().UnixNano()))
    len := len(agent)
    return agent[r.Intn(len)]
}



func saveRedis(ip string,hBody string){
    c, err := redis.Dial("tcp", "127.0.0.1:6379")
    if err != nil {
        fmt.Println("Connect to redis error", err)
        return
    }

    defer c.Close()
    //键值对的方式存入hash
    _, err = c.Do("HSET", "ippool", ip, string(hBody))
    //将ip:port 存入set  方便返回随机的ip
    _,err  = c.Do("SADD","ippoolkey",ip)
    if err != nil {
        log.Fatalf("err:%s", err)
        os.Exit(1)
    }
}
/** * 随机返回一个IP */
func returnIp() string{
    c, err := redis.Dial("tcp", "127.0.0.1:6379")
    if err != nil {
        fmt.Println("Connect to redis error", err)
        os.Exit(0)
    }

    defer c.Close()

    key,err:=redis.String(c.Do("SRANDMEMBER", "ippoolkey"))
    res,err:=redis.String(c.Do("HGET", "ippool",key))

    res=strings.TrimLeft(res, "[")
    res=strings.TrimRight(res, "]")

    array := strings.Split(res,",")

    for i:=0;i<len(array);i++{
        array[i] = strings.Trim(array[i],"\"")
    }
    host:= strings.ToLower(array[4])+"://"+array[0]+":"+array[1]
    return host

}