From 1ec1f2645a09058985281e6b7cb3812f0efe4043 Mon Sep 17 00:00:00 2001
From: jiangwei1995910
Date: Wed, 17 Jul 2019 22:00:28 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E5=BB=B6=E8=BF=9F=E8=AE=BE?=
 =?UTF-8?q?=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.yaml        |   3 +++
 config.yaml.all    |   3 +++
 lianjia.go         |  31 ++++++++++++++++++++++-
 proxypool/proxy.go | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 zhilian.go         |   9 +++++++
 5 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 proxypool/proxy.go

diff --git a/config.yaml b/config.yaml
index 185e26a..0dd6ac1 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,6 +8,9 @@ dbDatabase: pachong
 
 collyDatabase: colly
 
+# Delay between crawl requests, in seconds
+crawlDelay: 3
+
 # MongoDB 集合名
 # 存链家的数据
 # 自己添加Link 字段的唯一键,不然有重复
diff --git a/config.yaml.all b/config.yaml.all
index 2d2ed73..6d96394 100644
--- a/config.yaml.all
+++ b/config.yaml.all
@@ -8,6 +8,9 @@ dbDatabase: pachong
 
 collyDatabase: colly
 
+# Delay between crawl requests, in seconds
+crawlDelay: 3
+
 # MongoDB 集合名
 # 存链家的数据
 # 自己添加Link 字段的唯一键,不然有重复
diff --git a/lianjia.go b/lianjia.go
index e396c4a..80e7b11 100644
--- a/lianjia.go
+++ b/lianjia.go
@@ -25,6 +25,17 @@ type Page struct {
 func crawlerOneCity(cityUrl string) {
 	c := colly.NewCollector()
 	configInfo := configs.Config()
+
+	if configInfo["crawlDelay"] != nil {
+		delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
+		if delay > 0 {
+			c.Limit(&colly.LimitRule{
+				DomainGlob: "*",
+				Delay:      time.Duration(delay) * time.Second,
+			})
+		}
+	}
+
 	if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 {
 		var proxyList []string
 		for _, v := range configInfo["proxyList"].([]interface{}) {
@@ -52,13 +63,17 @@ func crawlerOneCity(cityUrl string) {
 		fmt.Println("列表抓取:", r.URL.String())
 	})
 
+	c.OnHTML("title", func(element *colly.HTMLElement) {
+		fmt.Println(element.Text)
+	})
+
 	c.OnHTML("body", func(element *colly.HTMLElement) {
 		// 获取一页的数据
 		element.ForEach(".LOGCLICKDATA", func(i int, e *colly.HTMLElement) {
 			link := e.ChildAttr("a", "href")
 
 			title := e.ChildText("a:first-child")
-			//fmt.Println(title)
+			fmt.Println(title)
 
 			price := e.ChildText(".totalPrice")
 			price = strings.Replace(price, "万", "0000", 1)
@@ -138,6 +153,16 @@ func crawlDetail() (sucnum int) {
 	c := colly.NewCollector()
 	configInfo := configs.Config()
 
+	if configInfo["crawlDelay"] != nil {
+		delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
+		if delay > 0 {
+			c.Limit(&colly.LimitRule{
+				DomainGlob: "*",
+				Delay:      time.Duration(delay) * time.Second,
+			})
+		}
+	}
+
 	if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 {
 		var proxyList []string
 		for _, v := range configInfo["proxyList"].([]interface{}) {
@@ -173,6 +198,10 @@ func crawlDetail() (sucnum int) {
 
 	})
 
+	c.OnHTML("title", func(element *colly.HTMLElement) {
+		fmt.Println(element.Text)
+	})
+
 	c.OnHTML(".aroundInfo .communityName .info", func(element *colly.HTMLElement) {
 		db.Update(element.Request.URL.String(), bson.M{"xiaoqu": element.Text, "detailCrawlTime": time.Now()})
 	})
diff --git a/proxypool/proxy.go b/proxypool/proxy.go
new file mode 100644
index 0000000..765efcb
--- /dev/null
+++ b/proxypool/proxy.go
@@ -0,0 +1,104 @@
+package proxypool
+
+// Proxy implementation layer.
+import (
+	"context"
+	"fmt"
+	"getAwayBSG/configs"
+	"github.com/gocolly/colly"
+	"io/ioutil"
+	"math/rand"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+// defaultProxyAPI is the fallback service queried for a proxy address when no
+// proxyList is configured.
+const defaultProxyAPI = "http://45.78.45.70:5015/get/"
+
+// proxyAPIClient bounds how long a fallback proxy lookup can block a request.
+var proxyAPIClient = &http.Client{Timeout: 10 * time.Second}
+
+// proxyPool holds the proxy URLs parsed from the proxyList config entry.
+type proxyPool struct {
+	proxyURLs []*url.URL
+}
+
+// GetProxy chooses the proxy for one outgoing request. It picks a random
+// configured proxy when the pool is non-empty and otherwise asks the fallback
+// proxy service for one; swap in another source (e.g. a proxy pool) here if
+// needed. The chosen proxy is recorded in the request context under
+// colly.ProxyURLKey.
+func (r *proxyPool) GetProxy(pr *http.Request) (*url.URL, error) {
+	var proxyURL *url.URL
+	if len(r.proxyURLs) > 0 {
+		proxyURL = r.proxyURLs[rand.Intn(len(r.proxyURLs))]
+	} else {
+		proxyLink, _ := getOneProxy()
+		if proxyLink == "" {
+			return nil, fmt.Errorf("no proxy available from %s", defaultProxyAPI)
+		}
+		parsed, err := url.Parse(proxyLink)
+		if err != nil {
+			return nil, fmt.Errorf("parsing proxy %q: %v", proxyLink, err)
+		}
+		proxyURL = parsed
+	}
+
+	// NOTE(review): stored as a string to mirror colly's built-in proxy
+	// switcher; confirm the vendored colly reads this key as a string.
+	ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, proxyURL.String())
+	*pr = *pr.WithContext(ctx)
+	return proxyURL, nil
+}
+
+// GetProxyPool builds a colly.ProxyFunc backed by the proxyList config entry.
+// Entries that are not strings or fail to parse as URLs are skipped. The
+// returned error is always nil.
+func GetProxyPool() (colly.ProxyFunc, error) {
+	configInfo := configs.Config()
+	var proxyURLs []*url.URL
+	// Checked assertions keep a malformed proxyList from panicking.
+	rawList, _ := configInfo["proxyList"].([]interface{})
+	for _, v := range rawList {
+		link, ok := v.(string)
+		if !ok {
+			continue
+		}
+		if urlLink, err := url.Parse(link); err == nil {
+			proxyURLs = append(proxyURLs, urlLink)
+		}
+	}
+	return (&proxyPool{proxyURLs: proxyURLs}).GetProxy, nil
+}
+
+// getOneProxy fetches one proxy address from defaultProxyAPI. It returns the
+// proxy as a URL string ("http://" + address) followed by the bare address
+// from the response body; both are empty when the lookup fails.
+func getOneProxy() (string, string) {
+	resp, err := proxyAPIClient.Get(defaultProxyAPI)
+	if err != nil {
+		fmt.Println("获取默认代理失败:", err)
+		return "", ""
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		fmt.Println("获取默认代理失败:", resp.Status)
+		return "", ""
+	}
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("获取默认代理失败:", err)
+		return "", ""
+	}
+	// Trim whitespace: a trailing newline would make url.Parse reject the URL.
+	addr := strings.TrimSpace(string(body))
+	if addr == "" {
+		return "", ""
+	}
+	proxy := "http://" + addr
+	fmt.Println("使用默认代理:" + proxy)
+	return proxy, addr
+}
diff --git a/zhilian.go b/zhilian.go
index bb4a4e3..40bd8e8 100644
--- a/zhilian.go
+++ b/zhilian.go
@@ -61,9 +61,18 @@ func main() {
 }
 
 func get(link string) (bodystr string) {
+	bodystr = ""
 	var client *http.Client
 
 	configInfo := configs.Config()
+
+	if configInfo["crawlDelay"] != nil {
+		delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
+		if delay > 0 {
+			time.Sleep(time.Duration(delay) * time.Second)
+		}
+	}
+
 	if configInfo["proxyList"] != nil {
 		var proxyList []string
 		for _, v := range configInfo["proxyList"].([]interface{}) {