Skip to content

Commit

Permalink
加入延迟设置
Browse files Browse the repository at this point in the history
  • Loading branch information
jiangwei1995910 committed Jul 17, 2019
1 parent a12a8c3 commit 1ec1f26
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 1 deletion.
3 changes: 3 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ dbDatabase: pachong
collyDatabase: colly


# 抓取间隔时间(单位:秒),不配置或设为 0 则不延迟
crawlDelay: 3

# MongoDB 集合名
# 存链家的数据
# 自己添加Link 字段的唯一键,不然有重复
Expand Down
3 changes: 3 additions & 0 deletions config.yaml.all
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ dbDatabase: pachong
collyDatabase: colly


# 抓取间隔时间(单位:秒),不配置或设为 0 则不延迟
crawlDelay: 3

# MongoDB 集合名
# 存链家的数据
# 自己添加Link 字段的唯一键,不然有重复
Expand Down
31 changes: 30 additions & 1 deletion lianjia.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ type Page struct {
func crawlerOneCity(cityUrl string) {
c := colly.NewCollector()
configInfo := configs.Config()

if configInfo["crawlDelay"] != nil {
delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
if delay > 0 {
c.Limit(&colly.LimitRule{
DomainGlob: "*",
Delay: time.Duration(delay) * time.Second,
})
}
}

if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 {
var proxyList []string
for _, v := range configInfo["proxyList"].([]interface{}) {
Expand Down Expand Up @@ -52,13 +63,17 @@ func crawlerOneCity(cityUrl string) {
fmt.Println("列表抓取:", r.URL.String())
})

c.OnHTML("title", func(element *colly.HTMLElement) {
fmt.Println(element.Text)
})

c.OnHTML("body", func(element *colly.HTMLElement) {
// 获取一页的数据
element.ForEach(".LOGCLICKDATA", func(i int, e *colly.HTMLElement) {
link := e.ChildAttr("a", "href")

title := e.ChildText("a:first-child")
//fmt.Println(title)
fmt.Println(title)

price := e.ChildText(".totalPrice")
price = strings.Replace(price, "万", "0000", 1)
Expand Down Expand Up @@ -138,6 +153,16 @@ func crawlDetail() (sucnum int) {
c := colly.NewCollector()
configInfo := configs.Config()

if configInfo["crawlDelay"] != nil {
delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
if delay > 0 {
c.Limit(&colly.LimitRule{
DomainGlob: "*",
Delay: time.Duration(delay) * time.Second,
})
}
}

if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 {
var proxyList []string
for _, v := range configInfo["proxyList"].([]interface{}) {
Expand Down Expand Up @@ -173,6 +198,10 @@ func crawlDetail() (sucnum int) {

})

c.OnHTML("title", func(element *colly.HTMLElement) {
fmt.Println(element.Text)
})

c.OnHTML(".aroundInfo .communityName .info", func(element *colly.HTMLElement) {
db.Update(element.Request.URL.String(), bson.M{"xiaoqu": element.Text, "detailCrawlTime": time.Now()})
})
Expand Down
63 changes: 63 additions & 0 deletions proxypool/proxy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package proxypool

// 代理实现层
import (
	"context"
	"fmt"
	"io/ioutil"
	"math/rand"
	"net/http"
	"net/url"
	"strings"
	"time"

	"getAwayBSG/configs"
	"github.com/gocolly/colly"
)

// proxyPool holds the proxy URLs that GetProxy chooses from.
type proxyPool struct {
// proxyURLs lists the candidate proxies. When it is empty, GetProxy falls
// back to asking getOneProxy for a proxy instead.
proxyURLs []*url.URL
}

// GetProxy selects the proxy to use for the request pr.
//
// When the pool holds configured proxies, one of them is picked at random.
// Otherwise a proxy is requested through getOneProxy. In both cases the chosen
// proxy is recorded in the request context under colly.ProxyURLKey.
//
// It returns the proxy URL, or an error when the fallback lookup produced no
// usable proxy address.
func (r *proxyPool) GetProxy(pr *http.Request) (*url.URL, error) {
	if len(r.proxyURLs) > 0 {
		proxyLink := r.proxyURLs[rand.Intn(len(r.proxyURLs))]
		// Store the string form so both branches put the same type under
		// colly.ProxyURLKey (the fallback branch below already stores a string).
		// NOTE(review): colly's own proxy switcher appears to store a string
		// here as well — confirm against the vendored colly version.
		ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, proxyLink.String())
		*pr = *pr.WithContext(ctx)
		return proxyLink, nil
	}

	// No configured proxies: fall back to the remote lookup.
	proxyLink, ip := getOneProxy()
	if ip == "" {
		// Without this guard url.Parse would happily accept "" or "http://"
		// and hand the transport a proxy URL that has no host.
		return nil, fmt.Errorf("proxypool: no proxy available from remote pool")
	}
	proxyURL, err := url.Parse(proxyLink)
	if err != nil {
		return nil, fmt.Errorf("proxypool: parsing proxy %q: %v", proxyLink, err)
	}
	// Record the bare address returned by the lookup in the request context.
	ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, ip)
	*pr = *pr.WithContext(ctx)
	return proxyURL, nil
}

// GetProxyPool builds a colly.ProxyFunc from the "proxyList" entry of the
// application config.
//
// Loading is best-effort: a missing or non-list "proxyList", non-string
// entries, and entries that url.Parse rejects are skipped rather than
// reported. If no usable entry remains, the returned function falls back to
// getOneProxy on every call. The returned error is currently always nil.
func GetProxyPool() (colly.ProxyFunc, error) {
	configInfo := configs.Config()
	var proxyURLs []*url.URL

	// Comma-ok assertions: a malformed config value must not panic the crawler.
	if rawList, ok := configInfo["proxyList"].([]interface{}); ok {
		for _, v := range rawList {
			rawURL, ok := v.(string)
			if !ok {
				continue
			}
			urlLink, err := url.Parse(rawURL)
			if err != nil {
				continue
			}
			proxyURLs = append(proxyURLs, urlLink)
		}
	}

	return (&proxyPool{proxyURLs: proxyURLs}).GetProxy, nil
}

// proxyFetchClient is the HTTP client used to request a proxy from the remote
// pool. The timeout keeps an unresponsive pool service from blocking the
// caller indefinitely; the default client used by http.Get has no timeout.
var proxyFetchClient = &http.Client{Timeout: 10 * time.Second}

// getOneProxy requests one proxy address from the remote pool service.
//
// It returns the proxy as a URL string ("http://" + address) and the bare
// address as sent by the service. When the request fails, the service answers
// with a non-200 status, the body cannot be read, or the body is empty, both
// results are empty strings.
func getOneProxy() (string, string) {
	resp, err := proxyFetchClient.Get("http://45.78.45.70:5015/get/")
	if err != nil {
		// resp is nil here; touching resp.Body would panic.
		fmt.Println("proxypool: fetching default proxy failed:", err)
		return "", ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// An error page must not be mistaken for a proxy address.
		fmt.Println("proxypool: fetching default proxy failed: unexpected status", resp.Status)
		return "", ""
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("proxypool: reading default proxy failed:", err)
		return "", ""
	}

	// Strip surrounding whitespace: a trailing newline in the response would
	// make the resulting URL unparsable.
	addr := strings.TrimSpace(string(body))
	if addr == "" {
		return "", ""
	}

	proxy := "http://" + addr
	fmt.Println("使用默认代理:" + proxy)
	return proxy, addr
}
9 changes: 9 additions & 0 deletions zhilian.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,18 @@ func main() {
}

func get(link string) (bodystr string) {

bodystr = ""
var client *http.Client
configInfo := configs.Config()

if configInfo["crawlDelay"] != nil {
delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
if delay > 0 {
time.Sleep(time.Duration(delay) * time.Second)
}
}

if configInfo["proxyList"] != nil {
var proxyList []string
for _, v := range configInfo["proxyList"].([]interface{}) {
Expand Down

0 comments on commit 1ec1f26

Please sign in to comment.