-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrobot.go
81 lines (75 loc) · 1.98 KB
/
robot.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package main
import (
"fmt"
"regexp"
"strings"
)
// Robots is the parsed form of a robots.txt document, as built by
// GetRobotFromTxt.
type Robots struct {
// robot holds one entry for each User-agent line that was read, in the
// order the lines appeared.
robot []Robot
}
// Robot is one user-agent group: a User-agent line together with the
// Allow/Disallow rules that follow it.
type Robot struct {
// ua_name is the user-agent value taken from the User-agent line.
ua_name string
// ua_re is the regular expression built from ua_name; IsAllow matches it
// against the caller's user-agent string to decide whether the group applies.
ua_re *regexp.Regexp
// ua lists the group's rules in the order they were read.
ua []uastruct
}
// uastruct is a single Allow or Disallow rule inside a user-agent group.
type uastruct struct {
// url is the pattern text taken from the rule line.
url string
// url_re is the regular expression built from url; IsAllow matches it
// against the target URL.
url_re *regexp.Regexp
// allow is true for an Allow rule and false for a Disallow rule.
allow bool
}
// GetRobotFromTxt parses the text of a robots.txt file into a Robots value.
//
// Each line is split at its first ':' into a field name and a value. Field
// names are compared case-insensitively, surrounding whitespace (including
// the '\r' of CRLF files) is trimmed, and everything from a '#' to the end
// of the line is treated as a comment.
//
//   - "User-agent" starts a new group.
//   - "Allow" / "Disallow" add a rule to the most recent group. A rule that
//     appears before any User-agent line has no group to belong to and is
//     skipped, as is a rule with an empty value (an empty Disallow means
//     "nothing is disallowed").
//   - Any other line is ignored.
//
// The function never panics on malformed input: pattern text is escaped
// before it is compiled, so characters such as '(' or '?' are matched
// literally.
func GetRobotFromTxt(txt string) Robots {
	var parsed Robots
	for _, line := range strings.Split(txt, "\n") {
		if hash := strings.IndexByte(line, '#'); hash >= 0 {
			line = line[:hash]
		}
		colon := strings.IndexByte(line, ':')
		if colon < 0 {
			continue
		}
		field := strings.ToLower(strings.TrimSpace(line[:colon]))
		value := strings.TrimSpace(line[colon+1:])

		switch field {
		case "user-agent":
			// User-agent names are matched without regard to case.
			parsed.robot = append(parsed.robot, Robot{value, robotsPatternRegexp(value, true), []uastruct{}})
		case "allow", "disallow":
			if len(parsed.robot) == 0 || value == "" {
				continue
			}
			group := &parsed.robot[len(parsed.robot)-1]
			group.ua = append(group.ua, uastruct{value, robotsPatternRegexp(value, false), field == "allow"})
		}
	}
	return parsed
}

// robotsPatternRegexp turns a robots.txt pattern into a regular expression.
// '*' matches any run of characters, a trailing '$' anchors the match at the
// end of the input, and every other character is matched literally. When
// foldCase is true the expression ignores letter case. Because all literal
// text is escaped with regexp.QuoteMeta, the resulting expression is always
// valid and MustCompile cannot panic here.
func robotsPatternRegexp(pattern string, foldCase bool) *regexp.Regexp {
	anchored := strings.HasSuffix(pattern, "$")
	pattern = strings.TrimSuffix(pattern, "$")

	parts := strings.Split(pattern, "*")
	for i, part := range parts {
		parts[i] = regexp.QuoteMeta(part)
	}
	expr := strings.Join(parts, ".*")
	if anchored {
		expr += "$"
	}
	if foldCase {
		expr = "(?i)" + expr
	}
	return regexp.MustCompile(expr)
}
// IsAllow reports whether the crawler identified by ua may fetch url under
// the parsed robots.txt rules. It returns nil when the fetch is permitted
// (including when no group or rule applies) and a descriptive error when a
// Disallow rule wins.
//
// Group selection: every group whose ua_re matches ua is a candidate. Only
// the candidates with the most specific name are obeyed, where specificity
// is the length of ua_name once surrounding whitespace and '*' are removed.
// A wildcard group is therefore used only when no named group matches, no
// matter where it appears in the file. Candidates that tie are combined.
//
// Rule selection: among the rules of the chosen groups whose url_re matches
// url, the rule with the longest pattern wins; when an Allow and a Disallow
// pattern have the same length, Allow wins. Rules with an empty pattern are
// ignored, because an empty pattern would otherwise match every URL.
func (r *Robots) IsAllow(ua string, url string) error {
	// specificity ranks a group name; "*" scores 0 so named groups outrank it.
	specificity := func(name string) int {
		return len(strings.ReplaceAll(strings.TrimSpace(name), "*", ""))
	}

	// First pass: highest specificity among the groups that match ua.
	top := -1
	for i := range r.robot {
		group := &r.robot[i]
		if !group.ua_re.MatchString(ua) {
			continue
		}
		if s := specificity(group.ua_name); s > top {
			top = s
		}
	}
	if top < 0 {
		return nil // no group applies to this user agent
	}

	// Second pass: pick the winning rule from the most specific groups.
	var (
		winner *uastruct
		owner  *Robot
	)
	for i := range r.robot {
		group := &r.robot[i]
		if specificity(group.ua_name) != top || !group.ua_re.MatchString(ua) {
			continue
		}
		for j := range group.ua {
			rule := &group.ua[j]
			if strings.TrimSpace(rule.url) == "" || !rule.url_re.MatchString(url) {
				continue
			}
			longer := winner != nil && len(rule.url) > len(winner.url)
			allowTie := winner != nil && len(rule.url) == len(winner.url) && rule.allow && !winner.allow
			if winner == nil || longer || allowTie {
				winner, owner = rule, group
			}
		}
	}

	if winner == nil || winner.allow {
		return nil
	}
	return fmt.Errorf("由于robots.txt限制,不允许爬取(UA:%s URL:%s ),当前目标链接:%s", owner.ua_name, winner.url, url)
}
// Output writes the parsed rules to standard output in robots.txt form: one
// "User-agent:" line per group, followed by that group's "Allow:" and
// "Disallow:" lines in their stored order.
func (r *Robots) Output() {
	for i := range r.robot {
		group := r.robot[i]
		fmt.Printf("User-agent: %s\n", group.ua_name)
		for j := range group.ua {
			rule := group.ua[j]
			if !rule.allow {
				fmt.Printf("Disallow: %s\n", rule.url)
				continue
			}
			fmt.Printf("Allow: %s\n", rule.url)
		}
	}
}