forked from mvdan/xurls
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xurls.go
125 lines (102 loc) · 2.36 KB
/
xurls.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Package xurls extracts URLs and subdomains from plain text by scanning the characters around each '.'.
package xurls
import (
"html"
"net/url"
"strings"
"unicode"
)
// ExtractSubdomains scans text for dot-containing candidates, normalises each
// one through handleURI, and returns the distinct results that end with
// domain, in order of first appearance.
func ExtractSubdomains(text, domain string) (urls []string) {
	for _, candidate := range findAllUrls(text) {
		urls = append(urls, handleURI(candidate)...)
	}
	// Keep only the entries ending in domain and drop duplicates.
	return filterByDomain(urls, domain)
}
// findAllUrls returns one candidate string for every '.' found in text.
//
// Each candidate is the longest run of accepted characters around that dot:
// to the left, characters accepted by isValidRuneBack (which excludes '.', so
// the run stops at the previous dot); to the right, characters accepted by
// isValidRuneForward (which includes '.', so later labels are kept). For
// "a.b.c" this yields "a.b.c" and "b.c".
//
// The scan is rune-aware: multi-byte UTF-8 letters and digits are kept intact
// instead of being examined byte by byte. Candidates are substrings of text.
// The result is nil when text contains no '.'.
func findAllUrls(text string) (urls []string) {
	for i, r := range text {
		if r != '.' {
			continue
		}

		// Everything that survives the right-trim is not part of the
		// candidate, so its length is the byte offset where the candidate
		// starts.
		start := len(strings.TrimRightFunc(text[:i], isValidRuneBack))

		// Symmetrically, whatever survives the left-trim of the tail lies
		// after the candidate's end.
		rest := strings.TrimLeftFunc(text[i+1:], isValidRuneForward)
		end := len(text) - len(rest)

		urls = append(urls, text[start:end])
	}
	return urls
}
// isValidRuneBack reports whether r is accepted while scanning leftwards from
// a dot: any Unicode letter or number, or one of ':', '/', '_', '-', '%'.
// A '.' is not accepted.
func isValidRuneBack(r rune) bool {
	switch r {
	case ':', '/', '_', '-', '%':
		return true
	}
	return unicode.IsLetter(r) || unicode.IsNumber(r)
}
// isValidRuneForward reports whether r is accepted while scanning rightwards
// from a dot: everything isValidRuneBack accepts, plus '.' itself so that the
// remaining labels of a host name are included.
func isValidRuneForward(r rune) bool {
	if r == '.' {
		return true
	}
	return isValidRuneBack(r)
}
// badCharReplacer turns the separators that may remain inside a candidate
// (the literal text "u003d", forward slashes and backslashes) into spaces so
// the candidate can be split into tokens. "u003d" is presumably the tail of a
// JSON-escaped '=' (\u003d) — TODO confirm. It is built once at package level;
// a strings.Replacer is safe for concurrent use.
var badCharReplacer = strings.NewReplacer(
	"u003d", " ",
	"/", " ",
	"\\", " ",
)

// handleURI converts one raw candidate into a list of host-like tokens.
//
// If the candidate is accepted by url.ParseRequestURI, the result is a single
// element holding the URL's host name without any port (possibly the empty
// string when the URL has no host). Otherwise the candidate is HTML-unescaped,
// query-unescaped, stripped of separator characters and known leading noise,
// and split on spaces; the returned slice may contain empty strings.
func handleURI(u string) []string {
	// Try to parse as a normal URI.
	if parsed, err := url.ParseRequestURI(u); err == nil {
		// Hostname drops a trailing ":port", which would otherwise stop the
		// host from matching a domain suffix later on.
		return []string{parsed.Hostname()}
	}

	// HTML unescape.
	u = html.UnescapeString(u)

	// Query unescape. QueryUnescape returns "" together with its error, so
	// only take the result on success; a malformed escape must not wipe out
	// the whole candidate.
	if unescaped, err := url.QueryUnescape(u); err == nil {
		u = unescaped
	}

	// Suppress bad chars.
	u = badCharReplacer.Replace(u)

	// Suppress bad starting characters.
	u = suppressLeftChar(u)

	// Split on spaces.
	return strings.Split(u, " ")
}
// suppressLeftChar removes known leading noise from s.
//
// A "-www" prefix loses only its leading '-'; a "-site" prefix is removed
// entirely. Otherwise, if s contains a '/', everything before the first '/'
// is dropped (the '/' itself is kept). In all other cases s is returned
// unchanged.
func suppressLeftChar(s string) string {
	switch {
	case strings.HasPrefix(s, "-www"):
		return s[1:]
	case strings.HasPrefix(s, "-site"):
		return s[5:]
	}
	if slash := strings.IndexByte(s, '/'); slash >= 0 {
		return s[slash:]
	}
	return s
}
// filterByDomain returns the entries of urls that end with domain, without
// duplicates and in order of first appearance.
//
// The match is a plain string-suffix comparison (strings.HasSuffix); no
// label-boundary check is performed. The result is always a non-nil slice,
// even when nothing matches.
func filterByDomain(urls []string, domain string) []string {
	// Start from an empty, non-nil slice so callers get [] rather than nil
	// when there are no matches.
	result := []string{}
	seen := make(map[string]struct{})
	for _, u := range urls {
		if !strings.HasSuffix(u, domain) {
			continue
		}
		if _, dup := seen[u]; dup {
			continue
		}
		seen[u] = struct{}{}
		result = append(result, u)
	}
	return result
}