-
Notifications
You must be signed in to change notification settings - Fork 1
/
proxier.go
273 lines (232 loc) · 7.88 KB
/
proxier.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
// Package proxier finds proxies and makes requests through them for web crawling
package proxier
import (
"context"
"fmt"
"io"
"net/http"
"time"
"github.com/vertoforce/proxier/proxy"
"github.com/vertoforce/proxier/proxy/proxyDBs/inmemory"
"github.com/vertoforce/proxier/proxy/proxysources/getproxylist"
"github.com/vertoforce/proxier/proxy/proxysources/gimmeproxy"
)
// Default timeout values.
const (
// DefaultProxyDBTimeout is five seconds.
// NOTE(review): presumably the timeout intended for proxy DB operations — confirm against callers.
DefaultProxyDBTimeout = time.Second * 5
// DefaultProxyTimeout is four seconds.
// NOTE(review): presumably the intended default for Proxier.ProxyTimeout — confirm against callers.
DefaultProxyTimeout = time.Second * 4
)
// DefaultProxySources are the proxy sources that New registers on a freshly
// created Proxier.
var (
DefaultProxySources = []proxy.ProxySource{
&getproxylist.GetProxyListSource{},
&gimmeproxy.GimmeProxySource{},
}
)
// CheckResponseFunc reports whether an HTTP response counts as a successful
// request. DoRequestExtra calls it on each response it receives and moves on
// to the next proxy when it returns false.
// DoRequest uses DefaultCheckResponseFunc, which rejects only 403 and 429.
type CheckResponseFunc func(*http.Response) bool
// Proxier fetches proxies from its proxy sources, keeps the ones it knows about
// in a proxy DB and performs HTTP requests through them.
type Proxier struct {
// proxySources is the set of sources new proxies are fetched from. It is a
// map so that ranging over it does not always try the sources in the same order.
proxySources map[proxy.ProxySource]bool
// proxyDB is where we store the proxies we know about.
proxyDB proxy.ProxyDB
// ProxyTimeout is how long makeProxyRequest waits for a single proxied
// request before giving up on that proxy.
ProxyTimeout time.Duration
// TryNoProxyFirst makes DoRequest attempt a direct request through
// http.DefaultClient before trying any proxy.
TryNoProxyFirst bool
// AllowedProxyProtocols lists the proxy protocols GetProxyFromSources
// accepts; proxies using any other protocol are skipped. Note that with an
// empty list no proxy is ever accepted.
AllowedProxyProtocols []proxy.Protocol
}
// NewBare creates a bare Proxier: no proxy sources, no proxy DB and no allowed
// proxy protocols. Use the With* methods to configure it.
//
// ProxyTimeout starts at DefaultProxyTimeout.
func NewBare() *Proxier {
	return &Proxier{
		proxySources: map[proxy.ProxySource]bool{},
		// ProxyTimeout bounds a single request through a proxy, so it takes
		// the per-proxy default rather than the proxy DB default.
		ProxyTimeout: DefaultProxyTimeout,
	}
}
// New creates a Proxier that is ready to use: it starts from NewBare, registers
// DefaultProxySources, stores proxies in an in-memory proxy DB and accepts the
// SOCKS family of proxy protocols.
func New() *Proxier {
	p := NewBare()
	// Each With* method mutates p and returns it, so the results need not be
	// re-assigned.
	p.WithProxySources(DefaultProxySources...)
	p.WithProxyDB(inmemory.New())
	p.WithAllowedProxyProtocols(
		proxy.Socks4Protocol,
		proxy.Socks4aProtocol,
		proxy.Socks5Protocol,
		proxy.Socks5hProtocol,
		proxy.SocksProtocol,
	)
	return p
}
// WithAllowedProxyProtocols adds protocols to the list of proxy protocols that
// are accepted when fetching a proxy from our proxy sources. Protocols added
// by earlier calls are kept. It returns p to allow chaining.
func (p *Proxier) WithAllowedProxyProtocols(protocols ...proxy.Protocol) *Proxier {
	p.AllowedProxyProtocols = append(p.AllowedProxyProtocols, protocols...)
	return p
}
// WithProxySources registers additional proxy sources. Registering the same
// source twice has no further effect. It returns p to allow chaining.
func (p *Proxier) WithProxySources(sources ...proxy.ProxySource) *Proxier {
	// A Proxier that was not built by NewBare/New has a nil map, and writing
	// to a nil map panics.
	if p.proxySources == nil {
		p.proxySources = make(map[proxy.ProxySource]bool, len(sources))
	}
	for _, source := range sources {
		p.proxySources[source] = true
	}
	return p
}
// WithProxyDB sets the proxy DB used to store known proxies. A Proxier has
// exactly one proxy DB, so this replaces any DB set earlier. It returns p to
// allow chaining.
func (p *Proxier) WithProxyDB(proxyDB proxy.ProxyDB) *Proxier {
	p.proxyDB = proxyDB

	return p
}
// WithProxies stores the given proxies in the proxy DB and returns p to allow
// chaining.
//
// The proxy DB is used without a nil check, so one must have been set (see
// WithProxyDB) before calling this.
func (p *Proxier) WithProxies(ctx context.Context, proxies ...*proxy.Proxy) *Proxier {
	for i := range proxies {
		// The chaining signature has no way to report a failure, so the
		// error from StoreProxy is deliberately discarded.
		_ = p.proxyDB.StoreProxy(ctx, proxies[i])
	}
	return p
}
// -- functionality --
// GetProxyFromSources fetches a new proxy from one of our proxy sources.
//
// Each source is asked for proxies until it returns an error (taken to mean it
// has none left for now); the first proxy whose protocol is listed in
// AllowedProxyProtocols is returned.
//
// It returns an error when no protocols are allowed, when ctx is cancelled or
// expired (ctx.Err() is returned), or when every source is exhausted without
// yielding an allowed proxy.
func (p *Proxier) GetProxyFromSources(ctx context.Context) (*proxy.Proxy, error) {
	// With nothing allowed every fetched proxy would be thrown away, so do not
	// drain the sources for nothing.
	if len(p.AllowedProxyProtocols) == 0 {
		return nil, fmt.Errorf("no allowed proxy protocols configured")
	}

	for source := range p.proxySources {
		// Try to find a valid proxy from this source
		for {
			// The loop is otherwise unbounded; stop as soon as the caller
			// gives up, even if the source itself ignores ctx.
			if err := ctx.Err(); err != nil {
				return nil, err
			}

			candidate, err := source.GetProxy(ctx)
			if err != nil || candidate == nil {
				// This proxy source has no more proxies (for now). A nil
				// proxy without an error is treated the same way rather than
				// dereferenced.
				break
			}

			// Check if it uses one of our allowed protocols
			for _, allowed := range p.AllowedProxyProtocols {
				if candidate.Protocol == allowed {
					return candidate, nil
				}
			}
		}
	}

	// No proxies to be found
	return nil, fmt.Errorf("no new proxies available")
}
// CacheProxies fetches up to count proxies from our sources and stores each
// one in the proxy DB for later use.
//
// It stops early, without an error, once the sources have no more proxies. If
// storing a proxy fails, that error is returned. In both cases added is the
// number of proxies stored so far.
func (p *Proxier) CacheProxies(ctx context.Context, count int) (added int, err error) {
	for added < count {
		fetched, fetchErr := p.GetProxyFromSources(ctx)
		if fetchErr != nil {
			// No more proxies available; this is not reported as a failure.
			return added, nil
		}

		if storeErr := p.proxyDB.StoreProxy(ctx, fetched); storeErr != nil {
			return added, storeErr
		}
		added++
	}
	return added, nil
}
// DoRequestRaw builds an HTTP request from method, URL and body and hands it
// to DoRequest, which cycles through proxies until a response passes
// DefaultCheckResponseFunc.
//
// An error from building the request is returned as is.
func (p *Proxier) DoRequestRaw(ctx context.Context, method, URL string, body io.Reader) (*http.Response, error) {
	request, buildErr := http.NewRequestWithContext(ctx, method, URL, body)
	if buildErr != nil {
		return nil, buildErr
	}

	response, doErr := p.DoRequest(ctx, request)
	return response, doErr
}
// DoRequest performs req through DoRequestExtra using the Proxier's defaults:
// p.TryNoProxyFirst decides whether a direct request is attempted first, and
// DefaultCheckResponseFunc decides whether a response counts as a success.
func (p *Proxier) DoRequest(ctx context.Context, req *http.Request) (*http.Response, error) {
	tryDirectFirst := p.TryNoProxyFirst
	return p.DoRequestExtra(ctx, req, tryDirectFirst, DefaultCheckResponseFunc)
}
// DoRequestExtra performs req, cycling through proxies until one produces a
// response that checkResponseFunc accepts.
//
// Attempts are made in this order:
//  1. If tryNoProxyFirst is true, a direct request through http.DefaultClient.
//  2. Every proxy currently in the proxy DB, in no particular order. A proxy
//     whose request fails or whose response is rejected is deleted from the DB.
//  3. New proxies from GetProxyFromSources, until one works (it is then stored
//     in the DB) or no more proxies are available.
//
// Responses that are rejected have their body closed before the next attempt.
// When req.GetBody is set, every attempt after the first gets a fresh copy of
// the request body; without it, a body consumed by an earlier attempt cannot
// be replayed.
//
// It returns an error when no proxy DB is set, when the DB cannot list its
// proxies, when ctx is cancelled or expired (ctx.Err() is returned), or when
// no attempt succeeded. The caller must close the body of a returned response.
func (p *Proxier) DoRequestExtra(ctx context.Context, req *http.Request, tryNoProxyFirst bool, checkResponseFunc CheckResponseFunc) (*http.Response, error) {
	// TODO: Add max request count to avoid endless requesting for a down server, or a server that always returns 403
	req = req.WithContext(ctx)

	// nextAttempt returns the request to send for the next try. It refuses to
	// continue once ctx is done, and rewinds the body for every try after the
	// first when the request knows how to recreate it.
	firstAttempt := true
	nextAttempt := func() (*http.Request, error) {
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		if firstAttempt || req.GetBody == nil {
			firstAttempt = false
			return req, nil
		}
		body, err := req.GetBody()
		if err != nil {
			return nil, fmt.Errorf("rewinding request body: %w", err)
		}
		// WithContext makes a shallow copy, so the caller's request keeps its
		// own Body field.
		retry := req.WithContext(ctx)
		retry.Body = body
		return retry, nil
	}

	// accepted reports whether an attempt succeeded. A response that is
	// rejected is closed here so its connection is not leaked.
	accepted := func(resp *http.Response, err error) bool {
		if err != nil || resp == nil {
			return false
		}
		if checkResponseFunc(resp) {
			return true
		}
		if resp.Body != nil {
			resp.Body.Close()
		}
		return false
	}

	// -- Try default request --
	if tryNoProxyFirst {
		attempt, err := nextAttempt()
		if err != nil {
			return nil, err
		}
		resp, err := http.DefaultClient.Do(attempt)
		if accepted(resp, err) {
			return resp, nil
		}
	}

	// -- Try our DB Proxies --
	if p.proxyDB == nil {
		return nil, fmt.Errorf("no proxydb set and is required")
	}
	proxies, err := p.proxyDB.GetProxies(ctx)
	if err != nil {
		return nil, err
	}

	// Convert to map so we do not always try the proxies in the same order
	shuffled := map[*proxy.Proxy]bool{}
	for _, stored := range proxies {
		shuffled[stored] = true
	}
	for candidate := range shuffled {
		attempt, err := nextAttempt()
		if err != nil {
			return nil, err
		}
		resp, err := p.makeProxyRequest(ctx, candidate, attempt)
		if accepted(resp, err) {
			return resp, nil
		}

		// Once ctx is done every request fails regardless of the proxy, so
		// stop here instead of deleting proxies that may be perfectly fine.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, ctxErr
		}

		// This wasn't a success, we should ditch this proxy from the database
		// TODO: Change this to delete after 3 failures or something
		p.proxyDB.DelProxy(ctx, candidate)
	}

	// -- Get new proxies --
	// If we are here, there are no valid proxies available in the proxyDB.
	// Keep trying new proxies until one works, the proxy sources run out, or
	// ctx is done.
	for {
		fresh, err := p.GetProxyFromSources(ctx)
		if err != nil {
			// No more proxies to try
			return nil, fmt.Errorf("no proxies available")
		}

		attempt, err := nextAttempt()
		if err != nil {
			return nil, err
		}
		resp, err := p.makeProxyRequest(ctx, fresh, attempt)
		if !accepted(resp, err) {
			continue
		}

		// It worked! Remember this proxy. The request itself already
		// succeeded, so a failure to store the proxy is deliberately ignored.
		_ = p.proxyDB.StoreProxy(ctx, fresh)
		return resp, nil
	}
}
// cancelOnCloseBody wraps a response body so that the context the request ran
// under is cancelled once the caller closes the body.
type cancelOnCloseBody struct {
	io.ReadCloser
	cancel context.CancelFunc
}

// Close closes the wrapped body and then releases the request context.
func (b *cancelOnCloseBody) Close() error {
	err := b.ReadCloser.Close()
	b.cancel()
	return err
}

// makeProxyRequest sends req through proxy and waits at most p.ProxyTimeout
// for the response.
//
// On timeout the request context is cancelled and that context's error is
// returned. Otherwise the response and error from proxy.DoRequest are returned
// unchanged, except that a successful response's Body is wrapped so that
// closing it also cancels the request context. Cancelling any earlier would
// make reading the body fail.
func (p *Proxier) makeProxyRequest(ctx context.Context, proxy *proxy.Proxy, req *http.Request) (resp *http.Response, err error) {
	proxyCtx, cancel := context.WithCancel(ctx)

	type outcome struct {
		resp *http.Response
		err  error
	}
	// Buffered so the goroutine can always deliver its result and exit, even
	// when nobody is waiting for it any more.
	done := make(chan outcome, 1)
	go func() {
		// Use locals rather than the named results: this goroutine may still
		// be running after makeProxyRequest has returned.
		r, e := proxy.DoRequest(proxyCtx, req)
		done <- outcome{resp: r, err: e}
	}()

	timer := time.NewTimer(p.ProxyTimeout)
	defer timer.Stop()

	// Wait for it either to be done, or timeout
	select {
	case <-timer.C:
		cancel()
		// The request may still complete after we stopped waiting; close that
		// late response so its connection is not leaked.
		go func() {
			if late := <-done; late.resp != nil && late.resp.Body != nil {
				late.resp.Body.Close()
			}
		}()
		return nil, proxyCtx.Err()
	case out := <-done:
		if out.err != nil || out.resp == nil || out.resp.Body == nil {
			// Nothing left to read, so the context can be released right away.
			cancel()
			return out.resp, out.err
		}
		out.resp.Body = &cancelOnCloseBody{ReadCloser: out.resp.Body, cancel: cancel}
		return out.resp, out.err
	}
}
// DefaultCheckResponseFunc reports whether resp counts as a successful request.
//
// Only 403 (Forbidden) and 429 (Too Many Requests) are treated as failures,
// as they typically mean the proxy was blocked or rate limited. Every other
// status code is a success, including 200 and the 5xx range: a server error is
// not the proxy's fault, the request itself went through.
//
// resp must not be nil.
func DefaultCheckResponseFunc(resp *http.Response) bool {
	// TODO: Change this from checking 200
	switch resp.StatusCode {
	case http.StatusForbidden, http.StatusTooManyRequests:
		return false
	default:
		return true
	}
}