diff --git a/go.mod b/go.mod index 2f529c9..717bfce 100644 --- a/go.mod +++ b/go.mod @@ -1,19 +1,18 @@ module github.com/detectify/page-fetch -go 1.20 +go 1.23.0 require ( - github.com/chromedp/cdproto v0.0.0-20230605001715-1e95ea08ffe6 - github.com/chromedp/chromedp v0.9.1 + github.com/chromedp/cdproto v0.0.0-20250319231242-a755498943c8 + github.com/chromedp/chromedp v0.13.2 golang.org/x/net v0.38.0 ) require ( - github.com/chromedp/sysutil v1.0.0 // indirect + github.com/chromedp/sysutil v1.1.0 // indirect + github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.2.1 // indirect - github.com/josharian/intern v1.0.0 // indirect - github.com/mailru/easyjson v0.7.7 // indirect + github.com/gobwas/ws v1.4.0 // indirect golang.org/x/sys v0.31.0 // indirect ) diff --git a/go.sum b/go.sum index 2ebbee5..ef566be 100644 --- a/go.sum +++ b/go.sum @@ -1,28 +1,23 @@ -github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/cdproto v0.0.0-20230605001715-1e95ea08ffe6 h1:lH/2I023SRn9+1SaCEQk0iax3HKq8bJ6aj4Iyq4VwjM= -github.com/chromedp/cdproto v0.0.0-20230605001715-1e95ea08ffe6/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/chromedp v0.9.1 h1:CC7cC5p1BeLiiS2gfNNPwp3OaUxtRMBjfiw3E3k6dFA= -github.com/chromedp/chromedp v0.9.1/go.mod h1:DUgZWRvYoEfgi66CgZ/9Yv+psgi+Sksy5DTScENWjaQ= -github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= -github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= +github.com/chromedp/cdproto v0.0.0-20250319231242-a755498943c8 h1:AqW2bDQf67Zbq6Tpop/+yJSIknxhiQecO2B8jNYTAPs= +github.com/chromedp/cdproto v0.0.0-20250319231242-a755498943c8/go.mod h1:NItd7aLkcfOA/dcMXvl8p1u+lQqioRMq/SqDp71Pb/k= +github.com/chromedp/chromedp v0.13.2 h1:f6sZFFzCzPLvWSzeuXQBgONKG7zPq54YfEyEj0EplOY= +github.com/chromedp/chromedp v0.13.2/go.mod h1:khsDP9OP20GrowpJfZ7N05iGCwcAYxk7qf9AZBzR3Qw= +github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM= +github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8= +github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535 h1:yE7argOs92u+sSCRgqqe6eF+cDaVhSPlioy1UkA0p/w= +github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535/go.mod h1:BWmvoE1Xia34f3l/ibJweyhrT+aROb/FQ6d+37F0e2s= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= -github.com/gobwas/ws v1.2.1 h1:F2aeBZrm2NDsc7vbovKrWSogd4wvfAxg0FQ89/iqOTk= -github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/sys v0.0.0-20201207223542-d4d67f95c62d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= diff --git a/main.go b/main.go index a902f71..2164fad 100644 --- a/main.go +++ b/main.go @@ -15,6 +15,7 @@ import ( "time" "github.com/chromedp/cdproto/fetch" + "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" "golang.org/x/net/publicsuffix" ) @@ -25,22 +26,22 @@ func init() { "Request URLs using headless Chrome, storing the results", "", "Usage:", - " page-fetch [options] < urls.txt", + " page-fetch [options] < urls.txt", "", "Options:", - " -c, --concurrency Concurrency Level (default 2)", - " -d, --delay Milliseconds to wait between requests (default 0)", - " -e, --exclude Do not save responses matching the provided string (can be specified multiple times)", - " -i, --include Only save requests matching the provided string (can be specified multiple times)", - " -j, --javascript JavaScript to run on each page", - " -o, --output Output directory name (default 'out')", - " -p, --proxy Use proxy on given URL", - " -w, --overwrite Overwrite output files when they already exist", - " --no-third-party Do not save responses to requests on third-party domains", - " --third-party Only save responses to requests on third-party domains", + " -c, --concurrency Concurrency Level (default 2)", + " -d, --delay Milliseconds to wait between requests (default 0)", + " -e, --exclude Do not save responses matching the provided string (can be specified multiple times)", + " -i, --include Only save requests matching the provided string (can be specified multiple times)", + " -j, --javascript JavaScript to run on each page", + " -o, --output Output directory name (default 'out')", + " -p, --proxy Use proxy on given URL", + " -t, --timeout Timeout in seconds for each request (default 10)", + " -w, --overwrite Overwrite output files when they already exist", + " --no-third-party Do not save responses to requests on third-party domains", + " --third-party Only save responses to requests on third-party domains", "", } - fmt.Fprint(os.Stderr, strings.Join(h, "\n")) } } @@ -56,39 +57,31 @@ type options struct { delay int js string proxy string + timeout int } func main() { - opts := options{} - flag.Var(&opts.includes, "include", "") flag.Var(&opts.includes, "i", "") - flag.Var(&opts.excludes, "exclude", "") flag.Var(&opts.excludes, "e", "") - flag.BoolVar(&opts.thirdPartyOnly, "third-party", false, "") flag.BoolVar(&opts.noThirdParty, "no-third-party", false, "") - flag.BoolVar(&opts.overwrite, "overwrite", false, "") flag.BoolVar(&opts.overwrite, "w", false, "") - flag.StringVar(&opts.output, "output", "out", "") flag.StringVar(&opts.output, "o", "out", "") - flag.IntVar(&opts.concurrency, "concurrency", 2, "") flag.IntVar(&opts.concurrency, "c", 2, "") - flag.IntVar(&opts.delay, "delay", 0, "") flag.IntVar(&opts.delay, "d", 0, "") - flag.StringVar(&opts.js, "j", "", "") flag.StringVar(&opts.js, "javascript", "", "") - flag.StringVar(&opts.proxy, "p", "", "") flag.StringVar(&opts.proxy, "proxy", "", "") - + flag.IntVar(&opts.timeout, "timeout", 10, "") + flag.IntVar(&opts.timeout, "t", 10, "") flag.Parse() if opts.thirdPartyOnly && opts.noThirdParty { @@ -99,25 +92,22 @@ func main() { copts := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("ignore-certificate-errors", true), ) - if opts.proxy != "" { _, err := url.ParseRequestURI(opts.proxy) if err != nil { fmt.Fprintln(os.Stderr, "invalid proxy URL") return } - copts = append(copts, chromedp.ProxyServer(opts.proxy)) } - - // bypass chrome headless detection + // bypass chrome headless detection (updated flags and UA) copts = append(copts, - chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0"), + chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"), chromedp.WindowSize(1920, 1080), chromedp.NoFirstRun, chromedp.NoDefaultBrowserCheck, - chromedp.Headless, - chromedp.DisableGPU) + chromedp.Flag("headless", "new"), // Use new headless mode + ) ectx, ecancel := chromedp.NewExecAllocator(context.Background(), copts...) defer ecancel() @@ -134,7 +124,6 @@ func main() { } sc := bufio.NewScanner(os.Stdin) - var wg sync.WaitGroup jobs := make(chan string) @@ -142,66 +131,55 @@ func main() { wg.Add(1) go func() { for requestURL := range jobs { - - ctx, cancel := context.WithTimeout(pctx, time.Second*10) + ctx, cancel := context.WithTimeout(pctx, time.Duration(opts.timeout)*time.Second) ctx, _ = chromedp.NewContext(ctx) - // we want to intercept all requests, so we add a listener here chromedp.ListenTarget(ctx, makeListener(ctx, requestURL, opts)) - // default to evaluating "false" to avoid errant errors jsCode := opts.js if jsCode == "" { jsCode = "false" } - var jsOutput interface{} err := chromedp.Run( ctx, - fetch.Enable().WithPatterns([]*fetch.RequestPattern{{RequestStage: fetch.RequestStageResponse}}), + fetch.Enable().WithPatterns([]*fetch.RequestPattern{{URLPattern: "*", RequestStage: fetch.RequestStageResponse}}), chromedp.Navigate(requestURL), chromedp.EvaluateAsDevTools(jsCode, &jsOutput), ) - if opts.js != "" { fmt.Printf("JS (%s): %v\n", requestURL, jsOutput) } - if err != nil { fmt.Fprintf(os.Stderr, "run error: %s\n", err) } - if opts.delay > 0 { sleepDuration := time.Duration(opts.delay) time.Sleep(sleepDuration * time.Millisecond) } - cancel() } wg.Done() }() } + for sc.Scan() { jobs <- sc.Text() } close(jobs) - wg.Wait() } func saveResponse(requestURL string, data []byte, output string, overwrite bool) (string, error) { - path, err := makeFilepath(output, requestURL) if err != nil { return "", err } - dir := filepath.Dir(path) err = os.MkdirAll(dir, 0755) if err != nil { return "", err } - i := 1 for !overwrite { // should probably do something like get all the files @@ -212,13 +190,10 @@ func saveResponse(requestURL string, data []byte, output string, overwrite bool) if _, err := os.Stat(path); err != nil { break } - path = fmt.Sprintf("%s.%d", strings.TrimRight(path, ".1234567890"), i) i++ } - return path, os.WriteFile(path, data, 0644) - } func makeFilepath(prefix, requestURL string) (string, error) { @@ -226,123 +201,94 @@ func makeFilepath(prefix, requestURL string) (string, error) { if err != nil { return "", err } - requestPath := u.EscapedPath() if requestPath == "/" { requestPath = "/index" } - savePath := fmt.Sprintf("%s/%s%s", prefix, u.Hostname(), requestPath) - re := regexp.MustCompile("[^a-zA-Z0-9_.%/-]") savePath = re.ReplaceAllString(savePath, "-") - // remove multiple dashes in a row re = regexp.MustCompile("-+") savePath = re.ReplaceAllString(savePath, "-") - // remove multiple slashes in a row re = regexp.MustCompile("/+") savePath = re.ReplaceAllString(savePath, "/") - // we shouldn't see any, but remove any double-dots just in case re = regexp.MustCompile("\\.\\.") savePath = re.ReplaceAllString(savePath, "-") - savePath = strings.TrimSuffix(savePath, "/") - return savePath, nil - } -func saveMeta(path string, parentURL string, ev *fetch.EventRequestPaused) error { - +func saveMeta(path string, parentURL string, ev *fetch.EventRequestPaused, postData string) error { b := &bytes.Buffer{} - fmt.Fprintf(b, "url: %s\n", ev.Request.URL) fmt.Fprintf(b, "parent: %s\n", parentURL) fmt.Fprintf(b, "method: %s\n", ev.Request.Method) fmt.Fprintf(b, "type: %s\n", ev.ResourceType) - b.WriteRune('\n') - for k, v := range ev.Request.Headers { fmt.Fprintf(b, "> %s: %s\n", k, v) } - - if ev.Request.PostData != "" { - fmt.Fprintf(b, "\n%s\n", ev.Request.PostData) + if postData != "" { + fmt.Fprintf(b, "\n%s\n", postData) } - b.WriteRune('\n') - for _, h := range ev.ResponseHeaders { fmt.Fprintf(b, "< %s: %s\n", h.Name, h.Value) } - return os.WriteFile(path, b.Bytes(), 0644) } func shouldSave(ev *fetch.EventRequestPaused, requestURL string, opts options) bool { - contentType := "unknown" for _, h := range ev.ResponseHeaders { if strings.ToLower(h.Name) == "content-type" { contentType = strings.ToLower(h.Value) } } - for _, i := range opts.includes { if strings.Contains(contentType, strings.ToLower(i)) { break } return false } - for _, e := range opts.excludes { if strings.Contains(contentType, strings.ToLower(e)) { return false } } - var domain string if u, err := url.Parse(requestURL); err == nil { domain = u.Hostname() } - var subRequestDomain string if u, err := url.Parse(ev.Request.URL); err == nil { subRequestDomain = u.Hostname() } - if opts.thirdPartyOnly { return isThirdParty(domain, subRequestDomain) } - // you might be thinking "wait, what if opts.thirdPartyOnly and // opts.noThirdParty are both true?!". We check in main() that // is not the case so we should be all good here (: if opts.noThirdParty { return !isThirdParty(domain, subRequestDomain) } - return true } func makeListener(ctx context.Context, requestURL string, opts options) func(interface{}) { - return func(ev interface{}) { if ev, ok := ev.(*fetch.EventRequestPaused); ok { - go func() { - contentType := "unknown" for _, h := range ev.ResponseHeaders { if strings.ToLower(h.Name) == "content-type" { contentType = strings.ToLower(h.Value) } } - if !shouldSave(ev, requestURL, opts) { err := chromedp.Run(ctx, fetch.ContinueRequest(ev.RequestID)) if err != nil { @@ -350,7 +296,6 @@ func makeListener(ctx context.Context, requestURL string, opts options) func(int } return } - err := chromedp.Run( ctx, chromedp.ActionFunc(func(ctx context.Context) error { @@ -360,28 +305,35 @@ func makeListener(ctx context.Context, requestURL string, opts options) func(int // otherwise the ContinueRequest does not run return nil } - path, err := saveResponse(ev.Request.URL, data, opts.output, opts.overwrite) if err != nil { fmt.Fprintf(os.Stderr, "failed to save response data for %s: %s\n", ev.Request.URL, err) return nil } - + var postData string + if ev.Request.HasPostData { + if ev.NetworkID == "" { + fmt.Fprintf(os.Stderr, "has post data but no network ID for %s\n", ev.Request.URL) + } else { + postData, err = network.GetRequestPostData(ev.NetworkID).Do(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to get post data for %s: %s\n", ev.Request.URL, err) + postData = "" + } + } + } // save the headers etc in a separate file - err = saveMeta(path+".meta", requestURL, ev) + err = saveMeta(path+".meta", requestURL, ev, postData) if err != nil { fmt.Fprintf(os.Stderr, "failed to save response meta data for %s: %s\n", ev.Request.URL, err) return nil } - // Log the request fmt.Printf("%s %s %d %s\n", ev.Request.Method, ev.Request.URL, ev.ResponseStatusCode, contentType) - return nil }), fetch.ContinueRequest(ev.RequestID), ) - if err != nil { fmt.Fprintf(os.Stderr, "continue request err: %s\n", err) } @@ -396,12 +348,10 @@ func isThirdParty(base, sub string) bool { if err != nil { return false } - sub, err = publicsuffix.EffectiveTLDPlusOne(sub) if err != nil { return false } - return base != sub }