From dbc477ce7e0d50bfa2492b142eb48232197b0917 Mon Sep 17 00:00:00 2001 From: Colin Ramsay Date: Wed, 5 Jun 2024 16:30:03 +0100 Subject: [PATCH] fix: add ability to track referrer/parent page and link --- start.go | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/start.go b/start.go index f9f8bd0..e3edb43 100644 --- a/start.go +++ b/start.go @@ -23,13 +23,13 @@ import ( type Configuration struct { Debug bool `json:"debug"` UserAgent string `json:"userAgent"` - HtmlCache struct { + HtmlCache struct { Directory string `json:"directory"` } `json:"htmlCache"` PdfCache struct { Directory string `json:"directory"` } `json:"pdfCache"` - Request struct { + Request struct { TimeoutInMs int `json:"timeoutInMs"` DomainGlob string `json:"domainGlob"` Parallelism int `json:"parellelism"` @@ -56,8 +56,15 @@ type Configuration struct { type HtmlSelectorTemplateVars struct { Request colly.Request Response colly.Response + Referrer Referrer } +type Referrer struct { + Url string + LinkText string +} + + type PdfSelectorTemplateVars struct { Response colly.Response Request colly.Request @@ -135,7 +142,7 @@ func regexpFromConfig(input []string) []*regexp.Regexp { } func main() { - + log.Println("Starting...") testUrlPtr := flag.String("testUrl", "", "A single URL. When provided, will show the output from that URL only.") flag.Parse() @@ -172,6 +179,7 @@ func main() { } options = append(options, colly.Async(true)) + log.Println("Creating collector...") c := colly.NewCollector(options...) c.SetRequestTimeout(time.Duration(configuration.Request.TimeoutInMs) * time.Millisecond) @@ -192,6 +200,7 @@ func main() { }) c.OnHTML("html", func(htmlEl *colly.HTMLElement) { + log.Println("Starting doc...") document := make(map[string]string) htmlEl.DOM.Find("script,style,link,form").Remove() @@ -201,7 +210,7 @@ func main() { if strings.Contains(selector, "{{") { t := template.Must(template.New("selectorTpl").Funcs(sprig.TxtFuncMap()).Parse(selector)) var tpl bytes.Buffer - data := HtmlSelectorTemplateVars{Request: *htmlEl.Request, Response: *htmlEl.Response} + data := HtmlSelectorTemplateVars{Request: *htmlEl.Request, Response: *htmlEl.Response, Referrer: Referrer{ Url: htmlEl.Request.Ctx.Get("refUrl"), LinkText: htmlEl.Request.Ctx.Get("linkText") }} err := t.Execute(&tpl, data) if err != nil { @@ -223,8 +232,15 @@ func main() { } htmlEl.ForEach("a[href]", func(_ int, el *colly.HTMLElement) { - htmlEl.Request.Visit(el.Attr("href")) + ctx := colly.NewContext() + ctx.Put("refUrl", el.Request.URL.String()) + ctx.Put("linkText", el.Text) + c.Request("GET", + el.Request.AbsoluteURL(el.Attr("href")), + nil, ctx, nil) + + // htmlEl.Request.Visit(el.Attr("href")) }) } else { fmt.Println(document) @@ -236,11 +252,12 @@ func main() { err := os.Mkdir(configuration.PdfCache.Directory, 0755) if err != nil { - log.Fatal(err) + log.Fatal("Error creating PDF cache:", configuration.PdfCache.Directory, err) } } c.OnResponse(func(resp *colly.Response) { + log.Println("response") ext := filepath.Ext(resp.Request.URL.Path) if ext == ".pdf" { @@ -279,6 +296,8 @@ func main() { panic(err) } val = tpl.String() + + log.Print(val) } document[key] = val }