From af0e0aada15305ca22228c141ce9c9c180fdd680 Mon Sep 17 00:00:00 2001 From: Cornel Date: Wed, 1 Jan 2025 09:48:58 -0600 Subject: [PATCH] support inline css (#51) --- css/css.go | 51 +++++++++ htmlindex/attributes.go | 25 ++++- htmlindex/htmlindex.go | 122 +++++++++++++++------- htmlindex/htmlindex_test.go | 4 +- scraper/css.go | 70 ------------- scraper/download.go | 34 +++++- scraper/{css_test.go => download_test.go} | 4 +- scraper/html.go | 62 ++++++++++- scraper/html_test.go | 2 +- scraper/scraper.go | 2 +- scraper/scraper_test.go | 63 +++++++++++ 11 files changed, 322 insertions(+), 117 deletions(-) create mode 100644 css/css.go delete mode 100644 scraper/css.go rename scraper/{css_test.go => download_test.go} (93%) diff --git a/css/css.go b/css/css.go new file mode 100644 index 0000000..eaa0ccb --- /dev/null +++ b/css/css.go @@ -0,0 +1,51 @@ +package css + +import ( + "net/url" + "regexp" + "strings" + + "github.com/cornelk/gotokit/log" + "github.com/gorilla/css/scanner" +) + +var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`) + +type Token = scanner.Token + +type urlProcessor func(token *Token, data string, url *url.URL) + +// Process the CSS data and call a processor for every found URL. 
+func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) { + css := scanner.New(data) + + for { + token := css.Next() + if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { + break + } + if token.Type != scanner.TokenURI { + continue + } + + match := cssURLRe.FindStringSubmatch(token.Value) + if match == nil { + continue + } + + src := match[1] + if strings.HasPrefix(strings.ToLower(src), "data:") { + continue // skip embedded data + } + + u, err := url.Parse(src) + if err != nil { + logger.Error("Parsing URL failed", + log.String("url", src), + log.Err(err)) + continue + } + + processor(token, src, u) + } +} diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go index 36721ce..16e2277 100644 --- a/htmlindex/attributes.go +++ b/htmlindex/attributes.go @@ -1,13 +1,29 @@ package htmlindex +import ( + "net/url" + + "github.com/cornelk/gotokit/log" + "golang.org/x/net/html" +) + +type nodeAttributeParserData struct { + logger *log.Logger + url *url.URL + node *html.Node + attribute string + value string +} + // nodeAttributeParser returns the URL values of the attribute of the node and // whether the attribute has been processed. -type nodeAttributeParser func(attribute, value string) ([]string, bool) +type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool) type Node struct { Attributes []string - parser nodeAttributeParser + noChildParsing bool + parser nodeAttributeParser } const ( @@ -27,6 +43,7 @@ const ( ImgTag = "img" LinkTag = "link" ScriptTag = "script" + StyleTag = "style" ) // Nodes describes the HTML tags and their attributes that can contain URL. @@ -47,6 +64,10 @@ var Nodes = map[string]Node{ ScriptTag: { Attributes: []string{SrcAttribute}, }, + StyleTag: { + noChildParsing: true, + parser: styleParser, + }, } // SrcSetAttributes contains the attributes that contain srcset values. 
diff --git a/htmlindex/htmlindex.go b/htmlindex/htmlindex.go index aaf15a1..19e7033 100644 --- a/htmlindex/htmlindex.go +++ b/htmlindex/htmlindex.go @@ -6,55 +6,64 @@ import ( "sort" "strings" + "github.com/cornelk/goscrape/css" + "github.com/cornelk/gotokit/log" "golang.org/x/net/html" ) // Index provides an index for all HTML tags of relevance for scraping. type Index struct { + logger *log.Logger + // key is HTML tag, value is a map of all its urls and the HTML nodes for it data map[string]map[string][]*html.Node } // New returns a new index. -func New() *Index { +func New(logger *log.Logger) *Index { return &Index{ - data: make(map[string]map[string][]*html.Node), + logger: logger, + data: make(map[string]map[string][]*html.Node), } } // Index the given HTML document. -func (h *Index) Index(baseURL *url.URL, node *html.Node) { +func (idx *Index) Index(baseURL *url.URL, node *html.Node) { for child := node.FirstChild; child != nil; child = child.NextSibling { - if child.Type != html.ElementNode { - continue + switch child.Type { + case html.ElementNode: + idx.indexElementNode(baseURL, node, child) + default: } + } +} - var references []string +func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) { + var references []string - info, ok := Nodes[child.Data] - if ok { - references = nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) - } + info, ok := Nodes[child.Data] + if ok { + references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) 
+ } - m, ok := h.data[child.Data] - if !ok { - m = map[string][]*html.Node{} - h.data[child.Data] = m - } + m, ok := idx.data[child.Data] + if !ok { + m = map[string][]*html.Node{} + idx.data[child.Data] = m + } - for _, reference := range references { - m[reference] = append(m[reference], child) - } + for _, reference := range references { + m[reference] = append(m[reference], child) + } - if node.FirstChild != nil { - h.Index(baseURL, child) - } + if node.FirstChild != nil && !info.noChildParsing { + idx.Index(baseURL, child) } } // URLs returns all URLs of the references found for a specific tag. -func (h *Index) URLs(tag string) ([]*url.URL, error) { - m, ok := h.data[tag] +func (idx *Index) URLs(tag string) ([]*url.URL, error) { + m, ok := idx.data[tag] if !ok { return nil, nil } @@ -78,8 +87,8 @@ func (h *Index) URLs(tag string) ([]*url.URL, error) { } // Nodes returns a map of all URLs and their HTML nodes. -func (h *Index) Nodes(tag string) map[string][]*html.Node { - m, ok := h.data[tag] +func (idx *Index) Nodes(tag string) map[string][]*html.Node { + m, ok := idx.data[tag] if ok { return m } @@ -87,11 +96,23 @@ func (h *Index) Nodes(tag string) map[string][]*html.Node { } // nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values. 
-func nodeAttributeURLs(baseURL *url.URL, node *html.Node, +func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node, parser nodeAttributeParser, attributeName ...string) []string { var results []string + processReferences := func(references []string) { + for _, reference := range references { + ur, err := url.Parse(reference) + if err != nil { + continue + } + + ur = baseURL.ResolveReference(ur) + results = append(results, ur.String()) + } + } + for _, attr := range node.Attr { var process bool for _, name := range attributeName { @@ -108,34 +129,44 @@ func nodeAttributeURLs(baseURL *url.URL, node *html.Node, var parserHandled bool if parser != nil { - references, parserHandled = parser(attr.Key, strings.TrimSpace(attr.Val)) + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, + attribute: attr.Key, + value: strings.TrimSpace(attr.Val), + } + references, parserHandled = parser(data) } if parser == nil || !parserHandled { references = append(references, strings.TrimSpace(attr.Val)) } - for _, reference := range references { - ur, err := url.Parse(reference) - if err != nil { - continue - } + processReferences(references) + } - ur = baseURL.ResolveReference(ur) - results = append(results, ur.String()) + // special case to support style tag + if len(attributeName) == 0 && parser != nil { + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, } + references, _ := parser(data) + processReferences(references) } return results } // srcSetValueSplitter returns the URL values of the srcset attribute of img nodes. 
-func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { - if _, isSrcSet := SrcSetAttributes[attribute]; !isSrcSet { +func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) { + if _, isSrcSet := SrcSetAttributes[data.attribute]; !isSrcSet { return nil, false } // split the set of responsive images - values := strings.Split(attributeValue, ",") + values := strings.Split(data.value, ",") for i, value := range values { value = strings.TrimSpace(value) @@ -145,3 +176,20 @@ func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { return values, true } + +// styleParser returns the URL values of a CSS style tag. +func styleParser(data nodeAttributeParserData) ([]string, bool) { + if data.node.FirstChild == nil { + return nil, false + } + + var urls []string + processor := func(_ *css.Token, _ string, url *url.URL) { + urls = append(urls, url.String()) + } + + cssData := data.node.FirstChild.Data + css.Process(data.logger, data.url, cssData, processor) + + return urls, true +} diff --git a/htmlindex/htmlindex_test.go b/htmlindex/htmlindex_test.go index 9324509..33fef19 100644 --- a/htmlindex/htmlindex_test.go +++ b/htmlindex/htmlindex_test.go @@ -5,6 +5,7 @@ import ( "net/url" "testing" + "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/net/html" @@ -91,7 +92,8 @@ func testSetup(t *testing.T, input []byte) *Index { ur, err := url.Parse("https://domain.com/") require.NoError(t, err) - idx := New() + logger := log.NewTestLogger(t) + idx := New(logger) idx.Index(ur, doc) return idx diff --git a/scraper/css.go b/scraper/css.go deleted file mode 100644 index e4be627..0000000 --- a/scraper/css.go +++ /dev/null @@ -1,70 +0,0 @@ -package scraper - -import ( - "fmt" - "net/url" - "path" - "regexp" - "strings" - - "github.com/cornelk/gotokit/log" - "github.com/gorilla/css/scanner" -) - -var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`) - 
-func (s *Scraper) checkCSSForUrls(url *url.URL, data []byte) []byte { - urls := make(map[string]string) - str := string(data) - css := scanner.New(str) - - for { - token := css.Next() - if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { - break - } - if token.Type != scanner.TokenURI { - continue - } - - match := cssURLRe.FindStringSubmatch(token.Value) - if match == nil { - continue - } - - src := match[1] - if strings.HasPrefix(strings.ToLower(src), "data:") { - continue // skip embedded data - } - - u, err := url.Parse(src) - if err != nil { - s.logger.Error("Parsing URL failed", - log.String("url", src), - log.Err(err)) - continue - } - u = url.ResolveReference(u) - - s.imagesQueue = append(s.imagesQueue, u) - - cssPath := *url - cssPath.Path = path.Dir(cssPath.Path) + "/" - resolved := resolveURL(&cssPath, src, s.URL.Host, false, "") - urls[token.Value] = resolved - } - - if len(urls) == 0 { - return data - } - - for ori, filePath := range urls { - fixed := fmt.Sprintf("url(%s)", filePath) - str = strings.ReplaceAll(str, ori, fixed) - s.logger.Debug("CSS Element relinked", - log.String("url", ori), - log.String("fixed_url", fixed)) - } - - return []byte(str) -} diff --git a/scraper/download.go b/scraper/download.go index 7e9ea5c..0730970 100644 --- a/scraper/download.go +++ b/scraper/download.go @@ -5,7 +5,9 @@ import ( "errors" "fmt" "net/url" + "path" + "github.com/cornelk/goscrape/css" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/log" ) @@ -18,6 +20,7 @@ var tagsWithReferences = []string{ htmlindex.LinkTag, htmlindex.ScriptTag, htmlindex.BodyTag, + htmlindex.StyleTag, } func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error { @@ -43,7 +46,7 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index var processor assetProcessor if tag == htmlindex.LinkTag { - processor = s.checkCSSForUrls + processor = s.cssProcessor } for _, ur := range references { if err 
:= s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) { @@ -97,3 +100,32 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset return nil } + +func (s *Scraper) cssProcessor(baseURL *url.URL, data []byte) []byte { + urls := make(map[string]string) + + processor := func(token *css.Token, data string, u *url.URL) { + s.imagesQueue = append(s.imagesQueue, u) + + cssPath := *u + cssPath.Path = path.Dir(cssPath.Path) + "/" + resolved := resolveURL(&cssPath, data, s.URL.Host, false, "") + urls[token.Value] = resolved + } + + cssData := string(data) + css.Process(s.logger, baseURL, cssData, processor) + + if len(urls) == 0 { + return data + } + + for ori, filePath := range urls { + cssData = replaceCSSUrls(ori, filePath, cssData) + s.logger.Debug("CSS Element relinked", + log.String("url", ori), + log.String("fixed_url", filePath)) + } + + return []byte(cssData) +} diff --git a/scraper/css_test.go b/scraper/download_test.go similarity index 93% rename from scraper/css_test.go rename to scraper/download_test.go index b7103a8..a5c32fa 100644 --- a/scraper/css_test.go +++ b/scraper/download_test.go @@ -9,7 +9,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestCheckCSSForURLs(t *testing.T) { +func TestCSSProcessor(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "http://localhost", @@ -31,7 +31,7 @@ func TestCheckCSSForURLs(t *testing.T) { u, _ := url.Parse("http://localhost") for input, expected := range fixtures { s.imagesQueue = nil - s.checkCSSForUrls(u, []byte(input)) + s.cssProcessor(u, []byte(input)) if expected == "" { assert.Empty(t, s.imagesQueue) diff --git a/scraper/html.go b/scraper/html.go index 0fb65ba..280c062 100644 --- a/scraper/html.go +++ b/scraper/html.go @@ -6,6 +6,7 @@ import ( "net/url" "strings" + "github.com/cornelk/goscrape/css" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/log" "golang.org/x/net/html" @@ -48,8 +49,15 @@ func (s 
*Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index urls := index.Nodes(tag) for _, nodes := range urls { for _, node := range nodes { - if s.fixNodeURL(baseURL, nodeInfo.Attributes, node, isHyperlink, relativeToRoot) { - changed = true + switch node.Data { + case htmlindex.StyleTag: + if s.fixScriptNodeURL(baseURL, node, isHyperlink, relativeToRoot) { + changed = true + } + default: + if s.fixNodeURL(baseURL, nodeInfo.Attributes, node, isHyperlink, relativeToRoot) { + changed = true + } } } } @@ -112,6 +120,42 @@ func (s *Scraper) fixNodeURL(baseURL *url.URL, attributes []string, node *html.N return changed } +// fixScriptNodeURL fixes the URL references of an HTML style node to point to a relative file name. +// It returns whether any attribute value has been adjusted. +func (s *Scraper) fixScriptNodeURL(baseURL *url.URL, node *html.Node, + isHyperlink bool, relativeToRoot string) bool { + + if node.FirstChild == nil { + return false + } + + urls := map[string]string{} + + processor := func(_ *css.Token, before string, _ *url.URL) { + adjusted := resolveURL(baseURL, before, s.URL.Host, isHyperlink, relativeToRoot) + if before != adjusted { + urls[before] = adjusted + } + } + + cssData := node.FirstChild.Data + css.Process(s.logger, baseURL, cssData, processor) + + var changed bool + + for before, filePath := range urls { + cssData = replaceCSSUrls(before, filePath, cssData) + s.logger.Debug("CSS Element relinked", + log.String("url", before), + log.String("fixed_url", filePath)) + changed = true + } + + node.FirstChild.Data = cssData + + return changed +} + func resolveSrcSetURLs(base *url.URL, srcSetValue, mainPageHost string, isHyperlink bool, relativeToRoot string) string { // split the set of responsive images values := strings.Split(srcSetValue, ",") @@ -125,3 +169,17 @@ func resolveSrcSetURLs(base *url.URL, srcSetValue, mainPageHost string, isHyperl return strings.Join(values, ", ") } + +func replaceCSSUrls(before, after, content 
string) string { + prefixes := []string{ + "\"", "'", "", + } + + for _, prefix := range prefixes { + wrong := fmt.Sprintf("url(%s%s%s)", prefix, before, prefix) + fixed := "url('" + after + "')" + content = strings.ReplaceAll(content, wrong, fixed) + } + + return content +} diff --git a/scraper/html_test.go b/scraper/html_test.go index bfdd9ec..87e90c2 100644 --- a/scraper/html_test.go +++ b/scraper/html_test.go @@ -33,7 +33,7 @@ func TestFixURLReferences(t *testing.T) { doc, err := html.Parse(buf) require.NoError(t, err) - index := htmlindex.New() + index := htmlindex.New(logger) index.Index(s.URL, doc) ref, fixed, err := s.fixURLReferences(s.URL, doc, index) diff --git a/scraper/scraper.go b/scraper/scraper.go index c63f2f6..1d7ad52 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -219,7 +219,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) return fmt.Errorf("parsing HTML: %w", err) } - index := htmlindex.New() + index := htmlindex.New(s.logger) index.Index(u, doc) s.storeDownload(u, data, doc, index, fileExtension) diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go index 98057c2..4f2a546 100644 --- a/scraper/scraper_test.go +++ b/scraper/scraper_test.go @@ -129,3 +129,66 @@ func TestScraperAttributes(t *testing.T) { } assert.Equal(t, expectedProcessed, scraper.processed) } + +func TestScraperInternalCss(t *testing.T) { + indexPage := []byte(` + + + + + + + +`) + empty := []byte(``) + + domain := "example.org" + file1Reference := "background.jpg" + file2Reference := "img/bg.jpg" + file3Reference := "bg3.jpg" + fullURL := "https://" + domain + + urls := map[string][]byte{ + fullURL + "/": indexPage, + fullURL + "/" + file1Reference: empty, + fullURL + "/" + file2Reference: empty, + fullURL + "/" + file3Reference: empty, + } + + scraper := newTestScraper(t, fullURL+"/", urls) + require.NotNil(t, scraper) + + files := map[string][]byte{} + scraper.fileWriter = func(filePath string, data []byte) error { + 
files[filePath] = data + return nil + } + + ctx := context.Background() + err := scraper.Start(ctx) + require.NoError(t, err) + + expectedProcessed := map[string]struct{}{ + "/": {}, + "/" + file1Reference: {}, + "/" + file2Reference: {}, + "/" + file3Reference: {}, + } + require.Equal(t, expectedProcessed, scraper.processed) + + ref := domain + "/index.html" + content := string(files[ref]) + assert.Contains(t, content, "url('"+file1Reference+"')") + assert.Contains(t, content, "url('"+file2Reference+"')") + assert.Contains(t, content, "url("+file3Reference+")") +}