From bbb8841579edc7d9ca9971c65b5c0800edba7d47 Mon Sep 17 00:00:00 2001
From: Marco Santos
Date: Tue, 30 Jul 2024 18:17:36 +0000
Subject: [PATCH] fix(scrape): Fetch images from css

---
 cmd/wp-go-static/commands/scrape.go | 18 +++----
 internal/html/css.go                | 76 +++++++++++++++++++++++++++++
 internal/html/html.go               | 26 ++++++++++
 3 files changed, 111 insertions(+), 9 deletions(-)
 create mode 100644 internal/html/css.go
 create mode 100644 internal/html/html.go

diff --git a/cmd/wp-go-static/commands/scrape.go b/cmd/wp-go-static/commands/scrape.go
index 68e100e..55c7ec7 100644
--- a/cmd/wp-go-static/commands/scrape.go
+++ b/cmd/wp-go-static/commands/scrape.go
@@ -6,7 +6,6 @@ import (
 	"log"
 	"net/http"
 	"net/url"
-	"regexp"
 	"strings"
 
 	"wp-go-static/pkg/file"
@@ -17,6 +16,7 @@ import (
 
 	"wp-go-static/internal/cache"
 	"wp-go-static/internal/config"
+	"wp-go-static/internal/html"
 )
 
 type Scrape struct {
@@ -228,15 +228,15 @@ func (s *Scrape) visitURL(link string) {
 }
 
 func (s *Scrape) parseBody(body []byte) []byte {
-	cssUrls := regexp.MustCompile(`url\((https?://[^\s]+)\)`).FindAllStringSubmatch(string(body), -1)
+	var urlsToVisit []string
+	htmlParser := html.NewHTML(string(body))
 
-	// Download each referenced file if it hasn't been visited before
-	for _, cssUrl := range cssUrls {
-		link := strings.Trim(cssUrl[1], "'\"")
-		if link == "" {
-			continue
-		}
-		s.visitURL(link)
+	urlsToVisit = append(urlsToVisit, htmlParser.ExtractImageURLs(htmlParser.ExtractCSS())...)
+	urlsToVisit = append(urlsToVisit, htmlParser.ExtractURLs()...)
+
+	// Download each collected URL if it hasn't been visited before
+	for _, url := range urlsToVisit {
+		s.visitURL(url)
 	}
 
 	if s.config.Scrape.Replace {
diff --git a/internal/html/css.go b/internal/html/css.go
new file mode 100644
index 0000000..28ca610
--- /dev/null
+++ b/internal/html/css.go
@@ -0,0 +1,76 @@
+package html
+
+import (
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// ExtractCSS extracts all inline CSS styles from the HTML content
+func (h *HTML) ExtractCSS() []string {
+	var cssContents []string
+	var f func(*html.Node)
+	f = func(n *html.Node) {
+		if n.Type == html.ElementNode && (n.Data == "style" || (n.Data == "link" && getAttributeValue(n, "rel") == "stylesheet")) {
+			if n.Data == "style" {
+				for c := n.FirstChild; c != nil; c = c.NextSibling {
+					if c.Type == html.TextNode {
+						cssContents = append(cssContents, c.Data)
+					}
+				}
+			} else if n.Data == "link" {
+				// If link tag with stylesheet reference, handle if needed
+				// For now, we're only considering inline styles
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			f(c)
+		}
+	}
+	f(h.htmlNode)
+
+	return cssContents
+}
+
+// getAttributeValue retrieves the value of the specified attribute from the node
+func getAttributeValue(n *html.Node, attrName string) string {
+	for _, attr := range n.Attr {
+		if attr.Key == attrName {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// ExtractImageURLs extracts image URLs from CSS content
+func (h *HTML) ExtractImageURLs(cssContents []string) []string {
+	var imageUrls []string
+	regex := regexp.MustCompile(`url\(['"]?(.*?)['"]?\)`)
+	for _, css := range cssContents {
+		matches := regex.FindAllStringSubmatch(css, -1)
+		for _, match := range matches {
+			if len(match) > 1 {
+				imageUrls = append(imageUrls, match[1])
+			}
+		}
+	}
+	return imageUrls
+}
+
+// ExtractURLs extracts absolute url() references from the raw HTML body
+func (h *HTML) ExtractURLs() []string {
+	var urlList []string
+	urls := regexp.MustCompile(`url\((https?://[^\s]+)\)`).FindAllStringSubmatch(h.body, -1)
+
+	// Collect each referenced URL, trimming any surrounding quotes
+	for _, url := range urls {
+		link := strings.Trim(url[1], "'\"")
+		if link == "" {
+			continue
+		}
+		urlList = append(urlList, link)
+	}
+
+	return urlList
+}
diff --git a/internal/html/html.go b/internal/html/html.go
new file mode 100644
index 0000000..988ddd1
--- /dev/null
+++ b/internal/html/html.go
@@ -0,0 +1,26 @@
+package html
+
+import (
+	"fmt"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+type HTML struct {
+	body     string
+	htmlNode *html.Node
+}
+
+func NewHTML(body string) *HTML {
+	doc, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		fmt.Println("Error parsing HTML:", err)
+		return nil
+	}
+
+	return &HTML{
+		body:     body,
+		htmlNode: doc,
+	}
+}
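
Below is a minimal usage sketch, not part of the patch itself, showing how the new internal/html helpers are combined the same way Scrape.parseBody uses them. The sample markup and the standalone main package are invented for illustration; since the package lives under internal/, the example is assumed to sit inside the wp-go-static module.

package main

import (
	"fmt"

	"wp-go-static/internal/html"
)

func main() {
	// Hypothetical page: one inline <style> block plus an unquoted url()
	// reference inside a style attribute.
	body := `<html><head><style>
	.hero { background: url("https://example.com/img/hero.jpg"); }
	</style></head>
	<body><div style="background: url(https://example.com/img/banner.png)"></div></body></html>`

	parser := html.NewHTML(body)
	if parser == nil {
		return // NewHTML logs the parse error itself and returns nil
	}

	// url() targets found inside <style> blocks.
	found := parser.ExtractImageURLs(parser.ExtractCSS())

	// Absolute http(s) url() targets found anywhere in the raw body.
	found = append(found, parser.ExtractURLs()...)

	for _, u := range found {
		fmt.Println(u) // parseBody hands each of these to s.visitURL
	}
}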