fix(scrape): Fetch images from css

LOQ9 · Jul 30, 2024 · bbb8841 · bbb8841
1 parent 0cb0cef
commit bbb8841
Show file tree

Hide file tree

Showing 3 changed files with 111 additions and 9 deletions.
diff --git a/cmd/wp-go-static/commands/scrape.go b/cmd/wp-go-static/commands/scrape.go
@@ -6,7 +6,6 @@ import (
 	"log"
 	"net/http"
 	"net/url"
-	"regexp"
 	"strings"
 	"wp-go-static/pkg/file"
 
@@ -17,6 +16,7 @@ import (
 
 	"wp-go-static/internal/cache"
 	"wp-go-static/internal/config"
+	"wp-go-static/internal/html"
 )
 
 type Scrape struct {
@@ -228,15 +228,15 @@ func (s *Scrape) visitURL(link string) {
 }
 
 func (s *Scrape) parseBody(body []byte) []byte {
-	cssUrls := regexp.MustCompile(`url\((https?://[^\s]+)\)`).FindAllStringSubmatch(string(body), -1)
+	var urlsToVisit []string
+	htmlParser := html.NewHTML(string(body))
 
-	// Download each referenced file if it hasn't been visited before
-	for _, cssUrl := range cssUrls {
-		link := strings.Trim(cssUrl[1], "'\"")
-		if link == "" {
-			continue
-		}
-		s.visitURL(link)
+	urlsToVisit = append(urlsToVisit, htmlParser.ExtractImageURLs(htmlParser.ExtractCSS())...)
+	urlsToVisit = append(urlsToVisit, htmlParser.ExtractImageURLs(htmlParser.ExtractURLs())...)
+
+	// Download each one if it hasn't been visited before
+	for _, url := range urlsToVisit {
+		s.visitURL(url)
 	}
 
 	if s.config.Scrape.Replace {

diff --git a/internal/html/css.go b/internal/html/css.go
@@ -0,0 +1,76 @@
+package html
+
+import (
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// ExtractCSS returns the text of every inline <style> element found in the
+// parsed document, in document order.
+//
+// Stylesheets referenced through <link rel="stylesheet"> are recognised but
+// not fetched, so they contribute nothing to the result. A nil receiver or a
+// missing parse tree yields nil instead of panicking; this matters because
+// NewHTML returns nil when parsing fails.
+func (h *HTML) ExtractCSS() []string {
+	if h == nil || h.htmlNode == nil {
+		return nil
+	}
+
+	var cssContents []string
+	var walk func(*html.Node)
+	walk = func(n *html.Node) {
+		if n.Type == html.ElementNode {
+			switch {
+			case n.Data == "style":
+				// Collect the element's text-node children as CSS source.
+				for c := n.FirstChild; c != nil; c = c.NextSibling {
+					if c.Type == html.TextNode {
+						cssContents = append(cssContents, c.Data)
+					}
+				}
+			case n.Data == "link" && getAttributeValue(n, "rel") == "stylesheet":
+				// External stylesheet reference: intentionally skipped for
+				// now, only inline styles are collected.
+			}
+		}
+		// Recurse into every child regardless of the node's own type.
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walk(c)
+		}
+	}
+	walk(h.htmlNode)
+
+	return cssContents
+}
+
+// getAttributeValue returns the value of the first attribute on n whose key
+// equals attrName exactly, or the empty string when n has no such attribute.
+func getAttributeValue(n *html.Node, attrName string) string {
+	for i := range n.Attr {
+		if a := n.Attr[i]; a.Key == attrName {
+			return a.Val
+		}
+	}
+	return ""
+}
+
+// cssURLTokenPattern matches a CSS url(...) token and captures its target,
+// tolerating optional quotes and whitespace just inside the parentheses.
+// It is compiled once at package scope rather than on every call.
+var cssURLTokenPattern = regexp.MustCompile(`url\(\s*['"]?(.*?)['"]?\s*\)`)
+
+// ExtractImageURLs returns the target of every url(...) token found in the
+// given CSS fragments, in the order encountered.
+//
+// Despite the name, no filtering by resource type is performed: any url()
+// target is returned, whatever it points to. Surrounding quotes and
+// whitespace are removed, and empty targets such as url() or url('') are
+// skipped so callers never receive an empty string. The receiver is not used.
+func (h *HTML) ExtractImageURLs(cssContents []string) []string {
+	var imageUrls []string
+	for _, css := range cssContents {
+		for _, match := range cssURLTokenPattern.FindAllStringSubmatch(css, -1) {
+			target := strings.TrimSpace(match[1])
+			if target == "" {
+				continue
+			}
+			imageUrls = append(imageUrls, target)
+		}
+	}
+	return imageUrls
+}
+
+// absoluteURLPattern matches a CSS url(...) token whose target is an absolute
+// http or https URL, optionally quoted. The character class stops at the
+// first ')' so that several url() tokens without whitespace between them
+// (minified CSS) are matched separately instead of as one long capture.
+var absoluteURLPattern = regexp.MustCompile(`url\(\s*(['"]?https?://[^\s)]+)\s*\)`)
+
+// ExtractURLs scans the raw document body for CSS url(...) tokens that point
+// at absolute http(s) URLs and returns those URLs with any surrounding quotes
+// removed, in the order they appear.
+//
+// Relative and data: targets are ignored. A URL containing a literal ')' is
+// cut at that character. A nil receiver yields nil instead of panicking.
+func (h *HTML) ExtractURLs() []string {
+	if h == nil {
+		return nil
+	}
+
+	var urlList []string
+	for _, match := range absoluteURLPattern.FindAllStringSubmatch(h.body, -1) {
+		// The capture may still carry the opening/closing quote; strip both.
+		urlList = append(urlList, strings.Trim(match[1], "'\""))
+	}
+
+	return urlList
+}
diff --git a/internal/html/html.go b/internal/html/html.go
@@ -0,0 +1,26 @@
+package html
+
+import (
+	"fmt"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// HTML pairs a raw document with its parsed node tree so that extraction
+// helpers can work on whichever representation suits them.
+type HTML struct {
+	// body is the markup exactly as it was passed to NewHTML.
+	body     string
+	// htmlNode is the root node returned by html.Parse for body.
+	htmlNode *html.Node
+}
+
+// NewHTML parses body as HTML and returns a wrapper holding both the original
+// text and the root of the resulting node tree.
+//
+// If parsing fails, the error is printed to standard output and nil is
+// returned, so callers must check the result before using it.
+func NewHTML(body string) *HTML {
+	root, parseErr := html.Parse(strings.NewReader(body))
+	if parseErr == nil {
+		return &HTML{body: body, htmlNode: root}
+	}
+
+	fmt.Println("Error parsing HTML:", parseErr)
+	return nil
+}