Skip to content

Commit

Permalink
fix(scrape): Fetch images from css
Browse files Browse the repository at this point in the history
  • Loading branch information
marcotuna committed Jul 30, 2024
1 parent 0cb0cef commit bbb8841
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 9 deletions.
18 changes: 9 additions & 9 deletions cmd/wp-go-static/commands/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"log"
"net/http"
"net/url"
"regexp"
"strings"
"wp-go-static/pkg/file"

Expand All @@ -17,6 +16,7 @@ import (

"wp-go-static/internal/cache"
"wp-go-static/internal/config"
"wp-go-static/internal/html"
)

type Scrape struct {
Expand Down Expand Up @@ -228,15 +228,15 @@ func (s *Scrape) visitURL(link string) {
}

func (s *Scrape) parseBody(body []byte) []byte {
cssUrls := regexp.MustCompile(`url\((https?://[^\s]+)\)`).FindAllStringSubmatch(string(body), -1)
var urlsToVisit []string
htmlParser := html.NewHTML(string(body))

// Download each referenced file if it hasn't been visited before
for _, cssUrl := range cssUrls {
link := strings.Trim(cssUrl[1], "'\"")
if link == "" {
continue
}
s.visitURL(link)
urlsToVisit = append(urlsToVisit, htmlParser.ExtractImageURLs(htmlParser.ExtractCSS())...)
urlsToVisit = append(urlsToVisit, htmlParser.ExtractImageURLs(htmlParser.ExtractURLs())...)

// Download each one if it hasn't been visited before
for _, url := range urlsToVisit {
s.visitURL(url)
}

if s.config.Scrape.Replace {
Expand Down
76 changes: 76 additions & 0 deletions internal/html/css.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package html

import (
"regexp"
"strings"

"golang.org/x/net/html"
)

// ExtractCSS returns the text content of every <style> element in the parsed
// document, in document order.
//
// Stylesheets referenced through <link rel="stylesheet"> are recognised but
// not fetched: only CSS embedded directly in the page is returned.
//
// NewHTML returns nil when parsing fails, so a nil receiver (or a receiver
// without a parse tree) yields nil instead of panicking.
func (h *HTML) ExtractCSS() []string {
	if h == nil || h.htmlNode == nil {
		return nil
	}

	var cssContents []string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		switch {
		case n.Type != html.ElementNode:
			// Only elements can be <style> or <link>; still descend below.
		case n.Data == "style":
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				if c.Type == html.TextNode {
					cssContents = append(cssContents, c.Data)
				}
			}
		case n.Data == "link" && getAttributeValue(n, "rel") == "stylesheet":
			// External stylesheet reference: intentionally not handled yet,
			// only inline styles are collected.
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(h.htmlNode)

	return cssContents
}

// getAttributeValue looks up attrName among the attributes of n and returns
// the value of the first attribute whose key is exactly attrName. It returns
// the empty string when no attribute matches.
func getAttributeValue(n *html.Node, attrName string) string {
	for i := range n.Attr {
		if candidate := n.Attr[i]; candidate.Key == attrName {
			return candidate.Val
		}
	}
	return ""
}

// cssURLPattern matches a CSS url(...) token and captures whatever sits
// between the parentheses, minus one optional quote directly next to each
// parenthesis. It is compiled once instead of on every call.
var cssURLPattern = regexp.MustCompile(`url\(['"]?(.*?)['"]?\)`)

// ExtractImageURLs returns the target of every url(...) token found in
// cssContents, in order of appearance.
//
// Despite the name, any url(...) reference is returned (fonts, imports, data
// URIs, ...), not only images. Surrounding whitespace and quotes are removed
// from each target, and empty targets such as url() are skipped so callers
// never receive a blank URL.
func (h *HTML) ExtractImageURLs(cssContents []string) []string {
	var imageUrls []string
	for _, css := range cssContents {
		for _, match := range cssURLPattern.FindAllStringSubmatch(css, -1) {
			// The pattern only strips quotes that touch the parentheses, so
			// `url( "a.png" )` still arrives padded and quoted here.
			target := strings.Trim(strings.TrimSpace(match[1]), "'\"")
			if target == "" {
				continue
			}
			imageUrls = append(imageUrls, target)
		}
	}
	return imageUrls
}

// absoluteCSSURLPattern matches a CSS url(...) token whose target is an
// absolute http(s) URL. The capture may include the surrounding quotes; it
// stops at the first whitespace or ")" so that several declarations on one
// line (minified CSS) are matched individually rather than as one blob.
var absoluteCSSURLPattern = regexp.MustCompile(`url\(\s*(['"]?https?://[^\s)]+)\s*\)`)

// ExtractURLs scans the raw document body for url(...) tokens that point at
// absolute http:// or https:// addresses and returns those addresses, with
// any surrounding quotes removed, in order of appearance.
//
// Relative targets are ignored. NewHTML returns nil when parsing fails, so a
// nil receiver yields nil instead of panicking.
func (h *HTML) ExtractURLs() []string {
	if h == nil {
		return nil
	}

	var urlList []string
	for _, match := range absoluteCSSURLPattern.FindAllStringSubmatch(h.body, -1) {
		link := strings.Trim(match[1], "'\"")
		if link == "" {
			continue
		}
		urlList = append(urlList, link)
	}

	return urlList
}
26 changes: 26 additions & 0 deletions internal/html/html.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package html

import (
"fmt"
"strings"

"golang.org/x/net/html"
)

// HTML bundles a document's raw text with its parsed node tree so that
// extraction helpers can work on whichever representation suits them.
type HTML struct {
// body is the unparsed document text exactly as passed to NewHTML.
body string
// htmlNode is the root node returned by html.Parse for body.
htmlNode *html.Node
}

// NewHTML parses body as an HTML document and returns a wrapper holding both
// the raw text and the parsed tree. When parsing fails the error is printed
// to standard output and nil is returned, so callers must check the result
// before using it.
func NewHTML(body string) *HTML {
	root, parseErr := html.Parse(strings.NewReader(body))
	if parseErr == nil {
		return &HTML{htmlNode: root, body: body}
	}

	fmt.Println("Error parsing HTML:", parseErr)
	return nil
}

0 comments on commit bbb8841

Please sign in to comment.