Skip to content

Commit

Permalink
support inline css (#51)
Browse files Browse the repository at this point in the history
  • Loading branch information
cornelk authored Jan 1, 2025
1 parent 49706aa commit af0e0aa
Show file tree
Hide file tree
Showing 11 changed files with 322 additions and 117 deletions.
51 changes: 51 additions & 0 deletions css/css.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package css

import (
"net/url"
"regexp"
"strings"

"github.com/cornelk/gotokit/log"
"github.com/gorilla/css/scanner"
)

// cssURLRe extracts the target of a CSS url(...) token into its first capture
// group. The CSS grammar allows whitespace between the parentheses and the
// optionally quoted URL, so that whitespace is skipped instead of ending up
// (together with the quotes) inside the captured URL.
var cssURLRe = regexp.MustCompile(`^url\(\s*['"]?(.*?)['"]?\s*\)$`)

// Token is an alias for the CSS scanner token type, so that callers of this
// package can refer to tokens without importing the scanner package.
type Token = scanner.Token

// urlProcessor is the callback invoked by Process for every URL found in the
// CSS data. It receives the scanner token, the URL text extracted from the
// token and the parsed URL.
type urlProcessor func(token *Token, data string, url *url.URL)

// Process tokenizes the CSS data and invokes the processor for every URL
// reference found in it. References starting with "data:" are skipped, and
// references that fail to parse are logged and skipped. Scanning ends at the
// end of the input or at the first scanner error token.
func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {
	s := scanner.New(data)

	for token := s.Next(); ; token = s.Next() {
		switch token.Type {
		case scanner.TokenEOF, scanner.TokenError:
			return
		case scanner.TokenURI:
			// URI tokens are handled below.
		default:
			continue
		}

		groups := cssURLRe.FindStringSubmatch(token.Value)
		if groups == nil {
			continue
		}

		ref := groups[1]
		if strings.HasPrefix(strings.ToLower(ref), "data:") {
			continue // skip embedded data
		}

		// url is the base URL parameter here (it shadows the net/url package),
		// so this parses ref in the context of the base URL.
		resolved, err := url.Parse(ref)
		if err != nil {
			logger.Error("Parsing URL failed",
				log.String("url", ref),
				log.Err(err))
			continue
		}

		processor(token, ref, resolved)
	}
}
25 changes: 23 additions & 2 deletions htmlindex/attributes.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
package htmlindex

import (
"net/url"

"github.com/cornelk/gotokit/log"
"golang.org/x/net/html"
)

// nodeAttributeParserData bundles the input passed to a nodeAttributeParser.
type nodeAttributeParserData struct {
// logger is the logger of the index that runs the parser.
logger *log.Logger
// url is the base URL of the document being indexed.
url *url.URL
// node is the HTML node that is being processed.
node *html.Node
// attribute is the key of the attribute being processed; it is left empty
// when the parser is invoked for a node without configured attributes.
attribute string
// value is the attribute value with surrounding whitespace trimmed; it is
// left empty when the parser is invoked for a node without configured attributes.
value string
}

// nodeAttributeParser returns the URL values of the attribute of the node and
// whether the attribute has been processed.
type nodeAttributeParser func(attribute, value string) ([]string, bool)
type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)

type Node struct {
Attributes []string

parser nodeAttributeParser
noChildParsing bool
parser nodeAttributeParser
}

const (
Expand All @@ -27,6 +43,7 @@ const (
ImgTag = "img"
LinkTag = "link"
ScriptTag = "script"
StyleTag = "style"
)

// Nodes describes the HTML tags and their attributes that can contain URLs.
Expand All @@ -47,6 +64,10 @@ var Nodes = map[string]Node{
ScriptTag: {
Attributes: []string{SrcAttribute},
},
StyleTag: {
noChildParsing: true,
parser: styleParser,
},
}

// SrcSetAttributes contains the attributes that contain srcset values.
Expand Down
122 changes: 85 additions & 37 deletions htmlindex/htmlindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,64 @@ import (
"sort"
"strings"

"github.com/cornelk/goscrape/css"
"github.com/cornelk/gotokit/log"
"golang.org/x/net/html"
)

// Index provides an index for all HTML tags of relevance for scraping.
type Index struct {
logger *log.Logger

// key is HTML tag, value is a map of all its urls and the HTML nodes for it
data map[string]map[string][]*html.Node
}

// New returns a new index.
func New() *Index {
func New(logger *log.Logger) *Index {
return &Index{
data: make(map[string]map[string][]*html.Node),
logger: logger,
data: make(map[string]map[string][]*html.Node),
}
}

// Index the given HTML document.
func (h *Index) Index(baseURL *url.URL, node *html.Node) {
func (idx *Index) Index(baseURL *url.URL, node *html.Node) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type != html.ElementNode {
continue
switch child.Type {
case html.ElementNode:
idx.indexElementNode(baseURL, node, child)
default:
}
}
}

var references []string
func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) {
var references []string

info, ok := Nodes[child.Data]
if ok {
references = nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...)
}
info, ok := Nodes[child.Data]
if ok {
references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...)
}

m, ok := h.data[child.Data]
if !ok {
m = map[string][]*html.Node{}
h.data[child.Data] = m
}
m, ok := idx.data[child.Data]
if !ok {
m = map[string][]*html.Node{}
idx.data[child.Data] = m
}

for _, reference := range references {
m[reference] = append(m[reference], child)
}
for _, reference := range references {
m[reference] = append(m[reference], child)
}

if node.FirstChild != nil {
h.Index(baseURL, child)
}
if node.FirstChild != nil && !info.noChildParsing {
idx.Index(baseURL, child)
}
}

// URLs returns all URLs of the references found for a specific tag.
func (h *Index) URLs(tag string) ([]*url.URL, error) {
m, ok := h.data[tag]
func (idx *Index) URLs(tag string) ([]*url.URL, error) {
m, ok := idx.data[tag]
if !ok {
return nil, nil
}
Expand All @@ -78,20 +87,32 @@ func (h *Index) URLs(tag string) ([]*url.URL, error) {
}

// Nodes returns a map of all URLs and their HTML nodes.
func (h *Index) Nodes(tag string) map[string][]*html.Node {
m, ok := h.data[tag]
func (idx *Index) Nodes(tag string) map[string][]*html.Node {
m, ok := idx.data[tag]
if ok {
return m
}
return map[string][]*html.Node{}
}

// nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values.
func nodeAttributeURLs(baseURL *url.URL, node *html.Node,
func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node,
parser nodeAttributeParser, attributeName ...string) []string {

var results []string

processReferences := func(references []string) {
for _, reference := range references {
ur, err := url.Parse(reference)
if err != nil {
continue
}

ur = baseURL.ResolveReference(ur)
results = append(results, ur.String())
}
}

for _, attr := range node.Attr {
var process bool
for _, name := range attributeName {
Expand All @@ -108,34 +129,44 @@ func nodeAttributeURLs(baseURL *url.URL, node *html.Node,
var parserHandled bool

if parser != nil {
references, parserHandled = parser(attr.Key, strings.TrimSpace(attr.Val))
data := nodeAttributeParserData{
logger: idx.logger,
url: baseURL,
node: node,
attribute: attr.Key,
value: strings.TrimSpace(attr.Val),
}
references, parserHandled = parser(data)
}
if parser == nil || !parserHandled {
references = append(references, strings.TrimSpace(attr.Val))
}

for _, reference := range references {
ur, err := url.Parse(reference)
if err != nil {
continue
}
processReferences(references)
}

ur = baseURL.ResolveReference(ur)
results = append(results, ur.String())
// special case to support style tag
if len(attributeName) == 0 && parser != nil {
data := nodeAttributeParserData{
logger: idx.logger,
url: baseURL,
node: node,
}
references, _ := parser(data)
processReferences(references)
}

return results
}

// srcSetValueSplitter returns the URL values of the srcset attribute of img nodes.
func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) {
if _, isSrcSet := SrcSetAttributes[attribute]; !isSrcSet {
func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {
if _, isSrcSet := SrcSetAttributes[data.attribute]; !isSrcSet {
return nil, false
}

// split the set of responsive images
values := strings.Split(attributeValue, ",")
values := strings.Split(data.value, ",")

for i, value := range values {
value = strings.TrimSpace(value)
Expand All @@ -145,3 +176,20 @@ func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) {

return values, true
}

// styleParser collects the URLs referenced by the CSS content of a style tag.
// The CSS is read from the first child of the node; when the node has no
// child, nothing is returned and the node is reported as not handled.
func styleParser(data nodeAttributeParserData) ([]string, bool) {
	content := data.node.FirstChild
	if content == nil {
		return nil, false
	}

	var found []string
	collect := func(_ *css.Token, _ string, parsed *url.URL) {
		found = append(found, parsed.String())
	}
	css.Process(data.logger, data.url, content.Data, collect)

	return found, true
}
4 changes: 3 additions & 1 deletion htmlindex/htmlindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"net/url"
"testing"

"github.com/cornelk/gotokit/log"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/html"
Expand Down Expand Up @@ -91,7 +92,8 @@ func testSetup(t *testing.T, input []byte) *Index {
ur, err := url.Parse("https://domain.com/")
require.NoError(t, err)

idx := New()
logger := log.NewTestLogger(t)
idx := New(logger)
idx.Index(ur, doc)

return idx
Expand Down
70 changes: 0 additions & 70 deletions scraper/css.go

This file was deleted.

Loading

0 comments on commit af0e0aa

Please sign in to comment.