
use []byte instead of bytes.Buffer for data processing to improve testability
cornelk committed Sep 22, 2024
1 parent 1f7c0c0 commit d8c9a58
Showing 10 changed files with 57 additions and 62 deletions.
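
The testability gain named in the commit message comes from the hook signatures changed below: test fakes can now be written against plain byte slices instead of constructing *bytes.Buffer values. A minimal sketch, assuming only the hook types introduced in scraper/scraper.go in this commit (the package and variable names here are illustrative, not from the repository):

package scraper_sketch // illustrative package, not in the repository

import (
	"context"
	"net/url"
)

// Hook types as changed by this commit (see scraper/scraper.go below).
type (
	httpDownloader func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error)
	fileWriter     func(filePath string, data []byte) error
)

// Test fakes become one-liners over plain byte slices.
var (
	fakeDownload httpDownloader = func(_ context.Context, u *url.URL) ([]byte, *url.URL, error) {
		return []byte("<html><body>fixture</body></html>"), u, nil
	}
	fakeWrite fileWriter = func(_ string, _ []byte) error { return nil }
)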
9 changes: 4 additions & 5 deletions scraper/css.go
@@ -1,7 +1,6 @@
package scraper

import (
"bytes"
"fmt"
"net/url"
"path"
@@ -14,9 +13,9 @@ import (

var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)

func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
func (s *Scraper) checkCSSForUrls(url *url.URL, data []byte) []byte {
urls := make(map[string]string)
str := buf.String()
str := string(data)
css := scanner.New(str)

for {
@@ -56,7 +55,7 @@ func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer
}

if len(urls) == 0 {
return buf
return data
}

for ori, filePath := range urls {
@@ -67,5 +66,5 @@ func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer
log.String("fixed_url", fixed))
}

return bytes.NewBufferString(str)
return []byte(str)
}
4 changes: 1 addition & 3 deletions scraper/css_test.go
@@ -1,7 +1,6 @@
package scraper

import (
"bytes"
"net/url"
"testing"

@@ -32,8 +31,7 @@ func TestCheckCSSForURLs(t *testing.T) {
u, _ := url.Parse("http://localhost")
for input, expected := range fixtures {
s.imagesQueue = nil
buf := bytes.NewBufferString(input)
s.checkCSSForUrls(u, buf)
s.checkCSSForUrls(u, []byte(input))

if expected == "" {
assert.Empty(t, s.imagesQueue)
9 changes: 4 additions & 5 deletions scraper/download.go
@@ -1,7 +1,6 @@
package scraper

import (
"bytes"
"context"
"errors"
"fmt"
@@ -13,7 +12,7 @@ import (

// assetProcessor is a processor of a downloaded asset that can transform
// a downloaded file content before it will be stored on disk.
type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
type assetProcessor func(URL *url.URL, data []byte) []byte

var tagsWithReferences = []string{
htmlindex.LinkTag,
@@ -77,7 +76,7 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
}

s.logger.Info("Downloading asset", log.String("url", urlFull))
buf, _, err := s.httpDownloader(ctx, u)
data, _, err := s.httpDownloader(ctx, u)
if err != nil {
s.logger.Error("Downloading asset failed",
log.String("url", urlFull),
@@ -86,10 +85,10 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
}

if processor != nil {
buf = processor(u, buf)
data = processor(u, data)
}

if err = s.fileWriter(filePath, buf); err != nil {
if err = s.fileWriter(filePath, data); err != nil {
s.logger.Error("Writing asset file failed",
log.String("url", urlFull),
log.String("file", filePath),
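
Because checkCSSForUrls (above) and checkImageForRecode (in scraper/images.go below) now share this exact signature, both can be passed to downloadAsset as assetProcessor values without any buffer adaptation. A compile-time sketch of that compatibility, assuming it lives in the scraper package (the function name is hypothetical):

// Not part of the repository: compile-time check that the processors changed
// in this commit satisfy the new assetProcessor type.
func assertProcessorCompatibility(s *Scraper) {
	var p assetProcessor = s.checkCSSForUrls
	p = s.checkImageForRecode
	_ = p
}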
5 changes: 2 additions & 3 deletions scraper/fs.go
@@ -1,7 +1,6 @@
package scraper

import (
"bytes"
"fmt"
"os"
"path/filepath"
@@ -22,7 +21,7 @@ func (s *Scraper) createDownloadPath(path string) error {
return nil
}

func (s *Scraper) writeFile(filePath string, buf *bytes.Buffer) error {
func (s *Scraper) writeFile(filePath string, data []byte) error {
dir := filepath.Dir(filePath)
if len(dir) < len(s.URL.Host) { // nothing to append if it is the root dir
dir = filepath.Join(".", s.URL.Host, dir)
@@ -38,7 +37,7 @@ func (s *Scraper) writeFile(filePath string, buf *bytes.Buffer) error {
return fmt.Errorf("creating file '%s': %w", filePath, err)
}

if _, err = f.Write(buf.Bytes()); err != nil {
if _, err = f.Write(data); err != nil {
// nolint: wrapcheck
_ = f.Close() // try to close and remove file but return the first error
_ = os.Remove(filePath)
8 changes: 4 additions & 4 deletions scraper/html.go
@@ -23,18 +23,18 @@ var ignoredURLPrefixes = []string{
// It returns a bool that indicates that no reference needed to be fixed,
// in this case the returned HTML string will be empty.
func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
index *htmlindex.Index) (string, bool, error) {
index *htmlindex.Index) ([]byte, bool, error) {

relativeToRoot := urlRelativeToRoot(url)
if !s.fixHTMLNodeURLs(url, relativeToRoot, index) {
return "", false, nil
return nil, false, nil
}

var rendered bytes.Buffer
if err := html.Render(&rendered, doc); err != nil {
return "", false, fmt.Errorf("rendering html: %w", err)
return nil, false, fmt.Errorf("rendering html: %w", err)
}
return rendered.String(), true, nil
return rendered.Bytes(), true, nil
}

// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
2 changes: 1 addition & 1 deletion scraper/html_test.go
@@ -44,5 +44,5 @@ func TestFixURLReferences(t *testing.T) {
"<a href=\"wp-content/uploads/document.pdf\" rel=\"doc\">Guide</a>\n" +
"<img src=\"test.jpg\" srcset=\"test-480w.jpg 480w, test-800w.jpg 800w\"/> \n\n" +
"</body></html>"
assert.Equal(t, expected, ref)
assert.Equal(t, expected, string(ref))
}
4 changes: 2 additions & 2 deletions scraper/http.go
@@ -12,7 +12,7 @@ import (
"github.com/cornelk/gotokit/log"
)

func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*bytes.Buffer, *url.URL, error) {
func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
if err != nil {
return nil, nil, fmt.Errorf("creating HTTP request: %w", err)
@@ -50,7 +50,7 @@ func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*bytes.Buffer, *
if _, err := io.Copy(buf, resp.Body); err != nil {
return nil, nil, fmt.Errorf("reading HTTP request body: %w", err)
}
return buf, resp.Request.URL, nil
return buf.Bytes(), resp.Request.URL, nil
}

func Headers(headers []string) http.Header {
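
With the return type changed to []byte, the bytes.Buffer plus io.Copy in the body above could in principle collapse into a single io.ReadAll call; the commit keeps the buffer, so the following is only a sketch of an equivalent formulation using the existing io and fmt imports (the helper name is hypothetical):

// readBody is a hypothetical helper, not part of this commit.
func readBody(body io.Reader) ([]byte, error) {
	data, err := io.ReadAll(body)
	if err != nil {
		return nil, fmt.Errorf("reading HTTP request body: %w", err)
	}
	return data, nil
}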
50 changes: 25 additions & 25 deletions scraper/images.go
@@ -14,39 +14,39 @@ import (
"github.com/h2non/filetype/types"
)

func (s *Scraper) checkImageForRecode(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
func (s *Scraper) checkImageForRecode(url *url.URL, data []byte) []byte {
if s.config.ImageQuality == 0 {
return buf
return data
}

kind, err := filetype.Match(buf.Bytes())
kind, err := filetype.Match(data)
if err != nil || kind == types.Unknown {
return buf
return data
}

s.logger.Debug("File type detected",
log.String("type", kind.MIME.Type),
log.String("sub_type", kind.MIME.Subtype))

if kind.MIME.Type == matchers.TypeJpeg.MIME.Type && kind.MIME.Subtype == matchers.TypeJpeg.MIME.Subtype {
if recoded := s.recodeJPEG(url, buf.Bytes()); recoded != nil {
if recoded := s.recodeJPEG(url, data); recoded != nil {
return recoded
}
return buf
return data
}

if kind.MIME.Type == matchers.TypePng.MIME.Type && kind.MIME.Subtype == matchers.TypePng.MIME.Subtype {
if recoded := s.recodePNG(url, buf.Bytes()); recoded != nil {
if recoded := s.recodePNG(url, data); recoded != nil {
return recoded
}
return buf
return data
}

return buf
return data
}

// encodeJPEG encodes a new JPG based on the given quality setting.
func (s *Scraper) encodeJPEG(img image.Image) *bytes.Buffer {
func (s *Scraper) encodeJPEG(img image.Image) []byte {
o := &jpeg.Options{
Quality: int(s.config.ImageQuality),
}
@@ -55,45 +55,45 @@ func (s *Scraper) encodeJPEG(img image.Image) *bytes.Buffer {
if err := jpeg.Encode(outBuf, img, o); err != nil {
return nil
}
return outBuf
return outBuf.Bytes()
}

// recodeJPEG recodes the image and returns it if it is smaller than before.
func (s *Scraper) recodeJPEG(url fmt.Stringer, b []byte) *bytes.Buffer {
inBuf := bytes.NewBuffer(b)
func (s *Scraper) recodeJPEG(url fmt.Stringer, data []byte) []byte {
inBuf := bytes.NewBuffer(data)
img, err := jpeg.Decode(inBuf)
if err != nil {
return nil
}

outBuf := s.encodeJPEG(img)
if outBuf == nil || outBuf.Len() > len(b) { // only use the new file if it is smaller
encoded := s.encodeJPEG(img)
if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
return nil
}

s.logger.Debug("Recoded JPEG",
log.String("url", url.String()),
log.Int("size_original", len(b)),
log.Int("size_recoded", outBuf.Len()))
return outBuf
log.Int("size_original", len(data)),
log.Int("size_recoded", len(encoded)))
return encoded
}

// recodePNG recodes the image and returns it if it is smaller than before.
func (s *Scraper) recodePNG(url fmt.Stringer, b []byte) *bytes.Buffer {
inBuf := bytes.NewBuffer(b)
func (s *Scraper) recodePNG(url fmt.Stringer, data []byte) []byte {
inBuf := bytes.NewBuffer(data)
img, err := png.Decode(inBuf)
if err != nil {
return nil
}

outBuf := s.encodeJPEG(img)
if outBuf == nil || outBuf.Len() > len(b) { // only use the new file if it is smaller
encoded := s.encodeJPEG(img)
if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
return nil
}

s.logger.Debug("Recoded PNG",
log.String("url", url.String()),
log.Int("size_original", len(b)),
log.Int("size_recoded", outBuf.Len()))
return outBuf
log.Int("size_original", len(data)),
log.Int("size_recoded", len(encoded)))
return encoded
}
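
A related simplification the []byte parameters would allow, though this commit does not make it: the decode paths could wrap the data in a read-only bytes.Reader instead of a bytes.Buffer, since nothing is written back to it. Sketch only (the helper name is hypothetical; the repository keeps bytes.NewBuffer here):

// decodeJPEGBytes is a hypothetical helper, not part of this commit.
func decodeJPEGBytes(data []byte) (image.Image, error) {
	return jpeg.Decode(bytes.NewReader(data))
}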
21 changes: 11 additions & 10 deletions scraper/scraper.go
@@ -41,10 +41,10 @@ type Config struct {
}

type (
httpDownloader func(ctx context.Context, u *url.URL) (*bytes.Buffer, *url.URL, error)
httpDownloader func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error)
dirCreator func(path string) error
fileExistenceCheck func(filePath string) bool
fileWriter func(filePath string, buf *bytes.Buffer) error
fileWriter func(filePath string, data []byte) error
)

// Scraper contains all scraping data.
@@ -189,7 +189,7 @@ func (s *Scraper) Start(ctx context.Context) error {

func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) error {
s.logger.Info("Downloading webpage", log.String("url", u.String()))
buf, respURL, err := s.httpDownloader(ctx, u)
data, respURL, err := s.httpDownloader(ctx, u)
if err != nil {
s.logger.Error("Processing HTTP Request failed",
log.String("url", u.String()),
@@ -198,7 +198,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
}

fileExtension := ""
kind, err := filetype.Match(buf.Bytes())
kind, err := filetype.Match(data)
if err == nil && kind != types.Unknown {
fileExtension = kind.Extension
}
@@ -210,6 +210,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
s.URL = u
}

buf := bytes.NewBuffer(data)
doc, err := html.Parse(buf)
if err != nil {
s.logger.Error("Parsing HTML failed",
@@ -221,7 +222,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
index := htmlindex.New()
index.Index(u, doc)

s.storeDownload(u, buf, doc, index, fileExtension)
s.storeDownload(u, data, doc, index, fileExtension)

if err := s.downloadReferences(ctx, index); err != nil {
return err
@@ -249,28 +250,28 @@

// storeDownload writes the download to a file, if a known binary file is detected,
// processing of the file as page to look for links is skipped.
func (s *Scraper) storeDownload(u *url.URL, buf *bytes.Buffer, doc *html.Node,
func (s *Scraper) storeDownload(u *url.URL, data []byte, doc *html.Node,
index *htmlindex.Index, fileExtension string) {

isAPage := false
if fileExtension == "" {
content, fixed, err := s.fixURLReferences(u, doc, index)
fixed, hasChanges, err := s.fixURLReferences(u, doc, index)
if err != nil {
s.logger.Error("Fixing file references failed",
log.String("url", u.String()),
log.Err(err))
return
}

if fixed {
buf = bytes.NewBufferString(content)
if hasChanges {
data = fixed
}
isAPage = true
}

filePath := s.getFilePath(u, isAPage)
// always update html files, content might have changed
if err := s.fileWriter(filePath, buf); err != nil {
if err := s.fileWriter(filePath, data); err != nil {
s.logger.Error("Writing to file failed",
log.String("URL", u.String()),
log.String("file", filePath),
7 changes: 3 additions & 4 deletions scraper/scraper_test.go
@@ -1,7 +1,6 @@
package scraper

import (
"bytes"
"context"
"fmt"
"net/url"
@@ -26,17 +25,17 @@ func newTestScraper(t *testing.T, startURL string, urls map[string][]byte) *Scra
scraper.dirCreator = func(_ string) error {
return nil
}
scraper.fileWriter = func(_ string, _ *bytes.Buffer) error {
scraper.fileWriter = func(_ string, _ []byte) error {
return nil
}
scraper.fileExistenceCheck = func(_ string) bool {
return false
}
scraper.httpDownloader = func(_ context.Context, url *url.URL) (*bytes.Buffer, *url.URL, error) {
scraper.httpDownloader = func(_ context.Context, url *url.URL) ([]byte, *url.URL, error) {
ur := url.String()
b, ok := urls[ur]
if ok {
return bytes.NewBuffer(b), url, nil
return b, url, nil
}
return nil, nil, fmt.Errorf("url '%s' not found in test data", ur)
}
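
With the []byte-based stubs above, fixtures and assertions in dependent tests reduce to plain byte slices and string comparisons. A hedged usage sketch of newTestScraper in the same package, relying on the existing context, net/url, and testing imports (the test name and fixture content are illustrative, not from the repository):

func TestDownloaderStubSketch(t *testing.T) {
	urls := map[string][]byte{
		"http://localhost/": []byte("<html><body>index</body></html>"),
	}
	s := newTestScraper(t, "http://localhost/", urls)

	u, err := url.Parse("http://localhost/")
	if err != nil {
		t.Fatal(err)
	}
	data, _, err := s.httpDownloader(context.Background(), u)
	if err != nil {
		t.Fatal(err)
	}
	if string(data) != "<html><body>index</body></html>" {
		t.Fatalf("unexpected fixture data: %s", data)
	}
}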
