From d8c9a5804a2ac7785e72007cd01752bec3b61d40 Mon Sep 17 00:00:00 2001
From: cornelk
Date: Sun, 22 Sep 2024 12:28:24 -0600
Subject: [PATCH] use []byte instead of bytes.Buffer for data processing to
 improve testability

---
 scraper/css.go          |  9 ++++----
 scraper/css_test.go     |  4 +---
 scraper/download.go     |  9 ++++----
 scraper/fs.go           |  5 ++---
 scraper/html.go         |  8 +++----
 scraper/html_test.go    |  2 +-
 scraper/http.go         |  4 ++--
 scraper/images.go       | 50 ++++++++++++++++++++---------------------
 scraper/scraper.go      | 21 ++++++++---------
 scraper/scraper_test.go |  7 +++---
 10 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/scraper/css.go b/scraper/css.go
index 8aeee46..e4be627 100644
--- a/scraper/css.go
+++ b/scraper/css.go
@@ -1,7 +1,6 @@
 package scraper
 
 import (
-	"bytes"
 	"fmt"
 	"net/url"
 	"path"
@@ -14,9 +13,9 @@ import (
 
 var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)
 
-func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
+func (s *Scraper) checkCSSForUrls(url *url.URL, data []byte) []byte {
 	urls := make(map[string]string)
-	str := buf.String()
+	str := string(data)
 	css := scanner.New(str)
 
 	for {
@@ -56,7 +55,7 @@
 	}
 
 	if len(urls) == 0 {
-		return buf
+		return data
 	}
 
 	for ori, filePath := range urls {
@@ -67,5 +66,5 @@
 			log.String("fixed_url", fixed))
 	}
 
-	return bytes.NewBufferString(str)
+	return []byte(str)
 }
diff --git a/scraper/css_test.go b/scraper/css_test.go
index f8cb777..b7103a8 100644
--- a/scraper/css_test.go
+++ b/scraper/css_test.go
@@ -1,7 +1,6 @@
 package scraper
 
 import (
-	"bytes"
 	"net/url"
 	"testing"
 
@@ -32,8 +31,7 @@ func TestCheckCSSForURLs(t *testing.T) {
 	u, _ := url.Parse("http://localhost")
 	for input, expected := range fixtures {
 		s.imagesQueue = nil
-		buf := bytes.NewBufferString(input)
-		s.checkCSSForUrls(u, buf)
+		s.checkCSSForUrls(u, []byte(input))
 
 		if expected == "" {
 			assert.Empty(t, s.imagesQueue)
diff --git a/scraper/download.go b/scraper/download.go
index f8011e7..7e9ea5c 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -1,7 +1,6 @@
 package scraper
 
 import (
-	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -13,7 +12,7 @@ import (
 
 // assetProcessor is a processor of a downloaded asset that can transform
 // a downloaded file content before it will be stored on disk.
-type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
+type assetProcessor func(URL *url.URL, data []byte) []byte
 
 var tagsWithReferences = []string{
 	htmlindex.LinkTag,
@@ -77,7 +76,7 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 	}
 
 	s.logger.Info("Downloading asset", log.String("url", urlFull))
-	buf, _, err := s.httpDownloader(ctx, u)
+	data, _, err := s.httpDownloader(ctx, u)
 	if err != nil {
 		s.logger.Error("Downloading asset failed",
 			log.String("url", urlFull),
@@ -86,10 +85,10 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 	}
 
 	if processor != nil {
-		buf = processor(u, buf)
+		data = processor(u, data)
 	}
 
-	if err = s.fileWriter(filePath, buf); err != nil {
+	if err = s.fileWriter(filePath, data); err != nil {
 		s.logger.Error("Writing asset file failed",
 			log.String("url", urlFull),
 			log.String("file", filePath),
diff --git a/scraper/fs.go b/scraper/fs.go
index d327781..496d1e0 100644
--- a/scraper/fs.go
+++ b/scraper/fs.go
@@ -1,7 +1,6 @@
 package scraper
 
 import (
-	"bytes"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -22,7 +21,7 @@ func (s *Scraper) createDownloadPath(path string) error {
 	return nil
 }
 
-func (s *Scraper) writeFile(filePath string, buf *bytes.Buffer) error {
+func (s *Scraper) writeFile(filePath string, data []byte) error {
 	dir := filepath.Dir(filePath)
 	if len(dir) < len(s.URL.Host) { // nothing to append if it is the root dir
 		dir = filepath.Join(".", s.URL.Host, dir)
@@ -38,7 +37,7 @@ func (s *Scraper) writeFile(filePath string, buf *bytes.Buffer) error {
 		return fmt.Errorf("creating file '%s': %w", filePath, err)
 	}
 
-	if _, err = f.Write(buf.Bytes()); err != nil {
+	if _, err = f.Write(data); err != nil {
 		// nolint: wrapcheck
 		_ = f.Close() // try to close and remove file but return the first error
 		_ = os.Remove(filePath)
diff --git a/scraper/html.go b/scraper/html.go
index e5a0060..0fb65ba 100644
--- a/scraper/html.go
+++ b/scraper/html.go
@@ -23,18 +23,18 @@ var ignoredURLPrefixes = []string{
 // It returns a bool that indicates that no reference needed to be fixed,
 // in this case the returned HTML string will be empty.
 func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
-	index *htmlindex.Index) (string, bool, error) {
+	index *htmlindex.Index) ([]byte, bool, error) {
 	relativeToRoot := urlRelativeToRoot(url)
 	if !s.fixHTMLNodeURLs(url, relativeToRoot, index) {
-		return "", false, nil
+		return nil, false, nil
 	}
 
 	var rendered bytes.Buffer
 	if err := html.Render(&rendered, doc); err != nil {
-		return "", false, fmt.Errorf("rendering html: %w", err)
+		return nil, false, fmt.Errorf("rendering html: %w", err)
 	}
 
-	return rendered.String(), true, nil
+	return rendered.Bytes(), true, nil
 }
 
 // fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
diff --git a/scraper/html_test.go b/scraper/html_test.go
index b85fe91..bfdd9ec 100644
--- a/scraper/html_test.go
+++ b/scraper/html_test.go
@@ -44,5 +44,5 @@ func TestFixURLReferences(t *testing.T) {
 		"Guide\n" +
 		" \n\n" +
 		""
-	assert.Equal(t, expected, ref)
+	assert.Equal(t, expected, string(ref))
 }
diff --git a/scraper/http.go b/scraper/http.go
index d206e68..3793510 100644
--- a/scraper/http.go
+++ b/scraper/http.go
@@ -12,7 +12,7 @@ import (
 	"github.com/cornelk/gotokit/log"
 )
 
-func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*bytes.Buffer, *url.URL, error) {
+func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
 	if err != nil {
 		return nil, nil, fmt.Errorf("creating HTTP request: %w", err)
 	}
@@ -50,7 +50,7 @@ func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*bytes.Buffer, *
 	if _, err := io.Copy(buf, resp.Body); err != nil {
 		return nil, nil, fmt.Errorf("reading HTTP request body: %w", err)
 	}
-	return buf, resp.Request.URL, nil
+	return buf.Bytes(), resp.Request.URL, nil
 }
 
 func Headers(headers []string) http.Header {
diff --git a/scraper/images.go b/scraper/images.go
index 2bc6062..0e995c6 100644
--- a/scraper/images.go
+++ b/scraper/images.go
@@ -14,14 +14,14 @@ import (
 	"github.com/h2non/filetype/types"
 )
 
-func (s *Scraper) checkImageForRecode(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
+func (s *Scraper) checkImageForRecode(url *url.URL, data []byte) []byte {
 	if s.config.ImageQuality == 0 {
-		return buf
+		return data
 	}
 
-	kind, err := filetype.Match(buf.Bytes())
+	kind, err := filetype.Match(data)
 	if err != nil || kind == types.Unknown {
-		return buf
+		return data
 	}
 
 	s.logger.Debug("File type detected",
@@ -29,24 +29,24 @@ func (s *Scraper) checkImageForRecode(url *url.URL, buf *bytes.Bu
 		log.String("sub_type", kind.MIME.Subtype))
 
 	if kind.MIME.Type == matchers.TypeJpeg.MIME.Type && kind.MIME.Subtype == matchers.TypeJpeg.MIME.Subtype {
-		if recoded := s.recodeJPEG(url, buf.Bytes()); recoded != nil {
+		if recoded := s.recodeJPEG(url, data); recoded != nil {
 			return recoded
 		}
-		return buf
+		return data
 	}
 
 	if kind.MIME.Type == matchers.TypePng.MIME.Type && kind.MIME.Subtype == matchers.TypePng.MIME.Subtype {
-		if recoded := s.recodePNG(url, buf.Bytes()); recoded != nil {
+		if recoded := s.recodePNG(url, data); recoded != nil {
 			return recoded
 		}
-		return buf
+		return data
 	}
 
-	return buf
+	return data
 }
 
 // encodeJPEG encodes a new JPG based on the given quality setting.
-func (s *Scraper) encodeJPEG(img image.Image) *bytes.Buffer {
+func (s *Scraper) encodeJPEG(img image.Image) []byte {
 	o := &jpeg.Options{
 		Quality: int(s.config.ImageQuality),
 	}
@@ -55,45 +55,45 @@
 	if err := jpeg.Encode(outBuf, img, o); err != nil {
 		return nil
 	}
-	return outBuf
+	return outBuf.Bytes()
 }
 
 // recodeJPEG recodes the image and returns it if it is smaller than before.
-func (s *Scraper) recodeJPEG(url fmt.Stringer, b []byte) *bytes.Buffer {
-	inBuf := bytes.NewBuffer(b)
+func (s *Scraper) recodeJPEG(url fmt.Stringer, data []byte) []byte {
+	inBuf := bytes.NewBuffer(data)
 	img, err := jpeg.Decode(inBuf)
 	if err != nil {
 		return nil
 	}
 
-	outBuf := s.encodeJPEG(img)
-	if outBuf == nil || outBuf.Len() > len(b) { // only use the new file if it is smaller
+	encoded := s.encodeJPEG(img)
+	if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
 		return nil
 	}
 
 	s.logger.Debug("Recoded JPEG",
 		log.String("url", url.String()),
-		log.Int("size_original", len(b)),
-		log.Int("size_recoded", outBuf.Len()))
-	return outBuf
+		log.Int("size_original", len(data)),
+		log.Int("size_recoded", len(encoded)))
+	return encoded
 }
 
 // recodePNG recodes the image and returns it if it is smaller than before.
-func (s *Scraper) recodePNG(url fmt.Stringer, b []byte) *bytes.Buffer {
-	inBuf := bytes.NewBuffer(b)
+func (s *Scraper) recodePNG(url fmt.Stringer, data []byte) []byte {
+	inBuf := bytes.NewBuffer(data)
 	img, err := png.Decode(inBuf)
 	if err != nil {
 		return nil
 	}
 
-	outBuf := s.encodeJPEG(img)
-	if outBuf == nil || outBuf.Len() > len(b) { // only use the new file if it is smaller
+	encoded := s.encodeJPEG(img)
+	if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
 		return nil
 	}
 
 	s.logger.Debug("Recoded PNG",
 		log.String("url", url.String()),
-		log.Int("size_original", len(b)),
-		log.Int("size_recoded", outBuf.Len()))
-	return outBuf
+		log.Int("size_original", len(data)),
+		log.Int("size_recoded", len(encoded)))
+	return encoded
 }
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 113629f..c63f2f6 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -41,10 +41,10 @@ type Config struct {
 }
 
 type (
-	httpDownloader     func(ctx context.Context, u *url.URL) (*bytes.Buffer, *url.URL, error)
+	httpDownloader     func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error)
 	dirCreator         func(path string) error
 	fileExistenceCheck func(filePath string) bool
-	fileWriter         func(filePath string, buf *bytes.Buffer) error
+	fileWriter         func(filePath string, data []byte) error
 )
 
 // Scraper contains all scraping data.
@@ -189,7 +189,7 @@ func (s *Scraper) Start(ctx context.Context) error {
 
 func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) error {
 	s.logger.Info("Downloading webpage", log.String("url", u.String()))
-	buf, respURL, err := s.httpDownloader(ctx, u)
+	data, respURL, err := s.httpDownloader(ctx, u)
 	if err != nil {
 		s.logger.Error("Processing HTTP Request failed",
 			log.String("url", u.String()),
@@ -198,7 +198,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 	}
 
 	fileExtension := ""
-	kind, err := filetype.Match(buf.Bytes())
+	kind, err := filetype.Match(data)
 	if err == nil && kind != types.Unknown {
 		fileExtension = kind.Extension
 	}
@@ -210,6 +210,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 		s.URL = u
 	}
 
+	buf := bytes.NewBuffer(data)
 	doc, err := html.Parse(buf)
 	if err != nil {
 		s.logger.Error("Parsing HTML failed",
@@ -221,7 +222,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 	index := htmlindex.New()
 	index.Index(u, doc)
 
-	s.storeDownload(u, buf, doc, index, fileExtension)
+	s.storeDownload(u, data, doc, index, fileExtension)
 
 	if err := s.downloadReferences(ctx, index); err != nil {
 		return err
@@ -249,12 +250,12 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 
 // storeDownload writes the download to a file, if a known binary file is detected,
 // processing of the file as page to look for links is skipped.
-func (s *Scraper) storeDownload(u *url.URL, buf *bytes.Buffer, doc *html.Node,
+func (s *Scraper) storeDownload(u *url.URL, data []byte, doc *html.Node,
 	index *htmlindex.Index, fileExtension string) {
 
 	isAPage := false
 	if fileExtension == "" {
-		content, fixed, err := s.fixURLReferences(u, doc, index)
+		fixed, hasChanges, err := s.fixURLReferences(u, doc, index)
 		if err != nil {
 			s.logger.Error("Fixing file references failed",
 				log.String("url", u.String()),
@@ -262,15 +263,15 @@ func (s *Scraper) storeDownload(u *url.URL, buf *bytes.Buffer, doc *html.Node,
 			return
 		}
 
-		if fixed {
-			buf = bytes.NewBufferString(content)
+		if hasChanges {
+			data = fixed
 		}
 		isAPage = true
 	}
 
 	filePath := s.getFilePath(u, isAPage)
 	// always update html files, content might have changed
-	if err := s.fileWriter(filePath, buf); err != nil {
+	if err := s.fileWriter(filePath, data); err != nil {
 		s.logger.Error("Writing to file failed",
 			log.String("URL", u.String()),
 			log.String("file", filePath),
diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go
index 4246d63..98057c2 100644
--- a/scraper/scraper_test.go
+++ b/scraper/scraper_test.go
@@ -1,7 +1,6 @@
 package scraper
 
 import (
-	"bytes"
 	"context"
 	"fmt"
 	"net/url"
@@ -26,17 +25,17 @@ func newTestScraper(t *testing.T, startURL string, urls map[string][]byte) *Scra
 	scraper.dirCreator = func(_ string) error {
 		return nil
 	}
-	scraper.fileWriter = func(_ string, _ *bytes.Buffer) error {
+	scraper.fileWriter = func(_ string, _ []byte) error {
 		return nil
 	}
 	scraper.fileExistenceCheck = func(_ string) bool {
 		return false
 	}
-	scraper.httpDownloader = func(_ context.Context, url *url.URL) (*bytes.Buffer, *url.URL, error) {
+	scraper.httpDownloader = func(_ context.Context, url *url.URL) ([]byte, *url.URL, error) {
 		ur := url.String()
 		b, ok := urls[ur]
 		if ok {
-			return bytes.NewBuffer(b), url, nil
+			return b, url, nil
 		}
 		return nil, nil, fmt.Errorf("url '%s' not found in test data", ur)
 	}
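
Note on the testability claim in the subject line: with the hooks taking plain []byte instead of *bytes.Buffer, test doubles can pass byte-slice literals straight through, as the scraper_test.go stubs above now do. The standalone sketch below illustrates the same idea outside the patch; the upperCaseProcessor function and its test are invented for illustration only and are not part of goscrape.

package example

import (
	"bytes"
	"net/url"
	"testing"
)

// upperCaseProcessor has the same shape as the patch's assetProcessor type:
// it receives the raw bytes of a downloaded asset and returns the
// (possibly transformed) bytes that will be written to disk.
func upperCaseProcessor(_ *url.URL, data []byte) []byte {
	return bytes.ToUpper(data)
}

// Because the signature uses []byte, the test needs no buffer plumbing:
// a byte-slice literal goes in and the result is compared directly.
func TestUpperCaseProcessor(t *testing.T) {
	got := upperCaseProcessor(nil, []byte("body { color: red }"))
	want := "BODY { COLOR: RED }"
	if string(got) != want {
		t.Fatalf("got %q, want %q", got, want)
	}
}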