Skip to content

Commit

Permalink
zstd + dict v0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Wikidepia committed Aug 9, 2024
1 parent 0762485 commit f6c49d9
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 5 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/elastic/go-freelru v0.13.0
github.com/go-chi/chi/v5 v5.1.0
github.com/kelindar/binary v1.0.19
github.com/klauspost/compress v1.17.9
github.com/tdewolff/parse/v2 v2.7.15
github.com/tidwall/gjson v1.17.1
go.etcd.io/bbolt v1.3.10
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw=
github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
github.com/kelindar/binary v1.0.19 h1:DNyQCtKjkLhBh9pnP49OWREddLB0Mho+1U/AOt/Qzxw=
github.com/kelindar/binary v1.0.19/go.mod h1:/twdz8gRLNMffx0U4UOgqm1LywPs6nd9YK2TX52MDh8=
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
Expand Down
22 changes: 17 additions & 5 deletions handlers/scraper/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package handlers

import (
"bytes"
_ "embed"
"errors"
"instafix/utils"
"io"
Expand All @@ -15,6 +16,7 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/PurpleSec/escape"
"github.com/kelindar/binary"
"github.com/klauspost/compress/zstd"
"github.com/tdewolff/parse/v2"
"github.com/tdewolff/parse/v2/js"
"github.com/tidwall/gjson"
Expand All @@ -23,16 +25,17 @@ import (
"golang.org/x/sync/singleflight"
)

// timeout is a 10-second duration; its consumers are outside this excerpt.
// NOTE(review): timeout is also declared in the var group directly below.
// This text comes from a diff view, so one of the two declarations is
// presumably the removed line — confirm only one survives in the real file.
var timeout = 10 * time.Second

var (
timeout = 10 * time.Second
// ErrNotFound is a sentinel error carrying the message "post not found".
// Where it is returned is not visible in this excerpt.
ErrNotFound = errors.New("post not found")
)

// RemoteScraperAddr presumably holds the address of the remote scraper
// service — TODO confirm against the code that sets and reads it.
var RemoteScraperAddr string

// sflightScraper is a singleflight.Group. Its call sites are not visible
// here; presumably it collapses concurrent scrapes of the same post — verify.
var sflightScraper singleflight.Group

// zstdDict holds the contents of dictionary.bin, embedded at build time.
// ScrapeData passes it to zstd.WithDecoderDicts when building the decoder
// used on remote scraper responses.
//
//go:embed dictionary.bin
var zstdDict []byte

type Media struct {
TypeName string
URL string
Expand Down Expand Up @@ -138,12 +141,21 @@ func (i *InstaData) ScrapeData() error {
if err != nil {
return err
}
req.Header.Set("Accept-Encoding", "zstd")
res, err := client.Do(req)
if res != nil && res.StatusCode == 200 {
defer res.Body.Close()
iDataGunzip, err := io.ReadAll(res.Body)
zstdReader, err := zstd.NewReader(nil, zstd.WithDecoderLowmem(true), zstd.WithDecoderDicts(zstdDict))
if err != nil {
return err
}
remoteData, err := io.ReadAll(res.Body)
if err == nil {
if err = binary.Unmarshal(iDataGunzip, i); err == nil {
remoteDecomp, err := zstdReader.DecodeAll(remoteData, nil)
if err != nil {
return err
}
if err = binary.Unmarshal(remoteDecomp, i); err == nil {
slog.Info("Data parsed from remote scraper", "postID", i.PostID)
return nil
}
Expand Down

0 comments on commit f6c49d9

Please sign in to comment.