Skip to content
This repository has been archived by the owner on Nov 6, 2020. It is now read-only.

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Patel committed Dec 12, 2018
0 parents commit 334e520
Show file tree
Hide file tree
Showing 6 changed files with 482 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/Downloads/
/fma-scraper*
/downloaded.txt
/.idea/
*.mp3
42 changes: 42 additions & 0 deletions args.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package main

import (
"fmt"
"github.com/akamensky/argparse"
"os"
)

// Command-line interface. Every option is registered on parser at package
// initialisation time; the returned pointers are only populated once
// parseArgs has run.
var (
	// parser owns all command-line options of the scraper.
	parser = argparse.NewParser(
		"free-music-archive-scraper",
		"Scraper for https://freemusicarchive.org/",
	)

	// genre is the mandatory genre to scrape; it must be one of availableGenres.
	genre = parser.Selector("g", "genre", availableGenres[:], &argparse.Options{
		Required: true,
		Help:     "Genre to scrape",
	})

	// concurrency is the number of parallel download workers.
	concurrency = parser.Int("c", "concurrency", &argparse.Options{
		Help:    "Number of connections",
		Default: 4,
	})

	// minPage is the first listing page to visit.
	minPage = parser.Int("", "min-page", &argparse.Options{
		Help:    "Starting page",
		Default: 1,
	})

	// dir is the directory the downloaded files are stored in.
	dir = parser.String("o", "out-dir", &argparse.Options{
		Help:    "Output directory",
		Default: "Downloads",
	})
)

// parseArgs parses the process arguments into the package-level option
// variables and makes sure the output directory exists. Any failure is
// printed to stderr and terminates the process with exit status 1.
func parseArgs() {
	// bail reports err on stderr and exits with status 1.
	bail := func(err error) {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	parseErr := parser.Parse(os.Args)
	if parseErr != nil {
		bail(parseErr)
	}

	mkdirErr := os.MkdirAll(*dir, 0777)
	if mkdirErr != nil {
		bail(mkdirErr)
	}
}
116 changes: 116 additions & 0 deletions downloader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package main

import (
"fmt"
"github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"mime"
"os"
"path/filepath"
"sync/atomic"
"time"
)

// downloader is a worker that consumes tracks from jobs, resolves each
// track's download URL via followRedirect, stores the file via download and
// forwards successfully downloaded tracks to results.
//
// The worker stops when jobs is closed or as soon as exitRequested is set,
// and signals downloadGroup on return. Tracks that fail at any step, or whose
// file already exists in the output directory, are logged and skipped: they
// are neither reported as "Downloaded track" nor sent to results.
func downloader(jobs <-chan Track, results chan<- Track) {
	defer downloadGroup.Done()
	for job := range jobs {
		if atomic.LoadInt32(&exitRequested) != 0 {
			break
		}

		now := time.Now()

		u, err := followRedirect(job.Download)
		if err != nil {
			logrus.WithError(err).
				WithField("url", job.Download).
				WithField("title", job.Title).
				Error("Download failed")
			// Without a resolved URL there is nothing to download.
			continue
		}
		job.Download = u

		n, err := download(u)

		// Account for every byte that was written, even on a failed attempt.
		atomic.AddInt64(&totalDownloaded, n)

		switch {
		case os.IsExist(err):
			logrus.
				WithField("title", job.Title).
				Warning("Already downloaded")
			continue
		case err != nil:
			logrus.
				WithError(err).
				WithField("url", job.Download).
				WithField("title", job.Title).
				Error("Download failed")
			continue
		}

		dur := time.Since(now)

		logrus.WithFields(logrus.Fields{
			"title": job.Title,
			"size":  n,
			"dur":   dur.Seconds(),
		}).Info("Downloaded track")

		results <- job
	}
}

// followRedirect requests u and returns the target of the redirect the
// server answers with.
//
// It returns an error when the request fails, when the response status is
// anything other than 302 Found, or when the redirect carries no Location
// header (so callers never receive an empty URL as a success).
func followRedirect(u string) (string, error) {
	req := fasthttp.AcquireRequest()
	defer fasthttp.ReleaseRequest(req)
	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(res)

	req.SetRequestURI(u)

	if err := fasthttp.Do(req, res); err != nil {
		return "", err
	}

	if sc := res.StatusCode(); sc != fasthttp.StatusFound {
		return "", fmt.Errorf("failed to get redirected to mp3: HTTP status %d", sc)
	}

	// Copy the header value into a string before res is released.
	location := string(res.Header.Peek("Location"))
	if location == "" {
		return "", fmt.Errorf("failed to get redirected to mp3: missing Location header")
	}

	return location, nil
}

// download fetches u and stores the response body in the output directory
// (*dir) under the file name announced in the Content-Disposition header.
//
// It returns the number of body bytes written. An error is returned when the
// request fails, the status is not 200, the Content-Disposition header is
// missing or unusable, or the file cannot be created or written. The file is
// created exclusively, so an already existing file yields an error that
// satisfies os.IsExist.
func download(u string) (int64, error) {
	req := fasthttp.AcquireRequest()
	defer fasthttp.ReleaseRequest(req)
	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(res)

	req.SetRequestURI(u)

	if err := fasthttp.Do(req, res); err != nil {
		return 0, err
	}

	if sc := res.StatusCode(); sc != 200 {
		return 0, fmt.Errorf("HTTP status %d", sc)
	}

	cd := string(res.Header.Peek("Content-Disposition"))
	if cd == "" {
		return 0, fmt.Errorf("missing Content-Disposition header")
	}

	_, params, err := mime.ParseMediaType(cd)
	if err != nil {
		return 0, err
	}

	fileName := params["filename"]
	if fileName == "" {
		return 0, fmt.Errorf("missing file name in Content-Disposition header")
	}

	// The name comes from the server: keep only its last path element so it
	// cannot point outside the output directory.
	fileName = filepath.Base(fileName)
	if fileName == "." || fileName == ".." || fileName == string(filepath.Separator) {
		return 0, fmt.Errorf("invalid file name in Content-Disposition header")
	}

	fileName = filepath.Join(*dir, fileName)
	f, err := os.OpenFile(fileName, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0666)
	if err != nil {
		return 0, err
	}

	// Write the body only; Response.WriteTo would serialise the status line
	// and headers into the file as well.
	written, err := f.Write(res.Body())
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best effort: drop the partial file so a later run does not mistake
		// it for a finished download. The write/close error is what matters
		// to the caller, so a failure to remove is deliberately ignored.
		_ = os.Remove(fileName)
		return int64(written), err
	}

	return int64(written), nil
}
29 changes: 29 additions & 0 deletions logger.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package main

import (
"bufio"
"encoding/json"
"github.com/sirupsen/logrus"
"os"
)

// logger appends every track received on results to "downloaded.txt" as one
// JSON document per line, until results is closed. It signals helperGroup on
// return.
//
// Each record is flushed to the file as soon as it is encoded, so records
// already written survive an abrupt process exit (logrus.Fatal or os.Exit
// elsewhere in the program), which would skip deferred calls. Failing to open
// or write the file is fatal.
func logger(results <-chan Track) {
	defer helperGroup.Done()
	f, err := os.OpenFile("downloaded.txt", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666)
	if err != nil {
		logrus.Fatal(err)
	}
	defer f.Close()

	wr := bufio.NewWriter(f)
	j := json.NewEncoder(wr)

	for result := range results {
		if err := j.Encode(result); err != nil {
			logrus.Fatal(err)
		}
		// Records arrive at download pace, so flushing per record is cheap.
		if err := wr.Flush(); err != nil {
			logrus.Fatal(err)
		}
	}
}
120 changes: 120 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package main

import (
"context"
"fmt"
"github.com/cenkalti/backoff"
"github.com/sirupsen/logrus"
"os"
"os/signal"
"sync"
"sync/atomic"
"time"
)

// availableGenres enumerates the genre identifiers accepted by the --genre
// option, spelled exactly as the user must pass them.
var availableGenres = [...]string{
	"Blues",
	"Classical",
	"Country",
	"Electronic",
	"Experimental",
	"Folk",
	"Hip-Hop",
	"Instrumental",
	"International",
	"Jazz",
	"Novelty",
	"Old-Time__Historic",
	"Pop",
	"Rock",
	"Soul-RB",
	"Spoken",
}

// Process-wide state shared by main, the download workers and the helpers.
var (
	// startTime is taken at package initialisation and used for the final stats.
	startTime = time.Now()

	// totalDownloaded counts downloaded bytes; access it through sync/atomic.
	totalDownloaded int64

	// downloadGroup tracks the running download workers.
	downloadGroup sync.WaitGroup

	// helperGroup tracks auxiliary goroutines such as the logger.
	helperGroup sync.WaitGroup

	// exitRequested is non-zero once shutdown was requested; access it
	// through sync/atomic.
	exitRequested int32
)

// pageSize is the base for the job and result channel capacities
// (each channel is buffered to twice this value).
const pageSize = 2

// Track is the metadata of a single track. It travels through the job and
// result channels and is serialised as JSON by the logger.
type Track struct {
	// Artist is the performing artist.
	Artist string `json:"artist"`
	// Title is the track title; it is also used to identify the track in log output.
	Title string `json:"title"`
	// Album is the album the track belongs to.
	Album string `json:"album"`
	// Genres lists the track's genres (serialised under the key "genre").
	Genres []string `json:"genre"`
	// Download is the download URL; the downloader replaces it with the
	// redirect target once that has been resolved.
	Download string `json:"download"`
}

// main wires the scraper together: it parses the command line, starts the
// result logger and *concurrency download workers, then visits listing pages
// one after another (starting at *minPage) through list, retrying each page
// with exponential back-off.
//
// The page loop ends once an exit was requested via Ctrl-C. The back-off is
// bound to the cancellation context, so a requested exit interrupts pending
// retries instead of retrying against a cancelled context. A page that keeps
// failing for any other reason is fatal. On shutdown main waits for the
// workers and the logger to finish and logs transfer statistics.
func main() {
	parseArgs()

	jobs := make(chan Track, 2*pageSize)
	results := make(chan Track, 2*pageSize)

	ctx, cancel := context.WithCancel(context.Background())

	go listenCtrlC(cancel)

	// Start logger
	helperGroup.Add(1)
	go logger(results)

	// Start downloaders
	downloadGroup.Add(*concurrency)
	for i := 0; i < *concurrency; i++ {
		go downloader(jobs, results)
	}

	// Start meta grabber
	page := *minPage
	for atomic.LoadInt32(&exitRequested) == 0 {
		err := backoff.Retry(func() error {
			err := list(ctx, jobs, *genre, page)
			if err != nil {
				logrus.WithError(err).
					Errorf("Failed visiting page %d", page)
			}
			return err
		}, backoff.WithContext(backoff.NewExponentialBackOff(), ctx))

		if err != nil {
			// listenCtrlC sets exitRequested before cancelling ctx, so a
			// failure caused by the cancellation is a regular shutdown.
			if atomic.LoadInt32(&exitRequested) != 0 {
				break
			}
			logrus.Fatal(err)
		}

		page++
	}

	// Shutdown
	close(jobs)
	downloadGroup.Wait()
	close(results)
	helperGroup.Wait()

	total := atomic.LoadInt64(&totalDownloaded)
	dur := time.Since(startTime).Seconds()

	logrus.WithFields(logrus.Fields{
		"total":    total,
		"dur":      dur,
		"avg_rate": float64(total) / dur,
	}).Info("Stats")
}

// listenCtrlC implements the two-stage interrupt handling and is meant to run
// in its own goroutine. The first interrupt sets exitRequested, invokes
// cancel and tells the user that running downloads are being finished; a
// second interrupt terminates the process immediately with exit status 255.
func listenCtrlC(cancel context.CancelFunc) {
	interrupts := make(chan os.Signal, 1)
	signal.Notify(interrupts, os.Interrupt)

	// First ^C: request a graceful shutdown.
	<-interrupts
	atomic.StoreInt32(&exitRequested, 1)
	cancel()
	fmt.Fprintln(os.Stderr, "\nWaiting for downloads to finish...")
	fmt.Fprintln(os.Stderr, "Press ^C again to exit instantly.")

	// Second ^C: give up waiting.
	<-interrupts
	fmt.Fprintln(os.Stderr, "\nKilled!")
	os.Exit(255)
}
Loading

0 comments on commit 334e520

Please sign in to comment.