This repository has been archived by the owner on Nov 6, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Richard Patel
committed
Dec 12, 2018
0 parents
commit 334e520
Showing
6 changed files
with
482 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
/Downloads/ | ||
/fma-scraper* | ||
/downloaded.txt | ||
/.idea/ | ||
*.mp3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"github.com/akamensky/argparse" | ||
"os" | ||
) | ||
|
||
var parser = argparse.NewParser("free-music-archive-scraper", | ||
"Scraper for https://freemusicarchive.org/") | ||
|
||
var genre = parser.Selector("g", "genre", availableGenres[:], &argparse.Options{ | ||
Required: true, | ||
Help: "Genre to scrape", | ||
}) | ||
|
||
var concurrency = parser.Int("c", "concurrency", &argparse.Options{ | ||
Help: "Number of connections", | ||
Default: 4, | ||
}) | ||
|
||
var minPage = parser.Int("", "min-page", &argparse.Options{ | ||
Help: "Starting page", | ||
Default: 1, | ||
}) | ||
|
||
var dir = parser.String("o", "out-dir", &argparse.Options{ | ||
Help: "Output directory", | ||
Default: "Downloads", | ||
}) | ||
|
||
func parseArgs() { | ||
if err := parser.Parse(os.Args); err != nil { | ||
fmt.Fprintln(os.Stderr, err) | ||
os.Exit(1) | ||
} | ||
|
||
if err := os.MkdirAll(*dir, 0777); err != nil { | ||
fmt.Fprintln(os.Stderr, err) | ||
os.Exit(1) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"github.com/sirupsen/logrus" | ||
"github.com/valyala/fasthttp" | ||
"mime" | ||
"os" | ||
"path/filepath" | ||
"sync/atomic" | ||
"time" | ||
) | ||
|
||
func downloader(jobs <-chan Track, results chan<- Track) { | ||
defer downloadGroup.Done() | ||
for job := range jobs { | ||
if atomic.LoadInt32(&exitRequested) != 0 { | ||
break | ||
} | ||
|
||
now := time.Now() | ||
|
||
u, err := followRedirect(job.Download) | ||
if err != nil { | ||
logrus.WithError(err). | ||
WithField("url", job.Download). | ||
WithField("title", job.Title). | ||
Error("Download failed") | ||
} | ||
job.Download = u | ||
|
||
n, err := download(u) | ||
if os.IsExist(err) { | ||
logrus. | ||
WithField("title", job.Title). | ||
Warning("Already downloaded") | ||
} else if err != nil { | ||
logrus. | ||
WithError(err). | ||
WithField("url", job.Download). | ||
WithField("title", job.Title). | ||
Error("Download failed") | ||
} | ||
|
||
dur := time.Since(now) | ||
|
||
atomic.AddInt64(&totalDownloaded, n) | ||
|
||
logrus.WithFields(logrus.Fields{ | ||
"title": job.Title, | ||
"size": n, | ||
"dur": dur.Seconds(), | ||
}).Info("Downloaded track") | ||
|
||
results <- job | ||
} | ||
} | ||
|
||
func followRedirect(u string) (string, error) { | ||
req := fasthttp.AcquireRequest() | ||
defer fasthttp.ReleaseRequest(req) | ||
res := fasthttp.AcquireResponse() | ||
defer fasthttp.ReleaseResponse(res) | ||
|
||
req.SetRequestURI(u) | ||
|
||
if err := fasthttp.Do(req, res); err != nil { | ||
return "", err | ||
} | ||
|
||
if sc := res.StatusCode(); sc != 302 { | ||
return "", fmt.Errorf("failed to get redirected to mp3: HTTP status %d", sc) | ||
} | ||
|
||
return string(res.Header.Peek("Location")), nil | ||
} | ||
|
||
func download(u string) (int64, error) { | ||
req := fasthttp.AcquireRequest() | ||
defer fasthttp.ReleaseRequest(req) | ||
res := fasthttp.AcquireResponse() | ||
defer fasthttp.ReleaseResponse(res) | ||
|
||
req.SetRequestURI(u) | ||
|
||
if err := fasthttp.Do(req, res); err != nil { | ||
return 0, err | ||
} | ||
|
||
if sc := res.StatusCode(); sc != 200 { | ||
return 0, fmt.Errorf("HTTP status %d", sc) | ||
} | ||
|
||
cd := string(res.Header.Peek("Content-Disposition")) | ||
if cd == "" { | ||
return 0, fmt.Errorf("missing Content-Disposition header") | ||
} | ||
|
||
_, params, err := mime.ParseMediaType(cd) | ||
if err != nil { | ||
return 0, err | ||
} | ||
|
||
fileName := params["filename"] | ||
if fileName == "" { | ||
return 0, fmt.Errorf("missing file name in Content-Disposition header") | ||
} | ||
|
||
fileName = filepath.Join(*dir, fileName) | ||
f, err := os.OpenFile(fileName, os.O_CREATE | os.O_EXCL | os.O_WRONLY, 0666) | ||
if err != nil { | ||
return 0, err | ||
} | ||
|
||
return res.WriteTo(f) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package main | ||
|
||
import ( | ||
"bufio" | ||
"encoding/json" | ||
"github.com/sirupsen/logrus" | ||
"os" | ||
) | ||
|
||
func logger(results <-chan Track) { | ||
defer helperGroup.Done() | ||
f, err := os.OpenFile("downloaded.txt", os.O_CREATE | os.O_APPEND | os.O_WRONLY, 0666) | ||
if err != nil { | ||
logrus.Fatal(err) | ||
} | ||
defer f.Close() | ||
|
||
wr := bufio.NewWriter(f) | ||
defer wr.Flush() | ||
|
||
j := json.NewEncoder(wr) | ||
|
||
for result := range results { | ||
err := j.Encode(result) | ||
if err != nil { | ||
logrus.Fatal(err) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"github.com/cenkalti/backoff" | ||
"github.com/sirupsen/logrus" | ||
"os" | ||
"os/signal" | ||
"sync" | ||
"sync/atomic" | ||
"time" | ||
) | ||
|
||
// availableGenres lists the genre names accepted by the genre command-line
// selector. It is an array so that callers can slice it where a []string is
// needed.
// NOTE(review): the spellings presumably mirror the genre identifiers used
// by the Free Music Archive site — confirm before editing them.
var availableGenres = [...]string{
	"Blues",
	"Classical",
	"Country",
	"Electronic",
	"Experimental",
	"Folk",
	"Hip-Hop",
	"Instrumental",
	"International",
	"Jazz",
	"Novelty",
	"Old-Time__Historic",
	"Pop",
	"Rock",
	"Soul-RB",
	"Spoken",
}
|
||
// Process-wide state shared by main and the goroutines it starts.
var (
	// startTime is taken at program start; main uses it for the final
	// run-time statistics.
	startTime = time.Now()

	// totalDownloaded counts the bytes written by all downloaders. It is
	// accessed with the sync/atomic functions only.
	totalDownloaded int64

	// downloadGroup tracks the running downloader goroutines.
	downloadGroup sync.WaitGroup

	// helperGroup tracks the logger goroutine.
	helperGroup sync.WaitGroup

	// exitRequested becomes non-zero once the user asked for a shutdown.
	// It is accessed with the sync/atomic functions only.
	exitRequested int32
)

// pageSize determines the buffer size of the jobs and results channels,
// which each hold 2*pageSize tracks.
// NOTE(review): presumably also the number of tracks per listing page —
// confirm against list().
const pageSize = 2
|
||
// Track is the metadata of a single track. It is the unit of work handed to
// the downloaders and the record written, JSON-encoded, by the logger.
type Track struct {
	// Artist is serialised as "artist".
	Artist string `json:"artist"`
	// Title is serialised as "title" and used to identify the track in
	// log messages.
	Title string `json:"title"`
	// Album is serialised as "album".
	Album string `json:"album"`
	// Genres is serialised under the singular key "genre".
	Genres []string `json:"genre"`
	// Download is the URL the track is fetched from; the downloader
	// replaces it with the URL it was redirected to.
	Download string `json:"download"`
}
|
||
func main() { | ||
parseArgs() | ||
|
||
jobs := make(chan Track, 2 * pageSize) | ||
results := make(chan Track, 2 * pageSize) | ||
|
||
c, cancel := context.WithCancel(context.Background()) | ||
|
||
go listenCtrlC(cancel) | ||
|
||
// Start logger | ||
helperGroup.Add(1) | ||
go logger(results) | ||
|
||
// Start downloaderss | ||
downloadGroup.Add(*concurrency) | ||
for i := 0; i < *concurrency; i++ { | ||
go downloader(jobs, results) | ||
} | ||
|
||
// Start meta grabber | ||
page := *minPage | ||
for { | ||
if atomic.LoadInt32(&exitRequested) != 0 { | ||
break | ||
} | ||
|
||
err := backoff.Retry(func() error { | ||
err := list(c, jobs, *genre, page) | ||
if err != nil { | ||
logrus.WithError(err). | ||
Errorf("Failed visiting page %d", page) | ||
} | ||
return err | ||
}, backoff.NewExponentialBackOff()) | ||
|
||
if err != nil { | ||
logrus.Fatal(err) | ||
} | ||
|
||
page++ | ||
} | ||
|
||
// Shutdown | ||
close(jobs) | ||
downloadGroup.Wait() | ||
close(results) | ||
helperGroup.Wait() | ||
|
||
total := atomic.LoadInt64(&totalDownloaded) | ||
dur := time.Since(startTime).Seconds() | ||
|
||
logrus.WithFields(logrus.Fields{ | ||
"total": total, | ||
"dur": dur, | ||
"avg_rate": float64(total) / dur, | ||
}).Info("Stats") | ||
} | ||
|
||
func listenCtrlC(cancel context.CancelFunc) { | ||
c := make(chan os.Signal, 1) | ||
signal.Notify(c, os.Interrupt) | ||
<-c | ||
atomic.StoreInt32(&exitRequested, 1) | ||
cancel() | ||
fmt.Fprintln(os.Stderr, "\nWaiting for downloads to finish...") | ||
fmt.Fprintln(os.Stderr, "Press ^C again to exit instantly.") | ||
<-c | ||
fmt.Fprintln(os.Stderr, "\nKilled!") | ||
os.Exit(255) | ||
} |
Oops, something went wrong.