diff --git a/README.md b/README.md
index cd3027e..b123f42 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,15 @@ go install github.com/cornelk/goscrape@latest
 Compiling the tool from source code needs to have a recent version of [Golang](https://go.dev/) installed.
 
 ## Usage
+
+Scrape a website by running
+```
+goscrape http://website.com
+```
+
+To serve the downloaded website directory with a locally running webserver, use
 ```
-goscrape http://website.com
+goscrape --serve website.com
 ```
 
 ## Options
@@ -50,7 +57,7 @@
 ```
 Scrape a website and create an offline browsable version on the disk.
 
-Usage: goscrape [--include INCLUDE] [--exclude EXCLUDE] [--output OUTPUT] [--depth DEPTH] [--imagequality IMAGEQUALITY] [--timeout TIMEOUT] [--header HEADER] [--proxy PROXY] [--user USER] [--useragent USERAGENT] [--verbose] [URLS [URLS ...]]
+Usage: goscrape [--include INCLUDE] [--exclude EXCLUDE] [--output OUTPUT] [--depth DEPTH] [--imagequality IMAGEQUALITY] [--timeout TIMEOUT] [--serve SERVE] [--serverport SERVERPORT] [--header HEADER] [--proxy PROXY] [--user USER] [--useragent USERAGENT] [--verbose] [URLS [URLS ...]]
 
 Positional arguments:
   URLS
@@ -68,6 +75,10 @@ Options:
                          image quality, 0 to disable reencoding
   --timeout TIMEOUT, -t TIMEOUT
                          time limit in seconds for each HTTP request to connect and read the request body
+  --serve SERVE, -s SERVE
+                         serve a downloaded website from the given directory
+  --serverport SERVERPORT, -r SERVERPORT
+                         port to use for the webserver [default: 8080]
   --header HEADER, -h HEADER
                          HTTP header to use for scraping
   --proxy PROXY, -p PROXY
diff --git a/main.go b/main.go
index 87325ea..267a67c 100644
--- a/main.go
+++ b/main.go
@@ -31,6 +31,9 @@ type arguments struct {
 	ImageQuality int64 `arg:"-i,--imagequality" help:"image quality, 0 to disable reencoding"`
 	Timeout      int64 `arg:"-t,--timeout" help:"time limit in seconds for each HTTP request to connect and read the request body"`
 
+	Serve      string `arg:"-s,--serve" help:"serve a downloaded website from the given directory"`
+	ServerPort uint16 `arg:"-r,--serverport" help:"port to use for the webserver" default:"8080"`
+
 	Headers []string `arg:"-h,--header" help:"HTTP header to use for scraping"`
 	Proxy   string   `arg:"-p,--proxy" help:"HTTP proxy to use for scraping"`
 	User    string   `arg:"-u,--user" help:"user[:password] to use for authentication"`
@@ -55,8 +58,26 @@ func main() {
 	}
 
 	ctx := app.Context()
-	if err := run(ctx, args); err != nil {
-		fmt.Printf("Execution error: %s\n", err)
+
+	if args.Verbose {
+		log.SetDefaultLevel(log.DebugLevel)
+	}
+	logger, err := createLogger()
+	if err != nil {
+		fmt.Printf("Creating logger failed: %s\n", err)
+		os.Exit(1)
+	}
+
+	if args.Serve != "" {
+		if err := runServer(ctx, args, logger); err != nil {
+			fmt.Printf("Server execution error: %s\n", err)
+			os.Exit(1)
+		}
+		return
+	}
+
+	if err := runScraper(ctx, args, logger); err != nil {
+		fmt.Printf("Scraping execution error: %s\n", err)
 		os.Exit(1)
 	}
 }
@@ -81,7 +102,7 @@ func readArguments() (arguments, error) {
 		return arguments{}, fmt.Errorf("parsing arguments: %w", err)
 	}
 
-	if len(args.URLs) == 0 {
+	if len(args.URLs) == 0 && args.Serve == "" {
 		parser.WriteHelp(os.Stdout)
 		os.Exit(0)
 	}
@@ -90,7 +111,7 @@
 
 // nolint: funlen
-func run(ctx context.Context, args arguments) error {
+func runScraper(ctx context.Context, args arguments, logger *log.Logger) error {
 	if len(args.URLs) == 0 {
 		return nil
 	}
 
@@ -109,14 +130,6 @@ func runScraper(ctx context.Context, args arguments, logger *log.Logger) error {
 		imageQuality = 0
 	}
 
-	if args.Verbose {
-		log.SetDefaultLevel(log.DebugLevel)
-	}
-	logger, err := createLogger()
-	if err != nil {
-		return fmt.Errorf("creating logger: %w", err)
-	}
-
 	cfg := scraper.Config{
 		Includes: args.Include,
 		Excludes: args.Exclude,
@@ -154,6 +167,13 @@ func runScraper(ctx context.Context, args arguments, logger *log.Logger) error {
 	return nil
 }
 
+func runServer(ctx context.Context, args arguments, logger *log.Logger) error {
+	if err := scraper.ServeDirectory(ctx, args.Serve, args.ServerPort, logger); err != nil {
+		return fmt.Errorf("serving directory: %w", err)
+	}
+	return nil
+}
+
 func createLogger() (*log.Logger, error) {
 	logCfg, err := log.ConfigForEnv(env.Development)
 	if err != nil {
diff --git a/scraper/server.go b/scraper/server.go
new file mode 100644
index 0000000..76f4f9f
--- /dev/null
+++ b/scraper/server.go
@@ -0,0 +1,57 @@
+package scraper
+
+import (
+	"context"
+	"fmt"
+	"mime"
+	"net/http"
+
+	"github.com/cornelk/gotokit/log"
+)
+
+// mimeTypes defines additional MIME types to register; this for example
+// ensures that .asp files are rendered as HTML instead of being downloaded.
+var mimeTypes = map[string]string{
+	".asp": "text/html; charset=utf-8",
+}
+
+// ServeDirectory serves the given directory on a local webserver at the given port.
+func ServeDirectory(ctx context.Context, path string, port uint16, logger *log.Logger) error {
+	fs := http.FileServer(http.Dir(path))
+	mux := http.NewServeMux()
+	mux.Handle("/", fs) // serve the directory root from the file system
+
+	// register the additional mime types
+	for ext, mt := range mimeTypes {
+		if err := mime.AddExtensionType(ext, mt); err != nil {
+			return fmt.Errorf("adding mime type '%s': %w", ext, err)
+		}
+	}
+
+	fullAddr := fmt.Sprintf("http://127.0.0.1:%d", port)
+	logger.Info("Serving directory...",
+		log.String("path", path),
+		log.String("address", fullAddr))
+
+	server := &http.Server{
+		Addr:    fmt.Sprintf(":%d", port),
+		Handler: mux,
+	}
+
+	serverErr := make(chan error, 1)
+	go func() {
+		serverErr <- server.ListenAndServe()
+	}()
+
+	select {
+	case <-ctx.Done():
+		//nolint: contextcheck
+		if err := server.Shutdown(context.Background()); err != nil {
+			return fmt.Errorf("shutting down webserver: %w", err)
+		}
+		return nil
+
+	case err := <-serverErr:
+		return fmt.Errorf("starting webserver: %w", err)
+	}
+}
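
The `mimeTypes` registration matters because `http.FileServer` resolves the `Content-Type` header through `mime.TypeByExtension`; without an entry for `.asp`, browsers receive an unknown type and may download the file instead of rendering it. A standalone sketch, separate from the change itself, showing the effect of the same `mime.AddExtensionType` call that `ServeDirectory` performs:

```go
package main

import (
	"fmt"
	"mime"
)

func main() {
	// On most systems .asp is not a known extension, so no type is reported.
	fmt.Println(mime.TypeByExtension(".asp")) // ""

	// The same registration ServeDirectory performs for its mimeTypes entries.
	if err := mime.AddExtensionType(".asp", "text/html; charset=utf-8"); err != nil {
		panic(err)
	}
	fmt.Println(mime.TypeByExtension(".asp")) // "text/html; charset=utf-8"
}
```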
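
Beyond the `--serve` flag, the new `scraper.ServeDirectory` function can also be driven directly from Go. The following is a minimal sketch, not part of the change: it assumes the `createLogger` helper defined in `main.go` for constructing the gotokit `*log.Logger`, and uses `signal.NotifyContext` so that Ctrl+C cancels the context, which triggers the graceful `server.Shutdown` branch of the `select` above.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"

	"github.com/cornelk/goscrape/scraper"
)

// serveScrapedSite is a hypothetical driver for the new API.
func serveScrapedSite() {
	// Ctrl+C cancels the context; ServeDirectory reacts to ctx.Done()
	// by shutting the webserver down gracefully.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
	defer stop()

	logger, err := createLogger() // assumed: the helper defined in main.go
	if err != nil {
		fmt.Printf("Creating logger failed: %s\n", err)
		os.Exit(1)
	}

	// Serve a previously scraped site from the "website.com" directory
	// at http://127.0.0.1:8080 until the context is canceled.
	if err := scraper.ServeDirectory(ctx, "website.com", 8080, logger); err != nil {
		fmt.Printf("Server execution error: %s\n", err)
		os.Exit(1)
	}
}
```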