support serving directory in local webserver (#44)
cornelk authored Jun 21, 2024
1 parent caab877 commit 754db93
Showing 3 changed files with 101 additions and 14 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -41,16 +41,23 @@ go install github.com/cornelk/goscrape@latest
Compiling the tool from source code requires a recent version of [Golang](https://go.dev/) to be installed.

## Usage

Scrape a website by running
```
goscrape http://website.com
```

To serve the downloaded website directory in a local webserver, run
```
goscrape http://website.com
goscrape --serve website.com
```

## Options

```
Scrape a website and create an offline browsable version on the disk.
Usage: goscrape [--include INCLUDE] [--exclude EXCLUDE] [--output OUTPUT] [--depth DEPTH] [--imagequality IMAGEQUALITY] [--timeout TIMEOUT] [--header HEADER] [--proxy PROXY] [--user USER] [--useragent USERAGENT] [--verbose] [URLS [URLS ...]]
Usage: goscrape [--include INCLUDE] [--exclude EXCLUDE] [--output OUTPUT] [--depth DEPTH] [--imagequality IMAGEQUALITY] [--timeout TIMEOUT] [--serve SERVE] [--serverport SERVERPORT] [--header HEADER] [--proxy PROXY] [--user USER] [--useragent USERAGENT] [--verbose] [URLS [URLS ...]]
Positional arguments:
URLS
@@ -68,6 +75,10 @@ Options:
image quality, 0 to disable reencoding
--timeout TIMEOUT, -t TIMEOUT
time limit in seconds for each HTTP request to connect and read the request body
--serve SERVE, -s SERVE
serve the website using a webserver
--serverport SERVERPORT, -r SERVERPORT
port to use for the webserver [default: 8080]
--header HEADER, -h HEADER
HTTP header to use for scraping
--proxy PROXY, -p PROXY
44 changes: 32 additions & 12 deletions main.go
@@ -31,6 +31,9 @@ type arguments struct {
ImageQuality int64 `arg:"-i,--imagequality" help:"image quality, 0 to disable reencoding"`
Timeout int64 `arg:"-t,--timeout" help:"time limit in seconds for each HTTP request to connect and read the request body"`

Serve string `arg:"-s,--serve" help:"serve the website using a webserver"`
ServerPort int16 `arg:"-r,--serverport" help:"port to use for the webserver" default:"8080"`

Headers []string `arg:"-h,--header" help:"HTTP header to use for scraping"`
Proxy string `arg:"-p,--proxy" help:"HTTP proxy to use for scraping"`
User string `arg:"-u,--user" help:"user[:password] to use for authentication"`
@@ -55,8 +58,26 @@ func main() {
}

ctx := app.Context()
if err := run(ctx, args); err != nil {
fmt.Printf("Execution error: %s\n", err)

if args.Verbose {
log.SetDefaultLevel(log.DebugLevel)
}
logger, err := createLogger()
if err != nil {
fmt.Printf("Creating logger failed: %s\n", err)
os.Exit(1)
}

if args.Serve != "" {
if err := runServer(ctx, args, logger); err != nil {
fmt.Printf("Server execution error: %s\n", err)
os.Exit(1)
}
return
}

if err := runScraper(ctx, args, logger); err != nil {
fmt.Printf("Scraping execution error: %s\n", err)
os.Exit(1)
}
}
@@ -81,7 +102,7 @@ func readArguments() (arguments, error) {
return arguments{}, fmt.Errorf("parsing arguments: %w", err)
}

if len(args.URLs) == 0 {
if len(args.URLs) == 0 && args.Serve == "" {
parser.WriteHelp(os.Stdout)
os.Exit(0)
}
@@ -90,7 +111,7 @@
}

// nolint: funlen
func run(ctx context.Context, args arguments) error {
func runScraper(ctx context.Context, args arguments, logger *log.Logger) error {
if len(args.URLs) == 0 {
return nil
}
@@ -109,14 +130,6 @@ func run(ctx context.Context, args arguments) error {
imageQuality = 0
}

if args.Verbose {
log.SetDefaultLevel(log.DebugLevel)
}
logger, err := createLogger()
if err != nil {
return fmt.Errorf("creating logger: %w", err)
}

cfg := scraper.Config{
Includes: args.Include,
Excludes: args.Exclude,
@@ -154,6 +167,13 @@ func run(ctx context.Context, args arguments) error {
return nil
}

func runServer(ctx context.Context, args arguments, logger *log.Logger) error {
if err := scraper.ServeDirectory(ctx, args.Serve, args.ServerPort, logger); err != nil {
return fmt.Errorf("serving directory: %w", err)
}
return nil
}

func createLogger() (*log.Logger, error) {
logCfg, err := log.ConfigForEnv(env.Development)
if err != nil {
56 changes: 56 additions & 0 deletions scraper/server.go
@@ -0,0 +1,56 @@
package scraper

import (
"context"
"fmt"
"mime"
"net/http"

"github.com/cornelk/gotokit/log"
)

// register additional mime types so that, for example, .asp files are rendered
// as html in the browser instead of being offered as downloads.
var mimeTypes = map[string]string{
".asp": "text/html; charset=utf-8",
}

// ServeDirectory serves the given directory with a local webserver until the context is cancelled.
func ServeDirectory(ctx context.Context, path string, port int16, logger *log.Logger) error {
fs := http.FileServer(http.Dir(path))
mux := http.NewServeMux()
mux.Handle("/", fs) // serve the root path from the file system

// update mime types
for ext, mt := range mimeTypes {
if err := mime.AddExtensionType(ext, mt); err != nil {
return fmt.Errorf("adding mime type '%s': %w", ext, err)
}
}

fullAddr := fmt.Sprintf("http://127.0.0.1:%d", port)
logger.Info("Serving directory...",
log.String("path", path),
log.String("address", fullAddr))

server := &http.Server{
Addr: fmt.Sprintf(":%d", port),
Handler: mux,
}

serverErr := make(chan error, 1)
go func() {
serverErr <- server.ListenAndServe()
}()

select {
case <-ctx.Done():
//nolint: contextcheck
if err := server.Shutdown(context.Background()); err != nil {
return fmt.Errorf("shutting down webserver: %w", err)
}
return nil

case err := <-serverErr:
return fmt.Errorf("starting webserver: %w", err)
}
}
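The shutdown pattern in ServeDirectory above, with ListenAndServe running in a goroutine while a select waits on either context cancellation or a server error, can be reproduced with the standard library alone. The sketch below is illustrative only and not part of this commit; the serveDir helper, the website.com directory name, and port 8080 are assumptions chosen to mirror the defaults shown above.

```
// Illustrative sketch (not part of this commit): serving a scraped site
// directory with only the standard library, mirroring ServeDirectory above.
package main

import (
	"context"
	"fmt"
	"log"
	"mime"
	"net/http"
	"os/signal"
	"syscall"
)

// serveDir is a hypothetical helper mirroring scraper.ServeDirectory.
func serveDir(ctx context.Context, dir string, port int) error {
	// Treat .asp files as html so the browser renders them instead of downloading.
	if err := mime.AddExtensionType(".asp", "text/html; charset=utf-8"); err != nil {
		return fmt.Errorf("adding mime type: %w", err)
	}

	mux := http.NewServeMux()
	mux.Handle("/", http.FileServer(http.Dir(dir)))

	server := &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
	}

	errCh := make(chan error, 1)
	go func() {
		errCh <- server.ListenAndServe()
	}()

	select {
	case <-ctx.Done():
		// Context was cancelled (e.g. Ctrl+C): shut the server down cleanly.
		return server.Shutdown(context.Background())
	case err := <-errCh:
		return fmt.Errorf("starting webserver: %w", err)
	}
}

func main() {
	// Cancel the context on SIGINT/SIGTERM so the server shuts down gracefully.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	if err := serveDir(ctx, "website.com", 8080); err != nil {
		log.Fatal(err)
	}
}
```

Running this next to a previously scraped website.com directory and opening http://127.0.0.1:8080 shows the offline copy; pressing Ctrl+C cancels the context and shuts the server down cleanly.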
