Skip to content

Commit

Permalink
Support passing config options as CLI arguments (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
philippta authored Nov 18, 2023
1 parent 94da929 commit 6aa52bd
Show file tree
Hide file tree
Showing 12 changed files with 349 additions and 82 deletions.
30 changes: 23 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@
```javascript
export const config = {
url: "https://news.ycombinator.com/",
// urls: [], // Specify additional URLs to start from. (default = none)
// depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
// follow: [], // Specify the CSS selectors to follow. (default = ["a[href]"])
// allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
// blockedDomains: [], // Specify the blocked domains. (default = none)
// allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
// blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
// rate: 100, // Specify the rate in requests per second. (default = no rate limit)
// proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
// cache: "file", // Enable file-based request caching. (default = no cache)
}

export default function ({ doc, absoluteURL }) {
Expand Down Expand Up @@ -99,17 +109,23 @@ To compile flyscrape from source, follow these steps:
## Usage

```
flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
Usage:
flyscrape <command> [arguments]
flyscrape run SCRIPT [config flags]
Examples:
# Run the script.
$ flyscrape run example.js
# Set the URL as argument.
$ flyscrape run example.js --url "http://other.com"
Commands:
# Enable proxy support.
$ flyscrape run example.js --proxies "http://someproxy:8043"
new creates a sample scraping script
run runs a scraping script
dev watches and re-runs a scraping script
# Follow paginated links.
$ flyscrape run example.js --depth 5 --follow ".next-button > a"
```

## Configuration
Expand Down
97 changes: 97 additions & 0 deletions cmd/args.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package cmd

import (
"fmt"
"slices"
"strconv"
"strings"
)

// arrayFields lists the config keys whose command line values are always
// stored as a slice by parseConfigArgs, even when the flag is passed only
// once.
var arrayFields = []string{
"urls",
"follow",
"allowedDomains",
"blockedDomains",
"allowedURLs",
"blockedURLs",
"proxies",
}

// parseConfigArgs converts command line config flags into a map of config
// updates keyed by the flag name without its leading "--".
//
// Accepted forms are "--key value", "--key=value" and a bare "--key", which
// is stored as true. Values are converted by parseArg. Passing a flag more
// than once collects its values in a []any, and keys listed in arrayFields
// are stored as []any even when they are given a single value.
//
// An error is returned when a value shows up where a flag is expected, for
// example a leading positional argument or a second value after
// "--key value". On error the returned map is nil.
func parseConfigArgs(args []string) (map[string]any, error) {
	updates := map[string]any{}

	// flag holds the most recently seen flag that is still waiting for a
	// value; it is empty when no flag is pending.
	flag := ""
	for _, arg := range normalizeArgs(args) {
		if isFlag(arg) {
			if flag != "" {
				// The pending flag was followed by another flag instead of
				// a value, so it is a boolean switch.
				updates[flag[2:]] = true
			}
			// The new flag becomes the pending one. Resetting it to ""
			// here would silently drop it, e.g. "--foo --bar baz" would
			// lose "--bar" and then reject "baz".
			flag = arg
			continue
		}

		if flag == "" {
			return nil, fmt.Errorf("expected flag, got %q instead", arg)
		}

		key, val := flag[2:], parseArg(arg)
		prev, seen := updates[key]
		switch list, isList := prev.([]any); {
		case !seen && slices.Contains(arrayFields, key):
			updates[key] = []any{val}
		case !seen:
			updates[key] = val
		case isList:
			updates[key] = append(list, val)
		default:
			// Second occurrence of a scalar flag: promote to a list.
			updates[key] = []any{prev, val}
		}
		flag = ""
	}

	// A trailing flag without a value is a boolean switch as well.
	if flag != "" {
		updates[flag[2:]] = true
	}

	return updates, nil
}

// normalizeArgs rewrites "--key=value" arguments into the two separate
// arguments "--key" and "value". Only the first "=" splits, so the value may
// itself contain "=". Arguments that do not start with "--" are passed
// through unchanged.
func normalizeArgs(args []string) []string {
	var out []string

	for _, a := range args {
		if !strings.HasPrefix(a, "--") {
			out = append(out, a)
			continue
		}

		name, value, hasValue := strings.Cut(a, "=")
		out = append(out, name)
		if hasValue {
			out = append(out, value)
		}
	}

	return out
}

// parseArg converts a raw argument value into a typed value: the literals
// "true" and "false" become booleans, anything strconv.Atoi accepts becomes
// an int, and everything else is returned unchanged as a string.
func parseArg(arg string) any {
	switch arg {
	case "true":
		return true
	case "false":
		return false
	}

	n, err := strconv.Atoi(arg)
	if err != nil {
		return arg
	}
	return n
}

// isFlag reports whether arg starts with the "--" flag prefix.
func isFlag(arg string) bool {
	_, ok := strings.CutPrefix(arg, "--")
	return ok
}
75 changes: 75 additions & 0 deletions cmd/args_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package cmd

import (
"strings"
"testing"

"github.com/stretchr/testify/require"
)

// TestParseConfigUpdates feeds whitespace-separated flag strings to
// parseConfigArgs and checks either the resulting update map or that an
// error is reported together with an empty result.
func TestParseConfigUpdates(t *testing.T) {
	type testCase struct {
		input   string
		wantErr bool
		want    map[string]any
	}

	cases := []testCase{
		{input: `--foo bar`, want: map[string]any{"foo": "bar"}},
		{input: `--foo=bar`, want: map[string]any{"foo": "bar"}},
		{input: `--foo`, want: map[string]any{"foo": true}},
		{input: `--foo false`, want: map[string]any{"foo": false}},
		{input: `--foo a --foo b`, want: map[string]any{"foo": []any{"a", "b"}}},
		{input: `--foo a --foo=b`, want: map[string]any{"foo": []any{"a", "b"}}},
		{input: `--foo 69`, want: map[string]any{"foo": 69}},
		{input: `--foo.bar a`, want: map[string]any{"foo.bar": "a"}},
		{input: `foo`, wantErr: true},
		{input: `--foo a b`, wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			got, err := parseConfigArgs(strings.Fields(tc.input))

			if !tc.wantErr {
				require.NoError(t, err)
				require.Equal(t, tc.want, got)
				return
			}

			require.Error(t, err)
			require.Empty(t, got)
		})
	}
}
19 changes: 14 additions & 5 deletions cmd/flyscrape/dev.go → cmd/dev.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package main
package cmd

import (
"flag"
Expand All @@ -22,11 +22,14 @@ func (c *DevCommand) Run(args []string) error {
} else if fs.NArg() == 0 || fs.Arg(0) == "" {
c.Usage()
return flag.ErrHelp
} else if fs.NArg() > 1 {
return fmt.Errorf("too many arguments")
}

return flyscrape.Dev(fs.Arg(0))
cfg, err := parseConfigArgs(fs.Args()[1:])
if err != nil {
return fmt.Errorf("Error parsing config flags: %w", err)
}

return flyscrape.Dev(fs.Arg(0), cfg)
}

func (c *DevCommand) Usage() {
Expand All @@ -36,11 +39,17 @@ Recursive scraping is disabled in this mode, only the initial URL will be scrape
Usage:
flyscrape dev SCRIPT
flyscrape dev SCRIPT [config flags]
Examples:
# Run and watch script.
$ flyscrape dev example.js
# Set the URL as argument.
$ flyscrape dev example.js --url "http://other.com"
# Enable proxy support.
$ flyscrape dev example.js --proxies "http://someproxy:8043"
`[1:])
}
52 changes: 5 additions & 47 deletions cmd/flyscrape/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@ package main
import (
_ "embed"
"flag"
"fmt"
"log"
"os"
"strings"

"github.com/philippta/flyscrape/cmd"
_ "github.com/philippta/flyscrape/modules/cache"
_ "github.com/philippta/flyscrape/modules/depth"
_ "github.com/philippta/flyscrape/modules/domainfilter"
Expand All @@ -26,51 +25,10 @@ import (
func main() {
log.SetFlags(0)

m := &Main{}
if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
os.Exit(1)
} else if err != nil {
log.Println(err)
os.Exit(1)
}
}

type Main struct{}

func (m *Main) Run(args []string) error {
var cmd string
if len(args) > 0 {
cmd, args = args[0], args[1:]
}

switch cmd {
case "new":
return (&NewCommand{}).Run(args)
case "run":
return (&RunCommand{}).Run(args)
case "dev":
return (&DevCommand{}).Run(args)
default:
if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
m.Usage()
return flag.ErrHelp
if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil {
if err != flag.ErrHelp {
log.Println(err)
}
return fmt.Errorf("flyscrape %s: unknown command", cmd)
os.Exit(1)
}
}

func (m *Main) Usage() {
fmt.Println(`
flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
Usage:
flyscrape <command> [arguments]
Commands:
new creates a sample scraping script
run runs a scraping script
dev watches and re-runs a scraping script
`[1:])
}
Loading

0 comments on commit 6aa52bd

Please sign in to comment.