Skip to content

Commit

Permalink
feat: add extra-pages to scrape
Browse files (browse the repository at this point in the history)
  • Loading branch information
marcotuna committed Nov 27, 2023
1 parent 8bc41f7 commit 7eae728
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 8 deletions.
6 changes: 6 additions & 0 deletions cmd/wp-go-static/commands/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func init() {
ScrapeCmd.PersistentFlags().String("url", "", "URL to scrape")
ScrapeCmd.PersistentFlags().String("cache", "", "Cache directory")
ScrapeCmd.PersistentFlags().String("replace-url", "", "Replace with a specific url")
ScrapeCmd.PersistentFlags().StringSlice("extra-pages", []string{}, "Extra pages to scrape")
ScrapeCmd.PersistentFlags().Bool("replace", true, "Replace url")
ScrapeCmd.PersistentFlags().Bool("parallel", false, "Fetch in parallel")
ScrapeCmd.PersistentFlags().Bool("images", true, "Download images")
Expand Down Expand Up @@ -96,6 +97,11 @@ func scrapeCmdF(command *cobra.Command, args []string) error {
// Visit only pages that are part of the website
scrape.c.AllowedDomains = []string{scrape.hostname}

for _, extraPage := range scrape.config.Scrape.ExtraPages {
log.Println("Visiting Extra Page:", extraPage)
scrape.visitURL(extraPage)
}

// On every a element which has href attribute call callback
scrape.c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
Expand Down
17 changes: 9 additions & 8 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@ type SitemapConfig struct {
}

// ScrapeConfig groups the settings of the scrape command. Every field carries
// a `mapstructure` tag whose name matches a flag name used by the scrape
// command (e.g. "url", "cache", "replace-url", "extra-pages").
//
// NOTE(review): this listing is a diff rendering whose +/- markers were lost.
// The first eight field lines are the removed (pre-commit) version and the
// nine that follow are the added (post-commit) version; the only visible
// content change is the new ExtraPages field (the other eight lines were
// presumably only re-aligned — confirm against the repository). As shown, the
// duplicated field names would not compile.
type ScrapeConfig struct {
// Pre-commit field list (removed lines of the diff).
Dir string `mapstructure:"dir"`
URL string `mapstructure:"url"`
Cache string `mapstructure:"cache"`
ReplaceURL string `mapstructure:"replace-url"`
Replace bool `mapstructure:"replace"`
Parallel bool `mapstructure:"parallel"`
Images bool `mapstructure:"images"`
CheckHead bool `mapstructure:"check-head"`
// Post-commit field list (added lines of the diff).
Dir string `mapstructure:"dir"`
URL string `mapstructure:"url"`
Cache string `mapstructure:"cache"`
ReplaceURL string `mapstructure:"replace-url"`
Replace bool `mapstructure:"replace"`
Parallel bool `mapstructure:"parallel"`
Images bool `mapstructure:"images"`
CheckHead bool `mapstructure:"check-head"`
// ExtraPages is the list of additional pages to scrape; the scrape command
// logs each entry and passes it to visitURL.
ExtraPages []string `mapstructure:"extra-pages"`
}

type RobotsConfig struct {
Expand Down

0 comments on commit 7eae728

Please sign in to comment.