-
Notifications
You must be signed in to change notification settings - Fork 5
/
walker.yaml
81 lines (64 loc) · 2.72 KB
/
walker.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# The Walker Configuration File
#
# Walker was made to run with sensible default values, so most configuration
# keys are documented here but commented out. The values set here are the
# defaults.
# Whether to dynamically add new-found domains (or their links) to the crawl (a
# broad crawl) or discard them, assuming desired domains are manually seeded.
#add_new_domains: false
# The number of entries to keep in the CassandraDatastore's LRU cache of
# domains, preventing us from querying too frequently to see if we already have
# them.
#added_domains_cache_size: 20000
# Maximum number of entries to hold when we cache domain name resolutions
#max_dns_cache_entries: 20000
# Configure the User-Agent header
#user_agent: Walker (http://github.com/iParadigms/walker)
# Configure which formats this crawler Accepts
#accept_formats: ["text/html", "text/*"]
# Which links to accept based on their protocol (a.k.a. URL scheme)
#accept_protocols: ["http", "https"]
# Crawl delay (in seconds) to use when unspecified by robots.txt
#default_crawl_delay: 1
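# Maximum size of HTTP content, in bytes.
# NOTE(review): unlike the other keys, no default value is listed for this
# one; confirm the default in the code before uncommenting.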
#max_http_content_size_bytes
# For the purpose of parsing out links for crawling, walker looks at the
# following tags:
# - a, area, form, frame, iframe, script, link, img
# It ignores several by default.
#ignore_tags: [script, img, link]
# The maximum number of links to parse from a page for further crawling.
#max_links_per_page: 1000
# The number of simultaneous fetchers your crawl manager will run
#num_simultaneous_fetchers: 10
# If true, walker will not crawl domains that resolve to addresses in private
# IP ranges
#blacklist_private_ips: true
## Dispatcher configuration
#dispatcher:
# ## maximum number of links added to segments table per dispatch (must be >0)
# num_links_per_segment: 500
#
# ## refresh_percentage is the percentage of links added per dispatch that have already been crawled.
# ## So refresh_percentage = 25 means that 25% of the links added to segments on the next dispatch
# ## will be refreshed (i.e. already crawled) links. This value must be >= 0 and <= 100.
# refresh_percentage: 25
#
# ## How many concurrent dispatching threads will be run at once (must be >0)
# num_concurrent_domains: 1
# Cassandra configuration for the datastore.
# Generally these are used to create a gocql.ClusterConfig object
# (https://godoc.org/github.com/gocql/gocql#ClusterConfig).
#
# keyspace shouldn't generally need to be changed; it is mainly changed in
# testing as an extra layer of safety.
#
# replication_factor is used when defining the keyspace.
#cassandra:
# hosts: ["localhost"]
# keyspace: "walker"
# replication_factor: 3
## Console specific config
#console:
# port: 3000
# template_directory: console/templates
# public_folder: console/public