# Scrapy settings for beatportscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import json
import os
import psycopg2
BOT_NAME = "beatportscraper"
SPIDER_MODULES = ["beatportscraper.spiders"]
NEWSPIDER_MODULE = "beatportscraper.spiders"
# FEEDS = {
#     'data.csv': {'format': 'csv'}
# }
parent_directory = os.path.split(os.path.dirname(__file__))[0]
file_path = os.path.join(parent_directory, 'credentials.json')
with open(file_path, 'r') as f:
    credentials = json.load(f)
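# credentials.json lives in the project root, one level above this
# package. The lookups below imply it has roughly this shape; the values
# here are illustrative only:
#
# {
#     "hostname": "localhost",
#     "username": "postgres",
#     "password": "...",
#     "database": "beatport",
#     "scrapeops_api": "YOUR_SCRAPEOPS_API_KEY"
# }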
POSTGRES_HOSTNAME = credentials["hostname"]
POSTGRES_USERNAME = credentials["username"]
POSTGRES_PASSWORD = credentials["password"]
POSTGRES_DATABASE = credentials["database"]
SCRAPEOPS_API_KEY = credentials["scrapeops_api"]
SCRAPEOPS_NUM_RES = 5
SCRAPEOPS_ENPOINT = 'https://headers.scrapeops.io/v1/browser-headers'
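# The three SCRAPEOPS_* settings above are read by the
# ScrapeOpsFakeBrowserHeaders middleware registered below. A minimal
# sketch of the call it presumably makes (parameter names follow the
# ScrapeOps browser-headers API; see middlewares.py for the real code):
#
# import requests
# response = requests.get(
#     SCRAPEOPS_ENPOINT,
#     params={'api_key': SCRAPEOPS_API_KEY, 'num_results': SCRAPEOPS_NUM_RES},
# )
# header_list = response.json().get('result', [])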
# GET LATEST ENTRY FROM POSTGRES
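# A minimal sketch of what the note above could become, kept commented
# out so importing settings stays side-effect free. The table and column
# names ('tracks', 'release_date') are hypothetical; match them to the
# schema used by SaveToPostgresPipeline:
#
# conn = psycopg2.connect(
#     host=POSTGRES_HOSTNAME,
#     user=POSTGRES_USERNAME,
#     password=POSTGRES_PASSWORD,
#     dbname=POSTGRES_DATABASE,
# )
# with conn, conn.cursor() as cur:
#     cur.execute("SELECT MAX(release_date) FROM tracks;")
#     LATEST_ENTRY = cur.fetchone()[0]
# conn.close()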
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "beatportscraper (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 1
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    # "beatportscraper.middlewares.BeatportscraperSpiderMiddleware": 543,
    "scrapy_deltafetch.DeltaFetch": 200,
}
DELTAFETCH_ENABLED = True
# DELTAFETCH_RESET = True
# UNCOMMENT THE LINE ABOVE FOR A SINGLE RUN TO RESET DELTAFETCH
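# scrapy-deltafetch also supports a per-run reset from the command line,
# e.g.:
#   scrapy crawl <spider_name> -a deltafetch_reset=1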
# LOG_ENABLED = True
# LOG_FILE = "log.log"
# LOG_FILE_APPEND = False
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "beatportscraper.middlewares.BeatportscraperDownloaderMiddleware": 543,
    "beatportscraper.middlewares.ScrapeOpsFakeBrowserHeaders": 500,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    # The value here is the extension's order, not a timeout.
    "scrapy.extensions.closespider.CloseSpider": 15,
}
# Close the spider after 15 seconds without scraping any item
# (CLOSESPIDER_TIMEOUT_NO_ITEM requires Scrapy >= 2.10).
CLOSESPIDER_TIMEOUT_NO_ITEM = 15
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "beatportscraper.pipelines.BeatportscraperPipeline": 400,
    "beatportscraper.pipelines.SaveToPostgresPipeline": 500,
}
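# Lower order runs first: BeatportscraperPipeline (presumably cleaning and
# validating items) runs before SaveToPostgresPipeline persists them.
# Both classes are assumed to implement Scrapy's standard pipeline hook:
#
# class SaveToPostgresPipeline:
#     def process_item(self, item, spider):
#         # insert the item into Postgres via psycopg2, then return it
#         return item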
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"