diff --git a/scraper/src/header_inspector_middleware.py b/scraper/src/header_inspector_middleware.py
new file mode 100644
index 0000000..227565b
--- /dev/null
+++ b/scraper/src/header_inspector_middleware.py
@@ -0,0 +1,28 @@
+import logging
+
+from scrapy import signals
+
+
+class HeaderInspectionMiddleware:
+    """
+    Middleware to inspect headers of outgoing requests and incoming responses
+    """
+
+    def __init__(self):
+        self.spider = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+
+    def spider_opened(self, spider):
+        self.spider = spider
+
+    def process_request(self, request, spider):
+        """
+        This method is called for each request that goes through the download middleware.
+        """
+        logging.debug("\nOutgoing request to: %s", request.url)
+        logging.debug("\nHeaders: %s", request.headers)
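Note: the class docstring mentions incoming responses, but the new file only implements process_request, so only outgoing request headers are logged. If response headers should be inspected as well, a minimal sketch of the missing hook (not part of this patch; it follows Scrapy's standard downloader-middleware signature) could look like:

    def process_response(self, request, response, spider):
        # Sketch only: log the incoming response headers and return the
        # response unchanged so the rest of the middleware chain still runs.
        logging.debug("\nIncoming response from: %s", response.url)
        logging.debug("\nHeaders: %s", response.headers)
        return response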
diff --git a/scraper/src/index.py b/scraper/src/index.py
index 9eaa963..f059cf8 100644
--- a/scraper/src/index.py
+++ b/scraper/src/index.py
@@ -14,9 +14,9 @@
 from .documentation_spider import DocumentationSpider
 from .strategies.default_strategy import DefaultStrategy
 from .custom_downloader_middleware import CustomDownloaderMiddleware
+from .header_inspector_middleware import HeaderInspectionMiddleware
 from .custom_dupefilter import CustomDupeFilter
 from .config.browser_handler import BrowserHandler
-from .strategies.algolia_settings import AlgoliaSettings
 
 try:
     # disable boto (S3 download)
@@ -46,6 +46,7 @@ def run_config(config):
     root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
     DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
+    HEADER_MIDDLEWARES_PATH = root_module + 'header_inspector_middleware.' + HeaderInspectionMiddleware.__name__
     DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__
 
     headers = {
@@ -98,7 +99,7 @@ def run_config(config):
         'LOG_ENABLED': '1',
         'LOG_LEVEL': 'ERROR',
         'USER_AGENT': config.user_agent,
-        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
+        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900, HEADER_MIDDLEWARES_PATH: 901},
         # Need to be > 600 to be after the redirectMiddleware
         'DUPEFILTER_USE_ANCHORS': config.use_anchors,
         # Use our custom dupefilter in order to be scheme agnostic regarding link provided
diff --git a/scraper/src/tests/config_loader/auth_test.py b/scraper/src/tests/config_loader/auth_test.py
new file mode 100644
index 0000000..c149502
--- /dev/null
+++ b/scraper/src/tests/config_loader/auth_test.py
@@ -0,0 +1,36 @@
+import os
+import pdb
+from unittest.mock import MagicMock
+
+import pytest
+from scrapy.http import Request
+from scrapy.spidermiddlewares.httperror import HttpError
+
+
+@pytest.fixture
+def config():
+    return MagicMock(
+        index_name="test_index",
+        start_urls=[{"url": "http://example.com"}],
+        allowed_domains=["example.com"],
+        stop_urls=[],
+        js_render=False,
+    )
+
+
+@pytest.fixture
+def env_vars(monkeypatch):
+    monkeypatch.setenv("DOCSEARCH_BASICAUTH_USERNAME", "testuser")
+    monkeypatch.setenv("DOCSEARCH_BASICAUTH_PASSWORD", "testpass")
+    monkeypatch.setenv("DOCSEARCH_AUTH_DOMAIN", "http://example.com")
+
+
+def test_spider_auth_attributes(config, env_vars):
+    """Test that DocumentationSpider correctly sets up Basic Auth attributes"""
+    from scraper.src.documentation_spider import DocumentationSpider
+
+    spider = DocumentationSpider(config=config, typesense_helper=None, strategy=None)
+
+    assert spider.http_user == "testuser"
+    assert spider.http_pass == "testpass"
+    assert spider.http_auth_domain == "http://example.com"
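Note on the test: it relies on Scrapy's built-in HttpAuthMiddleware, which reads Basic Auth credentials from the spider attributes http_user, http_pass and http_auth_domain. The corresponding DocumentationSpider change is not shown in this diff; the test assumes the spider maps the DOCSEARCH_* environment variables onto those attributes roughly like this (sketch only, wiring assumed):

    import os

    class DocumentationSpider(CrawlSpider):
        def __init__(self, config, typesense_helper, strategy, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Assumed wiring: expose the environment variables as the spider
            # attributes that scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware uses.
            self.http_user = os.environ.get('DOCSEARCH_BASICAUTH_USERNAME')
            self.http_pass = os.environ.get('DOCSEARCH_BASICAUTH_PASSWORD')
            self.http_auth_domain = os.environ.get('DOCSEARCH_AUTH_DOMAIN')

The new test can be run with pytest, e.g. pytest scraper/src/tests/config_loader/auth_test.py.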