Merge pull request #79 from tharropoulos/header-middleware

Add header inspection tools

jasonbosco authored Feb 4, 2025
2 parents 27407d7 + 2256baf commit 678d77b

Showing 3 changed files with 67 additions and 2 deletions.
28 changes: 28 additions & 0 deletions scraper/src/header_inspector_middleware.py
@@ -0,0 +1,28 @@
import logging

from scrapy import signals


class HeaderInspectionMiddleware:
    """
    Middleware to inspect headers of outgoing requests and incoming responses
    """

    def __init__(self):
        self.spider = None

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        self.spider = spider

    def process_request(self, request, spider):
        """
        This method is called for each request that goes through the downloader middleware.
        """
        logging.debug("\nOutgoing request to: %s", request.url)
        logging.debug("\nHeaders: %s", request.headers)
5 changes: 3 additions & 2 deletions scraper/src/index.py
@@ -14,9 +14,9 @@
from .documentation_spider import DocumentationSpider
from .strategies.default_strategy import DefaultStrategy
from .custom_downloader_middleware import CustomDownloaderMiddleware
from .header_inspector_middleware import HeaderInspectionMiddleware
from .custom_dupefilter import CustomDupeFilter
from .config.browser_handler import BrowserHandler
from .strategies.algolia_settings import AlgoliaSettings

try:
    # disable boto (S3 download)
@@ -46,6 +46,7 @@ def run_config(config):

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    HEADER_MIDDLEWARES_PATH = root_module + 'header_inspector_middleware.' + HeaderInspectionMiddleware.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
@@ -98,7 +99,7 @@ def run_config(config):
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
-       'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
+       'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900, HEADER_MIDDLEWARES_PATH: 901},
        # Need to be > 600 to be after the redirectMiddleware
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
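
Taken together, the three additions in this file register the inspector: the new import, the HEADER_MIDDLEWARES_PATH constant, and a second entry in DOWNLOADER_MIDDLEWARES. In Scrapy, the values in DOWNLOADER_MIDDLEWARES are ordering priorities: lower numbers sit closer to the engine, and process_request hooks fire in ascending priority order, so 901 runs the header inspector immediately after CustomDownloaderMiddleware (900), and both stay above 600, matching the in-code comment about RedirectMiddleware. When the scraper runs as a package, the effective setting resolves to roughly this (a sketch of the resulting dict, not a literal excerpt from the codebase):

DOWNLOADER_MIDDLEWARES = {
    # Existing middleware sees each outgoing request first (900 < 901)
    'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware': 900,
    # The new inspector logs the request URL and headers right after it
    'scraper.src.header_inspector_middleware.HeaderInspectionMiddleware': 901,
}
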
36 changes: 36 additions & 0 deletions scraper/src/tests/config_loader/auth_test.py
@@ -0,0 +1,36 @@
import os
import pdb
from unittest.mock import MagicMock

import pytest
from scrapy.http import Request
from scrapy.spidermiddlewares.httperror import HttpError


@pytest.fixture
def config():
    return MagicMock(
        index_name="test_index",
        start_urls=[{"url": "http://example.com"}],
        allowed_domains=["example.com"],
        stop_urls=[],
        js_render=False,
    )


@pytest.fixture
def env_vars(monkeypatch):
    monkeypatch.setenv("DOCSEARCH_BASICAUTH_USERNAME", "testuser")
    monkeypatch.setenv("DOCSEARCH_BASICAUTH_PASSWORD", "testpass")
    monkeypatch.setenv("DOCSEARCH_AUTH_DOMAIN", "http://example.com")


def test_spider_auth_attributes(config, env_vars):
    """Test that DocumentationSpider correctly sets up Basic Auth attributes"""
    from scraper.src.documentation_spider import DocumentationSpider

    spider = DocumentationSpider(config=config, typesense_helper=None, strategy=None)

    assert spider.http_user == "testuser"
    assert spider.http_pass == "testpass"
    assert spider.http_auth_domain == "http://example.com"
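
The three attributes asserted here, http_user, http_pass, and http_auth_domain, are the names Scrapy's built-in HttpAuthMiddleware reads when attaching an Authorization header to requests for the configured domain. To see the header those fixture credentials would produce, w3lib (a Scrapy dependency) can build it directly; a small illustration, not part of the commit:

from w3lib.http import basic_auth_header

# Yields b'Basic dGVzdHVzZXI6dGVzdHBhc3M=' for the fixture credentials above
print(basic_auth_header("testuser", "testpass"))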
