Merge pull request #65 from GSA/SRCH-5751_low_output_fixes
SRCH-5751 Low Output Fixes
selfdanielj authored Dec 21, 2024
2 parents 2d107a6 + d465872 commit 608fee5
Showing 17 changed files with 315 additions and 148 deletions.
47 changes: 35 additions & 12 deletions search_gov_crawler/benchmark.py
@@ -21,19 +21,19 @@
import argparse
import json
import logging
import time
import os
import sys
from datetime import datetime, UTC, timedelta
import time
from datetime import UTC, datetime, timedelta
from pathlib import Path

from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.schedulers.background import BackgroundScheduler
from pythonjsonlogger.json import JsonFormatter

from search_gov_crawler.search_gov_spiders.extensions.json_logging import LOG_FMT
from search_gov_crawler import scrapy_scheduler
from search_gov_crawler.search_gov_spiders.extensions.json_logging import LOG_FMT

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
@@ -58,7 +58,12 @@ def init_scheduler() -> BackgroundScheduler:


def create_apscheduler_job(
name: str, allowed_domains: str, starting_urls: str, handle_javascript: bool, runtime_offset_seconds: int
name: str,
allow_query_string: bool,
allowed_domains: str,
starting_urls: str,
handle_javascript: bool,
runtime_offset_seconds: int,
) -> dict:
"""Creates job record in format needed by apscheduler"""

@@ -71,6 +76,7 @@ def create_apscheduler_job(
"next_run_time": datetime.now(tz=UTC) + timedelta(seconds=runtime_offset_seconds),
"args": [
"domain_spider" if not handle_javascript else "domain_spider_js",
allow_query_string,
allowed_domains,
starting_urls,
],
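For reference, a minimal sketch of a call under the new signature (all values below are placeholders, not taken from the repository). It shows that allow_query_string now travels as the second positional argument handed to the spider job:

    job = create_apscheduler_job(
        name="example-site",                      # placeholder job name
        allow_query_string=False,
        allowed_domains="example.gov",
        starting_urls="https://example.gov/",
        handle_javascript=False,
        runtime_offset_seconds=5,
    )
    # Among the scheduler fields, the job record now carries the flag in its args:
    # job["args"] == ["domain_spider", False, "example.gov", "https://example.gov/"]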
@@ -110,19 +116,24 @@ def benchmark_from_file(input_file: Path, runtime_offset_seconds: int):
scheduler.shutdown() # this will wait until all jobs are finished


def benchmark_from_args(allowed_domains: str, starting_urls: str, handle_javascript: bool, runtime_offset_seconds: int):
def benchmark_from_args(
allow_query_string: bool,
allowed_domains: str,
starting_urls: str,
handle_javascript: bool,
runtime_offset_seconds: int,
):
"""Run an individual benchmarking job based on args"""

log.info(
"Starting benchmark from args! allowed_domains=%s starting_urls=%s handle_javascript=%s runtime_offset_seconds=%s",
allowed_domains,
starting_urls,
handle_javascript,
runtime_offset_seconds,
msg = (
"Starting benchmark from args! "
"allow_query_string=%s allowed_domains=%s starting_urls=%s handle_javascript=%s runtime_offset_seconds=%s"
)
log.info(msg, allow_query_string, allowed_domains, starting_urls, handle_javascript, runtime_offset_seconds)

apscheduler_job_kwargs = {
"name": "benchmark",
"allow_query_string": allow_query_string,
"allowed_domains": allowed_domains,
"starting_urls": starting_urls,
"handle_javascript": handle_javascript,
@@ -146,13 +157,25 @@ def benchmark_from_args(allowed_domains: str, starting_urls: str, handle_javascr
parser.add_argument("-d", "--allowed_domains", type=str, help="domains allowed to crawl", required=no_input_arg)
parser.add_argument("-u", "--starting_urls", type=str, help="url used to start crawl", required=no_input_arg)
parser.add_argument(
"-js", "--handle_js", action=argparse.BooleanOptionalAction, default=False, help="Flag to enable javascript"
"-js",
"--handle_js",
action=argparse.BooleanOptionalAction,
default=False,
help="Flag to enable javascript",
)
parser.add_argument(
"-qs",
"--allow_query_string",
action=argparse.BooleanOptionalAction,
default=False,
help="Flag to enable capturing URLs with query strings",
)
parser.add_argument("-t", "--runtime_offset", type=int, default=5, help="Number of seconds to offset job start")
args = parser.parse_args()

if no_input_arg:
benchmark_args = {
"allow_query_string": args.allow_query_string,
"allowed_domains": args.allowed_domains,
"starting_urls": args.starting_urls,
"handle_javascript": args.handle_js,
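Taken together, the new -qs/--allow_query_string option flows from the CLI into benchmark_from_args. A minimal programmatic sketch of the same path (domain and URL are placeholders):

    from search_gov_crawler.benchmark import benchmark_from_args

    # Roughly what `python benchmark.py -d example.gov -u https://example.gov/ -qs`
    # would schedule; all argument values are illustrative.
    benchmark_from_args(
        allow_query_string=True,
        allowed_domains="example.gov",
        starting_urls="https://example.gov/",
        handle_javascript=False,
        runtime_offset_seconds=5,
    )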
16 changes: 8 additions & 8 deletions search_gov_crawler/scrapy_scheduler.py
@@ -1,7 +1,7 @@
import json
import logging
import subprocess
import os
import subprocess
from pathlib import Path

from apscheduler.executors.pool import ThreadPoolExecutor
@@ -20,26 +20,25 @@
CRAWL_SITES_FILE = Path(__file__).parent / "search_gov_spiders" / "utility_files" / "crawl-sites.json"


def run_scrapy_crawl(spider: str, allowed_domains: str, start_urls: str) -> None:
def run_scrapy_crawl(spider: str, allow_query_string: bool, allowed_domains: str, start_urls: str) -> None:
"""Runs `scrapy crawl` command as a subprocess given the allowed arguments"""

scrapy_env = os.environ.copy()
scrapy_env["PYTHONPATH"] = str(Path(__file__).parent.parent)

subprocess.run(
f"scrapy crawl {spider} -a allowed_domains={allowed_domains} -a start_urls={start_urls}",
f"scrapy crawl {spider} -a allow_query_string={allow_query_string} -a allowed_domains={allowed_domains} -a start_urls={start_urls}",
check=True,
cwd=Path(__file__).parent,
env=scrapy_env,
executable="/bin/bash",
shell=True,
)
log.info(
"Successfully completed scrapy crawl with args spider=%s, allowed_domains=%s, start_urls=%s",
spider,
allowed_domains,
start_urls,
msg = (
"Successfully completed scrapy crawl with args "
"spider=%s, allow_query_string=%s, allowed_domains=%s, start_urls=%s"
)
log.info(msg, spider, allow_query_string, allowed_domains, start_urls)


def transform_crawl_sites(crawl_sites: list[dict]) -> list[dict]:
@@ -70,6 +69,7 @@ def transform_crawl_sites(crawl_sites: list[dict]) -> list[dict]:
),
"args": [
"domain_spider" if not crawl_site["handle_javascript"] else "domain_spider_js",
crawl_site["allow_query_string"],
crawl_site["allowed_domains"],
crawl_site["starting_urls"],
],
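To make the new plumbing concrete, here is a sketch of a crawl invocation with query strings enabled. The site values are invented, and calling run_scrapy_crawl directly like this launches a real subprocess, so treat it as illustrative only:

    from search_gov_crawler.scrapy_scheduler import run_scrapy_crawl

    # Builds and runs (via bash) a command equivalent to:
    #   scrapy crawl domain_spider -a allow_query_string=True \
    #     -a allowed_domains=example.gov -a start_urls=https://example.gov/
    run_scrapy_crawl(
        spider="domain_spider",
        allow_query_string=True,
        allowed_domains="example.gov",
        start_urls="https://example.gov/",
    )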
36 changes: 19 additions & 17 deletions search_gov_crawler/search_gov_spiders/extensions/json_logging.py
@@ -2,11 +2,10 @@
from typing import Self

from pythonjsonlogger.json import JsonFormatter
from scrapy.spiders import Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.signals import spider_opened

from scrapy.spiders import Spider

LOG_FMT = "%(asctime)%(name)%(levelname)%(message)"

@@ -16,6 +15,7 @@ def search_gov_default(obj) -> dict | None:
if isinstance(obj, Spider):
return {
"name": obj.name,
"allow_query_string": getattr(obj, "allow_query_string", None),
"allowed_domains": getattr(obj, "allowed_domains", None),
"start_urls": obj.start_urls,
}
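A small sketch of what that serialization yields for a spider; the spider attributes below are illustrative, not taken from the repository:

    from scrapy.spiders import Spider
    from search_gov_crawler.search_gov_spiders.extensions.json_logging import search_gov_default

    class DemoSpider(Spider):                     # hypothetical spider for illustration
        name = "domain_spider"
        allow_query_string = False
        allowed_domains = ["example.gov"]
        start_urls = ["https://example.gov/"]

    search_gov_default(DemoSpider())
    # -> {"name": "domain_spider", "allow_query_string": False,
    #     "allowed_domains": ["example.gov"], "start_urls": ["https://example.gov/"]}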
@@ -29,32 +29,30 @@ def search_gov_default(obj) -> dict | None:
class SearchGovSpiderStreamHandler(logging.StreamHandler):
"""Extension of logging.StreamHandler with our level, fmt, and defaults"""

def __init__(self, *_args, **_kwargs):
def __init__(self, log_level, *_args, **_kwargs):
super().__init__(*_args, **_kwargs)
formatter = JsonFormatter(fmt=LOG_FMT, json_default=search_gov_default)
self.setLevel(logging.INFO)
self.setLevel(log_level)
self.setFormatter(formatter)


class SearchGovSpiderFileHandler(logging.FileHandler):
"""Extension of logging.File with our level, fmt, and defaults"""

def __init__(self, *args, **kwargs):
def __init__(self, log_level, *args, **kwargs):
super().__init__(*args, **kwargs)
formatter = JsonFormatter(fmt=LOG_FMT, json_default=search_gov_default)
self.setLevel(logging.INFO)
self.setLevel(log_level)
self.setFormatter(formatter)

@classmethod
def from_hanlder(cls, handler: logging.FileHandler) -> "SearchGovSpiderFileHandler":
def from_hanlder(cls, handler: logging.FileHandler, log_level: str) -> "SearchGovSpiderFileHandler":
"""Create a json file handler based on values used by an existing FileHandler"""

if handler.baseFilename == "/dev/null":
new_filename = handler.baseFilename
else:
new_filename = f"{handler.baseFilename}.json"
new_filename = handler.baseFilename if handler.baseFilename == "/dev/null" else f"{handler.baseFilename}.json"

return cls(
log_level=log_level,
filename=new_filename,
mode=handler.mode,
encoding=handler.encoding,
@@ -67,38 +65,42 @@ class JsonLogging:
"""Scrapy extension that injects JSON logging into a spider run."""

file_hanlder_enabled: bool
log_level: str

def __init__(self):
def __init__(self, log_level):
self.file_hanlder_enabled = False
self.log_level = log_level
self._add_json_handlers()

def _add_json_handlers(self) -> None:
"""Try to add json hanlders for file and streaming"""

if not self.file_hanlder_enabled:
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.setLevel(self.log_level)

file_handlers = [handler for handler in root_logger.handlers if isinstance(handler, logging.FileHandler)]

for file_handler in file_handlers:
root_logger.addHandler(SearchGovSpiderFileHandler.from_hanlder(file_handler))
root_logger.addHandler(
SearchGovSpiderFileHandler.from_hanlder(handler=file_handler, log_level=self.log_level)
)
self.file_hanlder_enabled = True

if not any(
handler for handler in root_logger.handlers if isinstance(handler, SearchGovSpiderStreamHandler)
):
root_logger.addHandler(SearchGovSpiderStreamHandler())
root_logger.addHandler(SearchGovSpiderStreamHandler(log_level=self.log_level))

@classmethod
def from_crawler(cls, crawler) -> Self:
def from_crawler(cls, crawler: Crawler) -> Self:
"""
Required extension method that checks for configuration and connects extension methons to signals
"""
if not crawler.settings.getbool("JSON_LOGGING_ENABLED"):
raise NotConfigured("JsonLogging Extension is listed in Extension but is not enabled.")

ext = cls()
ext = cls(log_level=crawler.settings.get("LOG_LEVEL", "INFO"))
crawler.signals.connect(ext.spider_opened, signal=spider_opened)
return ext

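For context, a sketch of the Scrapy settings that would activate this extension. Only JSON_LOGGING_ENABLED and LOG_LEVEL appear in the diff above, so the dotted extension path and priority below are assumptions:

    # settings.py (sketch)
    JSON_LOGGING_ENABLED = True
    LOG_LEVEL = "INFO"  # read in from_crawler and passed to both JSON handlers
    EXTENSIONS = {
        "search_gov_spiders.extensions.json_logging.JsonLogging": 500,  # assumed dotted path and priority
    }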
(Additional changed file; filename not shown in this view.)
@@ -8,7 +8,7 @@
# fmt: off
FILTER_EXTENSIONS = [
# archives
"7z", "7zip", "bz2", "rar", "tar", "tar.gz", "xz", "zip", "gz"
"7z", "7zip", "bz2", "rar", "tar", "tar.gz", "xz", "zip", "gz",
# images
"mng", "pct", "bmp", "gif", "jpg", "jpeg", "png", "pst", "psp", "image",
"tif", "tiff", "ai", "drw", "dxf", "eps", "ps", "svg", "cdr", "ico",
@@ -21,8 +21,8 @@
"ppt", "pptx", "pps", "odt", "ods", "odg", "odp",
# other
"css", "exe", "bin", "rss", "dmg", "iso", "apk", "js", "xml", "ibooks",
"cfm", "ics", "nc", "prj", "sfx", "eventsource", "fetch",
"stylesheet", "websocket", "xhr", "font", "manifest",
"ics", "nc", "nc4", "prj", "sfx", "eventsource", "fetch", "stylesheet",
"websocket", "xhr", "font", "manifest", "hdf", "geojson",
]
# fmt: on

@@ -42,6 +42,7 @@
allow=(),
deny=LINK_DENY_REGEX_STR,
deny_extensions=FILTER_EXTENSIONS,
tags=("a", "area", "va-link"), # specified to account for custom link tags
unique=True,
)
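A self-contained sketch of the combined effect of deny_extensions and the expanded tags tuple; the HTML, URLs, and the extension subset are invented for illustration:

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor

    extractor = LinkExtractor(
        deny_extensions=["nc4", "hdf", "geojson"],  # subset of FILTER_EXTENSIONS above
        tags=("a", "area", "va-link"),              # include the custom link tag
        unique=True,
    )

    html = b"""
    <html><body>
      <a href="/about">About</a>
      <va-link href="/benefits">Benefits</va-link>
      <a href="/data/grid.nc4">Raw data</a>
    </body></html>
    """
    response = HtmlResponse(url="https://example.gov/", body=html, encoding="utf-8")
    print([link.url for link in extractor.extract_links(response)])
    # -> ['https://example.gov/about', 'https://example.gov/benefits']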

