diff --git a/.circleci/config.yml b/.circleci/config.yml old mode 100644 new mode 100755 diff --git a/.codeclimate.yml b/.codeclimate.yml old mode 100644 new mode 100755 diff --git a/.github/dependabot.yml b/.github/dependabot.yml old mode 100644 new mode 100755 index 192df5b..2caf7fc --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -16,4 +16,4 @@ updates: patterns: - "freezegun" - "pylint" - - "pytest" + - "pytest*" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 index e056019..10b1cfe --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,6 @@ repos: - id: check-added-large-files # prevents giant files from being committed. - id: check-case-conflict # checks for files that would conflict in case-insensitive filesystems. - id: check-json # checks json files for parseable syntax. - - id: check-shebang-scripts-are-executable # ensures that (non-binary) files with a shebang are executable. - id: check-merge-conflict # checks for files that contain merge conflict strings. - id: check-symlinks # checks for symlinks which do not point to anything. - id: check-yaml # checks yaml files for parseable syntax. diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index bd7c2e0..94b760c --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Make sure to run `pip install -r requirements.txt` and `playwright install` befo 1. Navigate to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory 2. Enter one of two following commands: - * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in the “FEEDS” variable of the [*settings.py*](search_gov_crawler/search_gov_spiders/settings.py) file: + * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in [*pipelines.py*](search_gov_crawler/search_gov_spiders/pipelines.py): $ scrapy runspider diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/appspec.yml b/appspec.yml new file mode 100755 index 0000000..87244e0 --- /dev/null +++ b/appspec.yml @@ -0,0 +1,25 @@ +version: 0.0 +os: linux +permissions: + - object: .
+ mode: 777 + acls: + - "d:u::rwx" + - "d:g::rwx" + - "d:o::rwx" + owner: search + type: + - directory +hooks: + AfterInstall: + - location: cicd-scripts/app_install.sh + timeout: 600 + runas: search + ApplicationStart: + - location: cicd-scripts/app_start.sh + timeout: 300 + runas: search + ApplicationStop: + - location: cicd-scripts/app_stop.sh + timeout: 300 + runas: search diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh new file mode 100755 index 0000000..5179308 --- /dev/null +++ b/cicd-scripts/app_install.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh + +### VARIABLES ### +SPIDER_PYTHON_VERSION=3.12 +_CURRENT_BUILD_DIR=${PWD} +VENV_DIR=./venv + +### FUNCTIONS ### + +# Stop spider services +stop_services() { + echo "Running app_stop.sh..." + ensure_executable "./cicd-scripts/app_stop.sh" +} + +# Install missing system dependencies +install_system_dependencies() { + echo "Installing system dependencies..." + sudo apt-get update -y + sudo apt-get install -y \ + lzma liblzma-dev libbz2-dev python-setuptools \ + acl build-essential checkinstall libreadline-dev \ + libncursesw5-dev libssl-dev libsqlite3-dev tk-dev \ + libgdbm-dev libc6-dev zlib1g-dev libffi-dev openssl +} + +# Install Python +install_python() { + echo "Installing Python ${SPIDER_PYTHON_VERSION}..." + cd /usr/src + wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo chown -R $(whoami) ./Python-${SPIDER_PYTHON_VERSION}.0 + cd Python-${SPIDER_PYTHON_VERSION}.0 + ./configure --enable-optimizations + make + make install + make altinstall + cd "$_CURRENT_BUILD_DIR" + echo "Python ${SPIDER_PYTHON_VERSION} installed successfully." +} + +# Check and install Python if needed +check_python() { + if ! command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then + install_python + else + echo "Python ${SPIDER_PYTHON_VERSION} already installed: $(python${SPIDER_PYTHON_VERSION} --version)" + fi +} + +# Set environment paths +update_pythonpath() { + ensure_executable "./cicd-scripts/helpers/update_pythonpath.sh" +} + +# Setup virtual environment +setup_virtualenv() { + echo "Setting up virtual environment..." + python${SPIDER_PYTHON_VERSION} -m venv "$VENV_DIR" + source "$VENV_DIR/bin/activate" + python -m pip install --upgrade pip +} + +# Install dependencies +install_dependencies() { + echo "Installing dependencies..." + python -m pip install --upgrade -r ./search_gov_crawler/requirements.txt + echo "Installing Playwright..." + python -m pip install --upgrade pytest-playwright playwright + playwright install --with-deps + deactivate +} + +# Configure permissions +configure_permissions() { + echo "Configuring file permissions..." + chmod -R 777 . + chown -R "$(whoami)" . + sudo setfacl -Rdm g:dgsearch:rwx . +} + +# Manage cron jobs +manage_cron_jobs() { + echo "Managing cron jobs..." + crontab -l | grep -v 'app_start.sh' > temp_cron || true + echo "@reboot $(pwd)/cicd-scripts/app_start.sh" >> temp_cron + crontab temp_cron + rm temp_cron + echo "Cron jobs updated." +} + +# Start monitoring agents +start_agents() { + echo "Starting AWS CloudWatch agent..." + ensure_executable "./cicd-scripts/helpers/check_cloudwatch.sh" + + echo "Starting AWS CodeDeploy agent..." 
+ ensure_executable "./cicd-scripts/helpers/check_codedeploy.sh" +} + +### SCRIPT EXECUTION ### + +# Stop running services +stop_services + +# Install system dependencies +install_system_dependencies + +# Check and install Python if missing +check_python + +# Set environment paths +update_pythonpath + +# Configure permissions +configure_permissions + +# Setup and activate virtual environment +setup_virtualenv + +# Install dependencies +install_dependencies + +# Start AWS agents +start_agents + +# Manage cron jobs +manage_cron_jobs + +echo "App installation completed successfully." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh new file mode 100755 index 0000000..76b0081 --- /dev/null +++ b/cicd-scripts/app_start.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh + +# TODO: Make it part of the local env variable that is set by Ansible +SPIDER_RUN_WITH_UI=false + +# Determine which script to run based on the SPIDER_RUN_WITH_UI flag +if $SPIDER_RUN_WITH_UI; then + SCRIPT="./cicd-scripts/helpers/run_with_ui.sh" +else + SCRIPT="./cicd-scripts/helpers/run_without_ui.sh" +fi + +# Ensure the script exists, is executable, and run it +ensure_executable "$SCRIPT" diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh new file mode 100755 index 0000000..9d536a8 --- /dev/null +++ b/cicd-scripts/app_stop.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh + +### FUNCTIONS ### + +# Remove virtual environment if it exists +remove_venv() { + if [ -d ./venv ]; then + echo "Removing virtual environment..." + rm -rf ./venv/ + fi +} + +# Purge pip cache +purge_pip_cache() { + echo "Purging pip cache..." + rm -rf ~/.cache/pip /root/.cache/pip +} + +# Stop scrapy scheduler if running +stop_scrapy_scheduler() { + echo "Stopping scrapy_scheduler.py (if running)..." + ensure_executable "./cicd-scripts/helpers/kill_scheduler.sh" +} + +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks() { + echo "Stopping all scrapyd and scrapydweb tasks..." + + # Kill scrapydweb tasks + if pkill -f "scrapydweb" 2>/dev/null; then + echo "scrapydweb tasks stopped." + else + echo "No scrapydweb tasks running." + fi + + # Kill scrapyd tasks + if pkill -f "scrapyd" 2>/dev/null; then + echo "scrapyd tasks stopped." + else + echo "No scrapyd tasks running." + fi +} + +# Display remaining scrapy processes +display_remaining_scrapy_processes() { + echo -e "\nRemaining scrapy processes (if any):" + ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." +} + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs() { + echo "Force killing remaining scrapy background jobs..." + if ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9; then + echo "Remaining scrapy jobs killed." + else + echo "No remaining scrapy jobs to kill." + fi +} + +# Remove nohup jobs (python scripts) +remove_nohup_jobs() { + echo "Removing nohup jobs (python)..." 
+ ps -ef | grep nohup | grep -v grep | awk '{print $2}' | xargs kill -9 +} + +# Remove cron job entries referencing the given string +remove_cron_entry() { + if [ -z "$1" ]; then + echo "Error: No cron entry provided." + return + fi + + local CRON_ENTRY="$1" + local CRON_USER=$(whoami) + + echo "Removing cron job entries referencing: $CRON_ENTRY" + + # Remove cron job for the current user (including the full path if needed) + sudo crontab -l -u "$CRON_USER" 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab -u "$CRON_USER" - + + echo "Cron job entries for '$CRON_ENTRY' removed." +} + +### SCRIPT EXECUTION ### + +# Remove virtual environment +remove_venv + +# Purge pip cache +purge_pip_cache + +# Stop scrapy scheduler if running +stop_scrapy_scheduler + +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks + +# Display remaining scrapy processes (if any) +display_remaining_scrapy_processes + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs + +# Remove nohup jobs (python) +remove_nohup_jobs + +# Remove specific cron jobs +remove_cron_entry "check_cloudwatch.sh" +remove_cron_entry "check_codedeploy.sh" +remove_cron_entry "app_start.sh" + +echo "App stop completed successfully." diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh new file mode 100755 index 0000000..487d122 --- /dev/null +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CloudWatch agent is running +check_cloudwatch() { + if ! pgrep -f amazon-cloudwatch-agent > /dev/null; then + echo "AWS CloudWatch agent is not running. Starting it now..." + sudo service amazon-cloudwatch-agent start + if [ $? -eq 0 ]; then + echo "AWS CloudWatch agent started successfully." + else + echo "Failed to start AWS CloudWatch agent." + fi + else + echo "AWS CloudWatch agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_cloudwatch.sh" + + # Update crontab, ensuring no duplicates + (crontab -l 2>/dev/null | grep -v -F "check_cloudwatch.sh"; echo "$CRON_ENTRY") | crontab - + echo "Crontab entry added to ensure the script runs on reboot." +} + +# Execute the function +check_cloudwatch + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh new file mode 100755 index 0000000..6e6cf15 --- /dev/null +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CodeDeploy agent is running +check_codedeploy() { + if ! pgrep -f codedeploy-agent > /dev/null; then + echo "AWS CodeDeploy agent is not running. Starting it now..." + sudo service codedeploy-agent start + if [ $? -eq 0 ]; then + echo "AWS CodeDeploy agent started successfully." + else + echo "Failed to start AWS CodeDeploy agent." + fi + else + echo "AWS CodeDeploy agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + chmod +x ./cicd-scripts/helpers/check_codedeploy.sh + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_codedeploy.sh" + + # Update crontab, ensuring no duplicates + (crontab -l 2>/dev/null | grep -v -F "check_codedeploy.sh"; echo "$CRON_ENTRY") | crontab - + echo "Crontab entry added to ensure the script runs on reboot." 
+} + +# Execute the function +check_codedeploy + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/ensure_executable.sh b/cicd-scripts/helpers/ensure_executable.sh new file mode 100755 index 0000000..88e6439 --- /dev/null +++ b/cicd-scripts/helpers/ensure_executable.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Function to ensure a file exists, is executable, and then runs it +ensure_executable() { + local script="$1" + + if [ -f "$script" ]; then + chmod +x "$script" + echo "$script is now executable." + source "$script" + else + echo "Error: $script not found!" + # exit 1 + fi +} diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh new file mode 100755 index 0000000..4c559da --- /dev/null +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Find the process ID of the running scrapy_scheduler.py script +echo "Searching for scrapy_scheduler.py process..." +PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") + +# Check if the process ID was found +if [ -n "$PROCESS_ID" ]; then + # Kill the process + echo "Killing process with PID: $PROCESS_ID" + kill "$PROCESS_ID" 2>/dev/null + + # Pause to allow the process to terminate + sleep 3 + + # Verify if the process was killed + if ! kill -0 "$PROCESS_ID" 2>/dev/null; then + echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." + else + echo "Failed to terminate the process or process no longer exists." + fi +else + echo "No running process found for scrapy_scheduler.py." +fi diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh new file mode 100755 index 0000000..de181e2 --- /dev/null +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +SCRAPYD_URL="http://127.0.0.1:6800/" +SCRAPYDWEB_URL="http://127.0.0.1:5000/" +SPIDER_URLS_API=https://staging.search.usa.gov/urls + +# Function to check if a URL is up and running +function check_url() { + local URL=$1 + local MAX_ATTEMPTS="${2:-3}" + local DELAY=5 + local attempt=1 + + while [ $attempt -le $MAX_ATTEMPTS ]; do + if curl --output /dev/null --silent --head --fail "$URL"; then + echo "Service at $URL is up on attempt $attempt." + return 0 + else + echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." + fi + attempt=$((attempt+1)) + sleep $DELAY + done + + echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." + return 1 +} + +# Function to check if required command exists +function check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 is not installed or not in your PATH." + exit 1 + fi +} + +check_command "scrapyd" +check_command "scrapydweb" +check_command "curl" + +echo "Killing any existing scrapyd and scrapydweb services" +sudo pkill -f "scrapydweb" 2>/dev/null +sudo pkill -f "scrapyd" 2>/dev/null + +echo "Running searchgov-spider application..." + +# Start scrapyd +echo "Starting scrapyd service..." +sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' +PID1=$! +echo "Started scrapyd with PID $PID1" + +# Check if scrapyd is running +if check_url "$SCRAPYD_URL"; then + echo "The scrapyd service is running at $SCRAPYD_URL" + sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' + PID2=$! + echo "Started scrapydweb with PID $PID2" + + if check_url "$SCRAPYDWEB_URL"; then + echo "The scrapydweb service is running at $SCRAPYDWEB_URL" + else + echo "Error: scrapydweb failed at $SCRAPYDWEB_URL."
+ return + fi +else + echo "Error: scrapyd failed at $SCRAPYD_URL." + return +fi + +# Display the last few lines of logs +echo -e "\n-- Last 10 lines of scrapyd.log:\n" +tail -n 10 /var/log/scrapyd.log + +echo -e "\n-- Last 10 lines of scrapydweb.log:\n" +tail -n 10 /var/log/scrapydweb.log diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh new file mode 100755 index 0000000..247488b --- /dev/null +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Run the script in the background using the virtual environment +chmod +x ./search_gov_crawler/scrapy_scheduler.py + +sudo nohup bash -c "source ./venv/bin/activate && ./venv/bin/python ./search_gov_crawler/scrapy_scheduler.py" > /var/log/scrapy_scheduler.log 2>&1 & + +echo "Running no UI version of searchgov-spider..." diff --git a/cicd-scripts/helpers/update_pythonpath.sh b/cicd-scripts/helpers/update_pythonpath.sh new file mode 100755 index 0000000..e742b55 --- /dev/null +++ b/cicd-scripts/helpers/update_pythonpath.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Define the current directory +CURRENT_DIR=$(pwd) + +# Define the .bashrc file location +BASHRC_FILE="$HOME/.bashrc" + +# Check if .bashrc contains an export PYTHONPATH line +if grep -q "^export PYTHONPATH=" "$BASHRC_FILE"; then + # Extract the existing PYTHONPATH line + PYTHONPATH_LINE=$(grep "^export PYTHONPATH=" "$BASHRC_FILE") + + # Check if the current directory is already included + if echo "$PYTHONPATH_LINE" | grep -q "$CURRENT_DIR"; then + echo "PYTHONPATH already includes the current directory: $CURRENT_DIR" + else + # Ensure the updated line includes the starting and ending quotes + CURRENT_PATHS=$(echo "$PYTHONPATH_LINE" | sed -e 's/^export PYTHONPATH=//' -e 's/^"//' -e 's/"$//') + UPDATED_LINE="export PYTHONPATH=\"${CURRENT_PATHS}:${CURRENT_DIR}\"" + sed -i "s|^export PYTHONPATH=.*|$UPDATED_LINE|" "$BASHRC_FILE" + echo "Updated PYTHONPATH to include the current directory: $CURRENT_DIR" + fi +else + # Add a new export PYTHONPATH line to .bashrc + echo "export PYTHONPATH=\"\$PYTHONPATH:${CURRENT_DIR}\"" >> "$BASHRC_FILE" + echo "Added new PYTHONPATH to .bashrc including the current directory: $CURRENT_DIR" +fi + +# Apply changes for the current session +export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${CURRENT_DIR}" + +echo "PYTHONPATH changes applied:" +echo $PYTHONPATH diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 diff --git a/search_gov_crawler/__init__.py b/search_gov_crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search_gov_crawler/benchmark.py b/search_gov_crawler/benchmark.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/output/.gitignore b/search_gov_crawler/output/.gitignore old mode 100644 new mode 100755 diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt old mode 100644 new mode 100755 index 645c8f8..19790be --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -2,6 +2,7 @@ freezegun==1.5.1 pylint==3.3.2 pytest==8.3.4 pytest-console-scripts==1.4.1 +pytest-mock==3.14.0 python-json-logger==3.2.0 scrapy==2.11.2 diff --git a/search_gov_crawler/scrapy.cfg b/search_gov_crawler/scrapy.cfg old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapy_scheduler.py b/search_gov_crawler/scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapyd-logs/.gitignore b/search_gov_crawler/scrapyd-logs/.gitignore old mode 100644 new
mode 100755 diff --git a/search_gov_crawler/scrapyd.conf b/search_gov_crawler/scrapyd.conf old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapydweb_settings_v10.py b/search_gov_crawler/scrapydweb_settings_v10.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_logparser/__init__.py b/search_gov_crawler/search_gov_logparser/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapyd/__init__.py b/search_gov_crawler/search_gov_scrapyd/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapydweb/__init__.py b/search_gov_crawler/search_gov_scrapydweb/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/__init__.py b/search_gov_crawler/search_gov_spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja old mode 100644 new mode 100755 index 656c94b..47f6bff --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja @@ -23,5 +23,3 @@ - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css old mode 100644 new mode 100755 index 2f13050..7d63694 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css @@ -83,5 +83,3 @@ table{max-width:100%;background-color:transparent;border-collapse:collapse;borde .icon,.icon-big {display:inline-block;} .icon {width:34px;height:34px;} .icon-big {width:140px;height:140px;} - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css old mode 100644 new mode 100755 index bd1e39d..124bd07 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css @@ -24,4 +24,4 @@ table.report-container td {padding: 40px 20px;} .report-section h2 {margin: 0 0 20px 0;padding: 0 0 12px 0;line-height: 20px;border-bottom: 1px solid #f4f4f4;} .report-section h3 {margin: 25px 0 5px 0;line-height: 24px;} .report-section h4 {margin: 0 0 2px 0;} -.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} \ No newline at end of file +.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} diff --git a/search_gov_crawler/search_gov_spiders/actions/results.css b/search_gov_crawler/search_gov_spiders/actions/results.css old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/results.jinja b/search_gov_crawler/search_gov_spiders/actions/results.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/__init__.py b/search_gov_crawler/search_gov_spiders/extensions/__init__.py old mode 100644 new 
mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/json_logging.py b/search_gov_crawler/search_gov_spiders/extensions/json_logging.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/__init__.py b/search_gov_crawler/search_gov_spiders/helpers/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py b/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/items.py b/search_gov_crawler/search_gov_spiders/items.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/middlewares.py b/search_gov_crawler/search_gov_spiders/middlewares.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/monitors.py b/search_gov_crawler/search_gov_spiders/monitors.py old mode 100644 new mode 100755 index 260dd94..7dafd2a --- a/search_gov_crawler/search_gov_spiders/monitors.py +++ b/search_gov_crawler/search_gov_spiders/monitors.py @@ -14,4 +14,4 @@ class PeriodicMonitorSuite(MonitorSuite): monitors_failed_actions = [ CreateCustomFileReport, SendSmtpEmail - ] \ No newline at end of file + ] diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py old mode 100644 new mode 100755 index b626072..6cbc5e8 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -2,46 +2,99 @@ Don't forget to add your pipeline to the ITEM_PIPELINES setting See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html """ - import os from pathlib import Path +import requests from scrapy.exceptions import DropItem + class SearchGovSpidersPipeline: """ - Class for pipeline that takes items and adds them - to output file with a max size of 3.9MB + Pipeline that writes items to files (rotated at ~3.9MB) or sends batched POST requests + to SPIDER_URLS_API if the environment variable is set. """ - def __init__(self, *_args, **_kwargs): - self.current_file_size = 0 + MAX_FILE_SIZE_BYTES = int(3.9 * 1024 * 1024) # 3.9MB in bytes + APP_PID = os.getpid() + + def __init__(self): + self.api_url = os.environ.get("SPIDER_URLS_API") + self.urls_batch = [] self.file_number = 1 - self.parent_file_path = Path(__file__).parent.parent.resolve() - self.base_path_name = str(self.parent_file_path / f"output/all-links-p{os.getpid()}.csv") - self.short_file = open(self.base_path_name, "a", encoding="utf-8") - self.max_file_size = 39000000 #3.9MB max - self.paginate = True + self.file_path = None + self.current_file = None + + if not self.api_url: + output_dir = Path(__file__).parent.parent / "output" + output_dir.mkdir(parents=True, exist_ok=True) + base_filename = f"all-links-p{self.APP_PID}" + self.file_path = output_dir / f"{base_filename}.csv" + self.current_file = open(self.file_path, "a", encoding="utf-8") + + def process_item(self, item, spider): + """Handle each item by writing to file or batching URLs for an API POST.""" + url = item.get("url", "") + if not url: + raise DropItem("Missing URL in item") + + if self.api_url: + self._process_api_item(url, spider) + else: + self._process_file_item(url) - def process_item(self, item, _spider): - """Checks that the file is not at max size. 
- Adds it to the file if less, or creates a new file if too large.""" - line = item["url"] - self.current_file_size += 1 - file_stats = os.stat(self.base_path_name) - self.current_file_size += file_stats.st_size - next_file_size = self.current_file_size + len(line) - if self.paginate and next_file_size > self.max_file_size: - self.short_file.close() - new_name = str(self.parent_file_path / f"output/all-links-p{os.getpid()}-{self.file_number}.csv") - os.rename(self.base_path_name, new_name) - self.file_number = self.file_number + 1 - self.short_file = open(self.base_path_name, "w", encoding="utf-8") - self.current_file_size = 0 - self.short_file.write(line) - self.short_file.write("\n") - self.current_file_size = self.current_file_size + len(line) return item + def _process_api_item(self, url, spider): + """Batch URLs for API and send POST if size limit is reached.""" + self.urls_batch.append(url) + if self._batch_size() >= self.MAX_FILE_SIZE_BYTES: + self._send_post_request(spider) + + def _process_file_item(self, url): + """Write URL to file and rotate the file if size exceeds the limit.""" + self.current_file.write(f"{url}\n") + if self._file_size() >= self.MAX_FILE_SIZE_BYTES: + self._rotate_file() + + def _batch_size(self): + """Calculate total size of the batched URLs.""" + return sum(len(url.encode("utf-8")) for url in self.urls_batch) + + def _file_size(self): + """Get the current file size.""" + self.current_file.flush() # Ensure the OS writes buffered data to disk + return self.file_path.stat().st_size + + def _rotate_file(self): + """Close the current file, rename it, and open a new one.""" + self.current_file.close() + rotated_file = self.file_path.with_name(f"{self.file_path.stem}-{self.file_number}.csv") + os.rename(self.file_path, rotated_file) + self.current_file = open(self.file_path, "a", encoding="utf-8") + self.file_number += 1 + + def _send_post_request(self, spider): + """Send a POST request with the batched URLs.""" + if not self.urls_batch: + return + + try: + response = requests.post(self.api_url, json={"urls": self.urls_batch}) + response.raise_for_status() + spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}") + except requests.RequestException as e: + spider.logger.error(f"Failed to send URLs to {self.api_url}: {e}") + raise DropItem(f"POST request failed: {e}") + finally: + self.urls_batch.clear() + + def close_spider(self, spider): + """Finalize operations: close files or send remaining batched URLs.""" + if self.api_url: + self._send_post_request(spider) + elif self.current_file: + self.current_file.close() + class DeDeuplicatorPipeline: """Class for pipeline that removes duplicate items""" diff --git a/search_gov_crawler/search_gov_spiders/settings.py b/search_gov_crawler/search_gov_spiders/settings.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/__init__.py b/search_gov_crawler/search_gov_spiders/spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py b/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py b/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/README.md b/search_gov_crawler/search_gov_spiders/utility_files/README.md old mode 100644 new mode 100755 diff --git 
a/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json b/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py b/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py b/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist b/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist old mode 100644 new mode 100755 diff --git a/search_gov_crawler/setup.py b/search_gov_crawler/setup.py old mode 100644 new mode 100755 diff --git a/setup.cfg b/setup.cfg old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/integration_tests/test_scrapyd.py b/tests/integration_tests/test_scrapyd.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/conftest.py b/tests/search_gov_spiders/conftest.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/crawl-sites-test.json b/tests/search_gov_spiders/crawl-sites-test.json old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_deduplicator_pipeline.py b/tests/search_gov_spiders/test_deduplicator_pipeline.py new file mode 100644 index 0000000..55ee3e2 --- /dev/null +++ b/tests/search_gov_spiders/test_deduplicator_pipeline.py @@ -0,0 +1,146 @@ +import os +import pytest +from contextlib import suppress +from unittest.mock import MagicMock, patch +from scrapy.exceptions import DropItem +from search_gov_crawler.search_gov_spiders.pipelines import ( + SearchGovSpidersPipeline, + DeDeuplicatorPipeline, +) +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem +# --------------------------- +# Fixtures +# --------------------------- + +@pytest.fixture +def sample_item(): + """Fixture for a valid sample item.""" + return {"url": "http://example.com"} + +@pytest.fixture +def invalid_item(): + """Fixture for an invalid item with no URL.""" + return {} + +@pytest.fixture +def sample_spider(): + """Fixture for a mock spider with a logger.""" + class SpiderMock: + logger = MagicMock() + return SpiderMock() + +@pytest.fixture +def pipeline_no_api(): + """Fixture for 
SearchGovSpidersPipeline with no SPIDER_URLS_API.""" + with patch.dict(os.environ, {}, clear=True): + return SearchGovSpidersPipeline() + +@pytest.fixture +def pipeline_with_api(): + """Fixture for SearchGovSpidersPipeline with SPIDER_URLS_API set.""" + with patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}): + return SearchGovSpidersPipeline() + +@pytest.fixture +def deduplicator_pipeline(): + """Fixture for DeDeuplicatorPipeline with clean state.""" + return DeDeuplicatorPipeline() + +# --------------------------- +# Tests for SearchGovSpidersPipeline +# --------------------------- + +def test_missing_url_in_item(pipeline_no_api, sample_spider, invalid_item): + """ + Verify DropItem exception is raised when an item has no URL. + """ + with pytest.raises(DropItem, match="Missing URL in item"): + pipeline_no_api.process_item(invalid_item, sample_spider) + +# --------------------------- +# Tests for DeDeuplicatorPipeline +# --------------------------- + +@pytest.mark.parametrize( + "item", + [ + {"url": "http://example.com/1"}, + {"url": "http://example.com/2"}, + ], +) +def test_deduplicator_pipeline_unique_items(deduplicator_pipeline, item): + """ + Verify that unique items are processed successfully. + """ + result = deduplicator_pipeline.process_item(item, None) + assert result == item + + +def test_deduplicator_pipeline_duplicate_item(deduplicator_pipeline, sample_item): + """ + Verify that duplicate items raise DropItem. + """ + deduplicator_pipeline.process_item(sample_item, None) # First time should pass + + with pytest.raises(DropItem, match="Item already seen!"): + deduplicator_pipeline.process_item(sample_item, None) # Duplicate raises DropItem + + +def test_deduplicator_pipeline_multiple_items(deduplicator_pipeline): + """ + Verify that multiple unique items are processed without errors. + """ + item1 = {"url": "http://example.com/1"} + item2 = {"url": "http://example.com/2"} + + result1 = deduplicator_pipeline.process_item(item1, None) + result2 = deduplicator_pipeline.process_item(item2, None) + + assert result1 == item1 + assert result2 == item2 + + +def test_deduplicator_pipeline_clean_state(): + """ + Verify that a new instance of DeDeuplicatorPipeline starts with a clean state. 
+ """ + pipeline1 = DeDeuplicatorPipeline() + pipeline2 = DeDeuplicatorPipeline() + + item = {"url": "http://example.com/1"} + + # First pipeline processes the item + result = pipeline1.process_item(item, None) + assert result == item + + # Second pipeline should also process the same item as it has a clean state + result = pipeline2.process_item(item, None) + assert result == item + +@pytest.mark.parametrize( + ("items", "urls_seen_length"), + [ + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/2"), + ], + 2, + ), + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/1"), + ], + 1, + ), + ], +) +def test_deduplicator_pipeline(items, urls_seen_length): + pl = DeDeuplicatorPipeline() + + with suppress(DropItem): + for item in items: + pl.process_item(item, None) + + assert len(pl.urls_seen) == urls_seen_length diff --git a/tests/search_gov_spiders/test_extensions.py b/tests/search_gov_spiders/test_extensions.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py old mode 100644 new mode 100755 index 8ce41a0..dac5305 --- a/tests/search_gov_spiders/test_full_crawl.py +++ b/tests/search_gov_spiders/test_full_crawl.py @@ -111,13 +111,12 @@ def test_full_crawl(mock_scrapy_settings, monkeypatch, spider, use_dedup, crawl_ temp_dir.joinpath("output").mkdir(exist_ok=True) def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): - pipeline_cls.current_file_size = 0 + pipeline_cls.api_url = None pipeline_cls.file_number = 1 pipeline_cls.parent_file_path = temp_dir - pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") - pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") - pipeline_cls.max_file_size = max_file_size - pipeline_cls.paginate = True + pipeline_cls.base_file_name = temp_dir / "output" / "all-links-p1234.csv" + pipeline_cls.file_path = pipeline_cls.base_file_name + pipeline_cls.current_file = open(pipeline_cls.file_path, "w", encoding="utf-8") monkeypatch.setattr( "search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.__init__", mock_init @@ -132,7 +131,7 @@ def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): with open(output_file.name, encoding="UTF") as f: links = json.load(f) - split_files = list(temp_dir.glob("all-links*.csv")) + split_files = list(temp_dir.glob("all-links-p*.csv")) # verify total links match expected assert len(links) == expected_results diff --git a/tests/search_gov_spiders/test_helpers.py b/tests/search_gov_spiders/test_helpers.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_middlewares.py b/tests/search_gov_spiders/test_middlewares.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_pipelines.py b/tests/search_gov_spiders/test_pipelines.py deleted file mode 100644 index 0b85135..0000000 --- a/tests/search_gov_spiders/test_pipelines.py +++ /dev/null @@ -1,36 +0,0 @@ -from contextlib import suppress - -import pytest -from scrapy.exceptions import DropItem - -from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem -from search_gov_crawler.search_gov_spiders.pipelines import DeDeuplicatorPipeline - - -@pytest.mark.parametrize( - ("items", "urls_seen_length"), - [ - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - 
SearchGovSpidersItem(url="https://www.example.com/2"), - ], - 2, - ), - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - SearchGovSpidersItem(url="https://www.example.com/1"), - ], - 1, - ), - ], -) -def test_deduplicator_pipeline(items, urls_seen_length): - pl = DeDeuplicatorPipeline() - - with suppress(DropItem): - for item in items: - pl.process_item(item, None) - - assert len(pl.urls_seen) == urls_seen_length diff --git a/tests/search_gov_spiders/test_scrapy_scheduler.py b/tests/search_gov_spiders/test_scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_spider.py b/tests/search_gov_spiders/test_spider.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py new file mode 100755 index 0000000..319b547 --- /dev/null +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -0,0 +1,113 @@ +import os + +import pytest +from scrapy import Spider +from scrapy.utils.test import get_crawler + +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem +from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline + + +@pytest.fixture(name="sample_spider") +def fixture_sample_spider(): + crawler = get_crawler(Spider) + return crawler._create_spider( + name="urls_test", allowed_domains="example.com", allowed_domain_paths="https://www.example.com" + ) + + +@pytest.fixture(name="sample_item") +def fixture_sample_item() -> SearchGovSpidersItem: + """Fixture for a sample item with a URL.""" + item = SearchGovSpidersItem() + item["url"] = "http://example.com" + return item + + +@pytest.fixture(name="mock_open") +def fixture_mock_open(mocker): + return mocker.patch("builtins.open", mocker.mock_open()) + + +@pytest.fixture(name="pipeline_no_api") +def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: + mocker.patch.dict(os.environ, {}) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) + return SearchGovSpidersPipeline() + + +@pytest.fixture(name="pipeline_with_api") +def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: + """Fixture for pipeline with an API URL set.""" + mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) + + return SearchGovSpidersPipeline() + + +def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): + """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mocker.patch.object(SearchGovSpidersPipeline, "_file_size", return_value=100) + pipeline_no_api.process_item(sample_item, sample_spider) + + # Ensure file is opened and written to + mock_open.assert_called_once_with(pipeline_no_api.file_path, "a", encoding="utf-8") + mock_open().write.assert_any_call(sample_item["url"] + "\n") + + +def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): + """Test that URLs are batched and sent via POST when SPIDER_URLS_API is set.""" + mock_post = mocker.patch("requests.post") + + pipeline_with_api.process_item(sample_item, sample_spider) + + # Check that the batch contains the URL + assert sample_item["url"] in pipeline_with_api.urls_batch + + # Simulate max size to force post + mocker.patch.object( + SearchGovSpidersPipeline, + "_batch_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) + 
pipeline_with_api.process_item(sample_item, sample_spider) + + # Ensure POST request was made + mock_post.assert_called_once_with("http://mockapi.com", json={"urls": pipeline_with_api.urls_batch}) + + +def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): + """Test that file rotation occurs when max size is exceeded.""" + mock_rename = mocker.patch("os.rename") + mocker.patch.object( + SearchGovSpidersPipeline, + "_file_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) + pipeline_no_api.process_item(sample_item, None) + + # Check if the file was rotated + mock_open.assert_called_with(pipeline_no_api.file_path, "a", encoding="utf-8") + mock_open().close.assert_called() + mock_rename.assert_called_once() + + +def test_post_urls_on_spider_close(pipeline_with_api, sample_spider, mocker): + """Test that remaining URLs are posted when spider closes and SPIDER_URLS_API is set.""" + mock_post = mocker.patch("requests.post") + + pipeline_with_api.urls_batch = ["http://example.com"] + + pipeline_with_api.close_spider(sample_spider) + + # Ensure POST request was made on spider close, cannot verify json once urls_batch is cleared + mock_post.assert_called_once_with("http://mockapi.com", json=mocker.ANY) + + +def test_close_file_on_spider_close(pipeline_no_api, mock_open): + """Test that the file is closed when the spider closes and no SPIDER_URLS_API is set.""" + + pipeline_no_api.close_spider(None) + + # Ensure the file is closed + mock_open().close.assert_called_once() diff --git a/tests/search_gov_spiders/test_utiliity_files.py b/tests/search_gov_spiders/test_utiliity_files.py old mode 100644 new mode 100755
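The rewritten SearchGovSpidersPipeline above has two output modes: with SPIDER_URLS_API unset it appends URLs to all-links-p<pid>.csv under the output directory and rotates the file near 3.9MB; with SPIDER_URLS_API set it batches URLs in memory and POSTs them as {"urls": [...]}. The following is a minimal sketch of driving the pipeline directly, outside a crawl, assuming the repository root is on PYTHONPATH; the stub spider and the localhost endpoint are illustrative stand-ins, not part of the change set.

import logging
import os

from scrapy.exceptions import DropItem

from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline


class StubSpider:
    # The pipeline only touches spider.logger, so a bare logger is enough here.
    logger = logging.getLogger("stub_spider")


spider = StubSpider()

# File mode: SPIDER_URLS_API unset, URLs are appended to output/all-links-p<pid>.csv.
os.environ.pop("SPIDER_URLS_API", None)
file_pipeline = SearchGovSpidersPipeline()
file_pipeline.process_item({"url": "https://www.example.com/a"}, spider)
file_pipeline.close_spider(spider)

# API mode: SPIDER_URLS_API set, URLs are batched and POSTed as {"urls": [...]}.
os.environ["SPIDER_URLS_API"] = "http://127.0.0.1:8080/urls"  # placeholder endpoint
api_pipeline = SearchGovSpidersPipeline()
api_pipeline.process_item({"url": "https://www.example.com/b"}, spider)
try:
    api_pipeline.close_spider(spider)  # flushes the remaining batch
except DropItem as err:
    # Raised by the pipeline when the POST fails (e.g. nothing listening locally).
    print(f"POST failed: {err}")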
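On the receiving side, _send_post_request defines the contract: a JSON body of the form {"urls": [...]} sent to the SPIDER_URLS_API endpoint. A throwaway local receiver for exercising that contract could look like the sketch below; it is an assumption for local experiments only and says nothing about how the real staging /urls endpoint is implemented.

import json
from http.server import BaseHTTPRequestHandler, HTTPServer


class UrlBatchHandler(BaseHTTPRequestHandler):
    """Accepts the pipeline's {"urls": [...]} batches and acknowledges them."""

    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        print(f"received {len(payload.get('urls', []))} urls")
        self.send_response(200)  # a 2xx response satisfies raise_for_status() in the pipeline
        self.end_headers()


if __name__ == "__main__":
    # Pair with SPIDER_URLS_API=http://127.0.0.1:8080/urls on the crawler side.
    HTTPServer(("127.0.0.1", 8080), UrlBatchHandler).serve_forever()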