From 0488698e493fd783f96abd06d0ce60de683043f3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 4 Sep 2024 13:47:59 -0400 Subject: [PATCH 01/40] added build spec --- buildspec.yaml | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 buildspec.yaml diff --git a/buildspec.yaml b/buildspec.yaml new file mode 100644 index 0000000..ba7d924 --- /dev/null +++ b/buildspec.yaml @@ -0,0 +1,40 @@ +version: 0.2 + +phases: + install: + runtime-versions: + python: 3.12 + commands: + - echo "Setting up Python virtual environment..." + - python -m venv venv + - source venv/bin/activate + - echo "Installing dependencies..." + - pip install --upgrade pip + - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt + + pre_build: + commands: + - echo "Pre-build phase - preparing environment..." + + build: + commands: + - echo "Running the domain_spider" + - cd ./search_gov_crawler + - pkill scrapy + - scrapy crawl domain_spider + - echo "Finished all non-js domain_spider urls" + - scrapy crawl domain_spider_js + - echo "Finished all js domain_spider urls" + + post_build: + commands: + - echo "Build completed!" + - deactivate + +artifacts: + files: + - '**/*' + +cache: + paths: + - '/root/.cache/pip' From da78798b19f6d884336c2095c7c63edcbd91459a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 4 Sep 2024 13:50:23 -0400 Subject: [PATCH 02/40] Changed name --- buildspec.yaml => buildspec_spider.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename buildspec.yaml => buildspec_spider.yml (100%) diff --git a/buildspec.yaml b/buildspec_spider.yml similarity index 100% rename from buildspec.yaml rename to buildspec_spider.yml From 67f19c95f61ca88751b4493e6e123eb1417e0a84 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 9 Sep 2024 12:05:58 -0400 Subject: [PATCH 03/40] added appspec file --- .pre-commit-config.yaml | 1 - appspec.yml | 34 +++++++++++++++++++++ cicd-scripts/app_install.sh | 60 +++++++++++++++++++++++++++++++++++++ cicd-scripts/app_start.sh | 19 ++++++++++++ cicd-scripts/app_stop.sh | 10 +++++++ 5 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 appspec.yml create mode 100644 cicd-scripts/app_install.sh create mode 100644 cicd-scripts/app_start.sh create mode 100644 cicd-scripts/app_stop.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e056019..10b1cfe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,6 @@ repos: - id: check-added-large-files # prevents giant files from being committed. - id: check-case-conflict # checks for files that would conflict in case-insensitive filesystems. - id: check-json # checks json files for parseable syntax. - - id: check-shebang-scripts-are-executable # ensures that (non-binary) files with a shebang are executable. - id: check-merge-conflict # checks for files that contain merge conflict strings. - id: check-symlinks # checks for symlinks which do not point to anything. - id: check-yaml # checks yaml files for parseable syntax. diff --git a/appspec.yml b/appspec.yml new file mode 100644 index 0000000..09660cb --- /dev/null +++ b/appspec.yml @@ -0,0 +1,34 @@ +version: 0.0 +os: linux +files: + - source: . 
+ destination: /home/ec2-user/spider +permissions: + - object: /home/ec2-user/spider/cicd-scripts/app_install.sh + owner: search + mode: 755 + type: + - file + - object: /home/ec2-user/spider/cicd-scripts/app_start.sh + owner: search + mode: 755 + type: + - file + - object: /home/ec2-user/spider/cicd-scripts/app_stop.sh + owner: search + mode: 755 + type: + - file +hooks: + AfterInstall: + - location: spider/cicd-scripts/app_install.sh + timeout: 300 + runas: search + ApplicationStart: + - location: spider/cicd-scripts/app_start.sh + timeout: 300 + runas: search + ApplicationStop: + - location: spider/cicd-scripts/app_stop.sh + timeout: 300 + runas: search diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh new file mode 100644 index 0000000..c1382d3 --- /dev/null +++ b/cicd-scripts/app_install.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +sudo apt-get update +sudo apt-get upgrade +sudo apt-get install python-setuptools +sudo apt-get install python-pip + +# Function to install Python 3.12 +install_python() { + echo "Installing Python 3.12..." + sudo apt update + sudo apt install -y build-essential checkinstall + sudo apt install -y libreadline-gplv2-dev libncursesw5-dev libssl-dev \ + libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + zlib1g-dev openssl libffi-dev + + # Download Python 3.12 source code + cd /usr/src + sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz + sudo tar xzf Python-3.12.0.tgz + + # Build and install + cd Python-3.12.0 + sudo ./configure --enable-optimizations + sudo make altinstall + + echo "Python 3.12 has been installed." +} + +# Check if Python 3.12 is installed +python_version=$(python3 --version 2>&1) + +if [[ $python_version == *"3.12"* ]]; then + echo "Python 3.12 is already installed: $python_version" +else + echo "Current Python version: $python_version" + echo "Installing Python 3.12..." + install_python +fi + + + +# Creating python3.12 virtual env + +pip install virtualenv + +cd /home/ec2-user/python-flask-service + +echo "Creating python3.12 virtual environment..." +python3.12 -m venv /home/ec2-user/app/venv +source /home/ec2-user/app/venv/bin/activate + +# Installing all spider dependencies +echo "Installing dependencies..." +pip install --upgrade pip + +# pip install -r /home/ec2-user/app/requirements.txt +pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt + +echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh new file mode 100644 index 0000000..8e67ce7 --- /dev/null +++ b/cicd-scripts/app_start.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +echo "Activating virtual environment..." +source /home/ec2-user/app/venv/bin/activate + +echo "Running searchgov-spider application..." +# python /home/ec2-user/app/app.py + +pkill scrapy + + +nohup scrapy crawl domain_spider & +nohup scrapy crawl domain_spider_js & + +echo "\nCurrent running crawl jobs:" +jobs + +echo "\noutput:" +cat nohup.out diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh new file mode 100644 index 0000000..03b209a --- /dev/null +++ b/cicd-scripts/app_stop.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Killing all scrapy tasks" +pkill scrapy + +# Kill all background jobs (by PID) +jobs -p | grep -o -E '\s\d+\s' | xargs kill + +echo "\nBelow jobs list should be empty:" +jobs From 33c09b30c41e89491e4523d53f7882de256c0c65 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Sep 2024 11:52:19 -0400 Subject: [PATCH 04/40] ... 
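
Remove the unused CodeBuild buildspec (buildspec_spider.yml) and point app_install.sh and app_start.sh at the /home/ec2-user/spider deployment path in place of the old python-flask-service and /home/ec2-user/app locations; also drop the venv activation and the commented-out app.py invocation from app_start.sh.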
--- buildspec_spider.yml | 40 ------------------------------------- cicd-scripts/app_install.sh | 7 +++---- cicd-scripts/app_start.sh | 5 ----- 3 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 buildspec_spider.yml diff --git a/buildspec_spider.yml b/buildspec_spider.yml deleted file mode 100644 index ba7d924..0000000 --- a/buildspec_spider.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: 0.2 - -phases: - install: - runtime-versions: - python: 3.12 - commands: - - echo "Setting up Python virtual environment..." - - python -m venv venv - - source venv/bin/activate - - echo "Installing dependencies..." - - pip install --upgrade pip - - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt - - pre_build: - commands: - - echo "Pre-build phase - preparing environment..." - - build: - commands: - - echo "Running the domain_spider" - - cd ./search_gov_crawler - - pkill scrapy - - scrapy crawl domain_spider - - echo "Finished all non-js domain_spider urls" - - scrapy crawl domain_spider_js - - echo "Finished all js domain_spider urls" - - post_build: - commands: - - echo "Build completed!" - - deactivate - -artifacts: - files: - - '**/*' - -cache: - paths: - - '/root/.cache/pip' diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index c1382d3..f34e980 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -44,17 +44,16 @@ fi pip install virtualenv -cd /home/ec2-user/python-flask-service +cd /home/ec2-user/spider echo "Creating python3.12 virtual environment..." -python3.12 -m venv /home/ec2-user/app/venv -source /home/ec2-user/app/venv/bin/activate +python3.12 -m venv /home/ec2-user/spider/venv +source /home/ec2-user/spider/venv/bin/activate # Installing all spider dependencies echo "Installing dependencies..." pip install --upgrade pip -# pip install -r /home/ec2-user/app/requirements.txt pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 8e67ce7..0f181f2 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,14 +1,9 @@ #!/bin/bash -echo "Activating virtual environment..." -source /home/ec2-user/app/venv/bin/activate - echo "Running searchgov-spider application..." 
-# python /home/ec2-user/app/app.py pkill scrapy - nohup scrapy crawl domain_spider & nohup scrapy crawl domain_spider_js & From 06bab351fa50da83902a36bf789fae5e94e83cfe Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Sep 2024 12:25:45 -0400 Subject: [PATCH 05/40] Resolved conflicts from multiple merged branches --- appspec.yml | 6 ++--- cicd-scripts/app_install.sh | 47 ++++++++++++++++++------------------- cicd-scripts/app_start.sh | 27 +++++++++++++++------ cicd-scripts/app_stop.sh | 26 ++++++++++++++++---- 4 files changed, 67 insertions(+), 39 deletions(-) diff --git a/appspec.yml b/appspec.yml index 09660cb..0220a3d 100644 --- a/appspec.yml +++ b/appspec.yml @@ -21,14 +21,14 @@ permissions: - file hooks: AfterInstall: - - location: spider/cicd-scripts/app_install.sh + - location: /home/ec2-user/spider/cicd-scripts/app_install.sh timeout: 300 runas: search ApplicationStart: - - location: spider/cicd-scripts/app_start.sh + - location: /home/ec2-user/spider/cicd-scripts/app_start.sh timeout: 300 runas: search ApplicationStop: - - location: spider/cicd-scripts/app_stop.sh + - location: /home/ec2-user/spider/cicd-scripts/app_stop.sh timeout: 300 runas: search diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index f34e980..5c6577d 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,25 +1,26 @@ #!/bin/bash -sudo apt-get update -sudo apt-get upgrade -sudo apt-get install python-setuptools -sudo apt-get install python-pip +# Update and upgrade the system without prompting for confirmation +sudo apt-get update -y +sudo apt-get upgrade -y + +# Install necessary system dependencies +sudo apt-get install -y python-setuptools python-pip # Function to install Python 3.12 install_python() { echo "Installing Python 3.12..." - sudo apt update - sudo apt install -y build-essential checkinstall - sudo apt install -y libreadline-gplv2-dev libncursesw5-dev libssl-dev \ - libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ - zlib1g-dev openssl libffi-dev + sudo apt-get install -y build-essential checkinstall libreadline-dev \ + libncursesw5-dev libssl-dev libsqlite3-dev \ + tk-dev libgdbm-dev libc6-dev libbz2-dev \ + zlib1g-dev openssl libffi-dev # Download Python 3.12 source code cd /usr/src sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz sudo tar xzf Python-3.12.0.tgz - # Build and install + # Build and install Python 3.12 cd Python-3.12.0 sudo ./configure --enable-optimizations sudo make altinstall @@ -28,32 +29,30 @@ install_python() { } # Check if Python 3.12 is installed -python_version=$(python3 --version 2>&1) - -if [[ $python_version == *"3.12"* ]]; then - echo "Python 3.12 is already installed: $python_version" +if command -v python3.12 &>/dev/null; then + echo "Python 3.12 is already installed: $(python3.12 --version)" else - echo "Current Python version: $python_version" - echo "Installing Python 3.12..." + echo "Python 3.12 is not installed. Installing Python 3.12..." install_python fi +# Install virtualenv using Python 3.12's pip +sudo /usr/local/bin/python3.12 -m pip install --upgrade pip +sudo /usr/local/bin/python3.12 -m pip install virtualenv - -# Creating python3.12 virtual env - -pip install virtualenv - +# Navigate to the spider directory cd /home/ec2-user/spider +# Create a virtual environment using Python 3.12 echo "Creating python3.12 virtual environment..." 
-python3.12 -m venv /home/ec2-user/spider/venv +/usr/local/bin/python3.12 -m venv /home/ec2-user/spider/venv + +# Activate the virtual environment source /home/ec2-user/spider/venv/bin/activate -# Installing all spider dependencies +# Install all spider dependencies echo "Installing dependencies..." pip install --upgrade pip - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 0f181f2..c96f4a9 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -2,13 +2,26 @@ echo "Running searchgov-spider application..." -pkill scrapy +# Kill existing scrapy processes started by this script +pkill -f "scrapy crawl domain_spider" +pkill -f "scrapy crawl domain_spider_js" -nohup scrapy crawl domain_spider & -nohup scrapy crawl domain_spider_js & +# Start the scrapy crawlers and redirect their outputs to separate files +nohup scrapy crawl domain_spider > domain_spider.log 2>&1 & +PID1=$! +echo "Started domain_spider with PID $PID1" -echo "\nCurrent running crawl jobs:" -jobs +nohup scrapy crawl domain_spider_js > domain_spider_js.log 2>&1 & +PID2=$! +echo "Started domain_spider_js with PID $PID2" -echo "\noutput:" -cat nohup.out +# Display currently running scrapy processes +echo -e "\nCurrent running scrapy processes:" +ps -ef | grep scrapy | grep -v grep + +# Display the last few lines of the logs +echo -e "\nLast few lines of domain_spider.log:" +tail -n 10 domain_spider.log + +echo -e "\nLast few lines of domain_spider_js.log:" +tail -n 10 domain_spider_js.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 03b209a..ed24e51 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,10 +1,26 @@ #!/bin/bash -echo "Killing all scrapy tasks" -pkill scrapy +echo "Stopping all scrapy tasks..." -# Kill all background jobs (by PID) -jobs -p | grep -o -E '\s\d+\s' | xargs kill +# Kill specific scrapy processes +pkill -f "scrapy crawl domain_spider" +pkill -f "scrapy crawl domain_spider_js" -echo "\nBelow jobs list should be empty:" +# Display remaining scrapy processes (if any) +echo -e "\nRemaining scrapy processes (if any):" +ps -ef | grep scrapy | grep -v grep + +# Check if there are any jobs still running (if started by this shell) +bg_jobs=$(jobs -p) + +if [[ -n "$bg_jobs" ]]; then + echo "Killing all background jobs..." + # Kill all background jobs in this shell session + jobs -p | xargs kill +else + echo "No background jobs to kill." +fi + +# List background jobs to confirm they are terminated +echo -e "\nBelow jobs list should be empty:" jobs From f64f2504614c685f706423dddbd205bebc55129e Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:23:16 -0700 Subject: [PATCH 06/40] Update appspec.yml --- appspec.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/appspec.yml b/appspec.yml index 0220a3d..54c7793 100644 --- a/appspec.yml +++ b/appspec.yml @@ -1,34 +1,34 @@ version: 0.0 os: linux -files: - - source: . - destination: /home/ec2-user/spider +# files: +# - source: . 
+# destination: /home/ec2-user/spider permissions: - - object: /home/ec2-user/spider/cicd-scripts/app_install.sh + - object: cicd-scripts/app_install.sh owner: search mode: 755 type: - file - - object: /home/ec2-user/spider/cicd-scripts/app_start.sh + - object: cicd-scripts/app_start.sh owner: search mode: 755 type: - file - - object: /home/ec2-user/spider/cicd-scripts/app_stop.sh + - object: cicd-scripts/app_stop.sh owner: search mode: 755 type: - file hooks: AfterInstall: - - location: /home/ec2-user/spider/cicd-scripts/app_install.sh + - location: cicd-scripts/app_install.sh timeout: 300 runas: search ApplicationStart: - - location: /home/ec2-user/spider/cicd-scripts/app_start.sh + - location: cicd-scripts/app_start.sh timeout: 300 runas: search ApplicationStop: - - location: /home/ec2-user/spider/cicd-scripts/app_stop.sh + - location: cicd-scripts/app_stop.sh timeout: 300 runas: search From db95d54a3fc1d9b8f95dd1ea7a9a142016797eab Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Mon, 16 Sep 2024 09:29:27 -0700 Subject: [PATCH 07/40] Update appspec.yml --- appspec.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/appspec.yml b/appspec.yml index 54c7793..4ffe90f 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,22 +3,22 @@ os: linux # files: # - source: . # destination: /home/ec2-user/spider -permissions: - - object: cicd-scripts/app_install.sh - owner: search - mode: 755 - type: - - file - - object: cicd-scripts/app_start.sh - owner: search - mode: 755 - type: - - file - - object: cicd-scripts/app_stop.sh - owner: search - mode: 755 - type: - - file +# permissions: +# - object: cicd-scripts/app_install.sh +# owner: search +# mode: 755 +# type: +# - file +# - object: cicd-scripts/app_start.sh +# owner: search +# mode: 755 +# type: +# - file +# - object: cicd-scripts/app_stop.sh +# owner: search +# mode: 755 +# type: +# - file hooks: AfterInstall: - location: cicd-scripts/app_install.sh From 3634607e4a10a248f44be1c9f654b4d1a3d5e788 Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:06:51 -0700 Subject: [PATCH 08/40] Update appspec.yml --- appspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appspec.yml b/appspec.yml index 4ffe90f..a8a0180 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,12 +3,12 @@ os: linux # files: # - source: . # destination: /home/ec2-user/spider -# permissions: -# - object: cicd-scripts/app_install.sh -# owner: search -# mode: 755 -# type: -# - file +permissions: + - object: cicd-scripts + owner: search + mode: 755 + type: + - directory # - object: cicd-scripts/app_start.sh # owner: search # mode: 755 From 7d061ccbd046d05900b23cbfd689ba7a2e735938 Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:08:05 -0700 Subject: [PATCH 09/40] Update appspec.yml --- appspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appspec.yml b/appspec.yml index a8a0180..576d4b1 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,12 +3,12 @@ os: linux # files: # - source: . 
# destination: /home/ec2-user/spider -permissions: - - object: cicd-scripts - owner: search - mode: 755 - type: - - directory +# permissions: +# - object: cicd-scripts +# owner: search +# mode: 755 +# type: +# - directory # - object: cicd-scripts/app_start.sh # owner: search # mode: 755 From b6c97e357dbabfca8c3358a8084a341ee4613c21 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 23 Sep 2024 05:10:20 -0400 Subject: [PATCH 10/40] Fixed start and stop scripts --- cicd-scripts/app_start.sh | 103 +++++++++++++++++++++++++++++++------- cicd-scripts/app_stop.sh | 22 +++++--- 2 files changed, 100 insertions(+), 25 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index c96f4a9..5104b95 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,27 +1,94 @@ #!/bin/bash -echo "Running searchgov-spider application..." +SCRAPYD_URL="http://127.0.0.1:6800/" +SCRAPYDWEB_URL="http://127.0.0.1:5000/" +CICD_SCRIPTS_BASE_DIR=$(pwd) + +# Function to check if a URL is up and running +function check_url() { + local URL=$1 + local MAX_ATTEMPTS=3 + local DELAY=5 + local attempt=1 + + while [ $attempt -le $MAX_ATTEMPTS ]; do + if curl --output /dev/null --silent --head --fail "$URL"; then + echo "Service at $URL is up on attempt $attempt." + return 0 + else + echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." + fi + attempt=$((attempt+1)) + sleep $DELAY + done + + echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." + return 1 +} + +# Function to combine current directory with subdirectory and return absolute path +function get_abs_path() { + local base_dir="$CICD_SCRIPTS_BASE_DIR" + local sub_dir="$1" + + if [[ "$sub_dir" == /* ]]; then + echo "$sub_dir" + else + echo "$base_dir/$sub_dir" + fi +} -# Kill existing scrapy processes started by this script -pkill -f "scrapy crawl domain_spider" -pkill -f "scrapy crawl domain_spider_js" +# Function to check if required command exists +function check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 is not installed or not in your PATH." + exit 1 + fi +} + +check_command "scrapyd" +check_command "scrapydweb" +check_command "curl" + +scrapyd_dir=$(get_abs_path "../") +scrapydweb_dir=$(get_abs_path "../search_gov_crawler") + +echo "Killing any existing scrapyd and scrapydweb services" +pkill -f "scrapydweb" 2>/dev/null +pkill -f "scrapyd" 2>/dev/null + +echo "Running searchgov-spider application..." -# Start the scrapy crawlers and redirect their outputs to separate files -nohup scrapy crawl domain_spider > domain_spider.log 2>&1 & +# Start scrapyd +echo "Starting scrapyd service..." +cd "$scrapyd_dir" +nohup scrapyd > /var/log/scrapyd.log 2>&1 & PID1=$! -echo "Started domain_spider with PID $PID1" +echo "Started scrapyd with PID $PID1" -nohup scrapy crawl domain_spider_js > domain_spider_js.log 2>&1 & -PID2=$! -echo "Started domain_spider_js with PID $PID2" +# Check if scrapyd is running +if check_url "$SCRAPYD_URL"; then + echo "The scrapyd service is running at $SCRAPYD_URL" + cd "$scrapydweb_dir" + nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + PID2=$! + echo "Started scrapydweb with PID $PID2" -# Display currently running scrapy processes -echo -e "\nCurrent running scrapy processes:" -ps -ef | grep scrapy | grep -v grep + if check_url "$SCRAPYDWEB_URL"; then + echo "The scrapydweb service is running at $SCRAPYDWEB_URL" + else + echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." 
+ exit 1 + fi +else + echo "Error: scrapyd failed at $SCRAPYD_URL." + exit 1 +fi -# Display the last few lines of the logs -echo -e "\nLast few lines of domain_spider.log:" -tail -n 10 domain_spider.log +# Display the last few lines of logs +echo -e "\n-- Last 10 lines of scrapyd.log:\n" +tail -n 10 /var/log/scrapyd.log -echo -e "\nLast few lines of domain_spider_js.log:" -tail -n 10 domain_spider_js.log +echo -e "\n-- Last 10 lines of scrapydweb.log:\n" +tail -n 10 /var/log/scrapydweb.log +exit 0 diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index ed24e51..9073c8e 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,22 +1,30 @@ #!/bin/bash -echo "Stopping all scrapy tasks..." +echo "Stopping all scrapyd and scrapydweb tasks..." +# pkill for scrapydweb and scrapyd +if pkill -f "scrapydweb" 2>/dev/null; then + echo "scrapydweb tasks stopped." +else + echo "No scrapydweb tasks running." +fi -# Kill specific scrapy processes -pkill -f "scrapy crawl domain_spider" -pkill -f "scrapy crawl domain_spider_js" +if pkill -f "scrapyd" 2>/dev/null; then + echo "scrapyd tasks stopped." +else + echo "No scrapyd tasks running." +fi # Display remaining scrapy processes (if any) echo -e "\nRemaining scrapy processes (if any):" -ps -ef | grep scrapy | grep -v grep +ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." -# Check if there are any jobs still running (if started by this shell) +# Check for any background jobs still running bg_jobs=$(jobs -p) if [[ -n "$bg_jobs" ]]; then echo "Killing all background jobs..." # Kill all background jobs in this shell session - jobs -p | xargs kill + jobs -p | xargs -r kill else echo "No background jobs to kill." fi From ef63ab0d04969c70bbe1f9f7226f40250756a8b6 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 24 Sep 2024 10:55:47 -0400 Subject: [PATCH 11/40] Added ansible workaround --- cicd-scripts/app_install.sh | 53 ++++++++++++++++++++++++------------- cicd-scripts/app_start.sh | 13 ++++++--- cicd-scripts/app_stop.sh | 22 ++++----------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 5c6577d..dc5c508 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,23 @@ #!/bin/bash + +# A hack to get the environment running without ansible local env variables +# This block of code will eventually be removed once ansible task is completed +SPIDER_PYTHON_VERSION=3.12 +SPIDER_STAGING_URLS_API=https://staging.search.usa.gov/urls +spider_local_path=/etc/profile.d/spider_local.sh + +# Writing environment variables to the profile file +echo " +export SPIDER_PYTHON_VERSION=${SPIDER_PYTHON_VERSION} +export SPIDER_STAGING_URLS_API=${SPIDER_STAGING_URLS_API} +" | tee "$spider_local_path" > /dev/null + +# Source the script to update the current shell's environment +source "$spider_local_path" +### TODO: Remove the above code block after ansible is fully implmented + + # Update and upgrade the system without prompting for confirmation sudo apt-get update -y sudo apt-get upgrade -y @@ -7,45 +25,44 @@ sudo apt-get upgrade -y # Install necessary system dependencies sudo apt-get install -y python-setuptools python-pip -# Function to install Python 3.12 install_python() { - echo "Installing Python 3.12..." 
+ echo "Installing ${SPIDER_PYTHON_VERSION}" sudo apt-get install -y build-essential checkinstall libreadline-dev \ libncursesw5-dev libssl-dev libsqlite3-dev \ tk-dev libgdbm-dev libc6-dev libbz2-dev \ zlib1g-dev openssl libffi-dev - # Download Python 3.12 source code + # Download Python source code cd /usr/src - sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz - sudo tar xzf Python-3.12.0.tgz + sudo wget https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz - # Build and install Python 3.12 - cd Python-3.12.0 + # Build and install Python + cd Python-${SPIDER_PYTHON_VERSION}.0 sudo ./configure --enable-optimizations sudo make altinstall - echo "Python 3.12 has been installed." + echo "Python ${SPIDER_PYTHON_VERSION} has been installed." } -# Check if Python 3.12 is installed -if command -v python3.12 &>/dev/null; then - echo "Python 3.12 is already installed: $(python3.12 --version)" +# Check if Python is installed +if command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then + echo "Python ${SPIDER_PYTHON_VERSION} is already installed: $(python${SPIDER_PYTHON_VERSION} --version)" else - echo "Python 3.12 is not installed. Installing Python 3.12..." + echo "Python ${SPIDER_PYTHON_VERSION} is not installed. Installing Python ${SPIDER_PYTHON_VERSION}..." install_python fi -# Install virtualenv using Python 3.12's pip -sudo /usr/local/bin/python3.12 -m pip install --upgrade pip -sudo /usr/local/bin/python3.12 -m pip install virtualenv +# Install virtualenv using Python pip +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Navigate to the spider directory cd /home/ec2-user/spider -# Create a virtual environment using Python 3.12 -echo "Creating python3.12 virtual environment..." -/usr/local/bin/python3.12 -m venv /home/ec2-user/spider/venv +# Create a virtual environment using Python +echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." +/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv /home/ec2-user/spider/venv # Activate the virtual environment source /home/ec2-user/spider/venv/bin/activate diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5104b95..7319165 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -54,15 +54,15 @@ scrapyd_dir=$(get_abs_path "../") scrapydweb_dir=$(get_abs_path "../search_gov_crawler") echo "Killing any existing scrapyd and scrapydweb services" -pkill -f "scrapydweb" 2>/dev/null -pkill -f "scrapyd" 2>/dev/null +sudo pkill -f "scrapydweb" 2>/dev/null +sudo pkill -f "scrapyd" 2>/dev/null echo "Running searchgov-spider application..." # Start scrapyd echo "Starting scrapyd service..." cd "$scrapyd_dir" -nohup scrapyd > /var/log/scrapyd.log 2>&1 & +sudo nohup scrapyd > /var/log/scrapyd.log 2>&1 & PID1=$! echo "Started scrapyd with PID $PID1" @@ -70,7 +70,7 @@ echo "Started scrapyd with PID $PID1" if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" cd "$scrapydweb_dir" - nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + sudo nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & PID2=$! 
echo "Started scrapydweb with PID $PID2" @@ -85,6 +85,11 @@ else exit 1 fi +# Add startup cron for this script: +echo " +export LATEST_SPIDER_CICD_DEPLOY_PATH=$(CICD_SCRIPTS_BASE_DIR) +" | tee '/etc/profile.d/spider_env.sh' > /dev/null + # Display the last few lines of logs echo -e "\n-- Last 10 lines of scrapyd.log:\n" tail -n 10 /var/log/scrapyd.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 9073c8e..7b1e941 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,14 +1,14 @@ #!/bin/bash echo "Stopping all scrapyd and scrapydweb tasks..." -# pkill for scrapydweb and scrapyd -if pkill -f "scrapydweb" 2>/dev/null; then +# Kill all scrapydweb and scrapyd jobs +if sudo pkill -f "scrapydweb" 2>/dev/null; then echo "scrapydweb tasks stopped." else echo "No scrapydweb tasks running." fi -if pkill -f "scrapyd" 2>/dev/null; then +if sudo pkill -f "scrapyd" 2>/dev/null; then echo "scrapyd tasks stopped." else echo "No scrapyd tasks running." @@ -18,17 +18,5 @@ fi echo -e "\nRemaining scrapy processes (if any):" ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." -# Check for any background jobs still running -bg_jobs=$(jobs -p) - -if [[ -n "$bg_jobs" ]]; then - echo "Killing all background jobs..." - # Kill all background jobs in this shell session - jobs -p | xargs -r kill -else - echo "No background jobs to kill." -fi - -# List background jobs to confirm they are terminated -echo -e "\nBelow jobs list should be empty:" -jobs +# Force kill any remaning scrapy background jobs still running +sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 From 44034fc2141e55f706cc3bcf2241fc7dcbf3ba6d Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 30 Sep 2024 12:27:20 -0400 Subject: [PATCH 12/40] Fixed cron jobs --- appspec.yml | 31 ++++++++-------------- cicd-scripts/app_install.sh | 53 +++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/appspec.yml b/appspec.yml index 576d4b1..ccb3c07 100644 --- a/appspec.yml +++ b/appspec.yml @@ -1,28 +1,19 @@ version: 0.0 os: linux -# files: -# - source: . -# destination: /home/ec2-user/spider -# permissions: -# - object: cicd-scripts -# owner: search -# mode: 755 -# type: -# - directory -# - object: cicd-scripts/app_start.sh -# owner: search -# mode: 755 -# type: -# - file -# - object: cicd-scripts/app_stop.sh -# owner: search -# mode: 755 -# type: -# - file +permissions: + - object: . + mode: 755 + acls: + - "d:u::rwx" + - "d:g::rwx" + - "d:o::rwx" + owner: search + type: + - directory hooks: AfterInstall: - location: cicd-scripts/app_install.sh - timeout: 300 + timeout: 600 runas: search ApplicationStart: - location: cicd-scripts/app_start.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index dc5c508..f8ea203 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,20 @@ #!/bin/bash +# CICD scripts can only runas 'search' user on AWS +if [ "$(whoami)" == "search" ]; then + echo "Executing cicd scripts as 'search' user" +else + echo "This script must be executed as 'search' user" + return 1 +fi + +sudo apt install acl -y + +# Required to give all app_* bash scripts read/write permissions to self and parent. +# Give current directory and all its files rw permissions +sudo chmod -R 755 . +# All new files/directories will inherit rwx (required when installing and using sqllite) +sudo setfacl -Rdm g:dgsearch:rwx . 
# A hack to get the environment running without ansible local env variables # This block of code will eventually be removed once ansible task is completed @@ -57,15 +72,12 @@ fi sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv -# Navigate to the spider directory -cd /home/ec2-user/spider - # Create a virtual environment using Python echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." -/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv /home/ec2-user/spider/venv +/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv # Activate the virtual environment -source /home/ec2-user/spider/venv/bin/activate +source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." @@ -73,3 +85,34 @@ pip install --upgrade pip pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." + + +# Remove any outstanding app_start.sh reboot cronjobs +echo "Removing any app_start.sh reboot cron jobs..." +crontab -l > cron_backup.bak + +# Remove lines containing 'app_start.sh' and update crontab +crontab -l | grep -v 'app_start.sh' > cron_backup_filtered + +# Check if there are changes +if cmp -s cron_backup_filtered cron_backup.bak; then + echo "No cron jobs with 'app_start.sh' found." +else + crontab cron_backup_filtered + echo "Cron jobs containing 'app_start.sh' have been removed." +fi + +# Clean up temporary files +rm cron_backup_filtered cron_backup.bak + +# Add cron job to run the app back up on ec2 restart +echo "Adding app_start.sh reboot cron job..." +sudo chmod +x ./cicd-scripts/app_start.sh + +# Define the new cron job +new_cron="@reboot at now + 1 min -f ${pwd}/cicd-scripts/app_start.sh" + +# Add the new cron job to the crontab if it's not already present +(crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - + +echo "Cron job added: $new_cron" From ef4e3f5724acb22066819aaa332db69ca0c18590 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 7 Oct 2024 15:56:28 -0400 Subject: [PATCH 13/40] ... --- cicd-scripts/app_install.sh | 41 ++++++++++++------------------------- cicd-scripts/app_start.sh | 39 ++++++++++++----------------------- cicd-scripts/app_stop.sh | 0 3 files changed, 26 insertions(+), 54 deletions(-) mode change 100644 => 100755 cicd-scripts/app_install.sh mode change 100644 => 100755 cicd-scripts/app_start.sh mode change 100644 => 100755 cicd-scripts/app_stop.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh old mode 100644 new mode 100755 index f8ea203..c81f3e9 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,13 +1,18 @@ #!/bin/bash # CICD scripts can only runas 'search' user on AWS -if [ "$(whoami)" == "search" ]; then +if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" else echo "This script must be executed as 'search' user" - return 1 + exit 1 fi +SPIDER_PYTHON_VERSION=3.12 + +# Update and upgrade the system without prompting for confirmation +sudo apt-get update -y +sudo apt-get upgrade -y sudo apt install acl -y # Required to give all app_* bash scripts read/write permissions to self and parent. @@ -16,29 +21,9 @@ sudo chmod -R 755 . # All new files/directories will inherit rwx (required when installing and using sqllite) sudo setfacl -Rdm g:dgsearch:rwx . 
-# A hack to get the environment running without ansible local env variables -# This block of code will eventually be removed once ansible task is completed -SPIDER_PYTHON_VERSION=3.12 -SPIDER_STAGING_URLS_API=https://staging.search.usa.gov/urls -spider_local_path=/etc/profile.d/spider_local.sh - -# Writing environment variables to the profile file -echo " -export SPIDER_PYTHON_VERSION=${SPIDER_PYTHON_VERSION} -export SPIDER_STAGING_URLS_API=${SPIDER_STAGING_URLS_API} -" | tee "$spider_local_path" > /dev/null - -# Source the script to update the current shell's environment -source "$spider_local_path" -### TODO: Remove the above code block after ansible is fully implmented - - -# Update and upgrade the system without prompting for confirmation -sudo apt-get update -y -sudo apt-get upgrade -y # Install necessary system dependencies -sudo apt-get install -y python-setuptools python-pip +sudo apt-get install -y python-setuptools install_python() { echo "Installing ${SPIDER_PYTHON_VERSION}" @@ -73,8 +58,8 @@ sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Create a virtual environment using Python -echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." -/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv +echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv # Activate the virtual environment source ./venv/bin/activate @@ -82,7 +67,7 @@ source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." pip install --upgrade pip -pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." @@ -98,7 +83,7 @@ crontab -l | grep -v 'app_start.sh' > cron_backup_filtered if cmp -s cron_backup_filtered cron_backup.bak; then echo "No cron jobs with 'app_start.sh' found." else - crontab cron_backup_filtered + sudo crontab cron_backup_filtered echo "Cron jobs containing 'app_start.sh' have been removed." fi @@ -110,7 +95,7 @@ echo "Adding app_start.sh reboot cron job..." sudo chmod +x ./cicd-scripts/app_start.sh # Define the new cron job -new_cron="@reboot at now + 1 min -f ${pwd}/cicd-scripts/app_start.sh" +new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" # Add the new cron job to the crontab if it's not already present (crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh old mode 100644 new mode 100755 index 7319165..4121067 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -2,12 +2,12 @@ SCRAPYD_URL="http://127.0.0.1:6800/" SCRAPYDWEB_URL="http://127.0.0.1:5000/" -CICD_SCRIPTS_BASE_DIR=$(pwd) +SPIDER_URLS_API=https://staging.search.usa.gov/urls # Function to check if a URL is up and running function check_url() { local URL=$1 - local MAX_ATTEMPTS=3 + local MAX_ATTEMPTS="${2:-3}" local DELAY=5 local attempt=1 @@ -26,18 +26,6 @@ function check_url() { return 1 } -# Function to combine current directory with subdirectory and return absolute path -function get_abs_path() { - local base_dir="$CICD_SCRIPTS_BASE_DIR" - local sub_dir="$1" - - if [[ "$sub_dir" == /* ]]; then - echo "$sub_dir" - else - echo "$base_dir/$sub_dir" - fi -} - # Function to check if required command exists function check_command() { if ! 
command -v "$1" &> /dev/null; then @@ -50,27 +38,31 @@ check_command "scrapyd" check_command "scrapydweb" check_command "curl" -scrapyd_dir=$(get_abs_path "../") -scrapydweb_dir=$(get_abs_path "../search_gov_crawler") - echo "Killing any existing scrapyd and scrapydweb services" sudo pkill -f "scrapydweb" 2>/dev/null sudo pkill -f "scrapyd" 2>/dev/null +# Check search-gov /urls endpoint +echo "Checking search-gov /urls api..." +if check_url "$SPIDER_URLS_API"; then + echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" +else + echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" + exit 1 +fi + echo "Running searchgov-spider application..." # Start scrapyd echo "Starting scrapyd service..." -cd "$scrapyd_dir" -sudo nohup scrapyd > /var/log/scrapyd.log 2>&1 & +sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' PID1=$! echo "Started scrapyd with PID $PID1" # Check if scrapyd is running if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" - cd "$scrapydweb_dir" - sudo nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + sudo bash -c 'nohup cd ./search_gov_crawler && scrapydweb > /var/log/scrapydweb.log 2>&1 &' PID2=$! echo "Started scrapydweb with PID $PID2" @@ -85,11 +77,6 @@ else exit 1 fi -# Add startup cron for this script: -echo " -export LATEST_SPIDER_CICD_DEPLOY_PATH=$(CICD_SCRIPTS_BASE_DIR) -" | tee '/etc/profile.d/spider_env.sh' > /dev/null - # Display the last few lines of logs echo -e "\n-- Last 10 lines of scrapyd.log:\n" tail -n 10 /var/log/scrapyd.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh old mode 100644 new mode 100755 From 1453d080a850b30bc2f6c4d6582aea97a5d79211 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 09:11:27 -0400 Subject: [PATCH 14/40] merged all orphan branches --- cicd-scripts/app_start.sh | 2 +- .../search_gov_spiders/pipelines.py | 90 +++++++++++++------ 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 4121067..5fec3ce 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -62,7 +62,7 @@ echo "Started scrapyd with PID $PID1" # Check if scrapyd is running if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" - sudo bash -c 'nohup cd ./search_gov_crawler && scrapydweb > /var/log/scrapydweb.log 2>&1 &' + sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' PID2=$! echo "Started scrapydweb with PID $PID2" diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index b2b0e54..64099b0 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -4,6 +4,7 @@ """ import os +import requests from pathlib import Path from scrapy.exceptions import DropItem @@ -11,40 +12,79 @@ class SearchGovSpidersPipeline: """ - Class for pipeline that takes items and adds them - to output file with a max size of 3.9MB + Pipeline that either writes items to an output file with a max size of 3.9MB + or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once + the size limit is reached. 
""" + MAX_FILE_SIZE_MB = 3.9 # max size in MB + MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes + def __init__(self, *_args, **_kwargs): - self.current_file_size = 0 - self.file_number = 1 - self.parent_file_path = Path(__file__).parent.parent.resolve() - self.base_path_name = str(self.parent_file_path / "output/all-links.csv") - self.short_file = open(self.base_path_name, "w", encoding="utf-8") - self.max_file_size = 3900 - self.paginate = True + self.api_url = os.environ.get("SPIDER_URLS_API") + if not self.api_url: + self.file_number = 1 + self.parent_file_path = Path(__file__).parent.parent.resolve() + self.base_file_name = self.parent_file_path / "output" / "all-links.csv" + self.file_path = self.base_file_name + self.current_file = open(self.file_path, "w", encoding="utf-8") + else: + self.urls_batch = [] def process_item(self, item, _spider): - """Checks that the file is not at max size. - Adds it to the file if less, or creates a new file if too large.""" + """Process item either by writing to file or by posting to API.""" + + line = item.get("url", "") + "\n" + line_size = len(line.encode('utf-8')) - line = item["url"] - self.current_file_size += 1 - next_file_size = self.current_file_size + len(line) - if self.paginate and next_file_size > self.max_file_size: - self.short_file.close() - new_name = str(self.parent_file_path / f"output/all-links{self.file_number}.csv") - os.rename(self.base_path_name, new_name) - self.file_number = self.file_number + 1 - self.short_file = open(self.base_path_name, "w", encoding="utf-8") - self.current_file_size = 0 - - self.short_file.write(line) - self.short_file.write("\n") - self.current_file_size = self.current_file_size + len(line) + # If API URL is set, batch URLs and send a POST request when max size is reached + if self.api_url: + self.urls_batch.append(item.get("url", "")) + if self._is_batch_too_large(line_size): + self._post_urls() + # Otherwise, write to file and rotate if needed + else: + self.current_file.write(line) + if self._is_file_too_large(line_size): + self._rotate_file() return item + def _is_batch_too_large(self, new_entry_size): + current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _is_file_too_large(self, new_entry_size): + self.current_file.flush() + current_file_size = self.file_path.stat().st_size + return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _rotate_file(self): + """Close current file, rename it, and open a new one for continued writing.""" + self.current_file.close() + new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" + os.rename(self.file_path, new_file_path) + self.file_number += 1 + self.current_file = open(self.file_path, "w", encoding="utf-8") + + def _post_urls(self): + """Send a POST request with the batch of URLs if any exist.""" + if self.urls_batch: + try: + response = requests.post(self.api_url, json={"urls": self.urls_batch}) + response.raise_for_status() + print(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + except requests.exceptions.RequestException as e: + print(f"Failed to send URLs to {self.api_url}: {e}") + finally: + self.urls_batch.clear() + + def close_spider(self, _spider): + """Close the file or send remaining URLs if needed when the spider finishes.""" + if not self.api_url and self.current_file: + self.current_file.close() + elif self.api_url: + self._post_urls() # 
Send any remaining URLs on spider close class DeDeuplicatorPipeline: """Class for pipeline that removes duplicate items""" From 8911dc57c570961680129d2f676f52300cbb5eeb Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 13:46:06 -0400 Subject: [PATCH 15/40] code review feedback --- search_gov_crawler/search_gov_spiders/pipelines.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index 64099b0..001080c 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -31,7 +31,7 @@ def __init__(self, *_args, **_kwargs): else: self.urls_batch = [] - def process_item(self, item, _spider): + def process_item(self, item, spider): """Process item either by writing to file or by posting to API.""" line = item.get("url", "") + "\n" @@ -41,7 +41,7 @@ def process_item(self, item, _spider): if self.api_url: self.urls_batch.append(item.get("url", "")) if self._is_batch_too_large(line_size): - self._post_urls() + self._post_urls(spider) # Otherwise, write to file and rotate if needed else: self.current_file.write(line) @@ -67,15 +67,15 @@ def _rotate_file(self): self.file_number += 1 self.current_file = open(self.file_path, "w", encoding="utf-8") - def _post_urls(self): + def _post_urls(self, spider): """Send a POST request with the batch of URLs if any exist.""" if self.urls_batch: try: response = requests.post(self.api_url, json={"urls": self.urls_batch}) response.raise_for_status() - print(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") except requests.exceptions.RequestException as e: - print(f"Failed to send URLs to {self.api_url}: {e}") + raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") finally: self.urls_batch.clear() From 2209f41d87493eeed00133847c2b66cd52c1e3f3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 13:51:35 -0400 Subject: [PATCH 16/40] removed virtualenv install --- cicd-scripts/app_install.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index c81f3e9..0e9dc63 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -53,9 +53,8 @@ else install_python fi -# Install virtualenv using Python pip +# Use venv with Python 3.12 sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Create a virtual environment using Python echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." 
From b855df3a697c00068f213ae171fa8af4db3201e2 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 16 Oct 2024 12:27:34 -0400 Subject: [PATCH 17/40] Added unit tests --- .../test_urls_files_size.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/search_gov_spiders/test_urls_files_size.py diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py new file mode 100644 index 0000000..4e9da22 --- /dev/null +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -0,0 +1,184 @@ +import os +import pytest +from unittest.mock import MagicMock +from scrapy import Item +# from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline + +import os +import requests +from pathlib import Path + +from scrapy.exceptions import DropItem + + +class SearchGovSpidersPipeline: + """ + Pipeline that either writes items to an output file with a max size of 3.9MB + or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once + the size limit is reached. + """ + + MAX_FILE_SIZE_MB = 3.9 # max size in MB + MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes + + def __init__(self, *_args, **_kwargs): + self.api_url = os.environ.get("SPIDER_URLS_API") + if not self.api_url: + self.file_number = 1 + self.parent_file_path = Path(__file__).parent.parent.resolve() + self.base_file_name = self.parent_file_path / "output" / "all-links.csv" + self.file_path = self.base_file_name + self.current_file = open(self.file_path, "w", encoding="utf-8") + else: + self.urls_batch = [] + + def process_item(self, item, spider): + """Process item either by writing to file or by posting to API.""" + + line = item.get("url", "") + "\n" + line_size = len(line.encode('utf-8')) + + # If API URL is set, batch URLs and send a POST request when max size is reached + if self.api_url: + self.urls_batch.append(item.get("url", "")) + if self._is_batch_too_large(line_size): + self._post_urls(spider) + # Otherwise, write to file and rotate if needed + else: + self.current_file.write(line) + if self._is_file_too_large(line_size): + self._rotate_file() + + return item + + def _is_batch_too_large(self, new_entry_size): + current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _is_file_too_large(self, new_entry_size): + self.current_file.flush() + current_file_size = self.file_path.stat().st_size + return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _rotate_file(self): + """Close current file, rename it, and open a new one for continued writing.""" + self.current_file.close() + new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" + os.rename(self.file_path, new_file_path) + self.file_number += 1 + self.current_file = open(self.file_path, "w", encoding="utf-8") + + def _post_urls(self, spider): + """Send a POST request with the batch of URLs if any exist.""" + if self.urls_batch: + try: + response = requests.post(self.api_url, json={"urls": self.urls_batch}) + response.raise_for_status() + spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + except requests.exceptions.RequestException as e: + raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") + finally: + self.urls_batch.clear() + + def close_spider(self, _spider): + """Close the file or send remaining URLs if needed when the spider 
finishes.""" + if not self.api_url and self.current_file: + self.current_file.close() + elif self.api_url: + self._post_urls() # Send any remaining URLs on spider close + +class DeDeuplicatorPipeline: + """Class for pipeline that removes duplicate items""" + + itemlist = [] + + def process_item(self, item, _spider): + """Checks that the file is not at max size. + Adds it to the file if less, or creates a new file if too large.""" + if item in self.itemlist: + raise DropItem("already in list") + self.itemlist.append(item) + + return item + + +@pytest.fixture +def sample_item(): + """Fixture for a sample item with a URL.""" + item = Item() + item['url'] = "http://example.com" + return item + +@pytest.fixture +def pipeline_no_api(mocker): + """Fixture for pipeline with no API URL set.""" + mocker.patch('os.getenv', return_value=None) + return SearchGovSpidersPipeline() + +@pytest.fixture +def pipeline_with_api(mocker): + """Fixture for pipeline with an API URL set.""" + mocker.patch('os.getenv', return_value="http://mockapi.com") + return SearchGovSpidersPipeline() + +def test_write_to_file(pipeline_no_api, sample_item, mocker): + """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mock_open = mocker.patch('open', mocker.mock_open()) + + pipeline_no_api.process_item(sample_item, None) + + # Ensure file is opened and written to + mock_open.assert_called_once_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open().write.assert_any_call(sample_item['url'] + "\n") + +def test_post_to_api(pipeline_with_api, sample_item, mocker): + """Test that URLs are batched and sent via POST when SPIDER_URLS_API is set.""" + mock_post = mocker.patch('requests.post') + + pipeline_with_api.process_item(sample_item, None) + + # Check that the batch contains the URL + assert sample_item['url'] in pipeline_with_api.urls_batch + + # Simulate max size to force post + mocker.patch.object(SearchGovSpidersPipeline, '_is_batch_too_large', return_value=True) + pipeline_with_api.process_item(sample_item, None) + + # Ensure POST request was made + mock_post.assert_called_once_with("http://mockapi.com", json={"urls": pipeline_with_api.urls_batch}) + +def test_rotate_file(pipeline_no_api, sample_item, mocker): + """Test that file rotation occurs when max size is exceeded.""" + mock_open = mocker.patch('open', mocker.mock_open()) + mock_rename = mocker.patch('os.rename') + + mocker.patch.object(SearchGovSpidersPipeline, '_is_file_too_large', return_value=True) + pipeline_no_api.process_item(sample_item, None) + + # Check if the file was rotated + mock_open.assert_called_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open().close.assert_called() + mock_rename.assert_called_once_with( + pipeline_no_api.file_path, + pipeline_no_api.parent_file_path / "output/all-links-1.csv" + ) + +def test_post_urls_on_spider_close(pipeline_with_api, mocker): + """Test that remaining URLs are posted when spider closes and SPIDER_URLS_API is set.""" + mock_post = mocker.patch('requests.post') + + pipeline_with_api.urls_batch = ["http://example.com"] + + pipeline_with_api.close_spider(None) + + # Ensure POST request was made on spider close + mock_post.assert_called_once_with("http://mockapi.com", json={"urls": ["http://example.com"]}) + +def test_close_file_on_spider_close(pipeline_no_api, mocker): + """Test that the file is closed when the spider closes and no SPIDER_URLS_API is set.""" + mock_open = mocker.patch('open', mocker.mock_open()) + + pipeline_no_api.close_spider(None) + + # 
Ensure the file is closed + mock_open().close.assert_called_once() From 5aaf9f77b453c74edaeda6729d8231907cc54339 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 16 Oct 2024 12:47:03 -0400 Subject: [PATCH 18/40] ... --- .../test_urls_files_size.py | 100 +----------------- 1 file changed, 1 insertion(+), 99 deletions(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 4e9da22..f0d5b30 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -2,105 +2,7 @@ import pytest from unittest.mock import MagicMock from scrapy import Item -# from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline - -import os -import requests -from pathlib import Path - -from scrapy.exceptions import DropItem - - -class SearchGovSpidersPipeline: - """ - Pipeline that either writes items to an output file with a max size of 3.9MB - or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once - the size limit is reached. - """ - - MAX_FILE_SIZE_MB = 3.9 # max size in MB - MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes - - def __init__(self, *_args, **_kwargs): - self.api_url = os.environ.get("SPIDER_URLS_API") - if not self.api_url: - self.file_number = 1 - self.parent_file_path = Path(__file__).parent.parent.resolve() - self.base_file_name = self.parent_file_path / "output" / "all-links.csv" - self.file_path = self.base_file_name - self.current_file = open(self.file_path, "w", encoding="utf-8") - else: - self.urls_batch = [] - - def process_item(self, item, spider): - """Process item either by writing to file or by posting to API.""" - - line = item.get("url", "") + "\n" - line_size = len(line.encode('utf-8')) - - # If API URL is set, batch URLs and send a POST request when max size is reached - if self.api_url: - self.urls_batch.append(item.get("url", "")) - if self._is_batch_too_large(line_size): - self._post_urls(spider) - # Otherwise, write to file and rotate if needed - else: - self.current_file.write(line) - if self._is_file_too_large(line_size): - self._rotate_file() - - return item - - def _is_batch_too_large(self, new_entry_size): - current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) - return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES - - def _is_file_too_large(self, new_entry_size): - self.current_file.flush() - current_file_size = self.file_path.stat().st_size - return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES - - def _rotate_file(self): - """Close current file, rename it, and open a new one for continued writing.""" - self.current_file.close() - new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" - os.rename(self.file_path, new_file_path) - self.file_number += 1 - self.current_file = open(self.file_path, "w", encoding="utf-8") - - def _post_urls(self, spider): - """Send a POST request with the batch of URLs if any exist.""" - if self.urls_batch: - try: - response = requests.post(self.api_url, json={"urls": self.urls_batch}) - response.raise_for_status() - spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") - except requests.exceptions.RequestException as e: - raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") - finally: - self.urls_batch.clear() - - def close_spider(self, _spider): - """Close the file or send remaining URLs if needed when the 
spider finishes.""" - if not self.api_url and self.current_file: - self.current_file.close() - elif self.api_url: - self._post_urls() # Send any remaining URLs on spider close - -class DeDeuplicatorPipeline: - """Class for pipeline that removes duplicate items""" - - itemlist = [] - - def process_item(self, item, _spider): - """Checks that the file is not at max size. - Adds it to the file if less, or creates a new file if too large.""" - if item in self.itemlist: - raise DropItem("already in list") - self.itemlist.append(item) - - return item - +from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline @pytest.fixture def sample_item(): From f01db66e1c7a7e1d05ba6b93aeffccaa87a569f5 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 2 Dec 2024 04:50:10 -0500 Subject: [PATCH 19/40] resolved conflict --- cicd-scripts/app_install.sh | 23 ++++++ cicd-scripts/app_start.sh | 86 ++--------------------- cicd-scripts/app_stop.sh | 34 +++++++++ cicd-scripts/helpers/check_cloudwatch.sh | 32 +++++++++ cicd-scripts/helpers/check_codedeploy.sh | 32 +++++++++ cicd-scripts/helpers/kill_scheduler.sh | 25 +++++++ cicd-scripts/helpers/run_with_ui.sh | 86 +++++++++++++++++++++++ cicd-scripts/helpers/run_without_ui.sh | 3 + cicd-scripts/helpers/update_pythonpath.sh | 34 +++++++++ 9 files changed, 274 insertions(+), 81 deletions(-) create mode 100644 cicd-scripts/helpers/check_cloudwatch.sh create mode 100644 cicd-scripts/helpers/check_codedeploy.sh create mode 100644 cicd-scripts/helpers/kill_scheduler.sh create mode 100755 cicd-scripts/helpers/run_with_ui.sh create mode 100644 cicd-scripts/helpers/run_without_ui.sh create mode 100644 cicd-scripts/helpers/update_pythonpath.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 0e9dc63..0b45cc9 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,10 @@ #!/bin/bash +# Kill all spider services (if running) +echo "Running app_stop.sh" +sudo chmod +x ./cicd-scripts/app_stop.sh +source ./cicd-scripts/app_stop.sh + # CICD scripts can only runas 'search' user on AWS if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" @@ -8,8 +13,20 @@ else exit 1 fi +# Start AWS CloudWatch agent +sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh +source ./cicd-scripts/helpers/check_cloudwatch.sh + +# Start AWS CodeDeploy agent +sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh +source ./cicd-scripts/helpers/check_codedeploy.sh + +# PUBLIC SPIDER_PYTHON_VERSION=3.12 +# PRIVATE +_CURRENT_BUILD_DIR=${PWD} + # Update and upgrade the system without prompting for confirmation sudo apt-get update -y sudo apt-get upgrade -y @@ -42,6 +59,9 @@ install_python() { sudo ./configure --enable-optimizations sudo make altinstall + # Return to the build directory + cd $_CURRENT_BUILD_DIR + echo "Python ${SPIDER_PYTHON_VERSION} has been installed." 
} @@ -53,6 +73,9 @@ else install_python fi +# Set PYTHONPATH env +source ./cicd-scripts/helpers/update_pythonpath.sh + # Use venv with Python 3.12 sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5fec3ce..3610c15 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,86 +1,10 @@ #!/bin/bash -SCRAPYD_URL="http://127.0.0.1:6800/" -SCRAPYDWEB_URL="http://127.0.0.1:5000/" -SPIDER_URLS_API=https://staging.search.usa.gov/urls +# PUBLIC +RUN_WITH_UI=true -# Function to check if a URL is up and running -function check_url() { - local URL=$1 - local MAX_ATTEMPTS="${2:-3}" - local DELAY=5 - local attempt=1 - - while [ $attempt -le $MAX_ATTEMPTS ]; do - if curl --output /dev/null --silent --head --fail "$URL"; then - echo "Service at $URL is up on attempt $attempt." - return 0 - else - echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." - fi - attempt=$((attempt+1)) - sleep $DELAY - done - - echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." - return 1 -} - -# Function to check if required command exists -function check_command() { - if ! command -v "$1" &> /dev/null; then - echo "Error: $1 is not installed or not in your PATH." - exit 1 - fi -} - -check_command "scrapyd" -check_command "scrapydweb" -check_command "curl" - -echo "Killing any existing scrapyd and scrapydweb services" -sudo pkill -f "scrapydweb" 2>/dev/null -sudo pkill -f "scrapyd" 2>/dev/null - -# Check search-gov /urls endpoint -echo "Checking search-gov /urls api..." -if check_url "$SPIDER_URLS_API"; then - echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" -else - echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" - exit 1 -fi - -echo "Running searchgov-spider application..." - -# Start scrapyd -echo "Starting scrapyd service..." -sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' -PID1=$! -echo "Started scrapyd with PID $PID1" - -# Check if scrapyd is running -if check_url "$SCRAPYD_URL"; then - echo "The scrapyd service is running at $SCRAPYD_URL" - sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' - PID2=$! - echo "Started scrapydweb with PID $PID2" - - if check_url "$SCRAPYDWEB_URL"; then - echo "The scrapydweb service is running at $SCRAPYDWEB_URL" - else - echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." - exit 1 - fi +if $RUN_WITH_UI ; then + source ./cicd-scripts/helpers/run_with_ui.sh else - echo "Error: scrapyd failed at $SCRAPYD_URL." - exit 1 + source ./cicd-scripts/helpers/run_without_ui.sh fi - -# Display the last few lines of logs -echo -e "\n-- Last 10 lines of scrapyd.log:\n" -tail -n 10 /var/log/scrapyd.log - -echo -e "\n-- Last 10 lines of scrapydweb.log:\n" -tail -n 10 /var/log/scrapydweb.log -exit 0 diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 7b1e941..4844b25 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,5 +1,14 @@ #!/bin/bash +# Clear all cache +echo "Purge all pip cache..." +sudo pip cache purge + +# Kill scrapy schedular (if running): +echo "Stopping scrapy_scheduler.py (if running)" +sudo chmod +x ./cicd-scripts/helpers/kill_scheduler.sh +source ./cicd-scripts/helpers/kill_scheduler.sh + echo "Stopping all scrapyd and scrapydweb tasks..." 
# Kill all scrapydweb and scrapyd jobs if sudo pkill -f "scrapydweb" 2>/dev/null; then @@ -20,3 +29,28 @@ ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." # Force kill any remaning scrapy background jobs still running sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 + +# Kill all nohup jobs (runs with python) +ps -ef | grep nohup | grep -v grep | awk '{print $2}' + +# Remove other deploy cron jobs: +#!/bin/bash + +# Function to remove crontab entries referencing a given cron entry string +remove_cron_entry() { + if [ -z "$1" ]; then + echo "Error: No cron entry provided." + exit 1 + fi + + CRON_ENTRY="$1" + + # Remove entries referencing the script + sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab - + + echo "Removed any crontab entries referencing $CRON_ENTRY." +} + +# Remove any other cron job entries +remove_cron_entry "check_cloudwatch.sh" +remove_cron_entry "check_codedeploy.sh" diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh new file mode 100644 index 0000000..18e4870 --- /dev/null +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CloudWatch agent is running +check_cloudwatch() { + if ! pgrep -f amazon-cloudwatch-agent > /dev/null; then + echo "AWS CloudWatch agent is not running. Starting it now..." + sudo service amazon-cloudwatch-agent start + if [ $? -eq 0 ]; then + echo "AWS CloudWatch agent started successfully." + else + echo "Failed to start AWS CloudWatch agent." + fi + else + echo "AWS CloudWatch agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh + CRON_ENTRY="@reboot /bin/bash $PWD/cicd-scripts/helpers/check_cloudwatch.sh" + + # Update crontab, ensuring no duplicates + (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + echo "Crontab entry added to ensure the script runs on reboot." +} + +# Execute the function +check_cloudwatch + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh new file mode 100644 index 0000000..98731dc --- /dev/null +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CodeDeploy agent is running +check_codedeploy() { + if ! pgrep -f codedeploy-agent > /dev/null; then + echo "AWS CodeDeploy agent is not running. Starting it now..." + sudo service codedeploy-agent start + if [ $? -eq 0 ]; then + echo "AWS CodeDeploy agent started successfully." + else + echo "Failed to start AWS CodeDeploy agent." + fi + else + echo "AWS CodeDeploy agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh + CRON_ENTRY="@reboot /bin/bash $PWD/helpers/check_codedeploy.sh" + + # Update crontab, ensuring no duplicates + (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + echo "Crontab entry added to ensure the script runs on reboot." 
+} + +# Execute the function +check_codedeploy + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh new file mode 100644 index 0000000..ddc115b --- /dev/null +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Find the process ID of the running scrapy_scheduler.py script +echo "Searching for scrapy_scheduler.py process..." +PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") + +# Check if the process exists +if [ -z "$PROCESS_ID" ]; then + echo "No running process found for scrapy_scheduler.py." + exit 0 +fi + +# Kill the process +echo "Killing process with PID: $PROCESS_ID" +kill "$PROCESS_ID" + +sleep 3 + +# Verify if the process was killed +if [ $? -eq 0 ]; then + echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." +else + echo "Failed to terminate the process. Please check manually." + exit 1 +fi diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh new file mode 100755 index 0000000..5fec3ce --- /dev/null +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +SCRAPYD_URL="http://127.0.0.1:6800/" +SCRAPYDWEB_URL="http://127.0.0.1:5000/" +SPIDER_URLS_API=https://staging.search.usa.gov/urls + +# Function to check if a URL is up and running +function check_url() { + local URL=$1 + local MAX_ATTEMPTS="${2:-3}" + local DELAY=5 + local attempt=1 + + while [ $attempt -le $MAX_ATTEMPTS ]; do + if curl --output /dev/null --silent --head --fail "$URL"; then + echo "Service at $URL is up on attempt $attempt." + return 0 + else + echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." + fi + attempt=$((attempt+1)) + sleep $DELAY + done + + echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." + return 1 +} + +# Function to check if required command exists +function check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 is not installed or not in your PATH." + exit 1 + fi +} + +check_command "scrapyd" +check_command "scrapydweb" +check_command "curl" + +echo "Killing any existing scrapyd and scrapydweb services" +sudo pkill -f "scrapydweb" 2>/dev/null +sudo pkill -f "scrapyd" 2>/dev/null + +# Check search-gov /urls endpoint +echo "Checking search-gov /urls api..." +if check_url "$SPIDER_URLS_API"; then + echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" +else + echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" + exit 1 +fi + +echo "Running searchgov-spider application..." + +# Start scrapyd +echo "Starting scrapyd service..." +sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' +PID1=$! +echo "Started scrapyd with PID $PID1" + +# Check if scrapyd is running +if check_url "$SCRAPYD_URL"; then + echo "The scrapyd service is running at $SCRAPYD_URL" + sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' + PID2=$! + echo "Started scrapydweb with PID $PID2" + + if check_url "$SCRAPYDWEB_URL"; then + echo "The scrapydweb service is running at $SCRAPYDWEB_URL" + else + echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." + exit 1 + fi +else + echo "Error: scrapyd failed at $SCRAPYD_URL." 
+ exit 1 +fi + +# Display the last few lines of logs +echo -e "\n-- Last 10 lines of scrapyd.log:\n" +tail -n 10 /var/log/scrapyd.log + +echo -e "\n-- Last 10 lines of scrapydweb.log:\n" +tail -n 10 /var/log/scrapydweb.log +exit 0 diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh new file mode 100644 index 0000000..cabeb3d --- /dev/null +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo bash -c 'nohup ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &' diff --git a/cicd-scripts/helpers/update_pythonpath.sh b/cicd-scripts/helpers/update_pythonpath.sh new file mode 100644 index 0000000..e742b55 --- /dev/null +++ b/cicd-scripts/helpers/update_pythonpath.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Define the current directory +CURRENT_DIR=$(pwd) + +# Define the .bashrc file location +BASHRC_FILE="$HOME/.bashrc" + +# Check if .bashrc contains an export PYTHONPATH line +if grep -q "^export PYTHONPATH=" "$BASHRC_FILE"; then + # Extract the existing PYTHONPATH line + PYTHONPATH_LINE=$(grep "^export PYTHONPATH=" "$BASHRC_FILE") + + # Check if the current directory is already included + if echo "$PYTHONPATH_LINE" | grep -q "$CURRENT_DIR"; then + echo "PYTHONPATH already includes the current directory: $CURRENT_DIR" + else + # Ensure the updated line includes the starting and ending quotes + CURRENT_PATHS=$(echo "$PYTHONPATH_LINE" | sed -e 's/^export PYTHONPATH=//' -e 's/^"//' -e 's/"$//') + UPDATED_LINE="export PYTHONPATH=\"${CURRENT_PATHS}:${CURRENT_DIR}\"" + sed -i "s|^export PYTHONPATH=.*|$UPDATED_LINE|" "$BASHRC_FILE" + echo "Updated PYTHONPATH to include the current directory: $CURRENT_DIR" + fi +else + # Add a new export PYTHONPATH line to .bashrc + echo "export PYTHONPATH=\"\$PYTHONPATH:${CURRENT_DIR}\"" >> "$BASHRC_FILE" + echo "Added new PYTHONPATH to .bashrc including the current directory: $CURRENT_DIR" +fi + +# Apply changes for the current session +export PYTHONPATH=\"${CURRENT_PATHS//"\$PYTHONPATH"/}:${CURRENT_DIR}\" + +echo "PYTHONPATH changes applied:" +echo $PYTHONPATH From 54355ed7be938d48fa74bdd69382816515963b60 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 11:54:40 -0500 Subject: [PATCH 20/40] Addressed code review --- cicd-scripts/app_install.sh | 10 ++++++++-- cicd-scripts/app_stop.sh | 4 +++- cicd-scripts/helpers/run_with_ui.sh | 9 --------- tests/search_gov_spiders/test_full_crawl.py | 17 ++++++++++++----- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 0b45cc9..6622445 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -13,6 +13,11 @@ else exit 1 fi +# Get missing packages +sudo apt-get install lzma +sudo apt-get install liblzma-dev +yes | sudo apt-get install libbz2-dev + # Start AWS CloudWatch agent sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh source ./cicd-scripts/helpers/check_cloudwatch.sh @@ -88,8 +93,9 @@ source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." -pip install --upgrade pip -sudo pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install pytest-playwright playwright -U +playwright install echo "Dependencies installed." 
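A minimal post-install smoke test for the dependency steps above could look like the following (sketch only; it assumes the ./venv and ./search_gov_crawler/requirements.txt layout used by app_install.sh and checks imports rather than relying on any particular CLI flags):

  # Activate the venv created by app_install.sh
  source ./venv/bin/activate
  # Scrapy should import and report the pinned version
  python -c "import scrapy; print(scrapy.__version__)"
  # The Playwright Python package should import; browser binaries come from `playwright install`
  python -c "from playwright.sync_api import sync_playwright; print('playwright import ok')"
  deactivate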
diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 4844b25..0e39875 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -2,7 +2,9 @@ # Clear all cache echo "Purge all pip cache..." -sudo pip cache purge +# We can't do `$pip cache purge`, this does the samething: +sudo rm -r ~/.cache/pip +sudo rm -rf /root/.cache/pip # Kill scrapy schedular (if running): echo "Stopping scrapy_scheduler.py (if running)" diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh index 5fec3ce..ed2f400 100755 --- a/cicd-scripts/helpers/run_with_ui.sh +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -42,15 +42,6 @@ echo "Killing any existing scrapyd and scrapydweb services" sudo pkill -f "scrapydweb" 2>/dev/null sudo pkill -f "scrapyd" 2>/dev/null -# Check search-gov /urls endpoint -echo "Checking search-gov /urls api..." -if check_url "$SPIDER_URLS_API"; then - echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" -else - echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" - exit 1 -fi - echo "Running searchgov-spider application..." # Start scrapyd diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py index 3992263..1bfd892 100644 --- a/tests/search_gov_spiders/test_full_crawl.py +++ b/tests/search_gov_spiders/test_full_crawl.py @@ -94,13 +94,20 @@ def test_full_crawl(mock_scrapy_settings, monkeypatch, spider, use_dedup, crawl_ temp_dir.joinpath("output").mkdir(exist_ok=True) def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): - pipeline_cls.current_file_size = 0 + # pipeline_cls.current_file_size = 0 + # pipeline_cls.file_number = 1 + # pipeline_cls.parent_file_path = temp_dir + # pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") + # pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") + # pipeline_cls.max_file_size = 3900 + # pipeline_cls.paginate = True + + pipeline_cls.api_url = None pipeline_cls.file_number = 1 pipeline_cls.parent_file_path = temp_dir - pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") - pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") - pipeline_cls.max_file_size = 3900 - pipeline_cls.paginate = True + pipeline_cls.base_file_name = temp_dir / "output" / "all-links.csv" + pipeline_cls.file_path = pipeline_cls.base_file_name + pipeline_cls.current_file = open(pipeline_cls.file_path, "w", encoding="utf-8") monkeypatch.setattr( "search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.__init__", mock_init From a43688f4b75b11ea3d50f997b8202c653fbf571a Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:40:36 -0500 Subject: [PATCH 21/40] Add pytest-mock dependency --- .github/dependabot.yml | 2 +- search_gov_crawler/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 37e7fcf..5674693 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -14,4 +14,4 @@ updates: patterns: - "freezegun" - "pylint" - - "pytest" + - "pytest*" diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt index bf1dafb..8941c1e 100644 --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -2,6 +2,7 @@ freezegun==1.5.1 pylint==3.3.1 pytest==8.3.3 pytest-console-scripts==1.4.1 
+pytest-mock==3.14.0 # Install from github due to unmaintained project using pypi https://github.com/nhairs/python-json-logger/issues/1 python-json-logger @ https://github.com/nhairs/python-json-logger/releases/download/v3.1.0/python_json_logger-3.1.0-py3-none-any.whl @@ -11,4 +12,3 @@ scrapyd==1.5.0 scrapyd-client==2.0.0 scrapydweb @ git+https://github.com/GSA/searchgov-scrapydweb spidermon [monitoring] == 1.22.0 - From 6be4bb6e893549211141db3700f8d86373daa853 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:42:04 -0500 Subject: [PATCH 22/40] update file pipeline and tests --- .../search_gov_spiders/pipelines.py | 11 ++- .../test_urls_files_size.py | 96 +++++++++++-------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index 18f3664..6849fc5 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -4,9 +4,9 @@ """ import os -import requests from pathlib import Path +import requests from scrapy.exceptions import DropItem @@ -35,7 +35,7 @@ def process_item(self, item, spider): """Process item either by writing to file or by posting to API.""" line = item.get("url", "") + "\n" - line_size = len(line.encode('utf-8')) + line_size = len(line.encode("utf-8")) # If API URL is set, batch URLs and send a POST request when max size is reached if self.api_url: @@ -51,7 +51,7 @@ def process_item(self, item, spider): return item def _is_batch_too_large(self, new_entry_size): - current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + current_batch_size = sum(len(url.encode("utf-8")) for url in self.urls_batch) return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES def _is_file_too_large(self, new_entry_size): @@ -79,12 +79,13 @@ def _post_urls(self, spider): finally: self.urls_batch.clear() - def close_spider(self, _spider): + def close_spider(self, spider): """Close the file or send remaining URLs if needed when the spider finishes.""" if not self.api_url and self.current_file: self.current_file.close() elif self.api_url: - self._post_urls() # Send any remaining URLs on spider close + self._post_urls(spider) # Send any remaining URLs on spider close + class DeDeuplicatorPipeline: """Class for pipeline that removes duplicate items""" diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index f0d5b30..7735711 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,84 +1,102 @@ import os + import pytest -from unittest.mock import MagicMock -from scrapy import Item + +from scrapy import Spider +from scrapy.utils.test import get_crawler +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline -@pytest.fixture -def sample_item(): + +@pytest.fixture(name="sample_spider") +def fixture_sample_spider(): + crawler = get_crawler(Spider) + return crawler._create_spider( + name="urls_test", allowed_domains="example.com", allowed_domain_paths="https://www.example.com" + ) + + +@pytest.fixture(name="sample_item") +def fixture_sample_item() -> SearchGovSpidersItem: """Fixture for a sample item with a URL.""" - item = Item() - item['url'] = "http://example.com" + item = SearchGovSpidersItem() + 
item["url"] = "http://example.com" return item -@pytest.fixture -def pipeline_no_api(mocker): - """Fixture for pipeline with no API URL set.""" - mocker.patch('os.getenv', return_value=None) + +@pytest.fixture(name="mock_open") +def fixture_mock_open(mocker): + return mocker.patch("builtins.open", mocker.mock_open()) + + +@pytest.fixture(name="pipeline_no_api") +def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: + mocker.patch.dict(os.environ, {}) return SearchGovSpidersPipeline() -@pytest.fixture -def pipeline_with_api(mocker): + +@pytest.fixture(name="pipeline_with_api") +def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" - mocker.patch('os.getenv', return_value="http://mockapi.com") + mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) return SearchGovSpidersPipeline() -def test_write_to_file(pipeline_no_api, sample_item, mocker): - """Test that URLs are written to files when SPIDER_URLS_API is not set.""" - mock_open = mocker.patch('open', mocker.mock_open()) - pipeline_no_api.process_item(sample_item, None) +def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider): + """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to - mock_open.assert_called_once_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') - mock_open().write.assert_any_call(sample_item['url'] + "\n") + mock_open.assert_called_once_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open().write.assert_any_call(sample_item["url"] + "\n") -def test_post_to_api(pipeline_with_api, sample_item, mocker): + +def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): """Test that URLs are batched and sent via POST when SPIDER_URLS_API is set.""" - mock_post = mocker.patch('requests.post') + mock_post = mocker.patch("requests.post") - pipeline_with_api.process_item(sample_item, None) + pipeline_with_api.process_item(sample_item, sample_spider) # Check that the batch contains the URL - assert sample_item['url'] in pipeline_with_api.urls_batch + assert sample_item["url"] in pipeline_with_api.urls_batch # Simulate max size to force post - mocker.patch.object(SearchGovSpidersPipeline, '_is_batch_too_large', return_value=True) - pipeline_with_api.process_item(sample_item, None) + mocker.patch.object(SearchGovSpidersPipeline, "_is_batch_too_large", return_value=True) + pipeline_with_api.process_item(sample_item, sample_spider) # Ensure POST request was made mock_post.assert_called_once_with("http://mockapi.com", json={"urls": pipeline_with_api.urls_batch}) -def test_rotate_file(pipeline_no_api, sample_item, mocker): + +def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" - mock_open = mocker.patch('open', mocker.mock_open()) - mock_rename = mocker.patch('os.rename') + mock_rename = mocker.patch("os.rename") - mocker.patch.object(SearchGovSpidersPipeline, '_is_file_too_large', return_value=True) + mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=True) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open.assert_called_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") mock_open().close.assert_called() 
mock_rename.assert_called_once_with( - pipeline_no_api.file_path, - pipeline_no_api.parent_file_path / "output/all-links-1.csv" + pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" ) -def test_post_urls_on_spider_close(pipeline_with_api, mocker): + +def test_post_urls_on_spider_close(pipeline_with_api, sample_spider, mocker): """Test that remaining URLs are posted when spider closes and SPIDER_URLS_API is set.""" - mock_post = mocker.patch('requests.post') + mock_post = mocker.patch("requests.post") pipeline_with_api.urls_batch = ["http://example.com"] - pipeline_with_api.close_spider(None) + pipeline_with_api.close_spider(sample_spider) + + # Ensure POST request was made on spider close, cannot verify json once urls_batch is cleared + mock_post.assert_called_once_with("http://mockapi.com", json=mocker.ANY) - # Ensure POST request was made on spider close - mock_post.assert_called_once_with("http://mockapi.com", json={"urls": ["http://example.com"]}) -def test_close_file_on_spider_close(pipeline_no_api, mocker): +def test_close_file_on_spider_close(pipeline_no_api, mock_open): """Test that the file is closed when the spider closes and no SPIDER_URLS_API is set.""" - mock_open = mocker.patch('open', mocker.mock_open()) pipeline_no_api.close_spider(None) From b8d60219ebe50b8ac6ffa9920c981e1d7cabddd5 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:56:03 -0500 Subject: [PATCH 23/40] fix test looking for a mocked all-files.csv --- tests/search_gov_spiders/test_urls_files_size.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 7735711..1c9a3b0 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import pytest @@ -42,8 +43,9 @@ def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: return SearchGovSpidersPipeline() -def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider): +def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=False) pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to From ae2605e252cebfd2feb730de58673fe8ff61a03f Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 14:38:32 -0500 Subject: [PATCH 24/40] fixed no ui code --- cicd-scripts/helpers/run_without_ui.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh index cabeb3d..1c88afe 100644 --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,3 +1,3 @@ #!/bin/bash - -sudo bash -c 'nohup ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &' +SPIDER_PYTHON_VERSION=3.12 +sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" From f92f12e8c85ea1ccf402f36e46e4ee31e8b57590 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 18:01:34 -0500 Subject: [PATCH 25/40] ... 
--- cicd-scripts/app_install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 6622445..b62abf4 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -128,4 +128,5 @@ new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" # Add the new cron job to the crontab if it's not already present (crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - + echo "Cron job added: $new_cron" From 251c6924bfe8632ff1bfb182e0a5bd00f7ed686a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 11 Dec 2024 12:24:30 -0500 Subject: [PATCH 26/40] fixed executable scripts --- cicd-scripts/app_install.sh | 2 +- cicd-scripts/app_start.sh | 4 +++- cicd-scripts/app_stop.sh | 2 +- cicd-scripts/helpers/kill_scheduler.sh | 7 ++++--- cicd-scripts/helpers/run_with_ui.sh | 5 ++--- cicd-scripts/helpers/run_without_ui.sh | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) mode change 100644 => 100755 cicd-scripts/helpers/kill_scheduler.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index b62abf4..446ed5a 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -10,7 +10,7 @@ if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" else echo "This script must be executed as 'search' user" - exit 1 + return fi # Get missing packages diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 3610c15..2af738a 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,10 +1,12 @@ #!/bin/bash # PUBLIC -RUN_WITH_UI=true +RUN_WITH_UI=false if $RUN_WITH_UI ; then + sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh source ./cicd-scripts/helpers/run_with_ui.sh else + sudo chmod +x ./cicd-scripts/helpers/run_without_ui.sh source ./cicd-scripts/helpers/run_without_ui.sh fi diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 0e39875..3079d15 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -42,7 +42,7 @@ ps -ef | grep nohup | grep -v grep | awk '{print $2}' remove_cron_entry() { if [ -z "$1" ]; then echo "Error: No cron entry provided." - exit 1 + return fi CRON_ENTRY="$1" diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh old mode 100644 new mode 100755 index ddc115b..76b39e6 --- a/cicd-scripts/helpers/kill_scheduler.sh +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -7,19 +7,20 @@ PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") # Check if the process exists if [ -z "$PROCESS_ID" ]; then echo "No running process found for scrapy_scheduler.py." - exit 0 + return fi # Kill the process echo "Killing process with PID: $PROCESS_ID" kill "$PROCESS_ID" +# Pause to allow the process to terminate sleep 3 # Verify if the process was killed -if [ $? -eq 0 ]; then +if ! kill -0 "$PROCESS_ID" 2>/dev/null; then echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." else echo "Failed to terminate the process. Please check manually." - exit 1 + return fi diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh index ed2f400..de181e2 100755 --- a/cicd-scripts/helpers/run_with_ui.sh +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -61,11 +61,11 @@ if check_url "$SCRAPYD_URL"; then echo "The scrapydweb service is running at $SCRAPYDWEB_URL" else echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." - exit 1 + return fi else echo "Error: scrapyd failed at $SCRAPYD_URL." 
- exit 1 + return fi # Display the last few lines of logs @@ -74,4 +74,3 @@ tail -n 10 /var/log/scrapyd.log echo -e "\n-- Last 10 lines of scrapydweb.log:\n" tail -n 10 /var/log/scrapydweb.log -exit 0 diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh index 1c88afe..edf0c3a 100644 --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,3 +1,4 @@ #!/bin/bash SPIDER_PYTHON_VERSION=3.12 sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" +echo "Running no UI vesrion of searchgov-spider..." From 58d5495eda939696935a59d7de75df06cfa47458 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Thu, 12 Dec 2024 12:22:27 -0500 Subject: [PATCH 27/40] ... --- cicd-scripts/app_start.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 2af738a..e790b57 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,9 +1,9 @@ #!/bin/bash # PUBLIC -RUN_WITH_UI=false +SPIDER_RUN_WITH_UI=false -if $RUN_WITH_UI ; then +if $SPIDER_RUN_WITH_UI ; then sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh source ./cicd-scripts/helpers/run_with_ui.sh else From 25321725e342c1f06ce272d8c7c1e32becec6d4c Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:40:59 -0500 Subject: [PATCH 28/40] fixed paths and scripts --- .circleci/config.yml | 0 .codeclimate.yml | 0 .github/dependabot.yml | 0 .github/pull_request_template.md | 0 .gitignore | 0 .pre-commit-config.yaml | 0 LICENSE | 0 README.md | 0 appspec.yml | 2 +- cicd-scripts/app_install.sh | 195 +++++++++--------- cicd-scripts/app_start.sh | 17 +- cicd-scripts/app_stop.sh | 135 ++++++++---- cicd-scripts/helpers/check_cloudwatch.sh | 4 +- cicd-scripts/helpers/check_codedeploy.sh | 4 +- cicd-scripts/helpers/ensure_executable.sh | 15 ++ cicd-scripts/helpers/kill_scheduler.sh | 28 ++- cicd-scripts/helpers/run_without_ui.sh | 8 +- cicd-scripts/helpers/update_pythonpath.sh | 0 pyproject.toml | 0 search_gov_crawler/benchmark.py | 0 search_gov_crawler/output/.gitignore | 0 search_gov_crawler/requirements.txt | 0 search_gov_crawler/scrapy.cfg | 0 search_gov_crawler/scrapy_scheduler.py | 0 search_gov_crawler/scrapyd-logs/.gitignore | 0 search_gov_crawler/scrapyd.conf | 0 search_gov_crawler/scrapydweb_settings_v10.py | 0 .../search_gov_logparser/__init__.py | 0 .../search_gov_scrapyd/__init__.py | 0 .../search_gov_scrapydweb/__init__.py | 0 .../search_gov_spiders/__init__.py | 0 .../reports/email/bases/report/base.jinja | 2 - .../reports/email/bases/report/email.css | 2 - .../reports/email/bases/report/medium.jinja | 0 .../reports/email/bases/report/report.css | 2 +- .../search_gov_spiders/actions/results.css | 0 .../search_gov_spiders/actions/results.jinja | 0 .../search_gov_spiders/extensions/__init__.py | 0 .../extensions/json_logging.py | 0 .../search_gov_spiders/helpers/__init__.py | 0 .../helpers/domain_spider.py | 0 .../search_gov_spiders/items.py | 0 .../search_gov_spiders/middlewares.py | 0 .../search_gov_spiders/monitors.py | 2 +- .../search_gov_spiders/pipelines.py | 0 .../search_gov_spiders/settings.py | 0 .../search_gov_spiders/spiders/__init__.py | 0 .../spiders/domain_spider.py | 0 .../spiders/domain_spider_js.py | 0 .../utility_files/README.md | 0 .../utility_files/crawl-sites.json | 0 .../utility_files/import_plist.py | 0 .../utility_files/init_schedule.py | 0 
.../utility_files/scrutiny-2023-06-20.plist | 0 search_gov_crawler/setup.py | 0 setup.cfg | 0 tests/__init__.py | 0 tests/integration_tests/test_scrapyd.py | 0 tests/search_gov_spiders/conftest.py | 0 .../search_gov_spiders/crawl-sites-test.json | 0 .../scrapy_httpcache/domain_spider.db.bak | 0 .../scrapy_httpcache/domain_spider.db.dat | Bin .../scrapy_httpcache/domain_spider.db.dir | 0 .../scrapy_httpcache/domain_spider_js.db.bak | 0 .../scrapy_httpcache/domain_spider_js.db.dat | Bin .../scrapy_httpcache/domain_spider_js.db.dir | 0 tests/search_gov_spiders/test_extensions.py | 0 tests/search_gov_spiders/test_full_crawl.py | 0 tests/search_gov_spiders/test_helpers.py | 0 tests/search_gov_spiders/test_middlewares.py | 0 tests/search_gov_spiders/test_pipelines.py | 0 .../test_scrapy_scheduler.py | 0 tests/search_gov_spiders/test_spider.py | 0 .../test_urls_files_size.py | 0 .../search_gov_spiders/test_utiliity_files.py | 0 75 files changed, 245 insertions(+), 171 deletions(-) mode change 100644 => 100755 .circleci/config.yml mode change 100644 => 100755 .codeclimate.yml mode change 100644 => 100755 .github/dependabot.yml mode change 100644 => 100755 .github/pull_request_template.md mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .pre-commit-config.yaml mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 appspec.yml mode change 100644 => 100755 cicd-scripts/helpers/check_cloudwatch.sh mode change 100644 => 100755 cicd-scripts/helpers/check_codedeploy.sh create mode 100755 cicd-scripts/helpers/ensure_executable.sh mode change 100644 => 100755 cicd-scripts/helpers/run_without_ui.sh mode change 100644 => 100755 cicd-scripts/helpers/update_pythonpath.sh mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 search_gov_crawler/benchmark.py mode change 100644 => 100755 search_gov_crawler/output/.gitignore mode change 100644 => 100755 search_gov_crawler/requirements.txt mode change 100644 => 100755 search_gov_crawler/scrapy.cfg mode change 100644 => 100755 search_gov_crawler/scrapy_scheduler.py mode change 100644 => 100755 search_gov_crawler/scrapyd-logs/.gitignore mode change 100644 => 100755 search_gov_crawler/scrapyd.conf mode change 100644 => 100755 search_gov_crawler/scrapydweb_settings_v10.py mode change 100644 => 100755 search_gov_crawler/search_gov_logparser/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_scrapyd/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_scrapydweb/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/results.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/results.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/extensions/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/extensions/json_logging.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/helpers/__init__.py mode change 
100644 => 100755 search_gov_crawler/search_gov_spiders/helpers/domain_spider.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/items.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/middlewares.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/monitors.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/pipelines.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/settings.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/domain_spider.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/README.md mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/import_plist.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist mode change 100644 => 100755 search_gov_crawler/setup.py mode change 100644 => 100755 setup.cfg mode change 100644 => 100755 tests/__init__.py mode change 100644 => 100755 tests/integration_tests/test_scrapyd.py mode change 100644 => 100755 tests/search_gov_spiders/conftest.py mode change 100644 => 100755 tests/search_gov_spiders/crawl-sites-test.json mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir mode change 100644 => 100755 tests/search_gov_spiders/test_extensions.py mode change 100644 => 100755 tests/search_gov_spiders/test_full_crawl.py mode change 100644 => 100755 tests/search_gov_spiders/test_helpers.py mode change 100644 => 100755 tests/search_gov_spiders/test_middlewares.py mode change 100644 => 100755 tests/search_gov_spiders/test_pipelines.py mode change 100644 => 100755 tests/search_gov_spiders/test_scrapy_scheduler.py mode change 100644 => 100755 tests/search_gov_spiders/test_spider.py mode change 100644 => 100755 tests/search_gov_spiders/test_urls_files_size.py mode change 100644 => 100755 tests/search_gov_spiders/test_utiliity_files.py diff --git a/.circleci/config.yml b/.circleci/config.yml old mode 100644 new mode 100755 diff --git a/.codeclimate.yml b/.codeclimate.yml old mode 100644 new mode 100755 diff --git a/.github/dependabot.yml b/.github/dependabot.yml old mode 100644 new mode 100755 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/appspec.yml b/appspec.yml old mode 100644 new mode 
100755 index ccb3c07..87244e0 --- a/appspec.yml +++ b/appspec.yml @@ -2,7 +2,7 @@ version: 0.0 os: linux permissions: - object: . - mode: 755 + mode: 777 acls: - "d:u::rwx" - "d:g::rwx" diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 446ed5a..9b9c8df 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,132 +1,131 @@ #!/bin/bash -# Kill all spider services (if running) -echo "Running app_stop.sh" -sudo chmod +x ./cicd-scripts/app_stop.sh -source ./cicd-scripts/app_stop.sh - -# CICD scripts can only runas 'search' user on AWS -if [ "$(whoami)" = "search" ]; then - echo "Executing cicd scripts as 'search' user" -else - echo "This script must be executed as 'search' user" - return -fi - -# Get missing packages -sudo apt-get install lzma -sudo apt-get install liblzma-dev -yes | sudo apt-get install libbz2-dev - -# Start AWS CloudWatch agent -sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh -source ./cicd-scripts/helpers/check_cloudwatch.sh - -# Start AWS CodeDeploy agent -sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh -source ./cicd-scripts/helpers/check_codedeploy.sh - -# PUBLIC -SPIDER_PYTHON_VERSION=3.12 +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh -# PRIVATE +### VARIABLES ### +SPIDER_PYTHON_VERSION=3.12 _CURRENT_BUILD_DIR=${PWD} +VENV_DIR=./venv -# Update and upgrade the system without prompting for confirmation -sudo apt-get update -y -sudo apt-get upgrade -y -sudo apt install acl -y - -# Required to give all app_* bash scripts read/write permissions to self and parent. -# Give current directory and all its files rw permissions -sudo chmod -R 755 . -# All new files/directories will inherit rwx (required when installing and using sqllite) -sudo setfacl -Rdm g:dgsearch:rwx . +### FUNCTIONS ### +# Stop spider services +stop_services() { + echo "Running app_stop.sh..." + ensure_executable "./cicd-scripts/app_stop.sh" +} -# Install necessary system dependencies -sudo apt-get install -y python-setuptools +# Install missing system dependencies +install_system_dependencies() { + echo "Installing system dependencies..." + sudo apt-get update -y + sudo apt-get install -y \ + lzma liblzma-dev libbz2-dev python-setuptools \ + acl build-essential checkinstall libreadline-dev \ + libncursesw5-dev libssl-dev libsqlite3-dev tk-dev \ + libgdbm-dev libc6-dev zlib1g-dev libffi-dev openssl +} +# Install Python install_python() { - echo "Installing ${SPIDER_PYTHON_VERSION}" - sudo apt-get install -y build-essential checkinstall libreadline-dev \ - libncursesw5-dev libssl-dev libsqlite3-dev \ - tk-dev libgdbm-dev libc6-dev libbz2-dev \ - zlib1g-dev openssl libffi-dev - - # Download Python source code + echo "Installing Python ${SPIDER_PYTHON_VERSION}..." cd /usr/src - sudo wget https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz - - # Build and install Python cd Python-${SPIDER_PYTHON_VERSION}.0 sudo ./configure --enable-optimizations sudo make altinstall + cd "$_CURRENT_BUILD_DIR" + echo "Python ${SPIDER_PYTHON_VERSION} installed successfully." +} - # Return to the build directory - cd $_CURRENT_BUILD_DIR +# Check and install Python if needed +check_python() { + if ! 
command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then + install_python + else + echo "Python ${SPIDER_PYTHON_VERSION} already installed: $(python${SPIDER_PYTHON_VERSION} --version)" + fi +} - echo "Python ${SPIDER_PYTHON_VERSION} has been installed." +# Set environment paths +update_pythonpath() { + ensure_executable "./cicd-scripts/helpers/update_pythonpath.sh" } -# Check if Python is installed -if command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then - echo "Python ${SPIDER_PYTHON_VERSION} is already installed: $(python${SPIDER_PYTHON_VERSION} --version)" -else - echo "Python ${SPIDER_PYTHON_VERSION} is not installed. Installing Python ${SPIDER_PYTHON_VERSION}..." - install_python -fi +# Setup virtual environment +setup_virtualenv() { + echo "Setting up virtual environment..." + python${SPIDER_PYTHON_VERSION} -m venv "$VENV_DIR" + source "$VENV_DIR/bin/activate" + python -m pip install --upgrade pip +} -# Set PYTHONPATH env -source ./cicd-scripts/helpers/update_pythonpath.sh +# Install dependencies +install_dependencies() { + echo "Installing dependencies..." + python -m pip install --upgrade -r ./search_gov_crawler/requirements.txt + echo "Installing Playwright..." + python -m pip install --upgrade pytest-playwright playwright + playwright install --with-deps + deactivate +} -# Use venv with Python 3.12 -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip +# Configure permissions +configure_permissions() { + echo "Configuring file permissions..." + sudo chmod -R 777 . + sudo chown -R "$(whoami)" . + sudo setfacl -Rdm g:dgsearch:rwx . +} -# Create a virtual environment using Python -echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv +# Manage cron jobs +manage_cron_jobs() { + echo "Managing cron jobs..." + crontab -l | grep -v 'app_start.sh' > temp_cron || true + echo "@reboot $(pwd)/cicd-scripts/app_start.sh" >> temp_cron + crontab temp_cron + rm temp_cron + echo "Cron jobs updated." +} -# Activate the virtual environment -source ./venv/bin/activate +# Start monitoring agents +start_agents() { + echo "Starting AWS CloudWatch agent..." + ensure_executable "./cicd-scripts/helpers/check_cloudwatch.sh" -# Install all spider dependencies -echo "Installing dependencies..." -sudo pip install --force-reinstall -r ./search_gov_crawler/requirements.txt -sudo pip install pytest-playwright playwright -U -playwright install + echo "Starting AWS CodeDeploy agent..." + ensure_executable "./cicd-scripts/helpers/check_codedeploy.sh" +} -echo "Dependencies installed." +### SCRIPT EXECUTION ### +# Stop running services +stop_services -# Remove any outstanding app_start.sh reboot cronjobs -echo "Removing any app_start.sh reboot cron jobs..." -crontab -l > cron_backup.bak +# Install system dependencies +install_system_dependencies -# Remove lines containing 'app_start.sh' and update crontab -crontab -l | grep -v 'app_start.sh' > cron_backup_filtered +# Check and install Python if missing +check_python -# Check if there are changes -if cmp -s cron_backup_filtered cron_backup.bak; then - echo "No cron jobs with 'app_start.sh' found." -else - sudo crontab cron_backup_filtered - echo "Cron jobs containing 'app_start.sh' have been removed." 
-fi +# Set environment paths +update_pythonpath -# Clean up temporary files -rm cron_backup_filtered cron_backup.bak +# Configure permissions +configure_permissions -# Add cron job to run the app back up on ec2 restart -echo "Adding app_start.sh reboot cron job..." -sudo chmod +x ./cicd-scripts/app_start.sh +# Setup and activate virtual environment +setup_virtualenv -# Define the new cron job -new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" +# Install dependencies +install_dependencies -# Add the new cron job to the crontab if it's not already present -(crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - +# Start AWS agents +start_agents +# Manage cron jobs +manage_cron_jobs -echo "Cron job added: $new_cron" +echo "App installation completed successfully." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index e790b57..889b511 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,12 +1,17 @@ #!/bin/bash -# PUBLIC +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh + +# TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false -if $SPIDER_RUN_WITH_UI ; then - sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh - source ./cicd-scripts/helpers/run_with_ui.sh +# Determine which script to run based on the SPIDER_RUN_WITH_UI flag +if $SPIDER_RUN_WITH_UI; then + SCRIPT="./cicd-scripts/helpers/run_with_ui.sh" else - sudo chmod +x ./cicd-scripts/helpers/run_without_ui.sh - source ./cicd-scripts/helpers/run_without_ui.sh + SCRIPT="./cicd-scripts/helpers/run_without_ui.sh" fi + +# Ensure the script exists, is executable, and run it +ensure_executable "$SCRIPT" diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 3079d15..ea1cb66 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,58 +1,115 @@ #!/bin/bash -# Clear all cache -echo "Purge all pip cache..." -# We can't do `$pip cache purge`, this does the samething: -sudo rm -r ~/.cache/pip -sudo rm -rf /root/.cache/pip - -# Kill scrapy schedular (if running): -echo "Stopping scrapy_scheduler.py (if running)" -sudo chmod +x ./cicd-scripts/helpers/kill_scheduler.sh -source ./cicd-scripts/helpers/kill_scheduler.sh - -echo "Stopping all scrapyd and scrapydweb tasks..." -# Kill all scrapydweb and scrapyd jobs -if sudo pkill -f "scrapydweb" 2>/dev/null; then - echo "scrapydweb tasks stopped." -else - echo "No scrapydweb tasks running." -fi - -if sudo pkill -f "scrapyd" 2>/dev/null; then - echo "scrapyd tasks stopped." -else - echo "No scrapyd tasks running." -fi +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh -# Display remaining scrapy processes (if any) -echo -e "\nRemaining scrapy processes (if any):" -ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." +### FUNCTIONS ### + +# Remove virtual environment if it exists +remove_venv() { + if [ -d ./venv ]; then + echo "Removing virtual environment..." + rm -rf ./venv/ + fi +} -# Force kill any remaning scrapy background jobs still running -sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 +# Purge pip cache +purge_pip_cache() { + echo "Purging pip cache..." + rm -rf ~/.cache/pip /root/.cache/pip +} + +# Stop scrapy scheduler if running +stop_scrapy_scheduler() { + echo "Stopping scrapy_scheduler.py (if running)..." 
+ ensure_executable "./cicd-scripts/helpers/kill_scheduler.sh" +} -# Kill all nohup jobs (runs with python) -ps -ef | grep nohup | grep -v grep | awk '{print $2}' +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks() { + echo "Stopping all scrapyd and scrapydweb tasks..." -# Remove other deploy cron jobs: -#!/bin/bash + # Kill scrapydweb tasks + if pkill -f "scrapydweb" 2>/dev/null; then + echo "scrapydweb tasks stopped." + else + echo "No scrapydweb tasks running." + fi + + # Kill scrapyd tasks + if pkill -f "scrapyd" 2>/dev/null; then + echo "scrapyd tasks stopped." + else + echo "No scrapyd tasks running." + fi +} + +# Display remaining scrapy processes +display_remaining_scrapy_processes() { + echo -e "\nRemaining scrapy processes (if any):" + ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." +} + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs() { + echo "Force killing remaining scrapy background jobs..." + if ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9; then + echo "Remaining scrapy jobs killed." + else + echo "No remaining scrapy jobs to kill." + fi +} + +# Remove nohup jobs (python scripts) +remove_nohup_jobs() { + echo "Removing nohup jobs (python)..." + ps -ef | grep nohup | grep -v grep | awk '{print $2}' | xargs kill -9 +} -# Function to remove crontab entries referencing a given cron entry string +# Remove cron job entries referencing the given string remove_cron_entry() { if [ -z "$1" ]; then echo "Error: No cron entry provided." return fi - CRON_ENTRY="$1" + local CRON_ENTRY="$1" + local CRON_USER=$(whoami) - # Remove entries referencing the script - sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab - + echo "Removing cron job entries referencing: $CRON_ENTRY" - echo "Removed any crontab entries referencing $CRON_ENTRY." + # Remove cron job for the current user (including the full path if needed) + sudo crontab -l -u "$CRON_USER" 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab -u "$CRON_USER" - + + echo "Cron job entries for '$CRON_ENTRY' removed." } -# Remove any other cron job entries +### SCRIPT EXECUTION ### + +# Remove virtual environment +remove_venv + +# Purge pip cache +purge_pip_cache + +# Stop scrapy scheduler if running +stop_scrapy_scheduler + +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks + +# Display remaining scrapy processes (if any) +display_remaining_scrapy_processes + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs + +# Remove nohup jobs (python) +remove_nohup_jobs + +# Remove specific cron jobs remove_cron_entry "check_cloudwatch.sh" remove_cron_entry "check_codedeploy.sh" +remove_cron_entry "app_start.sh" + +echo "App stop completed successfully." 
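A quick manual check that the stop sequence above left nothing behind might look like this (sketch only, reusing the process names and cron entries the script already targets):

  # Any scrapy, scrapyd, scrapydweb, or scheduler processes still alive?
  pgrep -af scrapy || echo "no scrapy processes remain"
  pgrep -af scrapy_scheduler.py || echo "scheduler is stopped"
  # The helper cron entries should have been removed as well
  crontab -l 2>/dev/null | grep -E "check_cloudwatch|check_codedeploy|app_start" || echo "no leftover cron entries"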
diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh old mode 100644 new mode 100755 index 18e4870..29bb1db --- a/cicd-scripts/helpers/check_cloudwatch.sh +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -18,10 +18,10 @@ check_cloudwatch() { # Ensure the script is added to crontab for execution on reboot setup_cron() { sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh - CRON_ENTRY="@reboot /bin/bash $PWD/cicd-scripts/helpers/check_cloudwatch.sh" + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_cloudwatch.sh" # Update crontab, ensuring no duplicates - (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + (crontab -l 2>/dev/null | grep -v -F "check_cloudwatch.sh"; echo "$CRON_ENTRY") | crontab - echo "Crontab entry added to ensure the script runs on reboot." } diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh old mode 100644 new mode 100755 index 98731dc..2faaca1 --- a/cicd-scripts/helpers/check_codedeploy.sh +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -18,10 +18,10 @@ check_codedeploy() { # Ensure the script is added to crontab for execution on reboot setup_cron() { sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh - CRON_ENTRY="@reboot /bin/bash $PWD/helpers/check_codedeploy.sh" + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_codedeploy.sh" # Update crontab, ensuring no duplicates - (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + (crontab -l 2>/dev/null | grep -v -F "check_codedeploy.sh"; echo "$CRON_ENTRY") | crontab - echo "Crontab entry added to ensure the script runs on reboot." } diff --git a/cicd-scripts/helpers/ensure_executable.sh b/cicd-scripts/helpers/ensure_executable.sh new file mode 100755 index 0000000..88e6439 --- /dev/null +++ b/cicd-scripts/helpers/ensure_executable.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Function to ensure a file exists, is executable, and then runs it +ensure_executable() { + local script="$1" + + if [ -f "$script" ]; then + chmod +x "$script" + echo "$script is now executable." + source "$script" + else + echo "Error: $script not found!" + # exit 1 + fi +} diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh index 76b39e6..4c559da 100755 --- a/cicd-scripts/helpers/kill_scheduler.sh +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -4,23 +4,21 @@ echo "Searching for scrapy_scheduler.py process..." PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") -# Check if the process exists -if [ -z "$PROCESS_ID" ]; then +# Check if the process ID was found +if [ -n "$PROCESS_ID" ]; then echo "No running process found for scrapy_scheduler.py." - return -fi -# Kill the process -echo "Killing process with PID: $PROCESS_ID" -kill "$PROCESS_ID" + # Kill the process + echo "Killing process with PID: $PROCESS_ID" + kill "$PROCESS_ID" 2>/dev/null -# Pause to allow the process to terminate -sleep 3 + # Pause to allow the process to terminate + sleep 3 -# Verify if the process was killed -if ! kill -0 "$PROCESS_ID" 2>/dev/null; then - echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." -else - echo "Failed to terminate the process. Please check manually." - return + # Verify if the process was killed + if ! kill -0 "$PROCESS_ID" 2>/dev/null; then + echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." + else + echo "Failed to terminate the process or process no longer exists." 
+ fi fi diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh old mode 100644 new mode 100755 index edf0c3a..247488b --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,4 +1,8 @@ #!/bin/bash -SPIDER_PYTHON_VERSION=3.12 -sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" + +# Run the script in the background using the virtual environment +chmod +x ./search_gov_crawler/scrapy_scheduler.py + +sudo nohup bash -c "source ./venv/bin/activate && ./venv/bin/python ./search_gov_crawler/scrapy_scheduler.py" > /var/log/scrapy_scheduler.log 2>&1 & + echo "Running no UI vesrion of searchgov-spider..." diff --git a/cicd-scripts/helpers/update_pythonpath.sh b/cicd-scripts/helpers/update_pythonpath.sh old mode 100644 new mode 100755 diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 diff --git a/search_gov_crawler/benchmark.py b/search_gov_crawler/benchmark.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/output/.gitignore b/search_gov_crawler/output/.gitignore old mode 100644 new mode 100755 diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapy.cfg b/search_gov_crawler/scrapy.cfg old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapy_scheduler.py b/search_gov_crawler/scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapyd-logs/.gitignore b/search_gov_crawler/scrapyd-logs/.gitignore old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapyd.conf b/search_gov_crawler/scrapyd.conf old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapydweb_settings_v10.py b/search_gov_crawler/scrapydweb_settings_v10.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_logparser/__init__.py b/search_gov_crawler/search_gov_logparser/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapyd/__init__.py b/search_gov_crawler/search_gov_scrapyd/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapydweb/__init__.py b/search_gov_crawler/search_gov_scrapydweb/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/__init__.py b/search_gov_crawler/search_gov_spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja old mode 100644 new mode 100755 index 656c94b..47f6bff --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja @@ -23,5 +23,3 @@ - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css old mode 100644 new mode 100755 index 2f13050..7d63694 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css @@ -83,5 +83,3 @@ table{max-width:100%;background-color:transparent;border-collapse:collapse;borde .icon,.icon-big {display:inline-block;} .icon 
{width:34px;height:34px;} .icon-big {width:140px;height:140px;} - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css old mode 100644 new mode 100755 index bd1e39d..124bd07 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css @@ -24,4 +24,4 @@ table.report-container td {padding: 40px 20px;} .report-section h2 {margin: 0 0 20px 0;padding: 0 0 12px 0;line-height: 20px;border-bottom: 1px solid #f4f4f4;} .report-section h3 {margin: 25px 0 5px 0;line-height: 24px;} .report-section h4 {margin: 0 0 2px 0;} -.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} \ No newline at end of file +.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} diff --git a/search_gov_crawler/search_gov_spiders/actions/results.css b/search_gov_crawler/search_gov_spiders/actions/results.css old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/results.jinja b/search_gov_crawler/search_gov_spiders/actions/results.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/__init__.py b/search_gov_crawler/search_gov_spiders/extensions/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/json_logging.py b/search_gov_crawler/search_gov_spiders/extensions/json_logging.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/__init__.py b/search_gov_crawler/search_gov_spiders/helpers/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py b/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/items.py b/search_gov_crawler/search_gov_spiders/items.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/middlewares.py b/search_gov_crawler/search_gov_spiders/middlewares.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/monitors.py b/search_gov_crawler/search_gov_spiders/monitors.py old mode 100644 new mode 100755 index 260dd94..7dafd2a --- a/search_gov_crawler/search_gov_spiders/monitors.py +++ b/search_gov_crawler/search_gov_spiders/monitors.py @@ -14,4 +14,4 @@ class PeriodicMonitorSuite(MonitorSuite): monitors_failed_actions = [ CreateCustomFileReport, SendSmtpEmail - ] \ No newline at end of file + ] diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/settings.py b/search_gov_crawler/search_gov_spiders/settings.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/__init__.py b/search_gov_crawler/search_gov_spiders/spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py 
b/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py b/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/README.md b/search_gov_crawler/search_gov_spiders/utility_files/README.md old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json b/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py b/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py b/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist b/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist old mode 100644 new mode 100755 diff --git a/search_gov_crawler/setup.py b/search_gov_crawler/setup.py old mode 100644 new mode 100755 diff --git a/setup.cfg b/setup.cfg old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/integration_tests/test_scrapyd.py b/tests/integration_tests/test_scrapyd.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/conftest.py b/tests/search_gov_spiders/conftest.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/crawl-sites-test.json b/tests/search_gov_spiders/crawl-sites-test.json old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_extensions.py b/tests/search_gov_spiders/test_extensions.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_helpers.py b/tests/search_gov_spiders/test_helpers.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_middlewares.py b/tests/search_gov_spiders/test_middlewares.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_pipelines.py b/tests/search_gov_spiders/test_pipelines.py old mode 100644 
new mode 100755 diff --git a/tests/search_gov_spiders/test_scrapy_scheduler.py b/tests/search_gov_spiders/test_scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_spider.py b/tests/search_gov_spiders/test_spider.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_utiliity_files.py b/tests/search_gov_spiders/test_utiliity_files.py old mode 100644 new mode 100755 From 721f77fc5054f2b9c093b49a14569649b5f3b3a3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:46:13 -0500 Subject: [PATCH 29/40] fix --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 889b511..31e9353 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,7 +1,7 @@ #!/bin/bash chmod +x ./cicd-scripts/helpers/ensure_executable.sh -source ./cicd-scripts/helpers/ensure_executable.sh +./cicd-scripts/helpers/ensure_executable.sh # TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false From 866fbd40f1d3ed3dd96a0d8d960ac7312b8ff2f2 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:50:48 -0500 Subject: [PATCH 30/40] ... --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 31e9353..8a8710e 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,5 +1,5 @@ #!/bin/bash - +echo "###" $(pwd) chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh From 9a528f10fdff59c37c6298fd8c9ec5a68eb8b451 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:13:53 -0500 Subject: [PATCH 31/40] set root path --- cicd-scripts/app_install.sh | 3 +++ cicd-scripts/app_start.sh | 5 ++++- cicd-scripts/app_stop.sh | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 9b9c8df..d19f00d 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,8 @@ #!/bin/bash +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 8a8710e..a0fa24c 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,5 +1,8 @@ #!/bin/bash -echo "###" $(pwd) + +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index ea1cb66..9d536a8 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,5 +1,8 @@ #!/bin/bash +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh From 3f43ad8b879ff600e2356e70e336343380b96631 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:18:04 -0500 Subject: [PATCH 32/40] ... 
--- cicd-scripts/app_start.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index a0fa24c..5e3e45b 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -3,6 +3,8 @@ # CD into the current script directory (which != $pwd) cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ +echo "### $pwd" + chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh From 2841b49fcdae0bca1191f268a3bf3023b8b746ff Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:18:14 -0500 Subject: [PATCH 33/40] ... --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5e3e45b..eb99ddb 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -6,7 +6,7 @@ cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ echo "### $pwd" chmod +x ./cicd-scripts/helpers/ensure_executable.sh -./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh # TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false From a1ba2da143a2c260eda7482015694edbf02742dd Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:42:11 -0500 Subject: [PATCH 34/40] ... --- cicd-scripts/app_install.sh | 14 ++++++++------ cicd-scripts/app_start.sh | 2 -- cicd-scripts/helpers/check_cloudwatch.sh | 2 +- cicd-scripts/helpers/check_codedeploy.sh | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index d19f00d..1a61dee 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -34,11 +34,13 @@ install_system_dependencies() { install_python() { echo "Installing Python ${SPIDER_PYTHON_VERSION}..." cd /usr/src - sudo wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz - sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz + wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz cd Python-${SPIDER_PYTHON_VERSION}.0 - sudo ./configure --enable-optimizations - sudo make altinstall + ./configure --enable-optimizations + make + make install + make altinstall cd "$_CURRENT_BUILD_DIR" echo "Python ${SPIDER_PYTHON_VERSION} installed successfully." } @@ -78,8 +80,8 @@ install_dependencies() { # Configure permissions configure_permissions() { echo "Configuring file permissions..." - sudo chmod -R 777 . - sudo chown -R "$(whoami)" . + chmod -R 777 . + chown -R "$(whoami)" . sudo setfacl -Rdm g:dgsearch:rwx . 
} diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index eb99ddb..76b0081 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -3,8 +3,6 @@ # CD into the current script directory (which != $pwd) cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ -echo "### $pwd" - chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh index 29bb1db..487d122 100755 --- a/cicd-scripts/helpers/check_cloudwatch.sh +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -17,7 +17,7 @@ check_cloudwatch() { # Ensure the script is added to crontab for execution on reboot setup_cron() { - sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh + chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_cloudwatch.sh" # Update crontab, ensuring no duplicates diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh index 2faaca1..6e6cf15 100755 --- a/cicd-scripts/helpers/check_codedeploy.sh +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -17,7 +17,7 @@ check_codedeploy() { # Ensure the script is added to crontab for execution on reboot setup_cron() { - sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh + chmod +x ./cicd-scripts/helpers/check_codedeploy.sh CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_codedeploy.sh" # Update crontab, ensuring no duplicates From 43c8df0dd256f5f3aae1932ced6ac8372da4acce Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 12:39:53 -0500 Subject: [PATCH 35/40] test unit tests --- cicd-scripts/app_install.sh | 1 + tests/search_gov_spiders/test_urls_files_size.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 1a61dee..5179308 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -36,6 +36,7 @@ install_python() { cd /usr/src wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo chown -R $(whoami) ./Python-${SPIDER_PYTHON_VERSION}.0 cd Python-${SPIDER_PYTHON_VERSION}.0 ./configure --enable-optimizations make diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 1c9a3b0..2c683c0 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -33,6 +33,7 @@ def fixture_mock_open(mocker): @pytest.fixture(name="pipeline_no_api") def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: mocker.patch.dict(os.environ, {}) + mocker.patch('os.getpid', return_value=1234) return SearchGovSpidersPipeline() @@ -40,12 +41,12 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) + mocker.patch('os.getpid', return_value=1234) return SearchGovSpidersPipeline() def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" - mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=False) 
pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to @@ -74,11 +75,10 @@ def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" mock_rename = mocker.patch("os.rename") - mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=True) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open.assert_called_with(pipeline_no_api.base_file_name, "a", encoding="utf-8") mock_open().close.assert_called() mock_rename.assert_called_once_with( pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" From 2638509ba155e689d225638ed1798cd2f4edba13 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:47:17 -0500 Subject: [PATCH 36/40] fixes to tests --- .../test_urls_files_size.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 2c683c0..f2de164 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,10 +1,9 @@ import os -from pathlib import Path import pytest - from scrapy import Spider from scrapy.utils.test import get_crawler + from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline @@ -33,7 +32,7 @@ def fixture_mock_open(mocker): @pytest.fixture(name="pipeline_no_api") def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: mocker.patch.dict(os.environ, {}) - mocker.patch('os.getpid', return_value=1234) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) return SearchGovSpidersPipeline() @@ -41,16 +40,17 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) - mocker.patch('os.getpid', return_value=1234) + mocker.patch("os.getpid", return_value=1234) return SearchGovSpidersPipeline() def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mocker.patch.object(SearchGovSpidersPipeline, "_file_size", return_value=100) pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to - mock_open.assert_called_once_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open.assert_called_once_with(pipeline_no_api.file_path, "a", encoding="utf-8") mock_open().write.assert_any_call(sample_item["url"] + "\n") @@ -64,7 +64,11 @@ def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): assert sample_item["url"] in pipeline_with_api.urls_batch # Simulate max size to force post - mocker.patch.object(SearchGovSpidersPipeline, "_is_batch_too_large", return_value=True) + mocker.patch.object( + SearchGovSpidersPipeline, + "_batch_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) pipeline_with_api.process_item(sample_item, sample_spider) # Ensure POST request was made @@ -74,15 
+78,17 @@ def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" mock_rename = mocker.patch("os.rename") - + mocker.patch.object( + SearchGovSpidersPipeline, + "_file_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, "a", encoding="utf-8") + mock_open.assert_called_with(pipeline_no_api.file_path, "a", encoding="utf-8") mock_open().close.assert_called() - mock_rename.assert_called_once_with( - pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" - ) + mock_rename.assert_called_once() def test_post_urls_on_spider_close(pipeline_with_api, sample_spider, mocker): From 7e9c9350c48a119b7b5a68190ff3d86e90384225 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:12:05 -0500 Subject: [PATCH 37/40] use same method for both mocks --- tests/search_gov_spiders/test_urls_files_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index f2de164..319b547 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -40,7 +40,8 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) - mocker.patch("os.getpid", return_value=1234) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) + return SearchGovSpidersPipeline() From c2ac070c769cf0f7ee06e3887ad009328e2b707a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 15:37:49 -0500 Subject: [PATCH 38/40] added dedup test for full coverage --- .../test_deduplicator_pipeline.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/search_gov_spiders/test_deduplicator_pipeline.py diff --git a/tests/search_gov_spiders/test_deduplicator_pipeline.py b/tests/search_gov_spiders/test_deduplicator_pipeline.py new file mode 100644 index 0000000..86d9ee0 --- /dev/null +++ b/tests/search_gov_spiders/test_deduplicator_pipeline.py @@ -0,0 +1,117 @@ +import os +import pytest +from unittest.mock import MagicMock, patch +from scrapy.exceptions import DropItem +from search_gov_crawler.search_gov_spiders.pipelines import ( + SearchGovSpidersPipeline, + DeDeuplicatorPipeline, +) + +# --------------------------- +# Fixtures +# --------------------------- + +@pytest.fixture +def sample_item(): + """Fixture for a valid sample item.""" + return {"url": "http://example.com"} + +@pytest.fixture +def invalid_item(): + """Fixture for an invalid item with no URL.""" + return {} + +@pytest.fixture +def sample_spider(): + """Fixture for a mock spider with a logger.""" + class SpiderMock: + logger = MagicMock() + return SpiderMock() + +@pytest.fixture +def pipeline_no_api(): + """Fixture for SearchGovSpidersPipeline with no SPIDER_URLS_API.""" + with patch.dict(os.environ, {}, clear=True): + return SearchGovSpidersPipeline() + +@pytest.fixture +def pipeline_with_api(): + """Fixture for 
SearchGovSpidersPipeline with SPIDER_URLS_API set.""" + with patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}): + return SearchGovSpidersPipeline() + +@pytest.fixture +def deduplicator_pipeline(): + """Fixture for DeDeuplicatorPipeline with clean state.""" + return DeDeuplicatorPipeline() + +# --------------------------- +# Tests for SearchGovSpidersPipeline +# --------------------------- + +def test_missing_url_in_item(pipeline_no_api, sample_spider, invalid_item): + """ + Verify DropItem exception is raised when an item has no URL. + """ + with pytest.raises(DropItem, match="Missing URL in item"): + pipeline_no_api.process_item(invalid_item, sample_spider) + +# --------------------------- +# Tests for DeDeuplicatorPipeline +# --------------------------- + +@pytest.mark.parametrize( + "item", + [ + {"url": "http://example.com/1"}, + {"url": "http://example.com/2"}, + ], +) +def test_deduplicator_pipeline_unique_items(deduplicator_pipeline, item): + """ + Verify that unique items are processed successfully. + """ + result = deduplicator_pipeline.process_item(item, None) + assert result == item + + +def test_deduplicator_pipeline_duplicate_item(deduplicator_pipeline, sample_item): + """ + Verify that duplicate items raise DropItem. + """ + deduplicator_pipeline.process_item(sample_item, None) # First time should pass + + with pytest.raises(DropItem, match="Item already seen!"): + deduplicator_pipeline.process_item(sample_item, None) # Duplicate raises DropItem + + +def test_deduplicator_pipeline_multiple_items(deduplicator_pipeline): + """ + Verify that multiple unique items are processed without errors. + """ + item1 = {"url": "http://example.com/1"} + item2 = {"url": "http://example.com/2"} + + result1 = deduplicator_pipeline.process_item(item1, None) + result2 = deduplicator_pipeline.process_item(item2, None) + + assert result1 == item1 + assert result2 == item2 + + +def test_deduplicator_pipeline_clean_state(): + """ + Verify that a new instance of DeDeuplicatorPipeline starts with a clean state. + """ + pipeline1 = DeDeuplicatorPipeline() + pipeline2 = DeDeuplicatorPipeline() + + item = {"url": "http://example.com/1"} + + # First pipeline processes the item + result = pipeline1.process_item(item, None) + assert result == item + + # Second pipeline should also process the same item as it has a clean state + result = pipeline2.process_item(item, None) + assert result == item From 0db90b22e48ce01ad70d8157fe9a4edc2a90f675 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 18 Dec 2024 11:21:48 -0500 Subject: [PATCH 39/40] code feedback and unit tests --- README.md | 2 +- __init__.py | 0 search_gov_crawler/__init__.py | 0 search_gov_crawler/requirements.txt | 1 - tests/search_gov_spiders/test_full_crawl.py | 12 ++---------- 5 files changed, 3 insertions(+), 12 deletions(-) create mode 100644 __init__.py create mode 100644 search_gov_crawler/__init__.py diff --git a/README.md b/README.md index bd7c2e0..94b760c 100755 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Make sure to run `pip install -r requirements.txt` and `playwright install` befo 1. Navigate to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory 2. 
Enter one of two following commands: - * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in the “FEEDS” variable of the [*settings.py*](search_gov_crawler/search_gov_spiders/settings.py) file: + * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in the `search_gov_crawler/search_gov_spiders/pipelines.py`: $ scrapy runspider diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search_gov_crawler/__init__.py b/search_gov_crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt index f0f56e0..19790be 100755 --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -10,5 +10,4 @@ scrapy-playwright==0.0.42 scrapyd==1.5.0 scrapyd-client==2.0.0 scrapydweb @ git+https://github.com/GSA/searchgov-scrapydweb -spidermon [monitoring] == 1.22.0 spidermon[monitoring]==1.22.0 diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py index 1bfd892..58e24ab 100755 --- a/tests/search_gov_spiders/test_full_crawl.py +++ b/tests/search_gov_spiders/test_full_crawl.py @@ -94,18 +94,10 @@ def test_full_crawl(mock_scrapy_settings, monkeypatch, spider, use_dedup, crawl_ temp_dir.joinpath("output").mkdir(exist_ok=True) def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): - # pipeline_cls.current_file_size = 0 - # pipeline_cls.file_number = 1 - # pipeline_cls.parent_file_path = temp_dir - # pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") - # pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") - # pipeline_cls.max_file_size = 3900 - # pipeline_cls.paginate = True - pipeline_cls.api_url = None pipeline_cls.file_number = 1 pipeline_cls.parent_file_path = temp_dir - pipeline_cls.base_file_name = temp_dir / "output" / "all-links.csv" + pipeline_cls.base_file_name = temp_dir / "output" / "all-links-p1234.csv" pipeline_cls.file_path = pipeline_cls.base_file_name pipeline_cls.current_file = open(pipeline_cls.file_path, "w", encoding="utf-8") @@ -122,7 +114,7 @@ def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): with open(output_file.name, encoding="UTF") as f: links = json.load(f) - split_files = list(temp_dir.glob("all-links*.csv")) + split_files = list(temp_dir.glob("all-links-p*.csv")) # verify total links match expected assert len(links) == expected_results From 0b05f3f74d0f7928c639663fd5f4cd33ed2c8270 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 18 Dec 2024 11:55:24 -0500 Subject: [PATCH 40/40] optimized unit tests --- .../test_deduplicator_pipeline.py | 31 +++++++++++++++- tests/search_gov_spiders/test_pipelines.py | 36 ------------------- 2 files changed, 30 insertions(+), 37 deletions(-) delete mode 100755 tests/search_gov_spiders/test_pipelines.py diff --git a/tests/search_gov_spiders/test_deduplicator_pipeline.py b/tests/search_gov_spiders/test_deduplicator_pipeline.py index 86d9ee0..55ee3e2 100644 --- a/tests/search_gov_spiders/test_deduplicator_pipeline.py +++ b/tests/search_gov_spiders/test_deduplicator_pipeline.py @@ -1,12 +1,13 @@ import os import pytest +from contextlib import suppress from unittest.mock import MagicMock, patch from 
scrapy.exceptions import DropItem from search_gov_crawler.search_gov_spiders.pipelines import ( SearchGovSpidersPipeline, DeDeuplicatorPipeline, ) - +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem # --------------------------- # Fixtures # --------------------------- @@ -115,3 +116,31 @@ def test_deduplicator_pipeline_clean_state(): # Second pipeline should also process the same item as it has a clean state result = pipeline2.process_item(item, None) assert result == item + +@pytest.mark.parametrize( + ("items", "urls_seen_length"), + [ + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/2"), + ], + 2, + ), + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/1"), + ], + 1, + ), + ], +) +def test_deduplicator_pipeline(items, urls_seen_length): + pl = DeDeuplicatorPipeline() + + with suppress(DropItem): + for item in items: + pl.process_item(item, None) + + assert len(pl.urls_seen) == urls_seen_length diff --git a/tests/search_gov_spiders/test_pipelines.py b/tests/search_gov_spiders/test_pipelines.py deleted file mode 100755 index 0b85135..0000000 --- a/tests/search_gov_spiders/test_pipelines.py +++ /dev/null @@ -1,36 +0,0 @@ -from contextlib import suppress - -import pytest -from scrapy.exceptions import DropItem - -from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem -from search_gov_crawler.search_gov_spiders.pipelines import DeDeuplicatorPipeline - - -@pytest.mark.parametrize( - ("items", "urls_seen_length"), - [ - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - SearchGovSpidersItem(url="https://www.example.com/2"), - ], - 2, - ), - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - SearchGovSpidersItem(url="https://www.example.com/1"), - ], - 1, - ), - ], -) -def test_deduplicator_pipeline(items, urls_seen_length): - pl = DeDeuplicatorPipeline() - - with suppress(DropItem): - for item in items: - pl.process_item(item, None) - - assert len(pl.urls_seen) == urls_seen_length
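
With the duplicate-URL tests consolidated into test_deduplicator_pipeline.py and test_pipelines.py removed, the suite can be exercised locally before a deploy. A minimal sketch, assuming pytest and pytest-mock are installed in the virtual environment created by app_install.sh (./venv) and that the commands are run from the repository root:

    # Activate the project virtualenv and run only the deduplicator pipeline tests
    source ./venv/bin/activate
    pytest tests/search_gov_spiders/test_deduplicator_pipeline.py -v

    # Or run the full spider test package
    pytest tests/search_gov_spiders -v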