From 0488698e493fd783f96abd06d0ce60de683043f3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 4 Sep 2024 13:47:59 -0400 Subject: [PATCH 01/40] added build spec --- buildspec.yaml | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 buildspec.yaml diff --git a/buildspec.yaml b/buildspec.yaml new file mode 100644 index 0000000..ba7d924 --- /dev/null +++ b/buildspec.yaml @@ -0,0 +1,40 @@ +version: 0.2 + +phases: + install: + runtime-versions: + python: 3.12 + commands: + - echo "Setting up Python virtual environment..." + - python -m venv venv + - source venv/bin/activate + - echo "Installing dependencies..." + - pip install --upgrade pip + - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt + + pre_build: + commands: + - echo "Pre-build phase - preparing environment..." + + build: + commands: + - echo "Running the domain_spider" + - cd ./search_gov_crawler + - pkill scrapy + - scrapy crawl domain_spider + - echo "Finished all non-js domain_spider urls" + - scrapy crawl domain_spider_js + - echo "Finished all js domain_spider urls" + + post_build: + commands: + - echo "Build completed!" + - deactivate + +artifacts: + files: + - '**/*' + +cache: + paths: + - '/root/.cache/pip' From da78798b19f6d884336c2095c7c63edcbd91459a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 4 Sep 2024 13:50:23 -0400 Subject: [PATCH 02/40] Changed name --- buildspec.yaml => buildspec_spider.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename buildspec.yaml => buildspec_spider.yml (100%) diff --git a/buildspec.yaml b/buildspec_spider.yml similarity index 100% rename from buildspec.yaml rename to buildspec_spider.yml From 67f19c95f61ca88751b4493e6e123eb1417e0a84 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 9 Sep 2024 12:05:58 -0400 Subject: [PATCH 03/40] added appspec file --- .pre-commit-config.yaml | 1 - appspec.yml | 34 +++++++++++++++++++++ cicd-scripts/app_install.sh | 60 +++++++++++++++++++++++++++++++++++++ cicd-scripts/app_start.sh | 19 ++++++++++++ cicd-scripts/app_stop.sh | 10 +++++++ 5 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 appspec.yml create mode 100644 cicd-scripts/app_install.sh create mode 100644 cicd-scripts/app_start.sh create mode 100644 cicd-scripts/app_stop.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e056019..10b1cfe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,6 @@ repos: - id: check-added-large-files # prevents giant files from being committed. - id: check-case-conflict # checks for files that would conflict in case-insensitive filesystems. - id: check-json # checks json files for parseable syntax. - - id: check-shebang-scripts-are-executable # ensures that (non-binary) files with a shebang are executable. - id: check-merge-conflict # checks for files that contain merge conflict strings. - id: check-symlinks # checks for symlinks which do not point to anything. - id: check-yaml # checks yaml files for parseable syntax. diff --git a/appspec.yml b/appspec.yml new file mode 100644 index 0000000..09660cb --- /dev/null +++ b/appspec.yml @@ -0,0 +1,34 @@ +version: 0.0 +os: linux +files: + - source: . 
+ destination: /home/ec2-user/spider +permissions: + - object: /home/ec2-user/spider/cicd-scripts/app_install.sh + owner: search + mode: 755 + type: + - file + - object: /home/ec2-user/spider/cicd-scripts/app_start.sh + owner: search + mode: 755 + type: + - file + - object: /home/ec2-user/spider/cicd-scripts/app_stop.sh + owner: search + mode: 755 + type: + - file +hooks: + AfterInstall: + - location: spider/cicd-scripts/app_install.sh + timeout: 300 + runas: search + ApplicationStart: + - location: spider/cicd-scripts/app_start.sh + timeout: 300 + runas: search + ApplicationStop: + - location: spider/cicd-scripts/app_stop.sh + timeout: 300 + runas: search diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh new file mode 100644 index 0000000..c1382d3 --- /dev/null +++ b/cicd-scripts/app_install.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +sudo apt-get update +sudo apt-get upgrade +sudo apt-get install python-setuptools +sudo apt-get install python-pip + +# Function to install Python 3.12 +install_python() { + echo "Installing Python 3.12..." + sudo apt update + sudo apt install -y build-essential checkinstall + sudo apt install -y libreadline-gplv2-dev libncursesw5-dev libssl-dev \ + libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + zlib1g-dev openssl libffi-dev + + # Download Python 3.12 source code + cd /usr/src + sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz + sudo tar xzf Python-3.12.0.tgz + + # Build and install + cd Python-3.12.0 + sudo ./configure --enable-optimizations + sudo make altinstall + + echo "Python 3.12 has been installed." +} + +# Check if Python 3.12 is installed +python_version=$(python3 --version 2>&1) + +if [[ $python_version == *"3.12"* ]]; then + echo "Python 3.12 is already installed: $python_version" +else + echo "Current Python version: $python_version" + echo "Installing Python 3.12..." + install_python +fi + + + +# Creating python3.12 virtual env + +pip install virtualenv + +cd /home/ec2-user/python-flask-service + +echo "Creating python3.12 virtual environment..." +python3.12 -m venv /home/ec2-user/app/venv +source /home/ec2-user/app/venv/bin/activate + +# Installing all spider dependencies +echo "Installing dependencies..." +pip install --upgrade pip + +# pip install -r /home/ec2-user/app/requirements.txt +pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt + +echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh new file mode 100644 index 0000000..8e67ce7 --- /dev/null +++ b/cicd-scripts/app_start.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +echo "Activating virtual environment..." +source /home/ec2-user/app/venv/bin/activate + +echo "Running searchgov-spider application..." +# python /home/ec2-user/app/app.py + +pkill scrapy + + +nohup scrapy crawl domain_spider & +nohup scrapy crawl domain_spider_js & + +echo "\nCurrent running crawl jobs:" +jobs + +echo "\noutput:" +cat nohup.out diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh new file mode 100644 index 0000000..03b209a --- /dev/null +++ b/cicd-scripts/app_stop.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Killing all scrapy tasks" +pkill scrapy + +# Kill all background jobs (by PID) +jobs -p | grep -o -E '\s\d+\s' | xargs kill + +echo "\nBelow jobs list should be empty:" +jobs From 33c09b30c41e89491e4523d53f7882de256c0c65 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Sep 2024 11:52:19 -0400 Subject: [PATCH 04/40] ... 
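
Remove the unused CodeBuild buildspec (buildspec_spider.yml) and point app_install.sh and app_start.sh at the /home/ec2-user/spider deployment path in place of the old python-flask-service and /home/ec2-user/app locations; also drop the venv activation and the commented-out app.py invocation from app_start.sh.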
--- buildspec_spider.yml | 40 ------------------------------------- cicd-scripts/app_install.sh | 7 +++---- cicd-scripts/app_start.sh | 5 ----- 3 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 buildspec_spider.yml diff --git a/buildspec_spider.yml b/buildspec_spider.yml deleted file mode 100644 index ba7d924..0000000 --- a/buildspec_spider.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: 0.2 - -phases: - install: - runtime-versions: - python: 3.12 - commands: - - echo "Setting up Python virtual environment..." - - python -m venv venv - - source venv/bin/activate - - echo "Installing dependencies..." - - pip install --upgrade pip - - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt - - pre_build: - commands: - - echo "Pre-build phase - preparing environment..." - - build: - commands: - - echo "Running the domain_spider" - - cd ./search_gov_crawler - - pkill scrapy - - scrapy crawl domain_spider - - echo "Finished all non-js domain_spider urls" - - scrapy crawl domain_spider_js - - echo "Finished all js domain_spider urls" - - post_build: - commands: - - echo "Build completed!" - - deactivate - -artifacts: - files: - - '**/*' - -cache: - paths: - - '/root/.cache/pip' diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index c1382d3..f34e980 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -44,17 +44,16 @@ fi pip install virtualenv -cd /home/ec2-user/python-flask-service +cd /home/ec2-user/spider echo "Creating python3.12 virtual environment..." -python3.12 -m venv /home/ec2-user/app/venv -source /home/ec2-user/app/venv/bin/activate +python3.12 -m venv /home/ec2-user/spider/venv +source /home/ec2-user/spider/venv/bin/activate # Installing all spider dependencies echo "Installing dependencies..." pip install --upgrade pip -# pip install -r /home/ec2-user/app/requirements.txt pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 8e67ce7..0f181f2 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,14 +1,9 @@ #!/bin/bash -echo "Activating virtual environment..." -source /home/ec2-user/app/venv/bin/activate - echo "Running searchgov-spider application..." 
-# python /home/ec2-user/app/app.py pkill scrapy - nohup scrapy crawl domain_spider & nohup scrapy crawl domain_spider_js & From 06bab351fa50da83902a36bf789fae5e94e83cfe Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Sep 2024 12:25:45 -0400 Subject: [PATCH 05/40] Resolved conflicts from multiple merged branches --- appspec.yml | 6 ++--- cicd-scripts/app_install.sh | 47 ++++++++++++++++++------------------- cicd-scripts/app_start.sh | 27 +++++++++++++++------ cicd-scripts/app_stop.sh | 26 ++++++++++++++++---- 4 files changed, 67 insertions(+), 39 deletions(-) diff --git a/appspec.yml b/appspec.yml index 09660cb..0220a3d 100644 --- a/appspec.yml +++ b/appspec.yml @@ -21,14 +21,14 @@ permissions: - file hooks: AfterInstall: - - location: spider/cicd-scripts/app_install.sh + - location: /home/ec2-user/spider/cicd-scripts/app_install.sh timeout: 300 runas: search ApplicationStart: - - location: spider/cicd-scripts/app_start.sh + - location: /home/ec2-user/spider/cicd-scripts/app_start.sh timeout: 300 runas: search ApplicationStop: - - location: spider/cicd-scripts/app_stop.sh + - location: /home/ec2-user/spider/cicd-scripts/app_stop.sh timeout: 300 runas: search diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index f34e980..5c6577d 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,25 +1,26 @@ #!/bin/bash -sudo apt-get update -sudo apt-get upgrade -sudo apt-get install python-setuptools -sudo apt-get install python-pip +# Update and upgrade the system without prompting for confirmation +sudo apt-get update -y +sudo apt-get upgrade -y + +# Install necessary system dependencies +sudo apt-get install -y python-setuptools python-pip # Function to install Python 3.12 install_python() { echo "Installing Python 3.12..." - sudo apt update - sudo apt install -y build-essential checkinstall - sudo apt install -y libreadline-gplv2-dev libncursesw5-dev libssl-dev \ - libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ - zlib1g-dev openssl libffi-dev + sudo apt-get install -y build-essential checkinstall libreadline-dev \ + libncursesw5-dev libssl-dev libsqlite3-dev \ + tk-dev libgdbm-dev libc6-dev libbz2-dev \ + zlib1g-dev openssl libffi-dev # Download Python 3.12 source code cd /usr/src sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz sudo tar xzf Python-3.12.0.tgz - # Build and install + # Build and install Python 3.12 cd Python-3.12.0 sudo ./configure --enable-optimizations sudo make altinstall @@ -28,32 +29,30 @@ install_python() { } # Check if Python 3.12 is installed -python_version=$(python3 --version 2>&1) - -if [[ $python_version == *"3.12"* ]]; then - echo "Python 3.12 is already installed: $python_version" +if command -v python3.12 &>/dev/null; then + echo "Python 3.12 is already installed: $(python3.12 --version)" else - echo "Current Python version: $python_version" - echo "Installing Python 3.12..." + echo "Python 3.12 is not installed. Installing Python 3.12..." install_python fi +# Install virtualenv using Python 3.12's pip +sudo /usr/local/bin/python3.12 -m pip install --upgrade pip +sudo /usr/local/bin/python3.12 -m pip install virtualenv - -# Creating python3.12 virtual env - -pip install virtualenv - +# Navigate to the spider directory cd /home/ec2-user/spider +# Create a virtual environment using Python 3.12 echo "Creating python3.12 virtual environment..." 
-python3.12 -m venv /home/ec2-user/spider/venv +/usr/local/bin/python3.12 -m venv /home/ec2-user/spider/venv + +# Activate the virtual environment source /home/ec2-user/spider/venv/bin/activate -# Installing all spider dependencies +# Install all spider dependencies echo "Installing dependencies..." pip install --upgrade pip - pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 0f181f2..c96f4a9 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -2,13 +2,26 @@ echo "Running searchgov-spider application..." -pkill scrapy +# Kill existing scrapy processes started by this script +pkill -f "scrapy crawl domain_spider" +pkill -f "scrapy crawl domain_spider_js" -nohup scrapy crawl domain_spider & -nohup scrapy crawl domain_spider_js & +# Start the scrapy crawlers and redirect their outputs to separate files +nohup scrapy crawl domain_spider > domain_spider.log 2>&1 & +PID1=$! +echo "Started domain_spider with PID $PID1" -echo "\nCurrent running crawl jobs:" -jobs +nohup scrapy crawl domain_spider_js > domain_spider_js.log 2>&1 & +PID2=$! +echo "Started domain_spider_js with PID $PID2" -echo "\noutput:" -cat nohup.out +# Display currently running scrapy processes +echo -e "\nCurrent running scrapy processes:" +ps -ef | grep scrapy | grep -v grep + +# Display the last few lines of the logs +echo -e "\nLast few lines of domain_spider.log:" +tail -n 10 domain_spider.log + +echo -e "\nLast few lines of domain_spider_js.log:" +tail -n 10 domain_spider_js.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 03b209a..ed24e51 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,10 +1,26 @@ #!/bin/bash -echo "Killing all scrapy tasks" -pkill scrapy +echo "Stopping all scrapy tasks..." -# Kill all background jobs (by PID) -jobs -p | grep -o -E '\s\d+\s' | xargs kill +# Kill specific scrapy processes +pkill -f "scrapy crawl domain_spider" +pkill -f "scrapy crawl domain_spider_js" -echo "\nBelow jobs list should be empty:" +# Display remaining scrapy processes (if any) +echo -e "\nRemaining scrapy processes (if any):" +ps -ef | grep scrapy | grep -v grep + +# Check if there are any jobs still running (if started by this shell) +bg_jobs=$(jobs -p) + +if [[ -n "$bg_jobs" ]]; then + echo "Killing all background jobs..." + # Kill all background jobs in this shell session + jobs -p | xargs kill +else + echo "No background jobs to kill." +fi + +# List background jobs to confirm they are terminated +echo -e "\nBelow jobs list should be empty:" jobs From f64f2504614c685f706423dddbd205bebc55129e Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:23:16 -0700 Subject: [PATCH 06/40] Update appspec.yml --- appspec.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/appspec.yml b/appspec.yml index 0220a3d..54c7793 100644 --- a/appspec.yml +++ b/appspec.yml @@ -1,34 +1,34 @@ version: 0.0 os: linux -files: - - source: . - destination: /home/ec2-user/spider +# files: +# - source: . 
+# destination: /home/ec2-user/spider permissions: - - object: /home/ec2-user/spider/cicd-scripts/app_install.sh + - object: cicd-scripts/app_install.sh owner: search mode: 755 type: - file - - object: /home/ec2-user/spider/cicd-scripts/app_start.sh + - object: cicd-scripts/app_start.sh owner: search mode: 755 type: - file - - object: /home/ec2-user/spider/cicd-scripts/app_stop.sh + - object: cicd-scripts/app_stop.sh owner: search mode: 755 type: - file hooks: AfterInstall: - - location: /home/ec2-user/spider/cicd-scripts/app_install.sh + - location: cicd-scripts/app_install.sh timeout: 300 runas: search ApplicationStart: - - location: /home/ec2-user/spider/cicd-scripts/app_start.sh + - location: cicd-scripts/app_start.sh timeout: 300 runas: search ApplicationStop: - - location: /home/ec2-user/spider/cicd-scripts/app_stop.sh + - location: cicd-scripts/app_stop.sh timeout: 300 runas: search From db95d54a3fc1d9b8f95dd1ea7a9a142016797eab Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Mon, 16 Sep 2024 09:29:27 -0700 Subject: [PATCH 07/40] Update appspec.yml --- appspec.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/appspec.yml b/appspec.yml index 54c7793..4ffe90f 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,22 +3,22 @@ os: linux # files: # - source: . # destination: /home/ec2-user/spider -permissions: - - object: cicd-scripts/app_install.sh - owner: search - mode: 755 - type: - - file - - object: cicd-scripts/app_start.sh - owner: search - mode: 755 - type: - - file - - object: cicd-scripts/app_stop.sh - owner: search - mode: 755 - type: - - file +# permissions: +# - object: cicd-scripts/app_install.sh +# owner: search +# mode: 755 +# type: +# - file +# - object: cicd-scripts/app_start.sh +# owner: search +# mode: 755 +# type: +# - file +# - object: cicd-scripts/app_stop.sh +# owner: search +# mode: 755 +# type: +# - file hooks: AfterInstall: - location: cicd-scripts/app_install.sh From 3634607e4a10a248f44be1c9f654b4d1a3d5e788 Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:06:51 -0700 Subject: [PATCH 08/40] Update appspec.yml --- appspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appspec.yml b/appspec.yml index 4ffe90f..a8a0180 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,12 +3,12 @@ os: linux # files: # - source: . # destination: /home/ec2-user/spider -# permissions: -# - object: cicd-scripts/app_install.sh -# owner: search -# mode: 755 -# type: -# - file +permissions: + - object: cicd-scripts + owner: search + mode: 755 + type: + - directory # - object: cicd-scripts/app_start.sh # owner: search # mode: 755 From 7d061ccbd046d05900b23cbfd689ba7a2e735938 Mon Sep 17 00:00:00 2001 From: sb-ebukaanene <98557321+sb-ebukaanene@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:08:05 -0700 Subject: [PATCH 09/40] Update appspec.yml --- appspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appspec.yml b/appspec.yml index a8a0180..576d4b1 100644 --- a/appspec.yml +++ b/appspec.yml @@ -3,12 +3,12 @@ os: linux # files: # - source: . 
# destination: /home/ec2-user/spider -permissions: - - object: cicd-scripts - owner: search - mode: 755 - type: - - directory +# permissions: +# - object: cicd-scripts +# owner: search +# mode: 755 +# type: +# - directory # - object: cicd-scripts/app_start.sh # owner: search # mode: 755 From b6c97e357dbabfca8c3358a8084a341ee4613c21 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 23 Sep 2024 05:10:20 -0400 Subject: [PATCH 10/40] Fixed start and stop scripts --- cicd-scripts/app_start.sh | 103 +++++++++++++++++++++++++++++++------- cicd-scripts/app_stop.sh | 22 +++++--- 2 files changed, 100 insertions(+), 25 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index c96f4a9..5104b95 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,27 +1,94 @@ #!/bin/bash -echo "Running searchgov-spider application..." +SCRAPYD_URL="http://127.0.0.1:6800/" +SCRAPYDWEB_URL="http://127.0.0.1:5000/" +CICD_SCRIPTS_BASE_DIR=$(pwd) + +# Function to check if a URL is up and running +function check_url() { + local URL=$1 + local MAX_ATTEMPTS=3 + local DELAY=5 + local attempt=1 + + while [ $attempt -le $MAX_ATTEMPTS ]; do + if curl --output /dev/null --silent --head --fail "$URL"; then + echo "Service at $URL is up on attempt $attempt." + return 0 + else + echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." + fi + attempt=$((attempt+1)) + sleep $DELAY + done + + echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." + return 1 +} + +# Function to combine current directory with subdirectory and return absolute path +function get_abs_path() { + local base_dir="$CICD_SCRIPTS_BASE_DIR" + local sub_dir="$1" + + if [[ "$sub_dir" == /* ]]; then + echo "$sub_dir" + else + echo "$base_dir/$sub_dir" + fi +} -# Kill existing scrapy processes started by this script -pkill -f "scrapy crawl domain_spider" -pkill -f "scrapy crawl domain_spider_js" +# Function to check if required command exists +function check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 is not installed or not in your PATH." + exit 1 + fi +} + +check_command "scrapyd" +check_command "scrapydweb" +check_command "curl" + +scrapyd_dir=$(get_abs_path "../") +scrapydweb_dir=$(get_abs_path "../search_gov_crawler") + +echo "Killing any existing scrapyd and scrapydweb services" +pkill -f "scrapydweb" 2>/dev/null +pkill -f "scrapyd" 2>/dev/null + +echo "Running searchgov-spider application..." -# Start the scrapy crawlers and redirect their outputs to separate files -nohup scrapy crawl domain_spider > domain_spider.log 2>&1 & +# Start scrapyd +echo "Starting scrapyd service..." +cd "$scrapyd_dir" +nohup scrapyd > /var/log/scrapyd.log 2>&1 & PID1=$! -echo "Started domain_spider with PID $PID1" +echo "Started scrapyd with PID $PID1" -nohup scrapy crawl domain_spider_js > domain_spider_js.log 2>&1 & -PID2=$! -echo "Started domain_spider_js with PID $PID2" +# Check if scrapyd is running +if check_url "$SCRAPYD_URL"; then + echo "The scrapyd service is running at $SCRAPYD_URL" + cd "$scrapydweb_dir" + nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + PID2=$! + echo "Started scrapydweb with PID $PID2" -# Display currently running scrapy processes -echo -e "\nCurrent running scrapy processes:" -ps -ef | grep scrapy | grep -v grep + if check_url "$SCRAPYDWEB_URL"; then + echo "The scrapydweb service is running at $SCRAPYDWEB_URL" + else + echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." 
+ exit 1 + fi +else + echo "Error: scrapyd failed at $SCRAPYD_URL." + exit 1 +fi -# Display the last few lines of the logs -echo -e "\nLast few lines of domain_spider.log:" -tail -n 10 domain_spider.log +# Display the last few lines of logs +echo -e "\n-- Last 10 lines of scrapyd.log:\n" +tail -n 10 /var/log/scrapyd.log -echo -e "\nLast few lines of domain_spider_js.log:" -tail -n 10 domain_spider_js.log +echo -e "\n-- Last 10 lines of scrapydweb.log:\n" +tail -n 10 /var/log/scrapydweb.log +exit 0 diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index ed24e51..9073c8e 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,22 +1,30 @@ #!/bin/bash -echo "Stopping all scrapy tasks..." +echo "Stopping all scrapyd and scrapydweb tasks..." +# pkill for scrapydweb and scrapyd +if pkill -f "scrapydweb" 2>/dev/null; then + echo "scrapydweb tasks stopped." +else + echo "No scrapydweb tasks running." +fi -# Kill specific scrapy processes -pkill -f "scrapy crawl domain_spider" -pkill -f "scrapy crawl domain_spider_js" +if pkill -f "scrapyd" 2>/dev/null; then + echo "scrapyd tasks stopped." +else + echo "No scrapyd tasks running." +fi # Display remaining scrapy processes (if any) echo -e "\nRemaining scrapy processes (if any):" -ps -ef | grep scrapy | grep -v grep +ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." -# Check if there are any jobs still running (if started by this shell) +# Check for any background jobs still running bg_jobs=$(jobs -p) if [[ -n "$bg_jobs" ]]; then echo "Killing all background jobs..." # Kill all background jobs in this shell session - jobs -p | xargs kill + jobs -p | xargs -r kill else echo "No background jobs to kill." fi From ef63ab0d04969c70bbe1f9f7226f40250756a8b6 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 24 Sep 2024 10:55:47 -0400 Subject: [PATCH 11/40] Added ansible workaround --- cicd-scripts/app_install.sh | 53 ++++++++++++++++++++++++------------- cicd-scripts/app_start.sh | 13 ++++++--- cicd-scripts/app_stop.sh | 22 ++++----------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 5c6577d..dc5c508 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,23 @@ #!/bin/bash + +# A hack to get the environment running without ansible local env variables +# This block of code will eventually be removed once ansible task is completed +SPIDER_PYTHON_VERSION=3.12 +SPIDER_STAGING_URLS_API=https://staging.search.usa.gov/urls +spider_local_path=/etc/profile.d/spider_local.sh + +# Writing environment variables to the profile file +echo " +export SPIDER_PYTHON_VERSION=${SPIDER_PYTHON_VERSION} +export SPIDER_STAGING_URLS_API=${SPIDER_STAGING_URLS_API} +" | tee "$spider_local_path" > /dev/null + +# Source the script to update the current shell's environment +source "$spider_local_path" +### TODO: Remove the above code block after ansible is fully implmented + + # Update and upgrade the system without prompting for confirmation sudo apt-get update -y sudo apt-get upgrade -y @@ -7,45 +25,44 @@ sudo apt-get upgrade -y # Install necessary system dependencies sudo apt-get install -y python-setuptools python-pip -# Function to install Python 3.12 install_python() { - echo "Installing Python 3.12..." 
+ echo "Installing ${SPIDER_PYTHON_VERSION}" sudo apt-get install -y build-essential checkinstall libreadline-dev \ libncursesw5-dev libssl-dev libsqlite3-dev \ tk-dev libgdbm-dev libc6-dev libbz2-dev \ zlib1g-dev openssl libffi-dev - # Download Python 3.12 source code + # Download Python source code cd /usr/src - sudo wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz - sudo tar xzf Python-3.12.0.tgz + sudo wget https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz - # Build and install Python 3.12 - cd Python-3.12.0 + # Build and install Python + cd Python-${SPIDER_PYTHON_VERSION}.0 sudo ./configure --enable-optimizations sudo make altinstall - echo "Python 3.12 has been installed." + echo "Python ${SPIDER_PYTHON_VERSION} has been installed." } -# Check if Python 3.12 is installed -if command -v python3.12 &>/dev/null; then - echo "Python 3.12 is already installed: $(python3.12 --version)" +# Check if Python is installed +if command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then + echo "Python ${SPIDER_PYTHON_VERSION} is already installed: $(python${SPIDER_PYTHON_VERSION} --version)" else - echo "Python 3.12 is not installed. Installing Python 3.12..." + echo "Python ${SPIDER_PYTHON_VERSION} is not installed. Installing Python ${SPIDER_PYTHON_VERSION}..." install_python fi -# Install virtualenv using Python 3.12's pip -sudo /usr/local/bin/python3.12 -m pip install --upgrade pip -sudo /usr/local/bin/python3.12 -m pip install virtualenv +# Install virtualenv using Python pip +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Navigate to the spider directory cd /home/ec2-user/spider -# Create a virtual environment using Python 3.12 -echo "Creating python3.12 virtual environment..." -/usr/local/bin/python3.12 -m venv /home/ec2-user/spider/venv +# Create a virtual environment using Python +echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." +/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv /home/ec2-user/spider/venv # Activate the virtual environment source /home/ec2-user/spider/venv/bin/activate diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5104b95..7319165 100644 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -54,15 +54,15 @@ scrapyd_dir=$(get_abs_path "../") scrapydweb_dir=$(get_abs_path "../search_gov_crawler") echo "Killing any existing scrapyd and scrapydweb services" -pkill -f "scrapydweb" 2>/dev/null -pkill -f "scrapyd" 2>/dev/null +sudo pkill -f "scrapydweb" 2>/dev/null +sudo pkill -f "scrapyd" 2>/dev/null echo "Running searchgov-spider application..." # Start scrapyd echo "Starting scrapyd service..." cd "$scrapyd_dir" -nohup scrapyd > /var/log/scrapyd.log 2>&1 & +sudo nohup scrapyd > /var/log/scrapyd.log 2>&1 & PID1=$! echo "Started scrapyd with PID $PID1" @@ -70,7 +70,7 @@ echo "Started scrapyd with PID $PID1" if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" cd "$scrapydweb_dir" - nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + sudo nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & PID2=$! 
echo "Started scrapydweb with PID $PID2" @@ -85,6 +85,11 @@ else exit 1 fi +# Add startup cron for this script: +echo " +export LATEST_SPIDER_CICD_DEPLOY_PATH=$(CICD_SCRIPTS_BASE_DIR) +" | tee '/etc/profile.d/spider_env.sh' > /dev/null + # Display the last few lines of logs echo -e "\n-- Last 10 lines of scrapyd.log:\n" tail -n 10 /var/log/scrapyd.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 9073c8e..7b1e941 100644 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,14 +1,14 @@ #!/bin/bash echo "Stopping all scrapyd and scrapydweb tasks..." -# pkill for scrapydweb and scrapyd -if pkill -f "scrapydweb" 2>/dev/null; then +# Kill all scrapydweb and scrapyd jobs +if sudo pkill -f "scrapydweb" 2>/dev/null; then echo "scrapydweb tasks stopped." else echo "No scrapydweb tasks running." fi -if pkill -f "scrapyd" 2>/dev/null; then +if sudo pkill -f "scrapyd" 2>/dev/null; then echo "scrapyd tasks stopped." else echo "No scrapyd tasks running." @@ -18,17 +18,5 @@ fi echo -e "\nRemaining scrapy processes (if any):" ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." -# Check for any background jobs still running -bg_jobs=$(jobs -p) - -if [[ -n "$bg_jobs" ]]; then - echo "Killing all background jobs..." - # Kill all background jobs in this shell session - jobs -p | xargs -r kill -else - echo "No background jobs to kill." -fi - -# List background jobs to confirm they are terminated -echo -e "\nBelow jobs list should be empty:" -jobs +# Force kill any remaning scrapy background jobs still running +sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 From 44034fc2141e55f706cc3bcf2241fc7dcbf3ba6d Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 30 Sep 2024 12:27:20 -0400 Subject: [PATCH 12/40] Fixed cron jobs --- appspec.yml | 31 ++++++++-------------- cicd-scripts/app_install.sh | 53 +++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/appspec.yml b/appspec.yml index 576d4b1..ccb3c07 100644 --- a/appspec.yml +++ b/appspec.yml @@ -1,28 +1,19 @@ version: 0.0 os: linux -# files: -# - source: . -# destination: /home/ec2-user/spider -# permissions: -# - object: cicd-scripts -# owner: search -# mode: 755 -# type: -# - directory -# - object: cicd-scripts/app_start.sh -# owner: search -# mode: 755 -# type: -# - file -# - object: cicd-scripts/app_stop.sh -# owner: search -# mode: 755 -# type: -# - file +permissions: + - object: . + mode: 755 + acls: + - "d:u::rwx" + - "d:g::rwx" + - "d:o::rwx" + owner: search + type: + - directory hooks: AfterInstall: - location: cicd-scripts/app_install.sh - timeout: 300 + timeout: 600 runas: search ApplicationStart: - location: cicd-scripts/app_start.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index dc5c508..f8ea203 100644 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,20 @@ #!/bin/bash +# CICD scripts can only runas 'search' user on AWS +if [ "$(whoami)" == "search" ]; then + echo "Executing cicd scripts as 'search' user" +else + echo "This script must be executed as 'search' user" + return 1 +fi + +sudo apt install acl -y + +# Required to give all app_* bash scripts read/write permissions to self and parent. +# Give current directory and all its files rw permissions +sudo chmod -R 755 . +# All new files/directories will inherit rwx (required when installing and using sqllite) +sudo setfacl -Rdm g:dgsearch:rwx . 
# A hack to get the environment running without ansible local env variables # This block of code will eventually be removed once ansible task is completed @@ -57,15 +72,12 @@ fi sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv -# Navigate to the spider directory -cd /home/ec2-user/spider - # Create a virtual environment using Python echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." -/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv /home/ec2-user/spider/venv +/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv # Activate the virtual environment -source /home/ec2-user/spider/venv/bin/activate +source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." @@ -73,3 +85,34 @@ pip install --upgrade pip pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." + + +# Remove any outstanding app_start.sh reboot cronjobs +echo "Removing any app_start.sh reboot cron jobs..." +crontab -l > cron_backup.bak + +# Remove lines containing 'app_start.sh' and update crontab +crontab -l | grep -v 'app_start.sh' > cron_backup_filtered + +# Check if there are changes +if cmp -s cron_backup_filtered cron_backup.bak; then + echo "No cron jobs with 'app_start.sh' found." +else + crontab cron_backup_filtered + echo "Cron jobs containing 'app_start.sh' have been removed." +fi + +# Clean up temporary files +rm cron_backup_filtered cron_backup.bak + +# Add cron job to run the app back up on ec2 restart +echo "Adding app_start.sh reboot cron job..." +sudo chmod +x ./cicd-scripts/app_start.sh + +# Define the new cron job +new_cron="@reboot at now + 1 min -f ${pwd}/cicd-scripts/app_start.sh" + +# Add the new cron job to the crontab if it's not already present +(crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - + +echo "Cron job added: $new_cron" From ef4e3f5724acb22066819aaa332db69ca0c18590 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 7 Oct 2024 15:56:28 -0400 Subject: [PATCH 13/40] ... --- cicd-scripts/app_install.sh | 41 ++++++++++++------------------------- cicd-scripts/app_start.sh | 39 ++++++++++++----------------------- cicd-scripts/app_stop.sh | 0 3 files changed, 26 insertions(+), 54 deletions(-) mode change 100644 => 100755 cicd-scripts/app_install.sh mode change 100644 => 100755 cicd-scripts/app_start.sh mode change 100644 => 100755 cicd-scripts/app_stop.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh old mode 100644 new mode 100755 index f8ea203..c81f3e9 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,13 +1,18 @@ #!/bin/bash # CICD scripts can only runas 'search' user on AWS -if [ "$(whoami)" == "search" ]; then +if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" else echo "This script must be executed as 'search' user" - return 1 + exit 1 fi +SPIDER_PYTHON_VERSION=3.12 + +# Update and upgrade the system without prompting for confirmation +sudo apt-get update -y +sudo apt-get upgrade -y sudo apt install acl -y # Required to give all app_* bash scripts read/write permissions to self and parent. @@ -16,29 +21,9 @@ sudo chmod -R 755 . # All new files/directories will inherit rwx (required when installing and using sqllite) sudo setfacl -Rdm g:dgsearch:rwx . 
-# A hack to get the environment running without ansible local env variables -# This block of code will eventually be removed once ansible task is completed -SPIDER_PYTHON_VERSION=3.12 -SPIDER_STAGING_URLS_API=https://staging.search.usa.gov/urls -spider_local_path=/etc/profile.d/spider_local.sh - -# Writing environment variables to the profile file -echo " -export SPIDER_PYTHON_VERSION=${SPIDER_PYTHON_VERSION} -export SPIDER_STAGING_URLS_API=${SPIDER_STAGING_URLS_API} -" | tee "$spider_local_path" > /dev/null - -# Source the script to update the current shell's environment -source "$spider_local_path" -### TODO: Remove the above code block after ansible is fully implmented - - -# Update and upgrade the system without prompting for confirmation -sudo apt-get update -y -sudo apt-get upgrade -y # Install necessary system dependencies -sudo apt-get install -y python-setuptools python-pip +sudo apt-get install -y python-setuptools install_python() { echo "Installing ${SPIDER_PYTHON_VERSION}" @@ -73,8 +58,8 @@ sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Create a virtual environment using Python -echo "Creating pytho${SPIDER_PYTHON_VERSION} virtual environment..." -/usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv +echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." +sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv # Activate the virtual environment source ./venv/bin/activate @@ -82,7 +67,7 @@ source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." pip install --upgrade pip -pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt echo "Dependencies installed." @@ -98,7 +83,7 @@ crontab -l | grep -v 'app_start.sh' > cron_backup_filtered if cmp -s cron_backup_filtered cron_backup.bak; then echo "No cron jobs with 'app_start.sh' found." else - crontab cron_backup_filtered + sudo crontab cron_backup_filtered echo "Cron jobs containing 'app_start.sh' have been removed." fi @@ -110,7 +95,7 @@ echo "Adding app_start.sh reboot cron job..." sudo chmod +x ./cicd-scripts/app_start.sh # Define the new cron job -new_cron="@reboot at now + 1 min -f ${pwd}/cicd-scripts/app_start.sh" +new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" # Add the new cron job to the crontab if it's not already present (crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh old mode 100644 new mode 100755 index 7319165..4121067 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -2,12 +2,12 @@ SCRAPYD_URL="http://127.0.0.1:6800/" SCRAPYDWEB_URL="http://127.0.0.1:5000/" -CICD_SCRIPTS_BASE_DIR=$(pwd) +SPIDER_URLS_API=https://staging.search.usa.gov/urls # Function to check if a URL is up and running function check_url() { local URL=$1 - local MAX_ATTEMPTS=3 + local MAX_ATTEMPTS="${2:-3}" local DELAY=5 local attempt=1 @@ -26,18 +26,6 @@ function check_url() { return 1 } -# Function to combine current directory with subdirectory and return absolute path -function get_abs_path() { - local base_dir="$CICD_SCRIPTS_BASE_DIR" - local sub_dir="$1" - - if [[ "$sub_dir" == /* ]]; then - echo "$sub_dir" - else - echo "$base_dir/$sub_dir" - fi -} - # Function to check if required command exists function check_command() { if ! 
command -v "$1" &> /dev/null; then @@ -50,27 +38,31 @@ check_command "scrapyd" check_command "scrapydweb" check_command "curl" -scrapyd_dir=$(get_abs_path "../") -scrapydweb_dir=$(get_abs_path "../search_gov_crawler") - echo "Killing any existing scrapyd and scrapydweb services" sudo pkill -f "scrapydweb" 2>/dev/null sudo pkill -f "scrapyd" 2>/dev/null +# Check search-gov /urls endpoint +echo "Checking search-gov /urls api..." +if check_url "$SPIDER_URLS_API"; then + echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" +else + echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" + exit 1 +fi + echo "Running searchgov-spider application..." # Start scrapyd echo "Starting scrapyd service..." -cd "$scrapyd_dir" -sudo nohup scrapyd > /var/log/scrapyd.log 2>&1 & +sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' PID1=$! echo "Started scrapyd with PID $PID1" # Check if scrapyd is running if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" - cd "$scrapydweb_dir" - sudo nohup scrapydweb > /var/log/scrapydweb.log 2>&1 & + sudo bash -c 'nohup cd ./search_gov_crawler && scrapydweb > /var/log/scrapydweb.log 2>&1 &' PID2=$! echo "Started scrapydweb with PID $PID2" @@ -85,11 +77,6 @@ else exit 1 fi -# Add startup cron for this script: -echo " -export LATEST_SPIDER_CICD_DEPLOY_PATH=$(CICD_SCRIPTS_BASE_DIR) -" | tee '/etc/profile.d/spider_env.sh' > /dev/null - # Display the last few lines of logs echo -e "\n-- Last 10 lines of scrapyd.log:\n" tail -n 10 /var/log/scrapyd.log diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh old mode 100644 new mode 100755 From 1453d080a850b30bc2f6c4d6582aea97a5d79211 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 09:11:27 -0400 Subject: [PATCH 14/40] merged all orphan branches --- cicd-scripts/app_start.sh | 2 +- .../search_gov_spiders/pipelines.py | 90 +++++++++++++------ 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 4121067..5fec3ce 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -62,7 +62,7 @@ echo "Started scrapyd with PID $PID1" # Check if scrapyd is running if check_url "$SCRAPYD_URL"; then echo "The scrapyd service is running at $SCRAPYD_URL" - sudo bash -c 'nohup cd ./search_gov_crawler && scrapydweb > /var/log/scrapydweb.log 2>&1 &' + sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' PID2=$! echo "Started scrapydweb with PID $PID2" diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index b2b0e54..64099b0 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -4,6 +4,7 @@ """ import os +import requests from pathlib import Path from scrapy.exceptions import DropItem @@ -11,40 +12,79 @@ class SearchGovSpidersPipeline: """ - Class for pipeline that takes items and adds them - to output file with a max size of 3.9MB + Pipeline that either writes items to an output file with a max size of 3.9MB + or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once + the size limit is reached. 
""" + MAX_FILE_SIZE_MB = 3.9 # max size in MB + MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes + def __init__(self, *_args, **_kwargs): - self.current_file_size = 0 - self.file_number = 1 - self.parent_file_path = Path(__file__).parent.parent.resolve() - self.base_path_name = str(self.parent_file_path / "output/all-links.csv") - self.short_file = open(self.base_path_name, "w", encoding="utf-8") - self.max_file_size = 3900 - self.paginate = True + self.api_url = os.environ.get("SPIDER_URLS_API") + if not self.api_url: + self.file_number = 1 + self.parent_file_path = Path(__file__).parent.parent.resolve() + self.base_file_name = self.parent_file_path / "output" / "all-links.csv" + self.file_path = self.base_file_name + self.current_file = open(self.file_path, "w", encoding="utf-8") + else: + self.urls_batch = [] def process_item(self, item, _spider): - """Checks that the file is not at max size. - Adds it to the file if less, or creates a new file if too large.""" + """Process item either by writing to file or by posting to API.""" + + line = item.get("url", "") + "\n" + line_size = len(line.encode('utf-8')) - line = item["url"] - self.current_file_size += 1 - next_file_size = self.current_file_size + len(line) - if self.paginate and next_file_size > self.max_file_size: - self.short_file.close() - new_name = str(self.parent_file_path / f"output/all-links{self.file_number}.csv") - os.rename(self.base_path_name, new_name) - self.file_number = self.file_number + 1 - self.short_file = open(self.base_path_name, "w", encoding="utf-8") - self.current_file_size = 0 - - self.short_file.write(line) - self.short_file.write("\n") - self.current_file_size = self.current_file_size + len(line) + # If API URL is set, batch URLs and send a POST request when max size is reached + if self.api_url: + self.urls_batch.append(item.get("url", "")) + if self._is_batch_too_large(line_size): + self._post_urls() + # Otherwise, write to file and rotate if needed + else: + self.current_file.write(line) + if self._is_file_too_large(line_size): + self._rotate_file() return item + def _is_batch_too_large(self, new_entry_size): + current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _is_file_too_large(self, new_entry_size): + self.current_file.flush() + current_file_size = self.file_path.stat().st_size + return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _rotate_file(self): + """Close current file, rename it, and open a new one for continued writing.""" + self.current_file.close() + new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" + os.rename(self.file_path, new_file_path) + self.file_number += 1 + self.current_file = open(self.file_path, "w", encoding="utf-8") + + def _post_urls(self): + """Send a POST request with the batch of URLs if any exist.""" + if self.urls_batch: + try: + response = requests.post(self.api_url, json={"urls": self.urls_batch}) + response.raise_for_status() + print(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + except requests.exceptions.RequestException as e: + print(f"Failed to send URLs to {self.api_url}: {e}") + finally: + self.urls_batch.clear() + + def close_spider(self, _spider): + """Close the file or send remaining URLs if needed when the spider finishes.""" + if not self.api_url and self.current_file: + self.current_file.close() + elif self.api_url: + self._post_urls() # 
Send any remaining URLs on spider close class DeDeuplicatorPipeline: """Class for pipeline that removes duplicate items""" From 8911dc57c570961680129d2f676f52300cbb5eeb Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 13:46:06 -0400 Subject: [PATCH 15/40] code review feedback --- search_gov_crawler/search_gov_spiders/pipelines.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index 64099b0..001080c 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -31,7 +31,7 @@ def __init__(self, *_args, **_kwargs): else: self.urls_batch = [] - def process_item(self, item, _spider): + def process_item(self, item, spider): """Process item either by writing to file or by posting to API.""" line = item.get("url", "") + "\n" @@ -41,7 +41,7 @@ def process_item(self, item, _spider): if self.api_url: self.urls_batch.append(item.get("url", "")) if self._is_batch_too_large(line_size): - self._post_urls() + self._post_urls(spider) # Otherwise, write to file and rotate if needed else: self.current_file.write(line) @@ -67,15 +67,15 @@ def _rotate_file(self): self.file_number += 1 self.current_file = open(self.file_path, "w", encoding="utf-8") - def _post_urls(self): + def _post_urls(self, spider): """Send a POST request with the batch of URLs if any exist.""" if self.urls_batch: try: response = requests.post(self.api_url, json={"urls": self.urls_batch}) response.raise_for_status() - print(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") except requests.exceptions.RequestException as e: - print(f"Failed to send URLs to {self.api_url}: {e}") + raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") finally: self.urls_batch.clear() From 2209f41d87493eeed00133847c2b66cd52c1e3f3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 8 Oct 2024 13:51:35 -0400 Subject: [PATCH 16/40] removed virtualenv install --- cicd-scripts/app_install.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index c81f3e9..0e9dc63 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -53,9 +53,8 @@ else install_python fi -# Install virtualenv using Python pip +# Use venv with Python 3.12 sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install virtualenv # Create a virtual environment using Python echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." 
From b855df3a697c00068f213ae171fa8af4db3201e2 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 16 Oct 2024 12:27:34 -0400 Subject: [PATCH 17/40] Added unit tests --- .../test_urls_files_size.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/search_gov_spiders/test_urls_files_size.py diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py new file mode 100644 index 0000000..4e9da22 --- /dev/null +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -0,0 +1,184 @@ +import os +import pytest +from unittest.mock import MagicMock +from scrapy import Item +# from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline + +import os +import requests +from pathlib import Path + +from scrapy.exceptions import DropItem + + +class SearchGovSpidersPipeline: + """ + Pipeline that either writes items to an output file with a max size of 3.9MB + or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once + the size limit is reached. + """ + + MAX_FILE_SIZE_MB = 3.9 # max size in MB + MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes + + def __init__(self, *_args, **_kwargs): + self.api_url = os.environ.get("SPIDER_URLS_API") + if not self.api_url: + self.file_number = 1 + self.parent_file_path = Path(__file__).parent.parent.resolve() + self.base_file_name = self.parent_file_path / "output" / "all-links.csv" + self.file_path = self.base_file_name + self.current_file = open(self.file_path, "w", encoding="utf-8") + else: + self.urls_batch = [] + + def process_item(self, item, spider): + """Process item either by writing to file or by posting to API.""" + + line = item.get("url", "") + "\n" + line_size = len(line.encode('utf-8')) + + # If API URL is set, batch URLs and send a POST request when max size is reached + if self.api_url: + self.urls_batch.append(item.get("url", "")) + if self._is_batch_too_large(line_size): + self._post_urls(spider) + # Otherwise, write to file and rotate if needed + else: + self.current_file.write(line) + if self._is_file_too_large(line_size): + self._rotate_file() + + return item + + def _is_batch_too_large(self, new_entry_size): + current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _is_file_too_large(self, new_entry_size): + self.current_file.flush() + current_file_size = self.file_path.stat().st_size + return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES + + def _rotate_file(self): + """Close current file, rename it, and open a new one for continued writing.""" + self.current_file.close() + new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" + os.rename(self.file_path, new_file_path) + self.file_number += 1 + self.current_file = open(self.file_path, "w", encoding="utf-8") + + def _post_urls(self, spider): + """Send a POST request with the batch of URLs if any exist.""" + if self.urls_batch: + try: + response = requests.post(self.api_url, json={"urls": self.urls_batch}) + response.raise_for_status() + spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") + except requests.exceptions.RequestException as e: + raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") + finally: + self.urls_batch.clear() + + def close_spider(self, _spider): + """Close the file or send remaining URLs if needed when the spider 
finishes.""" + if not self.api_url and self.current_file: + self.current_file.close() + elif self.api_url: + self._post_urls() # Send any remaining URLs on spider close + +class DeDeuplicatorPipeline: + """Class for pipeline that removes duplicate items""" + + itemlist = [] + + def process_item(self, item, _spider): + """Checks that the file is not at max size. + Adds it to the file if less, or creates a new file if too large.""" + if item in self.itemlist: + raise DropItem("already in list") + self.itemlist.append(item) + + return item + + +@pytest.fixture +def sample_item(): + """Fixture for a sample item with a URL.""" + item = Item() + item['url'] = "http://example.com" + return item + +@pytest.fixture +def pipeline_no_api(mocker): + """Fixture for pipeline with no API URL set.""" + mocker.patch('os.getenv', return_value=None) + return SearchGovSpidersPipeline() + +@pytest.fixture +def pipeline_with_api(mocker): + """Fixture for pipeline with an API URL set.""" + mocker.patch('os.getenv', return_value="http://mockapi.com") + return SearchGovSpidersPipeline() + +def test_write_to_file(pipeline_no_api, sample_item, mocker): + """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mock_open = mocker.patch('open', mocker.mock_open()) + + pipeline_no_api.process_item(sample_item, None) + + # Ensure file is opened and written to + mock_open.assert_called_once_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open().write.assert_any_call(sample_item['url'] + "\n") + +def test_post_to_api(pipeline_with_api, sample_item, mocker): + """Test that URLs are batched and sent via POST when SPIDER_URLS_API is set.""" + mock_post = mocker.patch('requests.post') + + pipeline_with_api.process_item(sample_item, None) + + # Check that the batch contains the URL + assert sample_item['url'] in pipeline_with_api.urls_batch + + # Simulate max size to force post + mocker.patch.object(SearchGovSpidersPipeline, '_is_batch_too_large', return_value=True) + pipeline_with_api.process_item(sample_item, None) + + # Ensure POST request was made + mock_post.assert_called_once_with("http://mockapi.com", json={"urls": pipeline_with_api.urls_batch}) + +def test_rotate_file(pipeline_no_api, sample_item, mocker): + """Test that file rotation occurs when max size is exceeded.""" + mock_open = mocker.patch('open', mocker.mock_open()) + mock_rename = mocker.patch('os.rename') + + mocker.patch.object(SearchGovSpidersPipeline, '_is_file_too_large', return_value=True) + pipeline_no_api.process_item(sample_item, None) + + # Check if the file was rotated + mock_open.assert_called_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open().close.assert_called() + mock_rename.assert_called_once_with( + pipeline_no_api.file_path, + pipeline_no_api.parent_file_path / "output/all-links-1.csv" + ) + +def test_post_urls_on_spider_close(pipeline_with_api, mocker): + """Test that remaining URLs are posted when spider closes and SPIDER_URLS_API is set.""" + mock_post = mocker.patch('requests.post') + + pipeline_with_api.urls_batch = ["http://example.com"] + + pipeline_with_api.close_spider(None) + + # Ensure POST request was made on spider close + mock_post.assert_called_once_with("http://mockapi.com", json={"urls": ["http://example.com"]}) + +def test_close_file_on_spider_close(pipeline_no_api, mocker): + """Test that the file is closed when the spider closes and no SPIDER_URLS_API is set.""" + mock_open = mocker.patch('open', mocker.mock_open()) + + pipeline_no_api.close_spider(None) + + # 
Ensure the file is closed + mock_open().close.assert_called_once() From 5aaf9f77b453c74edaeda6729d8231907cc54339 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 16 Oct 2024 12:47:03 -0400 Subject: [PATCH 18/40] ... --- .../test_urls_files_size.py | 100 +----------------- 1 file changed, 1 insertion(+), 99 deletions(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 4e9da22..f0d5b30 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -2,105 +2,7 @@ import pytest from unittest.mock import MagicMock from scrapy import Item -# from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline - -import os -import requests -from pathlib import Path - -from scrapy.exceptions import DropItem - - -class SearchGovSpidersPipeline: - """ - Pipeline that either writes items to an output file with a max size of 3.9MB - or, if SPIDER_URLS_API is set, sends a POST request with a list of URLs once - the size limit is reached. - """ - - MAX_FILE_SIZE_MB = 3.9 # max size in MB - MAX_FILE_SIZE_BYTES = int(MAX_FILE_SIZE_MB * 1024 * 1024) # convert to bytes - - def __init__(self, *_args, **_kwargs): - self.api_url = os.environ.get("SPIDER_URLS_API") - if not self.api_url: - self.file_number = 1 - self.parent_file_path = Path(__file__).parent.parent.resolve() - self.base_file_name = self.parent_file_path / "output" / "all-links.csv" - self.file_path = self.base_file_name - self.current_file = open(self.file_path, "w", encoding="utf-8") - else: - self.urls_batch = [] - - def process_item(self, item, spider): - """Process item either by writing to file or by posting to API.""" - - line = item.get("url", "") + "\n" - line_size = len(line.encode('utf-8')) - - # If API URL is set, batch URLs and send a POST request when max size is reached - if self.api_url: - self.urls_batch.append(item.get("url", "")) - if self._is_batch_too_large(line_size): - self._post_urls(spider) - # Otherwise, write to file and rotate if needed - else: - self.current_file.write(line) - if self._is_file_too_large(line_size): - self._rotate_file() - - return item - - def _is_batch_too_large(self, new_entry_size): - current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) - return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES - - def _is_file_too_large(self, new_entry_size): - self.current_file.flush() - current_file_size = self.file_path.stat().st_size - return (current_file_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES - - def _rotate_file(self): - """Close current file, rename it, and open a new one for continued writing.""" - self.current_file.close() - new_file_path = self.parent_file_path / f"output/all-links-{self.file_number}.csv" - os.rename(self.file_path, new_file_path) - self.file_number += 1 - self.current_file = open(self.file_path, "w", encoding="utf-8") - - def _post_urls(self, spider): - """Send a POST request with the batch of URLs if any exist.""" - if self.urls_batch: - try: - response = requests.post(self.api_url, json={"urls": self.urls_batch}) - response.raise_for_status() - spider.logger.info(f"Successfully posted {len(self.urls_batch)} URLs to {self.api_url}.") - except requests.exceptions.RequestException as e: - raise SystemExit(f"Failed to send URLs to {self.api_url}: {e}") - finally: - self.urls_batch.clear() - - def close_spider(self, _spider): - """Close the file or send remaining URLs if needed when the 
spider finishes.""" - if not self.api_url and self.current_file: - self.current_file.close() - elif self.api_url: - self._post_urls() # Send any remaining URLs on spider close - -class DeDeuplicatorPipeline: - """Class for pipeline that removes duplicate items""" - - itemlist = [] - - def process_item(self, item, _spider): - """Checks that the file is not at max size. - Adds it to the file if less, or creates a new file if too large.""" - if item in self.itemlist: - raise DropItem("already in list") - self.itemlist.append(item) - - return item - +from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline @pytest.fixture def sample_item(): From f01db66e1c7a7e1d05ba6b93aeffccaa87a569f5 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Mon, 2 Dec 2024 04:50:10 -0500 Subject: [PATCH 19/40] resolved conflict --- cicd-scripts/app_install.sh | 23 ++++++ cicd-scripts/app_start.sh | 86 ++--------------------- cicd-scripts/app_stop.sh | 34 +++++++++ cicd-scripts/helpers/check_cloudwatch.sh | 32 +++++++++ cicd-scripts/helpers/check_codedeploy.sh | 32 +++++++++ cicd-scripts/helpers/kill_scheduler.sh | 25 +++++++ cicd-scripts/helpers/run_with_ui.sh | 86 +++++++++++++++++++++++ cicd-scripts/helpers/run_without_ui.sh | 3 + cicd-scripts/helpers/update_pythonpath.sh | 34 +++++++++ 9 files changed, 274 insertions(+), 81 deletions(-) create mode 100644 cicd-scripts/helpers/check_cloudwatch.sh create mode 100644 cicd-scripts/helpers/check_codedeploy.sh create mode 100644 cicd-scripts/helpers/kill_scheduler.sh create mode 100755 cicd-scripts/helpers/run_with_ui.sh create mode 100644 cicd-scripts/helpers/run_without_ui.sh create mode 100644 cicd-scripts/helpers/update_pythonpath.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 0e9dc63..0b45cc9 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,10 @@ #!/bin/bash +# Kill all spider services (if running) +echo "Running app_stop.sh" +sudo chmod +x ./cicd-scripts/app_stop.sh +source ./cicd-scripts/app_stop.sh + # CICD scripts can only runas 'search' user on AWS if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" @@ -8,8 +13,20 @@ else exit 1 fi +# Start AWS CloudWatch agent +sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh +source ./cicd-scripts/helpers/check_cloudwatch.sh + +# Start AWS CodeDeploy agent +sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh +source ./cicd-scripts/helpers/check_codedeploy.sh + +# PUBLIC SPIDER_PYTHON_VERSION=3.12 +# PRIVATE +_CURRENT_BUILD_DIR=${PWD} + # Update and upgrade the system without prompting for confirmation sudo apt-get update -y sudo apt-get upgrade -y @@ -42,6 +59,9 @@ install_python() { sudo ./configure --enable-optimizations sudo make altinstall + # Return to the build directory + cd $_CURRENT_BUILD_DIR + echo "Python ${SPIDER_PYTHON_VERSION} has been installed." 
} @@ -53,6 +73,9 @@ else install_python fi +# Set PYTHONPATH env +source ./cicd-scripts/helpers/update_pythonpath.sh + # Use venv with Python 3.12 sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5fec3ce..3610c15 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,86 +1,10 @@ #!/bin/bash -SCRAPYD_URL="http://127.0.0.1:6800/" -SCRAPYDWEB_URL="http://127.0.0.1:5000/" -SPIDER_URLS_API=https://staging.search.usa.gov/urls +# PUBLIC +RUN_WITH_UI=true -# Function to check if a URL is up and running -function check_url() { - local URL=$1 - local MAX_ATTEMPTS="${2:-3}" - local DELAY=5 - local attempt=1 - - while [ $attempt -le $MAX_ATTEMPTS ]; do - if curl --output /dev/null --silent --head --fail "$URL"; then - echo "Service at $URL is up on attempt $attempt." - return 0 - else - echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." - fi - attempt=$((attempt+1)) - sleep $DELAY - done - - echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." - return 1 -} - -# Function to check if required command exists -function check_command() { - if ! command -v "$1" &> /dev/null; then - echo "Error: $1 is not installed or not in your PATH." - exit 1 - fi -} - -check_command "scrapyd" -check_command "scrapydweb" -check_command "curl" - -echo "Killing any existing scrapyd and scrapydweb services" -sudo pkill -f "scrapydweb" 2>/dev/null -sudo pkill -f "scrapyd" 2>/dev/null - -# Check search-gov /urls endpoint -echo "Checking search-gov /urls api..." -if check_url "$SPIDER_URLS_API"; then - echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" -else - echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" - exit 1 -fi - -echo "Running searchgov-spider application..." - -# Start scrapyd -echo "Starting scrapyd service..." -sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' -PID1=$! -echo "Started scrapyd with PID $PID1" - -# Check if scrapyd is running -if check_url "$SCRAPYD_URL"; then - echo "The scrapyd service is running at $SCRAPYD_URL" - sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' - PID2=$! - echo "Started scrapydweb with PID $PID2" - - if check_url "$SCRAPYDWEB_URL"; then - echo "The scrapydweb service is running at $SCRAPYDWEB_URL" - else - echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." - exit 1 - fi +if $RUN_WITH_UI ; then + source ./cicd-scripts/helpers/run_with_ui.sh else - echo "Error: scrapyd failed at $SCRAPYD_URL." - exit 1 + source ./cicd-scripts/helpers/run_without_ui.sh fi - -# Display the last few lines of logs -echo -e "\n-- Last 10 lines of scrapyd.log:\n" -tail -n 10 /var/log/scrapyd.log - -echo -e "\n-- Last 10 lines of scrapydweb.log:\n" -tail -n 10 /var/log/scrapydweb.log -exit 0 diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 7b1e941..4844b25 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,5 +1,14 @@ #!/bin/bash +# Clear all cache +echo "Purge all pip cache..." +sudo pip cache purge + +# Kill scrapy schedular (if running): +echo "Stopping scrapy_scheduler.py (if running)" +sudo chmod +x ./cicd-scripts/helpers/kill_scheduler.sh +source ./cicd-scripts/helpers/kill_scheduler.sh + echo "Stopping all scrapyd and scrapydweb tasks..." 
# Kill all scrapydweb and scrapyd jobs if sudo pkill -f "scrapydweb" 2>/dev/null; then @@ -20,3 +29,28 @@ ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." # Force kill any remaning scrapy background jobs still running sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 + +# Kill all nohup jobs (runs with python) +ps -ef | grep nohup | grep -v grep | awk '{print $2}' + +# Remove other deploy cron jobs: +#!/bin/bash + +# Function to remove crontab entries referencing a given cron entry string +remove_cron_entry() { + if [ -z "$1" ]; then + echo "Error: No cron entry provided." + exit 1 + fi + + CRON_ENTRY="$1" + + # Remove entries referencing the script + sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab - + + echo "Removed any crontab entries referencing $CRON_ENTRY." +} + +# Remove any other cron job entries +remove_cron_entry "check_cloudwatch.sh" +remove_cron_entry "check_codedeploy.sh" diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh new file mode 100644 index 0000000..18e4870 --- /dev/null +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CloudWatch agent is running +check_cloudwatch() { + if ! pgrep -f amazon-cloudwatch-agent > /dev/null; then + echo "AWS CloudWatch agent is not running. Starting it now..." + sudo service amazon-cloudwatch-agent start + if [ $? -eq 0 ]; then + echo "AWS CloudWatch agent started successfully." + else + echo "Failed to start AWS CloudWatch agent." + fi + else + echo "AWS CloudWatch agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh + CRON_ENTRY="@reboot /bin/bash $PWD/cicd-scripts/helpers/check_cloudwatch.sh" + + # Update crontab, ensuring no duplicates + (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + echo "Crontab entry added to ensure the script runs on reboot." +} + +# Execute the function +check_cloudwatch + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh new file mode 100644 index 0000000..98731dc --- /dev/null +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Function to check if CodeDeploy agent is running +check_codedeploy() { + if ! pgrep -f codedeploy-agent > /dev/null; then + echo "AWS CodeDeploy agent is not running. Starting it now..." + sudo service codedeploy-agent start + if [ $? -eq 0 ]; then + echo "AWS CodeDeploy agent started successfully." + else + echo "Failed to start AWS CodeDeploy agent." + fi + else + echo "AWS CodeDeploy agent is running." + fi +} + +# Ensure the script is added to crontab for execution on reboot +setup_cron() { + sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh + CRON_ENTRY="@reboot /bin/bash $PWD/helpers/check_codedeploy.sh" + + # Update crontab, ensuring no duplicates + (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + echo "Crontab entry added to ensure the script runs on reboot." 
+} + +# Execute the function +check_codedeploy + +# Add to crontab +setup_cron diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh new file mode 100644 index 0000000..ddc115b --- /dev/null +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Find the process ID of the running scrapy_scheduler.py script +echo "Searching for scrapy_scheduler.py process..." +PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") + +# Check if the process exists +if [ -z "$PROCESS_ID" ]; then + echo "No running process found for scrapy_scheduler.py." + exit 0 +fi + +# Kill the process +echo "Killing process with PID: $PROCESS_ID" +kill "$PROCESS_ID" + +sleep 3 + +# Verify if the process was killed +if [ $? -eq 0 ]; then + echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." +else + echo "Failed to terminate the process. Please check manually." + exit 1 +fi diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh new file mode 100755 index 0000000..5fec3ce --- /dev/null +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +SCRAPYD_URL="http://127.0.0.1:6800/" +SCRAPYDWEB_URL="http://127.0.0.1:5000/" +SPIDER_URLS_API=https://staging.search.usa.gov/urls + +# Function to check if a URL is up and running +function check_url() { + local URL=$1 + local MAX_ATTEMPTS="${2:-3}" + local DELAY=5 + local attempt=1 + + while [ $attempt -le $MAX_ATTEMPTS ]; do + if curl --output /dev/null --silent --head --fail "$URL"; then + echo "Service at $URL is up on attempt $attempt." + return 0 + else + echo "Attempt $attempt: Service at $URL is not available, retrying in $DELAY seconds..." + fi + attempt=$((attempt+1)) + sleep $DELAY + done + + echo "Service at $URL is still not available after $MAX_ATTEMPTS attempts." + return 1 +} + +# Function to check if required command exists +function check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 is not installed or not in your PATH." + exit 1 + fi +} + +check_command "scrapyd" +check_command "scrapydweb" +check_command "curl" + +echo "Killing any existing scrapyd and scrapydweb services" +sudo pkill -f "scrapydweb" 2>/dev/null +sudo pkill -f "scrapyd" 2>/dev/null + +# Check search-gov /urls endpoint +echo "Checking search-gov /urls api..." +if check_url "$SPIDER_URLS_API"; then + echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" +else + echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" + exit 1 +fi + +echo "Running searchgov-spider application..." + +# Start scrapyd +echo "Starting scrapyd service..." +sudo bash -c 'nohup scrapyd > /var/log/scrapyd.log 2>&1 &' +PID1=$! +echo "Started scrapyd with PID $PID1" + +# Check if scrapyd is running +if check_url "$SCRAPYD_URL"; then + echo "The scrapyd service is running at $SCRAPYD_URL" + sudo bash -c 'cd ./search_gov_crawler && nohup scrapydweb > /var/log/scrapydweb.log 2>&1 &' + PID2=$! + echo "Started scrapydweb with PID $PID2" + + if check_url "$SCRAPYDWEB_URL"; then + echo "The scrapydweb service is running at $SCRAPYDWEB_URL" + else + echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." + exit 1 + fi +else + echo "Error: scrapyd failed at $SCRAPYD_URL." 
+ exit 1 +fi + +# Display the last few lines of logs +echo -e "\n-- Last 10 lines of scrapyd.log:\n" +tail -n 10 /var/log/scrapyd.log + +echo -e "\n-- Last 10 lines of scrapydweb.log:\n" +tail -n 10 /var/log/scrapydweb.log +exit 0 diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh new file mode 100644 index 0000000..cabeb3d --- /dev/null +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo bash -c 'nohup ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &' diff --git a/cicd-scripts/helpers/update_pythonpath.sh b/cicd-scripts/helpers/update_pythonpath.sh new file mode 100644 index 0000000..e742b55 --- /dev/null +++ b/cicd-scripts/helpers/update_pythonpath.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Define the current directory +CURRENT_DIR=$(pwd) + +# Define the .bashrc file location +BASHRC_FILE="$HOME/.bashrc" + +# Check if .bashrc contains an export PYTHONPATH line +if grep -q "^export PYTHONPATH=" "$BASHRC_FILE"; then + # Extract the existing PYTHONPATH line + PYTHONPATH_LINE=$(grep "^export PYTHONPATH=" "$BASHRC_FILE") + + # Check if the current directory is already included + if echo "$PYTHONPATH_LINE" | grep -q "$CURRENT_DIR"; then + echo "PYTHONPATH already includes the current directory: $CURRENT_DIR" + else + # Ensure the updated line includes the starting and ending quotes + CURRENT_PATHS=$(echo "$PYTHONPATH_LINE" | sed -e 's/^export PYTHONPATH=//' -e 's/^"//' -e 's/"$//') + UPDATED_LINE="export PYTHONPATH=\"${CURRENT_PATHS}:${CURRENT_DIR}\"" + sed -i "s|^export PYTHONPATH=.*|$UPDATED_LINE|" "$BASHRC_FILE" + echo "Updated PYTHONPATH to include the current directory: $CURRENT_DIR" + fi +else + # Add a new export PYTHONPATH line to .bashrc + echo "export PYTHONPATH=\"\$PYTHONPATH:${CURRENT_DIR}\"" >> "$BASHRC_FILE" + echo "Added new PYTHONPATH to .bashrc including the current directory: $CURRENT_DIR" +fi + +# Apply changes for the current session +export PYTHONPATH=\"${CURRENT_PATHS//"\$PYTHONPATH"/}:${CURRENT_DIR}\" + +echo "PYTHONPATH changes applied:" +echo $PYTHONPATH From 54355ed7be938d48fa74bdd69382816515963b60 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 11:54:40 -0500 Subject: [PATCH 20/40] Addressed code review --- cicd-scripts/app_install.sh | 10 ++++++++-- cicd-scripts/app_stop.sh | 4 +++- cicd-scripts/helpers/run_with_ui.sh | 9 --------- tests/search_gov_spiders/test_full_crawl.py | 17 ++++++++++++----- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 0b45cc9..6622445 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -13,6 +13,11 @@ else exit 1 fi +# Get missing packages +sudo apt-get install lzma +sudo apt-get install liblzma-dev +yes | sudo apt-get install libbz2-dev + # Start AWS CloudWatch agent sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh source ./cicd-scripts/helpers/check_cloudwatch.sh @@ -88,8 +93,9 @@ source ./venv/bin/activate # Install all spider dependencies echo "Installing dependencies..." -pip install --upgrade pip -sudo pip install --upgrade --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install --force-reinstall -r ./search_gov_crawler/requirements.txt +sudo pip install pytest-playwright playwright -U +playwright install echo "Dependencies installed." 
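A minimal post-install smoke test for the dependency steps above could look like the following (sketch only; it assumes the ./venv and ./search_gov_crawler/requirements.txt layout used by app_install.sh and checks imports rather than relying on any particular CLI flags):

  # Activate the venv created by app_install.sh
  source ./venv/bin/activate
  # Scrapy should import and report the pinned version
  python -c "import scrapy; print(scrapy.__version__)"
  # The Playwright Python package should import; browser binaries come from `playwright install`
  python -c "from playwright.sync_api import sync_playwright; print('playwright import ok')"
  deactivate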
diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 4844b25..0e39875 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -2,7 +2,9 @@ # Clear all cache echo "Purge all pip cache..." -sudo pip cache purge +# We can't do `$pip cache purge`, this does the samething: +sudo rm -r ~/.cache/pip +sudo rm -rf /root/.cache/pip # Kill scrapy schedular (if running): echo "Stopping scrapy_scheduler.py (if running)" diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh index 5fec3ce..ed2f400 100755 --- a/cicd-scripts/helpers/run_with_ui.sh +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -42,15 +42,6 @@ echo "Killing any existing scrapyd and scrapydweb services" sudo pkill -f "scrapydweb" 2>/dev/null sudo pkill -f "scrapyd" 2>/dev/null -# Check search-gov /urls endpoint -echo "Checking search-gov /urls api..." -if check_url "$SPIDER_URLS_API"; then - echo "The /urls api is up and running at endpoint: $SPIDER_URLS_API" -else - echo "Error: /urls failed failed at endpoint: $SPIDER_URLS_API" - exit 1 -fi - echo "Running searchgov-spider application..." # Start scrapyd diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py index 3992263..1bfd892 100644 --- a/tests/search_gov_spiders/test_full_crawl.py +++ b/tests/search_gov_spiders/test_full_crawl.py @@ -94,13 +94,20 @@ def test_full_crawl(mock_scrapy_settings, monkeypatch, spider, use_dedup, crawl_ temp_dir.joinpath("output").mkdir(exist_ok=True) def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): - pipeline_cls.current_file_size = 0 + # pipeline_cls.current_file_size = 0 + # pipeline_cls.file_number = 1 + # pipeline_cls.parent_file_path = temp_dir + # pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") + # pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") + # pipeline_cls.max_file_size = 3900 + # pipeline_cls.paginate = True + + pipeline_cls.api_url = None pipeline_cls.file_number = 1 pipeline_cls.parent_file_path = temp_dir - pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") - pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") - pipeline_cls.max_file_size = 3900 - pipeline_cls.paginate = True + pipeline_cls.base_file_name = temp_dir / "output" / "all-links.csv" + pipeline_cls.file_path = pipeline_cls.base_file_name + pipeline_cls.current_file = open(pipeline_cls.file_path, "w", encoding="utf-8") monkeypatch.setattr( "search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.__init__", mock_init From a43688f4b75b11ea3d50f997b8202c653fbf571a Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:40:36 -0500 Subject: [PATCH 21/40] Add pytest-mock dependency --- .github/dependabot.yml | 2 +- search_gov_crawler/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 37e7fcf..5674693 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -14,4 +14,4 @@ updates: patterns: - "freezegun" - "pylint" - - "pytest" + - "pytest*" diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt index bf1dafb..8941c1e 100644 --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -2,6 +2,7 @@ freezegun==1.5.1 pylint==3.3.1 pytest==8.3.3 pytest-console-scripts==1.4.1 
+pytest-mock==3.14.0 # Install from github due to unmaintained project using pypi https://github.com/nhairs/python-json-logger/issues/1 python-json-logger @ https://github.com/nhairs/python-json-logger/releases/download/v3.1.0/python_json_logger-3.1.0-py3-none-any.whl @@ -11,4 +12,3 @@ scrapyd==1.5.0 scrapyd-client==2.0.0 scrapydweb @ git+https://github.com/GSA/searchgov-scrapydweb spidermon [monitoring] == 1.22.0 - From 6be4bb6e893549211141db3700f8d86373daa853 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:42:04 -0500 Subject: [PATCH 22/40] update file pipeline and tests --- .../search_gov_spiders/pipelines.py | 11 ++- .../test_urls_files_size.py | 96 +++++++++++-------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py index 18f3664..6849fc5 100644 --- a/search_gov_crawler/search_gov_spiders/pipelines.py +++ b/search_gov_crawler/search_gov_spiders/pipelines.py @@ -4,9 +4,9 @@ """ import os -import requests from pathlib import Path +import requests from scrapy.exceptions import DropItem @@ -35,7 +35,7 @@ def process_item(self, item, spider): """Process item either by writing to file or by posting to API.""" line = item.get("url", "") + "\n" - line_size = len(line.encode('utf-8')) + line_size = len(line.encode("utf-8")) # If API URL is set, batch URLs and send a POST request when max size is reached if self.api_url: @@ -51,7 +51,7 @@ def process_item(self, item, spider): return item def _is_batch_too_large(self, new_entry_size): - current_batch_size = sum(len(url.encode('utf-8')) for url in self.urls_batch) + current_batch_size = sum(len(url.encode("utf-8")) for url in self.urls_batch) return (current_batch_size + new_entry_size) > self.MAX_FILE_SIZE_BYTES def _is_file_too_large(self, new_entry_size): @@ -79,12 +79,13 @@ def _post_urls(self, spider): finally: self.urls_batch.clear() - def close_spider(self, _spider): + def close_spider(self, spider): """Close the file or send remaining URLs if needed when the spider finishes.""" if not self.api_url and self.current_file: self.current_file.close() elif self.api_url: - self._post_urls() # Send any remaining URLs on spider close + self._post_urls(spider) # Send any remaining URLs on spider close + class DeDeuplicatorPipeline: """Class for pipeline that removes duplicate items""" diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index f0d5b30..7735711 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,84 +1,102 @@ import os + import pytest -from unittest.mock import MagicMock -from scrapy import Item + +from scrapy import Spider +from scrapy.utils.test import get_crawler +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline -@pytest.fixture -def sample_item(): + +@pytest.fixture(name="sample_spider") +def fixture_sample_spider(): + crawler = get_crawler(Spider) + return crawler._create_spider( + name="urls_test", allowed_domains="example.com", allowed_domain_paths="https://www.example.com" + ) + + +@pytest.fixture(name="sample_item") +def fixture_sample_item() -> SearchGovSpidersItem: """Fixture for a sample item with a URL.""" - item = Item() - item['url'] = "http://example.com" + item = SearchGovSpidersItem() + 
item["url"] = "http://example.com" return item -@pytest.fixture -def pipeline_no_api(mocker): - """Fixture for pipeline with no API URL set.""" - mocker.patch('os.getenv', return_value=None) + +@pytest.fixture(name="mock_open") +def fixture_mock_open(mocker): + return mocker.patch("builtins.open", mocker.mock_open()) + + +@pytest.fixture(name="pipeline_no_api") +def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: + mocker.patch.dict(os.environ, {}) return SearchGovSpidersPipeline() -@pytest.fixture -def pipeline_with_api(mocker): + +@pytest.fixture(name="pipeline_with_api") +def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" - mocker.patch('os.getenv', return_value="http://mockapi.com") + mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) return SearchGovSpidersPipeline() -def test_write_to_file(pipeline_no_api, sample_item, mocker): - """Test that URLs are written to files when SPIDER_URLS_API is not set.""" - mock_open = mocker.patch('open', mocker.mock_open()) - pipeline_no_api.process_item(sample_item, None) +def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider): + """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to - mock_open.assert_called_once_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') - mock_open().write.assert_any_call(sample_item['url'] + "\n") + mock_open.assert_called_once_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open().write.assert_any_call(sample_item["url"] + "\n") -def test_post_to_api(pipeline_with_api, sample_item, mocker): + +def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): """Test that URLs are batched and sent via POST when SPIDER_URLS_API is set.""" - mock_post = mocker.patch('requests.post') + mock_post = mocker.patch("requests.post") - pipeline_with_api.process_item(sample_item, None) + pipeline_with_api.process_item(sample_item, sample_spider) # Check that the batch contains the URL - assert sample_item['url'] in pipeline_with_api.urls_batch + assert sample_item["url"] in pipeline_with_api.urls_batch # Simulate max size to force post - mocker.patch.object(SearchGovSpidersPipeline, '_is_batch_too_large', return_value=True) - pipeline_with_api.process_item(sample_item, None) + mocker.patch.object(SearchGovSpidersPipeline, "_is_batch_too_large", return_value=True) + pipeline_with_api.process_item(sample_item, sample_spider) # Ensure POST request was made mock_post.assert_called_once_with("http://mockapi.com", json={"urls": pipeline_with_api.urls_batch}) -def test_rotate_file(pipeline_no_api, sample_item, mocker): + +def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" - mock_open = mocker.patch('open', mocker.mock_open()) - mock_rename = mocker.patch('os.rename') + mock_rename = mocker.patch("os.rename") - mocker.patch.object(SearchGovSpidersPipeline, '_is_file_too_large', return_value=True) + mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=True) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, 'w', encoding='utf-8') + mock_open.assert_called_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") mock_open().close.assert_called() 
mock_rename.assert_called_once_with( - pipeline_no_api.file_path, - pipeline_no_api.parent_file_path / "output/all-links-1.csv" + pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" ) -def test_post_urls_on_spider_close(pipeline_with_api, mocker): + +def test_post_urls_on_spider_close(pipeline_with_api, sample_spider, mocker): """Test that remaining URLs are posted when spider closes and SPIDER_URLS_API is set.""" - mock_post = mocker.patch('requests.post') + mock_post = mocker.patch("requests.post") pipeline_with_api.urls_batch = ["http://example.com"] - pipeline_with_api.close_spider(None) + pipeline_with_api.close_spider(sample_spider) + + # Ensure POST request was made on spider close, cannot verify json once urls_batch is cleared + mock_post.assert_called_once_with("http://mockapi.com", json=mocker.ANY) - # Ensure POST request was made on spider close - mock_post.assert_called_once_with("http://mockapi.com", json={"urls": ["http://example.com"]}) -def test_close_file_on_spider_close(pipeline_no_api, mocker): +def test_close_file_on_spider_close(pipeline_no_api, mock_open): """Test that the file is closed when the spider closes and no SPIDER_URLS_API is set.""" - mock_open = mocker.patch('open', mocker.mock_open()) pipeline_no_api.close_spider(None) From b8d60219ebe50b8ac6ffa9920c981e1d7cabddd5 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:56:03 -0500 Subject: [PATCH 23/40] fix test looking for a mocked all-files.csv --- tests/search_gov_spiders/test_urls_files_size.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 7735711..1c9a3b0 100644 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import pytest @@ -42,8 +43,9 @@ def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: return SearchGovSpidersPipeline() -def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider): +def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=False) pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to From ae2605e252cebfd2feb730de58673fe8ff61a03f Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 14:38:32 -0500 Subject: [PATCH 24/40] fixed no ui code --- cicd-scripts/helpers/run_without_ui.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh index cabeb3d..1c88afe 100644 --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,3 +1,3 @@ #!/bin/bash - -sudo bash -c 'nohup ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &' +SPIDER_PYTHON_VERSION=3.12 +sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" From f92f12e8c85ea1ccf402f36e46e4ee31e8b57590 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 10 Dec 2024 18:01:34 -0500 Subject: [PATCH 25/40] ... 
--- cicd-scripts/app_install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 6622445..b62abf4 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -128,4 +128,5 @@ new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" # Add the new cron job to the crontab if it's not already present (crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - + echo "Cron job added: $new_cron" From 251c6924bfe8632ff1bfb182e0a5bd00f7ed686a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 11 Dec 2024 12:24:30 -0500 Subject: [PATCH 26/40] fixed executable scripts --- cicd-scripts/app_install.sh | 2 +- cicd-scripts/app_start.sh | 4 +++- cicd-scripts/app_stop.sh | 2 +- cicd-scripts/helpers/kill_scheduler.sh | 7 ++++--- cicd-scripts/helpers/run_with_ui.sh | 5 ++--- cicd-scripts/helpers/run_without_ui.sh | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) mode change 100644 => 100755 cicd-scripts/helpers/kill_scheduler.sh diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index b62abf4..446ed5a 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -10,7 +10,7 @@ if [ "$(whoami)" = "search" ]; then echo "Executing cicd scripts as 'search' user" else echo "This script must be executed as 'search' user" - exit 1 + return fi # Get missing packages diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 3610c15..2af738a 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,10 +1,12 @@ #!/bin/bash # PUBLIC -RUN_WITH_UI=true +RUN_WITH_UI=false if $RUN_WITH_UI ; then + sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh source ./cicd-scripts/helpers/run_with_ui.sh else + sudo chmod +x ./cicd-scripts/helpers/run_without_ui.sh source ./cicd-scripts/helpers/run_without_ui.sh fi diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 0e39875..3079d15 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -42,7 +42,7 @@ ps -ef | grep nohup | grep -v grep | awk '{print $2}' remove_cron_entry() { if [ -z "$1" ]; then echo "Error: No cron entry provided." - exit 1 + return fi CRON_ENTRY="$1" diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh old mode 100644 new mode 100755 index ddc115b..76b39e6 --- a/cicd-scripts/helpers/kill_scheduler.sh +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -7,19 +7,20 @@ PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") # Check if the process exists if [ -z "$PROCESS_ID" ]; then echo "No running process found for scrapy_scheduler.py." - exit 0 + return fi # Kill the process echo "Killing process with PID: $PROCESS_ID" kill "$PROCESS_ID" +# Pause to allow the process to terminate sleep 3 # Verify if the process was killed -if [ $? -eq 0 ]; then +if ! kill -0 "$PROCESS_ID" 2>/dev/null; then echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." else echo "Failed to terminate the process. Please check manually." - exit 1 + return fi diff --git a/cicd-scripts/helpers/run_with_ui.sh b/cicd-scripts/helpers/run_with_ui.sh index ed2f400..de181e2 100755 --- a/cicd-scripts/helpers/run_with_ui.sh +++ b/cicd-scripts/helpers/run_with_ui.sh @@ -61,11 +61,11 @@ if check_url "$SCRAPYD_URL"; then echo "The scrapydweb service is running at $SCRAPYDWEB_URL" else echo "Error: scrapydweb failed at $SCRAPYDWEB_URL." - exit 1 + return fi else echo "Error: scrapyd failed at $SCRAPYD_URL." 
- exit 1 + return fi # Display the last few lines of logs @@ -74,4 +74,3 @@ tail -n 10 /var/log/scrapyd.log echo -e "\n-- Last 10 lines of scrapydweb.log:\n" tail -n 10 /var/log/scrapydweb.log -exit 0 diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh index 1c88afe..edf0c3a 100644 --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,3 +1,4 @@ #!/bin/bash SPIDER_PYTHON_VERSION=3.12 sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" +echo "Running no UI vesrion of searchgov-spider..." From 58d5495eda939696935a59d7de75df06cfa47458 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Thu, 12 Dec 2024 12:22:27 -0500 Subject: [PATCH 27/40] ... --- cicd-scripts/app_start.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 2af738a..e790b57 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,9 +1,9 @@ #!/bin/bash # PUBLIC -RUN_WITH_UI=false +SPIDER_RUN_WITH_UI=false -if $RUN_WITH_UI ; then +if $SPIDER_RUN_WITH_UI ; then sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh source ./cicd-scripts/helpers/run_with_ui.sh else From 25321725e342c1f06ce272d8c7c1e32becec6d4c Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:40:59 -0500 Subject: [PATCH 28/40] fixed paths and scripts --- .circleci/config.yml | 0 .codeclimate.yml | 0 .github/dependabot.yml | 0 .github/pull_request_template.md | 0 .gitignore | 0 .pre-commit-config.yaml | 0 LICENSE | 0 README.md | 0 appspec.yml | 2 +- cicd-scripts/app_install.sh | 195 +++++++++--------- cicd-scripts/app_start.sh | 17 +- cicd-scripts/app_stop.sh | 135 ++++++++---- cicd-scripts/helpers/check_cloudwatch.sh | 4 +- cicd-scripts/helpers/check_codedeploy.sh | 4 +- cicd-scripts/helpers/ensure_executable.sh | 15 ++ cicd-scripts/helpers/kill_scheduler.sh | 28 ++- cicd-scripts/helpers/run_without_ui.sh | 8 +- cicd-scripts/helpers/update_pythonpath.sh | 0 pyproject.toml | 0 search_gov_crawler/benchmark.py | 0 search_gov_crawler/output/.gitignore | 0 search_gov_crawler/requirements.txt | 0 search_gov_crawler/scrapy.cfg | 0 search_gov_crawler/scrapy_scheduler.py | 0 search_gov_crawler/scrapyd-logs/.gitignore | 0 search_gov_crawler/scrapyd.conf | 0 search_gov_crawler/scrapydweb_settings_v10.py | 0 .../search_gov_logparser/__init__.py | 0 .../search_gov_scrapyd/__init__.py | 0 .../search_gov_scrapydweb/__init__.py | 0 .../search_gov_spiders/__init__.py | 0 .../reports/email/bases/report/base.jinja | 2 - .../reports/email/bases/report/email.css | 2 - .../reports/email/bases/report/medium.jinja | 0 .../reports/email/bases/report/report.css | 2 +- .../search_gov_spiders/actions/results.css | 0 .../search_gov_spiders/actions/results.jinja | 0 .../search_gov_spiders/extensions/__init__.py | 0 .../extensions/json_logging.py | 0 .../search_gov_spiders/helpers/__init__.py | 0 .../helpers/domain_spider.py | 0 .../search_gov_spiders/items.py | 0 .../search_gov_spiders/middlewares.py | 0 .../search_gov_spiders/monitors.py | 2 +- .../search_gov_spiders/pipelines.py | 0 .../search_gov_spiders/settings.py | 0 .../search_gov_spiders/spiders/__init__.py | 0 .../spiders/domain_spider.py | 0 .../spiders/domain_spider_js.py | 0 .../utility_files/README.md | 0 .../utility_files/crawl-sites.json | 0 .../utility_files/import_plist.py | 0 .../utility_files/init_schedule.py | 0 
.../utility_files/scrutiny-2023-06-20.plist | 0 search_gov_crawler/setup.py | 0 setup.cfg | 0 tests/__init__.py | 0 tests/integration_tests/test_scrapyd.py | 0 tests/search_gov_spiders/conftest.py | 0 .../search_gov_spiders/crawl-sites-test.json | 0 .../scrapy_httpcache/domain_spider.db.bak | 0 .../scrapy_httpcache/domain_spider.db.dat | Bin .../scrapy_httpcache/domain_spider.db.dir | 0 .../scrapy_httpcache/domain_spider_js.db.bak | 0 .../scrapy_httpcache/domain_spider_js.db.dat | Bin .../scrapy_httpcache/domain_spider_js.db.dir | 0 tests/search_gov_spiders/test_extensions.py | 0 tests/search_gov_spiders/test_full_crawl.py | 0 tests/search_gov_spiders/test_helpers.py | 0 tests/search_gov_spiders/test_middlewares.py | 0 tests/search_gov_spiders/test_pipelines.py | 0 .../test_scrapy_scheduler.py | 0 tests/search_gov_spiders/test_spider.py | 0 .../test_urls_files_size.py | 0 .../search_gov_spiders/test_utiliity_files.py | 0 75 files changed, 245 insertions(+), 171 deletions(-) mode change 100644 => 100755 .circleci/config.yml mode change 100644 => 100755 .codeclimate.yml mode change 100644 => 100755 .github/dependabot.yml mode change 100644 => 100755 .github/pull_request_template.md mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .pre-commit-config.yaml mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 appspec.yml mode change 100644 => 100755 cicd-scripts/helpers/check_cloudwatch.sh mode change 100644 => 100755 cicd-scripts/helpers/check_codedeploy.sh create mode 100755 cicd-scripts/helpers/ensure_executable.sh mode change 100644 => 100755 cicd-scripts/helpers/run_without_ui.sh mode change 100644 => 100755 cicd-scripts/helpers/update_pythonpath.sh mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 search_gov_crawler/benchmark.py mode change 100644 => 100755 search_gov_crawler/output/.gitignore mode change 100644 => 100755 search_gov_crawler/requirements.txt mode change 100644 => 100755 search_gov_crawler/scrapy.cfg mode change 100644 => 100755 search_gov_crawler/scrapy_scheduler.py mode change 100644 => 100755 search_gov_crawler/scrapyd-logs/.gitignore mode change 100644 => 100755 search_gov_crawler/scrapyd.conf mode change 100644 => 100755 search_gov_crawler/scrapydweb_settings_v10.py mode change 100644 => 100755 search_gov_crawler/search_gov_logparser/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_scrapyd/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_scrapydweb/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/results.css mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/actions/results.jinja mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/extensions/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/extensions/json_logging.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/helpers/__init__.py mode change 
100644 => 100755 search_gov_crawler/search_gov_spiders/helpers/domain_spider.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/items.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/middlewares.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/monitors.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/pipelines.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/settings.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/__init__.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/domain_spider.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/README.md mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/import_plist.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py mode change 100644 => 100755 search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist mode change 100644 => 100755 search_gov_crawler/setup.py mode change 100644 => 100755 setup.cfg mode change 100644 => 100755 tests/__init__.py mode change 100644 => 100755 tests/integration_tests/test_scrapyd.py mode change 100644 => 100755 tests/search_gov_spiders/conftest.py mode change 100644 => 100755 tests/search_gov_spiders/crawl-sites-test.json mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat mode change 100644 => 100755 tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir mode change 100644 => 100755 tests/search_gov_spiders/test_extensions.py mode change 100644 => 100755 tests/search_gov_spiders/test_full_crawl.py mode change 100644 => 100755 tests/search_gov_spiders/test_helpers.py mode change 100644 => 100755 tests/search_gov_spiders/test_middlewares.py mode change 100644 => 100755 tests/search_gov_spiders/test_pipelines.py mode change 100644 => 100755 tests/search_gov_spiders/test_scrapy_scheduler.py mode change 100644 => 100755 tests/search_gov_spiders/test_spider.py mode change 100644 => 100755 tests/search_gov_spiders/test_urls_files_size.py mode change 100644 => 100755 tests/search_gov_spiders/test_utiliity_files.py diff --git a/.circleci/config.yml b/.circleci/config.yml old mode 100644 new mode 100755 diff --git a/.codeclimate.yml b/.codeclimate.yml old mode 100644 new mode 100755 diff --git a/.github/dependabot.yml b/.github/dependabot.yml old mode 100644 new mode 100755 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/appspec.yml b/appspec.yml old mode 100644 new mode 
100755 index ccb3c07..87244e0 --- a/appspec.yml +++ b/appspec.yml @@ -2,7 +2,7 @@ version: 0.0 os: linux permissions: - object: . - mode: 755 + mode: 777 acls: - "d:u::rwx" - "d:g::rwx" diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 446ed5a..9b9c8df 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,132 +1,131 @@ #!/bin/bash -# Kill all spider services (if running) -echo "Running app_stop.sh" -sudo chmod +x ./cicd-scripts/app_stop.sh -source ./cicd-scripts/app_stop.sh - -# CICD scripts can only runas 'search' user on AWS -if [ "$(whoami)" = "search" ]; then - echo "Executing cicd scripts as 'search' user" -else - echo "This script must be executed as 'search' user" - return -fi - -# Get missing packages -sudo apt-get install lzma -sudo apt-get install liblzma-dev -yes | sudo apt-get install libbz2-dev - -# Start AWS CloudWatch agent -sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh -source ./cicd-scripts/helpers/check_cloudwatch.sh - -# Start AWS CodeDeploy agent -sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh -source ./cicd-scripts/helpers/check_codedeploy.sh - -# PUBLIC -SPIDER_PYTHON_VERSION=3.12 +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh -# PRIVATE +### VARIABLES ### +SPIDER_PYTHON_VERSION=3.12 _CURRENT_BUILD_DIR=${PWD} +VENV_DIR=./venv -# Update and upgrade the system without prompting for confirmation -sudo apt-get update -y -sudo apt-get upgrade -y -sudo apt install acl -y - -# Required to give all app_* bash scripts read/write permissions to self and parent. -# Give current directory and all its files rw permissions -sudo chmod -R 755 . -# All new files/directories will inherit rwx (required when installing and using sqllite) -sudo setfacl -Rdm g:dgsearch:rwx . +### FUNCTIONS ### +# Stop spider services +stop_services() { + echo "Running app_stop.sh..." + ensure_executable "./cicd-scripts/app_stop.sh" +} -# Install necessary system dependencies -sudo apt-get install -y python-setuptools +# Install missing system dependencies +install_system_dependencies() { + echo "Installing system dependencies..." + sudo apt-get update -y + sudo apt-get install -y \ + lzma liblzma-dev libbz2-dev python-setuptools \ + acl build-essential checkinstall libreadline-dev \ + libncursesw5-dev libssl-dev libsqlite3-dev tk-dev \ + libgdbm-dev libc6-dev zlib1g-dev libffi-dev openssl +} +# Install Python install_python() { - echo "Installing ${SPIDER_PYTHON_VERSION}" - sudo apt-get install -y build-essential checkinstall libreadline-dev \ - libncursesw5-dev libssl-dev libsqlite3-dev \ - tk-dev libgdbm-dev libc6-dev libbz2-dev \ - zlib1g-dev openssl libffi-dev - - # Download Python source code + echo "Installing Python ${SPIDER_PYTHON_VERSION}..." cd /usr/src - sudo wget https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz - - # Build and install Python cd Python-${SPIDER_PYTHON_VERSION}.0 sudo ./configure --enable-optimizations sudo make altinstall + cd "$_CURRENT_BUILD_DIR" + echo "Python ${SPIDER_PYTHON_VERSION} installed successfully." +} - # Return to the build directory - cd $_CURRENT_BUILD_DIR +# Check and install Python if needed +check_python() { + if ! 
command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then + install_python + else + echo "Python ${SPIDER_PYTHON_VERSION} already installed: $(python${SPIDER_PYTHON_VERSION} --version)" + fi +} - echo "Python ${SPIDER_PYTHON_VERSION} has been installed." +# Set environment paths +update_pythonpath() { + ensure_executable "./cicd-scripts/helpers/update_pythonpath.sh" } -# Check if Python is installed -if command -v python${SPIDER_PYTHON_VERSION} &>/dev/null; then - echo "Python ${SPIDER_PYTHON_VERSION} is already installed: $(python${SPIDER_PYTHON_VERSION} --version)" -else - echo "Python ${SPIDER_PYTHON_VERSION} is not installed. Installing Python ${SPIDER_PYTHON_VERSION}..." - install_python -fi +# Setup virtual environment +setup_virtualenv() { + echo "Setting up virtual environment..." + python${SPIDER_PYTHON_VERSION} -m venv "$VENV_DIR" + source "$VENV_DIR/bin/activate" + python -m pip install --upgrade pip +} -# Set PYTHONPATH env -source ./cicd-scripts/helpers/update_pythonpath.sh +# Install dependencies +install_dependencies() { + echo "Installing dependencies..." + python -m pip install --upgrade -r ./search_gov_crawler/requirements.txt + echo "Installing Playwright..." + python -m pip install --upgrade pytest-playwright playwright + playwright install --with-deps + deactivate +} -# Use venv with Python 3.12 -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m pip install --upgrade pip +# Configure permissions +configure_permissions() { + echo "Configuring file permissions..." + sudo chmod -R 777 . + sudo chown -R "$(whoami)" . + sudo setfacl -Rdm g:dgsearch:rwx . +} -# Create a virtual environment using Python -echo "Creating python${SPIDER_PYTHON_VERSION} virtual environment..." -sudo /usr/local/bin/python${SPIDER_PYTHON_VERSION} -m venv ./venv +# Manage cron jobs +manage_cron_jobs() { + echo "Managing cron jobs..." + crontab -l | grep -v 'app_start.sh' > temp_cron || true + echo "@reboot $(pwd)/cicd-scripts/app_start.sh" >> temp_cron + crontab temp_cron + rm temp_cron + echo "Cron jobs updated." +} -# Activate the virtual environment -source ./venv/bin/activate +# Start monitoring agents +start_agents() { + echo "Starting AWS CloudWatch agent..." + ensure_executable "./cicd-scripts/helpers/check_cloudwatch.sh" -# Install all spider dependencies -echo "Installing dependencies..." -sudo pip install --force-reinstall -r ./search_gov_crawler/requirements.txt -sudo pip install pytest-playwright playwright -U -playwright install + echo "Starting AWS CodeDeploy agent..." + ensure_executable "./cicd-scripts/helpers/check_codedeploy.sh" +} -echo "Dependencies installed." +### SCRIPT EXECUTION ### +# Stop running services +stop_services -# Remove any outstanding app_start.sh reboot cronjobs -echo "Removing any app_start.sh reboot cron jobs..." -crontab -l > cron_backup.bak +# Install system dependencies +install_system_dependencies -# Remove lines containing 'app_start.sh' and update crontab -crontab -l | grep -v 'app_start.sh' > cron_backup_filtered +# Check and install Python if missing +check_python -# Check if there are changes -if cmp -s cron_backup_filtered cron_backup.bak; then - echo "No cron jobs with 'app_start.sh' found." -else - sudo crontab cron_backup_filtered - echo "Cron jobs containing 'app_start.sh' have been removed." 
-fi +# Set environment paths +update_pythonpath -# Clean up temporary files -rm cron_backup_filtered cron_backup.bak +# Configure permissions +configure_permissions -# Add cron job to run the app back up on ec2 restart -echo "Adding app_start.sh reboot cron job..." -sudo chmod +x ./cicd-scripts/app_start.sh +# Setup and activate virtual environment +setup_virtualenv -# Define the new cron job -new_cron="@reboot at now + 1 min -f $(pwd)/cicd-scripts/app_start.sh" +# Install dependencies +install_dependencies -# Add the new cron job to the crontab if it's not already present -(crontab -l | grep -v "$new_cron" ; echo "$new_cron") | crontab - +# Start AWS agents +start_agents +# Manage cron jobs +manage_cron_jobs -echo "Cron job added: $new_cron" +echo "App installation completed successfully." diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index e790b57..889b511 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,12 +1,17 @@ #!/bin/bash -# PUBLIC +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh + +# TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false -if $SPIDER_RUN_WITH_UI ; then - sudo chmod +x ./cicd-scripts/helpers/run_with_ui.sh - source ./cicd-scripts/helpers/run_with_ui.sh +# Determine which script to run based on the SPIDER_RUN_WITH_UI flag +if $SPIDER_RUN_WITH_UI; then + SCRIPT="./cicd-scripts/helpers/run_with_ui.sh" else - sudo chmod +x ./cicd-scripts/helpers/run_without_ui.sh - source ./cicd-scripts/helpers/run_without_ui.sh + SCRIPT="./cicd-scripts/helpers/run_without_ui.sh" fi + +# Ensure the script exists, is executable, and run it +ensure_executable "$SCRIPT" diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index 3079d15..ea1cb66 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,58 +1,115 @@ #!/bin/bash -# Clear all cache -echo "Purge all pip cache..." -# We can't do `$pip cache purge`, this does the samething: -sudo rm -r ~/.cache/pip -sudo rm -rf /root/.cache/pip - -# Kill scrapy schedular (if running): -echo "Stopping scrapy_scheduler.py (if running)" -sudo chmod +x ./cicd-scripts/helpers/kill_scheduler.sh -source ./cicd-scripts/helpers/kill_scheduler.sh - -echo "Stopping all scrapyd and scrapydweb tasks..." -# Kill all scrapydweb and scrapyd jobs -if sudo pkill -f "scrapydweb" 2>/dev/null; then - echo "scrapydweb tasks stopped." -else - echo "No scrapydweb tasks running." -fi - -if sudo pkill -f "scrapyd" 2>/dev/null; then - echo "scrapyd tasks stopped." -else - echo "No scrapyd tasks running." -fi +chmod +x ./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh -# Display remaining scrapy processes (if any) -echo -e "\nRemaining scrapy processes (if any):" -ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." +### FUNCTIONS ### + +# Remove virtual environment if it exists +remove_venv() { + if [ -d ./venv ]; then + echo "Removing virtual environment..." + rm -rf ./venv/ + fi +} -# Force kill any remaning scrapy background jobs still running -sudo ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9 +# Purge pip cache +purge_pip_cache() { + echo "Purging pip cache..." + rm -rf ~/.cache/pip /root/.cache/pip +} + +# Stop scrapy scheduler if running +stop_scrapy_scheduler() { + echo "Stopping scrapy_scheduler.py (if running)..." 
+ ensure_executable "./cicd-scripts/helpers/kill_scheduler.sh" +} -# Kill all nohup jobs (runs with python) -ps -ef | grep nohup | grep -v grep | awk '{print $2}' +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks() { + echo "Stopping all scrapyd and scrapydweb tasks..." -# Remove other deploy cron jobs: -#!/bin/bash + # Kill scrapydweb tasks + if pkill -f "scrapydweb" 2>/dev/null; then + echo "scrapydweb tasks stopped." + else + echo "No scrapydweb tasks running." + fi + + # Kill scrapyd tasks + if pkill -f "scrapyd" 2>/dev/null; then + echo "scrapyd tasks stopped." + else + echo "No scrapyd tasks running." + fi +} + +# Display remaining scrapy processes +display_remaining_scrapy_processes() { + echo -e "\nRemaining scrapy processes (if any):" + ps -ef | grep scrapy | grep -v grep || echo "No scrapy processes running." +} + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs() { + echo "Force killing remaining scrapy background jobs..." + if ps aux | grep -ie [s]crapy | awk '{print $2}' | xargs kill -9; then + echo "Remaining scrapy jobs killed." + else + echo "No remaining scrapy jobs to kill." + fi +} + +# Remove nohup jobs (python scripts) +remove_nohup_jobs() { + echo "Removing nohup jobs (python)..." + ps -ef | grep nohup | grep -v grep | awk '{print $2}' | xargs kill -9 +} -# Function to remove crontab entries referencing a given cron entry string +# Remove cron job entries referencing the given string remove_cron_entry() { if [ -z "$1" ]; then echo "Error: No cron entry provided." return fi - CRON_ENTRY="$1" + local CRON_ENTRY="$1" + local CRON_USER=$(whoami) - # Remove entries referencing the script - sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab - + echo "Removing cron job entries referencing: $CRON_ENTRY" - echo "Removed any crontab entries referencing $CRON_ENTRY." + # Remove cron job for the current user (including the full path if needed) + sudo crontab -l -u "$CRON_USER" 2>/dev/null | grep -v -F "$CRON_ENTRY" | sudo crontab -u "$CRON_USER" - + + echo "Cron job entries for '$CRON_ENTRY' removed." } -# Remove any other cron job entries +### SCRIPT EXECUTION ### + +# Remove virtual environment +remove_venv + +# Purge pip cache +purge_pip_cache + +# Stop scrapy scheduler if running +stop_scrapy_scheduler + +# Stop scrapyd and scrapydweb tasks +stop_scrapy_tasks + +# Display remaining scrapy processes (if any) +display_remaining_scrapy_processes + +# Force kill any remaining scrapy background jobs +kill_remaining_scrapy_jobs + +# Remove nohup jobs (python) +remove_nohup_jobs + +# Remove specific cron jobs remove_cron_entry "check_cloudwatch.sh" remove_cron_entry "check_codedeploy.sh" +remove_cron_entry "app_start.sh" + +echo "App stop completed successfully." 
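A quick manual check that the stop sequence above left nothing behind might look like this (sketch only, reusing the process names and cron entries the script already targets):

  # Any scrapy, scrapyd, scrapydweb, or scheduler processes still alive?
  pgrep -af scrapy || echo "no scrapy processes remain"
  pgrep -af scrapy_scheduler.py || echo "scheduler is stopped"
  # The helper cron entries should have been removed as well
  crontab -l 2>/dev/null | grep -E "check_cloudwatch|check_codedeploy|app_start" || echo "no leftover cron entries"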
diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh old mode 100644 new mode 100755 index 18e4870..29bb1db --- a/cicd-scripts/helpers/check_cloudwatch.sh +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -18,10 +18,10 @@ check_cloudwatch() { # Ensure the script is added to crontab for execution on reboot setup_cron() { sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh - CRON_ENTRY="@reboot /bin/bash $PWD/cicd-scripts/helpers/check_cloudwatch.sh" + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_cloudwatch.sh" # Update crontab, ensuring no duplicates - (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + (crontab -l 2>/dev/null | grep -v -F "check_cloudwatch.sh"; echo "$CRON_ENTRY") | crontab - echo "Crontab entry added to ensure the script runs on reboot." } diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh old mode 100644 new mode 100755 index 98731dc..2faaca1 --- a/cicd-scripts/helpers/check_codedeploy.sh +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -18,10 +18,10 @@ check_codedeploy() { # Ensure the script is added to crontab for execution on reboot setup_cron() { sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh - CRON_ENTRY="@reboot /bin/bash $PWD/helpers/check_codedeploy.sh" + CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_codedeploy.sh" # Update crontab, ensuring no duplicates - (sudo crontab -l 2>/dev/null | grep -v -F "$CRON_ENTRY"; echo "$CRON_ENTRY") | sudo crontab - + (crontab -l 2>/dev/null | grep -v -F "check_codedeploy.sh"; echo "$CRON_ENTRY") | crontab - echo "Crontab entry added to ensure the script runs on reboot." } diff --git a/cicd-scripts/helpers/ensure_executable.sh b/cicd-scripts/helpers/ensure_executable.sh new file mode 100755 index 0000000..88e6439 --- /dev/null +++ b/cicd-scripts/helpers/ensure_executable.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Function to ensure a file exists, is executable, and then runs it +ensure_executable() { + local script="$1" + + if [ -f "$script" ]; then + chmod +x "$script" + echo "$script is now executable." + source "$script" + else + echo "Error: $script not found!" + # exit 1 + fi +} diff --git a/cicd-scripts/helpers/kill_scheduler.sh b/cicd-scripts/helpers/kill_scheduler.sh index 76b39e6..4c559da 100755 --- a/cicd-scripts/helpers/kill_scheduler.sh +++ b/cicd-scripts/helpers/kill_scheduler.sh @@ -4,23 +4,21 @@ echo "Searching for scrapy_scheduler.py process..." PROCESS_ID=$(pgrep -f "scrapy_scheduler.py") -# Check if the process exists -if [ -z "$PROCESS_ID" ]; then +# Check if the process ID was found +if [ -n "$PROCESS_ID" ]; then echo "No running process found for scrapy_scheduler.py." - return -fi -# Kill the process -echo "Killing process with PID: $PROCESS_ID" -kill "$PROCESS_ID" + # Kill the process + echo "Killing process with PID: $PROCESS_ID" + kill "$PROCESS_ID" 2>/dev/null -# Pause to allow the process to terminate -sleep 3 + # Pause to allow the process to terminate + sleep 3 -# Verify if the process was killed -if ! kill -0 "$PROCESS_ID" 2>/dev/null; then - echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." -else - echo "Failed to terminate the process. Please check manually." - return + # Verify if the process was killed + if ! kill -0 "$PROCESS_ID" 2>/dev/null; then + echo "Process scrapy_scheduler.py (PID: $PROCESS_ID) has been terminated." + else + echo "Failed to terminate the process or process no longer exists." 
+ fi fi diff --git a/cicd-scripts/helpers/run_without_ui.sh b/cicd-scripts/helpers/run_without_ui.sh old mode 100644 new mode 100755 index edf0c3a..247488b --- a/cicd-scripts/helpers/run_without_ui.sh +++ b/cicd-scripts/helpers/run_without_ui.sh @@ -1,4 +1,8 @@ #!/bin/bash -SPIDER_PYTHON_VERSION=3.12 -sudo bash -c "nohup /usr/local/bin/python${SPIDER_PYTHON_VERSION} ./search_gov_crawler/scrapy_scheduler.py > /var/log/scrapy.log 2>&1 &" + +# Run the script in the background using the virtual environment +chmod +x ./search_gov_crawler/scrapy_scheduler.py + +sudo nohup bash -c "source ./venv/bin/activate && ./venv/bin/python ./search_gov_crawler/scrapy_scheduler.py" > /var/log/scrapy_scheduler.log 2>&1 & + echo "Running no UI vesrion of searchgov-spider..." diff --git a/cicd-scripts/helpers/update_pythonpath.sh b/cicd-scripts/helpers/update_pythonpath.sh old mode 100644 new mode 100755 diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 diff --git a/search_gov_crawler/benchmark.py b/search_gov_crawler/benchmark.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/output/.gitignore b/search_gov_crawler/output/.gitignore old mode 100644 new mode 100755 diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapy.cfg b/search_gov_crawler/scrapy.cfg old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapy_scheduler.py b/search_gov_crawler/scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapyd-logs/.gitignore b/search_gov_crawler/scrapyd-logs/.gitignore old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapyd.conf b/search_gov_crawler/scrapyd.conf old mode 100644 new mode 100755 diff --git a/search_gov_crawler/scrapydweb_settings_v10.py b/search_gov_crawler/scrapydweb_settings_v10.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_logparser/__init__.py b/search_gov_crawler/search_gov_logparser/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapyd/__init__.py b/search_gov_crawler/search_gov_scrapyd/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_scrapydweb/__init__.py b/search_gov_crawler/search_gov_scrapydweb/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/__init__.py b/search_gov_crawler/search_gov_spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja old mode 100644 new mode 100755 index 656c94b..47f6bff --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja @@ -23,5 +23,3 @@ - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css old mode 100644 new mode 100755 index 2f13050..7d63694 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css @@ -83,5 +83,3 @@ table{max-width:100%;background-color:transparent;border-collapse:collapse;borde .icon,.icon-big {display:inline-block;} .icon 
{width:34px;height:34px;} .icon-big {width:140px;height:140px;} - - diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css old mode 100644 new mode 100755 index bd1e39d..124bd07 --- a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css @@ -24,4 +24,4 @@ table.report-container td {padding: 40px 20px;} .report-section h2 {margin: 0 0 20px 0;padding: 0 0 12px 0;line-height: 20px;border-bottom: 1px solid #f4f4f4;} .report-section h3 {margin: 25px 0 5px 0;line-height: 24px;} .report-section h4 {margin: 0 0 2px 0;} -.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} \ No newline at end of file +.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} diff --git a/search_gov_crawler/search_gov_spiders/actions/results.css b/search_gov_crawler/search_gov_spiders/actions/results.css old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/actions/results.jinja b/search_gov_crawler/search_gov_spiders/actions/results.jinja old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/__init__.py b/search_gov_crawler/search_gov_spiders/extensions/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/extensions/json_logging.py b/search_gov_crawler/search_gov_spiders/extensions/json_logging.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/__init__.py b/search_gov_crawler/search_gov_spiders/helpers/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py b/search_gov_crawler/search_gov_spiders/helpers/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/items.py b/search_gov_crawler/search_gov_spiders/items.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/middlewares.py b/search_gov_crawler/search_gov_spiders/middlewares.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/monitors.py b/search_gov_crawler/search_gov_spiders/monitors.py old mode 100644 new mode 100755 index 260dd94..7dafd2a --- a/search_gov_crawler/search_gov_spiders/monitors.py +++ b/search_gov_crawler/search_gov_spiders/monitors.py @@ -14,4 +14,4 @@ class PeriodicMonitorSuite(MonitorSuite): monitors_failed_actions = [ CreateCustomFileReport, SendSmtpEmail - ] \ No newline at end of file + ] diff --git a/search_gov_crawler/search_gov_spiders/pipelines.py b/search_gov_crawler/search_gov_spiders/pipelines.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/settings.py b/search_gov_crawler/search_gov_spiders/settings.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/__init__.py b/search_gov_crawler/search_gov_spiders/spiders/__init__.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py 
b/search_gov_crawler/search_gov_spiders/spiders/domain_spider.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py b/search_gov_crawler/search_gov_spiders/spiders/domain_spider_js.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/README.md b/search_gov_crawler/search_gov_spiders/utility_files/README.md old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json b/search_gov_crawler/search_gov_spiders/utility_files/crawl-sites.json old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py b/search_gov_crawler/search_gov_spiders/utility_files/import_plist.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py b/search_gov_crawler/search_gov_spiders/utility_files/init_schedule.py old mode 100644 new mode 100755 diff --git a/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist b/search_gov_crawler/search_gov_spiders/utility_files/scrutiny-2023-06-20.plist old mode 100644 new mode 100755 diff --git a/search_gov_crawler/setup.py b/search_gov_crawler/setup.py old mode 100644 new mode 100755 diff --git a/setup.cfg b/setup.cfg old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/integration_tests/test_scrapyd.py b/tests/integration_tests/test_scrapyd.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/conftest.py b/tests/search_gov_spiders/conftest.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/crawl-sites-test.json b/tests/search_gov_spiders/crawl-sites-test.json old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.bak old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dat old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir b/tests/search_gov_spiders/scrapy_httpcache/domain_spider_js.db.dir old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_extensions.py b/tests/search_gov_spiders/test_extensions.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_helpers.py b/tests/search_gov_spiders/test_helpers.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_middlewares.py b/tests/search_gov_spiders/test_middlewares.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_pipelines.py b/tests/search_gov_spiders/test_pipelines.py old mode 100644 
new mode 100755 diff --git a/tests/search_gov_spiders/test_scrapy_scheduler.py b/tests/search_gov_spiders/test_scrapy_scheduler.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_spider.py b/tests/search_gov_spiders/test_spider.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py old mode 100644 new mode 100755 diff --git a/tests/search_gov_spiders/test_utiliity_files.py b/tests/search_gov_spiders/test_utiliity_files.py old mode 100644 new mode 100755 From 721f77fc5054f2b9c093b49a14569649b5f3b3a3 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:46:13 -0500 Subject: [PATCH 29/40] fix --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 889b511..31e9353 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,7 +1,7 @@ #!/bin/bash chmod +x ./cicd-scripts/helpers/ensure_executable.sh -source ./cicd-scripts/helpers/ensure_executable.sh +./cicd-scripts/helpers/ensure_executable.sh # TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false From 866fbd40f1d3ed3dd96a0d8d960ac7312b8ff2f2 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 04:50:48 -0500 Subject: [PATCH 30/40] ... --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 31e9353..8a8710e 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,5 +1,5 @@ #!/bin/bash - +echo "###" $(pwd) chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh From 9a528f10fdff59c37c6298fd8c9ec5a68eb8b451 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:13:53 -0500 Subject: [PATCH 31/40] set root path --- cicd-scripts/app_install.sh | 3 +++ cicd-scripts/app_start.sh | 5 ++++- cicd-scripts/app_stop.sh | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 9b9c8df..d19f00d 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -1,5 +1,8 @@ #!/bin/bash +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 8a8710e..a0fa24c 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -1,5 +1,8 @@ #!/bin/bash -echo "###" $(pwd) + +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/app_stop.sh b/cicd-scripts/app_stop.sh index ea1cb66..9d536a8 100755 --- a/cicd-scripts/app_stop.sh +++ b/cicd-scripts/app_stop.sh @@ -1,5 +1,8 @@ #!/bin/bash +# CD into the current script directory (which != $pwd) +cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ + chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh From 3f43ad8b879ff600e2356e70e336343380b96631 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:18:04 -0500 Subject: [PATCH 32/40] ... 
--- cicd-scripts/app_start.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index a0fa24c..5e3e45b 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -3,6 +3,8 @@ # CD into the current script directory (which != $pwd) cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ +echo "### $pwd" + chmod +x ./cicd-scripts/helpers/ensure_executable.sh ./cicd-scripts/helpers/ensure_executable.sh From 2841b49fcdae0bca1191f268a3bf3023b8b746ff Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:18:14 -0500 Subject: [PATCH 33/40] ... --- cicd-scripts/app_start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index 5e3e45b..eb99ddb 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -6,7 +6,7 @@ cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ echo "### $pwd" chmod +x ./cicd-scripts/helpers/ensure_executable.sh -./cicd-scripts/helpers/ensure_executable.sh +source ./cicd-scripts/helpers/ensure_executable.sh # TODO: Make it part of the local env variable that is set by Ansible SPIDER_RUN_WITH_UI=false From a1ba2da143a2c260eda7482015694edbf02742dd Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 05:42:11 -0500 Subject: [PATCH 34/40] ... --- cicd-scripts/app_install.sh | 14 ++++++++------ cicd-scripts/app_start.sh | 2 -- cicd-scripts/helpers/check_cloudwatch.sh | 2 +- cicd-scripts/helpers/check_codedeploy.sh | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index d19f00d..1a61dee 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -34,11 +34,13 @@ install_system_dependencies() { install_python() { echo "Installing Python ${SPIDER_PYTHON_VERSION}..." cd /usr/src - sudo wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz - sudo tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz + wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz + tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz cd Python-${SPIDER_PYTHON_VERSION}.0 - sudo ./configure --enable-optimizations - sudo make altinstall + ./configure --enable-optimizations + make + make install + make altinstall cd "$_CURRENT_BUILD_DIR" echo "Python ${SPIDER_PYTHON_VERSION} installed successfully." } @@ -78,8 +80,8 @@ install_dependencies() { # Configure permissions configure_permissions() { echo "Configuring file permissions..." - sudo chmod -R 777 . - sudo chown -R "$(whoami)" . + chmod -R 777 . + chown -R "$(whoami)" . sudo setfacl -Rdm g:dgsearch:rwx . 
} diff --git a/cicd-scripts/app_start.sh b/cicd-scripts/app_start.sh index eb99ddb..76b0081 100755 --- a/cicd-scripts/app_start.sh +++ b/cicd-scripts/app_start.sh @@ -3,8 +3,6 @@ # CD into the current script directory (which != $pwd) cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && cd ../ -echo "### $pwd" - chmod +x ./cicd-scripts/helpers/ensure_executable.sh source ./cicd-scripts/helpers/ensure_executable.sh diff --git a/cicd-scripts/helpers/check_cloudwatch.sh b/cicd-scripts/helpers/check_cloudwatch.sh index 29bb1db..487d122 100755 --- a/cicd-scripts/helpers/check_cloudwatch.sh +++ b/cicd-scripts/helpers/check_cloudwatch.sh @@ -17,7 +17,7 @@ check_cloudwatch() { # Ensure the script is added to crontab for execution on reboot setup_cron() { - sudo chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh + chmod +x ./cicd-scripts/helpers/check_cloudwatch.sh CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_cloudwatch.sh" # Update crontab, ensuring no duplicates diff --git a/cicd-scripts/helpers/check_codedeploy.sh b/cicd-scripts/helpers/check_codedeploy.sh index 2faaca1..6e6cf15 100755 --- a/cicd-scripts/helpers/check_codedeploy.sh +++ b/cicd-scripts/helpers/check_codedeploy.sh @@ -17,7 +17,7 @@ check_codedeploy() { # Ensure the script is added to crontab for execution on reboot setup_cron() { - sudo chmod +x ./cicd-scripts/helpers/check_codedeploy.sh + chmod +x ./cicd-scripts/helpers/check_codedeploy.sh CRON_ENTRY="@reboot $(pwd)/cicd-scripts/helpers/check_codedeploy.sh" # Update crontab, ensuring no duplicates From 43c8df0dd256f5f3aae1932ced6ac8372da4acce Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 12:39:53 -0500 Subject: [PATCH 35/40] test unit tests --- cicd-scripts/app_install.sh | 1 + tests/search_gov_spiders/test_urls_files_size.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cicd-scripts/app_install.sh b/cicd-scripts/app_install.sh index 1a61dee..5179308 100755 --- a/cicd-scripts/app_install.sh +++ b/cicd-scripts/app_install.sh @@ -36,6 +36,7 @@ install_python() { cd /usr/src wget -q https://www.python.org/ftp/python/${SPIDER_PYTHON_VERSION}.0/Python-${SPIDER_PYTHON_VERSION}.0.tgz tar xzf Python-${SPIDER_PYTHON_VERSION}.0.tgz + sudo chown -R $(whoami) ./Python-${SPIDER_PYTHON_VERSION}.0 cd Python-${SPIDER_PYTHON_VERSION}.0 ./configure --enable-optimizations make diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 1c9a3b0..2c683c0 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -33,6 +33,7 @@ def fixture_mock_open(mocker): @pytest.fixture(name="pipeline_no_api") def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: mocker.patch.dict(os.environ, {}) + mocker.patch('os.getpid', return_value=1234) return SearchGovSpidersPipeline() @@ -40,12 +41,12 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) + mocker.patch('os.getpid', return_value=1234) return SearchGovSpidersPipeline() def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" - mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=False) 
pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to @@ -74,11 +75,10 @@ def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" mock_rename = mocker.patch("os.rename") - mocker.patch.object(SearchGovSpidersPipeline, "_is_file_too_large", return_value=True) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open.assert_called_with(pipeline_no_api.base_file_name, "a", encoding="utf-8") mock_open().close.assert_called() mock_rename.assert_called_once_with( pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" From 2638509ba155e689d225638ed1798cd2f4edba13 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:47:17 -0500 Subject: [PATCH 36/40] fixes to tests --- .../test_urls_files_size.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index 2c683c0..f2de164 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -1,10 +1,9 @@ import os -from pathlib import Path import pytest - from scrapy import Spider from scrapy.utils.test import get_crawler + from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem from search_gov_crawler.search_gov_spiders.pipelines import SearchGovSpidersPipeline @@ -33,7 +32,7 @@ def fixture_mock_open(mocker): @pytest.fixture(name="pipeline_no_api") def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: mocker.patch.dict(os.environ, {}) - mocker.patch('os.getpid', return_value=1234) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) return SearchGovSpidersPipeline() @@ -41,16 +40,17 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) - mocker.patch('os.getpid', return_value=1234) + mocker.patch("os.getpid", return_value=1234) return SearchGovSpidersPipeline() def test_write_to_file(pipeline_no_api, mock_open, sample_item, sample_spider, mocker): """Test that URLs are written to files when SPIDER_URLS_API is not set.""" + mocker.patch.object(SearchGovSpidersPipeline, "_file_size", return_value=100) pipeline_no_api.process_item(sample_item, sample_spider) # Ensure file is opened and written to - mock_open.assert_called_once_with(pipeline_no_api.base_file_name, "w", encoding="utf-8") + mock_open.assert_called_once_with(pipeline_no_api.file_path, "a", encoding="utf-8") mock_open().write.assert_any_call(sample_item["url"] + "\n") @@ -64,7 +64,11 @@ def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): assert sample_item["url"] in pipeline_with_api.urls_batch # Simulate max size to force post - mocker.patch.object(SearchGovSpidersPipeline, "_is_batch_too_large", return_value=True) + mocker.patch.object( + SearchGovSpidersPipeline, + "_batch_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) pipeline_with_api.process_item(sample_item, sample_spider) # Ensure POST request was made @@ -74,15 
+78,17 @@ def test_post_to_api(pipeline_with_api, sample_item, sample_spider, mocker): def test_rotate_file(pipeline_no_api, mock_open, sample_item, mocker): """Test that file rotation occurs when max size is exceeded.""" mock_rename = mocker.patch("os.rename") - + mocker.patch.object( + SearchGovSpidersPipeline, + "_file_size", + return_value=SearchGovSpidersPipeline.MAX_FILE_SIZE_BYTES, + ) pipeline_no_api.process_item(sample_item, None) # Check if the file was rotated - mock_open.assert_called_with(pipeline_no_api.base_file_name, "a", encoding="utf-8") + mock_open.assert_called_with(pipeline_no_api.file_path, "a", encoding="utf-8") mock_open().close.assert_called() - mock_rename.assert_called_once_with( - pipeline_no_api.file_path, pipeline_no_api.parent_file_path / "output/all-links-1.csv" - ) + mock_rename.assert_called_once() def test_post_urls_on_spider_close(pipeline_with_api, sample_spider, mocker): From 7e9c9350c48a119b7b5a68190ff3d86e90384225 Mon Sep 17 00:00:00 2001 From: selfdanielj <38377823+selfdanielj@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:12:05 -0500 Subject: [PATCH 37/40] use same method for both mocks --- tests/search_gov_spiders/test_urls_files_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/search_gov_spiders/test_urls_files_size.py b/tests/search_gov_spiders/test_urls_files_size.py index f2de164..319b547 100755 --- a/tests/search_gov_spiders/test_urls_files_size.py +++ b/tests/search_gov_spiders/test_urls_files_size.py @@ -40,7 +40,8 @@ def fixture_pipeline_no_api(mock_open, mocker) -> SearchGovSpidersPipeline: def fixture_pipeline_with_api(mocker) -> SearchGovSpidersPipeline: """Fixture for pipeline with an API URL set.""" mocker.patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}) - mocker.patch("os.getpid", return_value=1234) + mocker.patch("search_gov_crawler.search_gov_spiders.pipelines.SearchGovSpidersPipeline.APP_PID", 1234) + return SearchGovSpidersPipeline() From c2ac070c769cf0f7ee06e3887ad009328e2b707a Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Tue, 17 Dec 2024 15:37:49 -0500 Subject: [PATCH 38/40] added dedup test for full coverage --- .../test_deduplicator_pipeline.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/search_gov_spiders/test_deduplicator_pipeline.py diff --git a/tests/search_gov_spiders/test_deduplicator_pipeline.py b/tests/search_gov_spiders/test_deduplicator_pipeline.py new file mode 100644 index 0000000..86d9ee0 --- /dev/null +++ b/tests/search_gov_spiders/test_deduplicator_pipeline.py @@ -0,0 +1,117 @@ +import os +import pytest +from unittest.mock import MagicMock, patch +from scrapy.exceptions import DropItem +from search_gov_crawler.search_gov_spiders.pipelines import ( + SearchGovSpidersPipeline, + DeDeuplicatorPipeline, +) + +# --------------------------- +# Fixtures +# --------------------------- + +@pytest.fixture +def sample_item(): + """Fixture for a valid sample item.""" + return {"url": "http://example.com"} + +@pytest.fixture +def invalid_item(): + """Fixture for an invalid item with no URL.""" + return {} + +@pytest.fixture +def sample_spider(): + """Fixture for a mock spider with a logger.""" + class SpiderMock: + logger = MagicMock() + return SpiderMock() + +@pytest.fixture +def pipeline_no_api(): + """Fixture for SearchGovSpidersPipeline with no SPIDER_URLS_API.""" + with patch.dict(os.environ, {}, clear=True): + return SearchGovSpidersPipeline() + +@pytest.fixture +def pipeline_with_api(): + """Fixture for 
SearchGovSpidersPipeline with SPIDER_URLS_API set.""" + with patch.dict(os.environ, {"SPIDER_URLS_API": "http://mockapi.com"}): + return SearchGovSpidersPipeline() + +@pytest.fixture +def deduplicator_pipeline(): + """Fixture for DeDeuplicatorPipeline with clean state.""" + return DeDeuplicatorPipeline() + +# --------------------------- +# Tests for SearchGovSpidersPipeline +# --------------------------- + +def test_missing_url_in_item(pipeline_no_api, sample_spider, invalid_item): + """ + Verify DropItem exception is raised when an item has no URL. + """ + with pytest.raises(DropItem, match="Missing URL in item"): + pipeline_no_api.process_item(invalid_item, sample_spider) + +# --------------------------- +# Tests for DeDeuplicatorPipeline +# --------------------------- + +@pytest.mark.parametrize( + "item", + [ + {"url": "http://example.com/1"}, + {"url": "http://example.com/2"}, + ], +) +def test_deduplicator_pipeline_unique_items(deduplicator_pipeline, item): + """ + Verify that unique items are processed successfully. + """ + result = deduplicator_pipeline.process_item(item, None) + assert result == item + + +def test_deduplicator_pipeline_duplicate_item(deduplicator_pipeline, sample_item): + """ + Verify that duplicate items raise DropItem. + """ + deduplicator_pipeline.process_item(sample_item, None) # First time should pass + + with pytest.raises(DropItem, match="Item already seen!"): + deduplicator_pipeline.process_item(sample_item, None) # Duplicate raises DropItem + + +def test_deduplicator_pipeline_multiple_items(deduplicator_pipeline): + """ + Verify that multiple unique items are processed without errors. + """ + item1 = {"url": "http://example.com/1"} + item2 = {"url": "http://example.com/2"} + + result1 = deduplicator_pipeline.process_item(item1, None) + result2 = deduplicator_pipeline.process_item(item2, None) + + assert result1 == item1 + assert result2 == item2 + + +def test_deduplicator_pipeline_clean_state(): + """ + Verify that a new instance of DeDeuplicatorPipeline starts with a clean state. + """ + pipeline1 = DeDeuplicatorPipeline() + pipeline2 = DeDeuplicatorPipeline() + + item = {"url": "http://example.com/1"} + + # First pipeline processes the item + result = pipeline1.process_item(item, None) + assert result == item + + # Second pipeline should also process the same item as it has a clean state + result = pipeline2.process_item(item, None) + assert result == item From 0db90b22e48ce01ad70d8157fe9a4edc2a90f675 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 18 Dec 2024 11:21:48 -0500 Subject: [PATCH 39/40] code feedback and unit tests --- README.md | 2 +- __init__.py | 0 search_gov_crawler/__init__.py | 0 search_gov_crawler/requirements.txt | 1 - tests/search_gov_spiders/test_full_crawl.py | 12 ++---------- 5 files changed, 3 insertions(+), 12 deletions(-) create mode 100644 __init__.py create mode 100644 search_gov_crawler/__init__.py diff --git a/README.md b/README.md index bd7c2e0..94b760c 100755 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Make sure to run `pip install -r requirements.txt` and `playwright install` befo 1. Navigate to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory 2. 
Enter one of two following commands: - * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in the “FEEDS” variable of the [*settings.py*](search_gov_crawler/search_gov_spiders/settings.py) file: + * This command will output the yielded URLs in the destination (relative to the [*spiders*](search_gov_crawler/search_gov_spiders/spiders) directory) and file format specified in the `search_gov_crawler/search_gov_spiders/pipelines.py`: $ scrapy runspider diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search_gov_crawler/__init__.py b/search_gov_crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt index f0f56e0..19790be 100755 --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -10,5 +10,4 @@ scrapy-playwright==0.0.42 scrapyd==1.5.0 scrapyd-client==2.0.0 scrapydweb @ git+https://github.com/GSA/searchgov-scrapydweb -spidermon [monitoring] == 1.22.0 spidermon[monitoring]==1.22.0 diff --git a/tests/search_gov_spiders/test_full_crawl.py b/tests/search_gov_spiders/test_full_crawl.py index 1bfd892..58e24ab 100755 --- a/tests/search_gov_spiders/test_full_crawl.py +++ b/tests/search_gov_spiders/test_full_crawl.py @@ -94,18 +94,10 @@ def test_full_crawl(mock_scrapy_settings, monkeypatch, spider, use_dedup, crawl_ temp_dir.joinpath("output").mkdir(exist_ok=True) def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): - # pipeline_cls.current_file_size = 0 - # pipeline_cls.file_number = 1 - # pipeline_cls.parent_file_path = temp_dir - # pipeline_cls.base_path_name = str(pipeline_cls.parent_file_path / "output/all-links.csv") - # pipeline_cls.short_file = open(pipeline_cls.base_path_name, "w", encoding="utf-8") - # pipeline_cls.max_file_size = 3900 - # pipeline_cls.paginate = True - pipeline_cls.api_url = None pipeline_cls.file_number = 1 pipeline_cls.parent_file_path = temp_dir - pipeline_cls.base_file_name = temp_dir / "output" / "all-links.csv" + pipeline_cls.base_file_name = temp_dir / "output" / "all-links-p1234.csv" pipeline_cls.file_path = pipeline_cls.base_file_name pipeline_cls.current_file = open(pipeline_cls.file_path, "w", encoding="utf-8") @@ -122,7 +114,7 @@ def mock_init(pipeline_cls, *_args, temp_dir=temp_dir, **_kwargs): with open(output_file.name, encoding="UTF") as f: links = json.load(f) - split_files = list(temp_dir.glob("all-links*.csv")) + split_files = list(temp_dir.glob("all-links-p*.csv")) # verify total links match expected assert len(links) == expected_results From 0b05f3f74d0f7928c639663fd5f4cd33ed2c8270 Mon Sep 17 00:00:00 2001 From: Igor Zaytsev Date: Wed, 18 Dec 2024 11:55:24 -0500 Subject: [PATCH 40/40] optimized unit tests --- .../test_deduplicator_pipeline.py | 31 +++++++++++++++- tests/search_gov_spiders/test_pipelines.py | 36 ------------------- 2 files changed, 30 insertions(+), 37 deletions(-) delete mode 100755 tests/search_gov_spiders/test_pipelines.py diff --git a/tests/search_gov_spiders/test_deduplicator_pipeline.py b/tests/search_gov_spiders/test_deduplicator_pipeline.py index 86d9ee0..55ee3e2 100644 --- a/tests/search_gov_spiders/test_deduplicator_pipeline.py +++ b/tests/search_gov_spiders/test_deduplicator_pipeline.py @@ -1,12 +1,13 @@ import os import pytest +from contextlib import suppress from unittest.mock import MagicMock, patch from 
scrapy.exceptions import DropItem from search_gov_crawler.search_gov_spiders.pipelines import ( SearchGovSpidersPipeline, DeDeuplicatorPipeline, ) - +from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem # --------------------------- # Fixtures # --------------------------- @@ -115,3 +116,31 @@ def test_deduplicator_pipeline_clean_state(): # Second pipeline should also process the same item as it has a clean state result = pipeline2.process_item(item, None) assert result == item + +@pytest.mark.parametrize( + ("items", "urls_seen_length"), + [ + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/2"), + ], + 2, + ), + ( + [ + SearchGovSpidersItem(url="https://www.example.com/1"), + SearchGovSpidersItem(url="https://www.example.com/1"), + ], + 1, + ), + ], +) +def test_deduplicator_pipeline(items, urls_seen_length): + pl = DeDeuplicatorPipeline() + + with suppress(DropItem): + for item in items: + pl.process_item(item, None) + + assert len(pl.urls_seen) == urls_seen_length diff --git a/tests/search_gov_spiders/test_pipelines.py b/tests/search_gov_spiders/test_pipelines.py deleted file mode 100755 index 0b85135..0000000 --- a/tests/search_gov_spiders/test_pipelines.py +++ /dev/null @@ -1,36 +0,0 @@ -from contextlib import suppress - -import pytest -from scrapy.exceptions import DropItem - -from search_gov_crawler.search_gov_spiders.items import SearchGovSpidersItem -from search_gov_crawler.search_gov_spiders.pipelines import DeDeuplicatorPipeline - - -@pytest.mark.parametrize( - ("items", "urls_seen_length"), - [ - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - SearchGovSpidersItem(url="https://www.example.com/2"), - ], - 2, - ), - ( - [ - SearchGovSpidersItem(url="https://www.example.com/1"), - SearchGovSpidersItem(url="https://www.example.com/1"), - ], - 1, - ), - ], -) -def test_deduplicator_pipeline(items, urls_seen_length): - pl = DeDeuplicatorPipeline() - - with suppress(DropItem): - for item in items: - pl.process_item(item, None) - - assert len(pl.urls_seen) == urls_seen_length
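
With the duplicate-URL tests consolidated into test_deduplicator_pipeline.py and test_pipelines.py removed, the suite can be exercised locally before a deploy. A minimal sketch, assuming pytest and pytest-mock are installed in the virtual environment created by app_install.sh (./venv) and that the commands are run from the repository root:

    # Activate the project virtualenv and run only the deduplicator pipeline tests
    source ./venv/bin/activate
    pytest tests/search_gov_spiders/test_deduplicator_pipeline.py -v

    # Or run the full spider test package
    pytest tests/search_gov_spiders -v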