Merge pull request #554 from HHS/main

v3.7.1.111 and v3.7.2.112 to prod
HHS · Dec 6, 2024 · dd24e11 · dd24e11
2 parents ee1aa65 + b8fb9cd
commit dd24e11
Show file tree

Hide file tree

Showing 89 changed files with 7,668 additions and 3,279 deletions.
diff --git a/.circleci/build-and-test/commands.yml b/.circleci/build-and-test/commands.yml
@@ -25,7 +25,7 @@
             fi
             echo "export CURRENT_FLAG=$CURRENT_FLAG" >> $BASH_ENV
       - run:
-          name: Upload code coverage report if target branch
+          name: Upload code coverage report of target branch
           command: codecov -t "$CODECOV_TOKEN" -f <<parameters.coverage-report>> -F "$CURRENT_FLAG"
 
   install-nodejs-machine:

diff --git a/.circleci/deployment/commands.yml b/.circleci/deployment/commands.yml
@@ -226,15 +226,8 @@
         default: CF_APP
     steps:
       - checkout
-      - run:
-          name: Install dependencies
-          command: |
-            apk update
-            apk add jq
-            apk add curl
-            # TODO: Add Signature check
-            curl -L "https://packages.cloudfoundry.org/stable?release=linux64-binary&version=v7&source=github" | tar -zx
-            mv cf7 /usr/local/bin/cf
+      - sudo-check
+      - cf-check
       - login-cloud-dot-gov:
           cf-password: <<parameters.cf-password>>
           cf-username: <<parameters.cf-username>>
@@ -285,16 +278,7 @@
         type: string
     steps:
       - checkout
-      - run:
-          name: Install dependencies
-          command: |
-            sudo apt update
-            sudo apt install jq
-            sudo apt install curl
-            # TODO: Add Signature check
-            curl -L "https://packages.cloudfoundry.org/stable?release=linux64-binary&version=v7&source=github" | tar -zx
-            sudo mv cf7 /usr/local/bin/cf
-            sudo chmod +x /usr/local/bin/cf
+      - cf-check
       - login-cloud-dot-gov:
           cf-password: <<parameters.cf-password>>
           cf-username: <<parameters.cf-username>>

diff --git a/.gitconfig b/.gitconfig
@@ -14,3 +14,4 @@
 	allowed = .git/config:.*
 	allowed = .gitconfig:.*
 	allowed = .*DJANGO_SECRET_KEY=.*
+	allowed = ./tdrs-backend/plg/loki/manifest.yml:*
diff --git a/.github/ISSUE_TEMPLATE/design-deliverable-issue-template.md b/.github/ISSUE_TEMPLATE/design-deliverable-issue-template.md
@@ -24,7 +24,7 @@ assignees: ''
 - [ ] Documentation work for the following has occurred:
    - [ ] Relevant User stories.
    - [ ] Recommended pa11y checks.
-   - [ ] Updating living UX documents, e.g. User Flows or Personas(if relevant).
+   - [ ] Updating living UX documents, e.g. User Flows, Personas, [Service Blueprint](https://www.figma.com/design/irgQPLTrajxCXNiYBTEnMV/TDP-Mockups-For-Feedback?node-id=9080-4762) (if relevant).
 - [ ] Internal Raft Review has occurred to ensure DoD standards and QA
 - [ ] Dev/Design sync has occurred; resulting tickets created
 - [ ] The design is usable and accessible, meaning it adheres to definition of done standards for design work.

diff --git a/.github/ISSUE_TEMPLATE/research-synthesis-issue-template.md b/.github/ISSUE_TEMPLATE/research-synthesis-issue-template.md
@@ -13,7 +13,8 @@ assignees: ''
 
 **AC:**
 
-- [ ] A hack.md with the drafted synthesis has been reviewed.
+- [ ] A Gitbook with the drafted synthesis has been reviewed.
+- [ ] [TDP Service Blueprint](https://www.figma.com/design/irgQPLTrajxCXNiYBTEnMV/TDP-Mockups-For-Feedback?node-id=9080-4762) has been updated, as applicable
 - [ ] PR has been opened containing the final draft of the synthesis.
 - [ ] Internal Raft Review has occurred to ensure DoD standards and QA
 - [ ] The content is usable and accessible, meaning it adheres to definition of done standards for design work.
@@ -35,4 +36,4 @@ assignees: ''
 
 **Supporting Documentation:**
 
-- --Link to hack.md--
+- --Link to the gitbook page--
diff --git a/.gitignore b/.gitignore
@@ -109,4 +109,9 @@ cypress.env.json
 
 # Patches
 *.patch
+
+# Logs
+*.log
+
+# DB seeds
 tdrs-backend/*.pg
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -2,11 +2,6 @@ version: '3'
 
 tasks:
 
-  upload-kibana-objs:
-    desc: Upload dashboards to Kibana server
-    cmds:
-      - 'curl -X POST localhost:5601/api/saved_objects/_import -H "kbn-xsrf: true" --form file=@tdrs-backend/tdpservice/search_indexes/kibana_saved_objs.ndjson'
-
   create-network:
     desc: Create the external network
     cmds:
@@ -251,7 +246,7 @@ tasks:
     desc: Open a shell in the frontend container
     dir: tdrs-frontend
     cmds:
-      - docker-compose -f docker-compose.yml exec tdp-frontend sh
+      - docker-compose -f docker-compose.yml exec tdp-frontend bash
 
   up:
     desc: Start both frontend and backend web servers
@@ -268,4 +263,4 @@ tasks:
   help:
     desc: Show this help message
     cmds:
-      - task --list
+      - task --list
diff --git a/codecov.yml b/codecov.yml
@@ -42,4 +42,7 @@ flags:
     carryforward: true
 
 ignore:
-  - "tdrs-backend/tdpservice/scheduling/db_backup.py"
+  - "tdrs-backend/tdpservice/scheduling/db_backup.py"
+  - "tdrs-backend/tdpservice/search_indexes/admin/mulitselect_filter.py"
+  - "tdrs-backend/tdpservice/email/helpers/account_access_requests.py"
+  - "tdrs-backend/tdpservice/search_indexes/admin/filters.py"
diff --git a/docs/Security-Compliance/diagram.drawio b/docs/Security-Compliance/diagram.drawio
diff --git a/docs/Security-Compliance/diagram.png b/docs/Security-Compliance/diagram.png
diff --git a/docs/Technical-Documentation/Architecture-Decision-Record/022-app-monitoring.md b/docs/Technical-Documentation/Architecture-Decision-Record/022-app-monitoring.md
@@ -0,0 +1,68 @@
+# 22. Monitoring Application Health and Performance
+
+Date: 2024-09-30
+
+## Status
+
+Pending
+
+## Context
+Historic feedback highlighted an ongoing desire for improved alerting and monitoring mechanisms, particularly originating in issue [#831](https://github.com/raft-tech/TANF-app/issues/831) circa 2021. Currently, our cloud platform has limited logging features and user interface issues leading to a "blindness" to errors and stack traces that have occurred, ultimately impairing our ability to maintain system stability; additionally, the existing dashboards only offer live performance data lacking data over time or any archives. Without context for either performance or system logging, determination of anomalous or erroneous system behavior is not possible.
+
+Additionally, we have experienced critical blocking issues related to our updates to both Elasticsearch (ES) and PostgreSQL, which have compounded the need for more proactive alerting and load-testing in lower environments. Without timely notifications, we risk delays in addressing failures that could escalate into more significant problems.
+
+
+## Decision
+We will build out a suite of tools in accordance with industry best practices to monitor our applications. Implementing a comprehensive monitoring and alerting ecosystem will not only help in identifying errors in real-time but also enable us to establish benchmarks based on historical data. This approach will foster a more proactive response strategy, ensuring that potential issues are mitigated before they impact our users or that system owners and system admins are aware of issues that have impacted users.
+
+<p style="text-align:center; margin:0; padding:0;">Cloud Environments Workflow</p>
+
+![Environments](../diagrams/TDP_Environments.png)
+
+### Why Sentry
+Sentry captures unhandled exceptions and incorporates detailed context about them, including error messages, stack traces, affected URLs, and user data. Such information is essential in demystifying the cause of an error.
+
+Additionally, as can be seen in the image below, the following information is available:
+
+- Frequency: shows how often the error occurs
+- Timeline: shows when the error occurred within a given period
+- Can create a ticket and assign automatically
+- Variables at each step of stack trace. This is very important for debugging
+
+<p style="text-align:center; margin:0; padding:0;">Issues with filter enabled</p>
+
+![Issues with filter enabled](../images/sentry/1.%20Issues%20with%20filter%20enabled.png)
+
+<p style="text-align:center; margin:0;padding:0;">Detail exceptions</p>
+
+![Detail exceptions](../images/sentry/3.%20detail%20about%20exception.png)
+
+<p style="text-align:center; margin:0; padding:0;">Full stack trace of the exceptions</p>
+
+![Full stack trace of the exceptions](../images/sentry/4.%20full%20stack%20trace%20of%20the%20exceptions.png)
+
+
+Performance monitoring in Sentry can greatly enhance the backend application by providing real-time insights into how the TANF app is performing. Sentry tracks various metrics such as response time, database queries, and external API calls. These metrics will help identify performance bottlenecks associated with the backend app.
+
+A unique ability of Sentry is that it links performance issues and groups them together. This gives us the ability to visualize areas that consistently have poor performance, allowing us to swarm and resolve the most frequent offenders that have the highest impact. Sentry also detects issues with web transactions, database queries, and function regressions (if the duration of a function has increased).
+
+### Why Prometheus-Loki-Grafana
+
+Grafana shall provide a visualization dashboard for these various tools which will collect and aggregate performance metrics, system logs, and allow deeper analysis for all aspects of our systems: frontend, proxies, backend, databases, and even networking. Additionally, the development team will seek to hone a proactive alerting system for out-of-threshold issues and errors for improved visibility of system issues.
+
+The storing of system logs will allow more expedient troubleshooting and debugging that is currently out of reach with Cloud.gov's existing Kibana interface for logging. The ability to find and correlate log events is critical to technical analysis of faults, performance degradation, and system's overall health.
+
+By having our monitoring ecosystem take in performance metrics, we will garner performance metrics over time as opposed to simply a live snapshot as is currently provided. This will allow spotting of anomalous or out-of-bounds behaviors such as out of memory, high memory, CPU spikes, and disk thrashing.
+
+Finally, having all of this data in one place will allow technical staff to easily cross-reference given time periods with problematic performance, ongoing issues, or error stacktraces leading to a holistic view of all of our applications both in lower tier development sites and in critical production.
+
+## Consequences
+
+* Increased platform costs for running these tools
+* Time and effort maintaining and configuring these new systems
+* "Noisy" notifications from out-of-tune alerting
+* Efforts made towards security compliance as these systems have intimate access to our systems and data
+* Learning curve for technical staff
+
+## Notes
+Given the prohibitive costs of self-hosting Sentry in Cloud.gov, we propose using Sentry's Cloud SaaS offering which will alter the [boundary diagram](../../Security-Compliance/diagram.png). The other tools in use (PLG stack and associated), will be self-hosted and maintained by the technical staff both at Raft and OFA.
diff --git a/docs/Technical-Documentation/diagrams/TDP_Environments.png b/docs/Technical-Documentation/diagrams/TDP_Environments.png
diff --git a/docs/Technical-Documentation/images/sentry/1. Issues with filter enabled.png b/docs/Technical-Documentation/images/sentry/1. Issues with filter enabled.png
diff --git a/docs/Technical-Documentation/images/sentry/2. Unhandled exceptiions are logged.png b/docs/Technical-Documentation/images/sentry/2. Unhandled exceptiions are logged.png
diff --git a/docs/Technical-Documentation/images/sentry/3. detail about exception.png b/docs/Technical-Documentation/images/sentry/3. detail about exception.png
diff --git a/...Technical-Documentation/images/sentry/4. full stack trace of the exceptions.png b/...Technical-Documentation/images/sentry/4. full stack trace of the exceptions.png