From d6050a3c9d26ba177d3233b22bd7300ac78f94a7 Mon Sep 17 00:00:00 2001
From: Haresh Kainth
Date: Tue, 10 Dec 2024 10:34:58 +0000
Subject: [PATCH] chore: Handle failed URL fetches in legislation cache
 building

Log errors for failed URL fetches and record the failing URLs in a list
instead of raising, so a single bad source no longer aborts the cache
build. After the run, a summary warning lists every source that could
not be fetched.
---
 app/cache/legislation.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/app/cache/legislation.py b/app/cache/legislation.py
index de2e19b..152c2ac 100644
--- a/app/cache/legislation.py
+++ b/app/cache/legislation.py
@@ -123,6 +123,8 @@ def build_cache(self, config: SearchDocumentConfig):
         logger.info("building legislation cache...")
         dataset = construction_legislation_dataframe()
 
+        failed_url_fetches = []
+
         # For each row, get the URL from the column named
         # 'URI to Extract XML Data'
         # and store the XML data in a list
@@ -185,10 +187,15 @@
                 # Insert or update the document
                 insert_or_update_document(document_json)
             except Exception as e:
                 logger.error(f"error fetching data from {url}: {e}")
-                raise e
+                failed_url_fetches.append(url)
 
+        if failed_url_fetches:
+            logger.warning(
+                f"failed to fetch data from {len(failed_url_fetches)} "
+                f"legislation sources: {failed_url_fetches}"
+            )
 
     def _to_json(
         self,
         description,
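
Note: if the upstream legislation service rate-limits bulk fetching, a
small retry-with-backoff helper could complement the failure list,
retrying transient errors before a URL is recorded as failed. Below is a
minimal sketch, assuming a requests-based HTTP client (this patch does
not show how the fetch is performed); the helper name fetch_with_retry
and the retry, backoff, and timeout values are illustrative, not part of
the patch.

    import logging
    import time

    import requests

    logger = logging.getLogger(__name__)


    def fetch_with_retry(url: str, retries: int = 3, backoff: float = 0.5) -> str:
        """Fetch a URL, backing off between attempts to soften rate limiting."""
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logger.error(
                    "attempt %d/%d failed for %s: %s", attempt, retries, url, e
                )
                if attempt == retries:
                    # Out of retries: let the caller's except block
                    # record the URL as failed.
                    raise
                # Exponential backoff between attempts: 0.5s, 1.0s, 2.0s, ...
                time.sleep(backoff * 2 ** (attempt - 1))

On the final attempt the exception propagates, so the surrounding
try/except in build_cache would still append the URL to
failed_url_fetches exactly as the patch does for a single failed fetch.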