Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add retry mechanism for HTTP requests and update dependencies #59

Merged
merged 2 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/docker-manual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,20 @@ jobs:
labels: |
org.opencontainers.image.documentation=https://github.com/${{ github.repository }}/blob/${{ env.BRANCH }}/README.md
org.opencontainers.image.version=${{ env.BRANCH }}
annotations: |
org.opencontainers.image.description=Docker compose environment (based on pycsw) for development and testing with CKAN Open Data portals.
org.opencontainers.image.source=https://github.com/${{ github.repository }}

- name: Replace slashes in BRANCH to avoid errors
run: echo "BRANCH=${BRANCH////_}" >> $GITHUB_ENV

- name: Build and push
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
push: true
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
context: ${{ env.CONTEXT }}
file: ${{ env.CONTEXT }}${{ env.DOCKERFILE_PATH }}/${{ env.DOCKERFILE }}

Expand All @@ -60,7 +64,7 @@ jobs:
no-fail: true

- name: Run Trivy container image vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
uses: aquasecurity/trivy-action@0.28.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
format: sarif
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
run: echo "BRANCH=${BRANCH////_}" >> $GITHUB_ENV

- name: Build and push
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
push: true
tags: ghcr.io/${{ github.repository }}:${{ github.head_ref }}
Expand All @@ -70,7 +70,7 @@ jobs:
no-fail: true

- name: Run Trivy container image vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
uses: aquasecurity/trivy-action@0.28.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
format: sarif
Expand Down
48 changes: 35 additions & 13 deletions ckan2pycsw/ckan2pycsw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from datetime import datetime, time
import subprocess
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# third-party libraries
import psutil
Expand Down Expand Up @@ -58,6 +60,18 @@
SSL_UNVERIFIED_MODE = os.environ.get("SSL_UNVERIFIED_MODE", False)


# Module-level HTTP session; the adapter mounted below attaches the retry
# policy to every request sent through this session.
session = requests.Session()
# Retry policy: up to 5 retries in total with backoff_factor=1 between
# attempts; responses with status 502, 503 or 504 are retried, and only for
# the HEAD, GET and OPTIONS methods listed in allowed_methods.
retries = Retry(
total=5,
backoff_factor=1,
status_forcelist=[502, 503, 504],
allowed_methods=["HEAD", "GET", "OPTIONS"]
)
# A single adapter carrying the retry policy, mounted for both URL schemes.
adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", adapter)
session.mount("http://", adapter)


def get_datasets(base_url):
"""
Retrieve a generator of CKAN datasets from the specified CKAN instance.
Expand All @@ -78,30 +92,38 @@ def get_datasets(base_url):
if not base_url.endswith("/"):
base_url += "/"

if SSL_UNVERIFIED_MODE == True or SSL_UNVERIFIED_MODE == "True":
logging.warning(f"[INSECURE] SSL_UNVERIFIED_MODE:'{SSL_UNVERIFIED_MODE}'. Only if you trust the CKAN_URL: {base_url}.")
if SSL_UNVERIFIED_MODE in [True, "True"]:
logging.warning(f"[INSECURE] SSL_UNVERIFIED_MODE:'{SSL_UNVERIFIED_MODE}'. Solo si confías en CKAN_URL: {base_url}.")

package_search = urljoin(base_url, "api/3/action/package_search")
res = requests.get(package_search, params={"rows": 0}, verify=not SSL_UNVERIFIED_MODE)
res.raise_for_status() # Raises a HTTPError if the response is not 200

# Usar la sesión configurada con reintentos y timeout
res = session.get(package_search, params={"rows": 0}, verify=not SSL_UNVERIFIED_MODE, timeout=10)
res.raise_for_status()
end = res.json().get("result", {}).get("count", 0)
rows = 10
rows = 100 # Number of files
for start in range(0, end, rows):
res = requests.get(package_search, params={"start": start, "rows": rows}, verify=not SSL_UNVERIFIED_MODE)
res.raise_for_status() # Check response status
logging.info(f"Fetching datasets with start={start} and rows={rows}") # Log de progreso
try:
res = session.get(package_search, params={"start": start, "rows": rows}, verify=not SSL_UNVERIFIED_MODE, timeout=30)
res.raise_for_status()
datasets = res.json()["result"]["results"]
except ValueError as e: # Catch JSON decode error
logging.error(f"Error decoding JSON from response: {e}")
continue # Skip to the next iteration
except ValueError as e:
logging.error(f"Error al decodificar JSON: {e}")
continue
except requests.exceptions.RequestException as e:
logging.error(f"Request error: {e}", exc_info=True)
continue

for dataset in datasets:
if dataset.get("type") == "dataset":
yield dataset
except requests.exceptions.RequestException as e:
logging.error(f"Request error while communicating with CKAN instance {base_url}: {e}")
except requests.exceptions.Timeout:
logging.error(f"Timeout error for request starting at {start}", exc_info=True)
except requests.exceptions.ConnectionError:
logging.error(f"Connection error for request starting at {start}", exc_info=True)
except Exception as e:
logging.error(f"Unexpected error: {e}")
logging.error(f"Unexpected error at start={start}: {e}", exc_info=True)

def main():
"""
Expand Down
Loading
Loading