From 79695e6dd8d9abe86cccfaf84c13311b38552269 Mon Sep 17 00:00:00 2001
From: caetano melone
Date: Fri, 11 Oct 2024 11:07:24 -0300
Subject: [PATCH] Gracefully handle HTTP errors

closes #111

We were setting `raise_for_status` and not handling the exceptions raised by
aiohttp, which clogged up the logs with stack traces and environment variables.

I did not add automatic retries to HTTP requests because cluster issues are
not necessarily going to resolve themselves within the span of five retried
requests. I'll deal with the GitLab 500 errors in a separate PR.
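As a quick illustration of the pattern the rest of this patch adopts, here is
a minimal sketch (the `fetch_data` helper and its `url` argument are
hypothetical, for illustration only, and are not part of this change):

    import logging

    import aiohttp

    logger = logging.getLogger(__name__)


    async def fetch_data(url: str) -> dict | None:
        # hypothetical example: connection and protocol failures surface as
        # aiohttp.ClientError (ContentTypeError is a subclass), so a single
        # handler logs one line instead of dumping a stack trace into the logs
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    return await resp.json()
        except aiohttp.ClientError as e:
            logger.error(f"Request failed: {e}")
            return None

collection.py does the equivalent: everything in `fetch_job` that makes HTTP
requests sits in one try block whose `aiohttp.ClientError` handler logs the
failure and returns.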
---
 gantry/clients/gitlab.py                |  2 +-
 gantry/clients/prometheus/prometheus.py | 10 +++++-----
 gantry/routes/collection.py             | 20 +++++++++++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/gantry/clients/gitlab.py b/gantry/clients/gitlab.py
index 97f9500..1e94e55 100644
--- a/gantry/clients/gitlab.py
+++ b/gantry/clients/gitlab.py
@@ -17,7 +17,7 @@ async def _request(self, url: str, response_type: str) -> dict | str:
 
         returns: the response from Gitlab in the specified format
         """
-        async with aiohttp.ClientSession(raise_for_status=True) as session:
+        async with aiohttp.ClientSession() as session:
             async with session.get(url, headers=self.headers) as resp:
                 if response_type == "json":
                     return await resp.json()
diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py
index 424d1b1..c61a3d8 100644
--- a/gantry/clients/prometheus/prometheus.py
+++ b/gantry/clients/prometheus/prometheus.py
@@ -68,17 +68,17 @@ async def query_range(self, query: str | dict, start: int, end: int) -> list:
 
     async def _query(self, url: str) -> list:
         """Query Prometheus with a query string"""
-        async with aiohttp.ClientSession(raise_for_status=True) as session:
+        async with aiohttp.ClientSession() as session:
             # submit cookie with request
             async with session.get(url, cookies=self.cookies) as resp:
                 try:
                     return await resp.json()
                 except aiohttp.ContentTypeError:
-                    logger.error(
-                        """Prometheus query failed with unexpected response.
-                        The cookie may have expired."""
+                    # this will get caught in collection.py and fetch_job won't continue
+                    raise aiohttp.ClientError(
+                        """Prometheus query failed with unexpected
+                        response, cookie may have expired."""
                     )
-                    return {}
 
     def prettify_res(self, response: dict) -> list:
         """Process Prometheus response into a list of dicts with {label: value}"""
diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py
index 538cd9d..7908672 100644
--- a/gantry/routes/collection.py
+++ b/gantry/routes/collection.py
@@ -1,6 +1,7 @@
 import logging
 import re
 
+import aiohttp
 import aiosqlite
 
 from gantry.clients import db
@@ -56,20 +57,25 @@ async def fetch_job(
     ):
         return
 
-    # check if the job is a ghost
-    job_log = await gitlab.job_log(job.gl_id)
-    is_ghost = "No need to rebuild" in job_log
-    if is_ghost:
-        logger.warning(f"job {job.gl_id} is a ghost, skipping")
-        return
-
     try:
+        # all code that makes HTTP requests should be in this try block
+
+        # check if the job is a ghost
+        job_log = await gitlab.job_log(job.gl_id)
+        is_ghost = "No need to rebuild" in job_log
+        if is_ghost:
+            logger.warning(f"job {job.gl_id} is a ghost, skipping")
+            return
+
         annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint)
         resources, node_hostname = await prometheus.job.get_resources(
             annotations["pod"], job.midpoint
         )
         usage = await prometheus.job.get_usage(annotations["pod"], job.start, job.end)
         node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint)
+    except aiohttp.ClientError as e:
+        logger.error(f"Request failed: {e}")
+        return
     except IncompleteData as e:
         # missing data, skip this job
         logger.error(f"{e} job={job.gl_id}")