Skip to content

Commit

Permalink
Fix: Service Lambda timeouts cause user-facing 5xx responses (#6284, PR
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Nov 20, 2024
2 parents 6799093 + 4e6a242 commit 93637ff
Show file tree
Hide file tree
Showing 13 changed files with 857 additions and 663 deletions.
162 changes: 24 additions & 138 deletions lambdas/indexer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,21 @@
Optional,
)

# noinspection PyPackageRequirements
import chalice
from chalice import (
Response,
)

from azul import (
CatalogName,
cached_property,
config,
)
from azul.chalice import (
AzulChaliceApp,
LambdaMetric,
)
from azul.deployment import (
aws,
)
from azul.health import (
HealthController,
HealthApp,
)
from azul.hmac import (
HMACAuthentication,
Expand All @@ -46,9 +41,6 @@
from azul.openapi.responses import (
json_content,
)
from azul.openapi.spec import (
CommonEndpointSpecs,
)

log = logging.getLogger(__name__)

Expand All @@ -64,16 +56,12 @@
# changes and reset the minor version to zero. Otherwise, increment only
# the minor version for backwards compatible changes. A backwards
# compatible change is one that does not require updates to clients.
'version': '1.0'
'version': '1.1'
}
}


class IndexerApp(AzulChaliceApp, SignatureHelper):

@cached_property
def health_controller(self):
return self._controller(HealthController, lambda_name='indexer')
class IndexerApp(HealthApp, SignatureHelper):

@cached_property
def index_controller(self) -> IndexController:
Expand All @@ -98,7 +86,9 @@ def log_forwarder(self, prefix: str):
error_decorator = self.metric_alarm(metric=LambdaMetric.errors,
threshold=1, # One alarm …
period=24 * 60 * 60) # … per day.
throttle_decorator = self.metric_alarm(metric=LambdaMetric.throttles)
throttle_decorator = self.metric_alarm(metric=LambdaMetric.throttles,
threshold=0,
period=5 * 60)
retry_decorator = self.retry(num_retries=2)

def decorator(f):
Expand All @@ -115,120 +105,7 @@ def _authenticate(self) -> Optional[HMACAuthentication]:
# Module-level application object. The `@app.…` decorators further down in
# this module are all invoked on this instance.
app = IndexerApp()
# Presumably wires the app's logging up to this module's logger `log`;
# confirm against the definition of `configure_app_logging`.
configure_app_logging(app, log)


@app.route('/',
           cache_control='public, max-age=0, must-revalidate',
           cors=False)
def swagger_ui():
    """
    Handle requests to the root path by returning the result of
    `app.swagger_ui()`.
    """
    page = app.swagger_ui()
    return page


@app.route('/static/{file}',
           cache_control='public, max-age=86400',
           cors=True)
def static_resource(file):
    """
    Handle requests for `/static/{file}`.

    :param file: the value captured from the `{file}` segment of the path

    :return: the result of `app.swagger_resource` for the given name
    """
    resource = app.swagger_resource(file)
    return resource


# Endpoint specs for the routes in this module. The attributes of this object
# (`openapi`, `version`, `full_health`, …) are unpacked as keyword arguments
# into the `@app.route` decorators that follow.
common_specs = CommonEndpointSpecs(app_name='indexer')


@app.route('/openapi',
           methods=['GET'],
           cache_control='public, max-age=500',
           cors=True,
           **common_specs.openapi)
def openapi():
    """
    Handle `GET /openapi` by wrapping the result of `app.spec()` in a 200
    response that declares a JSON content type.
    """
    json_headers = {'content-type': 'application/json'}
    spec = app.spec()
    return Response(status_code=200, headers=json_headers, body=spec)


@app.route('/version',
           methods=['GET'],
           cors=True,
           **common_specs.version)
def version():
    """
    Handle `GET /version`.

    :return: a dictionary with two entries: `git`, holding the value of
             `config.lambda_git_status`, and `changes`, holding the result
             of `compact_changes(limit=10)`
    """
    # The import is kept local to the handler, so `azul.changelog` is only
    # loaded when this endpoint is actually invoked.
    from azul.changelog import (
        compact_changes,
    )
    git_status = config.lambda_git_status
    recent_changes = compact_changes(limit=10)
    return {'git': git_status, 'changes': recent_changes}


@app.route('/health',
           methods=['GET'],
           cors=True,
           **common_specs.full_health)
def health():
    """
    Handle `GET /health` by delegating to the `health` method of the app's
    health controller.
    """
    controller = app.health_controller
    return controller.health()


@app.route('/health/basic',
           methods=['GET'],
           cors=True,
           **common_specs.basic_health)
def basic_health():
    """
    Handle `GET /health/basic` by delegating to the `basic_health` method of
    the app's health controller.
    """
    controller = app.health_controller
    return controller.basic_health()


@app.route('/health/cached',
           methods=['GET'],
           cors=True,
           **common_specs.cached_health)
def cached_health():
    """
    Handle `GET /health/cached` by delegating to the `cached_health` method
    of the app's health controller.
    """
    controller = app.health_controller
    return controller.cached_health()


@app.route('/health/fast',
           methods=['GET'],
           cors=True,
           **common_specs.fast_health)
def fast_health():
    """
    Handle `GET /health/fast` by delegating to the `fast_health` method of
    the app's health controller.
    """
    controller = app.health_controller
    return controller.fast_health()


@app.route('/health/{keys}',
           methods=['GET'],
           cors=True,
           **common_specs.custom_health)
def health_by_key(keys: Optional[str] = None):
    """
    Handle `GET /health/{keys}`.

    :param keys: the value captured from the `{keys}` segment of the path,
                 passed through unchanged to the health controller

    :return: the result of the health controller's `custom_health` method
    """
    controller = app.health_controller
    return controller.custom_health(keys)


@app.metric_alarm(metric=LambdaMetric.errors,
                  threshold=1,
                  period=24 * 60 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles)
@app.retry(num_retries=0)
# FIXME: Remove redundant prefix from name
#        https://github.com/DataBiosphere/azul/issues/5337
@app.schedule('rate(1 minute)',
              name='indexercachehealth')
def update_health_cache(_event: chalice.app.CloudWatchEvent):
    """
    Scheduled handler, registered with a rate of one minute, that asks the
    app's health controller to update its cache.

    :param _event: the triggering event; not used by the handler
    """
    controller = app.health_controller
    controller.update_cache()
# Add every entry returned by `app.default_routes()` to this module's global
# namespace. Presumably these are the handlers for the app's default
# endpoints, keyed by function name; confirm against `default_routes`.
globals().update(app.default_routes())


@app.route(
Expand Down Expand Up @@ -303,9 +180,11 @@ def post_notification(catalog: CatalogName, action: str):


@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.contribution_concurrency(retry=False) * 2 / 3))
threshold=int(config.contribution_concurrency(retry=False) * 2 / 3),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(96000 / config.contribution_concurrency(retry=False)))
threshold=int(96000 / config.contribution_concurrency(retry=False)),
period=5 * 60)
@app.on_sqs_message(
queue=config.notifications_queue_name(),
batch_size=1
Expand All @@ -315,9 +194,11 @@ def contribute(event: chalice.app.SQSEvent):


@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.aggregation_concurrency(retry=False) * 3))
threshold=int(config.aggregation_concurrency(retry=False) * 3),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(37760 / config.aggregation_concurrency(retry=False)))
threshold=int(37760 / config.aggregation_concurrency(retry=False)),
period=5 * 60)
@app.on_sqs_message(
queue=config.tallies_queue_name(),
batch_size=IndexController.document_batch_size
Expand All @@ -330,8 +211,11 @@ def aggregate(event: chalice.app.SQSEvent):
# with more RAM in the tallies_retry queue.

@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.aggregation_concurrency(retry=True) * 1 / 16))
@app.metric_alarm(metric=LambdaMetric.throttles)
threshold=int(config.aggregation_concurrency(retry=True) * 1 / 16),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=0,
period=5 * 60)
@app.on_sqs_message(
queue=config.tallies_queue_name(retry=True),
batch_size=IndexController.document_batch_size
Expand All @@ -344,9 +228,11 @@ def aggregate_retry(event: chalice.app.SQSEvent):
# retried with more RAM and a longer timeout in the notifications_retry queue.

@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.contribution_concurrency(retry=True) * 1 / 4))
threshold=int(config.contribution_concurrency(retry=True) * 1 / 4),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(31760 / config.contribution_concurrency(retry=True)))
threshold=int(31760 / config.contribution_concurrency(retry=True)),
period=5 * 60)
@app.on_sqs_message(
queue=config.notifications_queue_name(retry=True),
batch_size=1
Expand Down
74 changes: 72 additions & 2 deletions lambdas/indexer/openapi.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
{
"openapi": "3.0.1",
"info": {
"title": "azul_indexer",
"title": "azul-indexer-dev",
"description": "\nThis is the internal API for Azul's indexer component.\n",
"version": "1.0"
"version": "1.1"
},
"paths": {
"/": {
"get": {
"summary": "A Swagger UI for interactive use of this REST API",
"tags": [
"Auxiliary"
],
"responses": {
"200": {
"description": "The response body is an HTML page containing the Swagger UI"
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
}
}
},
"/openapi": {
"get": {
"summary": "Return OpenAPI specifications for this REST API",
Expand Down Expand Up @@ -59,13 +75,46 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
"Auxiliary"
]
}
},
"/static/{file}": {
"parameters": [
{
"name": "file",
"in": "path",
"required": true,
"schema": {
"type": "string"
},
"description": "The name of a static file to be returned"
}
],
"get": {
"summary": "Static files needed for the Swagger UI",
"tags": [
"Auxiliary"
],
"responses": {
"200": {
"description": "The response body is the contents of the requested file"
},
"404": {
"description": "The requested file does not exist"
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
}
}
},
"/version": {
"get": {
"summary": "Describe current version of this REST API",
Expand Down Expand Up @@ -137,6 +186,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
}
}
Expand Down Expand Up @@ -233,6 +285,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
Expand Down Expand Up @@ -322,6 +377,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
Expand Down Expand Up @@ -417,6 +475,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
Expand Down Expand Up @@ -512,6 +573,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
Expand Down Expand Up @@ -632,6 +696,9 @@
}
}
}
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
},
"tags": [
Expand Down Expand Up @@ -741,6 +808,9 @@
},
"401": {
"description": "Request lacked a valid HMAC header"
},
"504": {
"description": "\nRequest timed out. When handling this response, clients\nshould wait the number of seconds specified in the\n`Retry-After` header and then retry the request.\n"
}
}
}
Expand Down
Loading

0 comments on commit 93637ff

Please sign in to comment.