Skip to content

Commit

Permalink
DRIVERS-1541 Retry KMS decrypt requests on transient errors (#783)
Browse files Browse the repository at this point in the history
  • Loading branch information
adriandole authored Aug 29, 2024
1 parent 3f5b2f3 commit 2265e79
Show file tree
Hide file tree
Showing 23 changed files with 700 additions and 9 deletions.
13 changes: 10 additions & 3 deletions integrating.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,19 +203,26 @@ The responses from one or more HTTP messages to KMS.
(Note, the driver MAY fan out all HTTP requests at the same time).
2. For each context:

a. Create/reuse a TLS socket connected to the endpoint indicated by
a. Delay the message by the time in microseconds indicated by
`mongocrypt_kms_ctx_usleep` if returned value is greater than 0.

b. Create/reuse a TLS socket connected to the endpoint indicated by
`mongocrypt_kms_ctx_endpoint`. The endpoint string is a host name with
a port number separated by a colon. E.g.
"kms.us-east-1.amazonaws.com:443". A port number will always be
included. Drivers may assume the host name is not an IP address or IP
literal.

b. Write the message from `mongocrypt_kms_ctx_message` to the
c. Write the message from `mongocrypt_kms_ctx_message` to the
> socket.

c. Feed the reply back with `mongocrypt_kms_ctx_feed`. Repeat
d. Feed the reply back with `mongocrypt_kms_ctx_feed`. Repeat
> until `mongocrypt_kms_ctx_bytes_needed` returns 0.

If any step encounters a network error, continue to the next KMS context if
`mongocrypt_kms_ctx_fail` returns true. Otherwise, abort and report an
error.

3. When done feeding all replies, call `mongocrypt_ctx_kms_done`.

**Applies to...**
Expand Down
3 changes: 3 additions & 0 deletions kms-message/src/kms_message/kms_response_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ kms_response_parser_error (kms_response_parser_t *parser);
KMS_MSG_EXPORT (void)
kms_response_parser_destroy (kms_response_parser_t *parser);

KMS_MSG_EXPORT (void)
kms_response_parser_reset (kms_response_parser_t *parser);

#ifdef __cplusplus
} /* extern "C" */
#endif
Expand Down
8 changes: 8 additions & 0 deletions kms-message/src/kms_response_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ _parser_init (kms_response_parser_t *parser)
parser->kmip = NULL;
}

void
kms_response_parser_reset (kms_response_parser_t *parser)
{
KMS_ASSERT(!parser->kmip); // KMIP is not-yet supported.
_parser_destroy(parser);
_parser_init(parser);
}

kms_response_parser_t *
kms_response_parser_new (void)
{
Expand Down
3 changes: 2 additions & 1 deletion src/mongocrypt-ctx-datakey.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ static mongocrypt_kms_ctx_t *_next_kms_ctx(mongocrypt_ctx_t *ctx) {
BSON_ASSERT_PARAM(ctx);

dkctx = (_mongocrypt_ctx_datakey_t *)ctx;
if (dkctx->kms_returned) {
if (!dkctx->kms.should_retry && dkctx->kms_returned) {
return NULL;
}
dkctx->kms.should_retry = false; // Reset retry state.
dkctx->kms_returned = true;
return &dkctx->kms;
}
Expand Down
1 change: 1 addition & 0 deletions src/mongocrypt-ctx-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ typedef struct __mongocrypt_ctx_opts_t {
_mongocrypt_buffer_t key_material;
mongocrypt_encryption_algorithm_t algorithm;
_mongocrypt_kek_t kek;
bool retry_enabled;

struct {
mongocrypt_index_type_t value;
Expand Down
9 changes: 8 additions & 1 deletion src/mongocrypt-ctx.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ mongocrypt_ctx_t *mongocrypt_ctx_new(mongocrypt_t *crypt) {
ctx->crypt = crypt;
ctx->status = mongocrypt_status_new();
ctx->opts.algorithm = MONGOCRYPT_ENCRYPTION_ALGORITHM_NONE;
ctx->opts.retry_enabled = crypt->retry_enabled;
ctx->state = MONGOCRYPT_CTX_DONE;
return ctx;
}
Expand Down Expand Up @@ -513,8 +514,9 @@ mongocrypt_kms_ctx_t *mongocrypt_ctx_next_kms_ctx(mongocrypt_ctx_t *ctx) {
return NULL;
}

mongocrypt_kms_ctx_t *ret;
switch (ctx->state) {
case MONGOCRYPT_CTX_NEED_KMS: return ctx->vtable.next_kms_ctx(ctx);
case MONGOCRYPT_CTX_NEED_KMS: ret = ctx->vtable.next_kms_ctx(ctx); break;
case MONGOCRYPT_CTX_ERROR: return NULL;
case MONGOCRYPT_CTX_DONE:
case MONGOCRYPT_CTX_NEED_KMS_CREDENTIALS:
Expand All @@ -525,6 +527,11 @@ mongocrypt_kms_ctx_t *mongocrypt_ctx_next_kms_ctx(mongocrypt_ctx_t *ctx) {
case MONGOCRYPT_CTX_READY:
default: _mongocrypt_ctx_fail_w_msg(ctx, "wrong state"); return NULL;
}

if (ret) {
ret->retry_enabled = ctx->opts.retry_enabled;
}
return ret;
}

bool mongocrypt_ctx_provide_kms_providers(mongocrypt_ctx_t *ctx, mongocrypt_binary_t *kms_providers_definition) {
Expand Down
7 changes: 7 additions & 0 deletions src/mongocrypt-key-broker.c
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,13 @@ mongocrypt_kms_ctx_t *_mongocrypt_key_broker_next_kms(_mongocrypt_key_broker_t *
return NULL;
}

// Check if any requests need retry
for (key_returned_t *ptr = kb->keys_returned; ptr != NULL; ptr = ptr->next) {
if (ptr->kms.should_retry) {
ptr->kms.should_retry = false;
return &ptr->kms;
}
}
while (kb->decryptor_iter) {
if (!kb->decryptor_iter->decrypted) {
key_returned_t *key_returned;
Expand Down
6 changes: 6 additions & 0 deletions src/mongocrypt-kms-ctx-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,14 @@ struct _mongocrypt_kms_ctx_t {
char *endpoint;
_mongocrypt_log_t *log;
char *kmsid;
int64_t sleep_usec;
int attempts;
bool retry_enabled;
bool should_retry;
};

static const int kms_max_attempts = 3;

bool _mongocrypt_kms_ctx_init_aws_decrypt(mongocrypt_kms_ctx_t *kms,
_mongocrypt_opts_kms_providers_t *kms_providers,
_mongocrypt_key_doc_t *key,
Expand Down
156 changes: 156 additions & 0 deletions src/mongocrypt-kms-ctx.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
#include "mongocrypt-crypto-private.h"
#include "mongocrypt-ctx-private.h"
#include "mongocrypt-endpoint-private.h"
#include "mongocrypt-kek-private.h"
#include "mongocrypt-kms-ctx-private.h"
#include "mongocrypt-log-private.h"
#include "mongocrypt-opts-private.h"
#include "mongocrypt-private.h"
#include "mongocrypt-status-private.h"
#include "mongocrypt-util-private.h"
#include "mongocrypt.h"
#include <bson/bson.h>
#include <kms_message/kms_azure_request.h>
#include <kms_message/kms_b64.h>
#include <kms_message/kms_gcp_request.h>
Expand Down Expand Up @@ -142,6 +144,9 @@ _init_common(mongocrypt_kms_ctx_t *kms, _mongocrypt_log_t *log, _kms_request_typ
kms->status = mongocrypt_status_new();
kms->req_type = kms_type;
_mongocrypt_buffer_init(&kms->result);
kms->sleep_usec = 0;
kms->attempts = 0;
kms->should_retry = false;
}

bool _mongocrypt_kms_ctx_init_aws_decrypt(mongocrypt_kms_ctx_t *kms,
Expand Down Expand Up @@ -427,11 +432,21 @@ uint32_t mongocrypt_kms_ctx_bytes_needed(mongocrypt_kms_ctx_t *kms) {
if (!mongocrypt_status_ok(kms->status) || !_mongocrypt_buffer_empty(&kms->result)) {
return 0;
}
if (kms->should_retry) {
return 0;
}
want_bytes = kms_response_parser_wants_bytes(kms->parser, DEFAULT_MAX_KMS_BYTE_REQUEST);
BSON_ASSERT(want_bytes >= 0);
return (uint32_t)want_bytes;
}

int64_t mongocrypt_kms_ctx_usleep(mongocrypt_kms_ctx_t *kms) {
if (!kms) {
return 0;
}
return kms->sleep_usec;
}

static void
_handle_non200_http_status(int http_status, const char *body, size_t body_len, mongocrypt_status_t *status) {
BSON_ASSERT_PARAM(body);
Expand All @@ -455,6 +470,55 @@ _handle_non200_http_status(int http_status, const char *body, size_t body_len, m
CLIENT_ERR("Error in KMS response. HTTP status=%d. Response body=\n%s", http_status, body);
}

static int64_t backoff_time_usec(int64_t attempts) {
static bool seeded = false;
if (!seeded) {
srand((uint32_t)time(NULL));
seeded = true;
}

/* Exponential backoff with jitter. */
const int64_t base = 200000; /* 0.2 seconds */
const int64_t max = 20000000; /* 20 seconds */
BSON_ASSERT(attempts > 0);
int64_t backoff = base * ((int64_t)1 << (attempts - 1));
if (backoff > max) {
backoff = max;
}

/* Full jitter: between 1 and current max */
return (int64_t)((double)rand() / (double)RAND_MAX * (double)backoff) + 1;
}

static bool should_retry_http(int http_status, _kms_request_type_t t) {
static const int retryable_aws[] = {408, 429, 500, 502, 503, 509};
static const int retryable_azure[] = {408, 429, 500, 502, 503, 504};
if (t == MONGOCRYPT_KMS_AWS_ENCRYPT || t == MONGOCRYPT_KMS_AWS_DECRYPT) {
for (size_t i = 0; i < sizeof(retryable_aws) / sizeof(retryable_aws[0]); i++) {
if (http_status == retryable_aws[i]) {
return true;
}
}
} else if (t == MONGOCRYPT_KMS_AZURE_WRAPKEY || t == MONGOCRYPT_KMS_AZURE_UNWRAPKEY) {
for (size_t i = 0; i < sizeof(retryable_azure) / sizeof(retryable_azure[0]); i++) {
if (http_status == retryable_azure[i]) {
return true;
}
}
} else if (t == MONGOCRYPT_KMS_GCP_ENCRYPT || t == MONGOCRYPT_KMS_GCP_DECRYPT) {
if (http_status == 408 || http_status == 429 || http_status / 500 == 1) {
return true;
}
}
return false;
}

static void set_retry(mongocrypt_kms_ctx_t *kms) {
kms->should_retry = true;
kms->attempts++;
kms->sleep_usec = backoff_time_usec(kms->attempts);
}

/* An AWS KMS context has received full response. Parse out the result or error.
*/
static bool _ctx_done_aws(mongocrypt_kms_ctx_t *kms, const char *json_field) {
Expand Down Expand Up @@ -485,6 +549,21 @@ static bool _ctx_done_aws(mongocrypt_kms_ctx_t *kms, const char *json_field) {
}
body = kms_response_get_body(response, &body_len);

if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
if (kms->attempts >= kms_max_attempts) {
// Wrap error to indicate maximum retries occurred.
_handle_non200_http_status(http_status, body, body_len, status);
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
kms_max_attempts,
mongocrypt_status_message(status, NULL));
goto fail;
} else {
ret = true;
set_retry(kms);
goto fail;
}
}

if (http_status != 200) {
_handle_non200_http_status(http_status, body, body_len, status);
goto fail;
Expand Down Expand Up @@ -643,6 +722,21 @@ static bool _ctx_done_azure_wrapkey_unwrapkey(mongocrypt_kms_ctx_t *kms) {
}
body = kms_response_get_body(response, &body_len);

if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
if (kms->attempts >= kms_max_attempts) {
// Wrap error to indicate maximum retries occurred.
_handle_non200_http_status(http_status, body, body_len, status);
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
kms_max_attempts,
mongocrypt_status_message(status, NULL));
goto fail;
} else {
ret = true;
set_retry(kms);
goto fail;
}
}

if (body_len == 0) {
CLIENT_ERR("Empty KMS response. HTTP status=%d", http_status);
goto fail;
Expand Down Expand Up @@ -737,6 +831,21 @@ static bool _ctx_done_gcp(mongocrypt_kms_ctx_t *kms, const char *json_field) {
}
body = kms_response_get_body(response, &body_len);

if (kms->retry_enabled && should_retry_http(http_status, kms->req_type)) {
if (kms->attempts >= kms_max_attempts) {
// Wrap error to indicate maximum retries occurred.
_handle_non200_http_status(http_status, body, body_len, status);
CLIENT_ERR("KMS request failed after maximum of %d retries: %s",
kms_max_attempts,
mongocrypt_status_message(status, NULL));
goto fail;
} else {
ret = true;
set_retry(kms);
goto fail;
}
}

if (http_status != 200) {
_handle_non200_http_status(http_status, body, body_len, status);
goto fail;
Expand Down Expand Up @@ -995,6 +1104,53 @@ static bool _ctx_done_kmip_decrypt(mongocrypt_kms_ctx_t *kms_ctx) {
return ret;
}

bool mongocrypt_kms_ctx_fail(mongocrypt_kms_ctx_t *kms) {
if (!kms || !kms->retry_enabled) {
return false;
}

kms->should_retry = false;
mongocrypt_status_t *status = kms->status;

if (!kms->retry_enabled) {
CLIENT_ERR("KMS request failed due to network error");
return false;
}

if (kms->attempts >= kms_max_attempts) {
CLIENT_ERR("KMS request failed after %d retries due to a network error", kms_max_attempts);
return false;
}

// Check if request type is retryable. Some requests are non-idempotent and cannot be safely retried.
_kms_request_type_t retryable_types[] = {MONGOCRYPT_KMS_AWS_ENCRYPT,
MONGOCRYPT_KMS_AWS_DECRYPT,
MONGOCRYPT_KMS_AZURE_WRAPKEY,
MONGOCRYPT_KMS_AZURE_UNWRAPKEY,
MONGOCRYPT_KMS_GCP_ENCRYPT,
MONGOCRYPT_KMS_GCP_DECRYPT};
bool is_retryable = false;
for (size_t i = 0; i < sizeof(retryable_types) / sizeof(retryable_types[0]); i++) {
if (retryable_types[i] == kms->req_type) {
is_retryable = true;
break;
}
}
if (!is_retryable) {
CLIENT_ERR("KMS request failed due to network error");
return false;
}

// Mark KMS context as retryable. Return again in `mongocrypt_ctx_next_kms_ctx`.
set_retry(kms);

// Reset intermediate state of parser.
if (kms->parser) {
kms_response_parser_reset(kms->parser);
}
return true;
}

bool mongocrypt_kms_ctx_feed(mongocrypt_kms_ctx_t *kms, mongocrypt_binary_t *bytes) {
if (!kms) {
return false;
Expand Down
1 change: 1 addition & 0 deletions src/mongocrypt-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct _mongocrypt_t {
_mongo_crypt_v1_vtable csfle;
/// Pointer to the global csfle_lib object. Should not be freed directly.
mongo_crypt_v1_lib *csfle_lib;
bool retry_enabled;
};

typedef enum {
Expand Down
6 changes: 6 additions & 0 deletions src/mongocrypt.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ bool mongocrypt_setopt_log_handler(mongocrypt_t *crypt, mongocrypt_log_fn_t log_
return true;
}

bool mongocrypt_setopt_retry_kms(mongocrypt_t *crypt, bool enable) {
ASSERT_MONGOCRYPT_PARAM_UNINIT(crypt);
crypt->retry_enabled = enable;
return true;
}

bool mongocrypt_setopt_kms_provider_aws(mongocrypt_t *crypt,
const char *aws_access_key_id,
int32_t aws_access_key_id_len,
Expand Down
Loading

0 comments on commit 2265e79

Please sign in to comment.