From 62a03c5b83015aee5cf6d72051a97f3c4ddd8813 Mon Sep 17 00:00:00 2001 From: Michael Graeb Date: Tue, 3 Jan 2023 15:47:46 -0800 Subject: [PATCH] Websocket: Validate UTF-8 in text payloads (#418) Validate text payloads, as required by [RFC-6455 Section 5.6](https://www.rfc-editor.org/rfc/rfc6455#section-5.6) --- include/aws/http/private/websocket_decoder.h | 13 +- source/websocket.c | 4 +- source/websocket_decoder.c | 52 ++++- tests/CMakeLists.txt | 4 + tests/test_websocket_decoder.c | 216 ++++++++++++++++++- tests/test_websocket_handler.c | 4 +- 6 files changed, 285 insertions(+), 8 deletions(-) diff --git a/include/aws/http/private/websocket_decoder.h b/include/aws/http/private/websocket_decoder.h index b48cb594b..5b77b9104 100644 --- a/include/aws/http/private/websocket_decoder.h +++ b/include/aws/http/private/websocket_decoder.h @@ -26,6 +26,7 @@ enum aws_websocket_decoder_state { AWS_WEBSOCKET_DECODER_STATE_MASKING_KEY, AWS_WEBSOCKET_DECODER_STATE_PAYLOAD_CHECK, AWS_WEBSOCKET_DECODER_STATE_PAYLOAD, + AWS_WEBSOCKET_DECODER_STATE_FRAME_END, AWS_WEBSOCKET_DECODER_STATE_DONE, }; @@ -38,6 +39,11 @@ struct aws_websocket_decoder { bool expecting_continuation_data_frame; /* True when the next data frame must be CONTINUATION frame */ + /* True while processing a TEXT "message" (from the start of a TEXT frame, + * until the end of the TEXT or CONTINUATION frame with the FIN bit set). */ + bool processing_text_message; + struct aws_utf8_validator *text_message_validator; + void *user_data; aws_websocket_decoder_frame_fn *on_frame; aws_websocket_decoder_payload_fn *on_payload; @@ -48,10 +54,14 @@ AWS_EXTERN_C_BEGIN AWS_HTTP_API void aws_websocket_decoder_init( struct aws_websocket_decoder *decoder, + struct aws_allocator *alloc, aws_websocket_decoder_frame_fn *on_frame, aws_websocket_decoder_payload_fn *on_payload, void *user_data); +AWS_HTTP_API +void aws_websocket_decoder_clean_up(struct aws_websocket_decoder *decoder); + /** * Returns when all data is processed, or a frame and its payload have completed. * `data` will be advanced to reflect the amount of data processed by this call. @@ -59,8 +69,7 @@ void aws_websocket_decoder_init( * The `on_frame` and `on_payload` callbacks may each be invoked once as a result of this call. * If an error occurs, the decoder is invalid forevermore. */ -AWS_HTTP_API -int aws_websocket_decoder_process( +AWS_HTTP_API int aws_websocket_decoder_process( struct aws_websocket_decoder *decoder, struct aws_byte_cursor *data, bool *frame_complete); diff --git a/source/websocket.c b/source/websocket.c index 8024d3988..6ae0e64e0 100644 --- a/source/websocket.c +++ b/source/websocket.c @@ -304,7 +304,8 @@ struct aws_websocket *aws_websocket_handler_new(const struct aws_websocket_handl aws_websocket_encoder_init(&websocket->thread_data.encoder, s_encoder_stream_outgoing_payload, websocket); - aws_websocket_decoder_init(&websocket->thread_data.decoder, s_decoder_on_frame, s_decoder_on_payload, websocket); + aws_websocket_decoder_init( + &websocket->thread_data.decoder, options->allocator, s_decoder_on_frame, s_decoder_on_payload, websocket); aws_linked_list_init(&websocket->synced_data.outgoing_frame_list); @@ -346,6 +347,7 @@ static void s_handler_destroy(struct aws_channel_handler *handler) { AWS_LOGF_TRACE(AWS_LS_HTTP_WEBSOCKET, "id=%p: Destroying websocket.", (void *)websocket); + aws_websocket_decoder_clean_up(&websocket->thread_data.decoder); aws_byte_buf_clean_up(&websocket->thread_data.incoming_ping_payload); aws_mutex_clean_up(&websocket->synced_data.lock); aws_mem_release(websocket->alloc, websocket); diff --git a/source/websocket_decoder.c b/source/websocket_decoder.c index 275b77f4b..28503b347 100644 --- a/source/websocket_decoder.c +++ b/source/websocket_decoder.c @@ -5,6 +5,8 @@ #include +#include + #include typedef int(state_fn)(struct aws_websocket_decoder *decoder, struct aws_byte_cursor *data); @@ -86,6 +88,10 @@ static int s_state_opcode_byte(struct aws_websocket_decoder *decoder, struct aws } } + if (decoder->current_frame.opcode == AWS_WEBSOCKET_OPCODE_TEXT) { + decoder->processing_text_message = true; + } + decoder->state = AWS_WEBSOCKET_DECODER_STATE_LENGTH_BYTE; return AWS_OP_SUCCESS; } @@ -234,7 +240,7 @@ static int s_state_payload_check(struct aws_websocket_decoder *decoder, struct a decoder->state_bytes_processed = 0; decoder->state = AWS_WEBSOCKET_DECODER_STATE_PAYLOAD; } else { - decoder->state = AWS_WEBSOCKET_DECODER_STATE_DONE; + decoder->state = AWS_WEBSOCKET_DECODER_STATE_FRAME_END; } return AWS_OP_SUCCESS; @@ -266,9 +272,16 @@ static int s_state_payload(struct aws_websocket_decoder *decoder, struct aws_byt } } - /* TODO: validate utf-8 */ /* TODO: validate payload of CLOSE frame */ + /* Validate the UTF-8 for TEXT messages (a TEXT frame and any subsequent CONTINUATION frames) */ + if (decoder->processing_text_message && aws_websocket_is_data_frame(decoder->current_frame.opcode)) { + if (aws_utf8_validator_update(decoder->text_message_validator, payload)) { + AWS_LOGF_ERROR(AWS_LS_HTTP_WEBSOCKET, "id=%p: Received invalid UTF-8", (void *)decoder->user_data); + return aws_raise_error(AWS_ERROR_HTTP_WEBSOCKET_PROTOCOL_ERROR); + } + } + /* Invoke on_payload() callback to inform user of payload data */ int err = decoder->on_payload(payload, decoder->user_data); if (err) { @@ -280,9 +293,34 @@ static int s_state_payload(struct aws_websocket_decoder *decoder, struct aws_byt /* If all data consumed, proceed to next state. */ if (decoder->state_bytes_processed == decoder->current_frame.payload_length) { - decoder->state++; + decoder->state = AWS_WEBSOCKET_DECODER_STATE_FRAME_END; + } + + return AWS_OP_SUCCESS; +} + +/* FRAME_END: Perform checks once we reach the end of the frame. */ +static int s_state_frame_end(struct aws_websocket_decoder *decoder, struct aws_byte_cursor *data) { + (void)data; + + /* If we're done processing a text message (a TEXT frame and any subsequent CONTINUATION frames), + * complete the UTF-8 validation */ + if (decoder->processing_text_message && aws_websocket_is_data_frame(decoder->current_frame.opcode) && + decoder->current_frame.fin) { + + if (aws_utf8_validator_finalize(decoder->text_message_validator)) { + AWS_LOGF_ERROR( + AWS_LS_HTTP_WEBSOCKET, + "id=%p: Received invalid UTF-8 (incomplete encoding)", + (void *)decoder->user_data); + return aws_raise_error(AWS_ERROR_HTTP_WEBSOCKET_PROTOCOL_ERROR); + } + + decoder->processing_text_message = false; } + /* Done! */ + decoder->state = AWS_WEBSOCKET_DECODER_STATE_DONE; return AWS_OP_SUCCESS; } @@ -295,6 +333,7 @@ static state_fn *s_state_functions[AWS_WEBSOCKET_DECODER_STATE_DONE] = { s_state_masking_key, s_state_payload_check, s_state_payload, + s_state_frame_end, }; int aws_websocket_decoder_process( @@ -330,6 +369,7 @@ int aws_websocket_decoder_process( void aws_websocket_decoder_init( struct aws_websocket_decoder *decoder, + struct aws_allocator *alloc, aws_websocket_decoder_frame_fn *on_frame, aws_websocket_decoder_payload_fn *on_payload, void *user_data) { @@ -338,4 +378,10 @@ void aws_websocket_decoder_init( decoder->user_data = user_data; decoder->on_frame = on_frame; decoder->on_payload = on_payload; + decoder->text_message_validator = aws_utf8_validator_new(alloc); +} + +void aws_websocket_decoder_clean_up(struct aws_websocket_decoder *decoder) { + aws_utf8_validator_destroy(decoder->text_message_validator); + AWS_ZERO_STRUCT(*decoder); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 90ce5f970..65e4dd6ad 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -167,6 +167,10 @@ add_test_case(websocket_decoder_fail_on_unknown_opcode) add_test_case(websocket_decoder_fragmented_message) add_test_case(websocket_decoder_fail_on_bad_fragmentation) add_test_case(websocket_decoder_control_frame_cannot_be_fragmented) +add_test_case(websocket_decoder_utf8_text) +add_test_case(websocket_decoder_fail_on_bad_utf8_text) +add_test_case(websocket_decoder_fragmented_utf8_text) +add_test_case(websocket_decoder_fail_on_fragmented_bad_utf8_text) add_test_case(websocket_decoder_on_frame_callback_can_fail_decoder) add_test_case(websocket_decoder_on_payload_callback_can_fail_decoder) add_test_case(websocket_encoder_sanity_check) diff --git a/tests/test_websocket_decoder.c b/tests/test_websocket_decoder.c index 71b7c5a3f..8bf43653d 100644 --- a/tests/test_websocket_decoder.c +++ b/tests/test_websocket_decoder.c @@ -60,7 +60,8 @@ static int s_on_payload(struct aws_byte_cursor data, void *user_data) { /* For resetting the decoder and its results mid-test */ static void s_decoder_tester_reset(struct decoder_tester *tester) { - aws_websocket_decoder_init(&tester->decoder, s_on_frame, s_on_payload, tester); + aws_websocket_decoder_clean_up(&tester->decoder); + aws_websocket_decoder_init(&tester->decoder, tester->alloc, s_on_frame, s_on_payload, tester); AWS_ZERO_STRUCT(tester->frame); tester->on_frame_count = 0; tester->payload.len = 0; @@ -89,6 +90,7 @@ static int s_decoder_tester_init(struct decoder_tester *tester, struct aws_alloc static int s_decoder_tester_clean_up(struct decoder_tester *tester) { aws_byte_buf_clean_up(&tester->payload); + aws_websocket_decoder_clean_up(&tester->decoder); aws_http_library_clean_up(); aws_logger_clean_up(&tester->logger); return AWS_OP_SUCCESS; @@ -652,6 +654,218 @@ DECODER_TEST_CASE(websocket_decoder_control_frame_cannot_be_fragmented) { return AWS_OP_SUCCESS; } +/* Test that we can process a TEXT frame with UTF-8 in it */ +DECODER_TEST_CASE(websocket_decoder_utf8_text) { + (void)ctx; + struct decoder_tester tester; + ASSERT_SUCCESS(s_decoder_tester_init(&tester, allocator)); + + uint8_t input[] = { + /* TEXT FRAME */ + 0x81, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x04, /* mask | 7bit payload len */ + /* payload - codepoint U+10348 as 4-byte UTF-8 */ + 0xF0, + 0x90, + 0x8D, + 0x88, + }; + + struct aws_websocket_frame expected_frame = { + .fin = true, + .opcode = AWS_WEBSOCKET_OPCODE_TEXT, + .payload_length = 4, + }; + const char *expected_payload = "\xF0\x90\x8D\x88"; + + bool frame_complete; + struct aws_byte_cursor input_cursor = aws_byte_cursor_from_array(input, sizeof(input)); + ASSERT_SUCCESS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + + /* check result */ + ASSERT_TRUE(frame_complete); + ASSERT_SUCCESS(s_compare_frame(&expected_frame, &tester.frame)); + ASSERT_TRUE(aws_byte_buf_eq_c_str(&tester.payload, expected_payload)); + + ASSERT_SUCCESS(s_decoder_tester_clean_up(&tester)); + return AWS_OP_SUCCESS; +} + +/* Test that a TEXT frame with invalid UTF-8 fails */ +DECODER_TEST_CASE(websocket_decoder_fail_on_bad_utf8_text) { + (void)ctx; + struct decoder_tester tester; + ASSERT_SUCCESS(s_decoder_tester_init(&tester, allocator)); + + { /* Test validation failing when it hits totally bad byte values */ + uint8_t input[] = { + /* TEXT FRAME */ + 0x81, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload - illegal UTF-8 value */ + 0xFF, + }; + + bool frame_complete; + struct aws_byte_cursor input_cursor = aws_byte_cursor_from_array(input, sizeof(input)); + ASSERT_FAILS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_INT_EQUALS(AWS_ERROR_HTTP_WEBSOCKET_PROTOCOL_ERROR, aws_last_error()); + } + + s_decoder_tester_reset(&tester); + + { /* Test validation failing at the end, due to a 4-byte codepoint missing 1 byte */ + uint8_t input[] = { + /* TEXT FRAME */ + 0x81, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x03, /* mask | 7bit payload len */ + /* payload - codepoint U+10348 as 4-byte UTF-8, but missing 4th byte */ + 0xF0, + 0x90, + 0x8D, + /* 0x88, <-- missing 4th byte */ + }; + + bool frame_complete; + struct aws_byte_cursor input_cursor = aws_byte_cursor_from_array(input, sizeof(input)); + ASSERT_FAILS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_INT_EQUALS(AWS_ERROR_HTTP_WEBSOCKET_PROTOCOL_ERROR, aws_last_error()); + } + + ASSERT_SUCCESS(s_decoder_tester_clean_up(&tester)); + return AWS_OP_SUCCESS; +} + +/* Test that UTF-8 can be validated even if it's fragmented across frames */ +DECODER_TEST_CASE(websocket_decoder_fragmented_utf8_text) { + (void)ctx; + struct decoder_tester tester; + ASSERT_SUCCESS(s_decoder_tester_init(&tester, allocator)); + + /* Split a 4-byte UTF-8 codepoint across a fragmented message. + * codepoint U+10348 is UTF-8 bytes: 0xF0, 0x90, 0x8D, 0x88 */ + uint8_t input[] = { + /* TEXT FRAME */ + 0x01, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload */ + 0xF0, /* 1/4 UTF-8 bytes */ + + /* CONTINUATION FRAME */ + 0x00, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x02, /* mask | 7bit payload len */ + /* payload */ + 0x90, /* 2/4 UTF-8 bytes */ + 0x8D, /* 3/4 UTF-8 bytes */ + + /* PING FRAME - Control frames may be injected in the middle of a fragmented message. */ + 0x89, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload - PING payload should not interfere with validation */ + 0xFF, + + /* CONTINUATION FRAME */ + 0x80, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload */ + 0x88, /* 4/4 UTF-8 bytes */ + }; + + struct aws_websocket_frame expected_frames[] = { + { + .fin = false, + .opcode = 1, + .payload_length = 1, + }, + { + .fin = false, + .opcode = 0, + .payload_length = 2, + }, + { + .fin = true, + .opcode = 9, + .payload_length = 1, + }, + { + .fin = true, + .opcode = 0, + .payload_length = 1, + }, + }; + + struct aws_byte_cursor input_cursor = aws_byte_cursor_from_array(input, sizeof(input)); + for (size_t i = 0; i < AWS_ARRAY_SIZE(expected_frames); ++i) { + bool frame_complete; + ASSERT_SUCCESS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_TRUE(frame_complete); + ASSERT_UINT_EQUALS(i + 1, tester.on_frame_count); + ASSERT_SUCCESS(s_compare_frame(&expected_frames[i], &tester.frame)); + } + ASSERT_UINT_EQUALS(0, input_cursor.len); + + ASSERT_SUCCESS(s_decoder_tester_clean_up(&tester)); + return AWS_OP_SUCCESS; +} + +/* Test that UTF-8 validator works even when text is fragmented across multiple frames */ +DECODER_TEST_CASE(websocket_decoder_fail_on_fragmented_bad_utf8_text) { + (void)ctx; + struct decoder_tester tester; + ASSERT_SUCCESS(s_decoder_tester_init(&tester, allocator)); + + /* Split a 4-byte UTF-8 codepoint across a fragmented message, but omit he last byte. + * codepoint U+10348 is UTF-8 bytes: 0xF0, 0x90, 0x8D, 0x88 */ + uint8_t input[] = { + /* TEXT FRAME */ + 0x01, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload */ + 0xF0, /* 1/4 UTF-8 bytes */ + + /* CONTINUATION FRAME */ + 0x00, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload */ + 0x90, /* 2/4 UTF-8 bytes */ + + /* PING FRAME - Control frames may be injected in the middle of a fragmented message. */ + 0x89, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload - PING payload shouldn't interfere with the TEXT's validation */ + 0x8D, + + /* CONTINUATION FRAME */ + 0x80, /* fin | rsv1 | rsv2 | rsv3 | 4bit opcode */ + 0x01, /* mask | 7bit payload len */ + /* payload */ + 0x8D, /* 3/4 UTF-8 bytes */ + /* 0x88, <-- MISSING 4/4 UTF-8 bytes */ + }; + + bool frame_complete; + struct aws_byte_cursor input_cursor = aws_byte_cursor_from_array(input, sizeof(input)); + + /* TEXT should pass */ + ASSERT_SUCCESS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_TRUE(frame_complete); + + /* CONTINUATION should pass */ + ASSERT_SUCCESS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_TRUE(frame_complete); + + /* PING should pass */ + ASSERT_SUCCESS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_TRUE(frame_complete); + + /* final CONTINUATION should fail because the message ended with an incomplete UTF-8 encoding */ + ASSERT_FAILS(aws_websocket_decoder_process(&tester.decoder, &input_cursor, &frame_complete)); + ASSERT_INT_EQUALS(AWS_ERROR_HTTP_WEBSOCKET_PROTOCOL_ERROR, aws_last_error()); + + ASSERT_SUCCESS(s_decoder_tester_clean_up(&tester)); + return AWS_OP_SUCCESS; +} + /* Test that an error from the on_frame callback fails the decoder */ DECODER_TEST_CASE(websocket_decoder_on_frame_callback_can_fail_decoder) { (void)ctx; diff --git a/tests/test_websocket_handler.c b/tests/test_websocket_handler.c index c1643e81e..4c284d331 100644 --- a/tests/test_websocket_handler.c +++ b/tests/test_websocket_handler.c @@ -490,7 +490,8 @@ static int s_tester_init(struct tester *tester, struct aws_allocator *alloc) { ASSERT_NOT_NULL(tester->websocket); testing_channel_drain_queued_tasks(&tester->testing_channel); - aws_websocket_decoder_init(&tester->written_frame_decoder, s_on_written_frame, s_on_written_frame_payload, tester); + aws_websocket_decoder_init( + &tester->written_frame_decoder, alloc, s_on_written_frame, s_on_written_frame_payload, tester); aws_websocket_encoder_init(&tester->readpush_encoder, s_stream_readpush_payload, tester); return AWS_OP_SUCCESS; @@ -512,6 +513,7 @@ static int s_tester_clean_up(struct tester *tester) { aws_byte_buf_clean_up(&tester->all_writepush_data); + aws_websocket_decoder_clean_up(&tester->written_frame_decoder); aws_http_library_clean_up(); aws_logger_clean_up(&tester->logger); return AWS_OP_SUCCESS;