Unicode: add unit tests and additional comments for valid_utf8()

AlekseyCherepanov · Sep 15, 2024 · f5d53d1 · f5d53d1
1 parent 1c8fd6b
commit f5d53d1
Show file tree

Hide file tree

Showing 5 changed files with 269 additions and 5 deletions.
diff --git a/src/Makefile.in b/src/Makefile.in
@@ -803,7 +803,10 @@ testfiles:
 ###############################################################################
 
 UNIT_TEST_OBJS = \
-	tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o
+	tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o unicode.o
+
+# C files that tests/unit-tests.c pulls in with #include; they are not
+# compiled into objects of their own.
+UNIT_TEST_INCLUDED_PIECES = \
+	tests/test_valid_utf8.c
 
 tests/unit-tests.o:	tests/unit-tests.c common.h memory.h misc.h
 	$(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG  tests/unit-tests.c
@@ -823,7 +826,7 @@ tests/memory.o:	memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h jo
 # keep the 'easy name' build target of unit-tests   The 'real' target is ../run/unit-tests[.exe]
 unit-tests:	../run/unit-tests@EXE_EXT@
 
-../run/unit-tests@EXE_EXT@:	$(UNIT_TEST_OBJS)
+../run/unit-tests@EXE_EXT@:	$(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES)
 	$(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@
 	@ echo "Now Running the Unit Tests"
 	@ ${POSSIBLE_WINE_MSG}

diff --git a/src/Makefile.legacy b/src/Makefile.legacy
@@ -337,8 +337,8 @@ default:
 	@echo "beos-x86-any             BeOS, x86"
 	@echo "generic                  Any other Unix-like system with gcc"
 
-unit-tests:
-	$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c
+# Build the unit tests in one step from the listed sources plus a
+# prebuilt unicode.o (needed for the valid_utf8() tests), then run them.
+# NOTE(review): -lcrypto is presumably required by code inside unicode.o - confirm.
+unit-tests: unicode.o
+	$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.o -lcrypto
 	../run/unit-tests
 
 linux-x86-64-avx512:

diff --git a/src/tests/test_valid_utf8.c b/src/tests/test_valid_utf8.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2024 Aleksey Cherepanov
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ */
+
+/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences
+ *
+ * It tests only byte sequences that encode a single character, plus
+ * a few cases with an additional byte after a valid 2-byte sequence.
+ * Higher-level logic is not checked. The test is sparse: within
+ * continuous ranges of byte values only some values are checked.
+ * Dense checks are applied close to range borders to catch off-by-one
+ * mistakes. Valid sequences are limited to a single character (1-4
+ * bytes). Invalid sequences go up to 5 bytes and use even bigger
+ * steps for skipping. ASCII bytes at trailing positions are tested
+ * lightly.
+ *
+ * Description in PR#5531 contains a script to test valid_utf8()
+ * against Python3. https://github.com/openwall/john/pull/5531
+ */
+
+/* Related info about UTF-8
+ *
+ * Valid UTF-8 sequences of bytes (1-4 bytes long):
+ *
+ * 00..7F
+ *
+ * C2..DF  80..BF
+ *
+ *   E0    A0..BF  80..BF
+ *   ED    80..9F  80..BF
+ *   Ex    80..BF  80..BF  where Ex does not include E0 and ED
+ *
+ *   F0    90..BF  80..BF  80..BF
+ *   F4    80..8F  80..BF  80..BF  notice 8F as upper bound in the second byte
+ * F1..F3  80..BF  80..BF  80..BF
+ *
+ * (X..Y denotes range from X to Y inclusive. X and Y are byte
+ * values written in hex.)
+ *
+ * Incomplete sequences are invalid.
+ *
+ * Range 80..BF is for trailing bytes (also called continuation
+ * bytes); these values are not valid as a starting byte. The adjacent
+ * values C0 and C1 look like starts of 2-byte sequences, but they
+ * could only produce overlong encodings of 00..7F, so they are not
+ * valid in UTF-8.
+ *
+ * Each of the starting bytes E0, ED, F0, F4 allows only a sub-range
+ * of 80..BF for the trailing byte at the second position. E0 and ED
+ * use different halves of the range (A0..BF and 80..9F). F0 and F4
+ * split it in a different proportion (48:16 values: 90..BF and
+ * 80..8F), also without overlap.
+ *
+ * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
+ * 80..BF are invalid at starting position.
+ * 00..7F,C2..F4 are invalid at any trailing position (actually they
+ * invalidate previous char while new starting byte itself can be a
+ * part of valid char, but the whole string would be invalid for
+ * purposes of valid_utf8()).
+ *
+ * See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
+ */
+
+/* This file has up to 6 levels of nesting, so tab-width 4 might be
+ * helpful. Deep nesting is the price for simple regular structure. */
+
+/* True if byte value c lies in 80..BF, the range of UTF-8 trailing
+ * (continuation) bytes. The argument is evaluated twice. */
+#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0)
+
+/* Below this line every valid_utf8() call also calls inc_test() first
+ * (presumably counting the executed check - confirm in unit-tests.c).
+ * A macro name is not re-expanded inside its own expansion, so the
+ * inner valid_utf8 refers to the real function. */
+#define valid_utf8(a) (inc_test(), valid_utf8((a)))
+/* Check cond; on failure print the stringified condition together with
+ * hex() of buf up to its first NUL byte, call inc_failed_test() and
+ * return from the calling function. Requires a variable named buf in
+ * scope at the point of use and a caller that returns void. */
+#define expect(cond) \
+	do { \
+		if (!(cond)) { \
+			printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \
+			       Results.test_name, #cond, hex(buf, strlen((void *)buf))); \
+			inc_failed_test(); \
+			return; /* early exit for the whole test */ \
+		} \
+	} while (0)
+
+/*
+ * Body of the valid_utf8() test. expect() leaves this function with a
+ * plain return on the first failed check.
+ *
+ * Values the checks below expect from valid_utf8():
+ *   1 for the empty string and for a single ASCII byte,
+ *   2 for exactly one valid multi-byte character (optionally followed
+ *     by one ASCII byte),
+ *   0 for every other sequence that is tried here.
+ */
+void _test_valid_utf8()
+{
+	/* Up to 5 data bytes plus NUL. {0} is used because empty initializer
+	 * braces are not valid ISO C before C23. */
+	UTF8 buf[6] = {0};
+
+	/* Empty string is valid. */
+	expect(valid_utf8(buf) == 1);
+
+	/* 1 byte: ASCII is valid, non-ascii alone is invalid. */
+	for (int c = 0; c < 256; c++) {
+		buf[0] = c;
+		buf[1] = '\0';
+		expect(valid_utf8(buf) == (c < 128));
+	}
+
+	/* Select the byte values tried at the third and fourth positions:
+	 * every 16th value (79, 89, ..., F9) plus dense runs around both
+	 * borders of the 80..BF range for trailing bytes (78..87, B7..C6). */
+	unsigned char trailing_sparse_check[256] = {0};
+	for (int c = 0x79; c < 256; c += 16)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0x80 - 8; c < 0x80 + 8; c++)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0xBF - 8; c < 0xBF + 8; c++)
+		trailing_sparse_check[c] = 1;
+
+	/* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */
+	for (int c1 = 128; c1 < 256; c1++) {
+		buf[0] = c1;
+		buf[1] = '\0';
+
+		/* Invalid starting byte would be checked with all endings. So
+		 * checks are sparse for invalid starting bytes; possibly valid
+		 * starting bytes (C2..F4) get every value of the second byte. */
+		int step = (buf[0] < 0xC2 || 0xF4 < buf[0]) ? 15 : 1;
+
+		for (int c2 = 0x70, r2; c2 < 256; c2 += step) {
+			/* The second byte is checked sparsely only for invalid starts. */
+			buf[1] = c2;
+			buf[2] = '\0';
+			r2 = valid_utf8(buf);
+
+			/* Complete 2-byte character: C2..DF followed by 80..BF. */
+			if (0xC2 <= buf[0] && buf[0] < 0xE0 &&
+			    is_trailing(buf[1])) {
+
+				expect(r2 == 2);
+
+				/* Additional test with 41 and F5 after valid 2-bytes sequence */
+				buf[3] = '\0';
+				buf[2] = 'A';
+				expect(valid_utf8(buf) == 2);
+				buf[2] = 0xF5;
+				expect(valid_utf8(buf) == 0);
+
+				/* Longer sequences starting with a valid character
+				 * are not explored. */
+				continue;
+			}
+
+			/* Anything else of length 2 is invalid or incomplete. */
+			expect(r2 == 0);
+			for (int c3 = 0x79, r3; c3 < 256; c3++) {
+				if (0 == trailing_sparse_check[c3])
+					continue; /* run code below sparsely */
+				buf[2] = c3;
+				buf[3] = '\0';
+				r3 = valid_utf8(buf);
+
+				/* Complete 3-byte character; E0 and ED restrict the
+				 * second byte to A0..BF and 80..9F respectively. */
+				if ((buf[0] == 0xE0 &&
+				     0xA0 <= buf[1] && buf[1] < 0xC0 &&
+				     is_trailing(buf[2])) ||
+
+				    (buf[0] == 0xED &&
+				     0x80 <= buf[1] && buf[1] < 0xA0 &&
+				     is_trailing(buf[2])) ||
+
+				    (0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED &&
+				     is_trailing(buf[1]) &&
+				     is_trailing(buf[2]))) {
+
+					expect(r3 == 2);
+					continue;
+				}
+
+				expect(r3 == 0);
+				for (int c4 = 0x79, r4; c4 < 256; c4++) {
+					if (0 == trailing_sparse_check[c4])
+						continue; /* run code below sparsely */
+					buf[3] = c4;
+					buf[4] = '\0';
+					r4 = valid_utf8(buf);
+
+					/* Complete 4-byte character; F0 and F4 restrict the
+					 * second byte to 90..BF and 80..8F respectively. */
+					if ((buf[0] == 0xF0 &&
+					     0x90 <= buf[1] && buf[1] < 0xC0 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    (buf[0] == 0xF4 &&
+					     0x80 <= buf[1] && buf[1] < 0x90 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    ((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) &&
+					     is_trailing(buf[1]) &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3]))) {
+
+						expect(r4 == 2);
+						continue;
+					}
+
+					expect(r4 == 0);
+					/* No 5-byte sequence built on an invalid 4-byte
+					 * prefix may be accepted. */
+					for (int c5 = 0x79; c5 < 256; c5 += 32) {
+						/* We test only a few values for the fifth byte. */
+						buf[4] = c5;
+						buf[5] = '\0';
+						expect(valid_utf8(buf) == 0);
+					}
+				}
+			}
+		}
+	}
+}
+
+/* Entry point called from main() in unit-tests.c: brackets the checks
+ * in _test_valid_utf8() with start_test()/end_test(). */
+void test_valid_utf8()
+{
+	start_test(__FUNCTION__);
+	failed = 0; /* NOTE(review): 'failed' is declared outside this file; presumably a per-test failure flag - confirm */
+	_test_valid_utf8();
+	end_test(); /* also reached when expect() made _test_valid_utf8() return early */
+}
+
+#undef expect
+#undef is_trailing
+#undef valid_utf8
diff --git a/src/tests/unit-tests.c b/src/tests/unit-tests.c
@@ -41,9 +41,12 @@
 #include "../misc.h"
 #include "../memory.h"
 #include "../common.h"
+#include "../unicode.h"
 
 #include "../sha2.h"
 
+struct options_main options; /* dummy definition so the test binary links with unicode.o; presumably unicode.o references 'options' - confirm */
+
 char *_fgetl_pad = NULL;
 #ifdef __sun
 /* Solaris fprintf() seems to get confused at around 16384 */
@@ -2439,6 +2442,9 @@ void test_sha2_c() {
 	end_test();
 }
 
+/* Tests for unicode.c */
+#include "test_valid_utf8.c"
+
 int main() {
 	start_of_run = clock();
 
@@ -2493,6 +2499,9 @@ int main() {
 	set_unit_test_source("sha2.c");
 	test_sha2_c();
 
+	set_unit_test_source("unicode.c");
+	test_valid_utf8();
+
 	// perform dump listing of all processed functions.
 	dump_stats();
 

diff --git a/src/unicode.c b/src/unicode.c
@@ -530,7 +530,48 @@ inline size_t strlen_any(const void *source)
 	return len;
 }
 
-/* Check if a string is valid UTF-8 */
+/*
+ * Check if a string is valid UTF-8
+ *
+ * Valid UTF-8 sequences of bytes (1-4 bytes long):
+ *
+ * 00..7F
+ *
+ * C2..DF  80..BF
+ *
+ *   E0    A0..BF  80..BF
+ *   ED    80..9F  80..BF
+ *   Ex    80..BF  80..BF  where Ex does not include E0 and ED
+ *
+ *   F0    90..BF  80..BF  80..BF
+ *   F4    80..8F  80..BF  80..BF  notice 8F as upper bound in the second byte
+ * F1..F3  80..BF  80..BF  80..BF
+ *
+ * (X..Y denotes range from X to Y inclusive. X and Y are byte
+ * values written in hex.)
+ *
+ * Incomplete sequences are invalid.
+ *
+ * Range 80..BF is for trailing bytes (also called continuation
+ * bytes); these values are not valid as a starting byte. The adjacent
+ * values C0 and C1 look like starts of 2-byte sequences, but they
+ * could only produce overlong encodings of 00..7F, so they are not
+ * valid in UTF-8.
+ *
+ * Each of the starting bytes E0, ED, F0, F4 allows only a sub-range
+ * of 80..BF for the trailing byte at the second position. E0 and ED
+ * use different halves of the range (A0..BF and 80..9F). F0 and F4
+ * split it in a different proportion (48:16 values: 90..BF and
+ * 80..8F), also without overlap.
+ *
+ * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
+ * 80..BF are invalid at starting position.
+ * 00..7F,C2..F4 are invalid at any trailing position (actually they
+ * invalidate previous char while new starting byte itself could be a
+ * part of a valid char, but even then the whole string would be
+ * invalid for purposes of valid_utf8()).
+ *
+ * See also  unicode.h  and  tests/test_valid_utf8.c
+ * See also  https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
+ */
 int valid_utf8(const UTF8 *source)
 {
 	UTF8 a;