From 62d64e6a62b9b23270783a883baed35a2f479dbe Mon Sep 17 00:00:00 2001
From: Martin Fouilleul <martinfouilleul@gmail.com>
Date: Fri, 4 Oct 2024 19:08:02 +0200
Subject: [PATCH] utf8 validation

---
 src/util/utf8.c | 78 ++++++++++++++++++++++++++++++-------------------
 src/util/utf8.h | 15 ++++++++++
 2 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/src/util/utf8.c b/src/util/utf8.c
index f992db46..f18fc734 100644
--- a/src/util/utf8.c
+++ b/src/util/utf8.c
@@ -27,8 +27,6 @@ static const char trailingBytesForUTF8[256] = {
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
 };
 
-#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)
-
 //-----------------------------------------------------------------
 //NOTE: getting sizes / offsets / indices
 //-----------------------------------------------------------------
@@ -129,25 +127,38 @@ oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset)
     //NOTE(martin): get the first codepoint in str, and advance index to the
     //              next oc_utf8 character
     //TODO(martin): check for utf-16 surrogate pairs
-    oc_utf32 cp = 0;
-    u64 sz = 0;
+    oc_utf8_dec res = { .status = OC_UTF8_OK };
 
-    if(offset >= string.len || !string.ptr[offset])
+    if(offset >= string.len)
     {
-        cp = 0;
-        sz = 1;
+        res.status = OC_UTF8_OUT_OF_BOUNDS;
+        res.size = 1;
     }
     else if(!oc_utf8_is_start_byte(string.ptr[offset]))
     {
         //NOTE(martin): unexpected continuation or invalid character.
-        cp = 0xfffd;
-        sz = 1;
+        if((string.ptr[offset] & 0xc0) == 0x80)
+        {
+            res.status = OC_UTF8_UNEXPECTED_CONTINUATION_BYTE;
+        }
+        else
+        {
+            res.status = OC_UTF8_INVALID_BYTE;
+        }
+
+        res.codepoint = 0xfffd;
+        res.size = 1;
     }
     else
     {
         int expectedSize = oc_utf8_size_from_leading_char(string.ptr[offset]);
         do
         {
+            if(offset >= string.len)
+            {
+                res.status = OC_UTF8_OUT_OF_BOUNDS;
+                break;
+            }
             /*NOTE(martin):
 				we shift 6 bits and add the next byte at each round.
 				at the end we have our oc_utf8 codepoint, added to the shifted versions
@@ -155,39 +166,46 @@ oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset)
 				precomputed in offsetsFromUTF8.
 			*/
             unsigned char b = string.ptr[offset];
-            cp <<= 6;
-            cp += b;
+            res.codepoint <<= 6;
+            res.codepoint += b;
             offset += 1;
-            sz++;
+            res.size++;
 
-            if(b == 0xc0 || b == 0xc1 || b >= 0xc5)
+            if(b == 0xc0 || b == 0xc1 || b >= 0xf5)
             {
                 //NOTE(martin): invalid byte encountered
+                res.status = OC_UTF8_INVALID_BYTE;
+                break;
+            }
+            if(res.size > 1 && oc_utf8_is_start_byte(b))
+            {
+                res.status = OC_UTF8_UNEXPECTED_LEADING_BYTE;
                 break;
             }
         }
-        while(offset < string.len
-              && string.ptr[offset]
-              && !oc_utf8_is_start_byte(string.ptr[offset])
-              && sz < expectedSize);
+        while(res.size < expectedSize);
 
-        if(sz != expectedSize)
-        {
-            //NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
-            cp = 0xfffd;
-        }
-        else
+        if(res.status == OC_UTF8_OK)
         {
-            cp -= offsetsFromUTF8[sz - 1];
+            res.codepoint -= offsetsFromUTF8[res.size - 1];
 
             //NOTE(martin): check for invalid codepoints
-            if(cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
+            if((res.size == 3 && res.codepoint < 0x800) || (res.size == 4 && res.codepoint < 0x10000))
             {
-                cp = 0xfffd;
+                res.status = OC_UTF8_OVERLONG_ENCODING;
             }
+            else if(res.codepoint > 0x10ffff || (res.codepoint >= 0xd800 && res.codepoint <= 0xdfff))
+            {
+                res.status = OC_UTF8_INVALID_CODEPOINT;
+            }
+        }
+
+        if(res.status != OC_UTF8_OK)
+        {
+            //NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
+            res.codepoint = 0xfffd;
         }
     }
-    oc_utf8_dec res = { .codepoint = cp, .size = sz };
     return (res);
 }
 
@@ -225,7 +243,7 @@ oc_str8 oc_utf8_encode(char* dest, oc_utf32 codePoint)
         dest[3] = (codePoint & 0x3F) | 0x80;
         sz = 4;
     }
-    oc_str8 res = {.ptr = dest , .len = sz};
+    oc_str8 res = { .ptr = dest, .len = sz };
     return (res);
 }
 
@@ -239,7 +257,7 @@ oc_str32 oc_utf8_to_codepoints(u64 maxCount, oc_utf32* backing, oc_str8 string)
         backing[codePointIndex] = decode.codepoint;
         byteOffset += decode.size;
     }
-    oc_str32 res = {.ptr = backing , .len = codePointIndex};
+    oc_str32 res = { .ptr = backing, .len = codePointIndex };
     return (res);
 }
 
@@ -257,7 +275,7 @@ oc_str8 oc_utf8_from_codepoints(u64 maxBytes, char* backing, oc_str32 codePoints
         oc_utf8_encode(backing + byteOffset, codePoint);
         byteOffset += byteCount;
     }
-    oc_str8 res = {.ptr = backing , .len = byteOffset};
+    oc_str8 res = { .ptr = backing, .len = byteOffset };
     return (res);
 }
 
diff --git a/src/util/utf8.h b/src/util/utf8.h
index f699aeb6..a299f11b 100644
--- a/src/util/utf8.h
+++ b/src/util/utf8.h
@@ -21,6 +21,8 @@ typedef u32 oc_utf32;
 //-----------------------------------------------------------------
 //NOTE: getting sizes / offsets / indices
 //-----------------------------------------------------------------
+#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)
+
 ORCA_API u32 oc_utf8_size_from_leading_char(char leadingChar);
 ORCA_API u32 oc_utf8_codepoint_size(oc_utf32 codePoint);
 
@@ -33,8 +35,21 @@ ORCA_API u64 oc_utf8_prev_offset(oc_str8 string, u64 byteOffset);
 //-----------------------------------------------------------------
 //NOTE: encoding / decoding
 //-----------------------------------------------------------------
+
+typedef enum oc_utf8_status
+{
+    OC_UTF8_OK,
+    OC_UTF8_OUT_OF_BOUNDS,
+    OC_UTF8_UNEXPECTED_CONTINUATION_BYTE,
+    OC_UTF8_UNEXPECTED_LEADING_BYTE,
+    OC_UTF8_INVALID_BYTE,
+    OC_UTF8_INVALID_CODEPOINT,
+    OC_UTF8_OVERLONG_ENCODING,
+} oc_utf8_status;
+
 typedef struct oc_utf8_dec
 {
+    oc_utf8_status status;
     oc_utf32 codepoint; //NOTE: decoded codepoint
     u32 size;           //NOTE: size of corresponding oc_utf8 sequence
 } oc_utf8_dec;