From b7e5cab0ff89478bcfe2916c5a8844d05a782a23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= <git@random-projects.net>
Date: Sun, 21 Jan 2024 11:11:17 +0100
Subject: [PATCH] Implements ctype.h functions, adds support for panics, adds
 more code validation.

---
 README.md                    |  12 ++-
 build.zig                    | 132 ++++++++++++++++++++++++++++++++-
 include/ctype.h              |   2 +-
 include/foundation/libc.h    |  25 +++++++
 src/libc.zig                 |  74 +++++++++++++++++++
 src/modules/ctype.zig        | 137 +++++++++++++++++++++++++++++++----
 tests/syntactic-validation.c |   1 +
 7 files changed, 365 insertions(+), 18 deletions(-)
 create mode 100644 include/foundation/libc.h

diff --git a/README.md b/README.md
index 2aecec2..cec3d39 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,16 @@ The first goal is to reach full C11 *freestanding* support.
 - No support for `wchar_t` and `wchar.h` as it isn't portable between compilers.
 - Multi-byte character strings are implemented as UTF-8.
 
+## Customization
+
+Foundation libc offers very little customization or configuration beyond a few strictly required options.
+
+There is [`foundation/libc.h`](include/foundation/libc.h) which documents the behaviour of all required configurations.
+
+Right now, the following configurations exist:
+
+- `foundation_libc_panic_handler`, which allows users to catch detectable undefined behaviour.
+
 ## Development
 
 Zig Version: 0.11
@@ -67,7 +77,7 @@ Which functions belong into which header can be figured out by taking a look at
 | --------------- | ------------- | --------------------- | ------------------------------------------------------------------------------------------------------- |
 | `assert.h`      | ❌             |                       | Conditionally compiled macro that compares its argument to zero                                         |
 | `complex.h`     | ❌             |                       | (since C99) Complex number arithmetic                                                                   |
-| `ctype.h`       | ✅             | ⏳                     | Functions to determine the type contained in character data                                             |
+| `ctype.h`       | ✅             | ✅                     | Functions to determine the type contained in character data                                             |
 | `errno.h`       | ✅             | ✅                     | Macros reporting error conditions                                                                       |
 | `fenv.h`        | 🔮             |                       | (since C99) Floating-point environment                                                                  |
 | `float.h`       | 🔀             |                       | Limits of floating-point types                                                                          |
diff --git a/build.zig b/build.zig
index 926565c..7a4f7bd 100644
--- a/build.zig
+++ b/build.zig
@@ -19,11 +19,18 @@ pub fn createLibrary(b: *std.Build, target: std.zig.CrossTarget, optimize: std.b
 }
 
 pub fn build(b: *std.Build) void {
+    const validation_step = b.step("validate", "Runs the test suite and validates everything. Automatically triggered in Debug builds.");
+
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
 
     const single_threaded = b.option(bool, "single-threaded", "Create a single-threaded libc implementation (default: false)") orelse false;
 
+    // Run validation in debug builds for convenience:
+    if (optimize == .Debug) {
+        b.getInstallStep().dependOn(validation_step);
+    }
+
     // check if the host has a gcc or clang available:
     const maybe_gcc = b.findProgram(&.{"gcc"}, &.{}) catch null;
     const maybe_clang = b.findProgram(&.{"clang"}, &.{}) catch null;
@@ -41,6 +48,22 @@ pub fn build(b: *std.Build) void {
 
     // test suite:
     {
+        // Compile for a large number of targets to detect breakage early on:
+        for ([_]bool{ false, true }) |validation_single_threaded| {
+            for (std.enums.values(std.builtin.OptimizeMode)) |validation_optimize| {
+                for (validation_target_list) |validation_target| {
+
+                    // skip everything that cannot support multithreading on freestanding:
+                    if (!validation_single_threaded and !target_can_multithread(validation_target))
+                        continue;
+
+                    const vlc = createLibrary(b, validation_target, validation_optimize);
+                    vlc.single_threaded = validation_single_threaded;
+                    validation_step.dependOn(&vlc.step);
+                }
+            }
+        }
+
         const syntax_validator_source: std.Build.LazyPath = .{ .path = "tests/syntactic-validation.c" };
 
         // use the shipped C compiler to validate our code:
@@ -59,7 +82,7 @@ pub fn build(b: *std.Build) void {
             _ = syntax_validator.getEmittedBin();
 
             // Just compile, do not install:
-            b.getInstallStep().dependOn(&syntax_validator.step);
+            validation_step.dependOn(&syntax_validator.step);
         }
 
         // use the host C compilers to validate our code:
@@ -89,7 +112,7 @@ pub fn build(b: *std.Build) void {
             ext_compiler.addArg("-o");
             ext_compiler.addArg(b.pathJoin(&.{ b.makeTempPath(), "dummy" })); // we don't really care where this ends up
 
-            b.getInstallStep().dependOn(&ext_compiler.step);
+            validation_step.dependOn(&ext_compiler.step);
         }
     }
 }
@@ -104,6 +127,7 @@ const header_files = [_][]const u8{
     "string.h",
     "tgmath.h",
     "uchar.h",
+    "foundation/libc.h",
 };
 
 const common_c_flags = [_][]const u8{
@@ -119,6 +143,110 @@ const common_c_flags = [_][]const u8{
     "-Wno-reserved-identifier", // we actually want to implement those!
 };
 
+/// Tells whether a multi-threaded build of the library is attempted for `target`.
+/// The decision depends only on the CPU architecture of the target.
+fn target_can_multithread(target: std.zig.CrossTarget) bool {
+    const arch = target.getCpuArch();
+    return switch (arch) {
+        // these architectures are only ever built single-threaded:
+        .wasm32, .wasm64, .msp430 => false,
+        else => true,
+    };
+}
+
+/// Targets that the `validate` step compiles the library for.
+const validation_target_list = [_]std.zig.CrossTarget{
+    .{}, // regular host platform
+    .{ .os_tag = .freestanding }, // host platform, but no OS
+
+    // Common CPU targets, grouped by architecture family.
+    // arm:
+    .{ .cpu_arch = .arm, .os_tag = .freestanding },
+    .{ .cpu_arch = .armeb, .os_tag = .freestanding },
+    .{ .cpu_arch = .thumb, .os_tag = .freestanding },
+    .{ .cpu_arch = .thumbeb, .os_tag = .freestanding },
+    .{ .cpu_arch = .aarch64, .os_tag = .freestanding },
+    // .{ .cpu_arch = .aarch64_32, .os_tag = .freestanding }, // error: unknown target triple 'aarch64_32-unknown-unknown-eabi', please use -triple or -arch
+    .{ .cpu_arch = .aarch64_be, .os_tag = .freestanding },
+
+    // risc-v:
+    .{ .cpu_arch = .riscv32, .os_tag = .freestanding },
+    .{ .cpu_arch = .riscv64, .os_tag = .freestanding },
+
+    // intel:
+    .{ .cpu_arch = .x86_64, .os_tag = .freestanding },
+    .{ .cpu_arch = .x86, .os_tag = .freestanding },
+
+    // mips:
+    .{ .cpu_arch = .mips, .os_tag = .freestanding },
+    .{ .cpu_arch = .mips64, .os_tag = .freestanding },
+    .{ .cpu_arch = .mips64el, .os_tag = .freestanding },
+    .{ .cpu_arch = .mipsel, .os_tag = .freestanding },
+
+    // sparc:
+    .{ .cpu_arch = .sparc, .os_tag = .freestanding },
+    .{ .cpu_arch = .sparc64, .os_tag = .freestanding },
+    .{ .cpu_arch = .sparcel, .os_tag = .freestanding },
+
+    // power:
+    .{ .cpu_arch = .powerpc, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpc64, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpc64le, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpcle, .os_tag = .freestanding },
+
+    // web assembly:
+    .{ .cpu_arch = .wasm32, .os_tag = .freestanding },
+    .{ .cpu_arch = .wasm64, .os_tag = .freestanding },
+
+    // nice to have, but broken (NOTE(review): avr is still enabled here -- confirm that is intended):
+    .{ .cpu_arch = .avr, .os_tag = .freestanding },
+    // .{ .cpu_arch = .msp430, .os_tag = .freestanding }, // error: unknown target CPU 'generic'
+    // .{ .cpu_arch = .m68k, .os_tag = .freestanding },
+    // .{ .cpu_arch = .xtensa, .os_tag = .freestanding },
+
+    // Not yet evaluated whether these are reasonable to check:
+    //   arc
+    //   csky
+    //   hexagon
+    //   hsail
+    //   hsail64
+    //   kalimba
+    //   lanai
+    //   le32
+    //   le64
+    //   loongarch32
+    //   loongarch64
+    //   r600
+    //   s390x
+    //   shave
+    //   spu_2
+    //   tce
+    //   tcele
+    //   ve
+    //   xcore
+
+    // Will never be supported due to their properties:
+    //   spir
+    //   spir64
+    //   spirv32
+    //   spirv64
+
+    //   bpfeb
+    //   bpfel
+
+    //   renderscript32
+    //   renderscript64
+
+    //   amdgcn
+    //   amdil
+    //   amdil64
+
+    //   nvptx
+    //   nvptx64
+
+    //   dxil
+};
+
 const sdk_root = computeSdkRoot();
 
 fn computeSdkRoot() []const u8 {
diff --git a/include/ctype.h b/include/ctype.h
index feeed0d..c4e7caa 100644
--- a/include/ctype.h
+++ b/include/ctype.h
@@ -1,7 +1,7 @@
 #ifndef _FOUNDATION_LIBC_CTYPE_H_
 #define _FOUNDATION_LIBC_CTYPE_H_
 
-// TODO: #define EOF (-1)
+#define EOF (-1)
 
 int isalnum(int c);
 int isalpha(int c);
diff --git a/include/foundation/libc.h b/include/foundation/libc.h
new file mode 100644
index 0000000..7b245c8
--- /dev/null
+++ b/include/foundation/libc.h
@@ -0,0 +1,25 @@
+#ifndef _FOUNDATION_LIBC_INTERNALS_H_
+#define _FOUNDATION_LIBC_INTERNALS_H_
+
+#include <stddef.h>
+
+///
+/// Panic handler for undefined, but detectable behaviour in safe modes.
+///
+/// This will be invoked when Zig detects undefined behaviour at runtime,
+/// or when foundation libc recognizes illegal arguments.
+///
+/// `msg_ptr` points to the panic message, which is NOT NUL-terminated;
+/// `msg_len` is the length in bytes of the UTF-8 encoded message.
+///
+/// A weak default implementation is shipped, so simply define this
+/// function yourself to plug in custom behaviour.
+/// The default implementation executes a `trap` instruction, i.e. it
+/// emits an illegal instruction or otherwise crashes the program.
+///
+/// NOTE: This function must never return; if it does, the behaviour
+///       really is undefined!
+///
+void foundation_libc_panic_handler(char const * msg_ptr, size_t msg_len);
+
+#endif
diff --git a/src/libc.zig b/src/libc.zig
index 5dbdd5b..4475168 100644
--- a/src/libc.zig
+++ b/src/libc.zig
@@ -1,6 +1,38 @@
 const std = @import("std");
+const builtin = @import("builtin");
+
+pub const h = @cImport({
+    @cInclude("ctype.h");
+    @cInclude("errno.h");
+    @cInclude("inttypes.h");
+    @cInclude("math.h");
+    @cInclude("setjmp.h");
+    @cInclude("stdlib.h");
+    @cInclude("string.h");
+    @cInclude("tgmath.h");
+    @cInclude("uchar.h");
+    @cInclude("foundation/libc.h");
+});
 
 comptime {
+    // Sanity checks on the target's C types, evaluated at compile time:
+    std.debug.assert(@bitSizeOf(c_char) == 8);
+
+    // Integer types must not get narrower along char, short, int, long, long long:
+    std.debug.assert(@bitSizeOf(c_short) >= @bitSizeOf(c_char));
+    std.debug.assert(@bitSizeOf(c_int) >= @bitSizeOf(c_short));
+    std.debug.assert(@bitSizeOf(c_long) >= @bitSizeOf(c_int));
+    std.debug.assert(@bitSizeOf(c_longlong) >= @bitSizeOf(c_long));
+
+    // Every unsigned type must be exactly as wide as its signed counterpart:
+    std.debug.assert(@bitSizeOf(c_ushort) == @bitSizeOf(c_short));
+    std.debug.assert(@bitSizeOf(c_uint) == @bitSizeOf(c_int));
+    std.debug.assert(@bitSizeOf(c_ulong) == @bitSizeOf(c_long));
+    std.debug.assert(@bitSizeOf(c_ulonglong) == @bitSizeOf(c_longlong));
+}
+
+comptime {
+    // Drag in all implementations, so they are compiled:
     _ = @import("modules/ctype.zig");
     _ = @import("modules/errno.zig");
     _ = @import("modules/math.zig");
@@ -9,3 +41,45 @@ comptime {
     _ = @import("modules/string.zig");
     _ = @import("modules/uchar.zig");
 }
+
+/// Signals undefined behaviour detected by the libc itself; never returns.
+pub fn undefined_behaviour(comptime string: []const u8) noreturn {
+    switch (builtin.mode) {
+        // Unchecked mode: declare the situation impossible.
+        .ReleaseFast => unreachable,
+
+        // Size-optimized mode: panic with the shortest possible message.
+        .ReleaseSmall => @panic("UB"),
+
+        // Safe release mode: panic with the full description.
+        .ReleaseSafe => @panic("UNDEFINED BEHAVIOUR DETECTED: " ++ string),
+
+        // Debug mode: hit a breakpoint first so the offending call is easy to locate.
+        .Debug => {
+            @breakpoint();
+            @panic("UNDEFINED BEHAVIOUR: " ++ string);
+        },
+    }
+}
+
+/// Zig panic handler: forwards the message to `foundation_libc_panic_handler` and never returns.
+pub fn panic(msg: []const u8, maybe_error_return_trace: ?*std.builtin.StackTrace, maybe_return_address: ?usize) noreturn {
+    _ = maybe_error_return_trace;
+    _ = maybe_return_address;
+    h.foundation_libc_panic_handler(msg.ptr, msg.len);
+    @trap(); // the handler must not return; stop deterministically instead of invoking UB if it does
+}
+
+/// Default implementation of `foundation_libc_panic_handler`: ignores the
+/// message and halts execution through `@trap()`.
+fn fallback_panic_handler(msg_ptr: [*]const u8, msg_len: usize) callconv(.C) noreturn {
+    _ = .{ msg_ptr, msg_len };
+    @trap();
+}
+comptime {
+    @export(fallback_panic_handler, .{
+        .linkage = .Weak, // weak, so that a user-provided definition replaces this default
+        .name = "foundation_libc_panic_handler",
+        .visibility = .default,
+    });
+}
diff --git a/src/modules/ctype.zig b/src/modules/ctype.zig
index 2f3bee6..1f61331 100644
--- a/src/modules/ctype.zig
+++ b/src/modules/ctype.zig
@@ -1,18 +1,127 @@
 //! implementation of `ctype.h`
+//!
+//! The header <ctype.h> declares several functions useful for classifying and mapping
+//! characters. In all cases the argument is an int, the value of which shall be
+//! representable as an unsigned char or shall equal the value of the macro EOF. If the
+//! argument has any other value, the behavior is undefined.
+//!
 
 const std = @import("std");
+const libc = @import("../libc.zig");
 
-// TODO: isalnum
-// TODO: isalpha
-// TODO: isblank
-// TODO: iscntrl
-// TODO: isdigit
-// TODO: isgraph
-// TODO: islower
-// TODO: isprint
-// TODO: ispunct
-// TODO: isspace
-// TODO: isupper
-// TODO: isxdigit
-// TODO: tolower
-// TODO: toupper
+// Use an alias to std.ascii to allow potential future replacement
+// of the locale implementation:
+const locale = std.ascii;
+
+const EOF = libc.h.EOF;
+
+/// Narrows a ctype argument to `u8`: EOF yields `null`, any other out-of-range value is reported as undefined behaviour.
+fn conv(c: c_int) ?u8 {
+    if (c != EOF)
+        return std.math.cast(u8, c) orelse libc.undefined_behaviour("passed a value that is not unsigned char nor EOF to a ctype function");
+    return null;
+}
+
+/// The isalnum function tests for any character for which isalpha or isdigit is true.
+export fn isalnum(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isAlphanumeric(u));
+}
+
+/// The isalpha function tests for any character for which isupper or islower is true,
+/// or any character that is one of a locale-specific set of alphabetic characters for which
+/// none of iscntrl, isdigit, ispunct, or isspace is true. In the "C" locale,
+/// isalpha returns true only for the characters for which isupper or islower is true.
+export fn isalpha(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isAlphabetic(u));
+}
+
+/// The isblank function tests for any character that is a standard blank character or is one
+/// of a locale-specific set of characters for which isspace is true and that is used to
+/// separate words within a line of text. The standard blank characters are the following:
+/// space (' '), and horizontal tab ('\t'). In the "C" locale, isblank returns true only
+/// for the standard blank characters.
+export fn isblank(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(u == ' ' or u == '\t');
+}
+
+/// The iscntrl function tests for any control character.
+export fn iscntrl(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isControl(u));
+}
+
+/// The isdigit function tests for any decimal-digit character (as defined in 5.2.1).
+export fn isdigit(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isDigit(u));
+}
+
+/// The isgraph function tests for any printing character except space (' ').
+export fn isgraph(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isPrint(u) and u != ' ');
+}
+
+/// The islower function tests for any character that is a lowercase letter or is one of a
+/// locale-specific set of characters for which none of iscntrl, isdigit, ispunct, or
+/// isspace is true. In the "C" locale, islower returns true only for the lowercase
+/// letters (as defined in 5.2.1).
+export fn islower(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isLower(u));
+}
+
+/// The isprint function tests for any printing character including space (' ').
+export fn isprint(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isPrint(u));
+}
+
+/// The ispunct function tests for any printing character that is one of a locale-specific set
+/// of punctuation characters for which neither isspace nor isalnum is true. In the "C"
+/// locale, ispunct returns true for every printing character for which neither isspace
+/// nor isalnum is true. Non-printing characters (e.g. control characters) are never punctuation.
+export fn ispunct(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isPrint(u) and !locale.isWhitespace(u) and !locale.isAlphanumeric(u));
+}
+
+/// The isspace function tests for any character that is a standard white-space character or
+/// is one of a locale-specific set of characters for which isalnum is false. The standard
+/// white-space characters are the following: space (' '), form feed ('\f'), new-line
+/// ('\n'), carriage return ('\r'), horizontal tab ('\t'), and vertical tab ('\v'). In the
+/// "C" locale, isspace returns true only for the standard white-space characters.
+export fn isspace(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isWhitespace(u));
+}
+
+/// The isupper function tests for any character that is an uppercase letter or is one of a
+/// locale-specific set of characters for which none of iscntrl, isdigit, ispunct, or
+/// isspace is true. In the "C" locale, isupper returns true only for the uppercase
+/// letters (as defined in 5.2.1).
+export fn isupper(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isUpper(u));
+}
+
+/// The isxdigit function tests for any hexadecimal-digit character (as defined in 6.4.4.1).
+export fn isxdigit(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is in no character class: answer false (0); EOF itself is nonzero
+    return @intFromBool(locale.isHex(u));
+}
+
+/// Converts an uppercase letter to the corresponding lowercase letter; EOF is handed back as EOF.
+export fn tolower(c: c_int) c_int {
+    if (conv(c)) |char| return locale.toLower(char);
+    return EOF;
+}
+
+/// Converts a lowercase letter to the corresponding uppercase letter; EOF is handed back as EOF.
+export fn toupper(c: c_int) c_int {
+    if (conv(c)) |char| return locale.toUpper(char);
+    return EOF;
+}
diff --git a/tests/syntactic-validation.c b/tests/syntactic-validation.c
index e009250..c31050e 100644
--- a/tests/syntactic-validation.c
+++ b/tests/syntactic-validation.c
@@ -2,6 +2,7 @@
 // our own files must be included as non-system includes to trigger warnings:
 #include "ctype.h"
 #include "errno.h"
+#include "foundation/libc.h"
 #include "inttypes.h"
 #include "math.h"
 #include "setjmp.h"