From b7e5cab0ff89478bcfe2916c5a8844d05a782a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Sun, 21 Jan 2024 11:11:17 +0100 Subject: [PATCH] Implements ctype.h functions, adds support for panics, adds more code validation. --- README.md | 12 ++- build.zig | 132 ++++++++++++++++++++++++++++++++- include/ctype.h | 2 +- include/foundation/libc.h | 25 +++++++ src/libc.zig | 74 +++++++++++++++++++ src/modules/ctype.zig | 137 +++++++++++++++++++++++++++++++---- tests/syntactic-validation.c | 1 + 7 files changed, 365 insertions(+), 18 deletions(-) create mode 100644 include/foundation/libc.h diff --git a/README.md b/README.md index 2aecec2..cec3d39 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,16 @@ The first goal is to reach full C11 *freestanding* support. - No support for `wchar_t` and `wchar.h` as it isn't portable between compilers. - Multi-byte character strings are implemented as UTF-8. +## Customization + +Foundation libc doesn't really support much customization/configuration except for the hard required options. + +There is [`foundation/libc.h`](include/foundation/libc.h) which documents the behaviour of all required configurations. + +Right now, the following configurations exist: + +- `foundation_libc_panic_handler`, which allows users to catch detectable undefined behaviour. 
+ ## Development Zig Version: 0.11 @@ -67,7 +77,7 @@ Which functions belong into which header can be figured out by taking a look at | --------------- | ------------- | --------------------- | ------------------------------------------------------------------------------------------------------- | | `assert.h` | ❌ | | Conditionally compiled macro that compares its argument to zero | | `complex.h` | ❌ | | (since C99) Complex number arithmetic | -| `ctype.h` | ✅ | ⏳ | Functions to determine the type contained in character data | +| `ctype.h` | ✅ | ✅ | Functions to determine the type contained in character data | | `errno.h` | ✅ | ✅ | Macros reporting error conditions | | `fenv.h` | 🔮 | | (since C99) Floating-point environment | | `float.h` | 🔀 | | Limits of floating-point types | diff --git a/build.zig b/build.zig index 926565c..7a4f7bd 100644 --- a/build.zig +++ b/build.zig @@ -19,11 +19,18 @@ pub fn createLibrary(b: *std.Build, target: std.zig.CrossTarget, optimize: std.b } pub fn build(b: *std.Build) void { + const validation_step = b.step("validate", "Runs the test suite and validates everything. 
Automatically triggered in Debug builds."); + const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); const single_threaded = b.option(bool, "single-threaded", "Create a single-threaded libc implementation (default: false)") orelse false; + // Run validation in debug builds for convenience: + if (optimize == .Debug) { + b.getInstallStep().dependOn(validation_step); + } + // check if the host has a gcc or clang available: const maybe_gcc = b.findProgram(&.{"gcc"}, &.{}) catch null; const maybe_clang = b.findProgram(&.{"clang"}, &.{}) catch null; @@ -41,6 +48,22 @@ pub fn build(b: *std.Build) void { // test suite: { + // Compile for huge amount of targets to detect breakage early on: + for ([_]bool{ false, true }) |validation_single_threaded| { + for (std.enums.values(std.builtin.OptimizeMode)) |validation_optimize| { + for (validation_target_list) |validation_target| { + + // skip everything that cannot support multithreading on freestanding: + if (!validation_single_threaded and !target_can_multithread(validation_target)) + continue; + + const vlc = createLibrary(b, validation_target, validation_optimize); + vlc.single_threaded = validation_single_threaded; + validation_step.dependOn(&vlc.step); + } + } + } + const syntax_validator_source: std.Build.LazyPath = .{ .path = "tests/syntactic-validation.c" }; // use the shipped C compiler to validate our code: @@ -59,7 +82,7 @@ pub fn build(b: *std.Build) void { _ = syntax_validator.getEmittedBin(); // Just compile, do not install: - b.getInstallStep().dependOn(&syntax_validator.step); + validation_step.dependOn(&syntax_validator.step); } // use the host C compilers to validate our code: @@ -89,7 +112,7 @@ pub fn build(b: *std.Build) void { ext_compiler.addArg("-o"); ext_compiler.addArg(b.pathJoin(&.{ b.makeTempPath(), "dummy" })); // we don't really care where this ends up - b.getInstallStep().dependOn(&ext_compiler.step); + validation_step.dependOn(&ext_compiler.step); } } } @@ 
-104,6 +127,7 @@ const header_files = [_][]const u8{
     "string.h",
     "tgmath.h",
     "uchar.h",
+    "foundation/libc.h",
 };
 
 const common_c_flags = [_][]const u8{
@@ -119,6 +143,110 @@ const common_c_flags = [_][]const u8{
     "-Wno-reserved-identifier", // we actually want to implement those!
 };
 
+fn target_can_multithread(target: std.zig.CrossTarget) bool {
+    return switch (target.getCpuArch()) {
+        .wasm32,
+        .wasm64,
+        .msp430,
+        => false,
+
+        else => true,
+    };
+}
+
+const validation_target_list = [_]std.zig.CrossTarget{
+    .{}, // regular host platform
+    .{ .os_tag = .freestanding }, // host platform, but no OS
+
+    // Check several common cpu targets:
+
+    // arm:
+    .{ .cpu_arch = .arm, .os_tag = .freestanding },
+    .{ .cpu_arch = .armeb, .os_tag = .freestanding },
+    .{ .cpu_arch = .thumb, .os_tag = .freestanding },
+    .{ .cpu_arch = .thumbeb, .os_tag = .freestanding },
+    .{ .cpu_arch = .aarch64, .os_tag = .freestanding },
+    // .{ .cpu_arch = .aarch64_32, .os_tag = .freestanding }, // error: unknown target triple 'aarch64_32-unknown-unknown-eabi', please use -triple or -arch
+    .{ .cpu_arch = .aarch64_be, .os_tag = .freestanding },
+
+    // risc-v:
+    .{ .cpu_arch = .riscv32, .os_tag = .freestanding },
+    .{ .cpu_arch = .riscv64, .os_tag = .freestanding },
+
+    // intel:
+    .{ .cpu_arch = .x86_64, .os_tag = .freestanding },
+    .{ .cpu_arch = .x86, .os_tag = .freestanding },
+
+    // mips:
+    .{ .cpu_arch = .mips, .os_tag = .freestanding },
+    .{ .cpu_arch = .mips64, .os_tag = .freestanding },
+    .{ .cpu_arch = .mips64el, .os_tag = .freestanding },
+    .{ .cpu_arch = .mipsel, .os_tag = .freestanding },
+
+    // sparc:
+    .{ .cpu_arch = .sparc, .os_tag = .freestanding },
+    .{ .cpu_arch = .sparc64, .os_tag = .freestanding },
+    .{ .cpu_arch = .sparcel, .os_tag = .freestanding },
+
+    // power:
+    .{ .cpu_arch = .powerpc, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpc64, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpc64le, .os_tag = .freestanding },
+    .{ .cpu_arch = .powerpcle, .os_tag = .freestanding },
+
+    // web assembly:
+    .{ .cpu_arch = .wasm32, .os_tag = .freestanding },
+    .{ .cpu_arch = .wasm64, .os_tag = .freestanding },
+
+    // nice to have, but mostly broken (only avr is enabled):
+    .{ .cpu_arch = .avr, .os_tag = .freestanding },
+    // .{ .cpu_arch = .msp430, .os_tag = .freestanding }, // error: unknown target CPU 'generic'
+    // .{ .cpu_arch = .m68k, .os_tag = .freestanding },
+    // .{ .cpu_arch = .xtensa, .os_tag = .freestanding },
+
+    // Not evaluated if reasonable to check:
+    // arc
+    // csky
+    // hexagon
+    // hsail
+    // hsail64
+    // kalimba
+    // lanai
+    // le32
+    // le64
+    // loongarch32
+    // loongarch64
+    // r600
+    // s390x
+    // shave
+    // spu_2
+    // tce
+    // tcele
+    // ve
+    // xcore
+
+    // will never be supported due to their properties:
+    // spir
+    // spir64
+    // spirv32
+    // spirv64
+
+    // bpfeb
+    // bpfel
+
+    // renderscript32
+    // renderscript64
+
+    // amdgcn
+    // amdil
+    // amdil64
+
+    // nvptx
+    // nvptx64
+
+    // dxil
+};
+
 const sdk_root = computeSdkRoot();
 
 fn computeSdkRoot() []const u8 {
diff --git a/include/ctype.h b/include/ctype.h
index feeed0d..c4e7caa 100644
--- a/include/ctype.h
+++ b/include/ctype.h
@@ -1,7 +1,7 @@
 #ifndef _FOUNDATION_LIBC_CTYPE_H_
 #define _FOUNDATION_LIBC_CTYPE_H_
 
-// TODO: #define EOF (-1)
+#define EOF (-1)
 
 int isalnum(int c);
 int isalpha(int c);
diff --git a/include/foundation/libc.h b/include/foundation/libc.h
new file mode 100644
index 0000000..7b245c8
--- /dev/null
+++ b/include/foundation/libc.h
@@ -0,0 +1,25 @@
+#ifndef _FOUNDATION_LIBC_INTERNALS_H_
+#define _FOUNDATION_LIBC_INTERNALS_H_
+
+#include <stddef.h>
+
+///
+/// Panic handler for undefined, but catchable behaviour in safe modes.
+///
+/// This will be invoked when Zig detects undefined behaviour at runtime,
+/// or when foundation libc can recognize illegal arguments.
+///
+/// The function receives a non-terminated pointer to the panic message
+/// with `msg_len` bytes of UTF-8 encoded payload.
+///
+/// It has a weak default implementation shipped, so just implement this
+/// function to plug in your own custom behaviour.
+/// The default implementation is done by invoking a `trap` instruction to
+/// emit an illegal instruction or otherwise crash the program execution.
+///
+/// NOTE: This function must never return, because otherwise, the undefined
+/// behaviour will be actually undefined!
+///
+void foundation_libc_panic_handler(char const * msg_ptr, size_t msg_len);
+
+#endif
diff --git a/src/libc.zig b/src/libc.zig
index 5dbdd5b..4475168 100644
--- a/src/libc.zig
+++ b/src/libc.zig
@@ -1,6 +1,38 @@
 const std = @import("std");
+const builtin = @import("builtin");
+
+pub const h = @cImport({
+    @cInclude("ctype.h");
+    @cInclude("errno.h");
+    @cInclude("inttypes.h");
+    @cInclude("math.h");
+    @cInclude("setjmp.h");
+    @cInclude("stdlib.h");
+    @cInclude("string.h");
+    @cInclude("tgmath.h");
+    @cInclude("uchar.h");
+    @cInclude("foundation/libc.h");
+});
 
 comptime {
+    // Some assertions over the target platform:
+    std.debug.assert(@bitSizeOf(c_char) == 8);
+
+    // Ensure hierarchy:
+    std.debug.assert(@bitSizeOf(c_short) >= @bitSizeOf(c_char));
+    std.debug.assert(@bitSizeOf(c_int) >= @bitSizeOf(c_short));
+    std.debug.assert(@bitSizeOf(c_long) >= @bitSizeOf(c_int));
+    std.debug.assert(@bitSizeOf(c_longlong) >= @bitSizeOf(c_long));
+
+    // Ensure same-sized signed and unsigned
+    std.debug.assert(@bitSizeOf(c_ushort) == @bitSizeOf(c_short));
+    std.debug.assert(@bitSizeOf(c_uint) == @bitSizeOf(c_int));
+    std.debug.assert(@bitSizeOf(c_ulong) == @bitSizeOf(c_long));
+    std.debug.assert(@bitSizeOf(c_ulonglong) == @bitSizeOf(c_longlong));
+}
+
+comptime {
+    // Drag in all implementations, so they are compiled:
     _ = @import("modules/ctype.zig");
     _ = @import("modules/errno.zig");
     _ = @import("modules/math.zig");
@@ -9,3 +41,45 @@ comptime {
     _ = @import("modules/string.zig");
     _ = @import("modules/uchar.zig");
 }
+
+/// Invokes safety-checked undefined behaviour, use this to implement
+/// UB checks in the libc itself.
+pub fn undefined_behaviour(comptime string: []const u8) noreturn {
+    switch (builtin.mode) {
+        // In debug mode, trigger a breakpoint so it's easier to detect the situation
+        // of the undefined behaviour:
+        .Debug => {
+            @breakpoint();
+            @panic("UNDEFINED BEHAVIOUR: " ++ string);
+        },
+
+        // Safe modes get a descriptive panic message:
+        .ReleaseSafe => @panic("UNDEFINED BEHAVIOUR DETECTED: " ++ string),
+
+        .ReleaseSmall => @panic("UB"), // still panics, but with a minimal message
+
+        .ReleaseFast => unreachable, // no check at all: the behaviour is really undefined
+    }
+}
+
+/// Zig panic handler, forwards panics to `foundation_libc_panic_handler`.
+pub fn panic(msg: []const u8, maybe_error_return_trace: ?*std.builtin.StackTrace, maybe_return_address: ?usize) noreturn {
+    _ = maybe_error_return_trace;
+    _ = maybe_return_address;
+    h.foundation_libc_panic_handler(msg.ptr, msg.len);
+    @trap(); // the handler must not return; trap instead of re-entering this panic handler through a checked `unreachable`
+}
+
+/// Default implementation for `foundation_libc_panic_handler`: ignores the message and traps.
+fn fallback_panic_handler(msg_ptr: [*]const u8, msg_len: usize) callconv(.C) noreturn {
+    _ = msg_ptr;
+    _ = msg_len;
+    @trap();
+}
+comptime {
+    @export(fallback_panic_handler, std.builtin.ExportOptions{
+        .name = "foundation_libc_panic_handler",
+        .linkage = .Weak,
+        .visibility = .default,
+    });
+}
diff --git a/src/modules/ctype.zig b/src/modules/ctype.zig
index 2f3bee6..1f61331 100644
--- a/src/modules/ctype.zig
+++ b/src/modules/ctype.zig
@@ -1,18 +1,127 @@
 //! implementation of `ctype.h`
+//!
+//! The header declares several functions useful for classifying and mapping
+//! characters. In all cases the argument is an int, the value of which shall be
+//! representable as an unsigned char or shall equal the value of the macro EOF. If the
+//! argument has any other value, the behavior is undefined.
+//!
 
const std = @import("std");
+const libc = @import("../libc.zig");
 
-// TODO: isalnum
-// TODO: isalpha
-// TODO: isblank
-// TODO: iscntrl
-// TODO: isdigit
-// TODO: isgraph
-// TODO: islower
-// TODO: isprint
-// TODO: ispunct
-// TODO: isspace
-// TODO: isupper
-// TODO: isxdigit
-// TODO: tolower
-// TODO: toupper
+// Use an alias to std.ascii to allow potential future replacement
+// of the locale implementation:
+const locale = std.ascii;
+
+const EOF = libc.h.EOF;
+
+/// Converts `c` to u8: returns null for EOF, any other value outside of unsigned char is undefined behaviour.
+fn conv(c: c_int) ?u8 {
+    if (c == EOF)
+        return null;
+    return std.math.cast(u8, c) orelse libc.undefined_behaviour("passed a value that is not unsigned char nor EOF to a ctype function");
+}
+
+/// The isalnum function tests for any character for which isalpha or isdigit is true.
+export fn isalnum(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isAlphanumeric(u));
+}
+
+/// The isalpha function tests for any character for which isupper or islower is true,
+/// or any character that is one of a locale-specific set of alphabetic characters for which
+/// none of iscntrl, isdigit, ispunct, or isspace is true. In the "C" locale,
+/// isalpha returns true only for the characters for which isupper or islower is true.
+export fn isalpha(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isAlphabetic(u));
+}
+
+/// The isblank function tests for any character that is a standard blank character or is one
+/// of a locale-specific set of characters for which isspace is true and that is used to
+/// separate words within a line of text. The standard blank characters are the following:
+/// space (' '), and horizontal tab ('\t'). In the "C" locale, isblank returns true only
+/// for the standard blank characters.
+export fn isblank(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(u == ' ' or u == '\t');
+}
+
+/// The iscntrl function tests for any control character.
+export fn iscntrl(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isControl(u));
+}
+
+/// The isdigit function tests for any decimal-digit character (as defined in 5.2.1).
+export fn isdigit(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isDigit(u));
+}
+
+/// The isgraph function tests for any printing character except space (' ').
+export fn isgraph(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isPrint(u) and (u != ' '));
+}
+
+/// The islower function tests for any character that is a lowercase letter or is one of a
+/// locale-specific set of characters for which none of iscntrl, isdigit, ispunct, or
+/// isspace is true. In the "C" locale, islower returns true only for the lowercase
+/// letters (as defined in 5.2.1).
+export fn islower(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isLower(u));
+}
+
+/// The isprint function tests for any printing character including space (' ').
+export fn isprint(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isPrint(u));
+}
+
+/// The ispunct function tests for any printing character that is one of a locale-specific set
+/// of punctuation characters for which neither isspace nor isalnum is true. In the "C"
+/// locale, ispunct returns true for every printing character for which neither isspace
+/// nor isalnum is true.
+export fn ispunct(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isPrint(u) and !locale.isWhitespace(u) and !locale.isAlphanumeric(u)); // only printing characters qualify
+}
+
+/// The isspace function tests for any character that is a standard white-space character or
+/// is one of a locale-specific set of characters for which isalnum is false. The standard
+/// white-space characters are the following: space (' '), form feed ('\f'), new-line
+/// ('\n'), carriage return ('\r'), horizontal tab ('\t'), and vertical tab ('\v'). In the
+/// "C" locale, isspace returns true only for the standard white-space characters.
+export fn isspace(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isWhitespace(u));
+}
+
+/// The isupper function tests for any character that is an uppercase letter or is one of a
+/// locale-specific set of characters for which none of iscntrl, isdigit, ispunct, or
+/// isspace is true. In the "C" locale, isupper returns true only for the uppercase
+/// letters (as defined in 5.2.1).
+export fn isupper(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isUpper(u));
+}
+
+/// The isxdigit function tests for any hexadecimal-digit character (as defined in 6.4.4.1).
+export fn isxdigit(c: c_int) c_int {
+    const u = conv(c) orelse return 0; // EOF is a member of no character class
+    return @intFromBool(locale.isHex(u));
+}
+
+/// The tolower function converts an uppercase letter to a corresponding lowercase letter.
+export fn tolower(c: c_int) c_int {
+    const u = conv(c) orelse return EOF; // EOF is returned unchanged
+    return locale.toLower(u);
+}
+
+/// The toupper function converts a lowercase letter to a corresponding uppercase letter.
+export fn toupper(c: c_int) c_int {
+    const u = conv(c) orelse return EOF; // EOF is returned unchanged
+    return locale.toUpper(u);
+}
diff --git a/tests/syntactic-validation.c b/tests/syntactic-validation.c
index e009250..c31050e 100644
--- a/tests/syntactic-validation.c
+++ b/tests/syntactic-validation.c
@@ -2,6 +2,7 @@
 // our own files must be included as non-system includes to trigger warnings:
 #include "ctype.h"
 #include "errno.h"
+#include "foundation/libc.h"
 #include "inttypes.h"
 #include "math.h"
 #include "setjmp.h"