From 190fd1f19575280b8d9f8e0886afba08cea654c8 Mon Sep 17 00:00:00 2001 From: Akuli <akuviljanen17@gmail.com> Date: Sun, 10 Dec 2023 20:10:10 +0200 Subject: [PATCH] Document a lot of stuff (#440) --- README.md | 3 +- doc/perf.md | 56 +- doc/tutorial.md | 654 +++++++++++++++++++++ doc/ub.md | 389 ++++++++++++ doctest.sh | 11 +- self_hosted/tokenizer.jou | 2 +- src/tokenize.c | 2 +- tests/syntax_error/overlong_char.jou | 2 +- tests/syntax_error/utf8_multibyte_char.jou | 2 + 9 files changed, 1071 insertions(+), 50 deletions(-) create mode 100644 doc/tutorial.md create mode 100644 doc/ub.md create mode 100644 tests/syntax_error/utf8_multibyte_char.jou diff --git a/README.md b/README.md index 66735670..5cbe0e75 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@ def main() -> int: return 0 ``` -See the [examples](./examples/) and [tests](./tests/) directories for more example programs. +See the [examples](./examples/) and [tests](./tests/) directories for more example programs +or read [the Jou tutorial](./doc/tutorial.md). So far, Jou is usable enough to do [Advent of Code 2023](https://adventofcode.com/). We'll see whether I get 50 stars with Jou this year. diff --git a/doc/perf.md b/doc/perf.md index 6b3e2144..b7af5d37 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,4 +1,4 @@ -# Performance and optimizations +# Performance and Optimizations Because Jou uses [LLVM](https://llvm.org/), it is faster than interpreted languages like Python, @@ -198,7 +198,7 @@ Let's explore these with more examples. TODO: write this section once a large Jou program exists and name it Example #1 -### Example #2: Optimizer's assumptions +### Example #2: Optimizer's assumptions and undefined behavior Let's write a program that crashes if the user selects yes. @@ -240,9 +240,13 @@ $ ``` The optimizations make the program ignore the code to access the value of a `NULL` pointer. 
-Essentially it thinks that the `x = *foo` code will never run, because you aren't supposed to access the value of a NULL pointer. This code will thus get ignored. +Essentially it thinks that the `x = *foo` code will never run, +because you aren't supposed to access the value of a NULL pointer. +This code will thus get ignored. -Sidenote: if you want the program to crash with optimizations on, then you should do so using `abort()` function for example: +Accessing a `NULL` pointer is an example of **undefined behavior**, or **UB** for short. +Undefined behavior is generally a Bad Thing: if your code has UB, you should fix it. +For example, a much better way to crash the program would be using the `abort()` function: ```python import "stdlib/io.jou" @@ -255,7 +259,7 @@ def main() -> int: return 0 ``` -Now the program crashes when `y` is typed, even if optimizations are enabled: +Now the program contains no UB. It crashes when `y` is typed, even if optimizations are enabled: ``` $ ./jou -O3 asd.jou @@ -263,41 +267,9 @@ Crash this program? (y/n) y Aborted ``` -Accessing the value of a NULL pointer is an example of **undefined behavior** (UB). -The optimizer naturally assumes that your program does not have anything that causes UB, -and as such if it does, it could in principle do anything when it is ran with optimizations enabled. Use at your own risk. +UB is easiest to find and understand when optimizations are turned off. +For example, the optimizer might realize that a large part of the code cannot possibly run without invoking UB, +and hence just delete it, like it deleted our crashing code in the above example. +This would be much more confusing to debug than a crash. -Here are a few examples of things that are UB in Jou: -- Accessing the value of a `NULL` pointer. -- Setting the value of a `NULL` pointer. -- Reading the 11th member from an array of length 10. -- Using the value of a variable before it has been set. 
- For example, `x: int` followed by `printf("%d\n", x)` - without doing something like `x = 0` before printing. - -The takeaway from this is that these are all things that one would never do intentionally. -The rest of Jou's documentation aims to mention other things that are UB. - -In some other languages, it is easier to get UB than in Jou. -For example, in C it is UB to add two `int`s so large -that the result doesn't fit into an `int`, -but in Jou, math operations are instead guaranteed to wrap around: - -```python -import "stdlib/io.jou" - -def main() -> int: - printf("%d\n", (254 as byte) + (0 as byte)) # Output: 254 - printf("%d\n", (254 as byte) + (1 as byte)) # Output: 255 - printf("%d\n", (254 as byte) + (2 as byte)) # Output: 0 - printf("%d\n", (254 as byte) + (3 as byte)) # Output: 1 - printf("%d\n", (254 as byte) + (4 as byte)) # Output: 2 - - printf("%d\n", 2147483646 + 0) # Output: 2147483646 - printf("%d\n", 2147483646 + 1) # Output: 2147483647 - printf("%d\n", 2147483646 + 2) # Output: -2147483648 - printf("%d\n", 2147483646 + 3) # Output: -2147483647 - printf("%d\n", 2147483646 + 4) # Output: -2147483646 - - return 0 -``` +For more about UB, see [the UB docs](ub.md). diff --git a/doc/tutorial.md b/doc/tutorial.md new file mode 100644 index 00000000..105d6daa --- /dev/null +++ b/doc/tutorial.md @@ -0,0 +1,654 @@ +# Jou Tutorial + +On surface level, Jou looks a lot like Python, but **it doesn't behave like Python**, +so you will probably be disappointed if you know Python well and you expect all of your knowledge to work as is. +The main differences are: +- Jou is compiled into native binaries, not interpreted. +- Jou uses C's standard library. +- Jou's integer types are fixed-size and can wrap around. +- All data in a computer consists of bytes. High-level languages hide this fact, Jou exposes it. +- Jou doesn't hide various other details about how computers work. +- Jou has Undefined Behavior. 
+- Jou uses manual memory management, not garbage-collection. + +If none of this makes any sense to you, that's fine. +The rest of this page explains it all using lots of example code. + +Basically, all of this means that Jou is more difficult to use, but as a result, +Jou code tends to run faster than e.g. Python (see [the performance docs](./perf.md) for more details). +Also, knowing Jou makes learning other low-level languages (C, C++, Rust, ...) much easier. + + +## Main function and binaries + +When you run a Jou program, Jou first produces an executable file, and then runs it. +On Windows, executable file names must end with `.exe` (e.g. `jou.exe` or `hello.exe`). +On most other systems, executable files typically don't have a file extension at all (e.g. `jou` or `hello`). +By default, Jou places executables into a folder named `jou_compiled/`. + +For example, if you run `hello.jou`, you get a file named +`jou_compiled\hello\hello.exe` (Windows) or `jou_compiled/hello/hello` (other platforms). +You can run this file without Jou, or even move it to a different computer that doesn't have Jou, and run it there. + +When the operating system runs an executable, +it finds a function named `main()` in it and calls it. +The return value of the `main()` function is an integer, +and the operating system gives it to the program that ran the executable. +This means that every executable must have a `main()` function that returns an integer. +Jou doesn't hide this, and therefore all Jou programs contain something like this: + +```python +def main() -> int: + ... + return 0 +``` + +This integer is called the **exit code** of the process. +By convention, exit code `0` means "success". Anything else means "error". +You can use different exit codes to represent different errors, but `1` is the most common. 
+ + +## Printing + +To print a string, you can use the `puts()` function from [stdlib/io.jou](../stdlib/io.jou): + +```python +import "stdlib/io.jou" + +def main() -> int: + puts("Hello") # Output: Hello + return 0 +``` + +However, `puts()` only prints strings. +You can use `printf()` to print values of other types. +Here's how it works: + +```python +import "stdlib/io.jou" + +def main() -> int: + printf("Hello\n") # Output: Hello + printf("strings %s %s\n", "foo", "bar") # Output: strings foo bar + printf("ints %d %d %d\n", 1, 2, 3) # Output: ints 1 2 3 + printf("doubles %f %.2f\n", 3.1415, 3.1415) # Output: doubles 3.141500 3.14 + printf("floats %f %.2f\n", 3.1415 as float, 3.1415 as float) # Output: floats 3.141500 3.14 + printf("%d is %s and %d is %s\n", 4, "even", 7, "odd") # Output: 4 is even and 7 is odd + return 0 +``` + +Here: +- `%d` gets replaced with an `int` argument that you provide +- `%s` means a string +- `%f` means a `float` or `double` (`float` takes up less memory but is also less accurate, just use `double` if you don't know which to use) +- `%.2f` means a `float` or `double` rounded to two decimal places +- `as float` is a type cast, needed to construct a `float`. + +There are various other `%` things you can pass to `printf()`. +Just search something like "printf format specifiers" online: +`printf()` is actually not a Jou-specific thing (see below). + +You need the `\n` to get a newline. +The `printf()` function doesn't add it automatically. +This seems annoying, but on the other hand, it means that you can do things like this: + +```python +import "stdlib/io.jou" + +# Output: the numbers are 1 2 3 +def main() -> int: + printf("the numbers are") + for i = 1; i <= 3; i++: + printf(" %d", i) + printf("\n") + return 0 +``` + + +## C's standard library (libc) + +We did `import "stdlib/io.jou"` to use the `printf()` function. 
+If you look at [stdlib/io.jou](../stdlib/io.jou), +there is only one line of code related to `printf()`: + +```python +declare printf(pattern: byte*, ...) -> int # Example: printf("%s %d\n", "hi", 123) +``` + +How in the world can this one line of code define a function that does so many different things? + +This doesn't actually define the `printf()` function, it only **declares** it. +This line of code tells the Jou compiler +"there exists a function named `printf()`, and it is defined somewhere else". +The `printf()` function is actually defined in the **libc**, +which is the standard library of the C programming language. + +C is an old, small, simple and low-level programming language. +Jou is very heavily inspired by C, and in many ways similar to C and compatible with C. +For example, Jou programs can use libraries written in C, +so in practice, any large Jou project needs libc anyway. +With `declare`, we basically use things that the libc provides instead of reinventing the wheel. + + +## `byte`, `int`, `long` + +From a programmer's point of view, a byte is an integer between 0 and 255 (inclusive). +Alternatively, you can think of a `byte` as consisting of 8 bits, where a bit means 0 or 1. +Two bits can be set to 4 different states (00, 01, 10, 11), so you could use 2 bits to represent numbers 0 to 3. +Similarly, 8 bits can be set to 256 different states +that correspond with numbers 0 to 255. + +In Jou, the `byte` data type represents a single byte. +To construct a byte, you can do e.g. `123 as byte`, +where the type cast with `as` converts from `int` to `byte`. 
+If you try to convert a number larger than 255 into a `byte`, it will wrap back around to zero: + +```python +import "stdlib/io.jou" + +def main() -> int: + printf("%d\n", 254 as byte) # Output: 254 + printf("%d\n", 255 as byte) # Output: 255 + printf("%d\n", 256 as byte) # Output: 0 + printf("%d\n", 257 as byte) # Output: 1 + printf("%d\n", 258 as byte) # Output: 2 + return 0 +``` + +Bytes get converted to `int` implicitly when calling `printf()`, +so it's fine to specify `%d` and pass in a `byte`. + +Each byte has 256 different possible values (0 - 255), +so with 2 bytes, you get `256 * 256` different values: +for each first byte, you have 256 possible second bytes. +If we used 4 bytes instead of one byte, we would get `256 * 256 * 256 * 256 = 4294967296` different combinations, +and we would be able to handle much bigger numbers. +In fact, this is exactly what Jou's `int` does: +**Jou's `int` is 4 bytes (32 bits)**. +For example, `1000` and `1000000` are valid `int`s: + +```python +import "stdlib/io.jou" + +def main() -> int: + printf("%d\n", 1000 * 1000) # Output: 1000000 + printf("%d\n", 1000 * 1000 * 1000) # Output: 1000000000 + return 0 +``` + +Specifically, the range of an `int` is from `-2147483648` to `2147483647`. +Note that `int`s can be negative, but bytes cannot. +This works by basically using the first bit as the sign bit: +the first bit is 1 for negative numbers and 0 for nonnegative numbers, +and the remaining 31 bits work more or less like you would expect. + +Sometimes `int` isn't big enough. +When `int` wraps around, you usually get negative numbers when you expect things to be positive, +and you should probably use `long` instead of `int`. +**Jou's `long` is 8 bytes (64 bits)**, so twice the size of an `int` and hence much less likely to wrap around. +To create a `long`, add `L` to the end of the number, as in `123L` or `-2000000000000L`. +To print a `long`, use `%lld` instead of `%d`. 
+ +```python +import "stdlib/io.jou" + +def main() -> int: + printf("%d\n", 1000 * 1000 * 1000 * 1000) # Output: -727379968 + printf("%lld\n", 1000L * 1000L * 1000L * 1000L) # Output: 1000000000000 + return 0 +``` + +The range of `long` is from `-9223372036854775808` to `9223372036854775807`. +Please create an issue on GitHub if you need an even larger range. + + +## Pointers + +In this context, "memory" means the computer's RAM, not hard disk or SSD. + +All data in any modern computer consists of bytes. +A computer's memory is basically a big list of bytes, +and an `int` is just 4 consecutive bytes somewhere inside that list. +Jou does not hide that, and in fact, as a Jou programmer +**you will need to often treat the computer's memory as a big array of bytes**. + +To get started, let's make a variable and ask Jou to print its index in the big list of bytes: + +```python +import "stdlib/io.jou" + +def main() -> int: + b = 123 as byte + printf("%p\n", &b) + return 0 +``` + +Here the `p` of `%p` is short for "pointer". + +This prints something like `0x7ffd85fd3db7`. +This is a number written in hexadecimal, +and it means `140726851419575`. +Hexadecimal basically means that instead of representing numbers with 10 digits (`0`-`9`), +we use 16 "digits" (`0`-`9` and then `a`-`f`). +The prefix `0x` is a convention to indicate that the number is in heXadecimal. +How exactly hexadecimal works is not really relevant here, +but what matters is that we got some number. +So: + +``` +memory_of_the_computer[140726851419575] == 123 +``` + +Numbers that represent indexes into the computer's memory like this +are called **memory addresses**. +The `&` operator is called the **address-of operator**, +because `&b` computes the address of the `b` variable. + +<p><details> +<summary>An unimportant "ahchthually" that you can skip</summary> + +The memory addresses are not necessarily just indexes into RAM. 
+For example, the Linux kernel moves infrequently accessed things to disk +when RAM is about to get full (this is called **swapping**). +This doesn't change memory addresses within the program, +so you don't need to think about swapping when you write Jou programs. +The OS will take care of mapping your memory addresses to the right place. + +I think the locations in RAM are called **physical addresses**, +and the memory addresses that Jou programs see are called **virtual addresses**. +I'm not sure about the names though. +I don't think of this much: I just imagine that everything goes in RAM, +and on the rest of this page I continue to do so. + +</details> + +If you run the code above, +you will almost certainly get a different memory address than I got. +Even on the same computer I get a different memory address every time, +because the program essentially loads into whatever memory location is available: + +``` +$ ./jou asd.jou +0x7ffe7e1ded17 +$ ./jou asd.jou +0x7ffff24bec87 +$ ./jou asd.jou +0x7fff356b6dd7 +$ ./jou asd.jou +0x7ffeabcfe7f7 +``` + +In Jou, memory addresses are represented as **pointers**. +A pointer is a memory address together with a type. +For example, `&b` is a pointer of type `byte*`, meaning a pointer to a value of type `byte`. +Similarly, `int*` would be a pointer to a value of type `int`, +pointing to the first of the 4 consecutive bytes that an `int` uses. +We could, for example, make a function that sets the value of a given `int*`: + +```python +import "stdlib/io.jou" + +def set_to_500(pointer: int*) -> None: + *pointer = 500 + +def main() -> int: + n = 123 + set_to_500(&n) + printf("%d\n", n) # Output: 500 + return 0 +``` + +Because the `set_to_500()` function knows the memory address of the `n` variable, +it can just set the value at that memory address. +The `*` operator is sometimes called the **value-of operator**, +and `*foo` means the value of a pointer `foo`. 
+Note that the value-of operator is the opposite of the address-of operator: +`&*foo` and `*&foo` are unnecessary, because you might as well use `foo` directly. + +As you can see, a function call can change the values of variables outside that function. +However, the variables passed as pointers are clearly marked with `&`, +so it isn't as confusing as it seems to be at first. +A common way to use this is to return multiple values from the same function: + +```python +import "stdlib/io.jou" + +def get_point(x: int*, y: int*) -> None: + *x = 123 + *y = 456 + +def main() -> int: + x: int + y: int + get_point(&x, &y) + printf("The point is (%d,%d)\n", x, y) # Output: The point is (123,456) + return 0 +``` + +Instead of pointers, you could also use an `int[2]` array to return the two values. +However, **this doesn't mean that you don't need to understand pointers**, +as they have many other uses in Jou. + +```python +import "stdlib/io.jou" + +def get_point() -> int[2]: + return [123, 456] + +def main() -> int: + point = get_point() + printf("The point is (%d,%d)\n", point[0], point[1]) # Output: The point is (123,456) + return 0 +``` + + +## Undefined Behavior (UB) + +Consider again the pointer example above: + +```python +import "stdlib/io.jou" + +def get_point(x: int*, y: int*) -> None: + *x = 123 + *y = 456 + +def main() -> int: + x: int + y: int + get_point(&x, &y) + printf("The point is (%d,%d)\n", x, y) # Output: The point is (123,456) + return 0 +``` + +Here `x: int` creates a variable of type `int` without assigning a value to it. +If you try to use the value of `x` before it is set, +you will most likely get a compiler warning together with a random garbage value when the program runs. 
+For example, if I delete the `get_point(&x, &y)` line, I get: + +``` +compiler warning for file "asd.jou", line 10: the value of 'x' is undefined +compiler warning for file "asd.jou", line 10: the value of 'y' is undefined +The point is (-126484104,-126484088) +``` + +Again, Jou doesn't attempt to hide the way the computer's memory works. +When you do `x: int`, you tell Jou: +"give me 4 bytes of memory, and from now on, interpret those 4 bytes as an integer". +That memory has probably been used for something else before your function gets it, +so it will contain whatever the previous thing stored there. +Those 4 bytes were probably not used as an integer, +and once you interpret them as an integer anyway, +you tend to get something nonsensical. + +This is one example of **UB (Undefined Behavior)** in Jou. +In general, UB is a Bad Thing, because code that contains UB can behave unpredictably. +You need to know about UB, +because **the Jou compiler does not always warn you when you're about to do UB.** +See [UB documentation](ub.md) for more info. + + +## Memory safety, speed, ease of use: pick two + +Ideally, a programming language would be: +- memory safe (basically means that you cannot get UB by accident) +- fast +- simple/easy to use. + +So far I haven't seen a programming language that would check all boxes to me, +and I think it is not possible to make such a language. +However, every combination of two features has been done: +- Jou and C are fast and simple languages, but not memory safe. +- Python is memory safe and easy to use, but not very fast compared to Jou or C. +- Rust is memory safe and fast, but difficult to use. + +Jou intentionally chooses the same tradeoff as C. +The purpose of Jou is to be a lot like C, +but with various annoyances fixed, and of course, with Python's simple syntax. + + +## Characters + +You can place a character in single quotes to specify a byte. +This byte is the number that represents the character in the computer's memory. 
+For example, almost all `a` characters in your computer are represented with the byte 97. + +```python +import "stdlib/io.jou" + +def main() -> int: + printf("%d\n", 'a') # Output: 97 + printf("%d\n", ':') # Output: 58 + printf("%d\n", '0') # Output: 48 + return 0 +``` + +Note that single quotes specify a byte and double quotes specify a string. + +This clearly cannot work for all characters, +because there are thousands of different characters, but only 256 different bytes. +For example, `'Ω'` doesn't work: + +```python +printf("%d\n", 'Ω') # Error: single quotes are for specifying a byte, maybe use double quotes to instead make a string? +``` + +In fact, this only works for ASCII characters, such as letters `A-Z a-z` and numbers `0-9`. +There are a total of 128 ASCII characters (bytes 0 to 127). +Other characters are made up by combining multiple bytes per character (bytes 128 to 255). +This is how UTF-8 works. +It is used in Jou, because it is by far the most common way to represent text in computers, +and using anything else would be weird and impractical. + +To see how many bytes a character consists of, +you can use the `strlen()` function from [stdlib/str.jou](../stdlib/str.jou). +It calculates the length of a string in bytes. + +```python +import "stdlib/io.jou" +import "stdlib/str.jou" + +def main() -> int: + printf("%lld\n", strlen("o")) # Output: 1 + printf("%lld\n", strlen("Ω")) # Output: 2 + printf("%lld\n", strlen("foo")) # Output: 3 + printf("%lld\n", strlen("fΩΩ")) # Output: 5 + return 0 +``` + +We are using `%lld`, because `strlen()` returns a `long`. +You can see it by looking at how [stdlib/str.jou](../stdlib/str.jou) declares `strlen()`: + +```python +declare strlen(s: byte*) -> long +``` + + +## More about strings + +A Jou string is just a chunk of memory, +represented as a `byte*` pointer to the start of the memory. +There is a zero byte to mark the end of the string. + +For example, the string `"hello"` is 6 bytes. Let's print the bytes. 
+ +```python +import "stdlib/io.jou" + +def main() -> int: + s = "hello" + for i = 0; i < 6; i++: + printf("byte %d = %d\n", i, s[i]) + return 0 + +# Output: byte 0 = 104 +# Output: byte 1 = 101 +# Output: byte 2 = 108 +# Output: byte 3 = 108 +# Output: byte 4 = 111 +# Output: byte 5 = 0 +``` + +Each byte corresponds with a letter. For example, 108 is the letter `l`. +You can see that it is repeated: there are two `l`'s in `hello`. + +``` + 'h' 'e' 'l' 'l' 'o' +memory_of_the_computer = [ ..., 104, 101, 108, 108, 111, 0, ... ] + ↑ + s +``` + +The syntax `s[i]` gets the value `i` items forward from the pointer. +Because we have a `byte*` pointer, each item is 1 byte, +so `s[3]` moves 3 bytes forward, for example. + +``` + 'h' 'e' 'l' 'l' 'o' +memory_of_the_computer = [ ..., 104, 101, 108, 108, 111, 0, ... ] + s[0] s[1] s[2] s[3] s[4] s[5] +``` + +To slice the string to get just `llo`, you can simply do `&s[2]`; +that is, take a pointer to `s[2]`. + +```python +import "stdlib/io.jou" + +def main() -> int: + s = "hello" + printf("%s\n", &s[2]) # Output: llo + return 0 +``` + +You can also use the `++` and `--` operator to move pointers by one item at a time. +They move strings one byte at a time, because strings are `byte*` pointers. + +```python +import "stdlib/io.jou" + +def main() -> int: + s = "hello" + s++ + printf("%s\n", s) # Output: ello + s++ + printf("%s\n", s) # Output: llo + s-- + s-- + printf("%s\n", s) # Output: hello + return 0 +``` + +To instead remove characters from the end of the string, +you can simply place a zero byte to the middle of the string. +Usually the zero byte is written as `'\0'`, which means same as `0 as byte` +but is slightly more readable after getting used to it. + +```python +import "stdlib/io.jou" + +def main() -> int: + s = "hello" + s[2] = '\0' + printf("%s\n", s) # Output: he + return 0 +``` + +However, this code contains a subtle bug. 
+To see it, let's put this code into a loop and add some prints: + +```python +import "stdlib/io.jou" + +def main() -> int: + for i = 0; i < 3; i++: + s = "hello" + printf("Before truncation: %s\n", s) + s[2] = '\0' + printf("After truncation: %s\n", s) + return 0 +``` + +This prints: + +``` +Before truncation: hello +After truncation: he +Before truncation: he +After truncation: he +Before truncation: he +After truncation: he +``` + +It seems that the string `"hello"` became permanently truncated. +When the loop does `s = "hello"` for a second time, +it actually gets the truncated version `"he"`. + +**Do not modify strings in this way.** +They are not meant to be modified. +If you want to modify a string, use an array of bytes, +e.g. `byte[100]` for a maximum length of 100 bytes (including `'\0'`). +To do that, simply specify the type of the string as `byte[100]`: + +```python +import "stdlib/io.jou" + +def main() -> int: + for i = 0; i < 3; i++: + # create an array to hold the characters + s: byte[100] = "hello" + printf("Before truncation: %s\n", s) + s[2] = '\0' + printf("After truncation: %s\n", s) + return 0 +``` + +Now this prints: + +``` +Before truncation: hello +After truncation: he +Before truncation: hello +After truncation: he +Before truncation: hello +After truncation: he +``` + +Note that `s[2] = '\0'` and printing `s` work in the same exact way +regardless of whether `s` is a `byte*` or a `byte[100]`. +Specifically, Jou does an **implicit cast** that +takes the pointer to the first element of the array, +and so the `byte[100]` can act as a `byte*` when needed. + +If you don't want to hard-code a maximum size for the string (100 in this example), +you can instead use heap memory. +The `strdup()` function from [stdlib/str.jou](../stdlib/str.jou) +allocates the right amount of heap memory to hold a string (including the `'\0'`) and copies it there. +You should `free()` the memory once you no longer need the string. 
+ +TODO: document heap allocations better + +```python +import "stdlib/io.jou" +import "stdlib/str.jou" +import "stdlib/mem.jou" + +def main() -> int: + s = strdup("hello") + + printf("Before truncation: %s\n", s) # Output: Before truncation: hello + s[2] = '\0' + printf("After truncation: %s\n", s) # Output: After truncation: he + + free(s) + return 0 +``` + + +## What next? + +To learn more about Jou, I recommend: +- reading other documentation files in the [doc](../doc/) folder +- reading files in [stdlib/](../stdlib/) +- writing small Jou programs (e.g. [Advent of Code](https://adventofcode.com/)) +- browsing Jou's issues on GitHub and fixing some of them :) diff --git a/doc/ub.md b/doc/ub.md new file mode 100644 index 00000000..4c2d4341 --- /dev/null +++ b/doc/ub.md @@ -0,0 +1,389 @@ +# Undefined Behavior + +Undefined behavior (UB) basically means that your code does something dumb. +For example, these things are UB: +- Reading the value of a `NULL` pointer. +- Setting the value of a `NULL` pointer. +- Reading or setting the 11th member in an array of length 10. +- Reading or setting the value of a pointer into a local variable that no longer exists. + Local variables no longer exist after the function has finished running, + either with a `return` statement or by reaching the end of the code in the function. +- Using the value of a variable before it has been set. + For example, `x: int` followed by `printf("%d\n", x)` + without doing something like `x = 0` before printing. + +In the rest of this file, we look at some of the most common symptoms of UB, +so that you will know what to look for when you cause UB. +Every experienced Jou (or C or C++) programmer has caused UB by accident and fixed it many times. + +If your program has UB, you might get: +- a garbage value that just happened to be in the computer's memory +- random results, e.g. sometimes what you expect and sometimes a garbage value +- a perfectly working program +- a crash +- something else. 
+ +UB is not just a Jou thing. +If you want to learn other "fast" languages, such as C, C++, Rust or Zig, +you will need to eventually learn about UB anyway. +Rust handles UB differently from any other language I have seen. +See the end of this page. + +Also, UB can be useful if your code doesn't invoke it. +For example, because accessing elements beyond the end of an array is UB, +the Jou compiler doesn't add slow bounds-checking to array indexing in your programs. +See also [performance docs](perf.md). + + +## Garbage values + +For example, let's look at this program: + +```python +import "stdlib/io.jou" + +def main() -> int: + arr = [1, 2, 3] + + sum = 0 + for i = 0; i < 4; i++: + sum += arr[i] + printf("%d\n", sum) + + return 0 +``` + +This is supposed to calculate `1 + 2 + 3`, so it should print 6. +On my system it prints `-115019848`. +If I run the program again, it instead prints `1308074024`. +In fact, it seems like I get a different value every time. +The problem is that the loop reads one element beyond the end of the array, +so whatever garbage happens to be in the computer's memory at that location +gets converted to an integer and added to `sum`. 
+ + +## Randomly working and not working + +Here's another common mistake that results in garbage values: + +```python +import "stdlib/io.jou" +import "stdlib/str.jou" + +def make_string(n: int) -> byte*: + result: byte[50] + sprintf(result, "foo%d", n) + return result + +def main() -> int: + printf("%s\n", make_string(3)) + return 0 +``` + +When I run this repeatedly on my computer, I sometimes get `foo3` and sometimes a blank line: + +``` +akuli@akuli-desktop:~/jou$ ./jou a.jou + +akuli@akuli-desktop:~/jou$ ./jou a.jou + +akuli@akuli-desktop:~/jou$ ./jou a.jou + +akuli@akuli-desktop:~/jou$ ./jou a.jou +foo3 +akuli@akuli-desktop:~/jou$ ./jou a.jou +foo3 +akuli@akuli-desktop:~/jou$ ./jou a.jou + +akuli@akuli-desktop:~/jou$ ./jou a.jou + +``` + +The `make_string()` function uses `sprintf()` from [stdlib/str.jou](../stdlib/str.jou) +to create a string that looks like `"foo3"`. +It then returns it as a `byte*`. +For convenience, Jou converts `byte[50]` strings to `byte*` strings implicitly +(works with any size of byte array), +so the function actually returns a pointer to the first character of the string. + +This program contains UB, because it reads from a pointer into a local variable that no longer exists. +More specifically, it tells `printf()` to read from a local variable inside `make_string()`, +but because the return value of `make_string()` is used as an argument to `printf()`, +the call to `make_string()` is evaluated first. +Once `make_string()` has returned, its local variables no longer exist, +and as you would expect, it is UB to access pointers that point into them. + +A simple fix is to return the entire array from `make_string()`, not just the first character. +In other words, we change `-> byte*` to `-> byte[50]`. 
+This gives us a new compiler error on a different line: + +``` +compiler error in file "a.jou", line 10: cannot create a pointer into an array that comes from a function call (try storing it to a local variable first) +``` + +Line 10 is `printf("%s\n", make_string(3))`. +The compiler is trying to convert the array into a pointer here, +because `printf()` wants a pointer. +If we just do like the error message suggests, +we end up storing the array in `main()`, which is great because it no longer vanishes unexpectedly: + +```python +import "stdlib/io.jou" +import "stdlib/str.jou" + +def make_string(n: int) -> byte[50]: + result: byte[50] + sprintf(result, "foo%d", n) + return result + +def main() -> int: + s = make_string(3) + printf("%s\n", s) # Output: foo3 + return 0 +``` + +This code does not contain UB, and it prints `foo3` as expected every time. + + +## Perfectly working program with UB + +Let's modify the example from earlier by making an array of `byte`s instead of `int`s. + +```python +import "stdlib/io.jou" + +def main() -> int: + arr = [1 as byte, 2 as byte, 3 as byte] + + sum = 0 + for i = 0; i < 4; i++: + sum += arr[i] + printf("%d\n", sum) + + return 0 +``` + +On my Linux system, this program prints 6 every time as expected. + +This program still contains UB, and it should be fixed. +I make no guarantees of anything working as expected when your program contains UB. +For example, your code might suddenly stop working when you [enable optimizations](perf.md), +or when you run the program on a different operating system. +In fact, the above program printed `2` when I tried it on Windows. + + +## Crashing and valgrind + +Let's try reading array elements way beyond the end of the array, rather than just one index beyond. 
+ +```python +import "stdlib/io.jou" + +def main() -> int: + arr = [1, 2, 3] + + sum = 0 + for i = 0; i < 10000; i++: + sum += arr[i] + printf("%d\n", sum) + + return 0 +``` + +Here's what running this code looks like on my Linux system: + +``` +akuli@akuli-desktop:~/jou$ ./jou a.jou +Segmentation fault +``` + +`Segmentation fault` means that +the program tried to access memory that doesn't belong to it. +Only a small part of the computer's memory belongs to our program, +and when it accesses memory beyond that area, the operating system notices it and kills the program. + +The `Segmentation fault` error message doesn't mention the file name and line number (`a.jou`, `8`) where the crash happened. +It doesn't even mention the function name (`main()`). +If you are on Linux, you can install valgrind (e.g. `sudo apt install valgrind`) and invoke Jou with `--valgrind`. +If you need to debug a crash and you are not on Linux, please create an issue on GitHub. + +Running Jou with `--valgrind` looks like this: + +``` +akuli@akuli-desktop:~/jou$ ./jou --valgrind a.jou +==12317== Invalid read of size 4 +==12317== at 0x401180: main (in /home/akuli/jou/jou_compiled/a/a) +==12317== Address 0x1fff001000 is not stack'd, malloc'd or (recently) free'd +==12317== +==12317== +==12317== Process terminating with default action of signal 11 (SIGSEGV) +==12317== Access not within mapped region at address 0x1FFF001000 +==12317== at 0x401180: main (in /home/akuli/jou/jou_compiled/a/a) +==12317== If you believe this happened as a result of a stack +==12317== overflow in your program's main thread (unlikely but +==12317== possible), you can try to increase the size of the +==12317== main thread stack using the --main-stacksize= flag. +==12317== The main thread stack size used in this run was 8388608. 
+Segmentation fault +``` + +The relevant part of the error message is: + +``` +==12317== Invalid read of size 4 +==12317== at 0x401180: main (in /home/akuli/jou/jou_compiled/a/a) +==12317== Address 0x1fff001000 is not stack'd, malloc'd or (recently) free'd +``` + +Here `Invalid read` means that we tried to read memory that doesn't belong to the program, +and `size 4` means we tried to read 4 bytes at a time. +Because `int` is 4 bytes, seeing 4 bytes usually means that the code is trying to access an `int` value. +The first of the four bytes is at address `0x1fff001000`. + +The `at 0x401180: main` line means that the crash happened in the `main()` function. +To see this better, let's modify the code so that multiple functions are involved in the crash: + +```python +def foo() -> int: + arr = [1, 2, 3] + sum = 0 + for i = 0; i < 10000; i++: + sum += arr[i] + return sum + +def bar() -> int: + return foo() + +def main() -> int: + bar() + return 0 +``` + +Now I get: + +``` +==12715== Invalid read of size 4 +==12715== at 0x401180: foo (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011A5: bar (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011AF: ??? (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011B5: main (in /home/akuli/jou/jou_compiled/a/a) +==12715== Address 0x1fff001000 is not stack'd, malloc'd or (recently) free'd +==12715== +==12715== +==12715== Process terminating with default action of signal 11 (SIGSEGV) +==12715== Access not within mapped region at address 0x1FFF001000 +==12715== at 0x401180: foo (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011A5: bar (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011AF: ??? 
(in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011B5: main (in /home/akuli/jou/jou_compiled/a/a) +==12715== If you believe this happened as a result of a stack +==12715== overflow in your program's main thread (unlikely but +==12715== possible), you can try to increase the size of the +==12715== main thread stack using the --main-stacksize= flag. +==12715== The main thread stack size used in this run was 8388608. +Segmentation fault +``` + +The relevant lines are: + +``` +==12715== at 0x401180: foo (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011A5: bar (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011AF: ??? (in /home/akuli/jou/jou_compiled/a/a) +==12715== by 0x4011B5: main (in /home/akuli/jou/jou_compiled/a/a) +``` + +This means that: +- `foo()` crashed +- `bar()` is the function that called `foo()` +- `main()` is the function that called `bar()` + +The `???` is something irrelevant that I don't fully understand. It can be ignored. + +Unfortunately valgrind doesn't show the name of the `.jou` file or any line numbers. +This could be fixed in the Jou compiler. +If you run into this and it annoys you, please create an issue on GitHub, +or if someone has already created the issue, add a comment to it. + + +## NULL pointers + +Consider this program: + +```python +import "stdlib/io.jou" + +def main() -> int: + p: int* = NULL + printf("%d\n", p[2]) + return 0 +``` + +This crashes with a `Segmentation fault` error. 
+With `jou --valgrind filename.jou` I get: + +``` +akuli@akuli-desktop:~/jou$ ./jou --valgrind a.jou +==17004== Invalid read of size 4 +==17004== at 0x401161: main (in /home/akuli/jou/jou_compiled/a/a) +==17004== Address 0x8 is not stack'd, malloc'd or (recently) free'd +==17004== +==17004== +==17004== Process terminating with default action of signal 11 (SIGSEGV) +==17004== Access not within mapped region at address 0x8 +==17004== at 0x401161: main (in /home/akuli/jou/jou_compiled/a/a) +==17004== If you believe this happened as a result of a stack +==17004== overflow in your program's main thread (unlikely but +==17004== possible), you can try to increase the size of the +==17004== main thread stack using the --main-stacksize= flag. +==17004== The main thread stack size used in this run was 8388608. +Segmentation fault +``` + +Here `Address 0x8` means that the memory we were reading is at address `0x8` in hexadecimal, which is 8. +This is because `NULL` means address 0, so +- `*p` or `p[0]` would access memory addresses 0, 1, 2 and 3 +- `p[1]` would access memory addresses 4, 5, 6, 7 +- `p[2]` would access memory addresses 8 (failed here), 9, 10 and 11. + +In general, reading or writing a NULL pointer crashes the program. +You can distinguish these crashes by looking at the address in valgrind output: +a small address like `0x8` means a `NULL` problem. +Previously we got a much bigger address `0x1fff001000` +when accessing memory beyond the end of an array. + +Note that because of optimizations, +the program might not actually access the NULL pointer as you would expect. +To work around that, you can use `jou --valgrind -O0 filename.jou`. +See also [the optimization docs](perf.md). + + +## Rust's approach to UB + +I try to add various warnings to Jou, so that the compiler will let you know if you're about to cause UB. +However, **Jou's compiler warnings will never cover all possible ways to get UB.** +Let me explain why. 
+ +Rust is the only language I have seen that checks for all UB when compiling the code. +Practically, this means that: +- you need to convince the Rust compiler that your code does not have UB, and **it is hard** +- the Rust programming language has various complicated things that let programmers communicate UB-related things to the compiler (e.g. lifetime annotations) +- sometimes you see `unsafe { ... }`, which basically disables Rust's compile-time checks. + +I don't want any of this in Jou. +I want Jou to be a simple, straightforward and small language, a lot like C. +Also, making a Rust-like language is much harder, +so if I tried to turn Jou into something similar to Rust, it would never be as good as Rust. +On the other hand, many people get annoyed with various things in C, +so it makes sense to create a new C-like programming language. + +That said, I think Rust is a great choice if you need something fast and correct, +and you have a lot of time and patience to learn a new language. +For example, I have written [catris](https://catris.net/) in Rust. + +If you want to eventually learn Rust, +I recommend first learning a language that makes you deal with UB, such as C or Jou. +This way you will appreciate how the Rust compiler makes it impossible to cause UB by accident. +Otherwise you will probably end up hating the Rust compiler (and hence the Rust programming language), +because the compiler complains "too much" about your code. +I have seen this happen to several people. diff --git a/doctest.sh b/doctest.sh index 05c99791..32e45a53 100755 --- a/doctest.sh +++ b/doctest.sh @@ -48,10 +48,11 @@ mkdir -p tmp/doctest for file in "${files[@]}"; do echo "Extracting doctests from $file..." 
- mkdir tmp/doctest/"$(basename "$file")" + temp_dir="tmp/doctest/$(echo -n "$file" | base64)" # make it possible to display file path later + mkdir "$temp_dir" for start_marker_lineno in $(grep -n '^```python$' "$file" | cut -d: -f1); do - outfile="tmp/doctest/$(basename "$file")/$((start_marker_lineno + 1)).jou" + outfile="$temp_dir/$((start_marker_lineno + 1)).jou" awk -v n=$start_marker_lineno '(/^```$/ && line > n) { stop=1 } (++line > n && !stop) { print }' "$file" > "$outfile" # Do not test if there is no expected output/errors @@ -66,10 +67,12 @@ nfail=0 cd tmp/doctest for file in */*.jou; do - echo "${file%.*}" | tr '/' ':' # foo.md/123.jou --> foo.md:123 + # print file and line number, as in "doc/foo.md:123: " + echo -n "$(basename "$(dirname "$file")" | base64 -d):$(basename "$file" | cut -d'.' -f1): " + cp "$file" test.jou if diff --text -u --color=always <(generate_expected_output test.jou | tr -d '\r') <( ("$jou" test.jou 2>&1 || true) | tr -d '\r'); then - echo " ok" + echo "ok" else ((nfail++)) || true fi diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index cd867f8c..7edc1fa3 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -309,7 +309,7 @@ class Tokenizer: if c == '\0' or c == '\n': break if c == '\'': - fail(location, "single quotes are for a single character, maybe use double quotes to instead make a string?") + fail(location, "single quotes are for specifying a byte, maybe use double quotes to instead make a string?") fail(location, "missing ' to end the byte literal") return c diff --git a/src/tokenize.c b/src/tokenize.c index 3e6b53f1..c7d6cbed 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -335,7 +335,7 @@ static char read_char_literal(struct State *st) if (len == 0) fail_with_error(st->location, "a byte literal cannot be empty, maybe use double quotes to instead make a string?"); if (len >= 2) - fail_with_error(st->location, "single quotes are for a single character, maybe use double quotes 
to instead make a string?"); + fail_with_error(st->location, "single quotes are for specifying a byte, maybe use double quotes to instead make a string?"); char result = s[0]; free(s); return result; diff --git a/tests/syntax_error/overlong_char.jou b/tests/syntax_error/overlong_char.jou index 79958763..4d00b36f 100644 --- a/tests/syntax_error/overlong_char.jou +++ b/tests/syntax_error/overlong_char.jou @@ -1,2 +1,2 @@ def main() -> int: - return 'hi' # Error: single quotes are for a single character, maybe use double quotes to instead make a string? + return 'hi' # Error: single quotes are for specifying a byte, maybe use double quotes to instead make a string? diff --git a/tests/syntax_error/utf8_multibyte_char.jou b/tests/syntax_error/utf8_multibyte_char.jou new file mode 100644 index 00000000..4993fccf --- /dev/null +++ b/tests/syntax_error/utf8_multibyte_char.jou @@ -0,0 +1,2 @@ +def main() -> int: + return 'ö' # Error: single quotes are for specifying a byte, maybe use double quotes to instead make a string?