Skip to content

Commit

Permalink
Fix memory issues in lexer and tokenise some extra stuff
Browse files Browse the repository at this point in the history
- changed VAR token to NAME so it doubles as the token for functions (also tokenising those now)
- replaced op tokens with BINARY_OP and UNARY_OP tokens (also parsing unary minus now)
- added null byte to end of string when reading file (lexer didn't properly know when to stop, causing invalid read)
- also made a failed (uncommitted) attempt at a parser
  • Loading branch information
tobyck committed Aug 21, 2024
1 parent e519e5a commit 94e05c1
Show file tree
Hide file tree
Showing 11 changed files with 62 additions and 95 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
build
result
experiments
old
5 changes: 2 additions & 3 deletions examples/a.bas
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
hello there!
this is some
text.
let a=abs(2+3-6)*-4
print a, "test"
3 changes: 1 addition & 2 deletions examples/e.bas
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
reM hello
REm this is a commentntntntttt
1+2*-3
4 changes: 2 additions & 2 deletions examples/s.bas
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
REM this is a remark
10 LET A=(2+3)*-4
REM another remark
LET B="missing end quotes
PRINT A, 567, "th1s i5 string"
LET B = "missing end quotes
PRINT A, 567, "th1s i5 a string"
PRINT invalid token 12.34
3 changes: 1 addition & 2 deletions examples/test.bas
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
REM this is a demo
1 let A=2
1 let A = 2
REM this is another comment
REM and so is this
REM

20 B=A+3*4 Rem 2+3*4=14
LET C=(A + 3) / a REM should be 5/2=2

66 changes: 41 additions & 25 deletions src/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@
char *stringify_token_type(TokenType token_type) {
switch (token_type) {
case TOKEN_LET: return "LET";
case TOKEN_VAR: return "VAR";
case TOKEN_EQ: return "EQ";
case TOKEN_NAME: return "NAME";
case TOKEN_ASSIGN: return "ASSIGN";
case TOKEN_INT: return "INT";
case TOKEN_ADD: return "ADD";
case TOKEN_SUB: return "SUB";
case TOKEN_MUL: return "MUL";
case TOKEN_DIV: return "DIV";
case TOKEN_MOD: return "MOD";
case TOKEN_NEGATE: return "NEGATE";
case TOKEN_BINARY_OP: return "BINARY_OP";
case TOKEN_UNARY_OP: return "UNARY_OP";
case TOKEN_OPEN_PAREN: return "OPEN_PAREN";
case TOKEN_CLOSE_PAREN: return "CLOSE_PAREN";
case TOKEN_PRINT: return "PRINT";
Expand Down Expand Up @@ -173,14 +171,17 @@ inline char peek_nth(Lexer *lexer, size_t n) { return lexer->code[lexer->current
inline char consume(Lexer *lexer) { return lexer->code[lexer->current_index++]; }

void lexer_invalid_token(Lexer *lexer, size_t line, size_t column) {
printf("whaaaat\n");
Token previous_invalid = last_token(lexer->result.invalid);

if (
lexer->result.invalid.length > 0 && // if there's already an invalid token
// and its column + length = the current column, append to previous token
// i.e. this invalid char is directly next to the previous invalid token
previous_invalid.column + strlen(previous_invalid.literal) == column
) append_char(&previous_invalid.literal, consume(lexer));
)
// append the char to the previous invalid token
append_char(&lexer->result.invalid.tokens[lexer->result.invalid.length - 1].literal, consume(lexer));

// otherwise push a new invalid token
else push_token(&lexer->result.invalid, (Token){
TOKEN_INVALID, alloc_char_as_str(consume(lexer)), line, column
Expand Down Expand Up @@ -226,19 +227,38 @@ LexerResult lex(char *code) {

// simple single char tokens

// simple case where a single char is mapped to single token with no additional info
#define simple_token_case(char, type) \
case char: \
push_token(&lexer->result.valid, (Token){ type, NULL, l, c }); \
consume(lexer); \
continue;

switch (peek(lexer)) {
simple_token_case('=', TOKEN_EQ)
simple_token_case('+', TOKEN_ADD)
simple_token_case('-', TOKEN_SUB)
simple_token_case('*', TOKEN_MUL)
simple_token_case('/', TOKEN_DIV)
simple_token_case('%', TOKEN_MOD)
case '+':
case '*':
case '/':
case '%':
push_token(&lexer->result.valid, (Token){
TOKEN_BINARY_OP, alloc_char_as_str(consume(lexer)), l, c
});
continue;
case '-': {
char *literal = alloc_char_as_str(consume(lexer));
switch (last_token(lexer->result.valid).type) {
case TOKEN_CLOSE_PAREN:
case TOKEN_INT:
case TOKEN_NAME:
push_token(&lexer->result.valid, (Token){
TOKEN_BINARY_OP, literal, l, c
});
break;
default:
push_token(&lexer->result.valid, (Token){ TOKEN_UNARY_OP, literal, l, c });
}
continue;
}
simple_token_case('=', TOKEN_ASSIGN)
simple_token_case('(', TOKEN_OPEN_PAREN)
simple_token_case(')', TOKEN_CLOSE_PAREN)
simple_token_case(',', TOKEN_COMMA)
Expand Down Expand Up @@ -306,19 +326,15 @@ LexerResult lex(char *code) {
continue;
}

// vars
// names (vars/functions)

Token previous_token = last_token(lexer->result.valid);
if (
isalpha(peek(lexer)) &&
lexer->current_index > 0 &&
!isalpha(peek_nth(lexer, 1)) &&
!isalpha(peek_nth(lexer, -1))
) {
if (isalpha(peek(lexer))) {
char *name = alloc_char_as_str(consume(lexer));
while (isalpha(peek(lexer)))
append_char(&name, consume(lexer));
push_token(&lexer->result.valid, (Token){
TOKEN_VAR, alloc_char_as_str(toupper(consume(lexer))), l, c
TOKEN_NAME, name, l, c
});
consume(lexer);
continue;
}

Expand Down
12 changes: 5 additions & 7 deletions src/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@

typedef enum {
TOKEN_LET,
TOKEN_VAR,
TOKEN_EQ,
TOKEN_NAME,
TOKEN_ASSIGN,
TOKEN_INT,
TOKEN_ADD,
TOKEN_SUB,
TOKEN_MUL,
TOKEN_DIV,
TOKEN_MOD,
TOKEN_NEGATE,
TOKEN_BINARY_OP,
TOKEN_UNARY_OP,
TOKEN_OPEN_PAREN,
TOKEN_CLOSE_PAREN,
TOKEN_PRINT,
Expand Down
8 changes: 6 additions & 2 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
#include <stdio.h>
#include <stdbool.h>

#include "lexer.h"
#include "utils.h"
#include "lexer.h"
#include "parser.h"

int main(int argc, char *argv[]) {
if (argc > 2) {
Expand All @@ -16,11 +17,14 @@ int main(int argc, char *argv[]) {
char *code = read_file(argv[1]);

LexerResult tokens = lex(code);

print_token_list(tokens.valid);
print_token_list(tokens.invalid);
print_lexer_errors(tokens.errors);

// AST ast = parse(tokens.valid);
// print_ast(ast);
// free_ast(ast);

free(code);
free_lexer_result(tokens);

Expand Down
1 change: 0 additions & 1 deletion src/parser.c
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
#include "parser.h"
50 changes: 0 additions & 50 deletions src/parser.h
Original file line number Diff line number Diff line change
@@ -1,50 +0,0 @@
#ifndef INCLUDE_PARSER_H
#define INCLUDE_PARSER_H

#include <stdlib.h>

/* Kind of a node in the abstract syntax tree. */
typedef enum {
	AST_ASSIGNMENT,
	AST_EXPRESSION,
	AST_STRING,
	AST_PRINT
} ASTNodeType;

/* Binary arithmetic operators used inside expressions. */
typedef enum {
	MATH_OP_ADD,
	MATH_OP_SUB,
	MATH_OP_MUL,
	MATH_OP_DIV,
	MATH_OP_MOD,
} MathOp;

/* Dynamic list of expressions, e.g. the arguments of a PRINT statement.
 * `exprs` points to `length` contiguous elements; owner must free. */
typedef struct {
	struct ASTExpression *exprs;
	size_t length;
} ASTExpressionList;

/* One AST node: `type` selects which union member of `node` is active. */
typedef struct ASTNode {
	ASTNodeType type;
	union {
		/* Binary expression: `op` applied to `lhs` and `rhs`. */
		struct ASTExpression {
			MathOp op;
			struct ASTExpression *lhs;
			struct ASTExpression *rhs;
		} expr;
		/* Assignment of an expression to a single-letter variable. */
		struct ASTAssignment {
			char variable;
			struct ASTExpression *expr;
		} assingment; /* NOTE(review): field name is a typo for "assignment" — kept for source compatibility */
		char *string;               /* string literal payload (AST_STRING) */
		ASTExpressionList print;    /* argument list of a PRINT (AST_PRINT) */
	} node;
} ASTNode;

/* Whole program: a dynamic array of `length` top-level nodes.
 * Fixed: `nodes` was declared `ASTNode nodes;` (a single node by value),
 * which cannot hold `length` elements — it must be a pointer, matching
 * how ASTExpressionList stores its elements. */
typedef struct {
	ASTNode *nodes;
	size_t length;
} AST;

/* Returns an empty AST (no nodes, length 0). Defined in parser.c. */
extern AST new_ast(void);

#endif // INCLUDE_PARSER_H
4 changes: 3 additions & 1 deletion src/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ char *read_file(char *path) {
exit(EXIT_FAILURE);
}

char *buffer = malloc(file_length);
char *buffer = malloc(file_length + 1);

if (buffer == NULL) {
printf("Error: could not allocate buffer for file content\n");
Expand All @@ -37,6 +37,8 @@ char *read_file(char *path) {
size_t amount_read = fread(buffer, 1, file_length, file);
fclose(file);

buffer[file_length] = '\0';

if (amount_read != file_length) {
printf("Error: did not manage to read the whole file.\n");
exit(EXIT_FAILURE);
Expand Down

0 comments on commit 94e05c1

Please sign in to comment.