diff --git a/examples/expr.c.par b/examples/expr.c.par index e615216..65f408e 100644 --- a/examples/expr.c.par +++ b/examples/expr.c.par @@ -1,3 +1,4 @@ +// täst %!language C; %whitespaces ' \t'; diff --git a/src/parse.c b/src/parse.c index 30aeee0..1cfca0b 100644 --- a/src/parse.c +++ b/src/parse.c @@ -3,7 +3,7 @@ DO NOT EDIT THIS FILE MANUALLY, IT WILL GO AWAY! */ -#if 1694 == 0 +#if 1701 == 0 #include #include #include @@ -23,8 +23,9 @@ struct _rhs_item #define MALLOC_STEP 255 #endif -#define UNICC_GETINPUT \ +#define UNICC_GETCHAR( pcb ) \ *pcb->src ? *(pcb->src++) : *pcb->src + #define UNICC_PARSE_ERROR( pcb ) \ parse_error( pcb ) @@ -1524,8 +1525,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -1544,9 +1545,9 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 140, 3, 310, 136, 3, 318, 132, 3, 326, + 120, 3, 350, 128, 3, 334, 124, 3, 342, - 120, 3, 350, 116, 3, 358, 112, 3, 366, 108, 3, 374, @@ -1593,8 +1594,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -1614,9 +1615,9 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 140, 3, 310, 136, 3, 318, 132, 3, 326, + 120, 3, 350, 128, 3, 334, 124, 3, 342, - 120, 3, 350, 116, 3, 358, 112, 3, 366, 108, 3, 374, @@ -1770,8 +1771,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -1900,8 +1901,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -2412,15 +2413,15 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 148, 3, 294, 138, 3, 314, 130, 3, 330, - 122, 3, 346, 114, 3, 362, + 122, 3, 346, 106, 3, 378, 98, 3, 394, 90, 3, 410, + 79, 3, 434, 86, 3, 423, 83, 3, 424, 81, 3, 430, - 79, 3, 434, 78, 3, 
438, 77, 3, 440, 75, 3, 446, @@ -2445,16 +2446,16 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 160, 3, 270, 157, 3, 276, 153, 3, 284, - 142, 3, 306, - 134, 3, 322, 126, 3, 338, + 142, 3, 306, 118, 3, 354, + 134, 3, 322, 110, 3, 370, - 102, 3, 386, 94, 3, 402, + 102, 3, 386, 87, 3, 418, - 82, 3, 428, 76, 3, 444, + 82, 3, 428, 74, 3, 448, 72, 3, 452, 70, 3, 456, @@ -2589,8 +2590,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -2606,16 +2607,16 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 160, 3, 270, 157, 3, 276, 153, 3, 284, - 142, 3, 306, - 134, 3, 322, 126, 3, 338, + 142, 3, 306, 118, 3, 354, + 134, 3, 322, 110, 3, 370, - 102, 3, 386, 94, 3, 402, + 102, 3, 386, 87, 3, 418, - 82, 3, 428, 76, 3, 444, + 82, 3, 428, 74, 3, 448, 72, 3, 452, 70, 3, 456, @@ -2645,8 +2646,8 @@ UNICC_STATIC int _go[ 158 ][ 28 * 3 + 1 ] = 117, 3, 356, 111, 3, 368, 109, 3, 372, - 103, 3, 384, 101, 3, 388, + 103, 3, 384, 95, 3, 400, 93, 3, 404 }, @@ -16560,67 +16561,61 @@ UNICC_STATIC int _alloc_stack( _pcb* pcb ) return 0; } -#ifndef UNICC_GETINPUT +#ifndef UNICC_GETCHAR +#define UNICC_GETCHAR( pcb ) getchar() +#endif #if UNICC_UTF8 -static int offsets_utf8[ 6 ] = +UNICC_STATIC UNICC_CHAR _get_char( _pcb* pcb ) { - 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL -}; + unsigned char first = UNICC_GETCHAR( pcb ); -static int trailbyte_utf8[ 256 ] = -{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; + if ((first & 0x80) == 0) + { + // Single-byte ASCII character + return first; + } + else if ((first & 0xE0) == 0xC0) + { + // Two-byte sequence (110xxxxx 10xxxxxx) + unsigned char second = UNICC_GETCHAR( pcb ); + return ((first & 0x1F) << 6) | (second & 0x3F); + } + else if ((first & 0xF0) == 0xE0) + { + // Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) + unsigned char bytes[2]; -UNICC_STATIC UNICC_CHAR _utf8_getchar( int (*getfn)() ) -{ - UNICC_CHAR ch = 0; - int nb; - int c; - - if( !getfn ) - getfn = getchar; - - switch( ( nb = trailbyte_utf8[ ( c = (*getfn)() ) ] ) ) - { - case 3: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 2: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 1: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 0: - ch += c; - break; + bytes[0] = UNICC_GETCHAR( pcb ); + bytes[1] = UNICC_GETCHAR( pcb ); + + return + ((first & 0x0F) << 12) + | ((bytes[0] & 0x3F) << 6) + | (bytes[1] & 0x3F) + ; } + else if ((first & 0xF8) == 0xF0) + { + // Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + unsigned char bytes[3]; - ch -= offsets_utf8[ nb ]; -#if UNICC_DEBUG > 3 - fprintf( stderr, "%s: getchar: %d\n", UNICC_PARSER, ch ); -#endif - return ch; -} -#define UNICC_GETINPUT _utf8_getchar( getchar ) + bytes[0] = UNICC_GETCHAR( pcb ); + bytes[1] = UNICC_GETCHAR( pcb ); + bytes[2] = UNICC_GETCHAR( pcb ); -#else -#define UNICC_GETINPUT getchar() -#endif + return + ((first & 0x07) << 18) + | ((bytes[0] & 0x3F) << 12) + | ((bytes[1] & 0x3F) << 6) + | (bytes[2] & 0x3F) + ; + } + return -1; // Invalid UTF-8 sequence +} +#else +#define _get_char( pcb ) UNICC_GETCHAR( pcb ) #endif UNICC_STATIC UNICC_CHAR _get_input( _pcb* pcb, unsigned int offset ) @@ -16669,7 +16664,7 @@ UNICC_STATIC UNICC_CHAR _get_input( _pcb* pcb, unsigned int offset ) pcb->bufend = pcb->buf + size; } - if( pcb->is_eof || ( *( pcb->bufend ) = (UNICC_CHAR)UNICC_GETINPUT ) + if( pcb->is_eof || ( *( pcb->bufend ) = 
_get_char( pcb ) ) == pcb->eof ) { #if UNICC_DEBUG > 2 @@ -17231,7 +17226,7 @@ int _parse( _pcb* pcb ) break; case 12: { - #line 132 "src/parse.par" + #line 133 "src/parse.par" if( !( parser->p_template ) ) parser->p_template = pstrdup( strbuf ); @@ -17245,13 +17240,13 @@ int _parse( _pcb* pcb ) break; case 13: { - #line 160 "src/parse.par" + #line 161 "src/parse.par" parser->p_mode = MODE_SCANNERLESS; ; } break; case 14: { - #line 162 "src/parse.par" + #line 163 "src/parse.par" parser->p_mode = MODE_SCANNER; ; } break; @@ -17277,7 +17272,7 @@ int _parse( _pcb* pcb ) break; case 19: { - #line 174 "src/parse.par" + #line 175 "src/parse.par" LIST* l; SYMBOL* s; @@ -17307,7 +17302,7 @@ int _parse( _pcb* pcb ) break; case 20: { - #line 201 "src/parse.par" + #line 202 "src/parse.par" LIST* l; SYMBOL* s; @@ -17336,7 +17331,7 @@ int _parse( _pcb* pcb ) break; case 21: { - #line 227 "src/parse.par" + #line 228 "src/parse.par" LIST* l; SYMBOL* s; @@ -17364,32 +17359,32 @@ int _parse( _pcb* pcb ) break; case 22: { - #line 252 "src/parse.par" + #line 253 "src/parse.par" set_assoc_and_prec( ( ( pcb->tos - 0 )->value.value_1 ), ASSOC_LEFT ); ; } break; case 23: { - #line 256 "src/parse.par" + #line 257 "src/parse.par" set_assoc_and_prec( ( ( pcb->tos - 0 )->value.value_1 ), ASSOC_RIGHT ); ; } break; case 24: { - #line 260 "src/parse.par" + #line 261 "src/parse.par" set_assoc_and_prec( ( ( pcb->tos - 0 )->value.value_1 ), ASSOC_NOASSOC ); ; } break; case 25: { - #line 263 "src/parse.par" + #line 264 "src/parse.par" parser->p_prefix = pstrcatstr( parser->p_prefix, strbuf, FALSE ); ; } break; case 26: { - #line 267 "src/parse.par" + #line 268 "src/parse.par" if( !( parser->p_def_action ) ) parser->p_def_action = pstrdup( strbuf ); else @@ -17402,7 +17397,7 @@ int _parse( _pcb* pcb ) break; case 27: { - #line 277 "src/parse.par" + #line 278 "src/parse.par" if( !( parser->p_def_action_e ) ) parser->p_def_action_e = pstrdup( strbuf ); else @@ -17415,7 +17410,7 @@ int _parse( _pcb* 
pcb ) break; case 28: { - #line 287 "src/parse.par" + #line 288 "src/parse.par" if( !( parser->p_def_type ) ) parser->p_def_type = create_vtype( parser, (char*)( ( pcb->tos - 0 )->value.value_3 ) ); @@ -17431,7 +17426,7 @@ int _parse( _pcb* pcb ) break; case 29: { - #line 300 "src/parse.par" + #line 301 "src/parse.par" /* Ouput a warning, if this definition is effectless */ if( parser->p_mode != MODE_SCANNERLESS ) @@ -17448,40 +17443,40 @@ int _parse( _pcb* pcb ) break; case 30: { - #line 314 "src/parse.par" + #line 315 "src/parse.par" parser->p_cis_strings = !( ( pcb->tos - 0 )->value.value_0 ); ; } break; case 31: { - #line 317 "src/parse.par" + #line 318 "src/parse.par" parser->p_reserve_regex = !( ( pcb->tos - 0 )->value.value_0 ); ; } break; case 32: { - #line 320 "src/parse.par" + #line 321 "src/parse.par" parser->p_header = pstrcatstr( parser->p_header, strbuf, FALSE ); ; } break; case 33: { - #line 324 "src/parse.par" + #line 325 "src/parse.par" parser->p_footer = pstrcatstr( parser->p_footer, strbuf, FALSE ); ; } break; case 34: { - #line 327 "src/parse.par" + #line 328 "src/parse.par" parser->p_pcb = pstrcatstr( parser->p_pcb, strbuf, FALSE ); ; } break; case 35: { - #line 332 "src/parse.par" + #line 333 "src/parse.par" char* src; char* filename = strbuf; @@ -17511,37 +17506,37 @@ int _parse( _pcb* pcb ) break; case 37: { - #line 356 "src/parse.par" + #line 357 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 38: { - #line 359 "src/parse.par" + #line 360 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 39: { - #line 360 "src/parse.par" + #line 361 "src/parse.par" pcb->ret.value_0 = TRUE; ; } break; case 40: { - #line 364 "src/parse.par" + #line 365 "src/parse.par" pcb->ret.value_1 = list_push( ( ( pcb->tos - 1 )->value.value_1 ), (void*)( ( pcb->tos - 0 )->value.value_2 ) ); ; } break; case 41: { - #line 366 "src/parse.par" + #line 367 "src/parse.par" pcb->ret.value_1 = list_push( (LIST*)NULL, (void*)( ( pcb->tos - 0 )->value.value_2 ) 
); ; } break; case 42: { - #line 370 "src/parse.par" + #line 371 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, strbuf, SYM_NON_TERMINAL, TRUE ); pcb->ret.value_2->defined = TRUE; @@ -17551,19 +17546,19 @@ int _parse( _pcb* pcb ) break; case 43: { - #line 378 "src/parse.par" + #line 379 "src/parse.par" pcb->ret.value_1 = list_push( ( ( pcb->tos - 1 )->value.value_1 ), (void*)( ( pcb->tos - 0 )->value.value_2 ) ); ; } break; case 44: { - #line 380 "src/parse.par" + #line 381 "src/parse.par" pcb->ret.value_1 = list_push( (LIST*)NULL, (void*)( ( pcb->tos - 0 )->value.value_2 ) ); ; } break; case 45: { - #line 384 "src/parse.par" + #line 385 "src/parse.par" pcb->ret.value_1 = list_push( ( ( pcb->tos - 1 )->value.value_1 ), (void*)( ( pcb->tos - 0 )->value.value_2 ) ); ( ( pcb->tos - 0 )->value.value_2 )->derived_from = @@ -17573,7 +17568,7 @@ int _parse( _pcb* pcb ) break; case 46: { - #line 390 "src/parse.par" + #line 391 "src/parse.par" pcb->ret.value_1 = list_push( (LIST*)NULL, (void*)( ( pcb->tos - 0 )->value.value_2 ) ); @@ -17583,7 +17578,7 @@ int _parse( _pcb* pcb ) break; case 47: { - #line 398 "src/parse.par" + #line 399 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, strbuf, SYM_REGEX_TERMINAL, TRUE ); @@ -17602,31 +17597,31 @@ int _parse( _pcb* pcb ) break; case 48: { - #line 414 "src/parse.par" + #line 415 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 49: { - #line 415 "src/parse.par" + #line 416 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 50: { - #line 416 "src/parse.par" + #line 417 "src/parse.par" pcb->ret.value_0 = TRUE; ; } break; case 51: { - #line 417 "src/parse.par" + #line 418 "src/parse.par" pcb->ret.value_0 = TRUE; ; } break; case 52: { - #line 425 "src/parse.par" + #line 426 "src/parse.par" LIST* l; LIST* m; @@ -17707,7 +17702,7 @@ int _parse( _pcb* pcb ) break; case 55: { - #line 495 "src/parse.par" + #line 496 "src/parse.par" SYMBOL* primary; SYMBOL* s; @@ -17797,61 +17792,61 @@ int _parse( _pcb* pcb ) 
break; case 60: { - #line 563 "src/parse.par" + #line 564 "src/parse.par" greedy = TRUE; ; } break; case 61: { - #line 565 "src/parse.par" + #line 566 "src/parse.par" greedy = FALSE; ; } break; case 62: { - #line 569 "src/parse.par" + #line 570 "src/parse.par" pcb->ret.value_0 = TRUE; ; } break; case 63: { - #line 571 "src/parse.par" + #line 572 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 64: { - #line 575 "src/parse.par" + #line 576 "src/parse.par" pcb->ret.value_1 = list_push( ( ( pcb->tos - 2 )->value.value_1 ), ( ( pcb->tos - 0 )->value.value_4 ) ); ; } break; case 65: { - #line 578 "src/parse.par" + #line 579 "src/parse.par" pcb->ret.value_1 = list_push( (LIST*)NULL, ( ( pcb->tos - 0 )->value.value_4 ) ); ; } break; case 66: { - #line 582 "src/parse.par" + #line 583 "src/parse.par" pcb->ret.value_3 = pstrdup( strbuf ); ; } break; case 67: { - #line 584 "src/parse.par" + #line 585 "src/parse.par" pcb->ret.value_3 = pstrdup( strbuf ); ; } break; case 68: { - #line 586 "src/parse.par" + #line 587 "src/parse.par" pcb->ret.value_3 = (char*)NULL; ; } break; case 69: { - #line 592 "src/parse.par" + #line 593 "src/parse.par" ( ( pcb->tos - 3 )->value.value_4 )->line = ( ( pcb->tos - 4 )->value.value_5 ); @@ -17896,7 +17891,7 @@ int _parse( _pcb* pcb ) break; case 75: { - #line 610 "src/parse.par" + #line 611 "src/parse.par" pcb->ret.value_4 = current_prod = create_production( parser, (SYMBOL*)NULL ); @@ -17905,13 +17900,13 @@ int _parse( _pcb* pcb ) break; case 76: { - #line 618 "src/parse.par" + #line 619 "src/parse.par" current_prod->prec = ( ( pcb->tos - 0 )->value.value_2 )->prec; ; } break; case 77: { - #line 624 "src/parse.par" + #line 625 "src/parse.par" append_to_production( ( ( pcb->tos - 2 )->value.value_4 ), ( ( pcb->tos - 1 )->value.value_2 ), ( *strbuf == '\0' ? 
(char*)NULL : pstrdup( strbuf ) ) ); @@ -17921,7 +17916,7 @@ int _parse( _pcb* pcb ) break; case 78: { - #line 632 "src/parse.par" + #line 633 "src/parse.par" pcb->ret.value_4 = current_prod = create_production( parser, (SYMBOL*)NULL ); @@ -17934,7 +17929,7 @@ int _parse( _pcb* pcb ) break; case 79: { - #line 644 "src/parse.par" + #line 645 "src/parse.par" switch( ( ( pcb->tos - 0 )->value.value_5 ) ) { @@ -17956,7 +17951,7 @@ int _parse( _pcb* pcb ) break; case 80: { - #line 664 "src/parse.par" + #line 665 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, P_ERROR_RESYNC, SYM_SYSTEM_TERMINAL, TRUE ); @@ -17965,7 +17960,7 @@ int _parse( _pcb* pcb ) break; case 81: { - #line 671 "src/parse.par" + #line 672 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, P_END_OF_FILE, SYM_SYSTEM_TERMINAL, TRUE ); @@ -17979,7 +17974,7 @@ int _parse( _pcb* pcb ) break; case 83: { - #line 681 "src/parse.par" + #line 682 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, strbuf, SYM_NON_TERMINAL, TRUE ); pcb->ret.value_2->used = TRUE; @@ -17991,7 +17986,7 @@ int _parse( _pcb* pcb ) break; case 84: { - #line 692 "src/parse.par" + #line 693 "src/parse.par" char temp_nonterm[ ONE_LINE + 1 ]; PROD* prod; @@ -18037,7 +18032,7 @@ int _parse( _pcb* pcb ) break; case 85: { - #line 736 "src/parse.par" + #line 737 "src/parse.par" pcb->ret.value_4 = current_prod; current_prod = (PROD*)NULL; ; @@ -18045,7 +18040,7 @@ int _parse( _pcb* pcb ) break; case 86: { - #line 743 "src/parse.par" + #line 744 "src/parse.par" pccl* ccl; @@ -18066,7 +18061,7 @@ int _parse( _pcb* pcb ) break; case 87: { - #line 762 "src/parse.par" + #line 763 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, strbuf, SYM_REGEX_TERMINAL, TRUE ); @@ -18087,7 +18082,7 @@ int _parse( _pcb* pcb ) break; case 88: { - #line 781 "src/parse.par" + #line 782 "src/parse.par" pcb->ret.value_2 = get_symbol( parser, strbuf, SYM_REGEX_TERMINAL, TRUE ); @@ -18102,25 +18097,25 @@ int _parse( _pcb* pcb ) break; case 89: { - 
#line 794 "src/parse.par" + #line 795 "src/parse.par" pcb->ret.value_5 = (int)'*'; ; } break; case 90: { - #line 797 "src/parse.par" + #line 798 "src/parse.par" pcb->ret.value_5 = (int)'+'; ; } break; case 91: { - #line 800 "src/parse.par" + #line 801 "src/parse.par" pcb->ret.value_5 = (int)'?'; ; } break; case 92: { - #line 803 "src/parse.par" + #line 804 "src/parse.par" pcb->ret.value_5 = 0; ; } break; @@ -18136,7 +18131,7 @@ int _parse( _pcb* pcb ) break; case 95: { - #line 809 "src/parse.par" + #line 810 "src/parse.par" reset_strbuf(); ; } break; @@ -18147,7 +18142,7 @@ int _parse( _pcb* pcb ) break; case 97: { - #line 818 "src/parse.par" + #line 819 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_alt( ( ( pcb->tos - 2 )->value.value_6 ), ( ( pcb->tos - 0 )->value.value_6 ), (pregex_ptn*)NULL ); @@ -18161,7 +18156,7 @@ int _parse( _pcb* pcb ) break; case 99: { - #line 829 "src/parse.par" + #line 830 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_seq( ( ( pcb->tos - 1 )->value.value_6 ), ( ( pcb->tos - 0 )->value.value_6 ), (pregex_ptn*)NULL ); @@ -18175,7 +18170,7 @@ int _parse( _pcb* pcb ) break; case 101: { - #line 841 "src/parse.par" + #line 842 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_kle( ( ( pcb->tos - 1 )->value.value_6 ) ); ; @@ -18183,7 +18178,7 @@ int _parse( _pcb* pcb ) break; case 102: { - #line 847 "src/parse.par" + #line 848 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_pos( ( ( pcb->tos - 1 )->value.value_6 ) ); ; @@ -18191,7 +18186,7 @@ int _parse( _pcb* pcb ) break; case 103: { - #line 853 "src/parse.par" + #line 854 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_opt( ( ( pcb->tos - 1 )->value.value_6 ) ); ; @@ -18204,7 +18199,7 @@ int _parse( _pcb* pcb ) break; case 105: { - #line 863 "src/parse.par" + #line 864 "src/parse.par" pccl* ccl; @@ -18218,7 +18213,7 @@ int _parse( _pcb* pcb ) break; case 106: { - #line 875 "src/parse.par" + #line 876 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_string( 
strbuf, 0 ); ; @@ -18226,7 +18221,7 @@ int _parse( _pcb* pcb ) break; case 107: { - #line 881 "src/parse.par" + #line 882 "src/parse.par" pccl* ccl; greedy = FALSE; @@ -18241,7 +18236,7 @@ int _parse( _pcb* pcb ) break; case 108: { - #line 894 "src/parse.par" + #line 895 "src/parse.par" pcb->ret.value_6 = pregex_ptn_create_sub( ( ( pcb->tos - 1 )->value.value_6 ) ); ; @@ -18274,13 +18269,13 @@ int _parse( _pcb* pcb ) break; case 114: { - #line 908 "src/parse.par" + #line 909 "src/parse.par" pcb->ret.value_0 = FALSE; ; } break; case 115: { - #line 912 "src/parse.par" + #line 913 "src/parse.par" pcb->ret.value_0 = TRUE; ; } break; @@ -18296,19 +18291,19 @@ int _parse( _pcb* pcb ) break; case 118: { - #line 921 "src/parse.par" + #line 922 "src/parse.par" reset_strbuf(); ; } break; case 119: { - #line 925 "src/parse.par" + #line 926 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 120: { - #line 928 "src/parse.par" + #line 929 "src/parse.par" strbuf_append( (char)'\\' ); strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); @@ -18317,37 +18312,37 @@ int _parse( _pcb* pcb ) break; case 121: { - #line 934 "src/parse.par" + #line 935 "src/parse.par" pcb->ret.value_0 = TRUE ; } break; case 122: { - #line 935 "src/parse.par" + #line 936 "src/parse.par" pcb->ret.value_0 = FALSE ; } break; case 123: { - #line 939 "src/parse.par" + #line 940 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 124: { - #line 940 "src/parse.par" + #line 941 "src/parse.par" reset_strbuf(); ; } break; case 125: { - #line 944 "src/parse.par" + #line 945 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 126: { - #line 947 "src/parse.par" + #line 948 "src/parse.par" strbuf_append( (char)'\\' ); strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; @@ -18355,25 +18350,25 @@ int _parse( _pcb* pcb ) break; case 127: { - #line 953 "src/parse.par" + #line 954 "src/parse.par" pcb->ret.value_3 = 
pstrdup( strbuf ); ; } break; case 128: { - #line 954 "src/parse.par" + #line 955 "src/parse.par" pcb->ret.value_3 = (char*)NULL; ; } break; case 129: { - #line 958 "src/parse.par" + #line 959 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 130: { - #line 959 "src/parse.par" + #line 960 "src/parse.par" reset_strbuf(); ; } break; @@ -18384,7 +18379,7 @@ int _parse( _pcb* pcb ) break; case 132: { - #line 966 "src/parse.par" + #line 967 "src/parse.par" reset_strbuf(); strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); @@ -18393,7 +18388,7 @@ int _parse( _pcb* pcb ) break; case 133: { - #line 974 "src/parse.par" + #line 975 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; @@ -18414,13 +18409,13 @@ int _parse( _pcb* pcb ) break; case 137: { - #line 986 "src/parse.par" + #line 987 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 138: { - #line 988 "src/parse.par" + #line 989 "src/parse.par" reset_strbuf(); strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); @@ -18429,7 +18424,7 @@ int _parse( _pcb* pcb ) break; case 139: { - #line 996 "src/parse.par" + #line 997 "src/parse.par" pcb->ret.value_3 = pstrdup( ( ( pcb->tos - 0 )->value.value_3 ) ); reset_strbuf(); @@ -18438,13 +18433,13 @@ int _parse( _pcb* pcb ) break; case 140: { - #line 1003 "src/parse.par" + #line 1004 "src/parse.par" pcb->ret.value_3 = strbuf; ; } break; case 141: { - #line 1005 "src/parse.par" + #line 1006 "src/parse.par" reset_strbuf(); pcb->ret.value_3 = (char*)NULL; @@ -18453,7 +18448,7 @@ int _parse( _pcb* pcb ) break; case 142: { - #line 1012 "src/parse.par" + #line 1013 "src/parse.par" if( !parser->p_template ) { @@ -18470,7 +18465,7 @@ int _parse( _pcb* pcb ) break; case 143: { - #line 1027 "src/parse.par" + #line 1028 "src/parse.par" last_code_begin = pcb->line; ; } break; @@ -18481,19 +18476,19 @@ int _parse( _pcb* pcb ) break; case 145: { - #line 1031 "src/parse.par" + #line 1032 
"src/parse.par" reset_strbuf(); ; } break; case 146: { - #line 1035 "src/parse.par" + #line 1036 "src/parse.par" strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); ; } break; case 147: { - #line 1038 "src/parse.par" + #line 1039 "src/parse.par" reset_strbuf(); strbuf_append( ( ( pcb->tos - 0 )->value.value_5 ) ); @@ -18532,7 +18527,7 @@ int _parse( _pcb* pcb ) break; case 160: { - #line 1057 "src/parse.par" + #line 1058 "src/parse.par" pcb->ret.value_5 = ( ( pcb->tos - 0 )->value.value_5 ); ; @@ -18550,7 +18545,7 @@ int _parse( _pcb* pcb ) break; case 163: { - #line 1066 "src/parse.par" + #line 1067 "src/parse.par" pcb->ret.value_5 = pcb->line; ; } break; diff --git a/src/parse.par b/src/parse.par index 7ea7984..ed61ead 100644 --- a/src/parse.par +++ b/src/parse.par @@ -7,7 +7,7 @@ * Parser configuration */ #whitespaces whitespace; -#lexeme terminal identifier modifier code ccl_string kw type; +#lexeme terminal identifier modifier code ccl_string kw bkw type; #lexeme separation on; #default action [* @@ = @1; *]; #default epsilon action [* @@ = 0; *]; @@ -16,6 +16,7 @@ pboolean main; char* filename; char* src; + char* capture; *]; /* @@ -33,14 +34,22 @@ struct @@prefix_rhs_item char* ident; }; + + #ifndef MALLOC_STEP #define MALLOC_STEP 255 #endif -#define UNICC_GETINPUT \ - *pcb->src ? *(pcb->src++) : *pcb->src -#define UNICC_PARSE_ERROR( pcb ) \ - parse_error( pcb ) +/* +int echo_char(char c) { + printf(">%c< (%d)\n", c, c); + return c; +} + +#define UNICC_GETCHAR( pcb ) echo_char( *pcb->src ? *(pcb->src++) : *pcb->src ) +*/ +#define UNICC_GETCHAR( pcb ) ( *pcb->src ? 
*(pcb->src++) : *pcb->src ) +#define UNICC_PARSE_ERROR( pcb ) parse_error( pcb ) extern int error_count; @@ -52,39 +61,12 @@ static SYMBOL* current_sym = (SYMBOL*)NULL; static BOOLEAN greedy = TRUE; static PARSER* parser; -char* strbuf; -char* regex; - -/* Append character to current string */ -static void strbuf_append( char ch ) -{ - int len; - - len = pstrlen( strbuf ); - - if( !strbuf ) - strbuf = (char*)pmalloc( ( MALLOC_STEP + 2 ) * sizeof( char ) ); - else if( len % MALLOC_STEP == 0 ) - strbuf = (char*)prealloc( (char*)strbuf, ( len + MALLOC_STEP + 2 ) - * sizeof( char ) ); - - strbuf[len] = ch; - strbuf[len+1] = '\0'; - strbuf[len+2] = '\0'; -} - - -/* Create a new string */ -static void reset_strbuf( void ) -{ - if( strbuf ) - { - *strbuf = '\0'; - *(strbuf+1) = '\0'; - } +char* ustrndup( char* origin, char* start, size_t n ) { + char* ret = pstrndup(start, n); + fprintf( stderr, "%s = >%s<\n", origin, ret ); + return ret; } - /* Set precedence and associativiy */ static void set_assoc_and_prec( LIST* symbols, int assoc ) { @@ -131,7 +113,7 @@ fixed_directive : "mode" mode_type | "language" string_or_ident [* if( !( parser->p_template ) ) - parser->p_template = pstrdup( strbuf ); + parser->p_template = @2; else if( !pcb->main ) print_error( parser, ERR_DIRECTIVE_ALREADY_USED, ERRSTYLE_WARNING | ERRSTYLE_FILEINFO, @@ -144,7 +126,7 @@ fixed_directive : "mode" mode_type | "character universe" integer [* - int universe = atoi( strbuf ); + int universe = @2; if( universe > 0 ) parser->p_universe = universe; @@ -260,12 +242,11 @@ directive_parms : "whitespaces" symbol_list [* set_assoc_and_prec( @2, ASSOC_NOASSOC ); *] | "prefix" string - [* parser->p_prefix = pstrcatstr( - parser->p_prefix, strbuf, FALSE ); *] + [* parser->p_prefix = @2 *] | "default action" code_opt [* if( !( parser->p_def_action ) ) - parser->p_def_action = pstrdup( strbuf ); + parser->p_def_action = @2; else print_error( parser, ERR_DIRECTIVE_ALREADY_USED, ERRSTYLE_WARNING | 
ERRSTYLE_FILEINFO, @@ -274,8 +255,8 @@ directive_parms : "whitespaces" symbol_list *] | "default epsilon action" code_opt - [* if( !( parser->p_def_action_e ) ) - parser->p_def_action_e = pstrdup( strbuf ); + [* if( !( parser->p_def_action_e ) ) + parser->p_def_action_e = @2; else print_error( parser, ERR_DIRECTIVE_ALREADY_USED, ERRSTYLE_WARNING | ERRSTYLE_FILEINFO, @@ -317,37 +298,30 @@ directive_parms : "whitespaces" symbol_list [* parser->p_reserve_regex = !@2; *] | "prologue" code - [* parser->p_header = pstrcatstr( - parser->p_header, strbuf, FALSE ); *] + [* parser->p_header = @2 *] | "epilogue" code - [* parser->p_footer = pstrcatstr( - parser->p_footer, strbuf, FALSE ); *] + [* parser->p_footer = @2 *] | "pcb" code - [* parser->p_pcb = pstrcatstr( - parser->p_pcb, strbuf, FALSE ); *] + [* parser->p_pcb = @2 *] - | "extends" string + | "extends" string:filename [* char* src; - char* filename = strbuf; - - strbuf = NULL; - if( !pfiletostr( &src, filename ) ) + if( !pfiletostr( &src, @filename ) ) { print_error( parser, ERR_OPEN_INPUT_FILE, - ERRSTYLE_FATAL, filename ); + ERRSTYLE_FATAL, @filename ); } else { - parse_grammar( parser, filename, src ); - strbuf = NULL; + parse_grammar( parser, @filename, src ); pfree( src ); } - pfree( filename ); + pfree( @filename ); *] ; @@ -366,11 +340,13 @@ symbol_list : symbol_list sym [* @@ = list_push( (LIST*)NULL, (void*)@1 ); *] ; -lhs : identifier - [* @@ = get_symbol( parser, - strbuf, SYM_NON_TERMINAL, TRUE ); +lhs : identifier + [* + @@ = get_symbol( parser, @identifier, SYM_NON_TERMINAL, TRUE ); @@->defined = TRUE; @@->line = pcb->line; + + pfree( @identifier ); *] ; @@ -394,20 +370,22 @@ alt_regex_sym: alt_regex_sym regex_sym *] ; -regex_sym : identifier - [* @@ = get_symbol( parser, - strbuf, SYM_REGEX_TERMINAL, TRUE ); +regex_sym : identifier + [* + @@ = get_symbol( parser, @identifier, SYM_REGEX_TERMINAL, TRUE ); if( @@->defined ) { print_error( parser, ERR_DOUBLE_TERMINAL_DEF, ERRSTYLE_FATAL | 
ERRSTYLE_FILEINFO, pcb->filename, pcb->line, - @@->name ); + @identifier ); } @@->defined = TRUE; @@->line = pcb->line; + + pfree( @identifier ) *] ; @@ -514,21 +492,9 @@ definition : lhs:primary l = list_next( l ) ) { s = (SYMBOL*)list_access( l ); - - /* - Last symbol gets strbuf-pointer, - all other assignments need to be - duplicated. - */ - if( list_next( l ) ) - s->code = pstrdup( strbuf ); - else - s->code = strbuf; - + s->code = @code_opt; primary->code_at = last_code_begin; } - - strbuf = (char*)NULL; } /* Value type */ @@ -579,14 +545,14 @@ productions : productions '|' production ; ast_node : '=' identifier - [* @@ = pstrdup( strbuf ); *] + [* @@ = @identifier; *] | '=' string - [* @@ = pstrdup( strbuf ); *] + [* @@ = @string *] | [* @@ = (char*)NULL; *] ; -production : line_number rhs_opt:rhs code_opt_dup:act +production : line_number rhs_opt:rhs code_opt:act ast_node prod_directives* [* @@ -621,10 +587,9 @@ prod_directives: '#%' "precedence" terminal rhs : rhs symbol access_name - [* append_to_production( @1, @2, - ( *strbuf == '\0' ? (char*)NULL : - pstrdup( strbuf ) ) ); - @@ = @1; + [* + append_to_production( @rhs, @symbol, @access_name ); + @@ = @1; *] | symbol access_name @@ -633,9 +598,7 @@ rhs : rhs symbol access_name create_production( parser, (SYMBOL*)NULL ); - append_to_production( @@, @1, - ( *strbuf == '\0' ? 
(char*)NULL : - pstrdup( strbuf ) ) ); + append_to_production( @@, @symbol, @access_name); *] ; @@ -678,12 +641,14 @@ sym : terminal | identifier - [* @@ = get_symbol( parser, - strbuf, SYM_NON_TERMINAL, TRUE ); + [* + @@ = get_symbol( parser, @identifier, SYM_NON_TERMINAL, TRUE ); @@->used = TRUE; if( @@->line < 0 ) @@->line = pcb->line; + + pfree( @identifier ); *] //Embedded productions @@ -741,14 +706,7 @@ stack_cur_prod terminal : ccl [* - pccl* ccl; - - ccl = pccl_create( -1, -1, strbuf ); - if( @1 ) - pccl_negate( ccl ); - - @@ = get_symbol( parser, (void*)ccl, - SYM_CCL_TERMINAL, TRUE ); + @@ = get_symbol( parser, (void*)@ccl, SYM_CCL_TERMINAL, TRUE ); @@->defined = TRUE; @@->used = TRUE; @@ -760,15 +718,33 @@ terminal : ccl | kw [* - @@ = get_symbol( parser, - strbuf, SYM_REGEX_TERMINAL, TRUE ); + @@ = get_symbol( parser, @kw, SYM_REGEX_TERMINAL, TRUE ); @@->used = TRUE; @@->defined = TRUE; @@->keyword = TRUE; - @@->emit = @kw ? pstrdup( strbuf ) : NULL; - @@->ptn = pregex_ptn_create_string( strbuf, + @@->ptn = pregex_ptn_create_string( @kw, + parser->p_cis_strings ? + PREGEX_COMP_INSENSITIVE : 0 ); + + if( @@->line < 0 ) + @@->line = pcb->line; + + pfree( @kw ) + *] + + | bkw + + [* + @@ = get_symbol( parser, @bkw, SYM_REGEX_TERMINAL, TRUE ); + + @@->used = TRUE; + @@->defined = TRUE; + @@->keyword = TRUE; + @@->emit = @bkw; + + @@->ptn = pregex_ptn_create_string( @bkw, parser->p_cis_strings ? PREGEX_COMP_INSENSITIVE : 0 ); @@ -778,8 +754,8 @@ terminal : ccl | '@' identifier - [* @@ = get_symbol( parser, - strbuf, SYM_REGEX_TERMINAL, TRUE ); + [* + @@ = get_symbol( parser, @identifier, SYM_REGEX_TERMINAL, TRUE ); /* @@->defined = TRUE; DO NOT SET DEFINED! 
*/ @@ -787,6 +763,8 @@ terminal : ccl if( @@->line < 0 ) @@->line = pcb->line; + + pfree( @identifier ); *] ; @@ -804,9 +782,9 @@ modifier : '*' ; -access_name : ':' identifier - | ':' string_single - | [* reset_strbuf(); *] +access_name : ':' identifier [* @@ = @identifier *] + | ':' string [* @@ = @string *] + | [* @@ = NULL *] ; /* Regular Expression parser and NFA generator */ @@ -861,19 +839,15 @@ re_factor : ccl [* - pccl* ccl; - - ccl = pccl_create( -1, -1, strbuf ); - if( @1 ) - pccl_negate( ccl ); - - @@ = pregex_ptn_create_char( ccl ); + @@ = pregex_ptn_create_char( @ccl ); + pfree( @ccl ) *] | kw [* - @@ = pregex_ptn_create_string( strbuf, 0 ); + @@ = pregex_ptn_create_string( @kw, 0 ); + pfree( @kw ) *] | '.' @@ -897,118 +871,99 @@ re_factor ; /* General parsing objects */ -string : string_single+ - ; -string_single : ccl_string | kw +string : ccl_string | kw ; -ccl : ccl_string +ccl : ccl_string - [* @@ = FALSE; *] + [* + @@ = pccl_create( -1, -1, @ccl_string ); + pfree(@ccl_string); + *] | '!' ccl_string - [* @@ = TRUE; *] + [* + @@ = pccl_create( -1, -1, @ccl_string ); + pccl_negate( @@ ); + pfree(@ccl_string); + *] ; /* ------------------------------------- TODO: Must be re-designed... 
--- */ -ccl_string : '\'' ccl_str '\''; +ccl_string + : '\'' ccl_str '\'' + [* @@ = ustrndup( "ccl_string", pcb->capture, pcb->src - pcb->capture - 2 ) *] + ; -ccl_str : ccl_str ccl_char - | - [* reset_strbuf(); *] +ccl_str : ccl_str ccl_char + | [* pcb->capture = pcb->src - 2; *] ; -ccl_char : !'\\\'' - [* strbuf_append( @1 ); *] +ccl_char : !'\\\'' + | '\\' !'\0' + ; - | '\\' !'\0' - [* - strbuf_append( (char)'\\' ); - strbuf_append( @2 ); - *] +kw : '\"' kw_str '\"' + [* @@ = ustrndup( "kw", pcb->capture, pcb->src - pcb->capture - 2 ) *] ; -kw : '\"' '\"' kw_str '\"' '\"' [* @@ = TRUE *] - | '\"' kw_str '\"' [* @@ = FALSE *] +bkw : '\"' '\"' kw_str '\"' '\"' + [* @@ = ustrndup( "bkw", pcb->capture, pcb->src - pcb->capture - 3 ) *] ; kw_str : kw_str kw_char - [* strbuf_append( @2 ); *] - | [* reset_strbuf(); *] + | [* pcb->capture = pcb->src - 2; *] ; -kw_char : !'\\"' - [* strbuf_append( @1 ); *] - - | '\\' !'\0' - [* strbuf_append( (char)'\\' ); - strbuf_append( @2 ); - *] +kw_char : !'\\"' + | '\\' !'\0' ; type : '<' type_str '>' - [* @@ = pstrdup( strbuf ); *] + [* @@ = ustrndup( "type", pcb->capture, pcb->src - pcb->capture - 2 ); *] | [* @@ = (char*)NULL; *] ; type_str : type_str !'>' - [* strbuf_append( @2 ); *] - | [* reset_strbuf(); *] + | [* pcb->capture = pcb->src - 2; *] ; -identifier : identifier_start identifier_follow +identifier + : identifier_start identifier_follow + [* @@ = ustrndup( "identifier", pcb->capture, pcb->src - pcb->capture - 2 ) *] ; identifier_start: 'A-Za-z_' - [* - reset_strbuf(); - strbuf_append( @1 ); - *] + [* pcb->capture = pcb->src - 3; *] ; identifier_follow : identifier_follow 'A-Za-z0-9_' - [* strbuf_append( @2 ); *] | ; - -string_or_ident : string +string_or_ident + : string | identifier ; /* ------------------------------------- TODO: ...until here --- */ integer : integer '0-9' - [* strbuf_append( @2 ); *] - | '0-9' - [* - reset_strbuf(); - strbuf_append( @1 ); - *] - ; + [* @@ = atoi( pcb->capture ) *] -code_opt_dup - 
: code_opt - [* - @@ = pstrdup( @code_opt ); - reset_strbuf(); - *] + | '0-9' + [* pcb->capture = pcb->src - 1; *] ; code_opt : code - [* @@ = strbuf; *] - | - [* - reset_strbuf(); - @@ = (char*)NULL; - *] + | [* @@ = NULL *] ; -code : code_begin inner_code_opt "*]" +code : "[*" line_number inner_code_opt "*]" [* if( !parser->p_template ) { @@ -1016,35 +971,26 @@ code : code_begin inner_code_opt "*]" ERR_NO_TARGET_TPL_SUPPLY, ERRSTYLE_WARNING | ERRSTYLE_IMPORTANT | ERRSTYLE_FILEINFO, - pcb->filename, last_code_begin ); - - reset_strbuf(); + pcb->filename, @line_number ); } - *] - ; -code_begin : "[*" - [* last_code_begin = pcb->line; *] + @@ = ustrndup( "code", pcb->capture, pcb->src - pcb->capture - 2 - 1 ) + *] ; inner_code_opt : inner_code - | [* reset_strbuf(); *] + | ; inner_code : inner_code anychar - [* strbuf_append( @2 ); *] - | anychar - [* - reset_strbuf(); - strbuf_append( @1 ); - *] + [* pcb->capture = pcb->src - 2; *] ; whitespace : ' ' | '\t' | "/*" comment? "*/" - | "//" scomment? '\n' + | "//" ( !'\n' )* '\n' | '\r' | '\n' ; @@ -1054,13 +1000,6 @@ comment : comment anychar ; anychar : !'\0' - [* - @@ = @1; - *] - ; - -scomment : scomment !'\n' - | !'\n' ; line_number : [* @@ = pcb->line; *] @@ -1128,11 +1067,7 @@ int parse_grammar( PARSER* p, char* filename, char* src ) if( p && src ) { parser = p; - strbuf_append( '\0' ); - @@prefix_parse( &pcb ); - - pfree( strbuf ); } return pcb.error_count + error_count; diff --git a/targets/C.source/C.xml b/targets/C.source/C.xml index 9c530b8..b389a0b 100644 --- a/targets/C.source/C.xml +++ b/targets/C.source/C.xml @@ -765,7 +765,9 @@ between the rows/columns, except the last row/column. 
%%%include fn.stack.c -%%%include fn.getchar.c +#ifndef UNICC_GETCHAR +#define UNICC_GETCHAR( pcb ) getchar() +#endif %%%include fn.getinput.c diff --git a/targets/C.source/Makefile b/targets/C.source/Makefile index e43e99c..e1e0358 100644 --- a/targets/C.source/Makefile +++ b/targets/C.source/Makefile @@ -9,7 +9,6 @@ SOURCE = \ fn.clearin.c \ fn.debug.c \ fn.getact.c \ - fn.getchar.c \ fn.getgo.c \ fn.getinput.c \ fn.getsym.c \ @@ -36,4 +35,3 @@ $(C_XML): $(SOURCE) $(C_XML_SRC) $(MKPARSER) clean: -$(RM) $(C_XML) - diff --git a/targets/C.source/fn.getchar.c b/targets/C.source/fn.getchar.c deleted file mode 100644 index b174512..0000000 --- a/targets/C.source/fn.getchar.c +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef UNICC_GETINPUT - -#if UNICC_UTF8 -static int offsets_utf8[ 6 ] = -{ - 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL -}; - -static int trailbyte_utf8[ 256 ] = -{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -UNICC_STATIC UNICC_CHAR @@prefix_utf8_getchar( int (*getfn)() ) -{ - UNICC_CHAR ch = 0; - int nb; - int c; - - if( !getfn ) - getfn = getchar; - - switch( ( nb = trailbyte_utf8[ ( c = (*getfn)() ) ] ) ) - { - case 3: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 2: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 1: - ch += c; - ch <<= 6; - c = (*getfn)(); - case 0: - ch += c; - break; - } - - ch -= offsets_utf8[ nb ]; -#if UNICC_DEBUG > 3 - fprintf( stderr, "%s: getchar: %d\n", UNICC_PARSER, ch ); -#endif - return ch; -} 
-#define UNICC_GETINPUT @@prefix_utf8_getchar( getchar )
-
-#else
-#define UNICC_GETINPUT getchar()
-#endif
-
-#endif
diff --git a/targets/C.source/fn.getinput.c b/targets/C.source/fn.getinput.c
index b145a2d..35563cd 100644
--- a/targets/C.source/fn.getinput.c
+++ b/targets/C.source/fn.getinput.c
@@ -1,3 +1,59 @@
+#if UNICC_UTF8
+/* Reads one character through UNICC_GETCHAR() and decodes it as UTF-8.
+   Returns the decoded code point, or -1 if the lead byte does not start a
+   1- to 4-byte sequence. Continuation bytes are masked but not validated. */
+UNICC_STATIC UNICC_CHAR @@prefix_get_char( @@prefix_pcb* pcb )
+{
+	unsigned char first = UNICC_GETCHAR( pcb );
+
+	if ((first & 0x80) == 0)
+	{
+		// Single-byte ASCII character (0xxxxxxx)
+		return first;
+	}
+	else if ((first & 0xE0) == 0xC0)
+	{
+		// Two-byte sequence (110xxxxx 10xxxxxx)
+		unsigned char second = UNICC_GETCHAR( pcb );
+		return ((first & 0x1F) << 6) | (second & 0x3F);
+	}
+	else if ((first & 0xF0) == 0xE0)
+	{
+		// Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+		unsigned char bytes[2];
+
+		bytes[0] = UNICC_GETCHAR( pcb );
+		bytes[1] = UNICC_GETCHAR( pcb );
+
+		return
+			((first & 0x0F) << 12)
+			| ((bytes[0] & 0x3F) << 6)
+			| (bytes[1] & 0x3F)
+		;
+	}
+	else if ((first & 0xF8) == 0xF0)
+	{
+		// Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+		unsigned char bytes[3];
+
+		bytes[0] = UNICC_GETCHAR( pcb );
+		bytes[1] = UNICC_GETCHAR( pcb );
+		bytes[2] = UNICC_GETCHAR( pcb );
+
+		return
+			((first & 0x07) << 18)
+			| ((bytes[0] & 0x3F) << 12)
+			| ((bytes[1] & 0x3F) << 6)
+			| (bytes[2] & 0x3F)
+		;
+	}
+
+	return -1; // Invalid UTF-8 sequence
+}
+#else
+#define @@prefix_get_char( pcb ) UNICC_GETCHAR( pcb )
+#endif
+
 UNICC_STATIC UNICC_CHAR @@prefix_get_input( @@prefix_pcb* pcb, unsigned int offset )
 {
 #if UNICC_DEBUG > 2
@@ -44,7 +100,7 @@ UNICC_STATIC UNICC_CHAR @@prefix_get_input( @@prefix_pcb* pcb, unsigned int offs
 		pcb->bufend = pcb->buf + size;
 	}
 
-	if( pcb->is_eof || ( *( pcb->bufend ) = (UNICC_CHAR)UNICC_GETINPUT )
+	if( pcb->is_eof || ( *( pcb->bufend ) = @@prefix_get_char( pcb ) )
 									== pcb->eof )
 	{
 #if UNICC_DEBUG > 2
diff --git a/targets/c.tlt b/targets/c.tlt
index 3f86eaf..9e16fd6 100644
--- a/targets/c.tlt
+++ b/targets/c.tlt
@@ -1019,67 +1019,64 @@ UNICC_STATIC int @@prefix_alloc_stack( @@prefix_pcb* pcb )
 	return 0;
 }
 
-#ifndef UNICC_GETINPUT
+#ifndef UNICC_GETCHAR
+#define UNICC_GETCHAR( pcb ) getchar()
+#endif
 
 #if UNICC_UTF8
-static int offsets_utf8[ 6 ] =
+/* Reads one character through UNICC_GETCHAR() and decodes it as UTF-8.
+   Returns the decoded code point, or -1 if the lead byte does not start a
+   1- to 4-byte sequence. Continuation bytes are masked but not validated. */
+UNICC_STATIC UNICC_CHAR @@prefix_get_char( @@prefix_pcb* pcb )
 {
-	0x00000000UL, 0x00003080UL, 0x000E2080UL,
-	0x03C82080UL, 0xFA082080UL, 0x82082080UL
-};
-
-static int trailbyte_utf8[ 256 ] =
-{
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
-};
+	unsigned char first = UNICC_GETCHAR( pcb );
 
-UNICC_STATIC UNICC_CHAR @@prefix_utf8_getchar( int (*getfn)() )
-{
-	UNICC_CHAR ch = 0;
-	int nb;
-	int c;
+	if ((first & 0x80) == 0)
+	{
+		// Single-byte ASCII character (0xxxxxxx)
+		return first;
+	}
+	else if ((first & 0xE0) == 0xC0)
+	{
+		// Two-byte sequence (110xxxxx 10xxxxxx)
+		unsigned char second = UNICC_GETCHAR( pcb );
+		return ((first & 0x1F) << 6) | (second & 0x3F);
+	}
+	else if ((first & 0xF0) == 0xE0)
+	{
+		// Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+		unsigned char bytes[2];
 
-	if( !getfn )
-		getfn = getchar;
+		bytes[0] = UNICC_GETCHAR( pcb );
+		bytes[1] = UNICC_GETCHAR( pcb );
 
-	switch( ( nb = trailbyte_utf8[ ( c = (*getfn)() ) ] ) )
+		return
+			((first & 0x0F) << 12)
+			| ((bytes[0] & 0x3F) << 6)
+			| (bytes[1] & 0x3F)
+		;
+	}
+	else if ((first & 0xF8) == 0xF0)
 	{
-		case 3:
-			ch += c;
-			ch <<= 6;
-			c = (*getfn)();
-		case 2:
-			ch += c;
-			ch <<= 6;
-			c = (*getfn)();
-		case 1:
-			ch += c;
-			ch <<= 6;
-			c = (*getfn)();
-		case 0:
-			ch += c;
-			break;
+		// Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+		unsigned char bytes[3];
+
+		bytes[0] = UNICC_GETCHAR( pcb );
+		bytes[1] = UNICC_GETCHAR( pcb );
+		bytes[2] = UNICC_GETCHAR( pcb );
+
+		return
+			((first & 0x07) << 18)
+			| ((bytes[0] & 0x3F) << 12)
+			| ((bytes[1] & 0x3F) << 6)
+			| (bytes[2] & 0x3F)
+		;
 	}
 
-	ch -= offsets_utf8[ nb ];
-#if UNICC_DEBUG > 3
-	fprintf( stderr, "%s: getchar: %d\n", UNICC_PARSER, ch );
-#endif
-	return ch;
+	return -1; // Invalid UTF-8 sequence
 }
-#define UNICC_GETINPUT @@prefix_utf8_getchar( getchar )
-
 #else
-#define UNICC_GETINPUT getchar()
-#endif
-
+#define @@prefix_get_char( pcb ) UNICC_GETCHAR( pcb )
 #endif
 
 UNICC_STATIC UNICC_CHAR @@prefix_get_input( @@prefix_pcb* pcb, unsigned int offset )
@@ -1128,7 +1125,7 @@ UNICC_STATIC UNICC_CHAR @@prefix_get_input( @@prefix_pcb* pcb, unsigned int offs
 		pcb->bufend = pcb->buf + size;
 	}
 
-	if( pcb->is_eof || ( *( pcb->bufend ) = (UNICC_CHAR)UNICC_GETINPUT )
+	if( pcb->is_eof || ( *( pcb->bufend ) = @@prefix_get_char( pcb ) )
 									== pcb->eof )
 	{
 #if UNICC_DEBUG > 2