-
Notifications
You must be signed in to change notification settings - Fork 2
/
Camllexer.mll
508 lines (439 loc) · 20 KB
/
Camllexer.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
(****************************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* INRIA Rocquencourt *)
(* *)
(* Copyright 2006-2010 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed under *)
(* the terms of the GNU Library General Public License, with the special *)
(* exception on linking described in LICENSE at the top of the Objective *)
(* Caml source tree. *)
(* *)
(****************************************************************************)
(* Authors:
* - Xavier Leroy: initial version for OCaml
* - Daniel de Rauglaudre: some parts from Camlp4
* - Nicolas Pouillard: this actual implementation
*)
{
(** A lexical analyzer. *)
open Camltoken
open Located
type 'a iterator = unit -> 'a option
type 'a iterator_list = unit -> 'a list
let flatten_iterator_list next0 =
let queue = ref [] in
let rec next () =
match !queue with
| x :: xs -> queue := xs; Some x
| [] ->
match next0 () with
| [] -> None
| [x] -> Some x
| x :: xs ->
queue := xs;
Some x
in next
let (<.>) f g x = f (g x)
let sf = Printf.sprintf
type flags = { quotations : bool (** Enables the lexing of quotations *)
; antiquotations : bool (** Enables the lexing of anti-quotations *)
; line_directives : bool (** Honor the # line directives *)
}
let default_flags = { quotations = false
; antiquotations = false
; line_directives = true
}
type position = Lexing.position = { pos_fname : string;
pos_lnum : int;
pos_bol : int;
pos_cnum : int;
}
type token = caml_token located
type context =
{ stack : (position * unterminated) list (** Stack of opened constructs *)
; flags : flags (** Lexing flavors *)
; lexbuf : Lexing.lexbuf
; buffer : Buffer.t
}
let default_context lb =
{ stack = []
; flags = default_flags
; lexbuf = lb
; buffer = Buffer.create 256
}
(* To buffer comments, quotations and antiquotations *)
let store c = Buffer.add_string c.buffer (Lexing.lexeme c.lexbuf)
let buff_contents c =
let contents = Buffer.contents c.buffer in
Buffer.reset c.buffer; contents
(* Some projections *)
let quotations c = c.flags.quotations
let antiquots c = c.flags.antiquotations
let line_directives c = c.flags.line_directives
(* Various location/postion related functions *)
let (>>>) p k = { p with pos_cnum = p.pos_cnum + k }
let set_sp c sp = c.lexbuf.Lexing.lex_start_p <- sp
let get_sp c = c.lexbuf.Lexing.lex_start_p
let move_sp shift c = set_sp c (get_sp c >>> shift)
let move_cpos shift c =
c.lexbuf.Lexing.lex_curr_pos <- c.lexbuf.Lexing.lex_curr_pos + shift
(* Update the current location with file name and line number. *)
let update_absolute_position c file line =
let lexbuf = c.lexbuf in
let pos = lexbuf.Lexing.lex_curr_p in
let new_file = match file with
| None -> pos.pos_fname
| Some s -> s
in
lexbuf.Lexing.lex_curr_p <- { pos with
pos_fname = new_file;
pos_lnum = line;
pos_bol = pos.pos_cnum;
}
let update_relative_position c line chars =
let lexbuf = c.lexbuf in
let pos = lexbuf.Lexing.lex_curr_p in
lexbuf.Lexing.lex_curr_p <- { pos with
pos_lnum = pos.pos_lnum + line;
pos_bol = pos.pos_cnum - chars;
}
let update_chars c chars = update_relative_position c 1 chars
(* Given the length of the input and a the get function of
the input. count_newlines returns the number of newlines
and the offset of the last one. *)
let count_newlines len s =
let rec go count nl_off i =
if i >= len then (count, nl_off)
else
match s(i) with
| '\n' -> go (count + 1) i (i + 1)
| '\r' ->
if i + 1 < len && s(i + 1) = '\n' then
go (count + 1) (i + 1) (i + 2)
else
go (count + 1) i (i + 1)
| _ -> go count nl_off (i + 1)
in go 0 0 0
let update_loc c =
let lb = c.lexbuf in
let len = lb.Lexing.lex_curr_pos - lb.Lexing.lex_start_pos in
let newlines, last_newline_offset = count_newlines len (Lexing.lexeme_char lb) in
let chars = len - 1 - last_newline_offset in
if newlines <> 0 then update_relative_position c newlines chars
let parse_with_sp f c =
let sp = get_sp c in
let r = f c c.lexbuf in
set_sp c sp; r
let parse_in frame f c = f { c with stack = (get_sp c, frame) :: c.stack } c.lexbuf
let store_parse f c = store c ; f c c.lexbuf
let parse f c = f c c.lexbuf
let parse' f c () = f c c.lexbuf
let (&) x f = match x with
| [] -> f ()
| us -> us
let unterminated s u = mkERROR s (Unterminated u)
let unterminated1 s u c = unterminated s [(get_sp c, u)]
let illegal_character c = mkERROR (String.make 1 c) (Illegal_character c)
let mkANTIQUOT c sp ?name s = set_sp c sp; mkANTIQUOT ?name s
let mkBLANKS_ s tail =
match s with
| "" -> tail
| s -> mkBLANKS s :: tail
let mkPSYMBOL ?(pre_blanks="") ?(post_blanks="") op =
assert (op <> "");
let may_warn =
if post_blanks = "" && op.[String.length op - 1] = '*'
then [mkWARNING Comment_not_end]
else []
in
mkSYMBOL "(" :: mkBLANKS_ pre_blanks
( mkSYMBOL op
:: mkBLANKS_ post_blanks
( mkSYMBOL ")"
:: may_warn))
let parse_comment comment c =
let sp = get_sp c in
let r = parse_in Ucomment comment c in
let contents = buff_contents c in
set_sp c sp;
match r with
| [] -> mkCOMMENT contents
| us -> unterminated contents us
let parse_quotation quotation c name loc =
let mk contents =
{ q_name = name ;
q_loc = loc ;
q_contents = contents }
in
let sp = get_sp c in
let r = parse_in Uquotation quotation c in
let contents = buff_contents c in
set_sp c sp;
let drop_end n s = String.sub s 0 (String.length s - n) in
match r with
| [] -> let s = contents in
mkQUOTATION (mk (drop_end 2 s))
| us -> unterminated (drop_end 2 (string_of_quotation (mk contents))) us
}
let newline = ('\n' | '\r' | "\r\n")
let blank = [' ' '\t' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let ident = (lowercase|uppercase) identchar*
let not_star_symbolchar =
['$' '!' '%' '&' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '\\']
let symbolchar = '*' | not_star_symbolchar
let quotchar =
['!' '%' '&' '+' '-' '.' '/' ':' '=' '?' '@' '^' '|' '~' '\\' '*']
let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
let hex_literal =
'0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
['0'-'9'] ['0'-'9' '_']*
('.' ['0'-'9' '_']* )?
(['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
let char_literal_no_nl_quote =
( [^ '\\' '\n' '\r' '"']
| '\\' ( ['\\' '"' 'n' 't' 'b' 'r' ' ' '\'']
| ['0'-'9'] ['0'-'9'] ['0'-'9']
| 'x' hexa_char hexa_char ))
let char_litteral = char_literal_no_nl_quote | '"' | newline | ('\\' _)
let char = "'" char_litteral "'"
let string_char = char_literal_no_nl_quote | newline | ('\\' _)
let string = '"' string_char* '"'
let unterminated_string = '"' string_char* '\\'? eof
(* Delimitors are extended (from 3.09) in a conservative way *)
(* These chars that can't start an expression or a pattern: *)
let safe_delimchars = ['%' '&' '/' '@' '^']
(* These symbols are unsafe since "[<", "[|", etc. exsist. *)
let delimchars = safe_delimchars | ['|' '<' '>' ':' '=' '.']
let left_delims = ['(' '[' '{']
let right_delims = [')' ']' '}']
let left_delimitor =
(* At least a safe_delimchars *)
left_delims delimchars* safe_delimchars (delimchars|left_delims)*
(* A '(' or a new super '(' without "(<" *)
| '(' (['|' ':'] delimchars*)?
(* Old brackets, no new brackets starting with "[|" or "[:" *)
| '[' ['|' ':']?
(* Old "[<","{<" and new ones *)
| ['[' '{'] delimchars* '<'
(* Old brace and new ones *)
| '{' (['|' ':'] delimchars*)?
let right_delimitor =
(* At least a safe_delimchars *)
(delimchars|right_delims)* safe_delimchars (delimchars|right_delims)* right_delims
(* A ')' or a new super ')' without ">)" *)
| (delimchars* ['|' ':'])? ')'
(* Old brackets, no new brackets ending with "|]" or ":]" *)
| ['|' ':']? ']'
(* Old ">]",">}" and new ones *)
| '>' delimchars* [']' '}']
(* Old brace and new ones *)
| (delimchars* ['|' ':'])? '}'
rule token c = parse
| '\n' { update_chars c 0; [mkNEWLINE LF] }
| '\r' { update_chars c 0; [mkNEWLINE CR] }
| "\r\n" { update_chars c 0; [mkNEWLINE CRLF] }
| blank + as x { [mkBLANKS x] }
| "~" (lowercase identchar * as x) ':' { [mkLABEL x] }
| "?" (lowercase identchar * as x) ':' { [mkOPTLABEL x] }
| lowercase identchar * as x { [mkLIDENT x] }
| uppercase identchar * as x { [mkUIDENT x] }
| int_literal as i { [mkINT i] }
| float_literal as f { [mkFLOAT f] }
| (int_literal as i) "l" { [mkINT32 i] }
| (int_literal as i) "L" { [mkINT64 i] }
| (int_literal as i) "n" { [mkNATIVEINT i] }
| '"' (string_char* as s) '"' { update_loc c; let x,y = mkSTRING s in x::y }
| unterminated_string as s { update_loc c ;
[unterminated1 s Ustring c] }
| "'" (char_litteral as s) "'" { update_loc c; [mkCHAR s] }
| "(*" { store c; [parse_comment comment c] }
| "(*)" { store c; [mkWARNING Comment_start; parse_comment comment c] }
| "<<" (quotchar* as beginning)
{ if quotations c
then (move_cpos (-String.length beginning) c;
[parse_quotation quotation c "" ""])
else parse (symbolchar_star ("<<" ^ beginning)) c }
| "<<>>"
{ if quotations c
then [mkQUOTATION { q_name = ""; q_loc = ""; q_contents = "" }]
else parse (symbolchar_star "<<>>") c }
| "<@"
{ if quotations c then parse_with_sp left_angle_at c
else parse (symbolchar_star "<@") c }
| "<:"
{ if quotations c then parse_with_sp left_angle_colon c
else parse (symbolchar_star "<:") c }
| "#" ([' ' '\t']* as bl1) ('0'* as zeros) ('0' | ['1'-'9']['0'-'9']* as num)
([' ' '\t']* as bl2) ("\"" ([^ '\n' '\r' '"' ] * as name) "\"")?
([^ '\n' '\r']* as com) (newline as nl)
{ let inum = int_of_string num in
let nl = newline_of_string nl in
if line_directives c then
update_absolute_position c name inum
else
update_chars c 0;
[mkLINE_DIRECTIVE{l_blanks1=bl1;
l_zeros=String.length zeros;
l_linenum=inum;
l_blanks2=bl2;
l_filename=name;
l_comment=com;
l_newline=nl}] }
| '(' (not_star_symbolchar symbolchar* as op) (blank* as post_blanks) ')'
{ mkPSYMBOL ~post_blanks op }
| '(' (blank+ as pre_blanks) (symbolchar+ as op) (blank* as post_blanks) ')'
{ mkPSYMBOL ~pre_blanks ~post_blanks op }
| ( "#" | "`" | "'" | "," | "." | ".." | ":" | "::"
| ":=" | ":>" | ";" | ";;" | "_"
| left_delimitor | right_delimitor ) as x { [mkSYMBOL x] }
| '$' { if antiquots c
then parse (dollar (get_sp c)) c
else parse (symbolchar_star "$") c }
| ['~' '?' '!' '=' '<' '>' '|' '&' '@' '^' '+' '-' '*' '/' '%' '\\'] symbolchar *
as x { [mkSYMBOL x] }
| eof
{ let pos = lexbuf.Lexing.lex_curr_p in
lexbuf.Lexing.lex_curr_p <- { pos with pos_bol = pos.pos_bol + 1 ;
pos_cnum = pos.pos_cnum + 1 }; [] }
| _ as c { [illegal_character c] }
and comment c = parse
"(*" { store c ;
parse_in Ucomment comment c &
parse' comment c }
| "*)" { store c; [] }
| '<' (':' ident)? ('@' ident)? '<'
{ store c;
(if quotations c then parse_in Uquotation quotation c else []) &
parse' comment c }
| ident { store_parse comment c }
| string { update_loc c; store_parse comment c }
| unterminated_string { update_loc c; store c; (get_sp c, Ustring) :: c.stack }
| "''" { store_parse comment c }
| char { update_loc c; store_parse comment c }
| eof { c.stack }
| newline { update_chars c 0; store_parse comment c }
| _ { store_parse comment c }
and symbolchar_star beginning c = parse
| symbolchar* as tok { move_sp (-String.length beginning) c ;
[mkSYMBOL (beginning ^ tok)] }
(* <@ *)
and left_angle_at c = parse
| (ident as loc) '<' { [parse_quotation quotation c "" loc] }
| symbolchar* as tok { [mkSYMBOL("<@" ^ tok)] }
(* <: *)
and left_angle_colon c = parse
| (ident as name) '<' { [parse_quotation quotation c name ""] }
| (ident as name) '@' (ident as loc) '<'
{ [parse_quotation quotation c name loc] }
| symbolchar* as tok { [mkSYMBOL("<:" ^ tok)] }
and quotation c = parse
| '<' (':' ident)? ('@' ident)? '<' { store c ;
parse_in Uquotation quotation c &
parse' quotation c }
| ">>" { store c; [] }
| eof { c.stack }
| newline { update_chars c 0; store_parse quotation c }
| _ { store_parse quotation c }
and dollar sp c = parse
| '$' { [mkANTIQUOT c sp ""] }
| ('`'? (identchar+|'.'+) as name) ':' { parse (antiquot sp name) c }
| newline { update_chars c 0; store_parse (antiquot sp "") c }
| _ { store_parse (antiquot sp "") c }
and antiquot sp name c = parse
| '$' { [mkANTIQUOT c sp ~name (buff_contents c)] }
| eof { set_sp c sp ;
[unterminated1 (sf "$%s:%s" name (buff_contents c)) Uantiquot c] }
| newline { update_chars c 0; store_parse (antiquot sp name) c }
| '<' (':' ident)? ('@' ident)? '<'
{ store c; match parse_in Uquotation quotation c with
| [] -> parse (antiquot sp name) c
| stack -> [unterminated (buff_contents c) stack] }
| _ { store_parse (antiquot sp name) c }
{
let iterator_of_stream s () =
match Stream.peek s with
| Some x -> Stream.junk s; Some x
| None -> None
(* If we doesn't want to block on waiting input,
we can't return more than one element at a time. *)
let lexing_store next buff max =
assert (max > 0);
match next () with
| Some x -> buff.[0] <- x; 1
| _ -> 0
let distribute_positions p0 _pN =
let rec loop pp p = function
| [] -> []
| WARNING Comment_not_end as tok :: toks ->
locate (p >>> (-2)) tok p :: loop pp p toks
| WARNING _ :: _ ->
assert false
| tok :: toks ->
let p' = p >>> token_width tok in
locate p tok p' :: loop p p' toks
in loop p0 p0
let rec distribute_location bpos apos = function
| [] -> []
| [tok] -> [locate bpos tok apos]
| [WARNING Comment_start as t1; COMMENT _ as t2] ->
[locate bpos t1 (bpos >>> 2); locate bpos t2 apos]
| STRING _ as t1 :: warns ->
let rec loop = function
| [] -> []
| WARNING (Illegal_escape_in_string(s, i)) as t1 :: toks ->
(* TODO: Wrong position if the string contains newlines *)
let ppi = bpos >>> i in
locate ppi t1 (ppi >>> 1 + String.length s) :: loop toks
| _ -> assert false
in
locate bpos t1 apos :: loop warns
| toks -> distribute_positions bpos apos toks
(* I do not really know what to do about the ``end of input''.
I see various options:
1/ With an EOI token.
The output stream is infinite and repeats EOI indefinitely
because each time we give eof to the token rule it gives
us EOI.
2/ With an EOI token. The output stream terminates with a single EOI token.
3/ The output stream terminates without outputing any EOI token.
Previously it was 1/, and know it is 3/. Implenting 2/ would require
some state.
*)
let from_context c =
let next_list () =
let toks = parse token c in
let bpos = Lexing.lexeme_start_p c.lexbuf in
let apos = Lexing.lexeme_end_p c.lexbuf in
distribute_location bpos apos toks
in flatten_iterator_list next_list
let from_lexbuf flags pos lb =
lb.Lexing.lex_abs_pos <- pos.pos_cnum;
lb.Lexing.lex_curr_p <- pos;
from_context { (default_context lb) with flags = flags }
let from_string flags pos str =
let lb = Lexing.from_string str in
from_lexbuf flags pos lb
let from_channel flags pos ic =
let lb = Lexing.from_channel ic in
from_lexbuf flags pos lb
let from_iterator flags pos next =
let lb = Lexing.from_function (lexing_store next) in
from_lexbuf flags pos lb
let from_stream flags pos strm =
from_iterator flags pos (iterator_of_stream strm)
}