forked from indeyets/syck
-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.BYTECODE
484 lines (465 loc) · 18.8 KB
/
README.BYTECODE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
#
# Reflects Oren's comments, adds yamlbyte.h at the bottom
#
subject: Revision #4 of YAML Bytecodes
summary: >
This proposal defines a 'preparsed' format where a YAML syntax
is converted into a series of events, as bytecodes. Each bytecode
appears on its own line, starting with a single character and ending
with a line feed character, '\n'.
codes:
#
# Primary Bytecodes (Capital Letters)
#
# These bytecodes form the minimum needed to represent YAML information
# from the serial model (ie, without format and comments)
#
'D':
name: Document
desc: >
Indicates that a document has begun, either it is
the beginning of a YAML stream, or a --- has been
found. Thus, an empty document is expressed
as "D\n"
'V':
name: Directive
desc: >
This represents any YAML directives immediately following
a 'D' bytecode. For example '--- %YAML:1.0' produces the
bytecode "D\nVYAML:1.0\n".
'P':
name: Pause Stream
desc: >
This is the instruction when a document is terminated, but
another document has not yet begun. Thus, it is optional,
and typically used to pause parsing. For example,
a stream starting with an empty document, but then in a
hold state for the next document would be: "D\nP\n"
'\z':
name: Finish (end stream)
desc: >
YAML bytecodes are meant to be passable as a single "C"
string, and thus the null terminator can optionally be
used to signal the end of a stream. When writing bytecodes
out to a flat file, the file need not contain a null
terminator; however, when read into memory it should
always have a null terminator.
'M':
name: Mapping
desc: >
Indicates the beginning of a mapping; children of the
mapping are provided as a series of K1,V1,K2,V2
pairs as they are found in the input stream. For
example, the bytecodes for "{ a: b, c: d }" would
be "M\nSa\nSb\nSc\nSd\nE\n"
'Q':
name: Sequence
desc: >
Indicates the beginning of a sequence; children are provided
following until an 'E' bytecode is encountered. So, the
bytecodes for "[ one, two ]" would be "Q\nSone\nStwo\nE\n"
'E':
name: End Collection
desc: >
This closes the innermost open Collection (Mapping, Sequence),
note that the document has one and only one node following
it, therefore it is not a branch.
'S':
name: Scalar
desc: >
This indicates the start of a scalar value, which can
be continued by the 'N' and 'C' bytecodes. This bytecode
is used for sequence entries, keys, values, etc.
'C':
name: Scalar Continuation
desc: >
Since a scalar may not fit within a buffer, and since it
may not contain a \n character, it may have to be broken
into several chunks.
'N':
name: Normalized New Line (in a scalar value)
desc: >
Scalar values must be chunked so that new lines and
null values do not occur within a 'S' or 'C' bytecode
(in the bytecodes, all other C0 need not be escaped).
This bytecode is then used to represent one or more
newlines, with the number of newlines optionally
following. For example,
"Hello\nWorld" would be "SHello\nN\nCWorld\n", and
"Hello\n\n\nWorld" is "SHello\nN3\nCWorld\n"
If the new line is an LS or a PS, the N bytecode can
be followed with a L or P. Thus, "Hello\PWorld\L" is
reported "SHello\nNP\nCWorld\nNL\n"
'Z':
name: Null Character (in a scalar value)
desc: >
As in normalized new lines above, since the null character
cannot be used in the bytecodes, it must be escaped, ie,
"Hello\zWorld" would be "SHello\nZ\nCWorld\n".
'A':
name: Alias
desc: >
This is used whenever there is an alias node, for
example, "[ &X one, *X ]" would be normalized
to "Q\nAX\nSone\nRX\nE\n" -- in this example, the
anchor bytecode applies to the very next content
bytecode.
'R':
name: Reference (Anchor)
desc: >
This bytecode associates an anchor with the very next
content node, see the 'A' alias bytecode.
'T':
name: Transfer
desc: >
This is the transfer method. If the value begins with
a '!', then it is not normalized. Otherwise, the value
is a fully qualified URL, with a colon. The transfer
method applies only to the node immediately following,
and thus it can be seen as a modifier like the anchor.
For example, "Ttag:yaml.org,2002:str\nSstring\n" is
normalized, "T!str\nSstring\n" is not.
#
# Formatting bytecodes (lower case)
#
# The following bytecodes are purely at the syntax level and
# useful for pretty printers and emitters. Since the range of
# lower case letters is contiguous, it could be easy for a
# processor to simply ignore all bytecodes in this range.
#
'c':
name: Comment
desc: >
This is a single line comment. It is terminated like all
of the other variable length items, with a '\n'.
'i':
name: Indent
desc: >
Specifies number of additional spaces to indent for
subsequent block style nodes, "i4\n" specifies 4 char indent.
's':
name: Scalar styling
desc: >
This bytecode is followed with one of the following
items to indicate the style to be used for the very
next content node. It is an error to specify a style for
a scalar other than double quoted when it must be escaped.
Furthermore, there must be agreement between the style
and the very next content node, in other words, a scalar
style requires that the next content node be an S.
> flow scalar
" double quoted scalar
' single quoted scalar
| literal scalar
p plain scalar
{ inline mapping
[ inline sequence
b block style (for mappings and sequences)
#
# Advanced bytecodes (not alphabetic)
#
# These are optional goodies which one could find useful.
#
'#':
name: Line Number
desc: >
This bytecode allows the line number of the very next
node to be reported.
'!':
name: Notice
desc: >
This is a message sent from the producer to the consumer
regarding the state of the stream or document. It does
not necessarily end a stream, as the 'finish' bytecode can
be used for this purpose. This signal has a packed format,
with the error number, a comma, and a textual message:
"#22\n!73,Indentation mismatch\n"
"#132\n!84,Tabs are illegal for indentation\n"
',':
name: Span
desc: >
This bytecode gives the span of the very next 'S', 'M',
or 'Q' bytecode -- including its subordinates. For scalars,
it includes the span of all subordinate 'N' and 'C' codes.
For mappings or sequences, this gives the length all the
way to the corresponding 'E' bytecode so that the entire
branch can be skipped. The length is given starting at
the corresponding 'S', 'M' or 'Q' bytecode and extends
to the first character following subordinate nodes.
Since this length instruction is meant to be used to 'speed'
things up, and since calculating the length via hand is not
really ideal, the length is expressed in Hex. This will allow
programs to easily convert the length to an actual value
(converting from hex to integers is easier than decimal).
Furthermore, all leading x's are ignored (so that they can
be filled in later) and if the bytecode value is all x's,
then the length is unknown. Lastly, this length is expressed
in 8 bit units for UTF-8, and 16 bit units for UTF-16.
For example,
--- [[one, two], three]
Is expressed as,
",25\nD\n,x1E\nQ\n,xxE\nQ\nSone\nStwo\nE\nSthree\nE\n"
Thus it is seen that the address of D plus 37 is the null
terminator for the string, the first 'Q' plus 30 also
gives the null terminator, and the second 'Q' plus
14 jumps to the opening 'S' for the third scalar.
'@':
name: Allocate
desc: >
This is a hint telling the processor how many items
are in the following collection (mapping pairs, or
sequence values), or how many character units need
to be allocated to hold the next value. Clearly this
is an encoding-specific value. The length which
follows is in hex (not decimal).
For example, "one", could be "@x3\nSone"
design:
-
name: streaming support
problem: >
The interface should ideally allow for a YAML document to be
moved incrementally as a stream through a process. In particular,
YAML is inherently line oriented, thus the interface should
probably reflect this fundamental character.
solution: >
The bytecodes deliver scalars as chunks, each chunk limited to
at most one line. While this is not ideal for passing large
binary objects, it is simple and easy to understand.
-
name: push
problem: >
The most common 'parsers' out there for YAML are push style, where
the producer owns the 'C' program stack, and the consumer keeps
its state as a heap object. Ideal use of a push interface is an
emitter, since this allows the sender (the application program)
to use the program stack and thus keep its state on the call stack
in local, automatic variables.
solution: >
A push interface simply can call a single event handler with a
(bytecode, payload) tuple. Since the core complexity is in the
bytecodes, the actual function signature is straight-forward
allowing for relative language independence. Since the bytecode
is always one character, the event handler could just receive
a string where the tuple is implicit.
-
name: pull
problem: >
The other alternative for a streaming interface is a 'pull' mechanism,
or iterator model where the consumer owns the C stack and the producer
keeps any state needed as a heap object. Ideal use of a pull
interface is a parser, since this allows the receiver (the application
program) to use the program stack, keeping its state on the call stack
in local variables.
solution: >
A pull interface would also be a simple function, that when called
fills a buffer with binary node(s). Or, in a language with
garbage collection, could be implemented as an iterator returning
a string containing the bytecode line (bytecode followed immediately
by the bytecode argument as a single string) or as a tuple.
-
name: pull2push
problem: >
This is done easily via a small loop which pulls from the
iterator and pushes to the event handler.
solution: >
For python, assuming the parser is implemented as an iterator
where one can 'pull' bytecode, args tuples, and assuming the
emitter has a event callback taking a bytecode, args tuple,
we have:
def pull2push(parser, emitter):
for (bytecode, args) in parser:
emitter.push(bytecode, args)
-
name: push2pull
problem: >
This requires the entire YAML stream be cached in memory, or
each of the two stages in a thread or different continuation
with shared memory or pipe between them.
solution: >
This use case seems much easier with a binary stream; that is,
one need not convert the style of functions between the push
vs pull pattern. And, for languages supporting continuations,
(ruby) perhaps push vs pull is not even an issue... for a
language like python, one would use the threaded Queue object,
one thread pushes (bytecode, args) tuples into the Queue, while
the other thread pulls the tuples out. Simple.
-
name: neutrality
problem: >
It would be ideal if the C Program interface was simple enough
to be independent of programming language. In an ideal case,
imagine a flow of YAML structured data through various processing
stages on a server; where each processing stage is written in
a different programming language.
solution: >
While it may be hard for each language to write a syntax parser
filled with all of the little details, it would be much much
easier to write a parser for these bytecodes; as it involves
simple string handling, dispatching on the first character in
each string.
-
name: tools
problem: >
A goal of mine is to have a YPATH expression language, a schema
language, and a transformation language. I would like these items
to be reusable by a great number of platforms/languages, and in
particular as its own callable processing stage.
solution: >
If such an expression language was written on top of a bytecode
format like this, via a simple pull function (/w adapters for
push2pull and pull2push) quite a bit of reusability could emerge.
Imagine a schema validator which is injected into the bytecode stream
and it is an identity operation unless an exception occurs, in
which case, it terminates the document and makes the next document
be a description of the validation error.
-
name: encoding
problem: >
Text within the bytecode format must be given an encoding. There are
several considerations at hand listed below.
solution: >
The YAML bytecode format uses the same encodings as YAML itself,
and thus is independent of actual encoding. A parser library should
have several functions to convert between the encodings.
examples:
-
yaml: |
---
- plain
- >
this is a flow scalar
- >
another flow scalar which is continued
on a second line and indented 2 spaces
- &001 !str |
This is a block scalar, both typed
and anchored
- *001 # this was an alias
- "This is a \"double quoted\" scalar"
bytecode: |
D
Q
Splain
f
Sthis is a flow scalar
Sanother flow scalar which is continued
Con a second line and indented 2 spaces
b
a001
t!str
SThis is a block scalar, both typed
N
Cand anchored
R001
cthis was an alias
d
SThis is a "double quoted" scalar
E
cheader: |
/* yamlbyte.h
*
* The YAML bytecode "C" interface header file. See the YAML bytecode
* reference for bytecode sequence rules and for the meaning of each
* bytecode.
*/
#ifndef YAMLBYTE_H
#define YAMLBYTE_H
#include <stddef.h>
/*
 * The YAML bytecodes. Each enumerator's value is the single character that
 * begins the corresponding line of a bytecode stream, so a raw byte read
 * from the stream can be compared against these constants directly.
 */
typedef enum {
    /* content bytecodes */
    YAML_FINISH     = 0,    /* null terminator, signals end of stream */
    YAML_DOCUMENT   = 'D',  /* a document has begun */
    YAML_DIRECTIVE  = 'V',  /* directive following a 'D' bytecode */
    YAML_PAUSE      = 'P',  /* document ended, next one not yet begun */
    YAML_MAPPING    = 'M',  /* begin mapping; K1,V1,K2,V2... follow */
    YAML_SEQUENCE   = 'Q',  /* begin sequence (must differ from YAML_SCALAR) */
    YAML_ENDMAPSEQ  = 'E',  /* end of a mapping or sequence */
    YAML_SCALAR     = 'S',  /* start of a scalar value */
    YAML_CONTINUE   = 'C',  /* continuation chunk of a scalar */
    YAML_NEWLINE    = 'N',  /* normalized new line(s) inside a scalar */
    YAML_NULLCHAR   = 'Z',  /* null character inside a scalar */
    /* NOTE(review): the worked examples in this document use 'A' for the
     * anchor and 'R' for the alias; the names below follow the codes table.
     * Confirm which reading is intended. */
    YAML_ALIAS      = 'A',
    YAML_ANCHOR     = 'R',
    YAML_TRANSFER   = 'T',  /* transfer method for the next node */

    /* formatting bytecodes */
    YAML_COMMENT    = 'c',  /* single line comment */
    YAML_INDENT     = 'i',  /* additional spaces to indent */
    YAML_STYLE      = 's',  /* style for the next content node */

    /* other bytecodes */
    YAML_LINENUMBER = '#',  /* line number of the next node */
    YAML_NOTICE     = '!',  /* message from producer to consumer */
    YAML_SPAN       = ',',  /* length of the next 'S', 'M' or 'Q' branch */
    YAML_ALLOC      = '@'   /* allocation hint for the next node */
} yaml_code_t;
/*
 * Modifiers that follow a YAML_STYLE ('s') bytecode and select the style of
 * the very next content node. Each value is the literal character that
 * appears after the 's' in the bytecode stream.
 */
typedef enum {
    YAML_FLOW            = '>',   /* flow scalar */
    YAML_LITERAL         = '|',   /* literal scalar */
    YAML_BLOCK           = 'b',   /* block style (mappings and sequences) */
    YAML_PLAIN           = 'p',   /* plain scalar */
    YAML_INLINE_MAPPING  = '{',   /* inline mapping */
    YAML_INLINE_SEQUENCE = '[',   /* inline sequence */
    YAML_SINGLE_QUOTED   = '\'',  /* single quoted scalar (ASCII 39) */
    YAML_DOUBLE_QUOTED   = '"'    /* double quoted scalar */
} yaml_style_t;
/* Code-unit types for the two supported encodings. */
typedef unsigned char  yaml_utf8_t;
typedef unsigned short yaml_utf16_t;

/*
 * yaml_char_t is the code unit used throughout the interface. Exactly one
 * of YAML_UTF8 or YAML_UTF16 must be defined before this point; defining
 * both, or neither, is rejected at compile time.
 */
#if defined(YAML_UTF8) && defined(YAML_UTF16)
#error Must only define YAML_UTF8 or YAML_UTF16
#elif defined(YAML_UTF8)
typedef yaml_utf8_t yaml_char_t;
#elif defined(YAML_UTF16)
typedef yaml_utf16_t yaml_char_t;
#else
#error Must define YAML_UTF8 or YAML_UTF16
#endif
/*
 * Result of a push call: lets the consumer tell the producer whether it
 * wants further events.
 */
typedef enum {
    YAML_STOP = 0,  /* producer should stop firing events */
    YAML_MORE = 1   /* producer should continue to fire events */
} yaml_more_t;
/*
 * Push interface: a producer hands one bytecode at a time to a consumer.
 *
 *   self    opaque consumer handle
 *   code    the bytecode being delivered
 *   arg     the bytecode's argument, null terminated
 *   arglen  length of arg
 *
 * Returns YAML_MORE to keep receiving events, YAML_STOP to end them.
 */
typedef void *yaml_consumer_t;

typedef yaml_more_t (*yaml_push_t)(yaml_consumer_t self,
                                   yaml_code_t code,
                                   const yaml_char_t *arg,
                                   size_t arglen);
/*
 * Pull interface: each call fetches the next bytecode from a producer.
 *
 *   self      opaque producer handle
 *   code      receives the bytecode
 *   buff      receives the bytecode's argument; the producer must null
 *             terminate it (callers supply at least a 1K buffer)
 *   buffsize  capacity of buff
 *
 * Returns how much of buff was used, counted in yaml_char_t units.
 */
typedef void *yaml_producer_t;

typedef size_t (*yaml_pull_t)(yaml_producer_t self,
                              yaml_code_t *code,
                              yaml_char_t *buff,
                              size_t buffsize);
/*
 * Canonical helper showing how to hook up a parser (as a pull producer) to
 * an emitter (as a push consumer).
 *
 *   pull      function pulling the next bytecode from `producer`
 *   producer  handle passed as the first argument to `pull`
 *   push      function delivering one bytecode to `consumer`
 *   consumer  handle passed as the first argument to `push`
 *
 * Bytecodes are pulled and pushed one at a time until the producer reports
 * YAML_FINISH or the consumer returns YAML_STOP; a YAML_FINISH is then
 * pushed to the consumer unconditionally.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a
 * single statement that executes exactly once. The expansion calls
 * memset() and assert(), so the including file must also include
 * <string.h> and <assert.h>.
 */
#define YAML_PULL2PUSH(pull, producer, push, consumer)                 \
    do {                                                               \
        yaml_code_t code = YAML_NOTICE;                                \
        yaml_more_t more = YAML_MORE;                                  \
        yaml_char_t buff[1024];                                        \
        size_t size = 0;                                               \
        memset(buff, 0, 1024 * sizeof(yaml_char_t));                   \
        while (code && more) {                                         \
            size = (pull)((producer), &code, buff, 1024);              \
            assert(size < 1024 && !buff[size]);                        \
            more = (push)((consumer), code, buff, size);               \
        }                                                              \
        buff[0] = 0;                                                   \
        (push)((consumer), YAML_FINISH, buff, 0);                      \
    } while (0)
#endif