README.BYTECODE

#
# Reflects Oren's comments, adds yamlbyte.h at the bottom
#
subject: Revision #4 of YAML Bytecodes
summary: >
    This proposal defines a 'preparsed' format where a YAML syntax
    is converted into a series of events, as bytecodes.   Each bytecode
    appears on its own line, starting with a single character and ending
    with a line feed character, '\n'.
codes:
  #
  # Primary Bytecodes  (Capital Letters)
  #
  # These bytecodes form the minimum needed to represent YAML information
  # from the serial model (ie, without format and comments)
  #
    'D':
        name: Document
        desc: >
          Indicates that a document has begun, either it is
          the beginning of a YAML stream, or a --- has been
          found.   Thus, an empty document is expressed
          as "D\n"
    'V':
        name: Directive
        desc: >
          This represents any YAML directives immediately following
          a 'D' bytecode.  For example '--- %YAML:1.0' produces the
          bytecode "D\nVYAML:1.0\n".
    'P':
        name: Pause Stream
        desc: >
          This is the instruction when a document is terminated, but
          another document has not yet begun.  Thus, it is optional,
          and typically used to pause parsing.  For example, 
          a stream starting with an empty document, but then in a
          hold state for the next document would be: "D\nP\n"
    '\z':
        name: Finish (end stream)
        desc: >
          YAML bytecodes are meant to be passable as a single "C"
          string, and thus the null terminator can optionally be
          used to signal the end of a stream.  When writing bytecodes
          out to a flat file, the file need not contain a null
          terminator; however, when read into memory it should
          always have a null terminator.
    'M':
        name: Mapping
        desc: >
          Indicates the beginning of a mapping, children of the
          mapping are provided as a series of K1,V1,K2,V2 
          pairs as they are found in the input stream.  For 
          example, the bytecodes for "{ a: b, c: d }" would 
          be "M\nSa\nSb\nSc\nSd\nE\n" 
    'Q': 
        name: Sequence
        desc: >
          Indicates the beginning of a sequence, children are provided
          following until an 'E' bytecode is encountered.  So, the
          bytecodes for "[ one, two ]" would be "Q\nSone\nStwo\nE\n"
    'E': 
        name: End Collection
        desc: >
          This closes the innermost open Collection (Mapping, Sequence),
          note that the document has one and only one node following
          it, therefore it is not a branch.
    'S':
        name: Scalar
        desc: >
          This indicates the start of a scalar value, which can
          be continued by the 'N' and 'C' bytecodes.   This bytecode
          is used for sequence entries, keys, values, etc.
    'C': 
        name: Scalar Continuation
        desc: >
          Since a scalar may not fit within a buffer, and since it 
          may not contain a \n character, it may have to be broken
          into several chunks.
    'N':
        name: Normalized New Line (in a scalar value)
        desc: >
          Scalar values must be chunked so that new lines and
          null values do not occur within a 'S' or 'C' bytecode
          (in the bytecodes, all other C0 need not be escaped).
          This bytecode is then used to represent one or more
          newlines, with the number of newlines optionally 
          following.   For example,
          "Hello\nWorld" would be "SHello\nN\nCWorld\n", and
          "Hello\n\n\nWorld" is "SHello\nN3\nCWorld\n"          

          If the new line is an LS or a PS, the N bytecode can
          be followed with a L or P.   Thus, "Hello\PWorld\L" is
          reported "SHello\nNP\nCWorld\nNL\n"

    'Z':
        name: Null Character (in a scalar value)
        desc: >
          As in normalized new lines above, since the null character
          cannot be used in the bytecodes, it must be escaped, ie,
          "Hello\zWorld" would be "SHello\nZ\nCWorld\n".
    'A':
        name: Alias
        desc: >
          This is used whenever there is an alias node, for
          example, "[ &X one, *X ]" would be normalized
          to "Q\nAX\nSone\nRX\nE\n" -- in this example, the
          anchor bytecode applies to the very next content
          bytecode.
    'R': 
        name: Reference (Anchor)
        desc: >
          This bytecode associates an anchor with the very next
          content node, see the 'A' alias bytecode.
    'T':
        name: Transfer
        desc: >
          This is the transfer method.  If the value begins with
          a '!', then it is not normalized.  Otherwise, the value
          is a fully qualified URL, with a semicolon.  The transfer
          method applies only to the node immediately following, 
          and thus it can be seen as a modifier like the anchor.
          For example, "Ttag:yaml.org,2002:str\nSstring\n" is 
          normalized, "T!str\nSstring\n" is not. 
  #
  # Formatting bytecodes (lower case)
  #
  # The following bytecodes are purely at the syntax level and
  # useful for pretty printers and emitters.  Since the range of
  # lower case letters is contiguous, it could be easy for a 
  # processor to simply ignore all bytecodes in this range.
  #
    'c': 
        name: Comment
        desc: >
          This is a single line comment.  It is terminated like all
          of the other variable length items, with a '\n'.
    'i':
        name: Indent
        desc: >
          Specifies number of additional spaces to indent for
          subsequent block style nodes, "i4\n" specifies 4 char indent.
    's':
        name: Scalar styling
        desc: >
          This bytecode is followed by one of the following
          items to indicate the style to be used for the very 
          next content node.  It is an error to specify a style for 
          a scalar other than double quoted when it must be escaped.  
          Furthermore, there must be agreement between the style
          and the very next content node, in other words, a scalar
          style requires that the next content node be an S.

          > flow scalar
          " double quoted scalar
          ' single quoted scalar
          | literal scalar
          p plain scalar
          { inline mapping
          [ inline sequence
          b block style (for mappings and sequences)

   #
   # Advanced bytecodes (not alphabetic)
   #
   # These are optional goodies which one could find useful.
   #
    '#': 
        name: Line Number
        desc: >
          This bytecode allows the line number of the very next
          node to be reported.   
    '!':
        name: Notice
        desc: >
          This is a message sent from the producer to the consumer
          regarding the state of the stream or document.  It does
          not necessarily end a stream, as the 'finish' bytecode can
          be used for this purpose.  This signal has a packed format,
          with the error number, a comma, and a textual message:
              "#22\n!73,Indentation mismatch\n"
              "#132\n!84,Tabs are illegal for indentation\n"
    ',':
        name: Span
        desc: >
          This bytecode gives the span of the very next 'S', 'M', 
          or 'Q' bytecode -- including its subordinates.  For scalars,
          it includes the span of all subordinate 'N' and 'C' codes.
          For mappings or sequences, this gives the length all the
          way to the corresponding 'E' bytecode so that the entire 
          branch can be skipped.   The length is given starting at
          the corresponding 'S', 'M' or 'Q' bytecode and extends
          to the first character following subordinate nodes.
          
          Since this length instruction is meant to be used to 'speed'
          things up, and since calculating the length via hand is not
          really ideal, the length is expressed in Hex.  This will allow
          programs to easily convert the length to an actual value
          (converting from hex to integers is easier than decimal).
          Furthermore, all leading x's are ignored (so that they can
          be filled in later) and if the bytecode value is all x's, 
          then the length is unknown.  Lastly, this length is expressed
          in 8 bit units for UTF-8, and 16 bit units for UTF-16. 

          For example,
             --- [[one, two], three]
          Is expressed as,
             ",25\nD\n,x1E\nQ\n,xxE\nQ\nSone\nStwo\nE\nSthree\nE\n"

          Thus it is seen that the address of D plus 37 is the null
          terminator for the string, the first 'Q' plus 30 also
          gives the null terminator, and the second 'Q' plus
          14 jumps to the opening 'S' for the third scalar.
    '@':
        name: Allocate
        desc: >
          This is a hint telling the processor how many items
          are in the following collection (mapping pairs, or
          sequence values), or how many character units need
          to be allocated to hold the next value.  Clearly this
          is an encoding-specific value.   The length which
          follows is in hex (not decimal).

          For example, "one", could be  "@x3\nSone\n"

design:
  - 
    name: streaming support
    problem: >
      The interface should ideally allow for a YAML document to be
      moved incrementally as a stream through a process.   In particular,
      YAML is inherently line oriented, thus the interface should
      probably reflect this fundamental character.
    solution: >
      The bytecodes deliver scalars as chunks, each chunk limited to
      at most one line.   While this is not ideal for passing large
      binary objects, it is simple and easy to understand.
  -  
    name: push
    problem: >
      The most common 'parsers' out there for YAML are push style, where
      the producer owns the 'C' program stack, and the consumer keeps 
      its state as a heap object.  Ideal use of a push interface is an
      emitter, since this allows the sender (the application program)
      to use the program stack and thus keep its state on the call stack
      in local, automatic variables.
    solution: >
      A push interface simply can call a single event handler with a 
      (bytecode, payload) tuple.  Since the core complexity is in the 
      bytecodes, the actual function signature is straight-forward 
      allowing for relative language independence.  Since the bytecode
      is always one character, the event handler could just receive
      a string where the tuple is implicit.
  - 
    name: pull
    problem: >
      The other alternative for a streaming interface is a 'pull' mechanism,
      or iterator model where the consumer owns the C stack and the producer
      keeps any state needed as a heap object.  Ideal use of a pull
      interface is a parser, since this allows the receiver (the application
      program) to use the program stack, keeping its state on the call stack
      in local variables.
    solution: >
      A pull interface would also be a simple function, that when called
      fills a buffer with binary node(s).   Or, in a language with
      garbage collection, could be implemented as an iterator returning
      a string containing the bytecode line (bytecode followed immediately
      by the bytecode argument as a single string) or as a tuple.
  - 
    name: pull2push
    problem: >
      This is done easily via a small loop which pulls from the 
      iterator and pushes to the event handler.
    solution: >
      For python, assuming the parser is implemented as an iterator
      where one can 'pull' bytecode, args tuples, and assuming the
      emitter has an event callback taking a bytecode, args tuple,
      we have:

        def pull2push(parser, emitter):
           for (bytecode, args) in parser:
               emitter.push(bytecode, args)

  - 
    name: push2pull
    problem: >
      This requires the entire YAML stream be cached in memory, or
      each of the two stages in a thread or different continuation
      with shared memory or pipe between them.
    solution: >
      This use case seems much easier with a binary stream; that is,
      one need not convert the style of functions between the push
      vs pull pattern.   And, for languages supporting continuations,
      (ruby) perhaps push vs pull is not even an issue...   for a
      language like python, one would use the threaded Queue object,
      one thread pushes (bytecode, args) tuples into the Queue, while
      the other thread pulls the tuples out.  Simple.
  -
    name: neutrality
    problem: >
      It would be ideal if the C Program interface was simple enough
      to be independent of programming language.   In an ideal case,
      imagine a flow of YAML structured data through various processing
      stages on a server; where each processing stage is written in
      a different programming language.
    solution: >
      While it may be hard for each language to write a syntax parser
      filled with all of the little details, it would be much much
      easier to write a parser for these bytecodes; as it involves
      simple string handling, dispatching on the first character in
      each string.  
  - 
    name: tools
    problem: > 
      A goal of mine is to have a YPATH expression language, a schema
      language, and a transformation language.   I would like these items
      to be reusable by a great number of platforms/languages, and in
      particular as its own callable processing stage.
    solution: >
      If such an expression language was written on top of a bytecode
      format like this, via a simple pull function (/w adapters for
      push2pull and pull2push) quite a bit of reusability could emerge.
      Imagine a schema validator which is injected into the bytecode stream
      and it is an identity operation unless an exception occurs, in 
      which case, it terminates the document and makes the next document
      be a description of the validation error.
  - 
    name: encoding
    problem: >
      Text within the bytecode format must be given an encoding.  There are
      several considerations at hand listed below.
    solution: >
      The YAML bytecode format uses the same encodings as YAML itself,
      and thus is independent of actual encoding.  A parser library should
      have several functions to convert between the encodings. 
examples:
  - 
    yaml: |
      ---
      - plain
      - >
        this is a flow scalar
      - > 
        another flow scalar which is continued
        on a second line and indented 2 spaces
      - &001 !str |
        This is a block scalar, both typed
        and anchored
      - *001 # this was an alias
      - "This is a \"double quoted\" scalar"
    bytecode: |
      D
      Q
      Splain
      s>
      Sthis is a flow scalar
      s>
      Sanother flow scalar which is continued
      Con a second line and indented 2 spaces
      s|
      A001
      T!str
      SThis is a block scalar, both typed
      N
      Cand anchored
      R001
      cthis was an alias
      s"
      SThis is a "double quoted" scalar
      E
cheader: |
    /*  yamlbyte.h
     *
     *  The YAML bytecode "C" interface header file.   See the YAML bytecode
     *  reference for bytecode sequence rules and for the meaning of each
     *  bytecode.
     */
    
    #ifndef YAMLBYTE_H
    #define YAMLBYTE_H
    #include <stddef.h>
    /* The YAML bytecodes.  Each enumerator's value is the single character
     * that begins the corresponding bytecode line; see the bytecode
     * reference for the meaning and sequencing rules of each one. */
    typedef enum {
        /* content bytecodes */
        YAML_FINISH    = 0,    /* null terminator: end of stream */
        YAML_DOCUMENT  = 'D',
        YAML_DIRECTIVE = 'V',
        YAML_PAUSE     = 'P',
        YAML_MAPPING   = 'M',
        YAML_SEQUENCE  = 'Q',  /* must differ from YAML_SCALAR ('S') */
        YAML_ENDMAPSEQ = 'E',  /* closes a mapping or a sequence */
        YAML_SCALAR    = 'S',
        YAML_CONTINUE  = 'C',  /* scalar continuation chunk */
        YAML_NEWLINE   = 'N',  /* normalized new line inside a scalar */
        YAML_NULLCHAR  = 'Z',  /* escaped null character inside a scalar */
        YAML_ALIAS     = 'A',
        YAML_ANCHOR    = 'R',
        YAML_TRANSFER  = 'T',
        /* formatting bytecodes (lower case) */
        YAML_COMMENT = 'c',
        YAML_INDENT  = 'i',
        YAML_STYLE   = 's',
        /* other bytecodes (not alphabetic) */
        YAML_LINENUMBER = '#',
        YAML_NOTICE = '!',
        YAML_SPAN   = ',',
        YAML_ALLOC  = '@'
    } yaml_code_t;
    
    /* Style indicators that follow the YAML_STYLE ('s') bytecode.  Each
     * value is the literal character the bytecode reference lists for
     * that style. */
    typedef enum {
       YAML_FLOW            = '>',
       YAML_LITERAL         = '|',
       YAML_BLOCK           = 'b',
       YAML_PLAIN           = 'p',
       YAML_INLINE_MAPPING  = '{',
       YAML_INLINE_SEQUENCE = '[',  /* the reference lists '[' for inline sequences */
       YAML_SINGLE_QUOTED   = '\'', /* apostrophe, 39 in ASCII */
       YAML_DOUBLE_QUOTED   = '"'
    } yaml_style_t;
    
    /* Character unit types for the two supported encodings. */
    typedef unsigned char  yaml_utf8_t;
    typedef unsigned short yaml_utf16_t;
    /* yaml_char_t is the unit type used for bytecode text.  Exactly one
     * of YAML_UTF8 or YAML_UTF16 must be defined before this point;
     * defining both, or neither, stops compilation. */
    #if defined(YAML_UTF8) && defined(YAML_UTF16)
      #error Must only define YAML_UTF8 or YAML_UTF16
    #elif defined(YAML_UTF8)
      typedef yaml_utf8_t yaml_char_t;
    #elif defined(YAML_UTF16)
      typedef yaml_utf16_t yaml_char_t;
    #else
      #error Must define YAML_UTF8 or YAML_UTF16
    #endif
    
    /* Result of a push call: the consumer uses it to tell the producer
     * (the parser) whether it wants any further events. */
    typedef enum {
        YAML_STOP = 0, /* producer should stop firing events      */
        YAML_MORE = 1  /* producer should continue to fire events */
    } yaml_more_t;
    
    /* Push interface: the producer hands each bytecode to the consumer's
     * handler.  `self` is the opaque consumer handle, `arg` is the null
     * terminated bytecode argument and `arglen` carries its length.  The
     * return value tells the producer whether to keep firing events. */
    typedef void *yaml_consumer_t;
    typedef yaml_more_t (*yaml_push_t)(yaml_consumer_t self,
                                       yaml_code_t code,
                                       const yaml_char_t *arg,
                                       size_t arglen);
    
    /* Pull interface: the consumer calls this to fetch the next bytecode
     * from the producer.  `self` is the opaque producer handle; the
     * bytecode is reported through `code`, and its argument is written to
     * `buff` (which should be at least a 1K buffer, its capacity given by
     * `buffsize`).  The producer must null terminate `buff` and return
     * how much of it was used, counted in sizeof(yaml_char_t) units. */
    typedef void *yaml_producer_t;
    typedef size_t (*yaml_pull_t)(yaml_producer_t self,
                                  yaml_code_t *code,
                                  yaml_char_t *buff,
                                  size_t buffsize);
    
    /* Canonical helper showing how to hook up a parser (as a pull
     * producer) to an emitter (as a push consumer).
     *
     *   pull, producer - a yaml_pull_t and the handle passed as its self
     *   push, consumer - a yaml_push_t and the handle passed as its self
     *
     * Bytecodes are pulled one at a time into a 1024-unit stack buffer and
     * pushed on to the consumer until the producer reports YAML_FINISH or
     * the consumer returns YAML_STOP.  A closing YAML_FINISH with an empty
     * argument is then always pushed, including when the producer's own
     * YAML_FINISH was already forwarded by the loop.
     *
     * The expansion is a single statement (do { ... } while (0)), so it
     * can be followed by a semicolon and used in an unbraced if/else.
     * Its locals carry a yaml_p2p_ prefix so they cannot capture
     * identifiers appearing in the macro arguments.  The expansion calls
     * memset() and assert(); the file using the macro must include
     * <string.h> and <assert.h>. */
    #define YAML_PULL2PUSH(pull, producer, push, consumer)                   \
      do {                                                                   \
          yaml_code_t yaml_p2p_code = YAML_NOTICE; /* any non-FINISH code */ \
          yaml_more_t yaml_p2p_more = YAML_MORE;                             \
          yaml_char_t yaml_p2p_buff[1024];                                   \
          size_t      yaml_p2p_size = 0;                                     \
          memset(yaml_p2p_buff, 0, sizeof(yaml_p2p_buff));                   \
          while (yaml_p2p_code && yaml_p2p_more) {                           \
              yaml_p2p_size = (pull)((producer), &yaml_p2p_code,             \
                                     yaml_p2p_buff, 1024);                   \
              assert(yaml_p2p_size < 1024 &&                                 \
                     !yaml_p2p_buff[yaml_p2p_size]);                         \
              yaml_p2p_more = (push)((consumer), yaml_p2p_code,              \
                                     yaml_p2p_buff, yaml_p2p_size);          \
          }                                                                  \
          yaml_p2p_buff[0] = 0;                                              \
          (push)((consumer), YAML_FINISH, yaml_p2p_buff, 0);                 \
      } while (0)
    
    #endif