Update parser to support comment body (#22)

This updates the yacc file and parser to recognise the special @comment entry type. https://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html#comment
nickng · Dec 12, 2023 · bc6976e · bc6976e
1 parent 7d26f02
commit bc6976e
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 71 deletions.
diff --git a/bibtex.y b/bibtex.y
@@ -24,7 +24,7 @@ var bib *BibTex // Only for holding current bib
 
 %token tCOMMENT tSTRING tPREAMBLE
 %token tATSIGN tCOLON tEQUAL tCOMMA tPOUND tLBRACE tRBRACE tDQUOTE tLPAREN tRPAREN
-%token <strval> tBAREIDENT tIDENT
+%token <strval> tBAREIDENT tIDENT tCOMMENTBODY
 %type <bibtex> bibtex
 %type <bibentry> bibentry
 %type <bibtag> tag stringentry
@@ -47,8 +47,7 @@ bibentry : tATSIGN tBAREIDENT tLBRACE tBAREIDENT tCOMMA tags tRBRACE { $$ = NewB
          | tATSIGN tBAREIDENT tLPAREN tBAREIDENT tCOMMA tags tRPAREN { $$ = NewBibEntry($2, $4); for _, t := range $6 { $$.AddField(t.key, t.val) } }
          ;
 
-commententry : tATSIGN tCOMMENT tLBRACE longstring tRBRACE {}
-             | tATSIGN tCOMMENT tLPAREN longstring tRBRACE {}
+commententry : tATSIGN tCOMMENT tCOMMENTBODY { }
              ;
 
 stringentry : tATSIGN tSTRING tLBRACE tBAREIDENT tEQUAL longstring tRBRACE { $$ = &bibTag{key: $4, val: $6 } }

diff --git a/bibtex.y.go b/bibtex.y.go
diff --git a/docs.go b/docs.go
@@ -3,15 +3,15 @@
 // The package contains a simple parser and data structure to represent bibtex
 // records.
 //
-// Supported syntax
+// # Supported syntax
 //
 // The basic syntax is:
 //
-//     @BIBTYPE{IDENT,
-//         key1 = word,
-//         key2 = "quoted",
-//         key3 = {quoted},
-//     }
+//	@BIBTYPE{IDENT,
+//	    key1 = word,
+//	    key2 = "quoted",
+//	    key3 = {quoted},
+//	}
 //
 // where BIBTYPE is the type of document (e.g. inproceedings, article, etc.)
 // and IDENT is a string identifier.
@@ -20,5 +20,4 @@
 // found in the link below. If there are any problems, please file any issues
 // with a minimal working example at the GitHub repository.
 // http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
-//
 package bibtex // import "github.com/nickng/bibtex"
diff --git a/scanner.go b/scanner.go
@@ -12,8 +12,9 @@ var parseField bool
 
 // scanner is a lexical scanner
 type scanner struct {
-	r   *bufio.Reader
-	pos tokenPos
+	commentMode bool
+	r           *bufio.Reader
+	pos         tokenPos
 }
 
 // newScanner returns a new instance of scanner.
@@ -79,6 +80,13 @@ func (s *scanner) Scan() (tok token, lit string, err error) {
 		if parseField {
 			return s.scanBraced()
 		}
+		// If we're reading a comment, put the { back and return everything
+		// from it up to the next @-sign (exclusive) as the comment body.
+		if s.commentMode {
+			s.unread()
+			commentBodyTok, commentBody := s.scanCommentBody()
+			return commentBodyTok, commentBody, nil
+		}
 		return tLBRACE, string(ch), nil
 	case '}':
 		if parseField { // reset parseField if reached end of entry.
@@ -122,6 +130,7 @@ func (s *scanner) scanBare() (token, string) {
 	}
 	str := buf.String()
 	if strings.ToLower(str) == "comment" {
+		s.commentMode = true
 		return tCOMMENT, str
 	} else if strings.ToLower(str) == "preamble" {
 		return tPREAMBLE, str
@@ -193,6 +202,28 @@ func (s *scanner) scanQuoted() (token, string) {
 	return tILLEGAL, buf.String()
 }
 
+// scanCommentBody is a scan method used for reading a bibtex comment
+// item by reading all runes until the next @ (left unread) or EOF.
+//
+// e.g. when called from Scan, which unreads the opening { first:
+// @comment{...anything can go here even if braces are unbalanced@
+// comment body string will be "{...anything can go here even if braces are unbalanced"
+func (s *scanner) scanCommentBody() (token, string) {
+	var buf bytes.Buffer
+	for {
+		if ch := s.read(); ch == eof {
+			break
+		} else if ch == '@' {
+			s.unread()
+			break
+		} else {
+			_, _ = buf.WriteRune(ch)
+		}
+	}
+	s.commentMode = false
+	return tCOMMENTBODY, buf.String()
+}
+
 // ignoreWhitespace consumes the current rune and all contiguous whitespace.
 func (s *scanner) ignoreWhitespace() {
 	for {