From 21711058797d65690ac6a0c62f23ee1f18cc7874 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sun, 20 Jan 2019 15:45:34 -0800 Subject: [PATCH] Fix bug in NTriples::Reader.unescape where it was overeager; now uses a string scanner to iterate through the string buffer. --- lib/rdf/ntriples/reader.rb | 36 ++++++++++++++++-------------------- spec/ntriples_spec.rb | 3 ++- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/lib/rdf/ntriples/reader.rb b/lib/rdf/ntriples/reader.rb index 7a1207d9..ac8aa535 100644 --- a/lib/rdf/ntriples/reader.rb +++ b/lib/rdf/ntriples/reader.rb @@ -135,7 +135,6 @@ def self.parse_node(input, **options) # @return [RDF::URI] def self.parse_uri(input, intern: false, **options) if input =~ URIREF - uri_str = unescape($1) RDF::URI.send(intern ? :intern : :new, unescape($1)) end end @@ -178,26 +177,23 @@ def self.parse_literal(input, **options) def self.unescape(string) # Note: avoiding copying the input string when no escaping is needed # greatly reduces the number of allocations and the processing time. - unless string.encoding == Encoding::UTF_8 - string = string.dup.force_encoding(Encoding::UTF_8) - end - - has_escape_chars = ESCAPE_CHARS_ESCAPED_REGEXP.match?(string) - has_uchar = UCHAR.match?(string) - - string = string.dup if has_escape_chars || has_uchar + string = string.dup.force_encoding(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8 + scanner = StringScanner.new(string) - # Decode \t|\n|\r|\"\'\|\\ character escapes using Regexp: - string.gsub!(ESCAPE_CHARS_ESCAPED_REGEXP) do - ESCAPE_CHARS_ESCAPED.fetch($~[0]) - end if has_escape_chars + buffer = "" - # Decode \uXXXX and \UXXXXXXXX code points: - string.gsub!(UCHAR) do - [($1 || $2).hex].pack('U*') - end if has_uchar + while !scanner.eos? + buffer << if scanner.scan(ESCAPE_CHARS_ESCAPED_REGEXP) + ESCAPE_CHARS_ESCAPED[scanner.matched] + elsif scanner.scan(UCHAR) + scanner.matched.sub(UCHAR) {[($1 || $2).hex].pack('U*')} + else + # Scan one character + scanner.getch + end + end - string + buffer end ## @@ -257,7 +253,7 @@ def read_uriref(intern: false, **options) uri.canonicalize! if canonicalize? uri end - rescue ArgumentError => e + rescue ArgumentError log_error("Invalid URI (found: \"<#{uri_str}>\")", lineno: lineno, token: "<#{uri_str}>", exception: RDF::ReaderError) end @@ -265,7 +261,7 @@ def read_uriref(intern: false, **options) # @return [RDF::Node] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (nodeID) def read_node - if node_id = match(NODEID) + if node_id = match(NODEID) @nodes ||= {} @nodes[node_id] ||= RDF::Node.new(node_id) end diff --git a/spec/ntriples_spec.rb b/spec/ntriples_spec.rb index 9680e388..d5509d12 100644 --- a/spec/ntriples_spec.rb +++ b/spec/ntriples_spec.rb @@ -213,12 +213,13 @@ "_\\u6C34_" => "_\xE6\xB0\xB4_", "\\u677E\\u672C \\u540E\\u5B50"=> "松本 后子", "D\\u00FCrst" => "Dürst", + "\\u0039" => "9", + "\\\\u0039" => "\\u0039", } strings.each do |string, unescaped| specify string do unescaped = unescaped.encode(Encoding::UTF_8) expect(reader.unescape(string.freeze)).to eq unescaped - end end end