Skip to content

Commit

Permalink
Fix bug in NTriples::Reader.unescape where it was overeager; now uses…
Browse files Browse the repository at this point in the history
… a string scanner to iterate through the string buffer.
  • Loading branch information
gkellogg committed Jan 20, 2019
1 parent f859e03 commit 2171105
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 21 deletions.
36 changes: 16 additions & 20 deletions lib/rdf/ntriples/reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ def self.parse_node(input, **options)
# Parses `input` as a URI reference matching `URIREF`.
#
# The captured text is unescaped exactly once and handed to either
# `RDF::URI.intern` or `RDF::URI.new`.
#
# @param  [String] input
# @param  [Boolean] intern (false)
#   when true, build the URI with `RDF::URI.intern` instead of `RDF::URI.new`
# @param  [Hash{Symbol => Object}] options
#   accepted for call compatibility; not used by this method
# @return [RDF::URI] the parsed URI, or `nil` when `input` does not match `URIREF`
def self.parse_uri(input, intern: false, **options)
  if input =~ URIREF
    RDF::URI.send(intern ? :intern : :new, unescape($1))
  end
end
Expand Down Expand Up @@ -178,26 +177,23 @@ def self.parse_literal(input, **options)
# Decodes N-Triples character escapes (`\t`, `\n`, `\"`, `\\`, ... as listed
# in `ESCAPE_CHARS_ESCAPED`) and `\uXXXX` / `\UXXXXXXXX` code point escapes.
#
# The input is consumed left to right in a single pass, so text produced by
# one escape is never examined again. An escaped backslash followed by
# `u0039` therefore yields the six characters `\u0039`, not `9`.
#
# @param  [String] string the escaped text; it is not modified
# @return [String] a new UTF-8 string with all escapes decoded
def self.unescape(string)
  # Only copy the input when its encoding tag actually has to change.
  string = string.dup.force_encoding(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
  scanner = StringScanner.new(string)

  # Explicitly mutable and UTF-8, regardless of any frozen_string_literal setting.
  buffer = String.new(encoding: Encoding::UTF_8)

  until scanner.eos?
    buffer << if scanner.scan(ESCAPE_CHARS_ESCAPED_REGEXP)
      ESCAPE_CHARS_ESCAPED.fetch(scanner.matched)
    elsif scanner.scan(UCHAR)
      # Group 1 holds the 4-digit form, group 2 the 8-digit form.
      [(scanner[1] || scanner[2]).hex].pack('U*')
    else
      # Not an escape at this position: copy one character verbatim.
      scanner.getch
    end
  end

  buffer
end

##
Expand Down Expand Up @@ -257,15 +253,15 @@ def read_uriref(intern: false, **options)
uri.canonicalize! if canonicalize?
uri
end
rescue ArgumentError => e
rescue ArgumentError
log_error("Invalid URI (found: \"<#{uri_str}>\")", lineno: lineno, token: "<#{uri_str}>", exception: RDF::ReaderError)
end

##
# @return [RDF::Node]
# @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (nodeID)
def read_node
if node_id = match(NODEID)
if node_id = match(NODEID)
@nodes ||= {}
@nodes[node_id] ||= RDF::Node.new(node_id)
end
Expand Down
3 changes: 2 additions & 1 deletion spec/ntriples_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,13 @@
"_\\u6C34_" => "_\xE6\xB0\xB4_",
"\\u677E\\u672C \\u540E\\u5B50"=> "松本 后子",
"D\\u00FCrst" => "Dürst",
"\\u0039" => "9",
"\\\\u0039" => "\\u0039",
}
strings.each do |string, unescaped|
specify string do
unescaped = unescaped.encode(Encoding::UTF_8)
expect(reader.unescape(string.freeze)).to eq unescaped

end
end
end
Expand Down

0 comments on commit 2171105

Please sign in to comment.