From f859e032fc5d2e6ed34c43444f0d18ccd3b68a6e Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sat, 19 Jan 2019 16:12:21 -0800 Subject: [PATCH] Change N-Triples literal output encoding to limit the number of ECHAR escapes used based on [Canonical form of N-Triples](https://www.w3.org/TR/n-triples/#canonical-ntriples): > Within STRING_LITERAL_QUOTE, only the characters `U+0022`, `U+005C`, `U+000A`, `U+000D` are encoded using `ECHAR`. `ECHAR` **must not** be used for characters that are allowed directly in STRING_LITERAL_QUOTE. --- lib/rdf/ntriples/writer.rb | 7 ++----- spec/ntriples_spec.rb | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/rdf/ntriples/writer.rb b/lib/rdf/ntriples/writer.rb index 95915faf..aacfecd3 100644 --- a/lib/rdf/ntriples/writer.rb +++ b/lib/rdf/ntriples/writer.rb @@ -116,6 +116,8 @@ def self.escape_unicode(u, encoding) # sequences, otherwise, assume the test-cases escape sequences. Otherwise, # the N-Triples recommendation includes `\b` and `\f` escape sequences. # + # Within STRING_LITERAL_QUOTE, only the characters `U+0022`, `U+005C`, `U+000A`, `U+000D` are encoded using `ECHAR`. `ECHAR` must not be used for characters that are allowed directly in STRING_LITERAL_QUOTE. + # # @param [Integer, #ord] u # @return [String] # @raise [ArgumentError] if `u` is not a valid Unicode codepoint @@ -124,15 +126,10 @@ def self.escape_unicode(u, encoding) def self.escape_ascii(u, encoding) case (u = u.ord) when (0x00..0x07) then escape_utf16(u) - when (0x08) then (encoding && encoding == Encoding::ASCII ? escape_utf16(u) : "\\b") - when (0x09) then "\\t" when (0x0A) then "\\n" - when (0x0B) then escape_utf16(u) - when (0x0C) then (encoding && encoding == Encoding::ASCII ? 
escape_utf16(u) : "\\f") when (0x0D) then "\\r" when (0x0E..0x1F) then escape_utf16(u) when (0x22) then "\\\"" - when (0x27) then "\\'" when (0x5C) then "\\\\" when (0x7F) then escape_utf16(u) when (0x00..0x7F) then u.chr diff --git a/spec/ntriples_spec.rb b/spec/ntriples_spec.rb index 18637a20..9680e388 100644 --- a/spec/ntriples_spec.rb +++ b/spec/ntriples_spec.rb @@ -672,16 +672,18 @@ # @see http://www.w3.org/TR/rdf-testcases/#ntrip_strings it "should correctly escape ASCII characters (#x0-#x7F)" do - (0x00..0x08).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } - expect(writer.escape(0x09.chr, encoding)).to eq "\\t" + (0x00..0x07).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } + expect(writer.escape(0x08.chr, encoding)).to eq "\b" + expect(writer.escape(0x09.chr, encoding)).to eq "\t" expect(writer.escape(0x0A.chr, encoding)).to eq "\\n" - (0x0B..0x0C).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } + expect(writer.escape(0x0B.chr, encoding)).to eq "\v" + expect(writer.escape(0x0C.chr, encoding)).to eq "\f" expect(writer.escape(0x0D.chr, encoding)).to eq "\\r" (0x0E..0x1F).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } (0x20..0x21).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } expect(writer.escape(0x22.chr, encoding)).to eq "\\\"" (0x23..0x26).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } - expect(writer.escape(0x27.chr, encoding)).to eq "\\'" + expect(writer.escape(0x27.chr, encoding)).to eq "'" (0x28..0x5B).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } expect(writer.escape(0x5C.chr, encoding)).to eq "\\\\" (0x5D..0x7E).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } @@ -733,17 +735,17 @@ # @see http://www.w3.org/TR/rdf-testcases/#ntrip_strings it "should correctly escape ASCII characters 
(#x0-#x7F)" do (0x00..0x07).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } - expect(writer.escape(0x08.chr, encoding)).to eq (encoding ? "\\b" : "\\u0008") - expect(writer.escape(0x09.chr, encoding)).to eq "\\t" + expect(writer.escape(0x08.chr, encoding)).to eq "\b" + expect(writer.escape(0x09.chr, encoding)).to eq "\t" expect(writer.escape(0x0A.chr, encoding)).to eq "\\n" - (0x0B..0x0B).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } - expect(writer.escape(0x0C.chr, encoding)).to eq (encoding ? "\\f" : "\\u000C") + expect(writer.escape(0x0B.chr, encoding)).to eq "\v" + expect(writer.escape(0x0C.chr, encoding)).to eq "\f" expect(writer.escape(0x0D.chr, encoding)).to eq "\\r" (0x0E..0x1F).each { |u| expect(writer.escape(u.chr, encoding)).to eq "\\u#{u.to_s(16).upcase.rjust(4, '0')}" } (0x20..0x21).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } expect(writer.escape(0x22.chr, encoding)).to eq "\\\"" (0x23..0x26).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } - expect(writer.escape(0x27.chr, encoding)).to eq "\\'" + expect(writer.escape(0x27.chr, encoding)).to eq "'" (0x28..0x5B).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr } expect(writer.escape(0x5C.chr, encoding)).to eq "\\\\" (0x5D..0x7E).each { |u| expect(writer.escape(u.chr, encoding)).to eq u.chr }