From e9a1542751cb7e3e5c4c8cc60083dd62d280d0b3 Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Thu, 3 Aug 2023 21:51:23 +0200 Subject: [PATCH 1/5] Conservative normalization --- lib/http/uri.rb | 17 +++++++- spec/lib/http/uri_spec.rb | 86 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/lib/http/uri.rb b/lib/http/uri.rb index 9c8a54c3..49fb1426 100644 --- a/lib/http/uri.rb +++ b/lib/http/uri.rb @@ -37,6 +37,9 @@ class URI # @private HTTPS_SCHEME = "https" + # @private + PERCENT_ENCODE = /[\x00-\x20\u007F-\u{1FFFF}]+/.freeze + # @private NORMALIZER = lambda do |uri| uri = HTTP::URI.parse uri @@ -44,8 +47,8 @@ class URI HTTP::URI.new( :scheme => uri.normalized_scheme, :authority => uri.normalized_authority, - :path => uri.normalized_path, - :query => uri.query, + :path => uri.path.empty? ? "/" : percent_encode(Addressable::URI.normalize_path(uri.path), PERCENT_ENCODE), + :query => percent_encode(uri.query, PERCENT_ENCODE), :fragment => uri.normalized_fragment ) end @@ -71,6 +74,16 @@ def self.form_encode(form_values, sort = false) Addressable::URI.form_encode(form_values, sort) end + # Percent-encode all characters matching a regular expression. + # + # @param [String] string raw string + # @param [Regexp] pattern regular expression matching characters to percent-encode + # + # @return [String] encoded value + def self.percent_encode(string, pattern) + string&.gsub(pattern) { |substr| substr.encode(Encoding::UTF_8).bytes.map { |c| format("%%%02X", c) }.join } + end + # Creates an HTTP::URI instance from the given options # # @param [Hash, Addressable::URI] options_or_uri diff --git a/spec/lib/http/uri_spec.rb b/spec/lib/http/uri_spec.rb index 4a330d61..10d3a84f 100644 --- a/spec/lib/http/uri_spec.rb +++ b/spec/lib/http/uri_spec.rb @@ -11,6 +11,92 @@ subject(:https_uri) { described_class.parse(example_https_uri_string) } subject(:ipv6_uri) { described_class.parse(example_ipv6_uri_string) } + describe "NORMALIZER" do + it "lower-cases scheme" do + expect(HTTP::URI::NORMALIZER.call("HttP://example.com").scheme).to eq "http" + end + + it "lower-cases hostname" do + expect(HTTP::URI::NORMALIZER.call("http://EXAMPLE.com").host).to eq "example.com" + end + + it "decodes percent-encoded hostname" do + expect(HTTP::URI::NORMALIZER.call("http://ex%61mple.com").host).to eq "example.com" + end + + it "removes trailing period in hostname" do + expect(HTTP::URI::NORMALIZER.call("http://example.com.").host).to eq "example.com" + end + + it "IDN-encodes non-ASCII hostname" do + expect(HTTP::URI::NORMALIZER.call("http://exämple.com").host).to eq "xn--exmple-cua.com" + end + + it "ensures path is not empty" do + expect(HTTP::URI::NORMALIZER.call("http://example.com").path).to eq "/" + end + + it "preserves double slashes in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com//a///b").path).to eq "//a///b" + end + + it "resolves single-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a/./b").path).to eq "/a/b" + end + + it "resolves double-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a/b/../c").path).to eq "/a/c" + end + + it "resolves leading double-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/../a/b").path).to eq "/a/b" + end + + it "percent-encodes control characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/\x00\x7F\n").path).to eq "/%00%7F%0A" + end + + it "percent-encodes space in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a b").path).to eq "/a%20b" + end + + it "percent-encodes non-ASCII characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/キョ").path).to eq "/%E3%82%AD%E3%83%A7" + end + + it "does not percent-encode non-special characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/~.-_!$&()*,;=:@{}").path).to eq "/~.-_!$&()*,;=:@{}" + end + + it "preserves escape sequences in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/%41").path).to eq "/%41" + end + + it "allows no query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com").query).to be_nil + end + + it "percent-encodes control characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?\x00\x7F\n").query).to eq "%00%7F%0A" + end + + it "percent-encodes space in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?a b").query).to eq "a%20b" + end + + it "percent-encodes non-ASCII characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com?キョ").query).to eq "%E3%82%AD%E3%83%A7" + end + + it "does not percent-encode non-special characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?~.-_!$&()*,;=:@{}?").query).to eq "~.-_!$&()*,;=:@{}?" + end + + it "preserves escape sequences in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?%41").query).to eq "%41" + end + end + it "knows URI schemes" do expect(http_uri.scheme).to eq "http" expect(https_uri.scheme).to eq "https" From 36bcb95e2e1f47d6ca1d1147727f6ba105eb4676 Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Mon, 28 Aug 2023 07:34:59 +0200 Subject: [PATCH 2/5] Remove unused pattern argument --- lib/http/uri.rb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/http/uri.rb b/lib/http/uri.rb index 49fb1426..a651e0c5 100644 --- a/lib/http/uri.rb +++ b/lib/http/uri.rb @@ -47,8 +47,8 @@ class URI HTTP::URI.new( :scheme => uri.normalized_scheme, :authority => uri.normalized_authority, - :path => uri.path.empty? ? "/" : percent_encode(Addressable::URI.normalize_path(uri.path), PERCENT_ENCODE), - :query => percent_encode(uri.query, PERCENT_ENCODE), + :path => uri.path.empty? ? "/" : percent_encode(Addressable::URI.normalize_path(uri.path)), + :query => percent_encode(uri.query), :fragment => uri.normalized_fragment ) end @@ -77,11 +77,14 @@ def self.form_encode(form_values, sort = false) # Percent-encode all characters matching a regular expression. # # @param [String] string raw string - # @param [Regexp] pattern regular expression matching characters to percent-encode # # @return [String] encoded value - def self.percent_encode(string, pattern) - string&.gsub(pattern) { |substr| substr.encode(Encoding::UTF_8).bytes.map { |c| format("%%%02X", c) }.join } + # + # @private + def self.percent_encode(string) + string&.gsub(PERCENT_ENCODE) do |substr| + substr.encode(Encoding::UTF_8).bytes.map { |c| format("%%%02X", c) }.join + end end # Creates an HTTP::URI instance from the given options From e31841f0d58364352ff34cd94dbbb24879bf871e Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Mon, 28 Aug 2023 07:36:06 +0200 Subject: [PATCH 3/5] Move normalizer specs to separate file --- spec/lib/http/uri/normalizer_spec.rb | 95 ++++++++++++++++++++++++++++ spec/lib/http/uri_spec.rb | 86 ------------------------- 2 files changed, 95 insertions(+), 86 deletions(-) create mode 100644 spec/lib/http/uri/normalizer_spec.rb diff --git a/spec/lib/http/uri/normalizer_spec.rb b/spec/lib/http/uri/normalizer_spec.rb new file mode 100644 index 00000000..c8720b2b --- /dev/null +++ b/spec/lib/http/uri/normalizer_spec.rb @@ -0,0 +1,95 @@ +# frozen_string_literal: true + +RSpec.describe HTTP::URI::NORMALIZER do + describe "scheme" do + it "lower-cases scheme" do + expect(HTTP::URI::NORMALIZER.call("HttP://example.com").scheme).to eq "http" + end + end + + describe "hostname" do + it "lower-cases hostname" do + expect(HTTP::URI::NORMALIZER.call("http://EXAMPLE.com").host).to eq "example.com" + end + + it "decodes percent-encoded hostname" do + expect(HTTP::URI::NORMALIZER.call("http://ex%61mple.com").host).to eq "example.com" + end + + it "removes trailing period in hostname" do + expect(HTTP::URI::NORMALIZER.call("http://example.com.").host).to eq "example.com" + end + + it "IDN-encodes non-ASCII hostname" do + expect(HTTP::URI::NORMALIZER.call("http://exämple.com").host).to eq "xn--exmple-cua.com" + end + end + + describe "path" do + it "ensures path is not empty" do + expect(HTTP::URI::NORMALIZER.call("http://example.com").path).to eq "/" + end + + it "preserves double slashes in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com//a///b").path).to eq "//a///b" + end + + it "resolves single-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a/./b").path).to eq "/a/b" + end + + it "resolves double-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a/b/../c").path).to eq "/a/c" + end + + it "resolves leading double-dot segments in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/../a/b").path).to eq "/a/b" + end + + it "percent-encodes control characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/\x00\x7F\n").path).to eq "/%00%7F%0A" + end + + it "percent-encodes space in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/a b").path).to eq "/a%20b" + end + + it "percent-encodes non-ASCII characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/キョ").path).to eq "/%E3%82%AD%E3%83%A7" + end + + it "does not percent-encode non-special characters in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/~.-_!$&()*,;=:@{}").path).to eq "/~.-_!$&()*,;=:@{}" + end + + it "preserves escape sequences in path" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/%41").path).to eq "/%41" + end + end + + describe "query" do + it "allows no query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com").query).to be_nil + end + + it "percent-encodes control characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?\x00\x7F\n").query).to eq "%00%7F%0A" + end + + it "percent-encodes space in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?a b").query).to eq "a%20b" + end + + it "percent-encodes non-ASCII characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com?キョ").query).to eq "%E3%82%AD%E3%83%A7" + end + + it "does not percent-encode non-special characters in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?~.-_!$&()*,;=:@{}?").query).to eq "~.-_!$&()*,;=:@{}?" + end + + it "preserves escape sequences in query" do + expect(HTTP::URI::NORMALIZER.call("http://example.com/?%41").query).to eq "%41" + end + end +end diff --git a/spec/lib/http/uri_spec.rb b/spec/lib/http/uri_spec.rb index 10d3a84f..4a330d61 100644 --- a/spec/lib/http/uri_spec.rb +++ b/spec/lib/http/uri_spec.rb @@ -11,92 +11,6 @@ subject(:https_uri) { described_class.parse(example_https_uri_string) } subject(:ipv6_uri) { described_class.parse(example_ipv6_uri_string) } - describe "NORMALIZER" do - it "lower-cases scheme" do - expect(HTTP::URI::NORMALIZER.call("HttP://example.com").scheme).to eq "http" - end - - it "lower-cases hostname" do - expect(HTTP::URI::NORMALIZER.call("http://EXAMPLE.com").host).to eq "example.com" - end - - it "decodes percent-encoded hostname" do - expect(HTTP::URI::NORMALIZER.call("http://ex%61mple.com").host).to eq "example.com" - end - - it "removes trailing period in hostname" do - expect(HTTP::URI::NORMALIZER.call("http://example.com.").host).to eq "example.com" - end - - it "IDN-encodes non-ASCII hostname" do - expect(HTTP::URI::NORMALIZER.call("http://exämple.com").host).to eq "xn--exmple-cua.com" - end - - it "ensures path is not empty" do - expect(HTTP::URI::NORMALIZER.call("http://example.com").path).to eq "/" - end - - it "preserves double slashes in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com//a///b").path).to eq "//a///b" - end - - it "resolves single-dot segments in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/a/./b").path).to eq "/a/b" - end - - it "resolves double-dot segments in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/a/b/../c").path).to eq "/a/c" - end - - it "resolves leading double-dot segments in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/../a/b").path).to eq "/a/b" - end - - it "percent-encodes control characters in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/\x00\x7F\n").path).to eq "/%00%7F%0A" - end - - it "percent-encodes space in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/a b").path).to eq "/a%20b" - end - - it "percent-encodes non-ASCII characters in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/キョ").path).to eq "/%E3%82%AD%E3%83%A7" - end - - it "does not percent-encode non-special characters in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/~.-_!$&()*,;=:@{}").path).to eq "/~.-_!$&()*,;=:@{}" - end - - it "preserves escape sequences in path" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/%41").path).to eq "/%41" - end - - it "allows no query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com").query).to be_nil - end - - it "percent-encodes control characters in query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/?\x00\x7F\n").query).to eq "%00%7F%0A" - end - - it "percent-encodes space in query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/?a b").query).to eq "a%20b" - end - - it "percent-encodes non-ASCII characters in query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com?キョ").query).to eq "%E3%82%AD%E3%83%A7" - end - - it "does not percent-encode non-special characters in query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/?~.-_!$&()*,;=:@{}?").query).to eq "~.-_!$&()*,;=:@{}?" - end - - it "preserves escape sequences in query" do - expect(HTTP::URI::NORMALIZER.call("http://example.com/?%41").query).to eq "%41" - end - end - it "knows URI schemes" do expect(http_uri.scheme).to eq "http" expect(https_uri.scheme).to eq "https" From c6c2739fd24286370311a8557aa9a16a8a4fd679 Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Mon, 28 Aug 2023 07:36:25 +0200 Subject: [PATCH 4/5] Mute some Rubocop rules in specs --- .rubocop.yml | 1 + .rubocop/metrics.yml | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 .rubocop/metrics.yml diff --git a/.rubocop.yml b/.rubocop.yml index b531d30f..e2956f84 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,6 +1,7 @@ inherit_from: - .rubocop_todo.yml - .rubocop/layout.yml + - .rubocop/metrics.yml - .rubocop/style.yml AllCops: diff --git a/.rubocop/metrics.yml b/.rubocop/metrics.yml new file mode 100644 index 00000000..944c40ae --- /dev/null +++ b/.rubocop/metrics.yml @@ -0,0 +1,4 @@ +Metrics/BlockLength: + Exclude: + - 'spec/**/*.rb' + - '*.gemspec' \ No newline at end of file From 508adcc7bc2e3b7c4754bfb13dee8bdf376de912 Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Wed, 6 Sep 2023 17:56:06 +0200 Subject: [PATCH 5/5] Flip the expression from blacklist to whitelist --- lib/http/uri.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/http/uri.rb b/lib/http/uri.rb index a651e0c5..9bc93adf 100644 --- a/lib/http/uri.rb +++ b/lib/http/uri.rb @@ -38,7 +38,7 @@ class URI HTTPS_SCHEME = "https" # @private - PERCENT_ENCODE = /[\x00-\x20\u007F-\u{1FFFF}]+/.freeze + PERCENT_ENCODE = /[^\x21-\x7E]+/.freeze # @private NORMALIZER = lambda do |uri|