diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index b54f7d88ac..e5fcc00569 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -7,6 +7,7 @@ tests/WPBlockMarkupProcessorTests.php tests/WPBlockMarkupUrlProcessorTests.php tests/URLParserWHATWGComplianceTests.php + tests/UrldecodeNTests.php diff --git a/packages/playground/data-liberation/src/functions.php b/packages/playground/data-liberation/src/functions.php index 43db5d00f5..acf574ddd9 100644 --- a/packages/playground/data-liberation/src/functions.php +++ b/packages/playground/data-liberation/src/functions.php @@ -47,17 +47,39 @@ function wp_rewrite_urls( $options ) { $parsed_matched_url = $p->get_parsed_url(); $parsed_matched_url->protocol = $new_site_url->protocol; $parsed_matched_url->hostname = $new_site_url->hostname; - $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); // Update the pathname if needed. if ( '/' !== $current_site_url->pathname ) { - // The matched URL starts with $current_site_name->pathname. + /** + * The matched URL starts with $current_site_name->pathname. + * + * We want to retain the portion of the pathname that comes + * after $current_site_name->pathname. This is not a simple + * substring operation because the matched URL may have + * urlencoded bytes at the beginning. We need to decode + * them before taking the substring. + * + * However, we can't just decode the entire pathname because + * the part after $current_site_name->pathname may itself + * contain urlencoded bytes. If we decode them here, it + * may change a path such as `/%2561/foo`, which decodes + * as `/61/foo`, to `/a/foo`. + * + * Therefore, we're going to decode a part of the string. We'll + * start at the beginning and keep going until we've found + * enough decoded bytes to skip over $current_site_name->pathname. + * Then we'll take the remaining, still encoded bytes as the new pathname. + */ + $decoded_matched_pathname = urldecode_n( + $parsed_matched_url->pathname, + strlen( $current_site_pathname_with_trailing_slash ) + ); $parsed_matched_url->pathname = $new_site_pathname_with_trailing_slash . - substr( - $decoded_matched_pathname, - strlen( urldecode( $current_site_pathname_with_trailing_slash ) ) - ); + substr( + $decoded_matched_pathname, + strlen( $current_site_pathname_with_trailing_slash ) + ); } /* @@ -129,3 +151,51 @@ function url_matches( URL $subject, string $current_site_url_no_trailing_slash ) str_starts_with( $matched_pathname_decoded, $current_pathname_no_trailing_slash . '/' ) ); } + +/** + * Decodes the first n **encoded bytes** a URL-encoded string. + * + * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns + * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded. + * + * @param string $string The string to decode. + * @param int $target_length The maximum length of the resulting string. + * @return string The decoded string. + */ +function urldecode_n( $string, $target_length ) { + $result = ''; + $at = 0; + while(true) { + if($at + 3 > strlen($string)) { + break; + } + + $last_at = $at; + $at += strcspn( $string, '%', $at ); + // Consume bytes except for the percent sign. + $result .= substr( $string, $last_at, $at - $last_at ); + + // If we've already decoded the requested number of bytes, stop. + if(strlen($result) >= $target_length) { + break; + } + + ++$at; + $decodable_length = strspn( + $string, + '0123456789ABCDEFabcdef', + $at, + 2 + ); + if($decodable_length === 2) { + // Decode the hex sequence. + $result .= chr(hexdec($string[$at] . $string[$at + 1])); + $at += 2; + } else { + // Consume the percent sign and move on. + $result .= '%'; + } + } + $result .= substr($string, $at); + return $result; +} diff --git a/packages/playground/data-liberation/tests/UrldecodeNTests.php b/packages/playground/data-liberation/tests/UrldecodeNTests.php new file mode 100644 index 0000000000..5317ea54f6 --- /dev/null +++ b/packages/playground/data-liberation/tests/UrldecodeNTests.php @@ -0,0 +1,49 @@ +assertEquals( $expected_string, $result, 'Failed to decode the first n bytes of the string' ); + } + + static public function provider_test_urldecode_n() { + return [ + 'Encoded path segment with no encoded bytes later on' => [ + 'original_string' => '/%73/%63/image.png', + 'decode_length' => 4, + 'expected_string' => '/s/c/image.png', + ], + 'Encoded path segment with encoded bytes later on' => [ + 'original_string' => '/%73/%63/%73%63ience.png', + 'decode_length' => 4, + 'expected_string' => '/s/c/%73%63ience.png', + ], + 'Decode past the encoded path segment' => [ + 'original_string' => '/%73/%63/science.png', + 'decode_length' => 10, + 'expected_string' => '/s/c/science.png', + ], + 'Double percent sign – decode it' => [ + 'original_string' => '/%%73cience.png', + 'decode_length' => 3, + 'expected_string' => '/%science.png', + ], + 'Double percent sign – finish decoding after the first percent sign' => [ + 'original_string' => '/%%73cience.png', + 'decode_length' => 2, + 'expected_string' => '/%%73cience.png', + ], + ]; + } +} diff --git a/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php b/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php index 114a1972fc..6d934afa11 100644 --- a/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php +++ b/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php @@ -64,6 +64,30 @@ static public function provider_test_wp_rewrite_urls() { 'https://legacy-blog.com/~jappleseed/1997.10.1/', 'https://modern-webstore.org/blog/' ], + /** + * The urlencoded data needs to stay urlencoded. If it's decoded, the + * resulting path will be wrongly rewritten as + * + * "/blog/%65-reasons-to-migrate-data/" + * + * Which decodes to: + * + * "/blog/a-reasons-to-migrate-data/" + * + * But if the path is not decoded, it correctly becomes + * + * "/blog/%2565-reasons-to-migrate-data/" + * + * Which decodes to: + * + * "/blog/%65-reasons-to-migrate-data/" + */ + 'Path in an HTML attribute with URLencoded data' => [ + '61 reasons to migrate data', + '61 reasons to migrate data', + 'https://legacy-blog.com/~jappleseed/1997.10.1/', + 'https://modern-webstore.org/blog/' + ], 'Domain in an HTML attribute – encoded using HTML entities' => [ 'Contact us', 'Contact us',