diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml
index b54f7d88ac..e5fcc00569 100644
--- a/packages/playground/data-liberation/phpunit.xml
+++ b/packages/playground/data-liberation/phpunit.xml
@@ -7,6 +7,7 @@
tests/WPBlockMarkupProcessorTests.php
tests/WPBlockMarkupUrlProcessorTests.php
tests/URLParserWHATWGComplianceTests.php
+ tests/UrldecodeNTests.php
diff --git a/packages/playground/data-liberation/src/functions.php b/packages/playground/data-liberation/src/functions.php
index 43db5d00f5..acf574ddd9 100644
--- a/packages/playground/data-liberation/src/functions.php
+++ b/packages/playground/data-liberation/src/functions.php
@@ -47,17 +47,39 @@ function wp_rewrite_urls( $options ) {
$parsed_matched_url = $p->get_parsed_url();
$parsed_matched_url->protocol = $new_site_url->protocol;
$parsed_matched_url->hostname = $new_site_url->hostname;
- $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname );
// Update the pathname if needed.
if ( '/' !== $current_site_url->pathname ) {
- // The matched URL starts with $current_site_name->pathname.
+ /**
+ * The matched URL starts with $current_site_name->pathname.
+ *
+ * We want to retain the portion of the pathname that comes
+ * after $current_site_name->pathname. This is not a simple
+ * substring operation because the matched URL may have
+ * urlencoded bytes at the beginning. We need to decode
+ * them before taking the substring.
+ *
+ * However, we can't just decode the entire pathname because
+ * the part after $current_site_name->pathname may itself
+ * contain urlencoded bytes. If we decode them here, it
+ * may change a path such as `/%2561/foo`, which decodes
+ * as `/61/foo`, to `/a/foo`.
+ *
+ * Therefore, we're going to decode a part of the string. We'll
+ * start at the beginning and keep going until we've found
+ * enough decoded bytes to skip over $current_site_name->pathname.
+ * Then we'll take the remaining, still encoded bytes as the new pathname.
+ */
+ $decoded_matched_pathname = urldecode_n(
+ $parsed_matched_url->pathname,
+ strlen( $current_site_pathname_with_trailing_slash )
+ );
$parsed_matched_url->pathname =
$new_site_pathname_with_trailing_slash .
- substr(
- $decoded_matched_pathname,
- strlen( urldecode( $current_site_pathname_with_trailing_slash ) )
- );
+ substr(
+ $decoded_matched_pathname,
+ strlen( $current_site_pathname_with_trailing_slash )
+ );
}
/*
@@ -129,3 +151,51 @@ function url_matches( URL $subject, string $current_site_url_no_trailing_slash )
str_starts_with( $matched_pathname_decoded, $current_pathname_no_trailing_slash . '/' )
);
}
+
+/**
+ * Decodes the first n **encoded bytes** a URL-encoded string.
+ *
+ * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
+ * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
+ *
+ * @param string $string The string to decode.
+ * @param int $target_length The maximum length of the resulting string.
+ * @return string The decoded string.
+ */
+function urldecode_n( $string, $target_length ) {
+ $result = '';
+ $at = 0;
+ while(true) {
+ if($at + 3 > strlen($string)) {
+ break;
+ }
+
+ $last_at = $at;
+ $at += strcspn( $string, '%', $at );
+ // Consume bytes except for the percent sign.
+ $result .= substr( $string, $last_at, $at - $last_at );
+
+ // If we've already decoded the requested number of bytes, stop.
+ if(strlen($result) >= $target_length) {
+ break;
+ }
+
+ ++$at;
+ $decodable_length = strspn(
+ $string,
+ '0123456789ABCDEFabcdef',
+ $at,
+ 2
+ );
+ if($decodable_length === 2) {
+ // Decode the hex sequence.
+ $result .= chr(hexdec($string[$at] . $string[$at + 1]));
+ $at += 2;
+ } else {
+ // Consume the percent sign and move on.
+ $result .= '%';
+ }
+ }
+ $result .= substr($string, $at);
+ return $result;
+}
diff --git a/packages/playground/data-liberation/tests/UrldecodeNTests.php b/packages/playground/data-liberation/tests/UrldecodeNTests.php
new file mode 100644
index 0000000000..5317ea54f6
--- /dev/null
+++ b/packages/playground/data-liberation/tests/UrldecodeNTests.php
@@ -0,0 +1,49 @@
+assertEquals( $expected_string, $result, 'Failed to decode the first n bytes of the string' );
+ }
+
+ static public function provider_test_urldecode_n() {
+ return [
+ 'Encoded path segment with no encoded bytes later on' => [
+ 'original_string' => '/%73/%63/image.png',
+ 'decode_length' => 4,
+ 'expected_string' => '/s/c/image.png',
+ ],
+ 'Encoded path segment with encoded bytes later on' => [
+ 'original_string' => '/%73/%63/%73%63ience.png',
+ 'decode_length' => 4,
+ 'expected_string' => '/s/c/%73%63ience.png',
+ ],
+ 'Decode past the encoded path segment' => [
+ 'original_string' => '/%73/%63/science.png',
+ 'decode_length' => 10,
+ 'expected_string' => '/s/c/science.png',
+ ],
+ 'Double percent sign – decode it' => [
+ 'original_string' => '/%%73cience.png',
+ 'decode_length' => 3,
+ 'expected_string' => '/%science.png',
+ ],
+ 'Double percent sign – finish decoding after the first percent sign' => [
+ 'original_string' => '/%%73cience.png',
+ 'decode_length' => 2,
+ 'expected_string' => '/%%73cience.png',
+ ],
+ ];
+ }
+}
diff --git a/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php b/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php
index 114a1972fc..6d934afa11 100644
--- a/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php
+++ b/packages/playground/data-liberation/tests/WPRewriteUrlsTests.php
@@ -64,6 +64,30 @@ static public function provider_test_wp_rewrite_urls() {
'https://legacy-blog.com/~jappleseed/1997.10.1/',
'https://modern-webstore.org/blog/'
],
+ /**
+ * The urlencoded data needs to stay urlencoded. If it's decoded, the
+ * resulting path will be wrongly rewritten as
+ *
+ * "/blog/%65-reasons-to-migrate-data/"
+ *
+ * Which decodes to:
+ *
+ * "/blog/a-reasons-to-migrate-data/"
+ *
+ * But if the path is not decoded, it correctly becomes
+ *
+ * "/blog/%2565-reasons-to-migrate-data/"
+ *
+ * Which decodes to:
+ *
+ * "/blog/%65-reasons-to-migrate-data/"
+ */
+ 'Path in an HTML attribute with URLencoded data' => [
+ '61 reasons to migrate data',
+ '61 reasons to migrate data',
+ 'https://legacy-blog.com/~jappleseed/1997.10.1/',
+ 'https://modern-webstore.org/blog/'
+ ],
'Domain in an HTML attribute – encoded using HTML entities' => [
'Contact us',
'Contact us',