Skip to content

Commit

Permalink
Preserve urlencoded data in the rewritten path
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Oct 28, 2024
1 parent d2aeea4 commit 60db1e1
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 6 deletions.
1 change: 1 addition & 0 deletions packages/playground/data-liberation/phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<file>tests/WPBlockMarkupProcessorTests.php</file>
<file>tests/WPBlockMarkupUrlProcessorTests.php</file>
<file>tests/URLParserWHATWGComplianceTests.php</file>
<file>tests/UrldecodeNTests.php</file>
</testsuite>
</testsuites>
</phpunit>
82 changes: 76 additions & 6 deletions packages/playground/data-liberation/src/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,39 @@ function wp_rewrite_urls( $options ) {
$parsed_matched_url = $p->get_parsed_url();
$parsed_matched_url->protocol = $new_site_url->protocol;
$parsed_matched_url->hostname = $new_site_url->hostname;
$decoded_matched_pathname = urldecode( $parsed_matched_url->pathname );

// Update the pathname if needed.
if ( '/' !== $current_site_url->pathname ) {
// The matched URL starts with $current_site_name->pathname.
/**
* The matched URL starts with $current_site_name->pathname.
*
* We want to retain the portion of the pathname that comes
* after $current_site_name->pathname. This is not a simple
* substring operation because the matched URL may have
* urlencoded bytes at the beginning. We need to decode
* them before taking the substring.
*
* However, we can't just decode the entire pathname because
* the part after $current_site_name->pathname may itself
* contain urlencoded bytes. If we decode them here, it
* may change a path such as `/%2561/foo`, which decodes
* as `/61/foo`, to `/a/foo`.
*
* Therefore, we're going to decode a part of the string. We'll
* start at the beginning and keep going until we've found
* enough decoded bytes to skip over $current_site_name->pathname.
* Then we'll take the remaining, still encoded bytes as the new pathname.
*/
$decoded_matched_pathname = urldecode_n(
$parsed_matched_url->pathname,
strlen( $current_site_pathname_with_trailing_slash )
);
$parsed_matched_url->pathname =
$new_site_pathname_with_trailing_slash .
substr(
$decoded_matched_pathname,
strlen( urldecode( $current_site_pathname_with_trailing_slash ) )
);
substr(
$decoded_matched_pathname,
strlen( $current_site_pathname_with_trailing_slash )
);
}

/*
Expand Down Expand Up @@ -129,3 +151,51 @@ function url_matches( URL $subject, string $current_site_url_no_trailing_slash )
str_starts_with( $matched_pathname_decoded, $current_pathname_no_trailing_slash . '/' )
);
}

/**
* Decodes the first n **encoded bytes** a URL-encoded string.
*
* For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
* '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
*
* @param string $string The string to decode.
* @param int $target_length The maximum length of the resulting string.
* @return string The decoded string.
*/
function urldecode_n( $string, $target_length ) {
$result = '';
$at = 0;
while(true) {
if($at + 3 > strlen($string)) {
break;
}

$last_at = $at;
$at += strcspn( $string, '%', $at );
// Consume bytes except for the percent sign.
$result .= substr( $string, $last_at, $at - $last_at );

// If we've already decoded the requested number of bytes, stop.
if(strlen($result) >= $target_length) {
break;
}

++$at;
$decodable_length = strspn(
$string,
'0123456789ABCDEFabcdef',
$at,
2
);
if($decodable_length === 2) {
// Decode the hex sequence.
$result .= chr(hexdec($string[$at] . $string[$at + 1]));
$at += 2;
} else {
// Consume the percent sign and move on.
$result .= '%';
}
}
$result .= substr($string, $at);
return $result;
}
49 changes: 49 additions & 0 deletions packages/playground/data-liberation/tests/UrldecodeNTests.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?php

use PHPUnit\Framework\TestCase;

class UrldecodeNTests extends TestCase {

/**
*
* @dataProvider provider_test_urldecode_n
*/
public function test_urldecode_n(
$original_string,
$decode_length,
$expected_string,
) {
$result = urldecode_n( $original_string, $decode_length );
$this->assertEquals( $expected_string, $result, 'Failed to decode the first n bytes of the string' );
}

static public function provider_test_urldecode_n() {
return [
'Encoded path segment with no encoded bytes later on' => [
'original_string' => '/%73/%63/image.png',
'decode_length' => 4,
'expected_string' => '/s/c/image.png',
],
'Encoded path segment with encoded bytes later on' => [
'original_string' => '/%73/%63/%73%63ience.png',
'decode_length' => 4,
'expected_string' => '/s/c/%73%63ience.png',
],
'Decode past the encoded path segment' => [
'original_string' => '/%73/%63/science.png',
'decode_length' => 10,
'expected_string' => '/s/c/science.png',
],
'Double percent sign – decode it' => [
'original_string' => '/%%73cience.png',
'decode_length' => 3,
'expected_string' => '/%science.png',
],
'Double percent sign – finish decoding after the first percent sign' => [
'original_string' => '/%%73cience.png',
'decode_length' => 2,
'expected_string' => '/%%73cience.png',
],
];
}
}
24 changes: 24 additions & 0 deletions packages/playground/data-liberation/tests/WPRewriteUrlsTests.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,30 @@ static public function provider_test_wp_rewrite_urls() {
'https://legacy-blog.com/~jappleseed/1997.10.1/',
'https://modern-webstore.org/blog/'
],
/**
* The urlencoded data needs to stay urlencoded. If it's decoded, the
* resulting path will be wrongly rewritten as
*
* "/blog/%65-reasons-to-migrate-data/"
*
* Which decodes to:
*
* "/blog/a-reasons-to-migrate-data/"
*
* But if the path is not decoded, it correctly becomes
*
* "/blog/%2565-reasons-to-migrate-data/"
*
* Which decodes to:
*
* "/blog/%65-reasons-to-migrate-data/"
*/
'Path in an HTML attribute with URLencoded data' => [
'<a href="/~jappleseed/1997.10.1/%2561-reasons-to-migrate-data/">61 reasons to migrate data</a>',
'<a href="/blog/%2561-reasons-to-migrate-data/">61 reasons to migrate data</a>',
'https://legacy-blog.com/~jappleseed/1997.10.1/',
'https://modern-webstore.org/blog/'
],
'Domain in an HTML attribute – encoded using HTML entities' => [
'<a href="&#104;&#116;tps://&#108;&#101;g&#97;&#99;&#121;&#45;&#98;&#108;&#111;&#103;.&#99;&#111;&#109;/pages/contact-us">Contact us</a>',
'<a href="https://modern-webstore.org/pages/contact-us">Contact us</a>',
Expand Down

0 comments on commit 60db1e1

Please sign in to comment.