Skip to content

Commit

Permalink
Fix issue with special characters (#11)
Browse files Browse the repository at this point in the history
Each component is now always validated when a new Url instance is created; otherwise, special characters in the path, query or fragment that are not percent-encoded lead to an InvalidUrlException.

Also prevent double-encoding of characters in the path, query or fragment that are already percent-encoded.
  • Loading branch information
otsch authored Mar 25, 2019
1 parent 013bafc commit 2a6ac25
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 9 deletions.
105 changes: 101 additions & 4 deletions src/Validator.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,18 @@ public function url($url = '') : string
$url = $this->encodeIdnHostInUrl($url);
}

$validComponents = $this->validateComponents($url);

if (!empty($validComponents)) {
$url = $this->buildUrlFromComponents($validComponents);
$dontRevalidateScheme = true;
}

if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
// filter_var() doesn't check for a valid url scheme, so validate if it has one.
$splitAtColon = explode(':', $url);

if (count($splitAtColon) > 1 && $this->scheme($splitAtColon[0])) {
if (isset($dontRevalidateScheme) || (count($splitAtColon) > 1 && $this->scheme($splitAtColon[0]))) {
return $url;
}
}
Expand Down Expand Up @@ -252,7 +259,9 @@ public function port($port = 0)
*/
public function path(string $path) : string
{
$path = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/]/', function ($match) {
$path = $this->encodePercentCharacter($path);

$path = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/\%]/', function ($match) {
return $this->urlEncodeCharacter($match[0]);
}, $path);

Expand All @@ -273,7 +282,9 @@ public function query(string $query = '') : string
$query = substr($query, 1);
}

$query = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/]/', function ($match) {
$query = $this->encodePercentCharacter($query);

$query = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/\%]/', function ($match) {
return $this->urlEncodeCharacter($match[0]);
}, $query);

Expand All @@ -293,7 +304,9 @@ public function fragment(string $fragment = '')
$fragment = substr($fragment, 1);
}

$fragment = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/\?]/', function ($match) {
$fragment = $this->encodePercentCharacter($fragment);

$fragment = preg_replace_callback('/[^a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\/\?\%]/', function ($match) {
return $this->urlEncodeCharacter($match[0]);
}, $fragment);

Expand Down Expand Up @@ -360,6 +373,21 @@ private function encodeIdnHostInUrl(string $url = '') : string
return $url;
}

/**
 * Escape stray percent characters in a path, query or fragment string.
 *
 * A "%" that already introduces a percent-encoded octet (i.e. is followed by two hex digits) is left alone.
 * Every other "%" is replaced with "%25", so the result contains no bare percent characters.
 *
 * @param string $string
 * @return string
 */
private function encodePercentCharacter(string $string = ''): string
{
    // Negative lookahead: only match a "%" that is NOT followed by two hexadecimal digits.
    $encoded = preg_replace('/%(?![0-9A-Fa-f][0-9A-Fa-f])/', '%25', $string);

    // preg_replace() yields null when the regex engine fails; fall back to the untouched input then.
    return $encoded ?? $string;
}

/**
* @param string $url
* @return string|false
Expand Down Expand Up @@ -432,4 +460,73 @@ private function isNotEmptyString($string) : bool

return false;
}

/**
 * Helper method for url()
 *
 * Splits the input url with parse_url() and runs every resulting component through its validator: a method
 * of this class named like the component if one exists, or userOrPassword() for "user" and "pass".
 *
 * Returns the components with their validated values, keyed like the parse_url() result. Returns an empty
 * array when the url can't be parsed, when scheme or host is missing, when a component has no validator
 * or when a validator returns null.
 *
 * NOTE(review): only a null result is treated as invalid here. url() uses scheme() in a truthiness check,
 * which suggests validators may signal failure with false; such a value would be passed through as the
 * component value - confirm against the validator methods.
 *
 * @param string $url
 * @return array
 */
private function validateComponents(string $url): array
{
    $parsed = parse_url($url);

    if (!is_array($parsed) || !isset($parsed['scheme'], $parsed['host'])) {
        return [];
    }

    $validated = [];

    foreach ($parsed as $name => $rawValue) {
        if (method_exists(Validator::class, $name)) {
            // Component names double as validator method names (e.g. "path" => $this->path()).
            $validValue = $this->{$name}($rawValue);
        } elseif ($name === 'user' || $name === 'pass') {
            $validValue = $this->userOrPassword($rawValue);
        } else {
            // No validator available for this component.
            return [];
        }

        if ($validValue === null) {
            return [];
        }

        $validated[$name] = $validValue;
    }

    return $validated;
}

/**
 * Builds a url string from an array of url components.
 *
 * Reads the keys scheme, user, pass, host, port, path, query and fragment; missing keys are skipped.
 * The authority part ("//", user info, host and port) is only emitted when a host is present, so calling
 * this method without a host (including the default empty array) does not read an undefined array key and
 * never produces user info or a port without a host.
 *
 * When a scheme is given and the port is identical to Url::getStandardPortByScheme() for that scheme,
 * the port is left out of the result.
 *
 * @param array $comp
 * @return string
 */
private function buildUrlFromComponents(array $comp = []): string
{
    $url = '';

    if (isset($comp['scheme'])) {
        $url .= $comp['scheme'] . ':';

        // Strict comparison: the port is only dropped when it is identical to the standard port value.
        if (isset($comp['port']) && $comp['port'] === Url::getStandardPortByScheme($comp['scheme'])) {
            unset($comp['port']);
        }
    }

    if (isset($comp['host'])) {
        $url .= '//';

        if (isset($comp['user'])) {
            $url .= $comp['user'] . (isset($comp['pass']) ? ':' . $comp['pass'] : '') . '@';
        }

        $url .= $comp['host'] . (isset($comp['port']) ? ':' . $comp['port'] : '');
    }

    $url .= $comp['path'] ?? '';
    $url .= isset($comp['query']) ? '?' . $comp['query'] : '';
    $url .= isset($comp['fragment']) ? '#' . $comp['fragment'] : '';

    return $url;
}
}
32 changes: 32 additions & 0 deletions tests/UrlTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,38 @@ public function testGetStandardPortsByScheme()
$this->assertNull(\Crwlr\Url\Url::getStandardPortByScheme('unknownscheme'));
}

/**
 * Special characters like umlauts in the path, query or fragment of a parsed url come out percent encoded
 * in the string version of the url.
 *
 * @throws \Crwlr\Url\Exceptions\InvalidUrlException
 */
public function testParsingUrlsContainingUmlauts()
{
    // Input url => expected string representation, one case each for path, query and fragment.
    $expectedByInput = [
        'https://www.example.com/bürokaufmann' => 'https://www.example.com/b%C3%BCrokaufmann',
        'https://www.example.com/path?quäry=strüng' => 'https://www.example.com/path?qu%C3%A4ry=str%C3%BCng',
        'https://www.example.com/path#frägment' => 'https://www.example.com/path#fr%C3%A4gment',
    ];

    foreach ($expectedByInput as $input => $expected) {
        $this->assertEquals($expected, \Crwlr\Url\Url::parse($input)->toString());
    }
}

/**
 * The percent sign of an already percent encoded character must not be encoded again, while a percent sign
 * that does not start a percent encoded character is encoded to %25.
 *
 * @throws \Crwlr\Url\Exceptions\InvalidUrlException
 */
public function testEncodingPercentEncodedCharacters()
{
    $expectedByInput = [
        // Already encoded umlaut stays as it is.
        'https://www.example.com/b%C3%BCrokaufmann' => 'https://www.example.com/b%C3%BCrokaufmann',
        // Stray percent character gets encoded.
        'https://www.example.com/just%-character' => 'https://www.example.com/just%25-character',
    ];

    foreach ($expectedByInput as $input => $expected) {
        $this->assertEquals($expected, \Crwlr\Url\Url::parse($input)->toString());
    }
}

/**
* @return \Crwlr\Url\Url
* @throws \Crwlr\Url\Exceptions\InvalidUrlException
Expand Down
13 changes: 8 additions & 5 deletions tests/ValidatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ public function testValidateUrl()
$validator->url(' https://wwww.example.com '),
'https://wwww.example.com'
);
$this->assertEquals(
$validator->url('ssh://username@host:/path/to/somewhere'),
'ssh://username@host:/path/to/somewhere'
);
$this->assertEquals(
$validator->url('ftp://username:[email protected]'),
'ftp://username:[email protected]'
Expand Down Expand Up @@ -215,12 +211,17 @@ public function testValidatePath()
$this->assertEquals($validator->path('/(foo)/*bar+'), '/(foo)/*bar+');
$this->assertEquals($validator->path('/foo,bar;baz:'), '/foo,bar;baz:');
$this->assertEquals($validator->path('/foo=bar@baz'), '/foo=bar@baz');
$this->assertEquals($validator->path('/foo%bar'), '/foo%25bar');
$this->assertEquals($validator->path('no/leading/slash'), 'no/leading/slash');
$this->assertEquals($validator->path('/"foo"'), '/%22foo%22');
$this->assertEquals($validator->path('/foo\\bar'), '/foo%5Cbar');
$this->assertEquals($validator->path('/bößer/pfad'), '/b%C3%B6%C3%9Fer/pfad');
$this->assertEquals($validator->path('/<html>'), '/%3Chtml%3E');

// Percent character is not encoded (to %25) because %ba could be a legitimate percent-encoded character.
$this->assertEquals($validator->path('/foo%bar'), '/foo%bar');

// Percent character encoded because %ga isn't a valid percent encoded character.
$this->assertEquals($validator->path('/foo%gar'), '/foo%25gar');
}

public function testValidateQuery()
Expand All @@ -240,6 +241,7 @@ public function testValidateQuery()
$this->assertEquals($validator->query('föo=bar'), 'f%C3%B6o=bar');
$this->assertEquals($validator->query('boeßer=query'), 'boe%C3%9Fer=query');
$this->assertEquals($validator->query('foo`=bar'), 'foo%60=bar');
$this->assertEquals($validator->query('foo%25bar=baz'), 'foo%25bar=baz');
}

public function testValidateFragment()
Expand All @@ -260,5 +262,6 @@ public function testValidateFragment()
$this->assertEquals($validator->fragment('frägment'), 'fr%C3%A4gment');
$this->assertEquals($validator->fragment('boeßesfragment'), 'boe%C3%9Fesfragment');
$this->assertEquals($validator->fragment('fragment`'), 'fragment%60');
$this->assertEquals($validator->fragment('fragm%E2%82%ACnt'), 'fragm%E2%82%ACnt');
}
}

0 comments on commit 2a6ac25

Please sign in to comment.