Skip to content

Commit

Permalink
Fix issues with unicode characters - fixes #71 (#72)
Browse files Browse the repository at this point in the history
* Fix issues with unicode characters

* Prevent empty elements when splitting string into words
  • Loading branch information
iluuu1994 authored and jschroed91 committed Mar 15, 2018
1 parent 231250a commit 48c70a9
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 45 deletions.
32 changes: 21 additions & 11 deletions lib/Caxy/HtmlDiff/AbstractDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,9 @@ protected function getClosingTag($tag)
*/
protected function getStringBetween($str, $start, $end)
{
$expStr = explode($start, $str, 2);
$expStr = mb_split($start, $str, 2);
if (count($expStr) > 1) {
$expStr = explode($end, $expStr[ 1 ]);
$expStr = mb_split($end, $expStr[ 1 ]);
if (count($expStr) > 1) {
array_pop($expStr);

Expand Down Expand Up @@ -461,7 +461,7 @@ protected function setNewWords(array $newWords)
*/
protected function isPartOfWord($text)
{
    // The configured special-case characters are allowed inside a word, so
    // remove them before testing what is left.
    $stripped = str_replace($this->config->getSpecialCaseChars(), '', $text);

    // Use the Unicode-aware check rather than the ASCII-only ctype_alnum(),
    // which rejects any letter outside a-z/A-Z. The cast keeps the boolean
    // return type regardless of what the helper hands back.
    return (bool) $this->ctypeAlphanumUnicode($stripped);
}

/**
Expand All @@ -485,15 +485,15 @@ protected function convertHtmlToListOfWords($characterString)

$current_word = '<';
$mode = 'tag';
} elseif (preg_match("/\s/", $character)) {
} elseif (preg_match("/\s/u", $character)) {
if ($current_word !== '') {
$words[] = $current_word;
}
$current_word = $keepNewLines ? $character : preg_replace('/\s+/S', ' ', $character);
$current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character);
$mode = 'whitespace';
} else {
if (
(ctype_alnum($character) && (strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
(($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
) {
$current_word .= $character;
Expand All @@ -509,7 +509,7 @@ protected function convertHtmlToListOfWords($characterString)
$words[] = $current_word;
$current_word = '';

if (!preg_match('[^\s]', $character)) {
if (!preg_match('[^\s]u', $character)) {
$mode = 'whitespace';
} else {
$mode = 'character';
Expand All @@ -525,9 +525,9 @@ protected function convertHtmlToListOfWords($characterString)
}
$current_word = '<';
$mode = 'tag';
} elseif (preg_match("/\s/", $character)) {
} elseif (preg_match("/\s/u", $character)) {
$current_word .= $character;
if (!$keepNewLines) $current_word = preg_replace('/\s+/S', ' ', $current_word);
if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word);
} else {
if ($current_word != '') {
$words[] = $current_word;
Expand Down Expand Up @@ -574,7 +574,7 @@ protected function isEndOfTag($val)
*/
protected function isWhiteSpace($value)
{
    // PCRE accepts '[' ... ']' as a delimiter pair, so a pattern written as
    // '[^\s]u' is really `^\s` and, once negated, answers "does NOT start
    // with whitespace". Explicit '/' delimiters make the test match the
    // method name: true when $value contains no non-whitespace character
    // (the empty string included).
    //
    // preg_match() returns false for a subject that is not valid UTF-8; the
    // strict comparison reports such input as "not whitespace".
    return preg_match('/\S/u', $value) === 0;
}

/**
Expand All @@ -585,6 +585,16 @@ protected function isWhiteSpace($value)
protected function explode($value)
{
    // as suggested by @onassar
    // Splitting on the empty pattern in UTF-8 mode yields one element per
    // character instead of one per byte. PREG_SPLIT_NO_EMPTY discards the
    // empty strings preg_split() would otherwise emit at the start and end
    // of the subject, so an empty $value gives an empty array.
    return preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * Unicode-aware counterpart of ctype_alnum().
 *
 * @param string $str
 *
 * @return bool True when $str is non-empty and every character is an ASCII
 *              digit or a letter (any script, via \pL); false otherwise,
 *              including when $str is not valid UTF-8.
 */
protected function ctypeAlphanumUnicode($str)
{
    // D (dollar-end-only) keeps `$` from matching just before a trailing
    // newline, so "abc\n" is rejected the same way ctype_alnum() rejects it.
    // Comparing against 1 turns preg_match()'s 1 / 0 / false into the
    // documented boolean.
    return preg_match('/^[a-zA-Z0-9\pL]+$/uD', $str) === 1;
}
}
28 changes: 14 additions & 14 deletions lib/Caxy/HtmlDiff/HtmlDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ protected function createIsolatedDiffTagPlaceholders(&$words)
foreach ($words as $index => $word) {
$openIsolatedDiffTag = $this->isOpeningIsolatedDiffTag($word, $currentIsolatedDiffTag);
if ($openIsolatedDiffTag) {
if ($this->isSelfClosingTag($word) || stripos($word, '<img') !== false) {
if ($this->isSelfClosingTag($word) || mb_stripos($word, '<img') !== false) {
if ($openIsolatedDiffTags === 0) {
$isolatedDiffTagIndices[] = array(
'start' => $index,
Expand Down Expand Up @@ -205,7 +205,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul
$tagsToMatch = $currentIsolatedDiffTag !== null
? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag))
: $this->config->getIsolatedDiffTags();
$pattern = '#<%s(\s+[^>]*)?>#iU';
$pattern = '#<%s(\s+[^>]*)?>#iUu';
foreach ($tagsToMatch as $key => $value) {
if (preg_match(sprintf($pattern, $key), $item)) {
return $key;
Expand All @@ -217,7 +217,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul

/**
 * Checks whether $text contains a self-closing tag such as `<br />`.
 *
 * @param string $text
 *
 * @return bool
 */
protected function isSelfClosingTag($text)
{
    // `<`, one or more characters other than `>`, then `/`, optional
    // whitespace and the closing `>`. The u modifier makes the subject be
    // read as UTF-8 rather than as raw bytes.
    return (bool) preg_match('/<[^>]+\/\s*>/u', $text);
}

/**
Expand All @@ -231,7 +231,7 @@ protected function isClosingIsolatedDiffTag($item, $currentIsolatedDiffTag = nul
$tagsToMatch = $currentIsolatedDiffTag !== null
? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag))
: $this->config->getIsolatedDiffTags();
$pattern = '#</%s(\s+[^>]*)?>#iU';
$pattern = '#</%s(\s+[^>]*)?>#iUu';
foreach ($tagsToMatch as $key => $value) {
if (preg_match(sprintf($pattern, $key), $item)) {
return $key;
Expand Down Expand Up @@ -354,7 +354,7 @@ protected function diffElements($oldText, $newText, $stripWrappingTags = true)
$wrapEnd = '';

if ($stripWrappingTags) {
$pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/i';
$pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/iu';
$matches = array();

if (preg_match_all($pattern, $newText, $matches)) {
Expand Down Expand Up @@ -441,7 +441,7 @@ protected function processEqualOperation($operation)
protected function getAttributeFromTag($text, $attribute)
{
$matches = array();
if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/i', $attribute), $text, $matches)) {
if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/iu', $attribute), $text, $matches)) {
return htmlspecialchars_decode($matches[2]);
}

Expand Down Expand Up @@ -567,15 +567,15 @@ protected function insertTag($tag, $cssClass, &$words)
}
}
}
if (count($words) == 0 && strlen($specialCaseTagInjection) == 0) {
if (count($words) == 0 && mb_strlen($specialCaseTagInjection) == 0) {
break;
}
if ($specialCaseTagInjectionIsBefore) {
$this->content .= $specialCaseTagInjection.implode('', $this->extractConsecutiveWords($words, 'tag'));
} else {
$workTag = $this->extractConsecutiveWords($words, 'tag');
if (isset($workTag[ 0 ]) && $this->isOpeningTag($workTag[ 0 ]) && !$this->isClosingTag($workTag[ 0 ])) {
if (strpos($workTag[ 0 ], 'class=')) {
if (mb_strpos($workTag[ 0 ], 'class=')) {
$workTag[ 0 ] = str_replace('class="', 'class="diffmod ', $workTag[ 0 ]);
$workTag[ 0 ] = str_replace("class='", 'class="diffmod ', $workTag[ 0 ]);
} else {
Expand All @@ -584,7 +584,7 @@ protected function insertTag($tag, $cssClass, &$words)
}

$appendContent = implode('', $workTag).$specialCaseTagInjection;
if (isset($workTag[0]) && false !== stripos($workTag[0], '<img')) {
if (isset($workTag[0]) && false !== mb_stripos($workTag[0], '<img')) {
$appendContent = $this->wrapText($appendContent, $tag, $cssClass);
}
$this->content .= $appendContent;
Expand Down Expand Up @@ -673,7 +673,7 @@ protected function isTag($item)
*/
protected function isOpeningTag($item)
{
    // Matches any `<...>` sequence inside $item, read as UTF-8. A closing tag
    // such as `</p>` satisfies this pattern as well, so callers that need a
    // true opening tag have to rule out isClosingTag() themselves.
    // The raw preg_match() result is returned: 1 on match, 0 otherwise,
    // false on a PCRE error.
    return preg_match('#<[^>]+>\\s*#iUu', $item);
}

/**
Expand All @@ -683,7 +683,7 @@ protected function isOpeningTag($item)
*/
protected function isClosingTag($item)
{
    // Matches a `</...>` sequence anywhere inside $item, read as UTF-8.
    // The raw preg_match() result is returned: 1 on match, 0 otherwise,
    // false on a PCRE error.
    return preg_match('#</[^>]+>\\s*#iUu', $item);
}

/**
Expand Down Expand Up @@ -769,10 +769,10 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
*/
protected function stripTagAttributes($word)
{
$space = strpos($word, ' ', 1);
$space = mb_strpos($word, ' ', 1);

if ($space) {
return '<' . substr($word, 1, $space) . '>';
return '<' . mb_substr($word, 1, $space) . '>';
}

return trim($word, '<>');
Expand Down Expand Up @@ -850,7 +850,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
protected function isOnlyWhitespace($str)
{
    // Slightly faster than using preg_match.
    // The empty string is deliberately NOT treated as whitespace; anything
    // else qualifies when trim() leaves nothing behind. Comparing the
    // trimmed result with '' is equivalent to checking that its length is
    // zero, without the extra length call.
    return $str !== '' && trim($str) === '';
}

/**
Expand Down
2 changes: 1 addition & 1 deletion lib/Caxy/HtmlDiff/HtmlDiffConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ public function setIsolatedDiffTags($isolatedDiffTags)
public function addIsolatedDiffTag($tag, $placeholder = null)
{
if (null === $placeholder) {
$placeholder = sprintf('[[REPLACE_%s]]', strtoupper($tag));
$placeholder = sprintf('[[REPLACE_%s]]', mb_strtoupper($tag));
}

if ($this->isIsolatedDiffTag($tag) && $this->isolatedDiffTags[$tag] !== $placeholder) {
Expand Down
12 changes: 6 additions & 6 deletions lib/Caxy/HtmlDiff/ListDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ protected function buildDiffList($words)
$list[] = $word;
}
} else {
$listType = substr($word, 1, 2);
$listType = mb_substr($word, 1, 2);
$listStartTag = $word;
}

Expand All @@ -254,7 +254,7 @@ protected function buildDiffList($words)
if ($openListItems === 0) {
// New top-level list item
$currentListItem = array();
$listItemType = substr($word, 1, 2);
$listItemType = mb_substr($word, 1, 2);
$listItemStart = $word;
} else {
$currentListItem[] = $word;
Expand Down Expand Up @@ -290,27 +290,27 @@ protected function isOpeningListTag($word, $type = null)
{
$filter = $type !== null ? array('<'.$type) : array('<ul', '<ol', '<dl');

return in_array(substr($word, 0, 3), $filter);
return in_array(mb_substr($word, 0, 3), $filter);
}

/**
 * Tells whether $word begins like a closing list tag.
 *
 * @param string      $word Token to inspect.
 * @param string|null $type Tag name to match exclusively (compared as
 *                          '</' . $type); when null, ul, ol and dl are
 *                          all accepted.
 *
 * @return bool
 */
protected function isClosingListTag($word, $type = null)
{
    $filter = $type !== null ? array('</'.$type) : array('</ul', '</ol', '</dl');

    // Only the first four characters are compared, so anything after the
    // tag name is ignored. Both sides are strings, hence the strict lookup.
    return in_array(mb_substr($word, 0, 4), $filter, true);
}

/**
 * Tells whether $word begins like an opening list-item tag.
 *
 * @param string      $word Token to inspect.
 * @param string|null $type Tag name to match exclusively (compared as
 *                          '<' . $type); when null, li, dd and dt are
 *                          all accepted.
 *
 * @return bool
 */
protected function isOpeningListItemTag($word, $type = null)
{
    $filter = $type !== null ? array('<'.$type) : array('<li', '<dd', '<dt');

    // Only the first three characters are compared, so attributes and the
    // closing `>` are ignored. Both sides are strings, hence the strict lookup.
    return in_array(mb_substr($word, 0, 3), $filter, true);
}

/**
 * Tells whether $word begins like a closing list-item tag.
 *
 * @param string      $word Token to inspect.
 * @param string|null $type Tag name to match exclusively (compared as
 *                          '</' . $type); when null, li, dd and dt are
 *                          all accepted.
 *
 * @return bool
 */
protected function isClosingListItemTag($word, $type = null)
{
    $filter = $type !== null ? array('</'.$type) : array('</li', '</dd', '</dt');

    // Only the first four characters are compared, so anything after the
    // tag name is ignored. Both sides are strings, hence the strict lookup.
    return in_array(mb_substr($word, 0, 4), $filter, true);
}
}
16 changes: 8 additions & 8 deletions lib/Caxy/HtmlDiff/Preprocessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ class Preprocessor
public static function diffCommonPrefix($old, $new)
{
// Quick check for common null cases.
if (strlen($old) == 0 || strlen($new) == 0 || substr($old, 0, 1) != substr($new, 0, 1)) {
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, 0, 1) != mb_substr($new, 0, 1)) {
return 0;
}

// Binary Search
$pointerMin = 0;
$pointerMax = min(strlen($old), strlen($new));
$pointerMax = min(mb_strlen($old), mb_strlen($new));
$pointerMid = $pointerMax;
$pointerStart = 0;
while ($pointerMin < $pointerMid) {
$cmp = substr_compare(
$old,
substr($new, $pointerStart, $pointerMid - $pointerStart),
mb_substr($new, $pointerStart, $pointerMid - $pointerStart),
$pointerStart,
$pointerMid - $pointerStart
);
Expand All @@ -37,19 +37,19 @@ public static function diffCommonPrefix($old, $new)
public static function diffCommonSuffix($old, $new)
{
// Quick check for common null cases.
if (strlen($old) == 0 || strlen($new) == 0 || substr($old, strlen($old) - 1, 1) != substr($new, strlen($new) - 1, 1)) {
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, mb_strlen($old) - 1, 1) != mb_substr($new, mb_strlen($new) - 1, 1)) {
return 0;
}

// Binary Search
$pointerMin = 0;
$pointerMax = min(strlen($old), strlen($new));
$pointerMax = min(mb_strlen($old), mb_strlen($new));
$pointerMid = $pointerMax;
$pointerEnd = 0;
$oldLen = strlen($old);
$newLen = strlen($new);
$oldLen = mb_strlen($old);
$newLen = mb_strlen($new);
while ($pointerMin < $pointerMid) {
if (substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
if (mb_substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == mb_substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
$pointerMin = $pointerMid;
$pointerEnd = $pointerMin;
} else {
Expand Down
8 changes: 4 additions & 4 deletions lib/Caxy/HtmlDiff/Strategy/ListItemMatchStrategy.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,20 @@ public function isMatch($a, $b)
// Check common prefix/ suffix length
$aCleaned = trim($aStripped);
$bCleaned = trim($bStripped);
if (strlen($aCleaned) === 0 || strlen($bCleaned) === 0) {
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
$aCleaned = $a;
$bCleaned = $b;
}
if (strlen($aCleaned) === 0 || strlen($bCleaned) === 0) {
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
return false;
}
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned);
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned);

// Use shorter string, and see how much of it is leftover
$len = min(strlen($aCleaned), strlen($bCleaned));
$len = min(mb_strlen($aCleaned), mb_strlen($bCleaned));
$remaining = $len - ($prefixIndex + $suffixIndex);
$strLengthPercent = $len / max(strlen($a), strlen($b));
$strLengthPercent = $len / max(mb_strlen($a), mb_strlen($b));

if ($remaining === 0 && $strLengthPercent > $this->lengthRatioThreshold) {
return true;
Expand Down
2 changes: 1 addition & 1 deletion lib/Caxy/HtmlDiff/Table/TableDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ protected function htmlFromNode($node)
protected function setInnerHtml($node, $html)
{
// DOMDocument::loadHTML does not allow empty strings.
if (strlen(trim($html)) === 0) {
if (mb_strlen(trim($html)) === 0) {
$html = '<span class="empty"></span>';
}

Expand Down

0 comments on commit 48c70a9

Please sign in to comment.