From 0dfd7b0890ee1a6174282614485e559fb507eecb Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 18 Feb 2019 03:00:48 +0300 Subject: [PATCH 1/2] Test PHP 7.3 with ICU 63.1 --- .travis.yml | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 99ca40701..fac94752c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ cache: directories: - vendor - $HOME/.composer/cache/files + - $HOME/.build matrix: include: @@ -18,7 +19,7 @@ matrix: - php: 7.1 - php: 7.2 - php: 7.3 - env: SYMFONY_PHPUNIT_VERSION=7.2 + env: SYMFONY_PHPUNIT_VERSION=7.2 ICU_VERSION=63.1 - php: nightly allow_failures: - php: nightly @@ -33,6 +34,29 @@ before_install: - if [[ $TRAVIS_PHP_VERSION = 5.* ]]; then echo yes | pecl install -f apcu-4.0.11; fi - if [[ $TRAVIS_PHP_VERSION = 7.* ]]; then echo yes | pecl install -f apcu_bc-1.0.4; fi - if [[ $TRAVIS_PHP_VERSION = 7.* ]]; then echo yes | pecl install -f apcu-5.1.11; fi + - | + if [[ $ICU_VERSION ]]; then + ICU_DIR=$HOME/.build/icu-$ICU_VERSION + ICU_PHP_VERSION=$(php -r "echo PHP_VERSION;") + ICU_PHP_DIR=$HOME/.build/php-$ICU_PHP_VERSION-icu-$ICU_VERSION + export ICU_PHP=$ICU_PHP_DIR/bin/php + if [ ! -f $ICU_PHP ]; then + wget -O icu-src.tgz http://download.icu-project.org/files/icu4c/$ICU_VERSION/icu4c-$(echo $ICU_VERSION | tr '.' 
'_')-src.tgz + mkdir icu-src && tar xzf icu-src.tgz -C icu-src --strip-components=1 + pushd icu-src/source + ./configure --prefix=$ICU_DIR + make && make install + popd + wget -O php-src.tgz http://us1.php.net/get/php-$ICU_PHP_VERSION.tar.gz/from/this/mirror + mkdir php-src && tar xzf php-src.tgz -C php-src --strip-components=1 + pushd php-src + ./configure --prefix=$ICU_PHP_DIR --enable-intl --with-icu-dir=$ICU_DIR + make && make install + popd + fi + $ICU_PHP -r "echo INTL_ICU_VERSION.PHP_EOL;" + $ICU_PHP -r "var_dump((new ReflectionClass('Normalizer'))->getConstants());" + fi - php -i install: @@ -41,3 +65,4 @@ install: script: - ./vendor/bin/simple-phpunit + - if [[ $ICU_PHP ]]; then $ICU_PHP ./vendor/bin/simple-phpunit --filter 'Symfony\\Polyfill\\Tests\\Intl'; fi From eb0bdcc1a0ff81a056a8e2e94e72058e0b22f24a Mon Sep 17 00:00:00 2001 From: Valentin Date: Tue, 19 Feb 2019 13:54:05 +0300 Subject: [PATCH 2/2] Fix some tests --- src/Intl/Normalizer/BaseNormalizer.php | 311 ++++++++++++++++++++++ src/Intl/Normalizer/Normalizer.php | 321 ++++------------------- tests/Intl/Normalizer/NormalizerTest.php | 7 +- 3 files changed, 358 insertions(+), 281 deletions(-) create mode 100644 src/Intl/Normalizer/BaseNormalizer.php diff --git a/src/Intl/Normalizer/BaseNormalizer.php b/src/Intl/Normalizer/BaseNormalizer.php new file mode 100644 index 000000000..3a49cc751 --- /dev/null +++ b/src/Intl/Normalizer/BaseNormalizer.php @@ -0,0 +1,311 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\Polyfill\Intl\Normalizer; + +/** + * Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension. + * + * It has been validated with Unicode 6.3 Normalization Conformance Test. + * See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations. 
+ * + * @author Nicolas Grekas + * + * @internal + */ +class BaseNormalizer +{ + const NONE = 1; + const FORM_D = 2; + const FORM_KD = 3; + const FORM_C = 4; + const FORM_KC = 5; + const NFD = 2; + const NFKD = 3; + const NFC = 4; + const NFKC = 5; + + private static $C; + private static $D; + private static $KD; + private static $cC; + private static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4); + private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"; + + public static function isNormalized($s, $form = self::NFC) + { + if (!static::isFormNormalized($form)) { + return false; + } + + $s = (string) $s; + if (!isset($s[strspn($s, self::$ASCII)])) { + return true; + } + if (static::NFC === $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) { + return true; + } + + return false; // Pretend false as quick checks implemented in PHP won't be so quick + } + + public static function normalize($s, $form = self::NFC) + { + $s = (string) $s; + if (!preg_match('//u', $s)) { + return false; + } + + switch ($form) { + case static::NONE: return $s; + case static::NFC: $C = true; $K = false; break; + case static::NFD: $C = false; $K = false; break; + case static::NFKC: $C = true; $K = true; break; + case static::NFKD: $C = false; $K = true; break; + default: return false; + } + + if ('' === $s) { + return ''; + } + + if ($K && null === self::$KD) { + self::$KD = self::getData('compatibilityDecomposition'); + } + + if (null === self::$D) { + self::$D = 
self::getData('canonicalDecomposition'); + self::$cC = self::getData('combiningClass'); + } + + if (null !== $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null) { + mb_internal_encoding('8bit'); + } + + $r = self::decompose($s, $K); + + if ($C) { + if (null === self::$C) { + self::$C = self::getData('canonicalComposition'); + } + + $r = self::recompose($r); + } + if (null !== $mbEncoding) { + mb_internal_encoding($mbEncoding); + } + + return $r; + } + + protected static function isFormNormalized($form) + { + return $form > static::NONE && $form <= static::NFKC; + } + + private static function recompose($s) + { + $ASCII = self::$ASCII; + $compMap = self::$C; + $combClass = self::$cC; + $ulenMask = self::$ulenMask; + + $result = $tail = ''; + + $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"]; + $len = \strlen($s); + + $lastUchr = substr($s, 0, $i); + $lastUcls = isset($combClass[$lastUchr]) ? 256 : 0; + + while ($i < $len) { + if ($s[$i] < "\x80") { + // ASCII chars + + if ($tail) { + $lastUchr .= $tail; + $tail = ''; + } + + if ($j = strspn($s, $ASCII, $i + 1)) { + $lastUchr .= substr($s, $i, $j); + $i += $j; + } + + $result .= $lastUchr; + $lastUchr = $s[$i]; + $lastUcls = 0; + ++$i; + continue; + } + + $ulen = $ulenMask[$s[$i] & "\xF0"]; + $uchr = substr($s, $i, $ulen); + + if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr + || $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr + || $lastUcls) { + // Table lookup and combining chars composition + + $ucls = isset($combClass[$uchr]) ? 
$combClass[$uchr] : 0; + + if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) { + $lastUchr = $compMap[$lastUchr.$uchr]; + } elseif ($lastUcls = $ucls) { + $tail .= $uchr; + } else { + if ($tail) { + $lastUchr .= $tail; + $tail = ''; + } + + $result .= $lastUchr; + $lastUchr = $uchr; + } + } else { + // Hangul chars + + $L = \ord($lastUchr[2]) - 0x80; + $V = \ord($uchr[2]) - 0xA1; + $T = 0; + + $uchr = substr($s, $i + $ulen, 3); + + if ("\xE1\x86\xA7" <= $uchr && $uchr <= "\xE1\x87\x82") { + $T = \ord($uchr[2]) - 0xA7; + 0 > $T && $T += 0x40; + $ulen += 3; + } + + $L = 0xAC00 + ($L * 21 + $V) * 28 + $T; + $lastUchr = \chr(0xE0 | $L >> 12).\chr(0x80 | $L >> 6 & 0x3F).\chr(0x80 | $L & 0x3F); + } + + $i += $ulen; + } + + return $result.$lastUchr.$tail; + } + + private static function decompose($s, $c) + { + $result = ''; + + $ASCII = self::$ASCII; + $decompMap = self::$D; + $combClass = self::$cC; + $ulenMask = self::$ulenMask; + if ($c) { + $compatMap = self::$KD; + } + + $c = array(); + $i = 0; + $len = \strlen($s); + + while ($i < $len) { + if ($s[$i] < "\x80") { + // ASCII chars + + if ($c) { + ksort($c); + $result .= implode('', $c); + $c = array(); + } + + $j = 1 + strspn($s, $ASCII, $i + 1); + $result .= substr($s, $i, $j); + $i += $j; + continue; + } + + $ulen = $ulenMask[$s[$i] & "\xF0"]; + $uchr = substr($s, $i, $ulen); + $i += $ulen; + + if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) { + // Table lookup + + if ($uchr !== $j = isset($compatMap[$uchr]) ? $compatMap[$uchr] : (isset($decompMap[$uchr]) ? $decompMap[$uchr] : $uchr)) { + $uchr = $j; + + $j = \strlen($uchr); + $ulen = $uchr[0] < "\x80" ? 
1 : $ulenMask[$uchr[0] & "\xF0"]; + + if ($ulen != $j) { + // Put trailing chars in $s + + $j -= $ulen; + $i -= $j; + + if (0 > $i) { + $s = str_repeat(' ', -$i).$s; + $len -= $i; + $i = 0; + } + + while ($j--) { + $s[$i + $j] = $uchr[$ulen + $j]; + } + + $uchr = substr($uchr, 0, $ulen); + } + } + if (isset($combClass[$uchr])) { + // Combining chars, for sorting + + if (!isset($c[$combClass[$uchr]])) { + $c[$combClass[$uchr]] = ''; + } + $c[$combClass[$uchr]] .= $uchr; + continue; + } + } else { + // Hangul chars + + $uchr = unpack('C*', $uchr); + $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80; + + $uchr = "\xE1\x84".\chr(0x80 + (int) ($j / 588)) + ."\xE1\x85".\chr(0xA1 + (int) (($j % 588) / 28)); + + if ($j %= 28) { + $uchr .= $j < 25 + ? ("\xE1\x86".\chr(0xA7 + $j)) + : ("\xE1\x87".\chr(0x67 + $j)); + } + } + if ($c) { + ksort($c); + $result .= implode('', $c); + $c = array(); + } + + $result .= $uchr; + } + + if ($c) { + ksort($c); + $result .= implode('', $c); + } + + return $result; + } + + private static function getData($file) + { + if (file_exists($file = __DIR__.'/Resources/unidata/'.$file.'.php')) { + return require $file; + } + + return false; + } +} diff --git a/src/Intl/Normalizer/Normalizer.php b/src/Intl/Normalizer/Normalizer.php index a4fea7e03..af65bea2c 100644 --- a/src/Intl/Normalizer/Normalizer.php +++ b/src/Intl/Normalizer/Normalizer.php @@ -14,292 +14,61 @@ /** * Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension. * - * It has been validated with Unicode 6.3 Normalization Conformance Test. - * See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations. + * Since PHP 7.3, the Normalizer implementation depends on the ICU version. + * See https://github.com/php/php-src/blob/3fa88e0ce0ffd9f63672afe114158a07a0204e21/ext/intl/normalizer/normalizer.h#L22 for details. + * This class auto-adapts to the PHP and ICU versions. 
* * @author Nicolas Grekas + * @author Valentin Udaltsov * * @internal */ -class Normalizer -{ - const NONE = 1; - const FORM_D = 2; - const FORM_KD = 3; - const FORM_C = 4; - const FORM_KC = 5; - const NFD = 2; - const NFKD = 3; - const NFC = 4; - const NFKC = 5; - - private static $C; - private static $D; - private static $KD; - private static $cC; - private static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4); - private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"; - - public static function isNormalized($s, $form = self::NFC) +if (version_compare(PHP_VERSION, '7.3', '>=') && defined('INTL_ICU_VERSION') && version_compare(INTL_ICU_VERSION, '56', '>=')) { + class Normalizer extends BaseNormalizer { - if ($form <= self::NONE || self::NFKC < $form) { - return false; - } - $s = (string) $s; - if (!isset($s[strspn($s, self::$ASCII)])) { - return true; - } - if (self::NFC === $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) { - return true; - } - - return false; // Pretend false as quick checks implementented in PHP won't be so quick - } - - public static function normalize($s, $form = self::NFC) - { - $s = (string) $s; - if (!preg_match('//u', $s)) { - return false; - } - - switch ($form) { - case self::NONE: return $s; - case self::NFC: $C = true; $K = false; break; - case self::NFD: $C = false; $K = false; break; - case self::NFKC: $C = true; $K = true; break; - case self::NFKD: $C = false; $K = true; break; - default: return false; - } - - 
if ('' === $s) { - return ''; - } - - if ($K && null === self::$KD) { - self::$KD = self::getData('compatibilityDecomposition'); - } - - if (null === self::$D) { - self::$D = self::getData('canonicalDecomposition'); - self::$cC = self::getData('combiningClass'); - } - - if (null !== $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null) { - mb_internal_encoding('8bit'); - } - - $r = self::decompose($s, $K); - - if ($C) { - if (null === self::$C) { - self::$C = self::getData('canonicalComposition'); + const NONE = 2; + const FORM_D = 4; + const FORM_KD = 8; + const FORM_C = 16; + const FORM_KC = 32; + const FORM_KC_CF = 48; + const NFD = 4; + const NFKD = 8; + const NFC = 16; + const NFKC = 32; + const NFKC_CF = 48; + + /** + * Override method to use new $form default value. + */ + public static function isNormalized($s, $form = self::NFC) + { + return parent::isNormalized($s, $form); + } + + /** + * Override method to use new $form default value. + */ + public static function normalize($s, $form = self::NFC) + { + return parent::normalize($s, $form); + } + + /** + * {@inheritdoc} + */ + protected static function isFormNormalized($form) + { + if ($form <= static::NONE || $form > static::NFKC_CF) { + return false; } - $r = self::recompose($r); - } - if (null !== $mbEncoding) { - mb_internal_encoding($mbEncoding); + // check $form is a power of two + return 0 === ($form & ($form - 1)); } - - return $r; } - - private static function recompose($s) +} else { + class Normalizer extends BaseNormalizer { - $ASCII = self::$ASCII; - $compMap = self::$C; - $combClass = self::$cC; - $ulenMask = self::$ulenMask; - - $result = $tail = ''; - - $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"]; - $len = \strlen($s); - - $lastUchr = substr($s, 0, $i); - $lastUcls = isset($combClass[$lastUchr]) ? 
256 : 0; - - while ($i < $len) { - if ($s[$i] < "\x80") { - // ASCII chars - - if ($tail) { - $lastUchr .= $tail; - $tail = ''; - } - - if ($j = strspn($s, $ASCII, $i + 1)) { - $lastUchr .= substr($s, $i, $j); - $i += $j; - } - - $result .= $lastUchr; - $lastUchr = $s[$i]; - $lastUcls = 0; - ++$i; - continue; - } - - $ulen = $ulenMask[$s[$i] & "\xF0"]; - $uchr = substr($s, $i, $ulen); - - if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr - || $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr - || $lastUcls) { - // Table lookup and combining chars composition - - $ucls = isset($combClass[$uchr]) ? $combClass[$uchr] : 0; - - if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) { - $lastUchr = $compMap[$lastUchr.$uchr]; - } elseif ($lastUcls = $ucls) { - $tail .= $uchr; - } else { - if ($tail) { - $lastUchr .= $tail; - $tail = ''; - } - - $result .= $lastUchr; - $lastUchr = $uchr; - } - } else { - // Hangul chars - - $L = \ord($lastUchr[2]) - 0x80; - $V = \ord($uchr[2]) - 0xA1; - $T = 0; - - $uchr = substr($s, $i + $ulen, 3); - - if ("\xE1\x86\xA7" <= $uchr && $uchr <= "\xE1\x87\x82") { - $T = \ord($uchr[2]) - 0xA7; - 0 > $T && $T += 0x40; - $ulen += 3; - } - - $L = 0xAC00 + ($L * 21 + $V) * 28 + $T; - $lastUchr = \chr(0xE0 | $L >> 12).\chr(0x80 | $L >> 6 & 0x3F).\chr(0x80 | $L & 0x3F); - } - - $i += $ulen; - } - - return $result.$lastUchr.$tail; - } - - private static function decompose($s, $c) - { - $result = ''; - - $ASCII = self::$ASCII; - $decompMap = self::$D; - $combClass = self::$cC; - $ulenMask = self::$ulenMask; - if ($c) { - $compatMap = self::$KD; - } - - $c = array(); - $i = 0; - $len = \strlen($s); - - while ($i < $len) { - if ($s[$i] < "\x80") { - // ASCII chars - - if ($c) { - ksort($c); - $result .= implode('', $c); - $c = array(); - } - - $j = 1 + strspn($s, $ASCII, $i + 1); - $result .= substr($s, $i, $j); - $i += $j; - continue; - } - - $ulen = $ulenMask[$s[$i] & "\xF0"]; - $uchr = substr($s, $i, $ulen); - $i += 
$ulen; - - if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) { - // Table lookup - - if ($uchr !== $j = isset($compatMap[$uchr]) ? $compatMap[$uchr] : (isset($decompMap[$uchr]) ? $decompMap[$uchr] : $uchr)) { - $uchr = $j; - - $j = \strlen($uchr); - $ulen = $uchr[0] < "\x80" ? 1 : $ulenMask[$uchr[0] & "\xF0"]; - - if ($ulen != $j) { - // Put trailing chars in $s - - $j -= $ulen; - $i -= $j; - - if (0 > $i) { - $s = str_repeat(' ', -$i).$s; - $len -= $i; - $i = 0; - } - - while ($j--) { - $s[$i + $j] = $uchr[$ulen + $j]; - } - - $uchr = substr($uchr, 0, $ulen); - } - } - if (isset($combClass[$uchr])) { - // Combining chars, for sorting - - if (!isset($c[$combClass[$uchr]])) { - $c[$combClass[$uchr]] = ''; - } - $c[$combClass[$uchr]] .= $uchr; - continue; - } - } else { - // Hangul chars - - $uchr = unpack('C*', $uchr); - $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80; - - $uchr = "\xE1\x84".\chr(0x80 + (int) ($j / 588)) - ."\xE1\x85".\chr(0xA1 + (int) (($j % 588) / 28)); - - if ($j %= 28) { - $uchr .= $j < 25 - ? 
("\xE1\x86".\chr(0xA7 + $j)) - : ("\xE1\x87".\chr(0x67 + $j)); - } - } - if ($c) { - ksort($c); - $result .= implode('', $c); - $c = array(); - } - - $result .= $uchr; - } - - if ($c) { - ksort($c); - $result .= implode('', $c); - } - - return $result; - } - - private static function getData($file) - { - if (file_exists($file = __DIR__.'/Resources/unidata/'.$file.'.php')) { - return require $file; - } - - return false; } } diff --git a/tests/Intl/Normalizer/NormalizerTest.php b/tests/Intl/Normalizer/NormalizerTest.php index 86bcf4b92..26d7de4e6 100644 --- a/tests/Intl/Normalizer/NormalizerTest.php +++ b/tests/Intl/Normalizer/NormalizerTest.php @@ -64,9 +64,6 @@ public function testIsNormalized() */ public function testNormalize() { - $c = in::normalize('déjà', pn::NFC).in::normalize('훈쇼™', pn::NFD); - $this->assertSame($c, normalizer_normalize($c, pn::NONE)); - $c = 'déjà 훈쇼™'; $d = in::normalize($c, pn::NFD); $kc = in::normalize($c, pn::NFKC); @@ -99,8 +96,8 @@ public function testNormalizeConformance() $t = explode(';', $t[0]); if (6 === \count($t)) { - foreach ($t as $k => $s) { - $t = explode(' ', $s); + foreach ($t as $k => $ss) { + $t = explode(' ', $ss); $t = array_map('hexdec', $t); $t = array_map(__CLASS__.'::chr', $t); $c[$k] = implode('', $t);