From c01607a515a7e98966d7b627150cccc438543e70 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 17 Apr 2007 17:31:50 +0000 Subject: [PATCH] IDNA update --- idn/ReadMe.txt | 38 ++-- idn/idna_convert.class.php | 424 ++++++++++++++++++------------------- 2 files changed, 222 insertions(+), 240 deletions(-) diff --git a/idn/ReadMe.txt b/idn/ReadMe.txt index 8f749d970..7ca8c7e6d 100644 --- a/idn/ReadMe.txt +++ b/idn/ReadMe.txt @@ -2,9 +2,10 @@ * * * IDNA Convert (idna_convert.class.php) * * * -* http://idnaconv.phlymail.de mailto:team@phlymail.de * +* http://idnaconv.phlymail.de mailto:phlymail@phlylabs.de * ******************************************************************************* -* (c) 2004-2005 phlyLabs, Berlin * +* (c) 2004-2007 phlyLabs, Berlin * +* This file is encoded in UTF-8 * ******************************************************************************* Introduction @@ -20,34 +21,36 @@ what you would expect them to do. You are allowed to use complete domain names, simple strings and complete email addresses as well. That means, that you might use any of the following notations: -- www.nörgler.com +- www.nörgler.com - xn--nrgler-wxa - xn--brse-5qa.xn--knrz-1ra.info Errors, incorrectly encoded or invalid strings will lead to either a FALSE response (when in strict mode) or to only partially converted strings. -You can query the occured error by calling the method get_last_error() when -using the PHP4 version or through exceptions when the PHP5 version is used. +You can query the occurred error by calling the method get_last_error(). Unicode strings are expected to be either UTF-8 strings, UCS-4 strings or UCS-4 arrays. The default format is UTF-8. For setting different encodings, you can call the method setParams() - please see the inline documentation for details. ACE strings (the Punycode form) are always 7bit ASCII strings. +ATTENTION: We no longer supply the PHP5 version of the class. 
It is not +necessary for achieving a successful conversion, since the supplied PHP code is +compatible with both PHP4 and PHP5. We expect to see no compatibility issues +with the upcoming PHP6, too. + Files ----- idna_convert.class.php - The actual class -idna_convert.class.php5.php - A PHP5 version, contributed by Marcus Nix idna_convert.create.npdata.php - Useful for (re)creating the NPData file npdata.ser - Serialized data for NamePrep example.php - An example web page for converting ReadMe.txt - This file LICENCE - The LGPL licence file -For using the class, you will have to either use idna_convert.class.php or -idna_convert.class.php5.php from your application. +The class is contained in idna_convert.class.php. MAKE SURE to copy the npdata.ser file into the same folder as the class file itself! @@ -55,23 +58,19 @@ itself! Examples -------- -1. Say we wish to encode the domain name nörgler.com: +1. Say we wish to encode the domain name nörgler.com: // Include the class include_once('idna_convert.class.php'); // Instantiate it * $IDN = new idna_convert(); // The input string, if input is not UTF-8 or UCS-4, it must be converted before -$input = utf8_encode('nörgler.com'); +$input = utf8_encode('nörgler.com'); // Encode it to its punycode presentation $output = $IDN->encode($input); // Output, what we got now echo $output; // This will read: xn--nrgler-wxa.com -* If you wish to use the PHP5 version of the class, be aware, that the constructor - is named Net_IDNA_php5() since this file is used in the PEAR version of this class. - Likeweise, you can also instantiate the PHP4 version with new Net_IDNA_php4(). - 2. 
We received an email from a punycoded domain and are willing to learn, how the domain name reads originally @@ -79,14 +78,14 @@ echo $output; // This will read: xn--nrgler-wxa.com // Include the class include_once('idna_convert.class.php'); // Instantiate it (depending on the version you are using) with -$IDN = new Net_IDNA_php4(); +$IDN = new idna_convert(); // The input string $input = 'andre@xn--brse-5qa.xn--knrz-1ra.info'; // Encode it to its punycode presentation $output = $IDN->decode($input); // Output, what we got now, if output should be in a format different to UTF-8 // or UCS-4, you will have to convert it before outputting it -echo utf8_decode($output); // This will read: andre@börse.knürz.info +echo utf8_decode($output); // This will read: andre@börse.knürz.info 3. The input is read from a UCS-4 coded file and encoded line by line. By @@ -96,7 +95,7 @@ echo utf8_decode($output); // This will read: andre@b // Include the class include_once('idna_convert.class.php'); // Instantiate it -$IDN = new Net_IDNA_php4(); +$IDN = new idna_convert(); // Iterate through the input file line by line foreach (file('ucs4-domains.txt') as $line) { echo $IDN->encode(trim($line), 'ucs4_string'); @@ -119,5 +118,6 @@ Contact us In case of errors, bugs, questions, wishes, please don't hesitate to contact us under the email address above. 
-The team of -phlymail.de \ No newline at end of file +The team of phlyLabs +http://phlylabs.de +mailto:phlymail@phlylabs.de \ No newline at end of file diff --git a/idn/idna_convert.class.php b/idn/idna_convert.class.php index b2f5e9806..ed2bae26d 100644 --- a/idn/idna_convert.class.php +++ b/idn/idna_convert.class.php @@ -1,11 +1,4 @@ - * @version 0.4.3 + * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de + * @version 0.5.1 * */ - class idna_convert { - // {{{ npdata /** * Holds all relevant mapping tables, loaded from a seperate file on construct * See RFC3454 for details @@ -68,9 +60,7 @@ class idna_convert * @var array * @access private */ - var $_np_ = array(); - // }}} - + var $NP = array(); // Internal settings, do not mess with them var $_punycode_prefix = 'xn--'; @@ -86,7 +76,7 @@ class idna_convert var $_sbase = 0xAC00; var $_lbase = 0x1100; var $_vbase = 0x1161; - var $_tbase = 0x11a7; + var $_tbase = 0x11A7; var $_lcount = 19; var $_vcount = 21; var $_tcount = 28; @@ -94,8 +84,8 @@ class idna_convert var $_scount = 11172; // _lcount * _tcount * _vcount var $_error = false; - // See set_parameter() for details of how to change the following settings - // from within your script / application + // See {@link set_paramter()} for details of how to change the following + // settings from within your script / application var $_api_encoding = 'utf8'; // Default input charset is UTF-8 var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden var $_strict_mode = false; // Behave strict or not @@ -105,9 +95,9 @@ function idna_convert($options = false) { $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount; if (function_exists('file_get_contents')) { - $this->_np_ = unserialize(file_get_contents(dirname(__FILE__).'/npdata.ser')); + $this->NP = unserialize(file_get_contents(dirname(__FILE__).'/npdata.ser')); } else { - $this->_np_ = unserialize(join('', file(dirname(__FILE__).'/npdata.ser'))); + $this->NP = 
unserialize(join('', file(dirname(__FILE__).'/npdata.ser'))); } // If parameters are given, pass these to the respective method if (is_array($options)) { @@ -117,21 +107,21 @@ function idna_convert($options = false) } /** - * Sets a new option value. Available options and values: - * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8, - * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8] - * [overlong - Unicode does not allow unnecessarily long encodings of chars, - * to allow this, set this parameter to true, else to false; - * default is false.] - * [strict - true: strict mode, good for registration purposes - Causes errors - * on failures; false: loose mode, ideal for "wildlife" applications - * by silently ignoring errors and returning the original input instead - * - * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs) - * @param string Value to use (if parameter 1 is a string) - * @return boolean true on success, false otherwise - * @access public - */ + * Sets a new option value. Available options and values: + * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8, + * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8] + * [overlong - Unicode does not allow unnecessarily long encodings of chars, + * to allow this, set this parameter to true, else to false; + * default is false.] 
+ * [strict - true: strict mode, good for registration purposes - Causes errors + * on failures; false: loose mode, ideal for "wildlife" applications + * by silently ignoring errors and returning the original input instead + * + * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs) + * @param string Value to use (if parameter 1 is a string) + * @return boolean true on success, false otherwise + * @access public + */ function set_parameter($option, $value = false) { if (!is_array($option)) { @@ -166,12 +156,12 @@ function set_parameter($option, $value = false) } /** - * Decode a given ACE domain name - * @param string Domain name (ACE string) - * [@param string Desired output encoding, see {@link set_parameter}] - * @return string Decoded Domain name (UTF-8 or UCS-4) - * @access public - */ + * Decode a given ACE domain name + * @param string Domain name (ACE string) + * [@param string Desired output encoding, see {@link set_parameter}] + * @return string Decoded Domain name (UTF-8 or UCS-4) + * @access public + */ function decode($input, $one_time_encoding = false) { // Optionally set @@ -189,7 +179,7 @@ function decode($input, $one_time_encoding = false) // Make sure to drop any newline characters around $input = trim($input); - // Negotiate input and try to determine, wether it is a plain string, + // Negotiate input and try to determine, whether it is a plain string, // an email address or something like a complete URL if (strpos($input, '@')) { // Maybe it is an email address // No no in strict mode @@ -197,13 +187,24 @@ function decode($input, $one_time_encoding = false) $this->_error('Only simple domain name parts can be handled in strict mode'); return false; } - list($email_pref, $input) = explode('@', $input, 2); + list ($email_pref, $input) = explode('@', $input, 2); $arr = explode('.', $input); foreach ($arr as $k => $v) { - $conv = $this->_decode($v); - if ($conv) $arr[$k] = $conv; + if 
(preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) { + $conv = $this->_decode($v); + if ($conv) $arr[$k] = $conv; + } } - $return = $email_pref . '@' . join('.', $arr); + $input = join('.', $arr); + $arr = explode('.', $email_pref); + foreach ($arr as $k => $v) { + if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) { + $conv = $this->_decode($v); + if ($conv) $arr[$k] = $conv; + } + } + $email_pref = join('.', $arr); + $return = $email_pref . '@' . $input; } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters) // No no in strict mode if ($this->_strict_mode) { @@ -223,19 +224,20 @@ function decode($input, $one_time_encoding = false) .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@') .$parsed['host'] .(empty($parsed['port']) ? '' : ':'.$parsed['port']) - .$parsed['path'] + .(empty($parsed['path']) ? '' : $parsed['path']) .(empty($parsed['query']) ? '' : '?'.$parsed['query']) .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']); } else { // parse_url seems to have failed, try without it $arr = explode('.', $input); foreach ($arr as $k => $v) { $conv = $this->_decode($v); - if ($conv) $arr[$k] = $conv; + $arr[$k] = ($conv) ? 
$conv : $v; } $return = join('.', $arr); } } else { // Otherwise we consider it being a pure domain name string $return = $this->_decode($input); + if (!$return) $return = $input; } // The output is UTF-8 by default, other output formats need conversion here // If one time encoding is given, use this, else the objects property @@ -256,17 +258,17 @@ function decode($input, $one_time_encoding = false) } /** - * Encode a given UTF-8 domain name - * @param string Domain name (UTF-8 or UCS-4) - * [@param string Desired input encoding, see {@link set_parameter}] - * @return string Encoded Domain name (ACE string) - * @access public - */ + * Encode a given UTF-8 domain name + * @param string Domain name (UTF-8 or UCS-4) + * [@param string Desired input encoding, see {@link set_parameter}] + * @return string Encoded Domain name (ACE string) + * @access public + */ function encode($decoded, $one_time_encoding = false) { // Forcing conversion of input to UCS4 array // If one time encoding is given, use this, else the objects property - switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) { + switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) { case 'utf8': $decoded = $this->_utf8_to_ucs4($decoded); break; @@ -275,8 +277,7 @@ function encode($decoded, $one_time_encoding = false) case 'ucs4_array': break; default: - // $this->_error('Unsupported input format: '.$this->_api_encoding); - $this->_error('Unsupported input format'); + $this->_error('Unsupported input format: '.($one_time_encoding ? 
$one_time_encoding : $this->_api_encoding)); return false; } @@ -294,9 +295,7 @@ function encode($decoded, $one_time_encoding = false) case 0xFF0E: case 0xFF61: $decoded[$k] = 0x2E; - // It's right, no break here - // The codepoints above have to be converted to dots anyway - + // Right, no break here, the above are converted to dots anyway // Stumbling across an anchoring character case 0x2E: case 0x2F: @@ -344,20 +343,20 @@ function encode($decoded, $one_time_encoding = false) } /** - * Use this method to get the last error ocurred - * @param void - * @return string The last error, that occured - * @access public - */ + * Use this method to get the last error ocurred + * @param void + * @return string The last error, that occured + * @access public + */ function get_last_error() { return $this->_error; } /** - * The actual decoding algorithm - * @access private - */ + * The actual decoding algorithm + * @access private + */ function _decode($encoded) { // We do need to find the Punycode prefix @@ -414,9 +413,9 @@ function _decode($encoded) } /** - * The actual encoding algorithm - * @access private - */ + * The actual encoding algorithm + * @access private + */ function _encode($decoded) { // We cannot encode a domain name containing the Punycode prefix @@ -495,7 +494,7 @@ function _encode($decoded) $t = ($k <= $bias) ? $this->_tmin : (($k >= $bias + $this->_tmax) ? 
$this->_tmax : $k - $bias); if ($q < $t) break; - $encoded .= $this->_encode_digit(ceil($t + (($q - $t) % ($this->_base - $t)))); + $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval() $q = (int) (($q - $t) / ($this->_base - $t)); } $encoded .= $this->_encode_digit($q); @@ -512,32 +511,32 @@ function _encode($decoded) } /** - * Adapt the bias according to the current code point and position - * @access private - */ + * Adapt the bias according to the current code point and position + * @access private + */ function _adapt($delta, $npoints, $is_first) { - $delta = (int) ($is_first ? ($delta / $this->_damp) : ($delta / 2)); - $delta += (int) ($delta / $npoints); + $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2)); + $delta += intval($delta / $npoints); for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) { - $delta = (int) ($delta / ($this->_base - $this->_tmin)); + $delta = intval($delta / ($this->_base - $this->_tmin)); } - return (int) ($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew)); + return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew)); } /** - * Encoding a certain digit - * @access private - */ + * Encoding a certain digit + * @access private + */ function _encode_digit($d) { return chr($d + 22 + 75 * ($d < 26)); } /** - * Decode a certain digit - * @access private - */ + * Decode a certain digit + * @access private + */ function _decode_digit($cp) { $cp = ord($cp); @@ -545,20 +544,20 @@ function _decode_digit($cp) } /** - * Internal error handling method - * @access private - */ + * Internal error handling method + * @access private + */ function _error($error = '') { $this->_error = $error; } /** - * Do Nameprep according to RFC3491 and RFC3454 - * @param array Unicode Characters - * @return string Unicode Characters, Nameprep'd - * @access private - */ + * Do Nameprep 
according to RFC3491 and RFC3454 + * @param array Unicode Characters + * @return string Unicode Characters, Nameprep'd + * @access private + */ function _nameprep($input) { $output = array(); @@ -568,18 +567,16 @@ function _nameprep($input) // Walking through the input array, performing the required steps on each of // the input chars and putting the result into the output array // While mapping required chars we apply the cannonical ordering - - // $this->_show_hex($input); foreach ($input as $v) { // Map to nothing == skip that code point - if (in_array($v, $this->_np_['map_nothing'])) continue; + if (in_array($v, $this->NP['map_nothing'])) continue; // Try to find prohibited input - if (in_array($v, $this->_np_['prohibit']) || in_array($v, $this->_np_['general_prohibited'])) { + if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) { $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v)); return false; } - foreach ($this->_np_['prohibit_ranges'] as $range) { + foreach ($this->NP['prohibit_ranges'] as $range) { if ($range[0] <= $v && $v <= $range[1]) { $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v)); return false; @@ -589,15 +586,15 @@ function _nameprep($input) // Hangul syllable decomposition if (0xAC00 <= $v && $v <= 0xD7AF) { foreach ($this->_hangul_decompose($v) as $out) { - $output[] = $out; + $output[] = (int) $out; } // There's a decomposition mapping for that code point - } elseif (isset($this->_np_['replacemaps'][$v])) { - foreach ($this->_apply_cannonical_ordering($this->_np_['replacemaps'][$v]) as $out) { - $output[] = $out; + } elseif (isset($this->NP['replacemaps'][$v])) { + foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) { + $output[] = (int) $out; } } else { - $output[] = $v; + $output[] = (int) $v; } } // Before applying any Combining, try to rearrange any Hangul syllables @@ -631,77 +628,68 @@ function _nameprep($input) continue; } } - if 
(!$class) { // The current class is 0 - $last_starter = $i; - } + // The current class is 0 + if (!$class) $last_starter = $i; $last_class = $class; } return $output; } /** - * Decomposes a Hangul syllable - * (see http://www.unicode.org/unicode/reports/tr15/#Hangul - * @param integer 32bit UCS4 code point - * @return array Either Hangul Syllable decomposed or original 32bit value as one value array - * @access private - */ + * Decomposes a Hangul syllable + * (see http://www.unicode.org/unicode/reports/tr15/#Hangul + * @param integer 32bit UCS4 code point + * @return array Either Hangul Syllable decomposed or original 32bit value as one value array + * @access private + */ function _hangul_decompose($char) { - $sindex = $char - $this->_sbase; + $sindex = (int) $char - $this->_sbase; if ($sindex < 0 || $sindex >= $this->_scount) { return array($char); } $result = array(); - $T = $this->_tbase + ($sindex % $this->_tcount); - $result[] = (int) ($this->_lbase + $sindex / $this->_ncount); - $result[] = (int) $this->_vbase + (($sindex % $this->_ncount) / $this->_tcount); + $result[] = (int) $this->_lbase + $sindex / $this->_ncount; + $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount; + $T = intval($this->_tbase + $sindex % $this->_tcount); if ($T != $this->_tbase) $result[] = $T; return $result; } - /** - * Ccomposes a Hangul syllable - * (see http://www.unicode.org/unicode/reports/tr15/#Hangul - * @param array Decomposed UCS4 sequence - * @return array UCS4 sequence with syllables composed - * @access private - */ + * Ccomposes a Hangul syllable + * (see http://www.unicode.org/unicode/reports/tr15/#Hangul + * @param array Decomposed UCS4 sequence + * @return array UCS4 sequence with syllables composed + * @access private + */ function _hangul_compose($input) { $inp_len = count($input); if (!$inp_len) return array(); $result = array(); - $last = $input[0]; + $last = (int) $input[0]; $result[] = $last; // copy first char from input to output 
for ($i = 1; $i < $inp_len; ++$i) { - $char = $input[$i]; - - // Find out, wether two current characters from L and V + $char = (int) $input[$i]; + $sindex = $last - $this->_sbase; $lindex = $last - $this->_lbase; - if (0 <= $lindex && $lindex < $this->_lcount) { - $vindex = $char - $this->_vbase; - if (0 <= $vindex && $vindex < $this->_vcount) { - // create syllable of form LV - $last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount); - $out_off = count($result) - 1; - $result[$out_off] = $last; // reset last - continue; // discard char - } + $vindex = $char - $this->_vbase; + $tindex = $char - $this->_tbase; + // Find out, whether two current characters are LV and T + if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0) + && 0 <= $tindex && $tindex <= $this->_tcount) { + // create syllable of form LVT + $last += $tindex; + $result[(count($result) - 1)] = $last; // reset last + continue; // discard char } - - // Find out, wether two current characters are LV and T - $sindex = $last - $this->_sbase; - if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0) { - $tindex = $char - $this->_tbase; - if (0 <= $tindex && $tindex <= $this->_tcount) { - // create syllable of form LVT - $last += $tindex; - $out_off = count($result) - 1; - $result[$out_off] = $last; // reset last - continue; // discard char - } + // Find out, whether two current characters form L and V + if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) { + // create syllable of form LV + $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount; + $result[(count($result) - 1)] = $last; // reset last + continue; // discard char } // if neither case was true, just add the character $last = $char; @@ -711,39 +699,39 @@ function _hangul_compose($input) } /** - * Returns the combining class of a certain wide char - * @param integer Wide char to check (32bit integer) - * 
@return integer Combining class if found, else 0 - * @access private - */ + * Returns the combining class of a certain wide char + * @param integer Wide char to check (32bit integer) + * @return integer Combining class if found, else 0 + * @access private + */ function _get_combining_class($char) { - return isset($this->_np_['norm_combcls'][$char]) ? $this->_np_['norm_combcls'][$char] : 0; + return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0; } /** - * Apllies the cannonical ordering of a decomposed UCS4 sequence - * @param array Decomposed UCS4 sequence - * @return array Ordered USC4 sequence - * @access private - */ + * Apllies the cannonical ordering of a decomposed UCS4 sequence + * @param array Decomposed UCS4 sequence + * @return array Ordered USC4 sequence + * @access private + */ function _apply_cannonical_ordering($input) { $swap = true; $size = count($input); while ($swap) { $swap = false; - $last = $this->_get_combining_class($input[0]); - for ($i = 0; $i < $size - 1; ++$i) { - $next = $this->_get_combining_class($input[$i+1]); + $last = $this->_get_combining_class(intval($input[0])); + for ($i = 0; $i < $size-1; ++$i) { + $next = $this->_get_combining_class(intval($input[$i+1])); if ($next != 0 && $last > $next) { // Move item leftward until it fits for ($j = $i + 1; $j > 0; --$j) { - if ($this->_get_combining_class($input[$j - 1]) <= $next) break; - $t = $input[$j]; - $input[$j] = $input[$j - 1]; - $input[$j - 1] = $t; - $swap = 1; + if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break; + $t = intval($input[$j]); + $input[$j] = intval($input[$j-1]); + $input[$j-1] = $t; + $swap = true; } // Reentering the loop looking at the old character again $next = $last; @@ -755,15 +743,15 @@ function _apply_cannonical_ordering($input) } /** - * Do composition of a sequence of starter and non-starter - * @param array UCS4 Decomposed sequence - * @return array Ordered USC4 sequence - * @access private - */ + * Do 
composition of a sequence of starter and non-starter + * @param array UCS4 Decomposed sequence + * @return array Ordered USC4 sequence + * @access private + */ function _combine($input) { $inp_len = count($input); - foreach ($this->_np_['replacemaps'] as $np_src => $np_target) { + foreach ($this->NP['replacemaps'] as $np_src => $np_target) { if ($np_target[0] != $input[0]) continue; if (count($np_target) != $inp_len) continue; $hit = false; @@ -781,22 +769,22 @@ function _combine($input) } /** - * This converts an UTF-8 encoded string to its UCS-4 representation - * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing - * each of the "chars". This is due to PHP not being able to handle strings with - * bit depth different from 8. This apllies to the reverse method _ucs4_to_utf8(), too. - * The following UTF-8 encodings are supported: - * bytes bits representation - * 1 7 0xxxxxxx - * 2 11 110xxxxx 10xxxxxx - * 3 16 1110xxxx 10xxxxxx 10xxxxxx - * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * Each x represents a bit that can be used to store character data. - * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000 - * @access private - */ + * This converts an UTF-8 encoded string to its UCS-4 representation + * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing + * each of the "chars". This is due to PHP not being able to handle strings with + * bit depth different from 8. This apllies to the reverse method _ucs4_to_utf8(), too. 
+ * The following UTF-8 encodings are supported: + * bytes bits representation + * 1 7 0xxxxxxx + * 2 11 110xxxxx 10xxxxxx + * 3 16 1110xxxx 10xxxxxx 10xxxxxx + * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * Each x represents a bit that can be used to store character data. + * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000 + * @access private + */ function _utf8_to_ucs4($input) { $output = array(); @@ -870,10 +858,10 @@ function _utf8_to_ucs4($input) } /** - * Convert UCS-4 string into UTF-8 string - * See _utf8_to_ucs4() for details - * @access private - */ + * Convert UCS-4 string into UTF-8 string + * See _utf8_to_ucs4() for details + * @access private + */ function _ucs4_to_utf8($input) { $output = ''; @@ -907,43 +895,37 @@ function _ucs4_to_utf8($input) } /** - * Convert UCS-4 array into UCS-4 string - * - * @access private - */ + * Convert UCS-4 array into UCS-4 string + * + * @access private + */ function _ucs4_to_ucs4_string($input) { $output = ''; // Take array values and split output to 4 bytes per value // The bit mask is 255, which reads &11111111 foreach ($input as $v) { - $output .= chr(($v >> 24) & 255) - . chr(($v >> 16) & 255) - . chr(($v >> 8) & 255) - . 
chr($v & 255); + $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255); } return $output; } /** - * Convert UCS-4 strin into UCS-4 garray - * - * @access private - */ + * Convert UCS-4 strin into UCS-4 garray + * + * @access private + */ function _ucs4_string_to_ucs4($input) { $output = array(); - $inp_len = strlen($input); // Input length must be dividable by 4 if ($inp_len % 4) { $this->_error('Input UCS4 string is broken'); return false; } - // Empty input - return empty output if (!$inp_len) return $output; - for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) { // Increment output position every 4 input bytes if (!($i % 4)) { @@ -963,21 +945,21 @@ function _ucs4_string_to_ucs4($input) class Net_IDNA_php4 extends idna_convert { /** - * Sets a new option value. Available options and values: - * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8, - * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8] - * [overlong - Unicode does not allow unnecessarily long encodings of chars, - * to allow this, set this parameter to true, else to false; - * default is false.] - * [strict - true: strict mode, good for registration purposes - Causes errors - * on failures; false: loose mode, ideal for "wildlife" applications - * by silently ignoring errors and returning the original input instead - * - * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs) - * @param string Value to use (if parameter 1 is a string) - * @return boolean true on success, false otherwise - * @access public - */ + * Sets a new option value. 
Available options and values: + * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8, + * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8] + * [overlong - Unicode does not allow unnecessarily long encodings of chars, + * to allow this, set this parameter to true, else to false; + * default is false.] + * [strict - true: strict mode, good for registration purposes - Causes errors + * on failures; false: loose mode, ideal for "wildlife" applications + * by silently ignoring errors and returning the original input instead + * + * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs) + * @param string Value to use (if parameter 1 is a string) + * @return boolean true on success, false otherwise + * @access public + */ function setParams($option, $param = false) { return $this->IC->set_parameters($option, $param);