array());
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
    if (is_file($file)) {
      include($file);
    }
    $base = $UTF8_TO_ASCII[$bank];
    // For unknown characters, these files have '[?]' in them. Replace with
    // NULL for compatibility with our data.
    $base = array_map('_replace_question_with_null', $base);
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the CPAN Text::Unidecode data set.
 *
 * The data is expected to be in files xNN.pm in directory 'Unidecode' under
 * this file's directory. It can be downloaded from
 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
 *
 * Banks whose file does not exist are returned as empty arrays.
 *
 * @return
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_cpan_data() {
  $dir = __DIR__ . '/Unidecode';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.pm';
    if (is_file($file)) {
      $base = _cpan_read_file($file);
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the data in a single file from the Text::Unidecode CPAN project.
 *
 * The Perl source is rewritten line by line into a PHP array literal, which
 * is then evaluated. If the file cannot be read or the generated code cannot
 * be evaluated, a diagnostic is printed and the script exits.
 *
 * @param $file
 *   Path of the xNN.pm file to read.
 *
 * @return
 *   Array of transliterations, in the order they appear in the file. Unknown
 *   characters ('[?]' in the source) are NULL.
 */
function _cpan_read_file($file) {
  $contents = file($file);
  if ($contents === FALSE) {
    // file() returns FALSE on failure; iterating over that would silently
    // produce an empty data set.
    print "Problem in reading $file\n";
    exit();
  }

  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with # or $. The first line seems to have a
    // comment starting with #, the second has a Perl line like
    // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
    if (preg_match('|^\s*[#\$]|', $line)) {
      continue;
    }

    // Discard lines ending with semi-colons, which we also don't want
    // (there seem to be two of these lines at the end of the files).
    if (preg_match('|;\s*$|', $line)) {
      continue;
    }

    // Replace '[?]' (which means "don't know how to transliterate") with a
    // literal NULL in the generated PHP code.
    $line = str_replace("'[?]'", 'NULL', $line);

    // Replace qq{} with either "" or '' or nothing, depending on what is
    // inside it. The special cases must run before the generic patterns.
    $line = str_replace('qq{\{}', "'{'", $line);
    $line = str_replace('qq{\}}', "'}'", $line);
    $line = str_replace('qq{\} }', "'} '", $line);
    $line = str_replace("qq{\\\\}", '"\\\\"', $line);
    // NOTE(review): this turns a backslash directly after 'qq{' into a single
    // quote; the intent is not evident from this file -- confirm against the
    // CPAN data before changing it.
    $line = str_replace("qq{\\", "qq{'", $line);
    $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
    // Contents without a single quote become single-quoted strings; anything
    // left over becomes a double-quoted string.
    $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
    $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);

    $save .= $line;
  }

  // Now we should have a string that looks like:
  // 'a', 'b', ...
  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  $data = NULL;
  try {
    $data = @eval($save);
  }
  catch (ParseError $e) {
    // As of PHP 7, a syntax error in eval() throws ParseError, which the @
    // operator cannot suppress. Report it the same way as other failures.
    print "Problem in evaluating $file\n";
    print $save;
    print "\n" . $e->getMessage() . "\n";
    exit();
  }

  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The code is evaluated
    // again without error suppression so that PHP shows its own message.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  return $data;
}

/**
 * Reads in the Node.js transliteration data.
 *
 * The data is expected to be in files xNN.yml in directory unidecoder_data
 * under the directory where this file resides. It can be downloaded from
 * https://github.com/bitwalker/stringex/downloads. You also need the PECL
 * 'yaml' extension installed for this function to work.
 *
 * Banks whose file does not exist are returned as empty arrays. If a file
 * exists but cannot be parsed into an array, a diagnostic is printed and the
 * script exits.
 *
 * @return
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_nodejs_data() {
  $dir = __DIR__ . '/unidecoder_data';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
    if (is_file($file)) {
      $base = yaml_parse_file($file);
      if (!is_array($base)) {
        // yaml_parse_file() returns FALSE on failure, which array_map()
        // cannot process.
        print "Problem in parsing $file\n";
        exit();
      }
      // For unknown characters, these files have '[?]' in them. Replace with
      // NULL for compatibility with our data.
      $base = array_map('_replace_question_with_null', $base);
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Loads the PECL intl Transliterator class's transliteration data.
 *
 * You need to have the PECL 'intl' package installed for this to work.
*
 * Note that this does a VERY LIMITED transliteration. See comments in the code.
 * I don't recommend using this function.
 *
 * If the transliterator cannot be created, a diagnostic is printed and the
 * script exits.
 *
 * @return
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_intl_data() {
  // In order to transliterate, you first have to create a transliterator
  // object. This needs a list of transliteration operations. You can get a
  // list of available operations with:
  //   print_r(Transliterator::listIDs()); exit();
  // And a few of these are documented on
  // http://userguide.icu-project.org/transforms/general and
  // http://www.unicode.org/reports/tr15/ (for normalizations).
  $ops = '';

  // Separate out accents.
  $ops .= 'NFD; ';

  // Then you need to do a bunch of language-specific or script-specific
  // transliterations. But I am not sure which ones... Here is a minimal set,
  // but it doesn't cover anywhere near all of Unicode. And there may be more
  // up-to-date transforms available from unicode.org at:
  // http://cldr.unicode.org/index/downloads -- I think the ICU bases its
  // data on these downloads, but I have no idea which version might be
  // compiled into the 'intl' PECL install that I downloaded.
  $ops .= 'Cyrillic-Latin; ';
  $ops .= 'Greek-Latin; ';
  $ops .= 'Hangul-Latin; ';
  $ops .= 'Hiragana-Latin; ';
  $ops .= 'Katakana-Latin; ';
  // @todo Add more scripts/languages to transform.

  // Remove any remaining accents and recompose.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  $trans = Transliterator::create($ops);
  if (!$trans) {
    // Transliterator::create() returns NULL on failure; calling a method on
    // that would be a fatal error with no useful explanation.
    print "Problem in creating transliterator: " . intl_get_error_message() . "\n";
    exit();
  }

  $out = array();

  // Transliterate all possible characters.
  for ($bank = 0; $bank < 256; $bank++) {
    $data = array();
    for ($chr = 0; $chr < 256; $chr++) {
      // Build the character from its 16-bit code point: pack it as a
      // big-endian 16-bit value and convert from UTF-16BE to UTF-8.
      $character = mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE');
      $data[$chr] = $trans->transliterate($character);
    }
    $out[$bank] = $data;
  }

  return $out;
}

/**
 * Reads in the JUnidecode data set.
*
 * The data is expected to be in files XNN.java in directory 'junidecode' under
 * this file's directory. It can be downloaded from
 * http://www.ippatsuman.com/projects/junidecode/index.html
 *
 * Banks whose file does not exist are returned as empty arrays.
 *
 * @return
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_junidecode_data() {
  $dir = __DIR__ . '/junidecode';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/X' . sprintf('%02x', $bank) . '.java';
    if (is_file($file)) {
      $base = _junidecode_read_file($file);
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the data in a single file from the JUnidecode project.
 *
 * The Java source is reduced line by line to a PHP array literal, which is
 * then evaluated. If the file cannot be read or the generated code cannot be
 * evaluated, a diagnostic is printed and the script exits.
 *
 * @param $file
 *   Path of the XNN.java file to read.
 *
 * @return
 *   Array of transliterations, in the order they appear in the file. Unknown
 *   characters ('[?]' in the source) are NULL.
 */
function _junidecode_read_file($file) {
  $contents = file($file);
  if ($contents === FALSE) {
    // file() returns FALSE on failure; iterating over that would silently
    // produce an empty data set.
    print "Problem in reading $file\n";
    exit();
  }

  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with * or / or package or class or public or },
    // to get rid of comments and Java code.
    if (preg_match('|^\s*[\*/\}]|', $line)) {
      continue;
    }
    // The alternatives are grouped so that the start-of-line anchor applies
    // to all of them; otherwise a data line merely containing "public" or
    // "class" would be thrown away.
    if (preg_match('/^\s*(package|public|class)/', $line)) {
      continue;
    }

    // Some of the lines look like this:
    //   new String("" + (char) 0x00), // 0x00
    // Transform to be '0x00,'. The digits are hexadecimal, so a-f must be
    // accepted as well.
    $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9a-fA-F]+).*$|', '0x$1,', $line);

    // Strings are in double quotes, yet many have \' in them.
    $line = str_replace("\'", "'", $line);

    // Everything else should probably be OK -- the lines are like:
    //   "Ie", // 0x00
    $save .= $line;
  }

  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  $data = NULL;
  try {
    $data = @eval($save);
  }
  catch (ParseError $e) {
    // As of PHP 7, a syntax error in eval() throws ParseError, which the @
    // operator cannot suppress. Report it the same way as other failures.
    print "Problem in evaluating $file\n";
    print $save;
    print "\n" . $e->getMessage() . "\n";
    exit();
  }

  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
    $data = array_map('_replace_question_with_null', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The code is evaluated
    // again without error suppression so that PHP shows its own message.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  return $data;
}

/**
 * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
 *
 * The comparison is strict: with a loose comparison, PHP versions before 8
 * treat the integer 0 as equal to the non-numeric string '[?]'.
 *
 * @param $data
 *   A single transliteration value.
 *
 * @return
 *   NULL if $data is exactly the string '[?]', otherwise $data unchanged.
 */
function _replace_question_with_null($data) {
  return ($data === '[?]') ? NULL : $data;
}

/**
 * Callback for array_map(): Replaces '\xNN' with the actual character.
 *
 * @param $item
 *   A single transliteration value.
 *
 * @return
 *   If $item is a string starting with a literal backslash-x, the string with
 *   its C-style escape sequences decoded; otherwise $item unchanged.
 */
function _replace_hex_with_character($item) {
  // Non-string values (NULL, integers) cannot carry an escape sequence.
  if (is_string($item) && strpos($item, '\x') === 0) {
    // stripcslashes() decodes \xNN escapes without evaluating the value as
    // PHP code (a bare '\xNN' is not a valid PHP statement).
    $item = stripcslashes($item);
  }
  return $item;
}