diff --git a/core/includes/common.inc b/core/includes/common.inc
index dece539..13938d3 100644
--- a/core/includes/common.inc
+++ b/core/includes/common.inc
@@ -7435,3 +7435,26 @@ function queue($name, $reliable = FALSE) {
 /**
  * @} End of "defgroup queue".
  */
+
+/**
+ * Transliterates a string using the 'transliteration' container service.
+ *
+ * @param string $string
+ *   The string to transliterate.
+ * @param string $langcode
+ *   (optional) The language code handed to the transliteration factory.
+ *   Defaults to the langcode of the current interface language.
+ * @param string $unknown_character
+ *   (optional) The replacement handed to the factory for characters that
+ *   cannot be transliterated. Defaults to '?'.
+ *
+ * @return string
+ *   The transliterated string.
+ */
+function transliterate($string, $langcode = NULL, $unknown_character = '?') {
+  if (empty($langcode)) {
+    $langcode = language(LANGUAGE_TYPE_INTERFACE)->langcode;
+  }
+
+  return drupal_container()->get('transliteration')->get($langcode, $unknown_character)->transliterate($string);
+}
diff --git a/core/lib/Drupal/Component/Transliteration/IcuTransliteration.php b/core/lib/Drupal/Component/Transliteration/IcuTransliteration.php
new file mode 100644
index 0000000..ed9726f
--- /dev/null
+++ b/core/lib/Drupal/Component/Transliteration/IcuTransliteration.php
@@ -0,0 +1,91 @@
+= 5.4).
+ *
+ * @see http://php.net/manual/en/class.transliterator.php
+ */
+class IcuTransliteration extends Transliteration implements TransliterationInterface {
+
+  /**
+   * Holds a static map of language overrides as an array of strings suitable
+   * for the Transliterator::createFromRules() method.
+   *
+   * @var array
+   */
+  protected static $overrideRules = array();
+
+  /**
+   * Implements TransliterationInterface::transliterate().
+   *
+   * @param string $string
+   *   The string to transliterate.
+   *
+   * @return string
+   *   The input itself when it contains no bytes above 0x7f, otherwise the
+   *   result of the language overrides followed by the ICU transform chain.
+   */
+  public function transliterate($string) {
+    // The parent returns plain-ASCII input untouched and returns nothing for
+    // anything else. Use that result instead of discarding it, so that ASCII
+    // strings really do skip the Transliterator objects.
+    $unchanged = parent::transliterate($string);
+    if (isset($unchanged)) {
+      return $unchanged;
+    }
+
+    // Compile the override rules for all languages once. Checking the whole
+    // table, not just the current language's entry, keeps languages without
+    // overrides from triggering a full rebuild on every call.
+    if (empty(self::$overrideRules)) {
+      foreach (self::$languageOverrides as $langcode => $overrides) {
+        $rule = '';
+        foreach ($overrides as $ord => $override) {
+          $rule .= '\u' . sprintf("%04s", dechex($ord)) . ' > ' . $override . '; ';
+        }
+        self::$overrideRules[$langcode] = $rule;
+      }
+    }
+
+    // Apply the language overrides first. Languages with no rules (or an
+    // empty rule string) have nothing to compile and are skipped.
+    if (!empty(self::$overrideRules[$this->langcode])) {
+      $transliterator = Transliterator::createFromRules(self::$overrideRules[$this->langcode]);
+      $string = $transliterator->transliterate($string);
+    }
+
+    // ICU does not have a 'Any-ASCII' transliterator, so we have to go through
+    // Latin first.
+    $transliterator = Transliterator::create("NFD; [:Nonspacing Mark:] Remove; NFC; Any-Latin; Latin-ASCII;");
+    $ascii_string = $transliterator->transliterate($string);
+
+    // @todo Also, the ICU transliterator behaves quite badly when used with the
+    // rule above and it doesn't transliterate some languages (e.g. Amharic).
+    // We need to either somehow map the incoming string to a supported script,
+    // or do something like this:
+//    $transliterator = Transliterator::create("NFD; [:Nonspacing Mark:] Remove; NFC;
+//      Amharic-Latin/BGN; Arabic-Latin/BGN; Armenian-Latin/BGN;
+//      Azerbaijani-Latin/BGN; Belarusian-Latin/BGN; Bengali-Latin;
+//      Bulgarian-Latin/BGN; Cyrillic-Latin; Devanagari-Latin; Georgian-Latin/BGN;
+//      Greek-Latin/BGN; Gujarati-Latin; Gurmukhi-Latin; Han-Latin; Hangul-Latin;
+//      Hebrew-Latin/BGN; Hiragana-Latin; Jamo-Latin; JapaneseKana-Latin/BGN;
+//      Kannada-Latin; Katakana-Latin; Kazakh-Latin/BGN; Kirghiz-Latin/BGN;
+//      Korean-Latin/BGN; Macedonian-Latin/BGN; Malayalam-Latin; Maldivian-Latin/BGN;
+//      Mongolian-Latin/BGN; NumericPinyin-Latin; Oriya-Latin; Pashto-Latin/BGN;
+//      Persian-Latin/BGN; Russian-Latin/BGN; Serbian-Latin/BGN; Syriac-Latin;
+//      Tamil-Latin; Telugu-Latin; Thaana-Latin; Thai-Latin; Turkmen-Latin/BGN;
+//      Ukrainian-Latin/BGN; Uzbek-Latin/BGN; Any-Latin; Latin-ASCII;");
+
+    return $ascii_string;
+  }
+}
diff --git a/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
new file mode 100644
index 0000000..fdd02a7
--- /dev/null
+++ b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
@@ -0,0 +1,181 @@
+= "\x80" && $c < "\xc0") {
+              // Legal tail bytes are nice.
+              $sequence .= $c;
+            }
+            else {
+              if ($len == 0) {
+                // Premature end of string! Drop a replacement character into
+                // output to represent the invalid UTF-8 sequence.
+                $result .= $this->unknownCharacter;
+                break 2;
+              }
+              else {
+                // Illegal tail byte; abandon the sequence.
+                $result .= $this->unknownCharacter;
+                // Back up and reprocess this byte; it may itself be a legal
+                // ASCII or UTF-8 sequence head.
+                --$i;
+                ++$len;
+                continue 2;
+              }
+            }
+          } while (--$remaining);
+
+          $n = ord($head);
+          if ($n <= 0xdf) {
+            $ord = ($n - 192) * 64 + (ord($sequence[1]) - 128);
+          }
+          elseif ($n <= 0xef) {
+            $ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128);
+          }
+          elseif ($n <= 0xf7) {
+            $ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128);
+          }
+          elseif ($n <= 0xfb) {
+            $ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128);
+          }
+          elseif ($n <= 0xfd) {
+            $ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128);
+          }
+          $result .= $this->replace($ord);
+          $head = '';
+        }
+        elseif ($c < "\x80") {
+          // ASCII byte.
+          $result .= $c;
+          $head = '';
+        }
+        elseif ($c < "\xc0") {
+          // Illegal tail bytes.
+          if ($head == '') {
+            $result .= $this->unknownCharacter;
+          }
+        }
+        else {
+          // Miscellaneous freaks.
+          $result .= $this->unknownCharacter;
+          $head = '';
+        }
+      }
+    }
+
+    return $result;
+  }
+
+  /**
+   * Replaces a Unicode character using the transliteration database.
+   *
+   * @param int $ord
+   *   An ordinal Unicode character code.
+   *
+   * @return string
+   *   ASCII replacement character.
+   */
+  protected function replace($ord) {
+    // Bail out early if this character is in the overrides table.
+    if (isset(self::$languageOverrides[$this->langcode][$ord])) {
+      return self::$languageOverrides[$this->langcode][$ord];
+    }
+
+    static $map = array();
+
+    $bank = $ord >> 8;
+
+    if (!isset($map[$bank][$this->langcode])) {
+      $file = $this->dataDirectory . '/' . sprintf('x%02x', $bank) . '.php';
+      if (file_exists($file)) {
+        include $file;
+        $map[$bank][$this->langcode] = $base;
+      }
+      else {
+        $map[$bank][$this->langcode] = array();
+      }
+    }
+
+    $ord = $ord & 255;
+
+    return isset($map[$bank][$this->langcode][$ord]) ? $map[$bank][$this->langcode][$ord] : $this->unknownCharacter;
+  }
+}
diff --git a/core/lib/Drupal/Component/Transliteration/Transliteration.php b/core/lib/Drupal/Component/Transliteration/Transliteration.php
new file mode 100644
index 0000000..5c8d2bd
--- /dev/null
+++ b/core/lib/Drupal/Component/Transliteration/Transliteration.php
@@ -0,0 +1,48 @@
+langcode = $langcode;
+    $this->unknownCharacter = $unknown_character;
+
+    $this->dataDirectory = dirname(__FILE__) . '/data';
+
+    // Initialize the language overrides in the base class so all available
+    // implementations can take them into consideration.
+    if (empty(self::$languageOverrides)) {
+      include $this->dataDirectory . '/language_overrides.php';
+      self::$languageOverrides = $overrides;
+    }
+  }
+
+  /**
+   * Implements Drupal\Component\Transliteration\TransliterationInterface::transliterate().
+   */
+  public function transliterate($string) {
+    // ASCII is always valid NFC! If we're only ever given plain ASCII, we can
+    // avoid the overhead of initializing the decomposition tables or
+    // Transliterator objects by skipping out early.
+    if (!preg_match('/[\x80-\xff]/', $string)) {
+      return $string;
+    }
+  }
+}
diff --git a/core/lib/Drupal/Component/Transliteration/TransliterationFactory.php b/core/lib/Drupal/Component/Transliteration/TransliterationFactory.php
new file mode 100644
index 0000000..c191a9f
--- /dev/null
+++ b/core/lib/Drupal/Component/Transliteration/TransliterationFactory.php
@@ -0,0 +1,40 @@
+= 2.0.0, use the native ICU implementation.
+      if (extension_loaded('intl')
+        && (version_compare(PHP_VERSION, '5.4.0', '>=') || version_compare(phpversion('intl'), '2.0.0', '>='))) {
+        $class = 'Drupal\Component\Transliteration\IcuTransliteration';
+      }
+    }
+
+    return new $class($langcode, $unknown_character);
+  }
+}
diff --git a/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php
new file mode 100644
index 0000000..cc64749
--- /dev/null
+++ b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php
@@ -0,0 +1,25 @@
+setFactoryClass('Drupal\Core\Database\Database')
       ->setFactoryMethod('getConnection')
       ->addArgument('slave');
+    $container->register('transliteration', 'Drupal\Component\Transliteration\TransliterationFactory');
     // @todo Replace below lines with the commented out block below it when it's
     // performant to do so: http://drupal.org/node/1706064.