diff --git a/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php new file mode 100644 index 0000000..7503d52 --- /dev/null +++ b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php @@ -0,0 +1,331 @@ +, + * http://www.mediawiki.org/ + */ + +namespace Drupal\Component\Transliteration; + +/** + * Implements transliteration without using the PECL extensions. + * + * Transliterations are done character-by-character, by looking up non-US-ASCII + * characters in a transliteration database. The database comes from two types + * of files, both of which are searched for in the + * PHPTransliteration::$dataDirectory directory. First, language-specific + * overrides are searched (see PHPTransliteration::readLanguageOverrides() for + * details of these files). If there is no language-specific override for a + * character, the generic transliteration character tables are searched (see + * PHPTransliteration::readGenericData() for details of these files). If + * looking up the character in the generic table results in a NULL value, or an + * illegal character is encountered, then a substitute character is returned. + * + * This class is the registered transliteration class returned from + * drupal_container()->get('transliteration') by default. + * + * @ingroup transliteration + */ +class PHPTransliteration implements TransliterationInterface { + + /** + * Directory where data for transliteration resides. + * + * The constructor sets this (by default) to subdirectory 'data' underneath + * the directory where the class's PHP file resides. + * + * @var string + */ + protected $dataDirectory; + + /** + * Associative array of language-specific character transliteration tables. + * + * The outermost array keys are language codes. For each language code key, + * the value is an array whose keys are Unicode character codes, and whose + * values are the transliterations of those characters to US-ASCII.
This is + * set up as needed in PHPTransliteration::replace() by calling + * PHPTransliteration::readLanguageOverrides(). + * + * @var array + */ + protected $languageOverrides = array(); + + /** + * Non-language-specific transliteration tables. + * + * Array whose keys are the Unicode character code shifted right by eight + * bits (the "bank"), and whose values are arrays of transliterations keyed + * by the lowest byte of the character code. This is set up as needed in + * PHPTransliteration::replace() by calling + * PHPTransliteration::readGenericData(). + * + * @var array + */ + protected $genericMap = array(); + + /** + * Table of tail byte counts for UTF-8 head bytes. + * + * A multi-byte UTF-8 character consists of a head byte followed by some + * number of tail bytes. This table, which is set up once by the constructor + * and shared by all instances, maps each single-byte string to the number of + * tail bytes that must follow it (0 for bytes that do not start a multi-byte + * sequence). + * + * @var array + */ + static protected $tailBytes = NULL; + + /** + * Returns this PHPTransliteration object (for the Drupal Container). + */ + public function get() { + return $this; + } + + /** + * Constructs a transliteration object. + * + * @param string $data_directory + * (optional) The directory where data files reside. If omitted, defaults + * to subdirectory 'data' underneath the directory where the class's PHP + * file resides. + */ + public function __construct($data_directory = NULL) { + // Set up data directory and tail bytes table. + $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data'; + self::setupTailBytesTable(); + } + + /** + * Implements TransliterationInterface::transliterate(). + */ + public function transliterate($string, $langcode = 'en', $unknown_character = '?') { + // Chop the text into pure-US-ASCII and non-US-ASCII areas; large US-ASCII + // parts can be handled much more quickly. Don't chop up Unicode areas for + // punctuation, though, as that wastes energy.
+ preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches); + + $result = ''; + foreach ($matches[0] as $str) { + if ($str[0] < "\x80") { + // ASCII chunk: guaranteed to be valid UTF-8 and in normal form C, so + // copy it to the output unchanged. + $result .= $str; + continue; + } + + // Non-ASCII chunk: examine byte by byte to ensure that it consists + // of valid UTF-8 sequences, and transliterate each character. This loop + // is optimized so it is a bit ugly. + $head = ''; + $chunk = strlen($str); + // Counting down is faster. + $len = $chunk + 1; + + for ($i = -1; --$len; ) { + // Look up how many tail bytes must follow this byte; the count is + // non-zero only when the byte is the head of a multi-byte sequence. + $c = $str[++$i]; + if ($remaining = self::$tailBytes[$c]) { + // This is a multi-byte character. + $sequence = $head = $c; + do { + // Look for the defined number of tail bytes. + if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") { + // Add the (legal) bytes to the character. + $sequence .= $c; + } + else { + if ($len == 0) { + // Premature end of string! Drop a replacement character into + // output to represent the invalid UTF-8 sequence. + $result .= $unknown_character; + break 2; + } + else { + // Illegal tail byte; abandon the sequence. + $result .= $unknown_character; + // Back up and reprocess this byte; it may itself be a legal + // ASCII or UTF-8 sequence head. + --$i; + ++$len; + continue 2; + } + } + } while (--$remaining); + + // Decode the completed UTF-8 sequence into its Unicode character code.
+ // The head byte tells how long the sequence is. Remove the marker bits + // from the head byte and from each tail byte (a tail byte carries six + // bits of payload), and combine the payloads into one character code. + $n = ord($head); + if ($n <= 0xdf) { + $ord = ($n - 192) * 64 + (ord($sequence[1]) - 128); + } + elseif ($n <= 0xef) { + $ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128); + } + elseif ($n <= 0xf7) { + $ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128); + } + elseif ($n <= 0xfb) { + $ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128); + } + elseif ($n <= 0xfd) { + $ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128); + } + $result .= $this->replace($ord, $langcode, $unknown_character); + $head = ''; + } + elseif ($c < "\x80") { + // ASCII byte: copy it to the output unchanged. + $result .= $c; + $head = ''; + } + elseif ($c < "\xc0") { + // Tail byte (0x80-0xbf) that is not part of a sequence: substitute it, + // unless a head byte is still pending. + if ($head == '') { + $result .= $unknown_character; + } + } + else { + // Bytes 0xfe and 0xff, which the tail bytes table does not treat as + // head bytes: substitute them. + $result .= $unknown_character; + $head = ''; + } + } + } + + return $result; + } + + /** + * Replaces a single Unicode character using the transliteration database. + * + * @param int $ord + * An ordinal Unicode character code. + * @param string $langcode + * The language code of the language the character is in. + * @param string $unknown_character + * The character to return if there is no transliterated equivalent. + * + * @return string + * US-ASCII replacement character. If $ord has a mapping, it is returned; + * otherwise, $unknown_character is returned. + */ + protected function replace($ord, $langcode, $unknown_character) { + // See if there is a language-specific override for this character.
+ if (!isset($this->languageOverrides[$langcode])) { + $this->readLanguageOverrides($langcode); + } + if (isset($this->languageOverrides[$langcode][$ord])) { + return $this->languageOverrides[$langcode][$ord]; + } + + // See if there is a generic mapping for this character, and return it or + // $unknown_character if not found. The generic tables are stored in banks + // of 256 characters: the bank number is the character code without its + // lowest byte, and the lowest byte is the index within the bank. + $bank = $ord >> 8; + if (!isset($this->genericMap[$bank])) { + $this->readGenericData($bank); + } + $ord = $ord & 255; + return isset($this->genericMap[$bank][$ord]) ? $this->genericMap[$bank][$ord] : $unknown_character; + } + + /** + * Reads in language overrides for a language code. + * + * The data is read from files named "$langcode.php" in + * PHPTransliteration::$dataDirectory, after every character other than an + * ASCII letter or a hyphen has been removed from $langcode. These files + * should set up an array variable $overrides with an element whose key is + * $langcode and whose value is an array whose keys are character codes, and + * whose values are their transliterations in this language. The resulting + * $overrides array is altered by invoking + * hook_transliteration_overrides_alter() to let modules add additional + * overrides. If no array is available for $langcode afterwards, an empty + * table is stored for the language. + * + * @param $langcode + * Code for the language to read. + */ + protected function readLanguageOverrides($langcode) { + // Figure out the file name to use by sanitizing the language code, so that + // it cannot point outside the data directory. The pattern needs real + // delimiters: written as '[^a-zA-Z\-]', PCRE takes the brackets themselves + // as the delimiters, the character class is lost, and the language code + // is not sanitized at all. + $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php'; + + // Read in this file, which should set up a variable called $overrides, + // which will be local to this function. + if (is_file($file)) { + include($file); + } + if (!isset($overrides) || !is_array($overrides)) { + $overrides = array($langcode => array()); + } + + // Let modules alter the list, and save it. Fall back to an empty table if + // the data file or a hook implementation left no array for this language, + // so that the result is still cached and the file is not read again for + // every character. + drupal_alter('transliteration_overrides', $overrides, $langcode); + $this->languageOverrides[$langcode] = (isset($overrides[$langcode]) && is_array($overrides[$langcode])) ? $overrides[$langcode] : array(); + } + + /** + * Reads in generic transliteration data for a bank of characters.
+ * + * The data is read in from a file named "x$bank.php" (with $bank in + * hexadecimal notation, zero-padded to at least two digits) in + * PHPTransliteration::$dataDirectory. These files should set up a variable + * $base containing an array whose numerical indices are the lowest byte of + * the character code, and whose values are the transliterations of these + * characters into US-ASCII. If the file is missing or does not set up an + * array, an empty table is stored for the bank. + * + * @param $bank + * The Unicode character code shifted right by eight bits (that is, the + * code without its lowest byte); 0 covers character codes 0x00 to 0xff. + */ + protected function readGenericData($bank) { + // Figure out the file name. + $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php'; + + // Read in this file, which should set up a variable called $base, which + // will be local to this function. + if (is_file($file)) { + include($file); + } + if (!isset($base) || !is_array($base)) { + $base = array(); + } + + // Save this data. + $this->genericMap[$bank] = $base; + } + + /** + * Sets up the table of tail byte counts, keyed by single-byte string. + * + * The table is static, so it is built only once no matter how many + * transliteration objects are constructed. + */ + static protected function setupTailBytesTable() { + if (isset(self::$tailBytes)) { + return; + } + + self::$tailBytes = array(); + + for ($n = 0; $n < 256; $n++) { + // Bytes below 0xc0 are ASCII or tail bytes, and start no sequence. + if ($n < 0xc0) { + $remaining = 0; + } + elseif ($n < 0xe0) { + $remaining = 1; + } + elseif ($n < 0xf0) { + $remaining = 2; + } + elseif ($n < 0xf8) { + $remaining = 3; + } + elseif ($n < 0xfc) { + $remaining = 4; + } + elseif ($n < 0xfe) { + $remaining = 5; + } + // 0xfe and 0xff are not treated as head bytes. + else { + $remaining = 0; + } + self::$tailBytes[chr($n)] = $remaining; + } + } +} diff --git a/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php new file mode 100644 index 0000000..b88a55d --- /dev/null +++ b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php @@ -0,0 +1,34 @@ +addCompilerPass(new RegisterNestedMatchersPass()); // Add a compiler pass for registering event subscribers.
$container->addCompilerPass(new RegisterKernelListenersPass(), PassConfig::TYPE_AFTER_REMOVING); + + $container->register('transliteration', 'Drupal\Component\Transliteration\PHPTransliteration'); } } diff --git a/core/modules/system/language.api.php b/core/modules/system/language.api.php index 16d0794..767d932 100644 --- a/core/modules/system/language.api.php +++ b/core/modules/system/language.api.php @@ -184,3 +184,70 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) { /** * @} End of "addtogroup hooks". */ + +/** + * @defgroup transliteration Transliteration + * @{ + * Transliterate from Unicode to US-ASCII + * + * Transliteration is the process of translating individual non-US-ASCII + * characters into ASCII characters, which specifically does not transform + * non-printable and punctuation characters in any way. This process will always + * be both inexact and language-dependent. For instance, the character Ö (O with + * an umlaut) is commonly transliterated as O, but in German text, the + * convention would be to transliterate it as Oe or OE, depending on the context + * (beginning of a capitalized word, or in an all-capital letter context). + * + * The Drupal default transliteration process transliterates text character by + * character using a database of generic character transliterations and + * language-specific overrides. Character context (such as all-capitals + * vs. initial capital letter only) is not taken into account, and in + * transliterations of capital letters that result in two or more letters, by + * convention only the first is capitalized in the Drupal transliteration + * result. So, the process has limitations; however, since the reason for + * transliteration is typically to create machine names or file names, this + * should not really be a problem. 
After transliteration, other transformation + * or validation may be necessary, such as converting spaces to another + * character, removing non-printable characters, lower-casing, etc. + * + * Here is a code snippet to transliterate some text: + * @code + * // Use the current default interface language. + * $langcode = language(LANGUAGE_TYPE_INTERFACE)->langcode; + * // Instantiate the transliteration class. + * $trans = drupal_container()->get('transliteration'); + * // Use this to transliterate some text. + * $transformed = $trans->transliterate($string, $langcode); + * @endcode + * + * Drupal Core provides the generic transliteration character tables and + * overrides for a few common languages; modules can implement + * hook_transliteration_overrides_alter() to provide further language-specific + * overrides. Modules can also completely override the transliteration classes + * in \Drupal\Core\CoreBundle. + */ + +/** + * Provide language overrides for transliteration. + * + * @param array $overrides + * Associative array of language overrides. The outermost key is the language + * code, and the corresponding value is an array whose keys are integer + * Unicode character codes, and whose values are the transliterations of those + * characters in the given language, to override default transliterations. + * @param string $langcode + * The code for the language that is being transliterated. + * + * @ingroup hooks + */ +function hook_transliteration_overrides_alter(&$overrides, $langcode) { + // Provide special overrides for German for a custom site. + if ($langcode == 'de') { + // The core-provided transliteration of Ä is Ae, but we want just A. + $overrides['de'][0xC4] = 'A'; + } +} + +/** + * @} End of "defgroup transliteration". 
+ */ diff --git a/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php new file mode 100644 index 0000000..2086df8 --- /dev/null +++ b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php @@ -0,0 +1,76 @@ + 'Transliteration functionality', + 'description' => 'Tests the transliteration component', + 'group' => 'Transliteration', + ); + } + + /** + * Tests the PHPTransliteration class. + */ + public function testPHPTransliteration() { + $random = $this->randomName(10); + $cases = array( + // Each test case is (language code, input, output). + // Test ASCII in English. + array('en', $random, $random), + // Test ASCII in some other language with no overrides. + array('fr', $random, $random), + // Test a language with no overrides. + array('en', 'Ä Ö Ü Å Ø äöüåøhello', 'A O U A O aouaohello'), + // Test language overrides provided by core. + array('de', 'Ä Ö Ü Å Ø äöüåøhello', 'Ae Oe Ue A O aeoeueaohello'), + array('de', $random, $random), + array('dk', 'Ä Ö Ü Å Ø äöüåøhello', 'A O U Aa Oe aouaaoehello'), + array('dk', $random, $random), + // Test the language override hook in the test module, which changes + // the transliteration of Ä to Z. + array('zz', 'Ä Ö Ü Å Ø äöüåøhello', 'Z O U A O aouaohello'), + array('zz', $random, $random), + // Test strings in some other languages. + // Turkish, provided by drupal.org user Kartagis. + array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'), + ); + + // Test each case both with a new instance of the transliteration class, + // and with one that builds as it goes. 
+ $common_transliterator = new PHPTransliteration(); + + foreach($cases as $case) { + list($langcode, $before, $after) = $case; + $transliterator = new PHPTransliteration(); + $actual = $transliterator->transliterate($before, $langcode); + $this->assertEqual($after, $actual, format_string('@before is correctly transliterated to @after in new class (@actual) in language @langcode', array('@before' => $before, '@langcode' => $langcode, '@after' => $after, '@actual' => $actual))); + + $actual = $common_transliterator->transliterate($before, $langcode); + $this->assertEqual($after, $actual, format_string('@before is correctly transliterated to @after in previously-used class (@actual) in language @langcode', array('@before' => $before, '@langcode' => $langcode, '@after' => $after, '@actual' => $actual))); + } + } +} diff --git a/core/modules/system/tests/modules/transliterate_test/transliterate_test.info b/core/modules/system/tests/modules/transliterate_test/transliterate_test.info new file mode 100644 index 0000000..8325b2f --- /dev/null +++ b/core/modules/system/tests/modules/transliterate_test/transliterate_test.info @@ -0,0 +1,6 @@ +name = "Transliteration test module" +description = "Tests the transliteration hook" +package = Testing +version = VERSION +core = 8.x +hidden = TRUE diff --git a/core/modules/system/tests/modules/transliterate_test/transliterate_test.module b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module new file mode 100644 index 0000000..59e1149 --- /dev/null +++ b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module @@ -0,0 +1,23 @@ +