diff --git a/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php new file mode 100644 index 0000000..7162325 --- /dev/null +++ b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php @@ -0,0 +1,241 @@ +, + * http://www.mediawiki.org/ + */ + +namespace Drupal\Component\Transliteration; + +/** + * Implements transliteration without using the PECL extensions. + * + * Transliterations are done character-by-character, by looking up non-US-ASCII + * characters in a transliteration database. The database comes from two types + * of files, both of which are searched for in the + * PHPTransliteration::$dataDirectory directory. First, language-specific + * overrides are searched (see PHPTransliteration::readLanguageOverrides() for + * details of these files). If there is no language-specific override for a + * character, the generic transliteration character tables are searched (see + * PHPTransliteration::readGenericData() for details of these files). If looking + * up the character in the generic table results in a NULL value, or an illegal + * character is encountered, then a substitute character is returned. + * + * This class is the registered transliteration class returned from + * drupal_container()->get('transliteration') by default. + * + * @ingroup transliteration + */ +class PHPTransliteration implements TransliterationInterface { + + /** + * Directory where data for transliteration resides. + * + * The constructor sets this (by default) to subdirectory 'data' underneath + * the directory where the class's PHP file resides. + * + * @var string + */ + protected $dataDirectory; + + /** + * Associative array of language-specific character transliteration tables. + * + * The outermost array keys are language codes. For each language code key, + * the value is an array whose keys are Unicode character codes, and whose + * values are the transliterations of those characters to US-ASCII. 
This is + * set up as needed in PHPTransliteration::replace() by calling + * PHPTransliteration::readLanguageOverrides(). + * + * @var array + */ + protected $languageOverrides = array(); + + /** + * Non-language-specific transliteration tables. + * + * Array whose keys are bank numbers (the Unicode character code shifted + * right by eight bits), and whose values are arrays of transliterations + * keyed by the low byte of the character code. This is set up as needed in + * PHPTransliteration::replace() by calling + * PHPTransliteration::readGenericData(). + * + * @var array + */ + protected $genericMap = array(); + + /** + * Returns this PHPTransliteration object (for the Drupal Container). + * + * @return PHPTransliteration + * The object itself. + */ + public function get() { + return $this; + } + + /** + * Constructs a transliteration object. + * + * @param string $data_directory + * (optional) The directory where data files reside. If omitted, defaults + * to subdirectory 'data' underneath the directory where the class's PHP + * file resides. + */ + public function __construct($data_directory = NULL) { + // Use the given data directory, or default to the 'data' subdirectory + // next to this file. + $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data'; + } + + /** + * Implements TransliterationInterface::transliterate(). + */ + public function transliterate($string, $langcode = 'en', $unknown_character = '?') { + $result = ''; + // Split into Unicode characters and transliterate each one. + foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) { + $code = self::ordUTF8($character); + // ordUTF8() reports an unrecognized lead byte as -1. + if ($code == -1) { + $result .= $unknown_character; + } + else { + $result .= $this->replace($code, $langcode, $unknown_character); + } + } + + return $result; + } + + /** + * Finds the character code for a UTF-8 character: like ord() but for UTF-8. + * + * @param string $character + * A single UTF-8 character. + * + * @return int + * The character code, or -1 if an illegal character is found. 
+ */ + protected static function ordUTF8($character) { + // Only the lead byte is inspected to pick the form; continuation bytes are + // masked but not validated. + $first_byte = ord($character[0]); + + if (($first_byte & 0x80) == 0) { + // Single-byte form: 0xxxxxxx. + return $first_byte; + } + if (($first_byte & 0xe0) == 0xc0) { + // Two-byte form: 110xxxxx 10xxxxxx. + return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f); + } + if (($first_byte & 0xf0) == 0xe0) { + // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. + return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f); + } + if (($first_byte & 0xf8) == 0xf0) { + // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. + return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f); + } + + // Other forms are not legal. + return -1; + } + + /** + * Replaces a single Unicode character using the transliteration database. + * + * @param int $code + * The character code of a Unicode character. + * @param string $langcode + * The language code of the language the character is in. + * @param string $unknown_character + * The character to substitute for characters without transliterated + * equivalents. + * + * @return string + * US-ASCII replacement for the character. If it has a mapping, it is + * returned; otherwise, $unknown_character is returned. + */ + protected function replace($code, $langcode, $unknown_character) { + if ($code < 0x80) { + // Already lower ASCII. + return chr($code); + } + + // See if there is a language-specific override for this character. The + // overrides for a language are read the first time that language is used. + if (!isset($this->languageOverrides[$langcode])) { + $this->readLanguageOverrides($langcode); + } + if (isset($this->languageOverrides[$langcode][$code])) { + return $this->languageOverrides[$langcode][$code]; + } + + // See if there is a generic mapping for this character. 
+ $bank = $code >> 8; + if (!isset($this->genericMap[$bank])) { + $this->readGenericData($bank); + } + // Keep only the low byte, which is the index within the bank. + $code = $code & 0xff; + return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character; + } + + /** + * Reads in language overrides for a language code. + * + * The data is read from files named "$langcode.php" (with $langcode reduced + * to letters and hyphens) in PHPTransliteration::$dataDirectory. These files + * should set up an array variable $overrides with an element whose key is + * $langcode and whose value is an array whose keys are character codes, and + * whose values are their transliterations in this language. The resulting + * $overrides array is altered by invoking + * hook_transliteration_overrides_alter() to let modules add additional + * overrides. + * + * @param string $langcode + * Code for the language to read. + */ + protected function readLanguageOverrides($langcode) { + // Figure out the file name to use by sanitizing the language code: strip + // everything except letters and hyphens so the code cannot point outside + // the data directory. The pattern needs explicit delimiters; without them + // PCRE treats the square brackets as the delimiters and nothing is + // stripped. + $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php'; + + // Read in this file, which should set up a variable called $overrides, + // which will be local to this function. + if (is_file($file)) { + include($file); + } + if (!isset($overrides) || !is_array($overrides)) { + $overrides = array($langcode => array()); + } + + // Let modules alter the list, and save it. Always store an array, even + // when the data file or an alter hook left no entry for this language, so + // that replace() does not re-read the file for every character. + drupal_alter('transliteration_overrides', $overrides, $langcode); + $this->languageOverrides[$langcode] = (isset($overrides[$langcode]) && is_array($overrides[$langcode])) ? $overrides[$langcode] : array(); + } + + /** + * Reads in generic transliteration data for a bank of characters. + * + * The data is read in from a file named "x$bank.php" (with $bank in + * hexadecimal notation) in PHPTransliteration::$dataDirectory. These files + * should set up a variable $base containing an array whose numerical indices + * are the low byte of the character code, and whose values are the + * transliterations of these characters into US-ASCII. 
+ * + * @param int $bank + * Bank number: the Unicode character code shifted right by eight bits, as + * computed in PHPTransliteration::replace(). Bank 0 holds codes 0x00 + * through 0xFF. + */ + protected function readGenericData($bank) { + // Figure out the file name: 'x' followed by the bank in lowercase + // hexadecimal, padded to at least two digits. + $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php'; + + // Read in this file, which should set up a variable called $base, which + // will be local to this function. + if (is_file($file)) { + include($file); + } + if (!isset($base) || !is_array($base)) { + $base = array(); + } + + // Save this data. An empty table is stored when no usable file was found, + // so the bank still counts as loaded. + $this->genericMap[$bank] = $base; + } +} diff --git a/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php new file mode 100644 index 0000000..b88a55d --- /dev/null +++ b/core/lib/Drupal/Component/Transliteration/TransliterationInterface.php @@ -0,0 +1,34 @@ +setFactoryClass('Drupal\Core\ExceptionController') ->setFactoryMethod('getExceptionListener'); + $container->register('transliteration', 'Drupal\Component\Transliteration\PHPTransliteration'); + // Add Serializer with arguments to be replaced in the compiler pass. $container->register('serializer', 'Symfony\Component\Serializer\Serializer') ->addArgument(array()) diff --git a/core/modules/system/language.api.php b/core/modules/system/language.api.php index 16d0794..767d932 100644 --- a/core/modules/system/language.api.php +++ b/core/modules/system/language.api.php @@ -184,3 +184,70 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) { /** * @} End of "addtogroup hooks". */ + +/** + * @defgroup transliteration Transliteration + * @{ + * Transliterate from Unicode to US-ASCII. + * + * Transliteration is the process of translating individual non-US-ASCII + * characters into ASCII characters, which specifically does not transform + * non-printable and punctuation characters in any way. This process will always + * be both inexact and language-dependent. 
For instance, the character Ö (O with + * an umlaut) is commonly transliterated as O, but in German text, the + * convention would be to transliterate it as Oe or OE, depending on the context + * (beginning of a capitalized word, or in an all-capital letter context). + * + * The Drupal default transliteration process transliterates text character by + * character using a database of generic character transliterations and + * language-specific overrides. Character context (such as all-capitals + * vs. initial capital letter only) is not taken into account, and in + * transliterations of capital letters that result in two or more letters, by + * convention only the first is capitalized in the Drupal transliteration + * result. So, the process has limitations; however, since the reason for + * transliteration is typically to create machine names or file names, this + * should not really be a problem. After transliteration, other transformation + * or validation may be necessary, such as converting spaces to another + * character, removing non-printable characters, lower-casing, etc. + * + * Here is a code snippet to transliterate some text: + * @code + * // Use the current default interface language. + * $langcode = language(LANGUAGE_TYPE_INTERFACE)->langcode; + * // Instantiate the transliteration class. + * $trans = drupal_container()->get('transliteration'); + * // Use this to transliterate some text. + * $transformed = $trans->transliterate($string, $langcode); + * @endcode + * + * Drupal Core provides the generic transliteration character tables and + * overrides for a few common languages; modules can implement + * hook_transliteration_overrides_alter() to provide further language-specific + * overrides. Modules can also completely override the transliteration classes + * in \Drupal\Core\CoreBundle. + */ + +/** + * Provide language overrides for transliteration. + * + * @param array $overrides + * Associative array of language overrides. 
The outermost key is the language + * code, and the corresponding value is an array whose keys are integer + * Unicode character codes, and whose values are the transliterations of those + * characters in the given language, to override default transliterations. + * @param string $langcode + * The code for the language that is being transliterated. + * + * @ingroup hooks + */ +function hook_transliteration_overrides_alter(&$overrides, $langcode) { + // Provide special overrides for German for a custom site. + if ($langcode == 'de') { + // The core-provided transliteration of Ä is Ae, but we want just A. + $overrides['de'][0xC4] = 'A'; + } +} + +/** + * @} End of "defgroup transliteration". + */ diff --git a/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php new file mode 100644 index 0000000..dcc3709 --- /dev/null +++ b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php @@ -0,0 +1,91 @@ + 'Transliteration functionality', + 'description' => 'Tests the transliteration component', + 'group' => 'Transliteration', + ); + } + + /** + * Tests the PHPTransliteration class. + */ + public function testPHPTransliteration() { + $random = $this->randomName(10); + // Make some strings with two, three, and four-byte characters for testing. + // Note that the 3-byte character is overridden by the 'kg' language. + // NOTE(review): in UTF-8, U+0446 below encodes to two bytes and U+1411 to + // three, so the variable names appear to overstate the lengths and no + // four-byte sequence seems to be covered; confirm the intended coverage. + $two_byte = 'Ä Ö Ü Å Ø äöüåøhello'; + // This is a Cyrillic character that looks something like a u. See + // http://www.unicode.org/charts/PDF/U0400.pdf + $three_byte = html_entity_decode('ц', ENT_NOQUOTES, 'UTF-8'); + // This is a Canadian Aboriginal character like a triangle. See + // http://www.unicode.org/charts/PDF/U1400.pdf + $four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8'); + $cases = array( + // Each test case is (language code, input, output). + // Test ASCII in English. 
+ array('en', $random, $random), + // Test ASCII in some other language with no overrides. + array('fr', $random, $random), + // Test 3 and 4-byte characters in a language without overrides. + // Note: if the data tables change, these will need to change too! They + // are set up to test that data table loading works, so values come + // directly from the data files. + array('fr', $three_byte, 'c'), + array('fr', $four_byte, 'wii'), + // Test the accented Latin string in a language with no overrides, so only + // the generic tables apply. + array('en', $two_byte, 'A O U A O aouaohello'), + // Test language overrides provided by core. + // NOTE(review): 'dk' presumably matches the name of a core data file; the + // ISO 639-1 code for Danish is 'da' - confirm which one is intended. + array('de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'), + array('de', $random, $random), + array('dk', $two_byte, 'A O U Aa Oe aouaaoehello'), + array('dk', $random, $random), + array('kg', $three_byte, 'ts'), + // Test the language override hook in the test module, which changes + // the transliteration of Ä to Z. + array('zz', $two_byte, 'Z O U A O aouaohello'), + array('zz', $random, $random), + // Test strings in some other languages. + // Turkish, provided by drupal.org user Kartagis. + array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'), + ); + + // Test each case both with a new instance of the transliteration class, + // and with one that builds as it goes. 
+ $common_transliterator = drupal_container()->get('transliteration'); + + foreach($cases as $case) { + list($langcode, $before, $after) = $case; + // New instance: its tables start out empty for every case. + $transliterator = new PHPTransliteration(); + $actual = $transliterator->transliterate($before, $langcode); + $this->assertEqual($after, $actual, format_string('@before is correctly transliterated to @after in new class (@actual) in language @langcode', array('@before' => $before, '@langcode' => $langcode, '@after' => $after, '@actual' => $actual))); + + // Shared instance: the same object is used for every case. + $actual = $common_transliterator->transliterate($before, $langcode); + $this->assertEqual($after, $actual, format_string('@before is correctly transliterated to @after in previously-used class (@actual) in language @langcode', array('@before' => $before, '@langcode' => $langcode, '@after' => $after, '@actual' => $actual))); + } + } +} diff --git a/core/modules/system/tests/modules/transliterate_test/transliterate_test.info b/core/modules/system/tests/modules/transliterate_test/transliterate_test.info new file mode 100644 index 0000000..8325b2f --- /dev/null +++ b/core/modules/system/tests/modules/transliterate_test/transliterate_test.info @@ -0,0 +1,6 @@ +name = "Transliteration test module" +description = "Tests the transliteration hook" +package = Testing +version = VERSION +core = 8.x +hidden = TRUE diff --git a/core/modules/system/tests/modules/transliterate_test/transliterate_test.module b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module new file mode 100644 index 0000000..59e1149 --- /dev/null +++ b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module @@ -0,0 +1,23 @@ +