From 2cc49ab9d44c2077adef59b5dd79c585476f8ca2 Mon Sep 17 00:00:00 2001 From: Jerenus Date: Sun, 24 Feb 2013 23:47:29 +0800 Subject: [PATCH] Issue #1428272 by Jerenus: Added support of encoding conversions to the CSV Parser. --- libraries/ParserCSV.inc | 55 ++++++++++++++++++++++++++++++++++++++++++++-- plugins/FeedsCSVParser.inc | 42 ++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/libraries/ParserCSV.inc b/libraries/ParserCSV.inc index 4ddc77a..b2b1095 100644 --- a/libraries/ParserCSV.inc +++ b/libraries/ParserCSV.inc @@ -76,6 +76,8 @@ class ParserCSV { public function __construct() { $this->delimiter = ','; + $this->from_encoding = $this->to_encoding = $this->encoding = 'UTF-8'; + $this->check_encoding = FALSE; $this->skipFirstLine = FALSE; $this->columnNames = FALSE; $this->timeout = FALSE; @@ -95,6 +97,22 @@ class ParserCSV { } /** + * Set the source file encoding. + * By default, UTF-8. + */ + public function setEncoding($encoding) { + $this->from_encoding = $encoding; + } + + /** + * Set the option to check source file encoding. + * By default, FALSE. + */ + public function setEncodingCheck($check_encoding) { + $this->check_encoding = $check_encoding; + } + + /** * Set this to TRUE if the parser should skip the first line of the CSV text, * which might be desired if the first line contains the column names. * By default, this is set to FALSE and the first line is not skipped. @@ -197,7 +215,7 @@ class ParserCSV { for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) { // Make really sure we've got lines without trailing newlines. - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); // Skip empty lines. if (empty($line)) { @@ -237,7 +255,7 @@ class ParserCSV { } // Ok, so, on with fetching the next line, as mentioned above. 
$currentField .= "\n"; - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); $currentIndex = 0; continue; } @@ -325,4 +343,37 @@ class ParserCSV { } return $rows; } + + /** + * Checks and converts encoding of input data + * + * @param $data + * A chunk of data + * @return + * Data in correct encoding or throws exceptions if + * encoding doesn't match or mbstring is not found. + */ + private function fixEncoding($data) { + + if (extension_loaded('mbstring')) { + // Check encoding if needed + if ($this->check_encoding) { + if (!mb_check_encoding($data, $this->from_encoding)) { + throw new Exception(t('Source file is not in @encoding encoding.', array('@encoding' => $this->from_encoding))); + } + } + + $encode_array = array('ASCII', 'UTF-8', 'GBK', 'GB2312', 'BIG5'); + $this->encoding = mb_detect_encoding($data, $encode_array); + + // Convert encoding if needed + if ($this->encoding != $this->to_encoding) { + $data = mb_convert_encoding($data, $this->to_encoding, $this->encoding); + } + } + else { + throw new Exception(t('For encoding conversion mbstring PHP extension must be available.')); + } + + return $data; + } } diff --git a/plugins/FeedsCSVParser.inc b/plugins/FeedsCSVParser.inc index 7044440..977c090 100644 --- a/plugins/FeedsCSVParser.inc +++ b/plugins/FeedsCSVParser.inc @@ -22,6 +22,8 @@ class FeedsCSVParser extends FeedsParser { $parser = new ParserCSV(); $delimiter = $source_config['delimiter'] == 'TAB' ? 
"\t" : $source_config['delimiter']; $parser->setDelimiter($delimiter); + $parser->setEncoding($source_config['encoding']['encoding']); + $parser->setEncodingCheck($source_config['encoding']['check_encoding']); $iterator = new ParserCSVIterator($fetcher_result->getFilePath()); if (empty($source_config['no_headers'])) { @@ -106,6 +108,8 @@ class FeedsCSVParser extends FeedsParser { public function sourceDefaults() { return array( 'delimiter' => $this->config['delimiter'], + 'encoding' => $this->config['encoding'], + 'check_encoding' => $this->config['check_encoding'], 'no_headers' => $this->config['no_headers'], ); } @@ -164,7 +168,7 @@ class FeedsCSVParser extends FeedsParser { '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => isset($source_config['no_headers']) ? $source_config['no_headers'] : 0, ); - return $form; + return $form + $this->configEncodingForm(TRUE); } /** @@ -173,6 +177,8 @@ class FeedsCSVParser extends FeedsParser { public function configDefaults() { return array( 'delimiter' => ',', + 'encoding' => 'UTF-8', + 'check_encoding' => FALSE, 'no_headers' => 0, ); } @@ -201,6 +207,40 @@ class FeedsCSVParser extends FeedsParser { '#description' => t('Check if the imported CSV file does not start with a header row. 
If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => $this->config['no_headers'], ); + return $form + $this->configEncodingForm(); + } + + public function configEncodingForm($sourceForm = FALSE) { + $form = array(); + $defaults = $this->configDefaults(); + if (extension_loaded('mbstring')) { + $form['encoding'] = array( + '#type' => 'fieldset', + '#title' => 'Encoding conversion', + '#collapsible' => TRUE, + '#collapsed' => $sourceForm || ($this->config['encoding'] == $defaults['encoding'] && $this->config['check_encoding'] == $defaults['check_encoding']), + ); + $options = mb_list_encodings(); + $options = array_combine($options, $options); + $form['encoding']['encoding'] = array( + '#type' => 'select', + '#title' => t('Source file encoding'), + '#description' => t('Performs encoding conversion of a source files to UTF-8. Defaults to UTF-8 — no encoding conversion will happen.'), + '#options' => $options, + '#default_value' => $this->config['encoding'], + ); + $form['encoding']['check_encoding'] = array( + '#type' => 'checkbox', + '#title' => t('Check encoding'), + '#description' => t('Checks encoding of a source file and breaks import process if encoding differs.'), + '#default_value' => $this->config['check_encoding'], + ); + } + else { + $form['encoding']['encoding'] = array( + '#markup' => '

' . t('Encoding conversion is disabled due to the lack of mbstring PHP extension.') . '

', + ); + } return $form; } -- 1.8.0.1