From 1319f52f27d905b187db2d74749a3b89298cff6f Mon Sep 17 00:00:00 2001 From: Bob Vincent Date: Wed, 16 Mar 2011 05:34:09 -0400 Subject: [PATCH] Issue #221257 by AlexisWilke,NancyDru,greg.harvey,pillars.net: text_summary() should be made HTML-aware. --- modules/field/modules/text/text.module | 185 ++++++++++++++++++++++---------- modules/field/modules/text/text.test | 154 +++++++++++++------------- 2 files changed, 205 insertions(+), 134 deletions(-) diff --git a/modules/field/modules/text/text.module b/modules/field/modules/text/text.module index 89c605c..828cddd 100644 --- a/modules/field/modules/text/text.module +++ b/modules/field/modules/text/text.module @@ -327,6 +327,10 @@ function _text_sanitize($instance, $langcode, $item, $column) { * place such as the end of a paragraph, a line break, or the end of a * sentence (in that order of preference). * + * @note + * This function uses strlen(), strpos(), etc. rather than their multibyte + * equivalents where doing so increases speed without affecting output. + * * @param $text * The content for which a summary will be generated. * @param $format @@ -344,7 +348,7 @@ function _text_sanitize($instance, $langcode, $item, $column) { * @return * The generated summary. */ -function text_summary($text, $format = NULL, $size = NULL) { +function text_summary($text, $format = NULL, $size = NULL, &$summary_length = NULL) { if (!isset($size)) { // What used to be called 'teaser' is now called 'summary', but @@ -368,6 +372,7 @@ function text_summary($text, $format = NULL, $size = NULL) { // We check for the presence of the PHP evaluator filter in the current // format. If the body contains PHP code, we do not split it up to prevent // parse errors. + $filters = array(); if (isset($format)) { $filters = filter_list_format($format); if (isset($filters['php_code']) && $filters['php_code']->status && strpos($text, '' => 0); - - // If no complete paragraph then treat line breaks as paragraphs. - $line_breaks = array('
' => 6, '
' => 4); - // Newline only indicates a line break if line break converter - // filter is present. - if (isset($filters['filter_autop'])) { - $line_breaks["\n"] = 1; - } - $break_points[] = $line_breaks; - - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); - - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the summary. - foreach ($points as $point => $offset) { - // The summary is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); + $filter_newline = isset($filters['filter/1']); + $text_length = strlen($text); + + $position = 0; + $length = 0; + $stack = array(); + while ($position < $text_length && $length < $size) { + $last_tag = FALSE; + $offset = strpos($text, '<', $position); + if ($offset === FALSE) { + // There are no more tags, so find the UTF-8 string length. + $additional = drupal_strlen(substr($text, $position, $text_length - $position)); + $num_chars = $text_length; + } + else { + // Count UTF-8 characters between the previous position and the next tag. + $additional = drupal_strlen(substr($text, $position, $offset - $position)); + ++$offset; // Skip the '<' character. + $num_chars = strpos($text, '>', $offset); + if ($text[$offset] == '/') { + // Found a closing tag, so pop the opening tag too. + array_pop($stack); + } + elseif ($text[$num_chars - 1] != '/') { // Skip empty tags. + // Found an opening tag; save it on the stack. + $end_name = strpos($text, ' ', $offset); + if ($end_name === FALSE || $end_name > $num_chars) { + $end_name = $num_chars; + } + $tag_name = substr($text, $offset, $end_name - $offset); + switch ($tag_name) { // Ignore empty tags that were not properly closed. 
+ case 'br': + case 'hr': + case 'img': + case 'input': + break; + default: + $stack[] = $tag_name; + $last_tag = TRUE; + break; + } + } + // For now, we assume properly opening/closing tag boundaries. + if ($num_chars === FALSE) { + // Either the last tag was not closed or it wasn't a tag. + $num_chars = $text_length; + } + else { + ++$num_chars; // Skip the '>' character. } } - - // If a break point was found in this group, slice and stop searching. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos); - break; + // Are there any characters to add to the result? + if ($additional) { + if ($length + $additional >= $size) { + // The last tag did not make it in. + if ($last_tag) { + array_pop($stack); + } + // There are too many characters, so search for a break-point. + $offset = $position + $size - $length; + if (isset($text[$offset]) && $text[$offset] != ' ') { + while ($offset > $position) { + switch ($text[$offset - 1]) { + case "\xD8": + // Is this the Arabic equivalent of the ascii '?' character? + if (!isset($text[$offset]) || $text[$offset] != "\x9F") { + // No; this is not the right sequence. + break; + } + if ($offset + 1 == $text_length || $text[$offset + 1] == ' ') { + // Found a break-point. + break 2; + } + if ($text[$offset + 1] == '"') { + $offset += 2; + break 2; + } + break; + case '.': + case '!': + case '?': + if ($offset == $text_length || $text[$offset] == ' ') { + // Found a break-point. + break 2; + } + if ($text[$offset] == '"') { + ++$offset; + break 2; + } + break; + case "\n": + if (!$filter_newline) { + --$offset; + break 2; + } + case ' ': + // Found the (breaking) space; remove and break there. + --$offset; + break 2; + // @todo Add support for other UTF-8 spaces? + case "\xE3": + // Found the CJK ideographic full stop? 
+ if (isset($text[$offset + 1]) && $text[$offset] == "\x80" && $text[$offset + 1] == "\x82") { + // keep this character in full + $offset += 2; + break 2; + } + break; + } + --$offset; + } + } + $position = $offset; + break; + } + $length += $additional; } + $position = $num_chars; } - - // If the htmlcorrector filter is present, apply it to the generated summary. - if (isset($filters['filter_htmlcorrector'])) { - $summary = _filter_htmlcorrector($summary); + $summary = substr($text, 0, $position); + if (!empty($stack)) { + // If closing tags are missing, we add an ellipsis and closing tags. + $summary .= t('...'); + do { + $summary .= ''; + } while (!empty($stack)); } - return $summary; } diff --git a/modules/field/modules/text/text.test b/modules/field/modules/text/text.test index b42fed7..98c0bc1 100644 --- a/modules/field/modules/text/text.test +++ b/modules/field/modules/text/text.test @@ -287,87 +287,87 @@ class TextSummaryTestCase extends DrupalWebTestCase { // The summaries we expect text_summary() to return when $size is the index // of each array item. // Using no text format: - $expected = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "<", - "", - "

\n", - "

\nH", - "

\nHi", - "

\nHi\n", - "

\nHi\n<", - "

\nHi\n\nHi\n\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", + $expected = array ( + 0 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 1 => "

", + 2 => "

", + 3 => "

", + 4 => "

\nHi", + 5 => "

\nHi\n

", + 6 => "

\nHi\n

\n

...

", + 7 => "

\nHi\n

\n

...

", + 8 => "

\nHi\n

\n

...

", + 9 => "

\nHi\n

\n

...

", + 10 => "

\nHi\n

\n

...

", + 11 => "

\nHi\n

\n

...

", + 12 => "

\nHi\n

\n

\nfolks...

", + 13 => "

\nHi\n

\n

\nfolks\n
", + 14 => "

\nHi\n

\n

\nfolks\n
", + 15 => "

\nHi\n

\n

\nfolks\n
\n!", + 16 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 17 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 18 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 19 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 20 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 21 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 22 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 23 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 24 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 25 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 26 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 27 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 28 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 29 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 30 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 31 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 32 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 33 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 34 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 35 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 36 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 37 => "

\nHi\n

\n

\nfolks\n
\n!\n

", ); // And using a text format WITH the line-break and htmlcorrector filters. $expected_lb = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "", - "

", - "

", - "

", - "

", - "

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", + 0 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 1 => "

", + 2 => "

", + 3 => "

", + 4 => "

\nHi", + 5 => "

\nHi\n

", + 6 => "

\nHi\n

\n

...

", + 7 => "

\nHi\n

\n

...

", + 8 => "

\nHi\n

\n

...

", + 9 => "

\nHi\n

\n

...

", + 10 => "

\nHi\n

\n

...

", + 11 => "

\nHi\n

\n

...

", + 12 => "

\nHi\n

\n

\nfolks...

", + 13 => "

\nHi\n

\n

\nfolks\n
", + 14 => "

\nHi\n

\n

\nfolks\n
", + 15 => "

\nHi\n

\n

\nfolks\n
\n!", + 16 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 17 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 18 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 19 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 20 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 21 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 22 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 23 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 24 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 25 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 26 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 27 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 28 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 29 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 30 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 31 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 32 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 33 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 34 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 35 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 36 => "

\nHi\n

\n

\nfolks\n
\n!\n

", + 37 => "

\nHi\n

\n

\nfolks\n
\n!\n

", ); // Test text_summary() for different sizes. -- 1.7.1