From ddff9636696f44b2031fbc8c2315a5fb5189300e Mon Sep 17 00:00:00 2001
From: Bob Vincent
tag, but if the original + // formatted version was plaintext, then the summary should be plaintext + // also. + if ($text === strip_tags($text)) { + $output = strip_tags($output); } - $break_points[] = $line_breaks; - - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); + return trim($output); +} - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the summary. - foreach ($points as $point => $offset) { - // The summary is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); +/** + * Helper function for text_summary. + * + * Recursively copies elements from $body to $summary, subtracting the length + * of the textContent portions from $size until $size reaches zero. + * + * @param $body + * The source DOMNode. + * @param $size + * The maximum number of textContent characters to copy. + * @param $summary + * The destination DOMNode. + * @param $doc + * The destination DOMDocument. Should be the same as the + * $summary->ownerDocument property. + * @param $parents + * An array of tag names of ancestor nodes. + * + * @return + * The number of additional characters left to copy. + */ +function _text_summarize($body, $size, $summary, $doc, $parents = array()) { + static $sentence_splitter; + static $word_splitter; + if (!isset($sentence_splitter)) { + // According to http://unicode.org/review/pr-23.html, these are the Unicode + // Sentence_Terminal characters. + $stops = + "\x21" . // 'Exclamation mark'. + "\x2E" . // 'Full stop'. + "\x3F" . // 'Question mark'. + "\xD6\x89" . // 'Armenian full stop'. + "\xD8\x9F" . // 'Arabic question mark'. + "\xDB\x94" . // 'Arabic full stop'. + "\xDC\x80" . // 'Syriac end of paragraph'. + "\xDC\x81" . 
// 'Syriac supralinear full stop'. + "\xDC\x82" . // 'Syriac sublinear full stop'. + "\xE0\xA5\xA4" . // 'Devanagari danda'. + "\xE1\x81\x8A" . // 'Myanmar sign little section'. + "\xE1\x81\x8B" . // 'Myanmar sign section'. + "\xE1\x8D\xA2" . // 'Ethiopic full stop'. + "\xE1\x8D\xA7" . // 'Ethiopic question mark'. + "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'. + "\xE1\x99\xAE" . // 'Canadian syllabics full stop'. + "\xE1\xA0\x83" . // 'Mongolian full stop'. + "\xE1\xA0\x89" . // 'Mongolian manchu full stop'. + "\xE2\x80\xBC" . // 'Double exclamation mark'. + "\xE2\x80\xBD" . // 'Interrobang'. + "\xE2\x81\x87" . // 'Double question mark'. + "\xE2\x81\x88" . // 'Question exclamation mark'. + "\xE2\x81\x89" . // 'Exclamation question mark'. + "\xE3\x80\x82" . // 'Ideographic full stop'. + "\xEF\xB9\x92" . // 'Small full stop'. + "\xEF\xB9\x97" . // 'Small exclamation mark'. + "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'. + "\xEF\xBC\x8E" . // 'Fullwidth full stop'. + "\xEF\xBC\x9F" . // 'Fullwidth question mark'. + "\xEF\xBD\xA1"; // 'Halfwidth ideographic full stop'. + // We split after Sentence_Terminal characters only if preceded by a Letter + // character and followed by a Separator character. + $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u'; + // If no suitable sentence break is found, we split before any Unicode + // separator character. + $word_splitter = '/(?=\p{Z})/u'; + } + if ($body->nodeType === XML_TEXT_NODE) { + $text_length = drupal_strlen($body->textContent); + if ($text_length <= $size) { + $size -= $text_length; + $summary->appendChild($doc->createTextNode($body->textContent)); + return $size; + } + // We avoid breaking text nodes within code blocks. + if (in_array('code', $parents)) { + // Return zero to avoid adding subsequent text nodes. 
+ return 0; + } + $sentences = preg_split($sentence_splitter, $body->textContent); + $text = ''; + foreach ($sentences as $sentence) { + $sentence_length = drupal_strlen($sentence); + // Only add the sentence if it fits within the length limit. + if ($sentence_length > $size) { + break; } + $text .= $sentence; + $size -= $sentence_length; } - - // If a break point was found in this group, slice and stop searching. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos); - break; + // If no suitable sentence break was found, try to break between words. + if ($text === '') { + $words = preg_split($word_splitter, $body->textContent); + foreach ($words as $word) { + $word_length = drupal_strlen($word); + // Only add the word if it fits within the length limit. + if ($word_length > $size) { + break; + } + $text .= $word; + $size -= $word_length; + } } + $summary->appendChild($doc->createTextNode($text)); + // Return zero to avoid adding subsequent text nodes. + return 0; } - - // If the htmlcorrector filter is present, apply it to the generated summary. 
- if (isset($filters['filter_htmlcorrector'])) { - $summary = _filter_htmlcorrector($summary); + if ($body->hasChildNodes()) { + $node = $summary->appendChild($doc->createElement($body->tagName)); + $parents[] = $body->tagName; + foreach ($body->childNodes as $child) { + if ($size > 0) { + $size = _text_summarize($child, $size, $node, $doc, $parents); + } + else { + break; + } + } } - - return $summary; + return $size; } /** diff --git a/modules/field/modules/text/text.test b/modules/field/modules/text/text.test index 59369370efcc355e87fe5855b2d5d79f1afe98a3..f2a6848a1de7d6164c33c5e482358b74989a94b5 100644 --- a/modules/field/modules/text/text.test +++ b/modules/field/modules/text/text.test @@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase { */ function testFirstSentenceQuestion() { $text = 'A question? A sentence. Another sentence.'; - $expected = 'A question? A sentence.'; + // The default format includes the auto-paragraph filter. + $expected = '
A question? A sentence.
'; $this->callTextSummary($text, $expected, NULL, 30); } @@ -270,11 +271,52 @@ class TextSummaryTestCase extends DrupalWebTestCase { 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110 - $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . + $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . - 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'; - // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. - $this->callTextSummary($text, $expected, NULL, 340); + 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
'; + // Test that sentence splitting works when we replace the full stops with + // any other Sentence_Terminal character. + $sentence_terminals = array( + '!', // Exclamation mark. + '.', // Full stop. + '?', // Question mark. + '?', // Armenian full stop. + '?', // Arabic question mark. + '?', // Arabic full stop. + '?', // Syriac end of paragraph. + '?', // Syriac supralinear full stop. + '?', // Syriac sublinear full stop. + '?', // Devanagari danda. + '?', // Myanmar sign little section. + '?', // Myanmar sign section. + '?', // Ethiopic full stop. + '?', // Ethiopic question mark. + '?', // Ethiopic paragraph separator. + '?', // Canadian syllabics full stop. + '?', // Mongolian full stop. + '?', // Mongolian manchu full stop. + '?', // Double exclamation mark. + '?', // Interrobang. + '?', // Double question mark. + '?', // Question exclamation mark. + '?', // Exclamation question mark. + '?', // Ideographic full stop. + '?', // Small full stop. + '?', // Small exclamation mark. + '?', // Fullwidth exclamation mark. + '?', // Fullwidth full stop. + '?', // Fullwidth question mark. + '?', // Halfwidth ideographic full stop. + ); + foreach ($sentence_terminals as $stop) { + // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. + $this->callTextSummary( + str_replace('.', $stop, $text), + str_replace('.', $stop, $expected), + NULL, + 340 + ); + } } /** @@ -286,104 +328,79 @@ class TextSummaryTestCase extends DrupalWebTestCase { // The summaries we expect text_summary() to return when $size is the index // of each array item. - // Using no text format: - $expected = array( - "\nHi\n
\n\nfolks\n
\n!\n
", - "
\n", - "
\nH", - "
\nHi", - "
\nHi\n", - "
\nHi\n<", - "
\nHi\n", - "
\nHi\n
\nHi\n", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi
", - "\nHi
", - "\nHi
", - "\nHi
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
Hi
\nfolks
\n!
", + 1 => "", + 2 => "Hi
", + 3 => "Hi
", + 4 => "Hi
\n", + 5 => "Hi
\n", + 6 => "Hi
\n", + 7 => "Hi
\n", + 8 => "Hi
\nfolks
", + 9 => "Hi
\nfolks
", + 10 => "Hi
\nfolks
\n!
", + 11 => "Hi
\nfolks
\n!
", + 12 => "Hi
\nfolks
\n!
", + 13 => "Hi
\nfolks
\n!
", + 14 => "Hi
\nfolks
\n!
", + 15 => "Hi
\nfolks
\n!
", + 16 => "Hi
\nfolks
\n!
", + 17 => "Hi
\nfolks
\n!
", + 18 => "Hi
\nfolks
\n!
", + 19 => "Hi
\nfolks
\n!
", + 20 => "Hi
\nfolks
\n!
", + 21 => "Hi
\nfolks
\n!
", + 22 => "Hi
\nfolks
\n!
", + 23 => "Hi
\nfolks
\n!
", + 24 => "Hi
\nfolks
\n!
", + 25 => "Hi
\nfolks
\n!
", + 26 => "Hi
\nfolks
\n!
", + 27 => "Hi
\nfolks
\n!
", + 28 => "Hi
\nfolks
\n!
", + 29 => "Hi
\nfolks
\n!
", + 30 => "Hi
\nfolks
\n!
", + 31 => "Hi
\nfolks
\n!
", + 32 => "Hi
\nfolks
\n!
", + 33 => "Hi
\nfolks
\n!
", + 34 => "Hi
\nfolks
\n!
", + 35 => "Hi
\nfolks
\n!
", + 36 => "Hi
\nfolks
\n!
", + 37 => "Hi
\nfolks
\n!
", ); // Test text_summary() for different sizes. for ($i = 0; $i <= 37; $i++) { - $this->callTextSummary($text, $expected[$i], NULL, $i); - $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i); - $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i); + $this->callTextSummary($text, $expected[$i], 'filtered_html', $i); } } /** + * Test that we avoid breaking text in the middle of a CODE block. + */ + function testCode() { + $text = 'This is an example code block:' + . '$example = "Sentence one. Sentence two. Sentence three.
';
+ $expected = 'This is an example code block: