From bef874b708cd29b78c25abc15d88062ece5803a8 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Mon, 30 May 2011 05:50:33 -0400
Subject: [PATCH] Issue #299138 by catch, Kevin Hankens, drewish, arjenk, jrglasgow, stella, sun, kscheirer, lilou, pillarsdotnet, stephandale: Fix the broken formatting in drupal_html_to_text() and also add tests.

---
 includes/mail.inc                  |  532 +++++++++++++++++++++++-------------
 modules/simpletest/tests/mail.test |  285 +++++++++++++++++++-
 2 files changed, 620 insertions(+), 197 deletions(-)

diff --git a/includes/mail.inc b/includes/mail.inc
index be2df923427ec363f671132771e9c97ee490c090..125bc28099b34d13e0fe27149ee562a300e9e4d2 100644
--- a/includes/mail.inc
+++ b/includes/mail.inc
@@ -12,6 +12,8 @@
  */
 define('MAIL_LINE_ENDINGS', isset($_SERVER['WINDIR']) || strpos($_SERVER['SERVER_SOFTWARE'], 'Win32') !== FALSE ? "\r\n" : "\n");
 
+define('NBSP', html_entity_decode('&nbsp;'));
+
 /**
  * Compose and optionally send an e-mail message.
  *
@@ -267,7 +269,7 @@ interface MailSystemInterface {
    * @return
    *   The formatted $message.
    */
-   public function format(array $message);
+  public function format(array $message);
 
   /**
    * Send a message composed by drupal_mail().
@@ -294,7 +296,7 @@ interface MailSystemInterface {
    * @return
    *   TRUE if the mail was successfully accepted for delivery, otherwise FALSE.
    */
-   public function mail(array $message);
+  public function mail(array $message);
 }
 
 /**
@@ -303,40 +305,47 @@ interface MailSystemInterface {
  * We use delsp=yes wrapping, but only break non-spaced languages when
  * absolutely necessary to avoid compatibility issues.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS)
+ * rather than "\r\n".
  *
  * @param $text
  *   The plain text to process.
  * @param $indent (optional)
  *   A string to indent the text with. Only '>' characters are repeated on
  *   subsequent wrapped lines. Others are replaced by spaces.
+ *
+ * @see drupal_mail()
  */
 function drupal_wrap_mail($text, $indent = '') {
-  // Convert CRLF into LF.
-  $text = str_replace("\r", '', $text);
+  // Convert LF or CRLF into platform-specific line-endings.
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+  $text = preg_replace('/\r?\n/', $eol, $text);
   // See if soft-wrapping is allowed.
   $clean_indent = _drupal_html_to_text_clean($indent);
   $soft = strpos($clean_indent, ' ') === FALSE;
   // Check if the string has line breaks.
-  if (strpos($text, "\n") !== FALSE) {
+  if (strpos($text, $eol) !== FALSE) {
     // Remove trailing spaces to make existing breaks hard.
-    $text = preg_replace('/ +\n/m', "\n", $text);
+    $text = preg_replace('/ +\r?\n/m', $eol, $text);
     // Wrap each line at the needed width.
-    $lines = explode("\n", $text);
-    array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => strlen($indent)));
-    $text = implode("\n", $lines);
+    $lines = explode($eol, $text);
+    array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => drupal_strlen($indent)));
+    $text = implode($eol, $lines);
   }
   else {
     // Wrap this line.
-    _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => strlen($indent)));
+    _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => drupal_strlen($indent)));
   }
   // Empty lines with nothing but spaces.
-  $text = preg_replace('/^ +\n/m', "\n", $text);
+  $text = preg_replace('/^ +\r?\n/m', $eol, $text);
   // Space-stuff special lines.
-  $text = preg_replace('/^(>| |From)/m', ' $1', $text);
+  $text = preg_replace('/^(>|From)/m', ' $1', $text);
+  // Strip and save trailing $eol chars to work around a bug in older PCRE
+  // libraries.
+  $stripped = rtrim($text, $eol);
+  $suffix = drupal_substr($text, drupal_strlen($stripped));
   // Apply indentation. We only include non-'>' indentation on the first line.
-  $text = $indent . substr(preg_replace('/^/m', $clean_indent, $text), strlen($indent));
-
+  $text = $indent . drupal_substr(preg_replace('/^/m', $clean_indent, $stripped), drupal_strlen($indent)) . $suffix;
   return $text;
 }
 
@@ -347,177 +356,328 @@ function drupal_wrap_mail($text, $indent = '') {
  * The output will be suitable for use as 'format=flowed; delsp=yes' text
  * (RFC 3676) and can be passed directly to drupal_mail() for sending.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS)
+ * rather than "\r\n".
  *
  * This function provides suitable alternatives for the following tags:
- * <a> <em> <i> <strong> <b> <br> <p> <blockquote> <ul> <ol> <li> <dl> <dt>
- * <dd> <h1> <h2> <h3> <h4> <h5> <h6> <hr>
+ *
+ * <a> <address> <b> <blockquote> <br /> <dd> <dl> <dt> <em>
+ * <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <ol> <p> <pre>
+ * <strong> <u> <ul>
+ *
+ * The following tags are also handled:
+ *
+ * <div> <tr>: Rendered the same as a <p> tag.
+ *
+ * <td>: A space is inserted between adjacent table cells.
  *
  * @param $string
  *   The string to be transformed.
- * @param $allowed_tags (optional)
- *   If supplied, a list of tags that will be transformed. If omitted, all
- *   all supported tags are transformed.
+ * @param $allowed_tags
+ *   (optional) If supplied, a list of tags that will be transformed. If
+ *   omitted, all supported tags are transformed.
  *
  * @return
  *   The transformed string.
+ *
+ * @see drupal_mail()
  */
 function drupal_html_to_text($string, $allowed_tags = NULL) {
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
   // Cache list of supported tags.
   static $supported_tags;
-  if (empty($supported_tags)) {
-    $supported_tags = array('a', 'em', 'i', 'strong', 'b', 'br', 'p', 'blockquote', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr');
+  if (!isset($supported_tags)) {
+    $supported_tags = array(
+      'a', 'address', 'b', 'blockquote', 'br', 'dd', 'div', 'dl', 'dt', 'em',
+      'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'li', 'ol', 'p', 'pre',
+      'strong', 'td', 'tr', 'u', 'ul',
+    );
   }
 
   // Make sure only supported tags are kept.
   $allowed_tags = isset($allowed_tags) ? array_intersect($supported_tags, $allowed_tags) : $supported_tags;
 
-  // Make sure tags, entities and attributes are well-formed and properly nested.
-  $string = _filter_htmlcorrector(filter_xss($string, $allowed_tags));
-
-  // Apply inline styles.
-  $string = preg_replace('!</?(em|i)((?> +)[^>]*)?>!i', '/', $string);
-  $string = preg_replace('!</?(strong|b)((?> +)[^>]*)?>!i', '*', $string);
-
-  // Replace inline <a> tags with the text of link and a footnote.
-  // 'See <a href="http://drupal.org">the Drupal site</a>' becomes
-  // 'See the Drupal site [1]' with the URL included as a footnote.
-  _drupal_html_to_mail_urls(NULL, TRUE);
-  $pattern = '@(<a[^>]+?href="([^"]*)"[^>]*?>(.+?)</a>)@i';
-  $string = preg_replace_callback($pattern, '_drupal_html_to_mail_urls', $string);
-  $urls = _drupal_html_to_mail_urls();
-  $footnotes = '';
-  if (count($urls)) {
-    $footnotes .= "\n";
-    for ($i = 0, $max = count($urls); $i < $max; $i++) {
-      $footnotes .= '[' . ($i + 1) . '] ' . $urls[$i] . "\n";
+  // Parse $string into a DOM tree.
+  $dom = filter_dom_load($string);
+  $notes = array();
+  $text = _drupal_html_to_text($dom->documentElement, $allowed_tags, $notes);
+  // Convert non-breaking spaces to spaces; trim leading/trailing line endings.
+  $text = trim(str_replace(NBSP, ' ', $text), $eol);
+  // Add footnotes.
+  if ($notes) {
+    // Add a blank line before the footnote list.
+    $text .= $eol;
+    foreach ($notes as $url => $note) {
+      $text .= $eol . '[' . $note . '] ' . $url;
     }
   }
+  return $text;
+}
 
-  // Split tags from text.
-  $split = preg_split('/<([^>]+?)>/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
-  // Note: PHP ensures the array consists of alternating delimiters and literals
-  // and begins and ends with a literal (inserting $null as required).
-
-  $tag = FALSE; // Odd/even counter (tag or no tag)
-  $casing = NULL; // Case conversion function
-  $output = '';
-  $indent = array(); // All current indentation string chunks
-  $lists = array(); // Array of counters for opened lists
-  foreach ($split as $value) {
-    $chunk = NULL; // Holds a string ready to be formatted and output.
-
-    // Process HTML tags (but don't output any literally).
-    if ($tag) {
-      list($tagname) = explode(' ', strtolower($value), 2);
-      switch ($tagname) {
-        // List counters
-        case 'ul':
-          array_unshift($lists, '*');
-          break;
-        case 'ol':
-          array_unshift($lists, 1);
-          break;
-        case '/ul':
-        case '/ol':
-          array_shift($lists);
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Quotation/list markers, non-fancy headers
-        case 'blockquote':
-          // Format=flowed indentation cannot be mixed with lists.
-          $indent[] = count($lists) ? ' "' : '>';
-          break;
-        case 'li':
-          $indent[] = is_numeric($lists[0]) ? ' ' . $lists[0]++ . ') ' : ' * ';
-          break;
-        case 'dd':
-          $indent[] = '    ';
-          break;
-        case 'h3':
-          $indent[] = '.... ';
-          break;
-        case 'h4':
-          $indent[] = '.. ';
-          break;
-        case '/blockquote':
-          if (count($lists)) {
-            // Append closing quote for inline quotes (immediately).
-            $output = rtrim($output, "> \n") . "\"\n";
-            $chunk = ''; // Ensure blank new-line.
+/**
+ * Helper function for drupal_html_to_text().
+ *
+ * Recursively converts $node to text, wrapping and indenting as necessary.
+ *
+ * @param $node
+ *   The source DOMNode.
+ * @param $allowed_tags
+ *   A list of tags that will be transformed.
+ * @param $notes
+ *   The list of footnotes, an associative array of (url => reference number) items.
+ * @param $parents
+ *   The list of ancestor tags, from nearest to most distant.
+ * @param $count
+ *   The number to use for the next list item within an ordered list.
+ */
+function _drupal_html_to_text(DOMNode $node, array $allowed_tags, array &$notes, $parents = array(), &$count = NULL) {
+  if (is_null($count)) {
+    $count = 1;
+  }
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+  if ($node->nodeType === XML_TEXT_NODE) {
+    // For text nodes, we just copy the text content.
+    $text = $node->textContent;
+    // Collapse whitespace except within pre tags.
+    if (!in_array('pre', $parents)) {
+      $text = preg_replace('/[[:space:]]+/', ' ', $text);
+    }
+    return $text;
+  }
+  // Non-text node.
+  $tag = '';
+  $text = '';
+  $child_text = '';
+  $child_count = 1;
+  $prefix = '';
+  $indent = '';
+  $suffix = '';
+  if (isset($node->tagName) && in_array($node->tagName, $allowed_tags)) {
+    $tag = $node->tagName;
+    switch ($tag) {
+      // Turn links with valid hrefs into footnotes.
+      case 'a':
+        if ( !empty($node->attributes)
+          && ($href = $node->attributes->getNamedItem('href'))
+          && ($url = url(ltrim($href->nodeValue, '/'), array('absolute' => TRUE)))
+          && valid_url($url) ) {
+          // Only add links that have not already been added.
+          if (isset($notes[$url])) {
+            $note = $notes[$url];
           }
-          // Fall-through
-        case '/li':
-        case '/dd':
-          array_pop($indent);
-          break;
-        case '/h3':
-        case '/h4':
-          array_pop($indent);
-        case '/h5':
-        case '/h6':
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Fancy headers
-        case 'h1':
-          $indent[] = '======== ';
-          $casing = 'drupal_strtoupper';
-          break;
-        case 'h2':
-          $indent[] = '-------- ';
-          $casing = 'drupal_strtoupper';
-          break;
-        case '/h1':
-        case '/h2':
-          $casing = NULL;
-          // Pad the line with dashes.
-          $output = _drupal_html_to_text_pad($output, ($tagname == '/h1') ? '=' : '-', ' ');
-          array_pop($indent);
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Horizontal rulers
-        case 'hr':
-          // Insert immediately.
-          $output .= drupal_wrap_mail('', implode('', $indent)) . "\n";
-          $output = _drupal_html_to_text_pad($output, '-');
-          break;
-
-        // Paragraphs and definition lists
-        case '/p':
-        case '/dl':
-          $chunk = ''; // Ensure blank new-line.
-          break;
-      }
+          else {
+            $note = count($notes) + 1;
+            $notes[$url] = $note;
+          }
+          $suffix = ' [' . $note . ']';
+        }
+        break;
+
+      // Generic block-level tags.
+      case 'address':
+      case 'div':
+      case 'p':
+      case 'pre':
+        $text = $eol;
+        $suffix = $eol;
+        break;
+
+      // Forced line break.
+      case 'br':
+        $text = $eol;
+        break;
+
+      // Boldface by wrapping with "*" characters.
+      case 'b':
+      case 'strong':
+        $prefix = '*';
+        $suffix = '*';
+        break;
+
+      // Italicize by wrapping with "/" characters.
+      case 'em':
+      case 'i':
+        $prefix = '/';
+        $suffix = '/';
+        break;
+
+      // Underline by wrapping with "_" characters.
+      case 'u':
+        $prefix = '_';
+        $suffix = '_';
+        break;
+
+      // Blockquotes are indented by "> " at each level.
+      case 'blockquote':
+        $text = $eol;
+        $indent = '>' . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Dictionary definitions are indented by four spaces.
+      case 'dd':
+        $indent = NBSP . NBSP . NBSP . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Dictionary list.
+      case 'dl':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+
+      // Dictionary term.
+      case 'dt':
+        $suffix = $eol;
+        break;
+
+      // Header level 1 is prefixed by eight "=" characters.
+      case 'h1':
+        $text = "$eol$eol";
+        $indent = '======== ';
+        $suffix = $eol;
+        break;
+
+      // Header level 2 is prefixed by six "-" characters.
+      case 'h2':
+        $text = "$eol$eol";
+        $indent = '------ ';
+        $suffix = $eol;
+        break;
+
+      // Header level 3 is prefixed by four "." characters and a space.
+      case 'h3':
+        $text = "$eol$eol";
+        $indent = '....' . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Header level 4 is prefixed by three "." characters and a space.
+      case 'h4':
+        $text = "$eol$eol";
+        $indent = '...' . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Header level 5 is prefixed by two "." characters and a space.
+      case 'h5':
+        $text = "$eol$eol";
+        $indent = '..' . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Header level 6 is prefixed by one "." character and a space.
+      case 'h6':
+        $text = "$eol$eol";
+        $indent = '.' . NBSP;
+        $suffix = $eol;
+        break;
+
+      // Horizontal rulers become a line of 78 "-" characters.
+      case 'hr':
+        $text = $eol . str_repeat('-', 78) . $eol;
+        break;
+
+      // List items are treated differently depending on the parent tag.
+      case 'li':
+        // Ordered list item.
+        if (reset($parents) === 'ol') {
+          // Check the value attribute.
+          if ( !empty($node->attributes)
+            && ($value = $node->attributes->getNamedItem('value'))) {
+            $count = $value->nodeValue;
+          }
+          $indent = ($count < 10 ? NBSP : '') . NBSP . "$count)" . NBSP;
+          $count++;
+        }
+        // Unordered list item.
+        else {
+          $indent = NBSP . '*' . NBSP;
+        }
+        $suffix = $eol;
+        break;
+
+      // Ordered lists.
+      case 'ol':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+        // Check the start attribute.
+        if ( !empty($node->attributes)
+          && ($value = $node->attributes->getNamedItem('start')) ) {
+          $child_count = $value->nodeValue;
+        }
+        break;
+
+      // Start and end tables on a new line.
+      case 'table':
+        $text = $eol;
+        $suffix = $eol;
+        break;
+
+      // Append a space after each table cell that has a following sibling.
+      case 'td':
+        if (!empty($node->nextSibling)) {
+          $suffix = NBSP;
+        }
+        break;
+
+      // End each table row with a newline.
+      case 'tr':
+        $suffix = $eol;
+        break;
+
+      // Unordered lists.
+      case 'ul':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+        break;
+
+      default:
+        break;
     }
-    // Process blocks of text.
-    else {
-      // Convert inline HTML text to plain text; not removing line-breaks or
-      // white-space, since that breaks newlines when sanitizing plain-text.
-      $value = trim(decode_entities($value));
-      if (drupal_strlen($value)) {
-        $chunk = $value;
-      }
+    // Only add allowed tags to the $parents array.
+    array_unshift($parents, $tag);
+  }
+  // Copy each child node to output.
+  if ($node->hasChildNodes()) {
+    foreach ($node->childNodes as $child) {
+      $child_text .= _drupal_html_to_text($child, $allowed_tags, $notes, $parents, $child_count);
     }
-
-    // See if there is something waiting to be output.
-    if (isset($chunk)) {
-      // Apply any necessary case conversion.
-      if (isset($casing)) {
-        $chunk = $casing($chunk);
+  }
+  // We only add prefix and suffix if the child nodes were non-empty.
+  if (drupal_strlen($child_text)) {
+    // Don't add a newline to an existing newline.
+    if ($suffix === $eol && drupal_substr($child_text, - drupal_strlen($eol)) === $eol) {
+      $suffix = '';
+    }
+    $child_text = $prefix . $child_text . $suffix;
+    // Remove spaces around newlines.
+    $child_text = preg_replace('/ *\n */', "\n", $child_text);
+    $child_text = drupal_wrap_mail($child_text, $indent);
+    // We capitalize the contents of h1 and h2 tags.
+    if ($tag === 'h1' || $tag === 'h2') {
+      $child_text = drupal_strtoupper($child_text);
+      // For h1 and h2 tags at the top level, pad each non-empty line with the
+      // character used for indentation.
+      if (count($parents) == 1) {
+        $pad = drupal_substr($indent, 0, 1);
+        $lines = explode($eol, $child_text);
+        foreach ($lines as $i => $line) {
+          if (strlen($line)) {
+            $lines[$i] = _drupal_html_to_text_pad($line . NBSP, $pad);
+          }
+        }
+        $child_text = implode($eol, $lines);
       }
-      // Format it and apply the current indentation.
-      $output .= drupal_wrap_mail($chunk, implode('', $indent));
-      // Remove non-quotation markers from indentation.
-      $indent = array_map('_drupal_html_to_text_clean', $indent);
     }
-
-    $tag = !$tag;
+    $text .= $child_text;
   }
-
-  return $output . $footnotes;
+  return $text;
 }
 
 /**
@@ -526,37 +686,11 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
  * Wraps words on a single line.
  */
 function _drupal_wrap_mail_line(&$line, $key, $values) {
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
   // Use soft-breaks only for purely quoted or unindented text.
-  $line = wordwrap($line, 77 - $values['length'], $values['soft'] ? "  \n" : "\n");
+  $line = wordwrap($line, 77 - $values['length'], ($values['soft'] ? ' ' : '') . $eol);
   // Break really long words at the maximum width allowed.
-  $line = wordwrap($line, 996 - $values['length'], $values['soft'] ? " \n" : "\n");
-}
-
-/**
- * Helper function for drupal_html_to_text().
- *
- * Keeps track of URLs and replaces them with placeholder tokens.
- */
-function _drupal_html_to_mail_urls($match = NULL, $reset = FALSE) {
-  global $base_url, $base_path;
-  static $urls = array(), $regexp;
-
-  if ($reset) {
-    // Reset internal URL list.
-    $urls = array();
-  }
-  else {
-    if (empty($regexp)) {
-      $regexp = '@^' . preg_quote($base_path, '@') . '@';
-    }
-    if ($match) {
-      list(, , $url, $label) = $match;
-      // Ensure all URLs are absolute.
-      $urls[] = strpos($url, '://') ? $url : preg_replace($regexp, $base_url . '/', $url);
-      return $label . ' [' . count($urls) . ']';
-    }
-  }
-  return $urls;
+  $line = wordwrap($line, 996 - $values['length'], ($values['soft'] ? ' ' : '') . $eol);
 }
 
 /**
@@ -565,7 +699,7 @@ function _drupal_html_to_mail_urls($match = NULL, $reset = FALSE) {
  * Replace all non-quotation markers from a given piece of indentation with spaces.
  */
 function _drupal_html_to_text_clean($indent) {
-  return preg_replace('/[^>]/', ' ', $indent);
+  return preg_replace('/[^>]/', NBSP, $indent);
 }
 
 /**
@@ -574,13 +708,19 @@ function _drupal_html_to_text_clean($indent) {
  * Pad the last line with the given character.
  */
 function _drupal_html_to_text_pad($text, $pad, $prefix = '') {
-  // Remove last line break.
-  $text = substr($text, 0, -1);
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
   // Calculate needed padding space and add it.
-  if (($p = strrpos($text, "\n")) === FALSE) {
+  if (($p = strrpos($text, $eol)) === FALSE) {
     $p = -1;
   }
-  $n = max(0, 79 - (strlen($text) - $p) - strlen($prefix));
-  // Add prefix and padding, and restore linebreak.
-  return $text . $prefix . str_repeat($pad, $n) . "\n";
+  else {
+    // Convert position from byte count to character count. Must use substr()
+    // instead of drupal_substr() to match the previous strrpos() for which
+    // Drupal has no unicode-safe alternative.
+    $p = drupal_strlen(substr($text, 0, $p));
+  }
+  // Subtract the result of strrpos().
+  $n = max(0, 78 - (drupal_strlen($text) - $p) - drupal_strlen($prefix));
+  // Add prefix and padding.
+  return $text . $prefix . str_repeat($pad, $n);
 }
diff --git a/modules/simpletest/tests/mail.test b/modules/simpletest/tests/mail.test
index 8a7b152d9d32eee7ae47c9ef8b5fb9c77f4e0cf1..5b4707b9ede841e62c7c16bd763c7aee697b5d05 100644
--- a/modules/simpletest/tests/mail.test
+++ b/modules/simpletest/tests/mail.test
@@ -1,6 +1,7 @@
 <?php
 
 /**
+ * @file
  * Test the Drupal mailing system.
  */
 class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
@@ -43,7 +44,7 @@ class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
   /**
    * Concatenate and wrap the e-mail body for plain-text mails.
    *
-   * @see DefaultMailSystem
+   * @see DefaultMailSystem()
    */
   public function format(array $message) {
     // Join the body array into one string.
@@ -63,3 +64,285 @@ class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
   }
 }
 
+/**
+ * Unit tests for drupal_html_to_text().
+ */
+class DrupalHtmlToTextTestCase extends DrupalUnitTestCase {
+  public static function getInfo() {
+    return array(
+      'name'  => 'HTML to text conversion',
+      'description' => 'Tests drupal_html_to_text().',
+      'group' => 'Mail',
+    );
+  }
+
+  function _string_to_html($text) {
+    return '"' .
+      str_replace(
+        array("\n", ' '),
+        array('\n', '&nbsp;'),
+        check_plain($text)
+      ) . '"';
+  }
+
+  function _testHtmlToText($html, $text, $args = NULL, $reason = '') {
+    $result = drupal_html_to_text($html, $args);
+    $this->assertEqual(
+      $result,
+      $text,
+      (drupal_strlen($reason) ? $reason . '<br />' : '')
+      . 'html = ' . $this->_string_to_html($html) . '<br />'
+      . 'result = ' . $this->_string_to_html($result) . '<br />'
+      . 'expected = ' . $this->_string_to_html($text)
+    );
+  }
+
+  /**
+   * Test all supported tags of drupal_html_to_text().
+   */
+  function testTags() {
+    $tests = array(
+      '<a href = "http://drupal.org">Drupal.org</a>' => "Drupal.org [1]\n\n[1] http://drupal.org",
+      '<a href = "/">Homepage</a>' => "Homepage [1]\n\n[1] " . url('', array('absolute' => TRUE)),
+      '<address>Drupal</address>' => "Drupal",
+      '<address>Drupal</address><address>Drupal</address>' => "Drupal\n\nDrupal",
+      '<b>Drupal</b>' => "*Drupal*",
+      '<blockquote>Drupal</blockquote>' => " > Drupal",
+      '<blockquote>Drupal</blockquote><blockquote>Drupal</blockquote>' => " > Drupal\n\n > Drupal",
+      '<br />Drupal<br />Drupal<br /><br />Drupal' => "Drupal\nDrupal\n\nDrupal",
+      '<br/>Drupal<br/>Drupal<br/><br/>Drupal' => "Drupal\nDrupal\n\nDrupal",
+      '<br/>Drupal<br/>Drupal<br/><br/>Drupal<p>Drupal</p>' => "Drupal\nDrupal\n\nDrupal\nDrupal",
+      '<div>Drupal</div>' => "Drupal",
+      '<div>Drupal</div><div>Drupal</div>' => "Drupal\n\nDrupal",
+      '<em>Drupal</em>' => "/Drupal/",
+      '<h1>Drupal</h1>' => "======== DRUPAL " . str_repeat('=', 61),
+      '<h1>Drupal</h1><p>Drupal</p>' => "======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal",
+      '<h2>Drupal</h2>' => "------ DRUPAL " . str_repeat('-', 63),
+      '<h2>Drupal</h2><p>Drupal</p>' => "------ DRUPAL " . str_repeat('-', 63) . "\n\nDrupal",
+      '<h3>Drupal</h3>' => ".... Drupal",
+      '<h3>Drupal</h3><p>Drupal</p>' => ".... Drupal\n\nDrupal",
+      '<h4>Drupal</h4>' => "... Drupal",
+      '<h4>Drupal</h4><p>Drupal</p>' => "... Drupal\n\nDrupal",
+      '<h5>Drupal</h5>' => ".. Drupal",
+      '<h5>Drupal</h5><p>Drupal</p>' => ".. Drupal\n\nDrupal",
+      '<h6>Drupal</h6>' => ". Drupal",
+      '<h6>Drupal</h6><p>Drupal</p>' => ". Drupal\n\nDrupal",
+      '<hr />Drupal<hr />' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78),
+      '<hr/>Drupal<hr/>' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78),
+      '<hr/>Drupal<hr/><p>Drupal</p>' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78) . "\n\nDrupal",
+      '<i>Drupal</i>' => "/Drupal/",
+      '<p>Drupal</p>' => "Drupal",
+      '<p>Drupal</p><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<pre>Drupal</pre>' => "Drupal",
+      '<pre>Drupal</pre>Drupal' => "Drupal\nDrupal",
+      '<pre>Drupal</pre><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<strong>Drupal</strong>' => "*Drupal*",
+      '<table><tr><td>Drupal</td><td>Drupal</td></tr><tr><td>Drupal</td><td>Drupal</td></tr></table>' => "Drupal Drupal\nDrupal Drupal",
+      '<table><tr><td>Drupal</td></tr></table><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<u>Drupal</u>' => "_Drupal_",
+      '<ul><li>Drupal</li></ul>' => " * Drupal",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</li></ul>' => " * Drupal /Drupal/ Drupal",
+      '<ul><li>Drupal</li><li><ol><li>Drupal</li><li>Drupal</li></ol></li></ul>' => " * Drupal\n *   1) Drupal\n     2) Drupal",
+      '<ul><li>Drupal</li><li><ol><li>Drupal</li></ol></li><li>Drupal</li></ul>' => " * Drupal\n *   1) Drupal\n * Drupal",
+      '<ul><li>Drupal</li><li>Drupal</li></ul>' => " * Drupal\n * Drupal",
+      '<ul><li>Drupal</li></ul><p>Drupal</p>' => " * Drupal\n\nDrupal",
+      '<ol><li>Drupal</li></ol>' => "  1) Drupal",
+      '<ol><li>Drupal</li><li><ul><li>Drupal</li><li>Drupal</li></ul></li></ol>' => "  1) Drupal\n  2)  * Drupal\n      * Drupal",
+      '<ol><li>Drupal</li><li>Drupal</li></ol>' => "  1) Drupal\n  2) Drupal",
+      '<ol>Drupal</ol>' => "Drupal",
+      '<ol><li>Drupal</li></ol><p>Drupal</p>' => "  1) Drupal\n\nDrupal",
+      '<dl><dt>Drupal</dt></dl>' => "Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd></dl>' => "Drupal\n    Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd><dt>Drupal</dt><dd>Drupal</dd></dl>' => "Drupal\n    Drupal\nDrupal\n    Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd></dl><p>Drupal</p>' => "Drupal\n    Drupal\n\nDrupal",
+      '<dl><dt>Drupal<dd>Drupal</dl>' => "Drupal\n    Drupal",
+      '<dl><dt>Drupal</dt></dl><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<ul><li>Drupal</li><li><dl><dt>Drupal</dt><dd>Drupal</dd><dt>Drupal</dt><dd>Drupal</dd></dl></li><li>Drupal</li></ul>' => " * Drupal\n * Drupal\n       Drupal\n   Drupal\n       Drupal\n * Drupal",
+      // Tests malformed HTML tags.
+      '<br>Drupal<br>Drupal' => "Drupal\nDrupal",
+      '<hr>Drupal<hr>Drupal' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78) . "\nDrupal",
+      '<ol><li>Drupal<li>Drupal</ol>' => "  1) Drupal\n  2) Drupal",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</ul></ul>' => " * Drupal /Drupal/ Drupal",
+      '<ul><li>Drupal<li>Drupal</ol>' => " * Drupal\n * Drupal",
+      '<ul><li>Drupal<li>Drupal</ul>' => " * Drupal\n * Drupal",
+      '<ul>Drupal</ul>' => "Drupal",
+      'Drupal</ul></ol></dl><li>Drupal' => "Drupal * Drupal",
+      '<dl>Drupal</dl>' => "Drupal",
+      '<dl>Drupal</dl><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<dt>Drupal</dt>' => "Drupal",
+      // Tests some unsupported HTML tags.
+      '<html>Drupal</html>' => "Drupal",
+      '<script type="text/javascript">Drupal</script>' => "",
+    );
+
+    foreach ($tests as $html => $text) {
+      $this->_testHtmlToText($html, $text);
+    }
+  }
+
+  /**
+   * Test $allowed_tags argument of drupal_html_to_text().
+   */
+  function testDrupalHtmlToTextArgs() {
+    // The second parameter of drupal_html_to_text() overrules the allowed tags.
+    $this->_testHtmlToText(
+      'Drupal <b>Drupal</b> Drupal',
+      'Drupal *Drupal* Drupal',
+      array('b'),
+      'Allowed &lt;b&gt; tag found.'
+    );
+    $this->_testHtmlToText(
+      'Drupal <h1>Drupal</h1> Drupal',
+      'Drupal Drupal Drupal',
+      array('b'),
+      'Disallowed &lt;h1&gt; tag not found.'
+    );
+
+    $this->_testHtmlToText(
+      'Drupal <p><em><b>Drupal</b></em><p> Drupal',
+      'Drupal Drupal Drupal',
+      array('a', 'br', 'h1'),
+      'Disallowed &lt;p&gt;, &lt;em&gt;, and &lt;b&gt; tags not found.'
+    );
+
+    $this->_testHtmlToText(
+      '<html><body>Drupal</body></html>',
+      'Drupal',
+      array('html', 'body'),
+      'Unsupported &lt;html&gt; and &lt;body&gt; tags not found.'
+    );
+  }
+
+  /**
+   * Test that whitespace is collapsed, except within <pre> tags.
+   */
+  function testDrupalHtmltoTextCollapsesWhitespace() {
+    $input = "<pre>Drupal  Drupal\n\nDrupal<pre>Drupal  Drupal\n\nDrupal</pre>Drupal  Drupal\n\nDrupal</pre>";
+    $collapsed = "Drupal Drupal DrupalDrupal Drupal DrupalDrupal Drupal Drupal";
+    $preserved = "Drupal  Drupal\n\nDrupal\nDrupal  Drupal\n\nDrupal\nDrupal  Drupal\n\nDrupal";
+    $this->_testHtmlToText(
+      $input,
+      $collapsed,
+      array('p'),
+      'Whitespace inside disallowed &lt;pre&gt; tags is collapsed:<br />'
+    );
+    $this->_testHtmlToText(
+      $input,
+      $preserved,
+      NULL,
+      'Whitespace inside allowed &lt;pre&gt; tags is preserved:<br />'
+    );
+  }
+
+  /**
+   * Test that text separated by block-level tags in HTML gets separated by
+   * (at least) a newline in the plaintext version.
+   */
+  function testDrupalHtmlToTextBlockTagToNewline() {
+    $input = '[text]'
+      . '<address>[address]</address>'
+      . '<blockquote>[blockquote]</blockquote>'
+      . '<br />[br]'
+      . '<div>[div]</div>'
+      . '<dl><dt>[dl-dt]</dt>'
+      . '<dt>[dt]</dt>'
+      . '<dd>[dd]</dd>'
+      . '<dd>[dd-dl]</dd></dl>'
+      . '<h1>[h1]</h1>'
+      . '<h2>[h2]</h2>'
+      . '<h3>[h3]</h3>'
+      . '<h4>[h4]</h4>'
+      . '<h5>[h5]</h5>'
+      . '<h6>[h6]</h6>'
+      . '<hr />[hr]'
+      . '<ol><li>[ol-li]</li>'
+      . '<li>[li]</li>'
+      . '<li>[li-ol]</li></ol>'
+      . '<p>[p]</p>'
+      . '<pre>[pre]</pre>'
+      . '<table><thead><tr><td>[table-thead--tr-td]</td></tr></thead>'
+      . '<tbody><tr><td>[tbody-tr-td]</td></tr>'
+      . '<tr><td>[tr-td]</td></tr></tbody></table>'
+      . '<ul><li>[ul-li]</li>'
+      . '<li>[li-ul]</li></ul>'
+      . '[text]';
+    $output = drupal_html_to_text($input);
+    $this->assertFalse(
+      preg_match('/\][^\n]*\[/s', $output),
+      'Block-level HTML tags should force newlines: '
+      . nl2br(check_plain($output))
+    );
+    $output_upper = drupal_strtoupper($output);
+    $upper_input = drupal_strtoupper($input);
+    $upper_output = drupal_html_to_text($upper_input);
+    $this->assertEqual(
+      $upper_output,
+      $output_upper,
+      'Tag recognition should be case-insensitive:<br />'
+      . $upper_output
+      . '<br />should  be equal to <br />'
+      . $output_upper
+    );
+  }
+
+  /**
+   * Test that headers are properly separated from surrounding text.
+   */
+  function testHeaderSeparation() {
+    // Text before and after.
+    $html = 'Drupal<h1>Drupal</h1>Drupal';
+    $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 61) . "\nDrupal";
+    $this->_testHtmlToText($html, $text);
+    // Paragraph before; text after.
+    $html = '<p>Drupal</p><h1>Drupal</h1>Drupal';
+    $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 61) . "\nDrupal";
+    $this->_testHtmlToText($html, $text);
+    // Text before; paragraph after.
+    $html = 'Drupal<h1>Drupal</h1><p>Drupal</p>';
+    $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal";
+    $this->_testHtmlToText($html, $text);
+    // Paragraph before and after.
+    $html = '<p>Drupal</p><h1>Drupal</h1><p>Drupal</p>';
+    $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal";
+    $this->_testHtmlToText($html, $text);
+  }
+
+  /**
+   * Test that footnote references are properly generated.
+   */
+  function testFootnoteReferences() {
+    $source = '<a href="http://www.example.com/node/1">Host and path</a>'
+      . '<br /><a href="http://www.example.com">Host, no path</a>'
+      . '<br /><a href="/node/1">Path, no host</a>'
+      . '<br /><a href="node/1">Relative path</a>';
+    $tt = "Host and path [1]"
+      . "\nHost, no path [2]"
+      . "\nPath, no host [3]"
+      . "\nRelative path [3]"
+      . "\n"
+      . "\n[1] http://www.example.com/node/1"
+      . "\n[2] http://www.example.com"
+      . "\n[3] " . url('node/1', array('absolute' => TRUE));
+    $this->_testHtmlToText($source, $tt);
+  }
+
+  /**
+   * Test that combinations of paragraph breaks, line breaks, linefeeds,
+   * and spaces are properly handled.
+   */
+  function testDrupalHtmlToTextParagraphs() {
+    $tests = array(
+      array(
+        'html' => "<p>line 1<br />\nline 2<br />line 3\n<br />line 4</p><p>paragraph</p>",
+        'text' => "line 1\nline 2\nline 3\nline 4\n\nparagraph",
+      ),
+      array(
+        'html' => "<p>line 1<br /> line 2</p> <p>line 4<br /> line 5</p> <p>0</p>",
+        'text' => "line 1\nline 2\n\nline 4\nline 5\n\n0",
+      )
+    );
+    foreach ($tests as $test) {
+      $this->_testHtmlToText($test['html'], $test['text']);
+    }
+  }
+}
-- 
1.7.4.1

