Allow the style attribute and CSS property values to be conditionally allowed through the HTML filter [#1311064]

The default HTML filter is a great way to provide protection against XSS attacks, and it allows the administrator to quickly designate which HTML tags are OK to be rendered.

Unfortunately, it wipes out the CSS style attribute and its values. A lot of WYSIWYG editors use style as an attribute, which is unconditionally blocked in _filter_xss_attributes(). CSS properties are then nuked by filter_xss_bad_protocol() -- which reads the colon-separated name-value pairs as a bad URL protocol.

A lot of WYSIWYG editors also put extra divs into the markup, and if it weren't for the fact that the HTML filter blows away the style attribute, the default HTML filter would be a great way to rid the output of these extra divs, and to disallow other types of markup that the WYSIWYG editor might be generating.

I threw together a module that adds another HTML filter option, which is basically the default option, without the removal of the style attribute, and it's got a list of CSS properties that are OK'd as "allowed protocol". I'm putting it out there and can't really maintain or update it, but if anybody wanted to maybe look at making a module or putting an option to make the default Drupal filter behave like this does, please do.

The WA prefix stands for Western Ascent, Inc

wafilter.module:



/**
 * Implementation of hook_filter().
 *
 * Declares one filter (delta 0), "WAFilter", and dispatches each operation to
 * the helper that implements it.
 *
 * @param $op
 *   The filter operation: 'list', 'description', 'process' or 'settings'.
 *   Any other operation hands $text back unchanged.
 * @param $delta
 *   Which of this module's filters is being addressed; only 0 is defined.
 * @param $format
 *   The input format the filter is running in.
 * @param $text
 *   The text to filter (used by 'process' and by unhandled operations).
 *
 * @return
 *   - 'list': array of filter names keyed by delta.
 *   - 'description': translated description for delta 0, nothing otherwise.
 *   - 'process': the filtered text for delta 0, $text otherwise.
 *   - 'settings': the settings form for delta 0, nothing otherwise.
 *   - anything else: $text.
 */
function wafilter_filter($op, $delta = 0, $format = -1, $text = '') {
  // Loose comparisons are used on purpose so that string deltas such as '0'
  // are treated exactly as a switch statement would treat them.
  $is_wafilter = ($delta == 0);

  if ($op == 'list') {
    return array(0 => t('WAFilter'));
  }

  if ($op == 'description') {
    if ($is_wafilter) {
      return t('Similar to default HTML Filter, but doesn\'t remove the inline style attribute.');
    }
    return;
  }

  if ($op == 'process') {
    return $is_wafilter ? _filter_wafilter($text, $format) : $text;
  }

  if ($op == 'settings') {
    if ($is_wafilter) {
      return _filter_wafilter_settings($format);
    }
    return;
  }

  // Operations this filter does not implement leave the text alone.
  return $text;
}


/**
 * Settings form for the WAFilter.
 *
 * The elements are grouped in a fieldset keyed 'wafilter'. The key is
 * deliberately different from the 'filter_html' key used by the core HTML
 * filter: the per-filter settings forms of one input format are combined into
 * a single form array, so two filters using the same top-level key would
 * replace each other's fieldset when both are enabled. The fieldset key is not
 * part of the stored variable names, so existing settings keep working.
 *
 * @param $format
 *   The input format whose settings are edited; it is appended to every
 *   variable name.
 *
 * @return
 *   A form array with these settings, all stored per format:
 *   - wafilter_html_$format: strip disallowed tags or escape all tags.
 *   - waallowed_html_$format: the list of tags that survive stripping.
 *   - wafilter_html_help_$format: whether to show HTML help in filter tips.
 *   - wafilter_html_nofollow_$format: whether to add rel="nofollow" to links.
 */
function _filter_wafilter_settings($format) {
  $form['wafilter'] = array(
    '#type' => 'fieldset',
    '#title' => t('WAFilter'),
    '#collapsible' => TRUE,
  );
  $form['wafilter']["wafilter_html_$format"] = array(
    '#type' => 'radios',
    '#title' => t('Filter HTML tags'),
    '#default_value' => variable_get("wafilter_html_$format", FILTER_HTML_STRIP),
    '#options' => array(FILTER_HTML_STRIP => t('Strip disallowed tags'), FILTER_HTML_ESCAPE => t('Escape all tags')),
    '#description' => t('How to deal with HTML tags in user-contributed content. If set to "Strip disallowed tags", dangerous tags are removed (see below). If set to "Escape tags", all HTML is escaped and presented as it was typed.'),
  );
  $form['wafilter']["waallowed_html_$format"] = array(
    '#type' => 'textfield',
    '#title' => t('Allowed HTML tags'),
    '#default_value' => variable_get("waallowed_html_$format", '<a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>'),
    '#size' => 64,
    '#maxlength' => 1024,
    '#description' => t('If "Strip disallowed tags" is selected, optionally specify tags which should not be stripped. JavaScript event attributes are always stripped.'),
  );
  $form['wafilter']["wafilter_html_help_$format"] = array(
    '#type' => 'checkbox',
    '#title' => t('Display HTML help'),
    '#default_value' => variable_get("wafilter_html_help_$format", 1),
    '#description' => t('If enabled, Drupal will display some basic HTML help in the long filter tips.'),
  );
  $form['wafilter']["wafilter_html_nofollow_$format"] = array(
    '#type' => 'checkbox',
    '#title' => t('Spam link deterrent'),
    '#default_value' => variable_get("wafilter_html_nofollow_$format", FALSE),
    '#description' => t('If enabled, Drupal will add rel="nofollow" to all links, as a measure to reduce the effectiveness of spam links. Note: this will also prevent valid links from being followed by search engines, therefore it is likely most effective when enabled for anonymous users.'),
  );
  return $form;
}

/**
 * HTML filter. Provides filtering of input into accepted HTML.
 *
 * Depending on the per-format setting the text is either run through
 * wafilter_xss() with the configured tag whitelist, or fully escaped with
 * check_plain(). Optionally rel="nofollow" is appended to anchor tags.
 *
 * @param $text
 *   The text to filter.
 * @param $format
 *   The input format; selects which stored settings are used.
 *
 * @return
 *   The filtered text with surrounding whitespace trimmed.
 */
function _filter_wafilter($text, $format) {
  $filter_mode = variable_get("wafilter_html_$format", FILTER_HTML_STRIP);

  if ($filter_mode == FILTER_HTML_STRIP) {
    // The setting is stored as "<a> <em> ..."; reduce it to bare tag names.
    $allowed_tags = preg_split('/\s+|<|>/', variable_get("waallowed_html_$format", '<a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>'), -1, PREG_SPLIT_NO_EMPTY);
    $text = wafilter_xss($text, $allowed_tags);
  }

  if ($filter_mode == FILTER_HTML_ESCAPE) {
    // Escape HTML
    $text = check_plain($text);
  }

  if (variable_get("wafilter_html_nofollow_$format", FALSE)) {
    // Require whitespace right after "<a" so that only anchors are touched;
    // without it <abbr ...>, <address ...>, <area ...> and every other tag
    // whose name starts with "a" would receive rel="nofollow" as well.
    $text = preg_replace('/<a(\s[^>]*)>/i', '<a\\1 rel="nofollow">', $text);
  }

  return trim($text);
}

/**
 * Filters an HTML string to prevent cross-site-scripting (XSS) vulnerabilities.
 *
 * A copy of Drupal's filter_xss() whose only difference is that tags are
 * handed to _wafilter_xss_split() instead of the core splitter.
 *
 * The work is done in these steps:
 * - Reject strings that are not valid UTF-8.
 * - Remove characters and constructs that can trick browsers.
 * - Escape every ampersand, then restore only well-formed entities.
 * - Pass every tag, comment and stray angle bracket to _wafilter_xss_split(),
 *   which decides what may stay.
 *
 * @param $string
 *   The string with raw HTML in it. It will be stripped of everything that can
 *   cause an XSS attack.
 * @param $allowed_tags
 *   An array of allowed tags.
 *
 * @return
 *   An XSS safe version of $string, or an empty string if $string is not
 *   valid UTF-8.
 *
 * @see drupal_validate_utf8()
 * @ingroup sanitization
 */
function wafilter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) {
  // Invalid UTF-8 is refused outright (guards against an Internet Explorer 6
  // cross site scripting issue).
  if (!drupal_validate_utf8($string)) {
    return '';
  }

  // Hand the tag whitelist to the splitter; it keeps it for the callback
  // invocations made further down.
  _wafilter_xss_split($allowed_tags, TRUE);

  // NUL characters are ignored by some browsers, so drop them.
  $string = str_replace(chr(0), '', $string);
  // Netscape 4 JS entities.
  $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

  // Neutralise every ampersand first ...
  $string = str_replace('&', '&amp;', $string);
  // ... then bring back only well-formed entities. preg_replace() applies
  // the patterns one after the other, in the order listed here.
  $restore_patterns = array(
    // Decimal numeric entities.
    '/&amp;#([0-9]+;)/',
    // Hexadecimal numeric entities.
    '/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/',
    // Named entities.
    '/&amp;([A-Za-z][A-Za-z0-9]*;)/',
  );
  $restore_replacements = array(
    '&#\1',
    '&#x\1',
    '&\1',
  );
  $string = preg_replace($restore_patterns, $restore_replacements, $string);

  $markup_pattern = '%
    (
    <(?=[^a-zA-Z!/])  # a lone <
    |                 # or
    <!--.*?-->        # a comment
    |                 # or
    <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
    |                 # or
    >                 # just a >
    )%x';

  return preg_replace_callback($markup_pattern, '_wafilter_xss_split', $string);
}

/**
 * Processes an HTML tag.
 *
 * Mirrors Drupal's _filter_xss_split(); the only difference is that attribute
 * cleanup is delegated to _wafilter_xss_attributes().
 *
 * The function has two modes. Called with $store set it only remembers the
 * tag whitelist (in a static variable). Called without it, it acts as the
 * preg_replace_callback() callback and receives one matched piece of markup.
 *
 * @param $m
 *   An array with various meaning depending on the value of $store.
 *   If $store is TRUE then the array contains the allowed tags.
 *   If $store is FALSE then element 1 holds the HTML tag to process.
 * @param $store
 *   Whether to store $m.
 *
 * @return
 *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
 *   version of the HTML element. Nothing is returned in store mode.
 */
function _wafilter_xss_split($m, $store = FALSE) {
  static $allowed_html;

  if ($store) {
    // Flip so that tag names become keys and can be looked up with isset().
    $allowed_html = array_flip($m);
    return;
  }

  $markup = $m[1];

  // A match that does not begin with "<" can only be a lone ">".
  if (substr($markup, 0, 1) != '<') {
    return '&gt;';
  }
  // A one-character match beginning with "<" is a lone "<".
  if (strlen($markup) == 1) {
    return '&lt;';
  }

  $recognised = preg_match('%^(?:<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|(<!--.*?-->))$%', $markup, $parts);
  if (!$recognised) {
    // Seriously malformed.
    return '';
  }

  $closing_slash = trim($parts[1]);
  // Group 4 is only present when the comment alternative matched.
  $comment = isset($parts[4]) ? $parts[4] : NULL;
  // Comments are looked up in the whitelist under the pseudo tag name "!--".
  $tag_name = $comment ? '!--' : $parts[2];

  if (!isset($allowed_html[strtolower($tag_name)])) {
    // Disallowed HTML element.
    return '';
  }

  if ($comment) {
    return $comment;
  }

  if ($closing_slash != '') {
    // Closing tags never carry attributes.
    return "</$tag_name>";
  }

  // Remember a closing XHTML slash at the end of the attributes and strip it
  // from the attribute list before that list is parsed.
  $raw_attributes = $parts[3];
  $xhtml_slash = preg_match('%\s?/\s*$%', $raw_attributes) ? ' /' : '';
  $raw_attributes = preg_replace('%(\s?)/\s*$%', '\1', $raw_attributes);

  // Clean up attributes.
  $clean_attributes = implode(' ', _wafilter_xss_attributes($raw_attributes));
  $clean_attributes = preg_replace('/[<>]/', '', $clean_attributes);
  if (strlen($clean_attributes)) {
    $clean_attributes = ' '. $clean_attributes;
  }

  return "<$tag_name$clean_attributes$xhtml_slash>";
}

/**
 * Processes a string of HTML attributes.
 *
 * Based on Drupal's _filter_xss_attributes(), with these differences:
 * - The 'style' attribute is not dropped. Its value is sanitized declaration
 *   by declaration (see _wafilter_xss_attribute_value()) instead of being run
 *   through the URL protocol check, which only ever looked at the text before
 *   the first colon and therefore let every later declaration through.
 * - A valueless attribute at the very end of the list is subject to the same
 *   skip rule as all other attributes, so a trailing "onclick" is no longer
 *   emitted.
 *
 * The parser is a small state machine:
 * - mode 0 expects an attribute name,
 * - mode 1 expects "=" or whitespace (valueless attribute),
 * - mode 2 expects a double-quoted, single-quoted or unquoted value.
 * Whenever the current mode cannot consume anything, a chunk of malformed
 * input is discarded and parsing restarts in mode 0.
 *
 * @param $attr
 *   The raw attribute string of one tag.
 *
 * @return
 *   Cleaned up version of the HTML attributes, one array element per
 *   attribute. Attributes whose name starts with "on" are dropped.
 */
function _wafilter_xss_attributes($attr) {
  $attrarr = array();
  $mode = 0;
  $attrname = '';
  // TRUE while the attribute currently being parsed must not be emitted.
  $skip = FALSE;

  while (strlen($attr) != 0) {
    // Was the last operation successful?
    $working = 0;

    switch ($mode) {
      case 0:
        // Attribute name, href for instance
        if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
          $attrname = strtolower($match[1]);
          // Unlike the core filter, 'style' is not skipped here; only event
          // handler attributes (on*) are.
          $skip = (substr($attrname, 0, 2) == 'on');
          //TODO: allowed list of attributes?

          $working = $mode = 1;
          $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
        }

        break;

      case 1:
        // Equals sign or valueless ("selected")
        if (preg_match('/^\s*=\s*/', $attr)) {
          $working = 1; $mode = 2;
          $attr = preg_replace('/^\s*=\s*/', '', $attr);
          break;
        }

        if (preg_match('/^\s+/', $attr)) {
          $working = 1; $mode = 0;
          if (!$skip) {
            $attrarr[] = $attrname;
          }
          $attr = preg_replace('/^\s+/', '', $attr);
        }

        break;

      case 2:
        // Attribute value, a URL after href= for instance
        if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
          $thisval = _wafilter_xss_attribute_value($attrname, $match[1]);

          // A NULL value means nothing safe was left; drop the attribute.
          if (!$skip && $thisval !== NULL) {
            $attrarr[] = "$attrname=\"$thisval\"";
          }
          $working = 1;
          $mode = 0;
          $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
          break;
        }

        if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
          $thisval = _wafilter_xss_attribute_value($attrname, $match[1]);

          if (!$skip && $thisval !== NULL) {
            $attrarr[] = "$attrname='$thisval'";
          }
          $working = 1; $mode = 0;
          $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
          break;
        }

        if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
          $thisval = _wafilter_xss_attribute_value($attrname, $match[1]);

          // Unquoted values are re-emitted with double quotes.
          if (!$skip && $thisval !== NULL) {
            $attrarr[] = "$attrname=\"$thisval\"";
          }
          $working = 1; $mode = 0;
          $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
        }

        break;
    }

    if ($working == 0) {
      // not well formed, remove and try again
      // NOTE(review): the pattern below contains an empty alternative (the
      // extra "|" after the single-quote branch). It is intentionally left
      // byte-identical: how much malformed input is discarded per pass, and
      // thereby what the next pass parses, depends on it.
      $attr = preg_replace('/
        ^
        (
        "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
        |               # or
        \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
        |               # or
        \S              # - a non-whitespace character
        )*              # any number of the above three
        \s*             # any number of whitespaces
        /x', '', $attr);
      $mode = 0;
    }
  }

  // The attribute list ends with a valueless attribute like "selected". It
  // must obey the skip rule too, otherwise "<a onclick>" keeps its handler
  // attribute name.
  if ($mode == 1 && !$skip) {
    $attrarr[] = $attrname;
  }
  return $attrarr;
}

/**
 * Sanitizes the value of a single attribute.
 *
 * Private helper of _wafilter_xss_attributes().
 *
 * Every attribute except 'style' is passed to wafilter_xss_bad_protocol().
 * A 'style' value is split into declarations on ';'. A declaration is kept
 * only when its property is in the whitelist below and its value consists of
 * nothing but letters, digits, whitespace and the characters - . , % # !
 * (rgb()/rgba() colour functions with purely numeric arguments are tolerated
 * as well). Anything containing url(), expression(), escapes, quotes or
 * comments is therefore discarded.
 *
 * @param $attrname
 *   The lower-cased attribute name.
 * @param $value
 *   The raw attribute value, entities not yet decoded.
 *
 * @return
 *   The HTML-escaped, sanitized value, or NULL when a style attribute has no
 *   acceptable declaration left and should be dropped altogether.
 */
function _wafilter_xss_attribute_value($attrname, $value) {
  if ($attrname != 'style') {
    return wafilter_xss_bad_protocol($value);
  }

  //TODO: admin-configured allowed list of css styles?
  static $allowed_properties;
  if (!isset($allowed_properties)) {
    $allowed_properties = array_flip(array(
      'float', 'width', 'height',
      'margin', 'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
      'padding', 'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
      'border', 'border-top', 'border-right', 'border-bottom', 'border-left',
    ));
  }

  $kept = array();
  // Decode first so that entity-encoded characters are validated as the
  // characters they stand for.
  foreach (explode(';', decode_entities($value)) as $declaration) {
    $pair = explode(':', $declaration, 2);
    if (count($pair) != 2) {
      // No colon: not a "property: value" declaration.
      continue;
    }
    $property = strtolower(trim($pair[0]));
    $setting = trim($pair[1]);
    if ($setting === '' || !isset($allowed_properties[$property])) {
      continue;
    }
    // Take numeric rgb()/rgba() colours out of the picture, then insist that
    // the remainder contains harmless characters only.
    $bare = preg_replace('/\brgba?\(\s*[0-9.,%\s]*\)/i', '', $setting);
    if (preg_match('/[^-a-z0-9.,%#!\s]/i', $bare)) {
      continue;
    }
    $kept[] = "$property: $setting";
  }

  return $kept ? check_plain(implode('; ', $kept)) : NULL;
}

/**
 * Strips disallowed protocols from the start of an attribute value.
 *
 * A copy of Drupal's filter_xss_bad_protocol() that additionally treats a
 * fixed list of CSS property names as "allowed protocols", so that a value
 * such as "width:10px" is not truncated at the colon.
 *
 * The combined whitelist is built once and cached in a static variable; it is
 * no longer rebuilt and merged on every call.
 *
 * NOTE(review): this function cannot tell a URL from a CSS declaration list.
 * It inspects the text in front of the first colon only; once that prefix is
 * whitelisted the rest of the string is returned without further checks, and
 * when it is not, everything up to the colon is cut off. It is therefore not
 * a CSS sanitizer, and callers should not rely on it to validate complete
 * style attribute values.
 *
 * @param $string
 *   The string with the attribute value.
 * @param $decode
 *   Whether to decode entities in the $string. Set to FALSE if the $string
 *   is in plain text, TRUE otherwise. Defaults to TRUE.
 *
 * @return
 *   Cleaned up and HTML-escaped version of $string.
 */
function wafilter_xss_bad_protocol($string, $decode = TRUE) {
  static $allowed_protocols;
  if (!isset($allowed_protocols)) {
    $url_protocols = variable_get('filter_allowed_protocols', array(
      'http', 'https', 'ftp', 'news', 'nntp', 'tel', 'telnet', 'mailto', 'irc', 'ssh', 'sftp', 'webcal', 'rtsp',
    ));
    // CSS properties that are let through as if they were protocols.
    //TODO: admin-configured allowed list of css styles?
    $css_as_protocols = array(
      'float', 'width', 'height',
      'margin', 'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
      'padding', 'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
      'border', 'border-top', 'border-right', 'border-bottom', 'border-left',
    );
    // Names become keys so that membership is a plain isset() lookup.
    $allowed_protocols = array_flip(array_merge($url_protocols, $css_as_protocols));
  }

  // Get the plain text representation of the attribute value (i.e. its meaning).
  if ($decode) {
    $string = decode_entities($string);
  }

  // Iteratively remove any invalid protocol found.
  do {
    $before = $string;
    $colonpos = strpos($string, ':');
    // A colon at position 0 (or no colon at all) leaves nothing to inspect.
    if ($colonpos > 0) {
      // We found a colon, possibly a protocol. Verify.
      $protocol = substr($string, 0, $colonpos);
      // If a colon is preceded by a slash, question mark or hash, it cannot
      // possibly be part of the URL scheme. This must be a relative URL,
      // which inherits the (safe) protocol of the base document.
      if (preg_match('![/?#]!', $protocol)) {
        break;
      }
      // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be case-insensitive
      // Check if this is a disallowed protocol.
      if (!isset($allowed_protocols[strtolower($protocol)])) {
        $string = substr($string, $colonpos + 1);
      }
    }
    // Repeat until a pass removes nothing, so stacked prefixes such as
    // "javascript:javascript:" are peeled off completely.
  } while ($before !== $string);
  return check_plain($string);
}