diff --git a/core/includes/bootstrap.inc b/core/includes/bootstrap.inc index 6541350..27aa4da 100644 --- a/core/includes/bootstrap.inc +++ b/core/includes/bootstrap.inc @@ -6,6 +6,7 @@ use Drupal\Component\Utility\String; use Drupal\Component\Utility\Timer; use Drupal\Component\Utility\Unicode; +use Drupal\Component\Utility\UrlValidator; use Drupal\Core\DrupalKernel; use Drupal\Core\Database\Database; use Drupal\Core\DependencyInjection\ContainerBuilder; @@ -1455,15 +1456,11 @@ function check_plain($text) { * * @return * TRUE if the text is valid UTF-8, FALSE if not. + * + * @see \Drupal\Component\Utility\Unicode::validateUtf8() */ function drupal_validate_utf8($text) { - if (strlen($text) == 0) { - return TRUE; - } - // With the PCRE_UTF8 modifier 'u', preg_match() fails silently on strings - // containing invalid UTF-8 byte sequences. It does not reject character - // codes above U+10FFFF (represented by 4 or more octets), though. - return (preg_match('/^./us', $text) == 1); + return Unicode::validateUtf8($text); } /** diff --git a/core/includes/common.inc b/core/includes/common.inc index 4f72021..8685360 100644 --- a/core/includes/common.inc +++ b/core/includes/common.inc @@ -1,6 +1,9 @@ get('protocols') ?: array('http', 'https')); - } - - // Iteratively remove any invalid protocol found. - do { - $before = $uri; - $colonpos = strpos($uri, ':'); - if ($colonpos > 0) { - // We found a colon, possibly a protocol. Verify. - $protocol = substr($uri, 0, $colonpos); - // If a colon is preceded by a slash, question mark or hash, it cannot - // possibly be part of the URL scheme. This must be a relative URL, which - // inherits the (safe) protocol of the base document. - if (preg_match('![/?#]!', $protocol)) { - break; - } - // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3 - // (URI Comparison) scheme comparison must be case-insensitive. 
- if (!isset($allowed_protocols[strtolower($protocol)])) { - $uri = substr($uri, $colonpos + 1); - } - } - } while ($before != $uri); - - return $uri; + return UrlValidator::stripDangerousProtocols($uri); } /** @@ -961,10 +916,11 @@ function drupal_strip_dangerous_protocols($uri) { * Drupal\Core\Template\Attribute, call drupal_strip_dangerous_protocols() * instead. * - * @see drupal_strip_dangerous_protocols() + * @see \Drupal\Component\Utility\UrlValidator::stripDangerousProtocols() + * @see \Drupal\Component\Utility\String::checkPlain() */ function check_url($uri) { - return check_plain(drupal_strip_dangerous_protocols($uri)); + return String::checkPlain(UrlValidator::stripDangerousProtocols($uri)); } /** @@ -976,9 +932,17 @@ function check_url($uri) { * * Allows all tags that can be used inside an HTML body, save * for scripts and styles. + * + * @param string $string + * The string to apply the filter to. + * + * @return string + * The filtered string. + * + * @see \Drupal\Component\Utility\Xss::filterAdmin() */ function filter_xss_admin($string) { - return filter_xss($string, array('a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu', 'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr')); + return Xss::filterAdmin($string); } /** @@ -1004,246 +968,27 @@ function filter_xss_admin($string) { * An XSS safe version of $string, or an empty string if $string is not * valid UTF-8. 
* - * @see drupal_validate_utf8() + * @see \Drupal\Component\Utility\Xss::filter() + * * @ingroup sanitization */ function filter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) { - // Only operate on valid UTF-8 strings. This is necessary to prevent cross - // site scripting issues on Internet Explorer 6. - if (!drupal_validate_utf8($string)) { - return ''; - } - // Store the text format. - _filter_xss_split($allowed_tags, TRUE); - // Remove NULL characters (ignored by some browsers). - $string = str_replace(chr(0), '', $string); - // Remove Netscape 4 JS entities. - $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string); - - // Defuse all HTML entities. - $string = str_replace('&', '&', $string); - // Change back only well-formed entities in our whitelist: - // Decimal numeric entities. - $string = preg_replace('/&#([0-9]+;)/', '&#\1', $string); - // Hexadecimal numeric entities. - $string = preg_replace('/&#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string); - // Named entities. - $string = preg_replace('/&([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string); - - return preg_replace_callback('% - ( - <(?=[^a-zA-Z!/]) # a lone < - | # or - # a comment - | # or - <[^>]*(>|$) # a string that starts with a <, up until the > or the end of the string - | # or - > # just a > - )%x', '_filter_xss_split', $string); -} - -/** - * Processes an HTML tag. - * - * @param $m - * An array with various meaning depending on the value of $store. - * If $store is TRUE then the array contains the allowed tags. - * If $store is FALSE then the array has one element, the HTML tag to process. - * @param $store - * Whether to store $m. - * - * @return - * If the element isn't allowed, an empty string. Otherwise, the cleaned up - * version of the HTML element. 
- */ -function _filter_xss_split($m, $store = FALSE) { - static $allowed_html; - - if ($store) { - $allowed_html = array_flip($m); - return; - } - - $string = $m[1]; - - if (substr($string, 0, 1) != '<') { - // We matched a lone ">" character. - return '>'; - } - elseif (strlen($string) == 1) { - // We matched a lone "<" character. - return '<'; - } - - if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|()$%', $string, $matches)) { - // Seriously malformed. - return ''; - } - - $slash = trim($matches[1]); - $elem = &$matches[2]; - $attrlist = &$matches[3]; - $comment = &$matches[4]; - - if ($comment) { - $elem = '!--'; - } - - if (!isset($allowed_html[strtolower($elem)])) { - // Disallowed HTML element. - return ''; - } - - if ($comment) { - return $comment; - } - - if ($slash != '') { - return ""; - } - - // Is there a closing XHTML slash at the end of the attributes? - $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count); - $xhtml_slash = $count ? ' /' : ''; - - // Clean up attributes. - $attr2 = implode(' ', _filter_xss_attributes($attrlist)); - $attr2 = preg_replace('/[<>]/', '', $attr2); - $attr2 = strlen($attr2) ? ' ' . $attr2 : ''; - - return "<$elem$attr2$xhtml_slash>"; -} - -/** - * Processes a string of HTML attributes. - * - * @return - * Cleaned up version of the HTML attributes. - */ -function _filter_xss_attributes($attr) { - $attrarr = array(); - $mode = 0; - $attrname = ''; - - while (strlen($attr) != 0) { - // Was the last operation successful? - $working = 0; - - switch ($mode) { - case 0: - // Attribute name, href for instance. - if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) { - $attrname = strtolower($match[1]); - $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on'); - $working = $mode = 1; - $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr); - } - break; - - case 1: - // Equals sign or valueless ("selected"). 
- if (preg_match('/^\s*=\s*/', $attr)) { - $working = 1; $mode = 2; - $attr = preg_replace('/^\s*=\s*/', '', $attr); - break; - } - - if (preg_match('/^\s+/', $attr)) { - $working = 1; $mode = 0; - if (!$skip) { - $attrarr[] = $attrname; - } - $attr = preg_replace('/^\s+/', '', $attr); - } - break; - - case 2: - // Attribute value, a URL after href= for instance. - if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) { - $thisval = filter_xss_bad_protocol($match[1]); - - if (!$skip) { - $attrarr[] = "$attrname=\"$thisval\""; - } - $working = 1; - $mode = 0; - $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr); - break; - } - - if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) { - $thisval = filter_xss_bad_protocol($match[1]); - - if (!$skip) { - $attrarr[] = "$attrname='$thisval'"; - } - $working = 1; $mode = 0; - $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr); - break; - } - - if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) { - $thisval = filter_xss_bad_protocol($match[1]); - - if (!$skip) { - $attrarr[] = "$attrname=\"$thisval\""; - } - $working = 1; $mode = 0; - $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr); - } - break; - } - - if ($working == 0) { - // Not well formed; remove and try again. - $attr = preg_replace('/ - ^ - ( - "[^"]*("|$) # - a string that starts with a double quote, up until the next double quote or the end of the string - | # or - \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string - | # or - \S # - a non-whitespace character - )* # any number of the above three - \s* # any number of whitespaces - /x', '', $attr); - $mode = 0; - } - } - - // The attribute list ends with a valueless attribute like "selected". - if ($mode == 1 && !$skip) { - $attrarr[] = $attrname; - } - return $attrarr; + return Xss::filter($string, $allowed_tags); } /** * Processes an HTML attribute value and strips dangerous protocols from URLs. 
* - * @param $string + * @param string $string * The string with the attribute value. - * @param $decode - * (deprecated) Whether to decode entities in the $string. Set to FALSE if the - * $string is in plain text, TRUE otherwise. Defaults to TRUE. This parameter - * is deprecated and will be removed in Drupal 8. To process a plain-text URI, - * call drupal_strip_dangerous_protocols() or check_url() instead. * - * @return + * @return string * Cleaned up and HTML-escaped version of $string. + * + * @see \Drupal\Component\Utility\UrlValidator::filterBadProtocol() */ -function filter_xss_bad_protocol($string, $decode = TRUE) { - // Get the plain text representation of the attribute value (i.e. its meaning). - // @todo Remove the $decode parameter in Drupal 8, and always assume an HTML - // string that needs decoding. - if ($decode) { - if (!function_exists('decode_entities')) { - require_once __DIR__ . '/unicode.inc'; - } - - $string = decode_entities($string); - } - return check_plain(drupal_strip_dangerous_protocols($string)); +function filter_xss_bad_protocol($string) { + return UrlValidator::filterBadProtocol($string); } /** @@ -4594,6 +4339,16 @@ function _drupal_bootstrap_code() { ini_set('log_errors', 1); ini_set('error_log', 'public://error.log'); } + + // Set the allowed protocols once we have the config available. + $allowed_protocols = \Drupal::config('system.filter')->get('protocols'); + if (!isset($allowed_protocols)) { + // filter_xss_admin() is called by the installer and update.php, in which + // case the configuration may not exist (yet). Provide a minimal default set + // of allowed protocols for these cases. 
+ $allowed_protocols = array('http', 'https'); + } + UrlValidator::setAllowedProtocols($allowed_protocols); } /** diff --git a/core/lib/Drupal/Component/Utility/String.php b/core/lib/Drupal/Component/Utility/String.php index a6df8fe..3cd6472 100644 --- a/core/lib/Drupal/Component/Utility/String.php +++ b/core/lib/Drupal/Component/Utility/String.php @@ -25,6 +25,7 @@ class String { * valid UTF-8. * * @see drupal_validate_utf8() + * * @ingroup sanitization */ public static function checkPlain($text) { diff --git a/core/lib/Drupal/Component/Utility/Unicode.php b/core/lib/Drupal/Component/Utility/Unicode.php index 6ee23d1..6bbddbb 100644 --- a/core/lib/Drupal/Component/Utility/Unicode.php +++ b/core/lib/Drupal/Component/Utility/Unicode.php @@ -575,4 +575,38 @@ public static function caseFlip($matches) { return $matches[0][0] . chr(ord($matches[0][1]) ^ 32); } + /** + * Checks whether a string is valid UTF-8. + * + * All functions designed to filter input should use drupal_validate_utf8 + * to ensure they operate on valid UTF-8 strings to prevent bypass of the + * filter. + * + * When text containing an invalid UTF-8 lead byte (0xC0 - 0xFF) is presented + * as UTF-8 to Internet Explorer 6, the program may misinterpret subsequent + * bytes. When these subsequent bytes are HTML control characters such as + * quotes or angle brackets, parts of the text that were deemed safe by filters + * end up in locations that are potentially unsafe; An onerror attribute that + * is outside of a tag, and thus deemed safe by a filter, can be interpreted + * by the browser as if it were inside the tag. + * + * The function does not return FALSE for strings containing character codes + * above U+10FFFF, even though these are prohibited by RFC 3629. + * + * @param string $text + * The text to check. + * + * @return bool + * TRUE if the text is valid UTF-8, FALSE if not. 
+ */ + public static function validateUtf8($text) { + if (strlen($text) == 0) { + return TRUE; + } + // With the PCRE_UTF8 modifier 'u', preg_match() fails silently on strings + // containing invalid UTF-8 byte sequences. It does not reject character + // codes above U+10FFFF (represented by 4 or more octets), though. + return (preg_match('/^./us', $text) == 1); + } + } diff --git a/core/lib/Drupal/Component/Utility/UrlValidator.php b/core/lib/Drupal/Component/Utility/UrlValidator.php new file mode 100644 index 0000000..4772373 --- /dev/null +++ b/core/lib/Drupal/Component/Utility/UrlValidator.php @@ -0,0 +1,136 @@ + 0) { + // We found a colon, possibly a protocol. Verify. + $protocol = substr($uri, 0, $colonpos); + // If a colon is preceded by a slash, question mark or hash, it cannot + // possibly be part of the URL scheme. This must be a relative URL, which + // inherits the (safe) protocol of the base document. + if (preg_match('![/?#]!', $protocol)) { + break; + } + // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3 + // (URI Comparison) scheme comparison must be case-insensitive. + if (!isset($allowed_protocols[strtolower($protocol)])) { + $uri = substr($uri, $colonpos + 1); + } + } + } while ($before != $uri); + + return $uri; + } + + /** + * Verifies the syntax of the given URL. + * + * This function should only be used on actual URLs. It should not be used for + * Drupal menu paths, which can contain arbitrary characters. + * Valid values per RFC 3986. + * + * @param string $url + * The URL to verify. + * @param bool $absolute + * Whether the URL is absolute (beginning with a scheme such as "http:"). + * + * @return bool + * TRUE if the URL is in a valid format. 
+ */ + public static function isValid($url, $absolute = FALSE) { + if ($absolute) { + return (bool) preg_match(" + /^ # Start at the beginning of the text + (?:ftp|https?|feed):\/\/ # Look for ftp, http, https or feed schemes + (?: # Userinfo (optional) which is typically + (?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)* # a username or a username and password + (?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@ # combination + )? + (?: + (?:[a-z0-9\-\.]|%[0-9a-f]{2})+ # A domain name or a IPv4 address + |(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]) # or a well formed IPv6 address + ) + (?::[0-9]+)? # Server port number (optional) + (?:[\/|\?] + (?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2}) # The path and query (optional) + *)? + $/xi", $url); + } + else { + return (bool) preg_match("/^(?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})+$/i", $url); + } + } + +} diff --git a/core/lib/Drupal/Component/Utility/Xss.php b/core/lib/Drupal/Component/Utility/Xss.php new file mode 100644 index 0000000..4009782 --- /dev/null +++ b/core/lib/Drupal/Component/Utility/Xss.php @@ -0,0 +1,285 @@ + # a comment + | # or + <[^>]*(>|$) # a string that starts with a <, up until the > or the end of the string + | # or + > # just a > + )%x', '\Drupal\Component\Utility\Xss::split', $string); + } + + /** + * Applies a very permissive XSS/HTML filter for admin-only use. + * + * Use only for fields where it is impractical to use the + * whole filter system, but where some (mainly inline) mark-up + * is desired (so check_plain() is not acceptable). + * + * Allows all tags that can be used inside an HTML body, save + * for scripts and styles. + * + * @param string $string + * The string to apply the filter to. + * + * @return string + * The filtered string. + */ + public static function filterAdmin($string) { + return static::filter($string, static::$adminTags); + } + + /** + * Processes an HTML tag. 
+ * + * @param array $matches + * An array with various meaning depending on the value of $store. + * If $store is TRUE then the array contains the allowed tags. + * If $store is FALSE then the array has one element, the HTML tag to process. + * @param bool $store + * Whether to store $matches. + * + * @return string + * If the element isn't allowed, an empty string. Otherwise, the cleaned up + * version of the HTML element. + */ + protected static function split($matches, $store = FALSE) { + static $allowed_html; + + if ($store) { + $allowed_html = array_flip($matches); + return; + } + + $string = $matches[1]; + + if (substr($string, 0, 1) != '<') { + // We matched a lone ">" character. + return '>'; + } + elseif (strlen($string) == 1) { + // We matched a lone "<" character. + return '<'; + } + + if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|()$%', $string, $matches)) { + // Seriously malformed. + return ''; + } + + $slash = trim($matches[1]); + $elem = &$matches[2]; + $attrlist = &$matches[3]; + $comment = &$matches[4]; + + if ($comment) { + $elem = '!--'; + } + + if (!isset($allowed_html[strtolower($elem)])) { + // Disallowed HTML element. + return ''; + } + + if ($comment) { + return $comment; + } + + if ($slash != '') { + return ""; + } + + // Is there a closing XHTML slash at the end of the attributes? + $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count); + $xhtml_slash = $count ? ' /' : ''; + + // Clean up attributes. + $attr2 = implode(' ', static::attributes($attrlist)); + $attr2 = preg_replace('/[<>]/', '', $attr2); + $attr2 = strlen($attr2) ? ' ' . $attr2 : ''; + + return "<$elem$attr2$xhtml_slash>"; + } + + /** + * Processes a string of HTML attributes. + * + * @param string $attr + * The html attribute to process. + * + * @return array + * Cleaned up version of the HTML attributes. 
+ */ + protected static function attributes($attr) { + $attrarr = array(); + $mode = 0; + $attrname = ''; + + while (strlen($attr) != 0) { + // Was the last operation successful? + $working = 0; + + switch ($mode) { + case 0: + // Attribute name, href for instance. + if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) { + $attrname = strtolower($match[1]); + $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on'); + $working = $mode = 1; + $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr); + } + break; + + case 1: + // Equals sign or valueless ("selected"). + if (preg_match('/^\s*=\s*/', $attr)) { + $working = 1; $mode = 2; + $attr = preg_replace('/^\s*=\s*/', '', $attr); + break; + } + + if (preg_match('/^\s+/', $attr)) { + $working = 1; $mode = 0; + if (!$skip) { + $attrarr[] = $attrname; + } + $attr = preg_replace('/^\s+/', '', $attr); + } + break; + + case 2: + // Attribute value, a URL after href= for instance. + if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) { + $thisval = UrlValidator::filterBadProtocol($match[1]); + + if (!$skip) { + $attrarr[] = "$attrname=\"$thisval\""; + } + $working = 1; + $mode = 0; + $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr); + break; + } + + if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) { + $thisval = UrlValidator::filterBadProtocol($match[1]); + + if (!$skip) { + $attrarr[] = "$attrname='$thisval'"; + } + $working = 1; $mode = 0; + $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr); + break; + } + + if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) { + $thisval = UrlValidator::filterBadProtocol($match[1]); + + if (!$skip) { + $attrarr[] = "$attrname=\"$thisval\""; + } + $working = 1; $mode = 0; + $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr); + } + break; + } + + if ($working == 0) { + // Not well formed; remove and try again. 
+ $attr = preg_replace('/ + ^ + ( + "[^"]*("|$) # - a string that starts with a double quote, up until the next double quote or the end of the string + | # or + \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string + | # or + \S # - a non-whitespace character + )* # any number of the above three + \s* # any number of whitespaces + /x', '', $attr); + $mode = 0; + } + } + + // The attribute list ends with a valueless attribute like "selected". + if ($mode == 1 && !$skip) { + $attrarr[] = $attrname; + } + return $attrarr; + } + +}