--- modules\filter\filter.module.org Sun Jul 01 20:41:14 2007 +++ modules\filter\filter.module Mon Sep 03 20:19:40 2007 @@ -1083,24 +1083,80 @@ function _filter_url_settings($format) { * ftp links, etc.) into hyperlinks. */ function _filter_url($text, $format) { + // List of tags - the content of which must be skipped + $ignoretags = 'a|script|style|code'; + // Pass length to regexp callback _filter_url_trim(NULL, variable_get('filter_url_length_'. $format, 72)); - - $text = ' '. $text .' '; - - // Match absolute URLs. - $text = preg_replace_callback("`(

|

  • ||[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text); - - // Match e-mail addresses. - $text = preg_replace("`(

    |

  • ||[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '\1\2\3', $text); - - // Match www domains/addresses. - $text = preg_replace_callback("`(

    |

  • |[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text); - $text = substr($text, 1, -1); - + + // Need to process each replacement separately (see switch in the middle of loops). + // The text must be joined and split again after each replacement, since they create + // new html tags. + for ( $task = 1; $task <= 3; $task++ ) { + // Split at all tags. + // This ensures that nothing that is a tagname or attribute will be processed. + $chunks = preg_split('´(<.+?>)´i', $text, -1, PREG_SPLIT_DELIM_CAPTURE); + // Note: PHP ensures the array consists of alternating delimiters and literals + // and begins and ends with a literal (inserting NULL as required). + + // If an ignoretag is found, it is stored here and removed when the + // closing tag is found. Until then no replacements are made. + // Think of this as a stack that always has 0 or 1 items + $opentag = ''; + for ( $i = 0; $i < count( $chunks ); $i++ ) { + // Even/odd = text/tag + if ( $i % 2 == 0 ) { // Text + if ( $opentag == '' ) { + // Only do replacements when there are no unclosed ignoretags + switch ($task) { + case 1: + // Match absolute URLs. + $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://'; + $urlpattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])"; + $re = "`($urlpattern)([\.\,\?\!]*?)`i"; + $chunks[$i] = preg_replace_callback($re, '_filter_url_parse_full_links', $chunks[$i] ); + break; + case 2: + // Match e-mail addresses. + $urlpattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4}'; + $re = "`($urlpattern)`i"; + $chunks[$i] = preg_replace($re, '\1', $chunks[$i]); + break; + case 3: + // Match www domains/addresses. + $urlpattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]'; + $re = "`($urlpattern)([\.\,\?\!]*?)`i"; + $chunks[$i] = preg_replace_callback($re, '_filter_url_parse_partial_links', $chunks[$i]); + break; + } + } + } + else { // Tag + if( $opentag == '' ) { + // No open "ignoretag" + // Is it an ignoretag? (opening tag) + if( preg_match( "´<($ignoretags)(?:\s|>)´i", $chunks[$i], $matches ) ) { + // Store and catch the tag in question + $opentag = $matches[1]; + } + } + else { + // Ignoretag found + // Nobody gets to do anything until we find a matching closing tag + if( preg_match( "´<\/$opentag>´i", $chunks[$i], $matches ) ) { + $opentag = ''; + } + } // if ( $opentag == '' ) + } // if ( $i % 2 ) + } // for chunks + + $text = implode( $chunks ); + } // for tasks 1..3 + return $text; } + /** * Scan input and make sure that all HTML tags are properly closed and nested. */ @@ -1176,23 +1232,27 @@ function _filter_htmlcorrector($text) { } /** - * Make links out of absolute URLs. + * Callback function. Make links out of absolute URLs. */ function _filter_url_parse_full_links($match) { - $match[2] = decode_entities($match[2]); - $caption = check_plain(_filter_url_trim($match[2])); - $match[2] = check_url($match[2]); - return $match[1] . ''. $caption .''. $match[5]; + // Which capturing parenthesis in the regexp contains the url? + $i = 1; + $match[$i] = decode_entities($match[$i]); + $caption = check_plain(_filter_url_trim($match[$i])); + $match[$i] = check_url($match[$i]); + return ''. $caption .''. $match[$i+1]; } /** - * Make links out of domain names starting with "www." + * Callback function. Make links out of domain names starting with "www." */ function _filter_url_parse_partial_links($match) { - $match[2] = decode_entities($match[2]); - $caption = check_plain(_filter_url_trim($match[2])); - $match[2] = check_plain($match[2]); - return $match[1] . ''. $caption .''. $match[3]; + // Which parenthesis in the regexp contains the url? + $i = 1; + $match[$i] = decode_entities($match[$i]); + $caption = check_plain(_filter_url_trim($match[$i])); + $match[$i] = check_plain($match[$i]); + return ''. $caption .''. $match[$i+1]; } /**