--- modules\filter\filter.module.org Sun Jul 01 20:41:14 2007 +++ modules\filter\filter.module Fri Aug 03 08:01:46 2007 @@ -1083,21 +1083,56 @@ function _filter_url_settings($format) { * ftp links, etc.) into hyperlinks. */ function _filter_url($text, $format) { + // The following tags may contain text with a URL that should be converted to a link + // List also contains standalone/empty tags like img and br + $tags_begin = 'caption|td|th|div|dd|dt|li|blockquote|address|p|h[1-6]|i|b|u|tt|big|small|strike|s|em|strong|font|dfn|samp|kbd|var|cite|abbr|acronym|sub|sup|q|ins|del|img|br|object|applet'; + $tags_begin = '<\/?(?:' . $tags_begin . ')(?:\s[^>]|\/)*>'; + // This pattern will match

+ // <p>, </p>, <br>, <br/> and <br />
+ // The last ones are significant for xhtml standalone tags like img and br
+ // Note: the pattern will obviously match many invalid tag-like constructs too
+ // like <p/ / />

+ // Pass length to regexp callback _filter_url_trim(NULL, variable_get('filter_url_length_'. $format, 72)); - $text = ' '. $text .' '; - - // Match absolute URLs. - $text = preg_replace_callback("`(

|

  • ||[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text); + $text = "\n". $text ."\n"; + // The structure of each three matches below is + // 1. a tag from the list above or newline or + any characters except "<" + // 2. the url to match + // 4. possible period or other marks at end of url are considered separate (end of sentence) + // 3. end url in space, newline or tag ("<" character) (possibly preceded by 4) + // + // (The numbering above reflects the ordering of opening parenthesis) + // All other parenthesis but these are non-capturing, ie (?: ) + + // In many cases the patterns below only match the first url if there are many + // in the same paragraph. Therefore the preg_replace() calls are encapsulated + // in loops that will re-run the replace as many times as needed. + + // Match absolute URLs. + $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://'; + $urlpattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])"; + $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i"; + while(preg_match($re, $text)) { + $text = preg_replace_callback($re, '_filter_url_parse_full_links', $text); + } + // Match e-mail addresses. - $text = preg_replace("`(

    |

  • ||[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '\1\2\3', $text); - + $urlpattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4}'; + $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i"; + while(preg_match($re, $text)) { + $text = preg_replace($re, '\1\2\3', $text); + } + // Match www domains/addresses. - $text = preg_replace_callback("`(

    |

  • |[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(

    |
  • ||[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text); + $urlpattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]'; + $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i"; + while(preg_match($re, $text)) { + $text = preg_replace_callback($re, '_filter_url_parse_partial_links', $text); + } + $text = substr($text, 1, -1); - return $text; } @@ -1175,24 +1210,29 @@ function _filter_htmlcorrector($text) { return $output; } + /** - * Make links out of absolute URLs. + * Callback function. Make links out of absolute URLs. */ function _filter_url_parse_full_links($match) { - $match[2] = decode_entities($match[2]); - $caption = check_plain(_filter_url_trim($match[2])); - $match[2] = check_url($match[2]); - return $match[1] . ''. $caption .''. $match[5]; + // Which capturing parenthesis in the regexp contains the url? + $i = 2; + $match[$i] = decode_entities($match[$i]); + $caption = check_plain(_filter_url_trim($match[$i])); + $match[$i] = check_url($match[$i]); + return $match[1] . ''. $caption .''. $match[$i+1]; } /** - * Make links out of domain names starting with "www." + * Callback function. Make links out of domain names starting with "www." */ function _filter_url_parse_partial_links($match) { - $match[2] = decode_entities($match[2]); - $caption = check_plain(_filter_url_trim($match[2])); - $match[2] = check_plain($match[2]); - return $match[1] . ''. $caption .''. $match[3]; + // Which parenthesis in the regexp contains the url? + $i = 2; + $match[$i] = decode_entities($match[$i]); + $caption = check_plain(_filter_url_trim($match[$i])); + $match[$i] = check_plain($match[$i]); + return $match[1] . ''. $caption .''. $match[$i+1]; } /**