--- modules\filter\filter.module.org Sun Jul 01 20:41:14 2007
+++ modules\filter\filter.module Thu Aug 02 20:39:11 2007
@@ -1083,24 +1083,67 @@ function _filter_url_settings($format) {
* ftp links, etc.) into hyperlinks.
*/
function _filter_url($text, $format) {
+ // The following tags may contain text with a URL that should be converted to a link
+ // List also contains standalone/empty tags like img and br
+ $tags_begin = 'caption|td|th|div|dd|dt|li|blockquote|address|p|h[1-6]|i|b|u|tt|big|small|strike|s|em|strong|font|dfn|samp|kbd|var|cite|abbr|acronym|sub|sup|q|ins|del|img|br|object|applet';
+
+
+ // Transform each tagname from 'p' to '<\/?p(?:\s[^>]|\/)*>'.
+ // This pattern will match <p>, </p>, <p/> and <p />.
+ // The last ones are significant for xhtml standalone tags like <br/> and <br />
+ // Note: the pattern will obviously match many invalid tag-like constructs too
+ // like </p/>
+ $arr_tags_begin = explode( "|", $tags_begin );
+ for ( $i = 0; $i < count( $arr_tags_begin ); $i++ ){
+ $arr_tags_begin[$i] = '<\/?' . $arr_tags_begin[$i] . '(?:\s[^>]|\/)*>';
+ }
+ $tags_begin = implode( "|", $arr_tags_begin );
+
// Pass length to regexp callback
_filter_url_trim(NULL, variable_get('filter_url_length_'. $format, 72));
$text = ' '. $text .' ';
- // Match absolute URLs.
- $text = preg_replace_callback("`(
|
|
|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(||
|[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text);
-
+ // The structure of each of the three patterns below is
+ // 1. a tag from the list above, a newline, or nothing, followed by any characters except "<"
+ // 2. the url to match
+ // 4. possible period or other marks at end of url are considered separate (end of sentence)
+ // 3. end url in space, newline or tag ("<" character) (possibly preceded by 4)
+ //
+ // (The numbering above reflects the ordering of opening parentheses)
+ // All other parentheses but these are non-capturing, ie (?: )
+
+ // In many cases the patterns below only match the first url if there are many
+ // in the same paragraph. Therefore the preg_replace() calls are encapsulated
+ // in loops that will re-run the replace as many times as needed.
+
+ // Match absolute URLs.
+ $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://';
+ $urlpattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])";
+ $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i";
+ while(preg_match($re, $text)) {
+ $text = preg_replace_callback($re, '_filter_url_parse_full_links', $text);
+ }
+
// Match e-mail addresses.
- $text = preg_replace("`(|
|
|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(||
|[ \n\r\t\)]))`i", '\1\2\3', $text);
-
+ $urlpattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4}';
+ $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i";
+ while(preg_match($re, $text)) {
+ $text = preg_replace($re, '\1\2\3', $text);
+ }
+
// Match www domains/addresses.
- $text = preg_replace_callback("`(|
|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(||
|[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text);
+ $urlpattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]';
+ $re = "`((?:$tags_begin|\n|\r|)(?:[^<])*?)($urlpattern)(([\.\,\?\!]*?)[\<\s\n\r])`i";
+ while(preg_match($re, $text)) {
+ $text = preg_replace_callback($re, '_filter_url_parse_partial_links', $text);
+ }
+
$text = substr($text, 1, -1);
-
return $text;
}
+
/**
* Scan input and make sure that all HTML tags are properly closed and nested.
*/
@@ -1175,24 +1218,29 @@ function _filter_htmlcorrector($text) {
return $output;
}
+
/**
- * Make links out of absolute URLs.
+ * Callback function. Make links out of absolute URLs.
*/
function _filter_url_parse_full_links($match) {
- $match[2] = decode_entities($match[2]);
- $caption = check_plain(_filter_url_trim($match[2]));
- $match[2] = check_url($match[2]);
- return $match[1] . ''. $caption .''. $match[5];
+ // Which capturing parenthesis in the regexp contains the url?
+ $i = 2;
+ $match[$i] = decode_entities($match[$i]);
+ $caption = check_plain(_filter_url_trim($match[$i]));
+ $match[$i] = check_url($match[$i]);
+ return $match[1] . ''. $caption .''. $match[$i+1];
}
/**
- * Make links out of domain names starting with "www."
+ * Callback function. Make links out of domain names starting with "www."
*/
function _filter_url_parse_partial_links($match) {
- $match[2] = decode_entities($match[2]);
- $caption = check_plain(_filter_url_trim($match[2]));
- $match[2] = check_plain($match[2]);
- return $match[1] . ''. $caption .''. $match[3];
+ // Which capturing parenthesis in the regexp contains the url?
+ $i = 2;
+ $match[$i] = decode_entities($match[$i]);
+ $caption = check_plain(_filter_url_trim($match[$i]));
+ $match[$i] = check_plain($match[$i]);
+ return $match[1] . ''. $caption .''. $match[$i+1];
}
/**