From 834c4ad0bf82f28949b108eb6c957fde3c18e1ce Mon Sep 17 00:00:00 2001
From: Cash Costello <cash.costello@gmail.com>
Date: Sat, 20 Apr 2013 11:07:44 -0400
Subject: Fixes #5369 allows ! in urls and adds unit tests

---
 engine/lib/output.php                 | 25 ++++++++++-------
 engine/tests/regression/trac_bugs.php | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 10 deletions(-)
diff --git a/engine/lib/output.php b/engine/lib/output.php
index c5a04989b..fe5bbcaaf 100644
--- a/engine/lib/output.php
+++ b/engine/lib/output.php
@@ -13,28 +13,33 @@
  * @param string $text The input string
  *
  * @return string The output string with formatted links
- **/
+ */
 function parse_urls($text) {
+
+	// URI specification: http://www.ietf.org/rfc/rfc3986.txt
+	// This varies from the specification in the following ways:
+	//  * Supports non-ascii characters
+	//  * Does not allow parentheses and single quotes
+	//  * Cuts off commas, exclamation points, and periods when they are the last character
+
 	// @todo this causes problems with <attr = "val">
 	// must be in <attr="val"> format (no space).
 	// By default htmlawed rewrites tags to this format.
 	// if PHP supported conditional negative lookbehinds we could use this:
 	// $r = preg_replace_callback('/(?<!=)(?<![ ])?(?<!["\'])((ht|f)tps?:\/\/[^\s\r\n\t<>"\'\!\(\),]+)/i',
-	//
-	// we can put , in the list of excluded char but need to keep . because of domain names.
-	// it is removed in the callback.
-	$r = preg_replace_callback('/(?<!=)(?<!["\'])((ht|f)tps?:\/\/[^\s\r\n\t<>"\'\!\(\),]+)/i',
+	$r = preg_replace_callback('/(?<!=)(?<!["\'])((ht|f)tps?:\/\/[^\s\r\n\t<>"\'\(\)]+)/i',
 	create_function(
 		'$matches',
 		'
 			$url = $matches[1];
-			$period = \'\';
-			if (substr($url, -1, 1) == \'.\') {
-				$period = \'.\';
-				$url = trim($url, \'.\');
+			$punc = \'\';
+			$last = substr($url, -1, 1);
+			if (in_array($last, array(".", "!", ","))) {
+				$punc = $last;
+				$url = rtrim($url, ".!,");
 			}
 			$urltext = str_replace("/", "/<wbr />", $url);
-			return "<a href=\"$url\">$urltext</a>$period";
+			return "<a href=\"$url\">$urltext</a>$punc";
 		'
 	), $text);
 
diff --git a/engine/tests/regression/trac_bugs.php b/engine/tests/regression/trac_bugs.php
index 58444dd39..83b78bc6b 100644
--- a/engine/tests/regression/trac_bugs.php
+++ b/engine/tests/regression/trac_bugs.php
@@ -236,4 +236,56 @@ class ElggCoreRegressionBugsTest extends ElggCoreUnitTest {
 			$this->assertIdentical($expected, $friendly_title);
 		}
 	}
+
+	/**
+	 * Test #5369 -- parse_urls()
+	 * https://github.com/Elgg/Elgg/issues/5369
+	 */
+	public function test_parse_urls() {
+
+		$cases = array(
+			'no.link.here' =>
+				'no.link.here',
+			'simple link http://example.org test' =>
+				'simple link <a href="http://example.org">http:/<wbr />/<wbr />example.org</a> test',
+			'non-ascii http://ñew.org/ test' =>
+				'non-ascii <a href="http://ñew.org/">http:/<wbr />/<wbr />ñew.org/<wbr /></a> test',
+
+			// section 2.1
+			'percent encoded http://example.org/a%20b test' =>
+				'percent encoded <a href="http://example.org/a%20b">http:/<wbr />/<wbr />example.org/<wbr />a%20b</a> test',
+			// section 2.2: skipping single quotes and parentheses
+			'reserved characters http://example.org/:/?#[]@!$&*+,;= test' =>
+				'reserved characters <a href="http://example.org/:/?#[]@!$&*+,;=">http:/<wbr />/<wbr />example.org/<wbr />:/<wbr />?#[]@!$&*+,;=</a> test',
+			// section 2.3
+			'unreserved characters http://example.org/a1-._~ test' =>
+				'unreserved characters <a href="http://example.org/a1-._~">http:/<wbr />/<wbr />example.org/<wbr />a1-._~</a> test',
+
+			'parameters http://example.org/?val[]=1&val[]=2 test' =>
+				'parameters <a href="http://example.org/?val[]=1&val[]=2">http:/<wbr />/<wbr />example.org/<wbr />?val[]=1&val[]=2</a> test',
+			'port http://example.org:80/ test' =>
+				'port <a href="http://example.org:80/">http:/<wbr />/<wbr />example.org:80/<wbr /></a> test',
+
+			'parentheses (http://www.google.com) test' =>
+				'parentheses (<a href="http://www.google.com">http:/<wbr />/<wbr />www.google.com</a>) test',
+			'comma http://elgg.org, test' =>
+				'comma <a href="http://elgg.org">http:/<wbr />/<wbr />elgg.org</a>, test',
+			'period http://elgg.org. test' =>
+				'period <a href="http://elgg.org">http:/<wbr />/<wbr />elgg.org</a>. test',
+			'exclamation http://elgg.org! test' =>
+				'exclamation <a href="http://elgg.org">http:/<wbr />/<wbr />elgg.org</a>! test',
+
+			'already anchor <a href="http://twitter.com/">twitter</a> test' =>
+				'already anchor <a href="http://twitter.com/">twitter</a> test',
+
+			'ssl https://example.org/ test' =>
+				'ssl <a href="https://example.org/">https:/<wbr />/<wbr />example.org/<wbr /></a> test',
+			'ftp ftp://example.org/ test' =>
+				'ftp <a href="ftp://example.org/">ftp:/<wbr />/<wbr />example.org/<wbr /></a> test',
+
+		);
+		foreach ($cases as $input => $output) {
+			$this->assertEqual($output, parse_urls($input));
+		}
+	}
 }
-- 
cgit v1.2.3