1 files changed, 93 insertions, 55 deletions
diff --git a/engine/lib/output.php b/engine/lib/output.php
index b1245a924..bff0bf6e9 100644
--- a/engine/lib/output.php
+++ b/engine/lib/output.php
@@ -16,7 +16,7 @@
  **/
 function parse_urls($text) {
 	// @todo this causes problems with <attr = "val">
-	// must be ing <attr="val"> format (no space).
+	// must be in <attr="val"> format (no space).
 	// By default htmlawed rewrites tags to this format.
 	// if PHP supported conditional negative lookbehinds we could use this:
 	// $r = preg_replace_callback('/(?<!=)(?<![ ])?(?<!["\'])((ht|f)tps?:\/\/[^\s\r\n\t<>"\'\!\(\),]+)/i',
@@ -43,51 +43,26 @@ function parse_urls($text) {
 
 /**
  * Create paragraphs from text with line spacing
- * Borrowed from Wordpress.
  *
  * @param string $pee The string
- * @param bool   $br  Add BRs?
+ * @deprecated Use elgg_autop instead
+ * @todo Add deprecation warning in 1.9
  *
- * @todo Rewrite
  * @return string
  **/
-function autop($pee, $br = 1) {
-	$pee = $pee . "\n"; // just to make things a little easier, pad the end
-	$pee = preg_replace('|<br />\s*<br />|', "\n\n", $pee);
-	// Space things out a little
-	$allblocks = '(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr)';
-	$pee = preg_replace('!(<' . $allblocks . '[^>]*>)!', "\n$1", $pee);
-	$pee = preg_replace('!(</' . $allblocks . '>)!', "$1\n\n", $pee);
-	$pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
-	if (strpos($pee, '<object') !== false) {
-		$pee = preg_replace('|\s*<param([^>]*)>\s*|', "<param$1>", $pee); // no pee inside object/embed
-		$pee = preg_replace('|\s*</embed>\s*|', '</embed>', $pee);
-	}
-	$pee = preg_replace("/\n\n+/", "\n\n", $pee); // take care of duplicates
-	$pee = preg_replace('/\n?(.+?)(?:\n\s*\n|\z)/s', "<p>$1</p>\n", $pee); // make paragraphs, including one at the end
-	$pee = preg_replace('|<p>\s*?</p>|', '', $pee); // under certain strange conditions it could create a P of entirely whitespace
-	$pee = preg_replace('!<p>([^<]+)\s*?(</(?:div|address|form)[^>]*>)!', "<p>$1</p>$2", $pee);
-	$pee = preg_replace('|<p>|', "$1<p>", $pee);
-	$pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee); // don't pee all over a tag
-	$pee = preg_replace("|<p>(<li.+?)</p>|", "$1", $pee); // problem with nested lists
-	$pee = preg_replace('|<p><blockquote([^>]*)>|i', "<blockquote$1><p>", $pee);
-	$pee = str_replace('</blockquote></p>', '</p></blockquote>', $pee);
-	$pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)!', "$1", $pee);
-	$pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee);
-	if ($br) {
-		$pee = preg_replace_callback('/<(script|style).*?<\/\\1>/s', create_function('$matches', 'return str_replace("\n", "<WPPreserveNewline />", $matches[0]);'), $pee);
-		$pee = preg_replace('|(?<!<br />)\s*\n|', "<br />\n", $pee); // optionally make line breaks
-		$pee = str_replace('<WPPreserveNewline />', "\n", $pee);
-	}
-	$pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*<br />!', "$1", $pee);
-	$pee = preg_replace('!<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)!', '$1', $pee);
-	//if (strpos($pee, '<pre') !== false) {
-	//	mind the space between the ? and >.  Only there because of the comment.
-	//	$pee = preg_replace_callback('!(<pre.*? >)(.*?)</pre>!is', 'clean_pre', $pee );
-	//}
-	$pee = preg_replace("|\n</p>$|", '</p>', $pee);
-
-	return $pee;
+function autop($pee) {
+	return elgg_autop($pee);
+}
+
+/**
+ * Create paragraphs from text with line spacing
+ *
+ * @param string $string The string
+ *
+ * @return string
+ **/
+function elgg_autop($string) {
+	return ElggAutoP::getInstance()->process($string);
 }
 
 /**
@@ -271,8 +246,8 @@ function elgg_normalize_url($url) {
 		// '?query=test', #target
 		return $url;
 	
-	} elseif (stripos($url, 'javascript:') === 0) {
-		// 'javascript:'
+	} elseif (stripos($url, 'javascript:') === 0 || stripos($url, 'mailto:') === 0) {
+		// 'javascript:' and 'mailto:'
 		// Not covered in FILTER_VALIDATE_URL
 		return $url;
 
@@ -310,19 +285,11 @@ function elgg_get_friendly_title($title) {
 		return $result;
 	}
 
-	// @todo not using this because of locale concerns
-	//$title = iconv('UTF-8', 'ASCII//TRANSLIT', $title);
+	// handle some special cases
+	$title = str_replace('&amp;', 'and', $title);
 
-	// @todo this uses a utf8 character class. can use if
-	// we want to support utf8 in the url.
-	//$title = preg_replace('/[^\p{L}\- ]/u', '', $title);
+	$title = ElggTranslit::urlize($title);
 
-	// use A-Za-z0-9_ instead of \w because \w is locale sensitive
-	$title = preg_replace("/[^A-Za-z0-9_\- ]/", "", $title);
-	$title = str_replace(" ", "-", $title);
-	$title = str_replace("--", "-", $title);
-	$title = trim($title);
-	$title = elgg_strtolower($title);
 	return $title;
 }
 
@@ -392,7 +359,7 @@ function elgg_get_friendly_time($time) {
 /**
  * Strip tags and offer plugins the chance.
  * Plugins register for output:strip_tags plugin hook.
- * 	Original string included in $params['original_string']
+ * Original string included in $params['original_string']
  *
  * @param string $string Formatted string
  *
@@ -406,3 +373,74 @@ function elgg_strip_tags($string) {
 
 	return $string;
 }
+
+/**
+ * Apply html_entity_decode() to a string while re-entitising HTML
+ * special char entities to prevent them from being decoded back to their
+ * unsafe original forms.
+ *
+ * This relies on html_entity_decode() not translating entities when
+ * doing so leaves behind another entity, e.g. &amp;gt; if decoded would
+ * create &gt; which is another entity itself. This seems to escape the
+ * usual behaviour where any two paired entities creating a HTML tag are
+ * usually decoded, i.e. a lone &gt; is not decoded, but &lt;foo&gt; would
+ * be decoded to <foo> since it creates a full tag.
+ *
+ * Note: This function is poorly explained in the manual - which is really
+ * bad given its potential for misuse on user input already escaped elsewhere.
+ * Stackoverflow is littered with advice to use this function in the precise
+ * way that would lead to user input being capable of injecting arbitrary HTML.
+ *
+ * @param string $string
+ *
+ * @return string
+ *
+ * @author Pádraic Brady
+ * @copyright Copyright (c) 2010 Pádraic Brady (http://blog.astrumfutura.com)
+ * @license Released under dual-license GPL2/MIT by explicit permission of Pádraic Brady
+ *
+ * @access private
+ */
+function _elgg_html_decode($string) {
+	$string = str_replace(
+		array('&gt;', '&lt;', '&amp;', '&quot;', '&#039;'),
+		array('&amp;gt;', '&amp;lt;', '&amp;amp;', '&amp;quot;', '&amp;#039;'),
+		$string
+	);
+	$string = html_entity_decode($string, ENT_NOQUOTES, 'UTF-8');
+	$string = str_replace(
+		array('&amp;gt;', '&amp;lt;', '&amp;amp;', '&amp;quot;', '&amp;#039;'),
+		array('&gt;', '&lt;', '&amp;', '&quot;', '&#039;'),
+		$string
+	);
+	return $string;
+}
+
+/**
+ * Unit tests for Output
+ *
+ * @param sting  $hook   unit_test
+ * @param string $type   system
+ * @param mixed  $value  Array of tests
+ * @param mixed  $params Params
+ *
+ * @return array
+ * @access private
+ */
+function output_unit_test($hook, $type, $value, $params) {
+	global $CONFIG;
+	$value[] = $CONFIG->path . 'engine/tests/api/output.php';
+	return $value;
+}
+
+/**
+ * Initialise the Output subsystem.
+ *
+ * @return void
+ * @access private
+ */
+function output_init() {
+	elgg_register_plugin_hook_handler('unit_test', 'system', 'output_unit_test');
+}
+
+elgg_register_event_handler('init', 'system', 'output_init');