Fixes #1479. Added ElggAutoP. Removing [\n\r] from test strings before compare to deal with differing whitespace between tags among PHP versions.

author: Brett Profitt <brett.profitt@gmail.com> 2012-12-10 15:50:25 -0500
committer: Brett Profitt <brett.profitt@gmail.com> 2012-12-10 15:50:25 -0500
commit: 43a395ae735777bfb5474c4f6a37dc1cd0818a37 (patch)
tree: 6c156136680474a61cf162d8ef3aceef12cfc7dd /engine/classes/ElggAutoP.php
parent: 4c517db146cb1c59c8a54d9e87b9e5a4ae17987e (diff)
download: elgg-43a395ae735777bfb5474c4f6a37dc1cd0818a37.tar.gz
elgg-43a395ae735777bfb5474c4f6a37dc1cd0818a37.tar.bz2
1 files changed, 309 insertions, 0 deletions
diff --git a/engine/classes/ElggAutoP.php b/engine/classes/ElggAutoP.php
new file mode 100644
index 000000000..89d77e583
--- /dev/null
+++ b/engine/classes/ElggAutoP.php
@@ -0,0 +1,309 @@
+<?php
+
+/**
+ * Create wrapper P and BR elements in HTML depending on newlines. Useful when
+ * users use newlines to signal line and paragraph breaks. In all cases output
+ * should be well-formed markup.
+ *
+ * In DIV elements, Ps are only added when there would be at
+ * least two of them.
+ */
+class ElggAutoP {
+
+	public $encoding = 'UTF-8';
+
+	/**
+	 * @var DOMDocument
+	 */
+	protected $_doc = null;
+
+	/**
+	 * @var DOMXPath
+	 */
+	protected $_xpath = null;
+
+	protected $_blocks = 'address article area aside blockquote caption col colgroup dd 
+		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header 
+		hr hgroup legend map math menu nav noscript p pre section select style summary
+		table tbody td tfoot th thead tr ul ol option li';
+
+	/**
+	 * @var array
+	 */
+	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
+		del dfn em embed i iframe img input ins kbd keygen label map mark meter object
+		output progress q rp rt ruby s samp script select small source span strong style
+		sub sup textarea time var video wbr';
+
+	/**
+	 * Descend into these elements to add Ps
+	 *
+	 * @var array
+	 */
+	protected $_descendList = 'article aside blockquote body details div footer form
+		header section';
+
+	/**
+	 * Add Ps inside these elements
+	 *
+	 * @var array
+	 */
+	protected $_alterList = 'article aside blockquote body details div footer header
+		section';
+
+	protected $_unique = '';
+
+	public function __construct() {
+		// each list starts as a whitespace-separated string; split it into an array
+		foreach (array('_blocks', '_descendList', '_alterList', '_inlines') as $listName) {
+			$this->$listName = preg_split('@\\s+@', $this->$listName);
+		}
+		$this->_unique = md5(__FILE__);
+	}
+
+	/**
+	 * Instance of class for singleton pattern.
+	 * @var ElggAutoP
+	 */
+	private static $instance;
+	
+	/**
+	 * Return the shared ElggAutoP instance, creating it on first use.
+	 * @return ElggAutoP
+	 */
+	public static function getInstance() {
+		if (self::$instance instanceof self) {
+			return self::$instance;
+		}
+		self::$instance = new self();
+		return self::$instance;
+	}
+	
+	/**
+	 * Create wrapper P and BR elements in HTML depending on newlines. Useful when
+	 * users use newlines to signal line and paragraph breaks. In all cases output
+	 * should be well-formed markup.
+	 *
+	 * In DIV elements, Ps are only added when there would be at least two of them.
+	 *
+	 * @param string $html snippet
+	 * @return string|false output or false if parse error occurred
+	 */
+	public function process($html) {
+		// normalize whitespace
+		$html = str_replace(array("\r\n", "\r"), "\n", $html);
+
+		// allows preserving entities untouched
+		$html = str_replace('&', $this->_unique . 'AMP', $html);
+
+		$this->_doc = new DOMDocument();
+
+		// parse to DOM, buffering loadHTML warnings instead of emitting them; the
+		// caller's libxml error mode is put back once parsing is finished
+		// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
+		$previousErrorMode = libxml_use_internal_errors(true);
+		if (!$this->_doc->loadHTML("<html><meta http-equiv='content-type' "
+				. "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
+				. "</html>")) {
+			libxml_clear_errors();
+			libxml_use_internal_errors($previousErrorMode);
+			return false;
+		}
+
+		$this->_xpath = new DOMXPath($this->_doc);
+		// start processing recursively at the BODY element
+		$nodeList = $this->_xpath->query('//body[1]');
+		$this->_addParagraphs($nodeList->item(0));
+
+		// serialize back to HTML
+		$html = $this->_doc->saveHTML();
+
+		// split AUTOPs into multiples at /\n\n+/
+		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
+		$newlineMarkers = array($this->_unique . 'BR', $this->_unique . 'NL', '<br>');
+		$html = str_replace($newlineMarkers, '<br />', $html);
+		$html = str_replace('<br /></autop>', '</autop>', $html);
+
+		// re-parse so we can handle new AUTOP elements
+		$reparsed = $this->_doc->loadHTML($html);
+		// no more parsing below: drop buffered errors and restore the caller's mode
+		libxml_clear_errors();
+		libxml_use_internal_errors($previousErrorMode);
+		if (!$reparsed) {
+			return false;
+		}
+		// must re-create XPath object after DOM load
+		$this->_xpath = new DOMXPath($this->_doc);
+
+		// strip AUTOPs that only have comments/whitespace
+		foreach ($this->_xpath->query('//autop') as $autop) {
+			$hasContent = (trim($autop->textContent) !== '');
+			if (!$hasContent) {
+				foreach ($autop->childNodes as $node) {
+					if ($node->nodeType === XML_ELEMENT_NODE) {
+						$hasContent = true;
+						break;
+					}
+				}
+			}
+			if (!$hasContent) {
+				// strip w/ preg_replace later (faster than moving nodes out)
+				$autop->setAttribute("r", "1");
+			}
+		}
+
+		// remove a single AUTOP inside certain elements
+		foreach ($this->_xpath->query('//div') as $el) {
+			$autops = $this->_xpath->query('./autop', $el);
+			if ($autops->length === 1) {
+				// strip w/ preg_replace later (faster than moving nodes out)
+				$autops->item(0)->setAttribute("r", "1");
+			}
+		}
+
+		$html = $this->_doc->saveHTML();
+
+		// trim to the contents of BODY
+		$bodyStart = strpos($html, '<body>');
+		$bodyEnd = strpos($html, '</body>', $bodyStart + 6);
+		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);
+
+		// strip AUTOPs that should be removed; "s" lets the match span line breaks
+		$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);
+
+		// commit to converting AUTOPs to Ps
+		$html = str_replace('<autop>', "\n<p>", $html);
+		$html = str_replace('</autop>', "</p>\n", $html);
+		$html = str_replace('<br>', '<br />', $html);
+		$html = str_replace($this->_unique . 'AMP', '&', $html);
+		return $html;
+	}
+
+	/**
+	 * Wrap inline content in AUTOP placeholders and replace newlines with marker tokens
+	 *
+	 * @param DOMElement $el element to start at; matching descendants are queued, not recursed into
+	 */
+	protected function _addParagraphs(DOMElement $el) {
+		// no need to recurse, just queue up
+		$elsToProcess = array($el);
+		$inlinesToProcess = array();
+		while ($el = array_shift($elsToProcess)) {
+			// if true, we can alter all child nodes, if not, we'll just call
+			// _addParagraphs on each element in the descendInto list
+			$alterInline = in_array($el->nodeName, $this->_alterList);
+
+			// inside affected elements, we want to trim leading whitespace from
+			// the first text node
+			$ltrimFirstTextNode = true;
+
+			// should we open a new AUTOP element to move inline elements into?
+			$openP = true;
+			$autop = null;
+
+			// after BR, ignore a newline
+			$isFollowingBr = false;
+
+			$node = $el->firstChild;
+			while (null !== $node) {
+				if ($alterInline) {
+					if ($openP) {
+						$openP = false;
+						// create a P to move inline content into (this may be removed later)
+						$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
+					}
+				}
+
+				$isElement = ($node->nodeType === XML_ELEMENT_NODE);
+				if ($isElement) {
+					$elName = $node->nodeName;
+				}
+				$isBlock = ($isElement && in_array($elName, $this->_blocks));
+
+				if ($alterInline) {
+					$isInline = $isElement && ! $isBlock;
+					$isText = ($node->nodeType === XML_TEXT_NODE);
+					$isLastInline = (! $node->nextSibling
+							|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
+								&& in_array($node->nextSibling->nodeName, $this->_blocks)));
+					if ($isElement) {
+						$isFollowingBr = ($node->nodeName === 'br');
+					}
+
+					if ($isText) {
+						$nodeText = $node->nodeValue;
+						if ($ltrimFirstTextNode) {
+							$nodeText = ltrim($nodeText);
+							$ltrimFirstTextNode = false;
+						}
+						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
+							// if a user ends a line with <br>, don't add a second BR
+							$nodeText = substr($nodeText, strlen($m[0]));
+						}
+						if ($isLastInline) {
+							$nodeText = rtrim($nodeText);
+						}
+						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
+						$tmpNode = $node;
+						$node = $node->nextSibling; // move loop to next node
+
+						// alter node in place, then move into AUTOP
+						$tmpNode->nodeValue = $nodeText;
+						$autop->appendChild($tmpNode);
+
+						continue;
+					}
+				}
+				if ($isBlock || ! $node->nextSibling) {
+					if ($isBlock) {
+						if (in_array($node->nodeName, $this->_descendList)) {
+							$elsToProcess[] = $node;
+							// queued rather than recursed into; handled by the outer while loop
+						}
+					}
+					$openP = true;
+					$ltrimFirstTextNode = true;
+				}
+				if ($alterInline) {
+					if (! $isBlock) {
+						$tmpNode = $node;
+						if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
+							$inlinesToProcess[] = $tmpNode;
+						}
+						$node = $node->nextSibling;
+						$autop->appendChild($tmpNode);
+						continue;
+					}
+				}
+
+				$node = $node->nextSibling;
+			}
+		}
+
+		// handle inline nodes
+		// no need to recurse, just queue up
+		while ($el = array_shift($inlinesToProcess)) {
+			$ignoreLeadingNewline = false;
+			foreach ($el->childNodes as $node) {
+				if ($node->nodeType === XML_ELEMENT_NODE) {
+					// detect BR by element name (lowercase, as in the check above), not by its text
+					if ($node->nodeName === 'br') {
+						$ignoreLeadingNewline = true;
+					} else {
+						$ignoreLeadingNewline = false;
+						if (false !== strpos($node->textContent, "\n")) {
+							$inlinesToProcess[] = $node;
+						}
+					}
+				} elseif ($node->nodeType === XML_TEXT_NODE) {
+					$text = $node->nodeValue;
+					if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
+						$text = substr($text, 1);
+						$ignoreLeadingNewline = false;
+					}
+					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
+				}
+			}
+		}
+	}
+}
author	Brett Profitt <brett.profitt@gmail.com>	2012-12-10 15:50:25 -0500
committer	Brett Profitt <brett.profitt@gmail.com>	2012-12-10 15:50:25 -0500
commit	43a395ae735777bfb5474c4f6a37dc1cd0818a37 (patch)
tree	6c156136680474a61cf162d8ef3aceef12cfc7dd /engine/classes/ElggAutoP.php
parent	4c517db146cb1c59c8a54d9e87b9e5a4ae17987e (diff)
download	elgg-43a395ae735777bfb5474c4f6a37dc1cd0818a37.tar.gz elgg-43a395ae735777bfb5474c4f6a37dc1cd0818a37.tar.bz2