aboutsummaryrefslogtreecommitdiff
path: root/engine/classes/ElggAutop.php
diff options
context:
space:
mode:
authorBrett Profitt <brett.profitt@gmail.com>2012-12-06 14:02:18 -0500
committerBrett Profitt <brett.profitt@gmail.com>2012-12-06 14:02:18 -0500
commita60a58a6e884af4ab89b118bf2f23132335381fe (patch)
tree3e27ff18d2a6c476b526cc214263c51186ca75ad /engine/classes/ElggAutop.php
parent6676577386c72d4a024c5c61a948589db8aaf9c7 (diff)
parent3bf72994688ad9292bf37444d80ab5ab1a002748 (diff)
downloadelgg-a60a58a6e884af4ab89b118bf2f23132335381fe.tar.gz
elgg-a60a58a6e884af4ab89b118bf2f23132335381fe.tar.bz2
Merge branch 'pr-420' into 1.8
Diffstat (limited to 'engine/classes/ElggAutop.php')
-rw-r--r--engine/classes/ElggAutop.php315
1 files changed, 315 insertions, 0 deletions
diff --git a/engine/classes/ElggAutop.php b/engine/classes/ElggAutop.php
new file mode 100644
index 000000000..fa0c34225
--- /dev/null
+++ b/engine/classes/ElggAutop.php
@@ -0,0 +1,315 @@
+<?php
+
+/**
+ * Create wrapper P and BR elements in HTML depending on newlines. Useful when
+ * users use newlines to signal line and paragraph breaks. In all cases output
+ * should be well-formed markup.
+ *
+ * In DIV elements, Ps are only added when there would be at
+ * least two of them.
+ *
+ * @author Steve Clay <steve@mrclay.org>
+ * @license http://www.opensource.org/licenses/mit-license.php MIT License
+ */
+class ElggAutop {
+
+ public $encoding = 'UTF-8';
+
+ /**
+ * @var DOMDocument
+ */
+ protected $_doc = null;
+
+ /**
+ * @var DOMXPath
+ */
+ protected $_xpath = null;
+
+ protected $_blocks = 'address article area aside blockquote caption col colgroup dd
+ details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
+ hr hgroup legend map math menu nav noscript p pre section select style summary
+ table tbody td tfoot th thead tr ul ol option li';
+
+ /**
+ * @var array
+ */
+ protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
+ del dfn em embed i iframe img input ins kbd keygen label map mark meter object
+ output progress q rp rt ruby s samp script select small source span strong style
+ sub sup textarea time var video wbr';
+
+ /**
+ * Descend into these elements to add Ps
+ *
+ * @var array
+ */
+ protected $_descendList = 'article aside blockquote body details div footer form
+ header section';
+
+ /**
+ * Add Ps inside these elements
+ *
+ * @var array
+ */
+ protected $_alterList = 'article aside blockquote body details div footer header
+ section';
+
+ protected $_unique = '';
+
+ public function __construct()
+ {
+ $this->_blocks = preg_split('@\\s+@', $this->_blocks);
+ $this->_descendList = preg_split('@\\s+@', $this->_descendList);
+ $this->_alterList = preg_split('@\\s+@', $this->_alterList);
+ $this->_inlines = preg_split('@\\s+@', $this->_inlines);
+ $this->_unique = md5(__FILE__);
+ }
+
+ /**
+ * Intance of class for singleton pattern.
+ * @var ElggAutop
+ */
+ private static $instance;
+
+ /**
+ * Singleton pattern.
+ * @return ElggAutop
+ */
+ public static function getInstance() {
+ $className = __CLASS__;
+ if (!(self::$instance instanceof $className)) {
+ self::$instance = new $className();
+ }
+ return self::$instance;
+ }
+
+ /**
+ * Create wrapper P and BR elements in HTML depending on newlines. Useful when
+ * users use newlines to signal line and paragraph breaks. In all cases output
+ * should be well-formed markup.
+ *
+ * In DIV, LI, TD, and TH elements, Ps are only added when their would be at
+ * least two of them.
+ *
+ * @param string $html snippet
+ * @return string|false output or false if parse error occurred
+ */
+ public function process($html)
+ {
+ // normalize whitespace
+ $html = str_replace(array("\r\n", "\r"), "\n", $html);
+
+ // allows preserving entities untouched
+ $html = str_replace('&', $this->_unique . 'AMP', $html);
+
+ $this->_doc = new DOMDocument();
+
+ // parse to DOM, suppressing loadHTML warnings
+ // http://www.php.net/manual/en/domdocument.loadhtml.php#95463
+ libxml_use_internal_errors(true);
+ if (! @$this->_doc->loadHTML("<html><meta http-equiv='content-type' "
+ . "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
+ . "</html>")) {
+ return false;
+ }
+
+ $this->_xpath = new DOMXPath($this->_doc);
+ // start processing recursively at the BODY element
+ $nodeList = $this->_xpath->query('//body[1]');
+ $this->_addParagraphs($nodeList->item(0));
+
+ // serialize back to HTML
+ $html = $this->_doc->saveHTML();
+
+ // split AUTOPs into multiples at /\n\n+/
+ $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
+ $html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
+ '<br />',
+ $html);
+ $html = str_replace('<br /></autop>', '</autop>', $html);
+
+ // re-parse so we can handle new AUTOP elements
+
+ if (! @$this->_doc->loadHTML($html)) {
+ return false;
+ }
+ // must re-create XPath object after DOM load
+ $this->_xpath = new DOMXPath($this->_doc);
+
+ // strip AUTOPs that only have comments/whitespace
+ foreach ($this->_xpath->query('//autop') as $autop) {
+ $hasContent = false;
+ if (trim($autop->textContent) !== '') {
+ $hasContent = true;
+ } else {
+ foreach ($autop->childNodes as $node) {
+ if ($node->nodeType === XML_ELEMENT_NODE) {
+ $hasContent = true;
+ break;
+ }
+ }
+ }
+ if (! $hasContent) {
+ // strip w/ preg_replace later (faster than moving nodes out)
+ $autop->setAttribute("r", "1");
+ }
+ }
+
+ // remove a single AUTOP inside certain elements
+
+ foreach ($this->_xpath->query('//div') as $el) {
+ $autops = $this->_xpath->query('./autop', $el);
+ if ($autops->length === 1) {
+ // strip w/ preg_replace later (faster than moving nodes out)
+ $autops->item(0)->setAttribute("r", "1");
+ }
+ }
+
+ $html = $this->_doc->saveHTML();
+
+ // trim to the contents of BODY
+ $bodyStart = strpos($html, '<body>');
+ $bodyEnd = strpos($html, '</body>', $bodyStart + 6);
+ $html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);
+
+ // strip AUTOPs that should be removed
+ $html = preg_replace('@<autop r="1">(.*?)</autop>@', '\\1', $html);
+
+ // commit to converting AUTOPs to Ps
+ $html = str_replace('<autop>', "\n<p>", $html);
+ $html = str_replace('</autop>', "</p>\n", $html);
+
+ $html = str_replace('<br>', '<br />', $html);
+ $html = str_replace($this->_unique . 'AMP', '&', $html);
+ return $html;
+ }
+
+ /**
+ * Add P and BR elements as necessary
+ *
+ * @param DOMElement $el
+ */
+ protected function _addParagraphs(DOMElement $el)
+ {
+ // no need to recurse, just queue up
+ $elsToProcess = array($el);
+ $inlinesToProcess = array();
+ while ($el = array_shift($elsToProcess)) {
+ // if true, we can alter all child nodes, if not, we'll just call
+ // _addParagraphs on each element in the descendInto list
+ $alterInline = in_array($el->nodeName, $this->_alterList);
+
+ // inside affected elements, we want to trim leading whitespace from
+ // the first text node
+ $ltrimFirstTextNode = true;
+
+ // should we open a new AUTOP element to move inline elements into?
+ $openP = true;
+ $autop = null;
+
+ // after BR, ignore a newline
+ $isFollowingBr = false;
+
+ $node = $el->firstChild;
+ while (null !== $node) {
+ if ($alterInline) {
+ if ($openP) {
+ $openP = false;
+ // create a P to move inline content into (this may be removed later)
+ $autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
+ }
+ }
+
+ $isElement = ($node->nodeType === XML_ELEMENT_NODE);
+ if ($isElement) {
+ $elName = $node->nodeName;
+ }
+ $isBlock = ($isElement && in_array($elName, $this->_blocks));
+
+ if ($alterInline) {
+ $isInline = $isElement && ! $isBlock;
+ $isText = ($node->nodeType === XML_TEXT_NODE);
+ $isLastInline = (! $node->nextSibling
+ || ($node->nextSibling->nodeType === XML_ELEMENT_NODE
+ && in_array($node->nextSibling->nodeName, $this->_blocks)));
+ if ($isElement) {
+ $isFollowingBr = ($node->nodeName === 'br');
+ }
+
+ if ($isText) {
+ $nodeText = $node->nodeValue;
+ if ($ltrimFirstTextNode) {
+ $nodeText = ltrim($nodeText);
+ $ltrimFirstTextNode = false;
+ }
+ if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
+ // if a user ends a line with <br>, don't add a second BR
+ $nodeText = substr($nodeText, strlen($m[0]));
+ }
+ if ($isLastInline) {
+ $nodeText = rtrim($nodeText);
+ }
+ $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
+ $tmpNode = $node;
+ $node = $node->nextSibling; // move loop to next node
+
+ // alter node in place, then move into AUTOP
+ $tmpNode->nodeValue = $nodeText;
+ $autop->appendChild($tmpNode);
+
+ continue;
+ }
+ }
+ if ($isBlock || ! $node->nextSibling) {
+ if ($isBlock) {
+ if (in_array($node->nodeName, $this->_descendList)) {
+ $elsToProcess[] = $node;
+ //$this->_addParagraphs($node);
+ }
+ }
+ $openP = true;
+ $ltrimFirstTextNode = true;
+ }
+ if ($alterInline) {
+ if (! $isBlock) {
+ $tmpNode = $node;
+ if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
+ $inlinesToProcess[] = $tmpNode;
+ }
+ $node = $node->nextSibling;
+ $autop->appendChild($tmpNode);
+ continue;
+ }
+ }
+
+ $node = $node->nextSibling;
+ }
+ }
+
+ // handle inline nodes
+ // no need to recurse, just queue up
+ while ($el = array_shift($inlinesToProcess)) {
+ $ignoreLeadingNewline = false;
+ foreach ($el->childNodes as $node) {
+ if ($node->nodeType === XML_ELEMENT_NODE) {
+ if ($node->nodeValue === 'BR') {
+ $ignoreLeadingNewline = true;
+ } else {
+ $ignoreLeadingNewline = false;
+ if (false !== strpos($node->textContent, "\n")) {
+ $inlinesToProcess[] = $node;
+ }
+ }
+ continue;
+ } elseif ($node->nodeType === XML_TEXT_NODE) {
+ $text = $node->nodeValue;
+ if ($text[0] === "\n" && $ignoreLeadingNewline) {
+ $text = substr($text, 1);
+ $ignoreLeadingNewline = false;
+ }
+ $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
+ }
+ }
+ }
+ }
+}