From 3bf72994688ad9292bf37444d80ab5ab1a002748 Mon Sep 17 00:00:00 2001 From: Paweł Sroka Date: Sun, 4 Nov 2012 08:25:28 +0100 Subject: Fixes #1479 - Replaces WP autop with implementation from Steve Clay. --- engine/tests/api/output.php | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 engine/tests/api/output.php (limited to 'engine/tests/api/output.php') diff --git a/engine/tests/api/output.php b/engine/tests/api/output.php new file mode 100644 index 000000000..eb1a66b29 --- /dev/null +++ b/engine/tests/api/output.php @@ -0,0 +1,64 @@ + + */ +class ElggCoreOutputAutoPTest extends ElggCoreUnitTest { + + /** + * @var ElggAutop + */ + protected $_autop; + + public function setUp() { + $this->_autop = new ElggAutop(); + } + + public function testDomRoundtrip() + { + $d = dir(dirname(__DIR__) . '/test_files/output/autop'); + $in = file_get_contents($d->path . "/domdoc_in.html"); + $exp = file_get_contents($d->path . "/domdoc_exp.html"); + + $doc = new DOMDocument(); + libxml_use_internal_errors(true); + $doc->loadHTML("" + . $in . ''); + $serialized = $doc->saveHTML(); + list(,$out) = explode('', $serialized, 2); + list($out) = explode('', $out, 2); + + $this->assertEqual($exp, $out, "DOMDocument's parsing/serialization roundtrip"); + } + + public function testProcess() + { + $data = $this->provider(); + foreach ($data as $row) { + list($test, $in, $exp) = $row; + $out = $this->_autop->process($in); + $this->assertEqual($exp, $out, "Equality case {$test}"); + } + } + + public function provider() + { + $d = dir(dirname(__DIR__) . '/test_files/output/autop'); + $tests = array(); + while (false !== ($entry = $d->read())) { + if (preg_match('/^([a-z\\-]+)\.in\.html$/i', $entry, $m)) { + $tests[] = $m[1]; + } + } + + $data = array(); + foreach ($tests as $test) { + $data[] = array( + $test, + file_get_contents($d->path . '/' . "{$test}.in.html"), + file_get_contents($d->path . '/' . "{$test}.exp.html"), + ); + } + return $data; + } +} -- cgit v1.2.3 From 43a395ae735777bfb5474c4f6a37dc1cd0818a37 Mon Sep 17 00:00:00 2001 From: Brett Profitt Date: Mon, 10 Dec 2012 15:50:25 -0500 Subject: Fixes #1479. Added ElggAutoP. Removing [\n\r] from test strings before compare to deal with differing whitespace between tags among PHP versions. --- engine/classes/ElggAutoP.php | 309 ++++++++++++++++++++++++++++++++++++++++++ engine/classes/ElggAutop.php | 315 ------------------------------------------- engine/lib/output.php | 11 +- engine/tests/api/output.php | 138 ++++++++++--------- 4 files changed, 389 insertions(+), 384 deletions(-) create mode 100644 engine/classes/ElggAutoP.php delete mode 100644 engine/classes/ElggAutop.php (limited to 'engine/tests/api/output.php') diff --git a/engine/classes/ElggAutoP.php b/engine/classes/ElggAutoP.php new file mode 100644 index 000000000..89d77e583 --- /dev/null +++ b/engine/classes/ElggAutoP.php @@ -0,0 +1,309 @@ +_blocks = preg_split('@\\s+@', $this->_blocks); + $this->_descendList = preg_split('@\\s+@', $this->_descendList); + $this->_alterList = preg_split('@\\s+@', $this->_alterList); + $this->_inlines = preg_split('@\\s+@', $this->_inlines); + $this->_unique = md5(__FILE__); + } + + /** + * Intance of class for singleton pattern. + * @var ElggAutoP + */ + private static $instance; + + /** + * Singleton pattern. + * @return ElggAutoP + */ + public static function getInstance() { + $className = __CLASS__; + if (!(self::$instance instanceof $className)) { + self::$instance = new $className(); + } + return self::$instance; + } + + /** + * Create wrapper P and BR elements in HTML depending on newlines. Useful when + * users use newlines to signal line and paragraph breaks. In all cases output + * should be well-formed markup. + * + * In DIV, LI, TD, and TH elements, Ps are only added when their would be at + * least two of them. + * + * @param string $html snippet + * @return string|false output or false if parse error occurred + */ + public function process($html) { + // normalize whitespace + $html = str_replace(array("\r\n", "\r"), "\n", $html); + + // allows preserving entities untouched + $html = str_replace('&', $this->_unique . 'AMP', $html); + + $this->_doc = new DOMDocument(); + + // parse to DOM, suppressing loadHTML warnings + // http://www.php.net/manual/en/domdocument.loadhtml.php#95463 + libxml_use_internal_errors(true); + + if (!$this->_doc->loadHTML("{$html}" + . "")) { + return false; + } + + $this->_xpath = new DOMXPath($this->_doc); + // start processing recursively at the BODY element + $nodeList = $this->_xpath->query('//body[1]'); + $this->_addParagraphs($nodeList->item(0)); + + // serialize back to HTML + $html = $this->_doc->saveHTML(); + + // split AUTOPs into multiples at /\n\n+/ + $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '', $html); + $html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '
'), + '
', + $html); + $html = str_replace('
', '', $html); + + // re-parse so we can handle new AUTOP elements + + if (!$this->_doc->loadHTML($html)) { + return false; + } + // must re-create XPath object after DOM load + $this->_xpath = new DOMXPath($this->_doc); + + // strip AUTOPs that only have comments/whitespace + foreach ($this->_xpath->query('//autop') as $autop) { + $hasContent = false; + if (trim($autop->textContent) !== '') { + $hasContent = true; + } else { + foreach ($autop->childNodes as $node) { + if ($node->nodeType === XML_ELEMENT_NODE) { + $hasContent = true; + break; + } + } + } + if (!$hasContent) { + // strip w/ preg_replace later (faster than moving nodes out) + $autop->setAttribute("r", "1"); + } + } + + // remove a single AUTOP inside certain elements + foreach ($this->_xpath->query('//div') as $el) { + $autops = $this->_xpath->query('./autop', $el); + if ($autops->length === 1) { + // strip w/ preg_replace later (faster than moving nodes out) + $autops->item(0)->setAttribute("r", "1"); + } + } + + $html = $this->_doc->saveHTML(); + + // trim to the contents of BODY + $bodyStart = strpos($html, ''); + $bodyEnd = strpos($html, '', $bodyStart + 6); + $html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6); + + // strip AUTOPs that should be removed + $html = preg_replace('@(.*?)@', '\\1', $html); + + // commit to converting AUTOPs to Ps + $html = str_replace('', "\n

", $html); + $html = str_replace('', "

\n", $html); + + $html = str_replace('
', '
', $html); + $html = str_replace($this->_unique . 'AMP', '&', $html); + return $html; + } + + /** + * Add P and BR elements as necessary + * + * @param DOMElement $el + */ + protected function _addParagraphs(DOMElement $el) { + // no need to recurse, just queue up + $elsToProcess = array($el); + $inlinesToProcess = array(); + while ($el = array_shift($elsToProcess)) { + // if true, we can alter all child nodes, if not, we'll just call + // _addParagraphs on each element in the descendInto list + $alterInline = in_array($el->nodeName, $this->_alterList); + + // inside affected elements, we want to trim leading whitespace from + // the first text node + $ltrimFirstTextNode = true; + + // should we open a new AUTOP element to move inline elements into? + $openP = true; + $autop = null; + + // after BR, ignore a newline + $isFollowingBr = false; + + $node = $el->firstChild; + while (null !== $node) { + if ($alterInline) { + if ($openP) { + $openP = false; + // create a P to move inline content into (this may be removed later) + $autop = $el->insertBefore($this->_doc->createElement('autop'), $node); + } + } + + $isElement = ($node->nodeType === XML_ELEMENT_NODE); + if ($isElement) { + $elName = $node->nodeName; + } + $isBlock = ($isElement && in_array($elName, $this->_blocks)); + + if ($alterInline) { + $isInline = $isElement && ! $isBlock; + $isText = ($node->nodeType === XML_TEXT_NODE); + $isLastInline = (! $node->nextSibling + || ($node->nextSibling->nodeType === XML_ELEMENT_NODE + && in_array($node->nextSibling->nodeName, $this->_blocks))); + if ($isElement) { + $isFollowingBr = ($node->nodeName === 'br'); + } + + if ($isText) { + $nodeText = $node->nodeValue; + if ($ltrimFirstTextNode) { + $nodeText = ltrim($nodeText); + $ltrimFirstTextNode = false; + } + if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) { + // if a user ends a line with
, don't add a second BR + $nodeText = substr($nodeText, strlen($m[0])); + } + if ($isLastInline) { + $nodeText = rtrim($nodeText); + } + $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText); + $tmpNode = $node; + $node = $node->nextSibling; // move loop to next node + + // alter node in place, then move into AUTOP + $tmpNode->nodeValue = $nodeText; + $autop->appendChild($tmpNode); + + continue; + } + } + if ($isBlock || ! $node->nextSibling) { + if ($isBlock) { + if (in_array($node->nodeName, $this->_descendList)) { + $elsToProcess[] = $node; + //$this->_addParagraphs($node); + } + } + $openP = true; + $ltrimFirstTextNode = true; + } + if ($alterInline) { + if (! $isBlock) { + $tmpNode = $node; + if ($isElement && false !== strpos($tmpNode->textContent, "\n")) { + $inlinesToProcess[] = $tmpNode; + } + $node = $node->nextSibling; + $autop->appendChild($tmpNode); + continue; + } + } + + $node = $node->nextSibling; + } + } + + // handle inline nodes + // no need to recurse, just queue up + while ($el = array_shift($inlinesToProcess)) { + $ignoreLeadingNewline = false; + foreach ($el->childNodes as $node) { + if ($node->nodeType === XML_ELEMENT_NODE) { + if ($node->nodeValue === 'BR') { + $ignoreLeadingNewline = true; + } else { + $ignoreLeadingNewline = false; + if (false !== strpos($node->textContent, "\n")) { + $inlinesToProcess[] = $node; + } + } + continue; + } elseif ($node->nodeType === XML_TEXT_NODE) { + $text = $node->nodeValue; + if ($text[0] === "\n" && $ignoreLeadingNewline) { + $text = substr($text, 1); + $ignoreLeadingNewline = false; + } + $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text); + } + } + } + } +} diff --git a/engine/classes/ElggAutop.php b/engine/classes/ElggAutop.php deleted file mode 100644 index fa0c34225..000000000 --- a/engine/classes/ElggAutop.php +++ /dev/null @@ -1,315 +0,0 @@ - - * @license http://www.opensource.org/licenses/mit-license.php MIT License - */ -class ElggAutop { - - public $encoding = 'UTF-8'; - - /** - * @var DOMDocument - */ - protected $_doc = null; - - /** - * @var DOMXPath - */ - protected $_xpath = null; - - protected $_blocks = 'address article area aside blockquote caption col colgroup dd - details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header - hr hgroup legend map math menu nav noscript p pre section select style summary - table tbody td tfoot th thead tr ul ol option li'; - - /** - * @var array - */ - protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist - del dfn em embed i iframe img input ins kbd keygen label map mark meter object - output progress q rp rt ruby s samp script select small source span strong style - sub sup textarea time var video wbr'; - - /** - * Descend into these elements to add Ps - * - * @var array - */ - protected $_descendList = 'article aside blockquote body details div footer form - header section'; - - /** - * Add Ps inside these elements - * - * @var array - */ - protected $_alterList = 'article aside blockquote body details div footer header - section'; - - protected $_unique = ''; - - public function __construct() - { - $this->_blocks = preg_split('@\\s+@', $this->_blocks); - $this->_descendList = preg_split('@\\s+@', $this->_descendList); - $this->_alterList = preg_split('@\\s+@', $this->_alterList); - $this->_inlines = preg_split('@\\s+@', $this->_inlines); - $this->_unique = md5(__FILE__); - } - - /** - * Intance of class for singleton pattern. - * @var ElggAutop - */ - private static $instance; - - /** - * Singleton pattern. - * @return ElggAutop - */ - public static function getInstance() { - $className = __CLASS__; - if (!(self::$instance instanceof $className)) { - self::$instance = new $className(); - } - return self::$instance; - } - - /** - * Create wrapper P and BR elements in HTML depending on newlines. Useful when - * users use newlines to signal line and paragraph breaks. In all cases output - * should be well-formed markup. - * - * In DIV, LI, TD, and TH elements, Ps are only added when their would be at - * least two of them. - * - * @param string $html snippet - * @return string|false output or false if parse error occurred - */ - public function process($html) - { - // normalize whitespace - $html = str_replace(array("\r\n", "\r"), "\n", $html); - - // allows preserving entities untouched - $html = str_replace('&', $this->_unique . 'AMP', $html); - - $this->_doc = new DOMDocument(); - - // parse to DOM, suppressing loadHTML warnings - // http://www.php.net/manual/en/domdocument.loadhtml.php#95463 - libxml_use_internal_errors(true); - if (! @$this->_doc->loadHTML("{$html}" - . "")) { - return false; - } - - $this->_xpath = new DOMXPath($this->_doc); - // start processing recursively at the BODY element - $nodeList = $this->_xpath->query('//body[1]'); - $this->_addParagraphs($nodeList->item(0)); - - // serialize back to HTML - $html = $this->_doc->saveHTML(); - - // split AUTOPs into multiples at /\n\n+/ - $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '
', $html); - $html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '
'), - '
', - $html); - $html = str_replace('
', '', $html); - - // re-parse so we can handle new AUTOP elements - - if (! @$this->_doc->loadHTML($html)) { - return false; - } - // must re-create XPath object after DOM load - $this->_xpath = new DOMXPath($this->_doc); - - // strip AUTOPs that only have comments/whitespace - foreach ($this->_xpath->query('//autop') as $autop) { - $hasContent = false; - if (trim($autop->textContent) !== '') { - $hasContent = true; - } else { - foreach ($autop->childNodes as $node) { - if ($node->nodeType === XML_ELEMENT_NODE) { - $hasContent = true; - break; - } - } - } - if (! $hasContent) { - // strip w/ preg_replace later (faster than moving nodes out) - $autop->setAttribute("r", "1"); - } - } - - // remove a single AUTOP inside certain elements - - foreach ($this->_xpath->query('//div') as $el) { - $autops = $this->_xpath->query('./autop', $el); - if ($autops->length === 1) { - // strip w/ preg_replace later (faster than moving nodes out) - $autops->item(0)->setAttribute("r", "1"); - } - } - - $html = $this->_doc->saveHTML(); - - // trim to the contents of BODY - $bodyStart = strpos($html, ''); - $bodyEnd = strpos($html, '', $bodyStart + 6); - $html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6); - - // strip AUTOPs that should be removed - $html = preg_replace('@(.*?)@', '\\1', $html); - - // commit to converting AUTOPs to Ps - $html = str_replace('', "\n

", $html); - $html = str_replace('', "

\n", $html); - - $html = str_replace('
', '
', $html); - $html = str_replace($this->_unique . 'AMP', '&', $html); - return $html; - } - - /** - * Add P and BR elements as necessary - * - * @param DOMElement $el - */ - protected function _addParagraphs(DOMElement $el) - { - // no need to recurse, just queue up - $elsToProcess = array($el); - $inlinesToProcess = array(); - while ($el = array_shift($elsToProcess)) { - // if true, we can alter all child nodes, if not, we'll just call - // _addParagraphs on each element in the descendInto list - $alterInline = in_array($el->nodeName, $this->_alterList); - - // inside affected elements, we want to trim leading whitespace from - // the first text node - $ltrimFirstTextNode = true; - - // should we open a new AUTOP element to move inline elements into? - $openP = true; - $autop = null; - - // after BR, ignore a newline - $isFollowingBr = false; - - $node = $el->firstChild; - while (null !== $node) { - if ($alterInline) { - if ($openP) { - $openP = false; - // create a P to move inline content into (this may be removed later) - $autop = $el->insertBefore($this->_doc->createElement('autop'), $node); - } - } - - $isElement = ($node->nodeType === XML_ELEMENT_NODE); - if ($isElement) { - $elName = $node->nodeName; - } - $isBlock = ($isElement && in_array($elName, $this->_blocks)); - - if ($alterInline) { - $isInline = $isElement && ! $isBlock; - $isText = ($node->nodeType === XML_TEXT_NODE); - $isLastInline = (! $node->nextSibling - || ($node->nextSibling->nodeType === XML_ELEMENT_NODE - && in_array($node->nextSibling->nodeName, $this->_blocks))); - if ($isElement) { - $isFollowingBr = ($node->nodeName === 'br'); - } - - if ($isText) { - $nodeText = $node->nodeValue; - if ($ltrimFirstTextNode) { - $nodeText = ltrim($nodeText); - $ltrimFirstTextNode = false; - } - if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) { - // if a user ends a line with
, don't add a second BR - $nodeText = substr($nodeText, strlen($m[0])); - } - if ($isLastInline) { - $nodeText = rtrim($nodeText); - } - $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText); - $tmpNode = $node; - $node = $node->nextSibling; // move loop to next node - - // alter node in place, then move into AUTOP - $tmpNode->nodeValue = $nodeText; - $autop->appendChild($tmpNode); - - continue; - } - } - if ($isBlock || ! $node->nextSibling) { - if ($isBlock) { - if (in_array($node->nodeName, $this->_descendList)) { - $elsToProcess[] = $node; - //$this->_addParagraphs($node); - } - } - $openP = true; - $ltrimFirstTextNode = true; - } - if ($alterInline) { - if (! $isBlock) { - $tmpNode = $node; - if ($isElement && false !== strpos($tmpNode->textContent, "\n")) { - $inlinesToProcess[] = $tmpNode; - } - $node = $node->nextSibling; - $autop->appendChild($tmpNode); - continue; - } - } - - $node = $node->nextSibling; - } - } - - // handle inline nodes - // no need to recurse, just queue up - while ($el = array_shift($inlinesToProcess)) { - $ignoreLeadingNewline = false; - foreach ($el->childNodes as $node) { - if ($node->nodeType === XML_ELEMENT_NODE) { - if ($node->nodeValue === 'BR') { - $ignoreLeadingNewline = true; - } else { - $ignoreLeadingNewline = false; - if (false !== strpos($node->textContent, "\n")) { - $inlinesToProcess[] = $node; - } - } - continue; - } elseif ($node->nodeType === XML_TEXT_NODE) { - $text = $node->nodeValue; - if ($text[0] === "\n" && $ignoreLeadingNewline) { - $text = substr($text, 1); - $ignoreLeadingNewline = false; - } - $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text); - } - } - } - } -} diff --git a/engine/lib/output.php b/engine/lib/output.php index cce1c7cba..bff0bf6e9 100644 --- a/engine/lib/output.php +++ b/engine/lib/output.php @@ -16,7 +16,7 @@ **/ function parse_urls($text) { // @todo this causes problems with - // must be ing format (no space). + // must be in format (no space). // By default htmlawed rewrites tags to this format. // if PHP supported conditional negative lookbehinds we could use this: // $r = preg_replace_callback('/(?"\'\!\(\),]+)/i', @@ -46,6 +46,7 @@ function parse_urls($text) { * * @param string $pee The string * @deprecated Use elgg_autop instead + * @todo Add deprecation warning in 1.9 * * @return string **/ @@ -56,12 +57,12 @@ function autop($pee) { /** * Create paragraphs from text with line spacing * - * @param string $pee The string + * @param string $string The string * * @return string **/ -function elgg_autop($pee) { - return ElggAutop::getInstance()->process($pee); +function elgg_autop($string) { + return ElggAutoP::getInstance()->process($string); } /** @@ -358,7 +359,7 @@ function elgg_get_friendly_time($time) { /** * Strip tags and offer plugins the chance. * Plugins register for output:strip_tags plugin hook. - * Original string included in $params['original_string'] + * Original string included in $params['original_string'] * * @param string $string Formatted string * diff --git a/engine/tests/api/output.php b/engine/tests/api/output.php index eb1a66b29..c3d5aa8c6 100644 --- a/engine/tests/api/output.php +++ b/engine/tests/api/output.php @@ -1,64 +1,74 @@ - - */ -class ElggCoreOutputAutoPTest extends ElggCoreUnitTest { - - /** - * @var ElggAutop - */ - protected $_autop; - - public function setUp() { - $this->_autop = new ElggAutop(); - } - - public function testDomRoundtrip() - { - $d = dir(dirname(__DIR__) . '/test_files/output/autop'); - $in = file_get_contents($d->path . "/domdoc_in.html"); - $exp = file_get_contents($d->path . "/domdoc_exp.html"); - - $doc = new DOMDocument(); - libxml_use_internal_errors(true); - $doc->loadHTML("" - . $in . ''); - $serialized = $doc->saveHTML(); - list(,$out) = explode('', $serialized, 2); - list($out) = explode('', $out, 2); - - $this->assertEqual($exp, $out, "DOMDocument's parsing/serialization roundtrip"); - } - - public function testProcess() - { - $data = $this->provider(); - foreach ($data as $row) { - list($test, $in, $exp) = $row; - $out = $this->_autop->process($in); - $this->assertEqual($exp, $out, "Equality case {$test}"); - } - } - - public function provider() - { - $d = dir(dirname(__DIR__) . '/test_files/output/autop'); - $tests = array(); - while (false !== ($entry = $d->read())) { - if (preg_match('/^([a-z\\-]+)\.in\.html$/i', $entry, $m)) { - $tests[] = $m[1]; - } - } - - $data = array(); - foreach ($tests as $test) { - $data[] = array( - $test, - file_get_contents($d->path . '/' . "{$test}.in.html"), - file_get_contents($d->path . '/' . "{$test}.exp.html"), - ); - } - return $data; - } -} +_autop = new ElggAutoP(); + } + + public function testDomRoundtrip() { + $d = dir(dirname(dirname(__FILE__)) . '/test_files/output/autop'); + $in = file_get_contents($d->path . "/domdoc_in.html"); + $exp = file_get_contents($d->path . "/domdoc_exp.html"); + $exp = $this->flattenString($exp); + + $doc = new DOMDocument(); + libxml_use_internal_errors(true); + $doc->loadHTML("" + . $in . ''); + $serialized = $doc->saveHTML(); + list(,$out) = explode('', $serialized, 2); + list($out) = explode('', $out, 2); + $out = $this->flattenString($out); + + $this->assertEqual($exp, $out, "DOMDocument's parsing/serialization roundtrip"); + } + + public function testProcess() { + $data = $this->provider(); + foreach ($data as $row) { + list($test, $in, $exp) = $row; + $exp = $this->flattenString($exp); + $out = $this->_autop->process($in); + $out = $this->flattenString($out); + + $this->assertEqual($exp, $out, "Equality case {$test}"); + } + } + + public function provider() { + $d = dir(dirname(dirname(__FILE__)) . '/test_files/output/autop'); + $tests = array(); + while (false !== ($entry = $d->read())) { + if (preg_match('/^([a-z\\-]+)\.in\.html$/i', $entry, $m)) { + $tests[] = $m[1]; + } + } + + $data = array(); + foreach ($tests as $test) { + $data[] = array( + $test, + file_get_contents($d->path . '/' . "{$test}.in.html"), + file_get_contents($d->path . '/' . "{$test}.exp.html"), + ); + } + return $data; + } + + /** + * Different versions of PHP return different whitespace between tags. + * Removing all line breaks normalizes that. + */ + public function flattenString($string) { + $r = preg_replace('/[\n\r]+/', '', $string); + return $r; + } +} \ No newline at end of file -- cgit v1.2.3