diff options
author | brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544> | 2010-01-30 22:44:04 +0000 |
---|---|---|
committer | brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544> | 2010-01-30 22:44:04 +0000 |
commit | 701567f5e5e0c0bfb76744e535b55f863323859a (patch) | |
tree | 9e426c11203d1433de892b03b08d31dccbed3e7c /mod/htmlawed/vendors/htmLawed | |
parent | 0068d7f46452188f807e413f6cbd32cd765e6530 (diff) | |
download | elgg-701567f5e5e0c0bfb76744e535b55f863323859a.tar.gz elgg-701567f5e5e0c0bfb76744e535b55f863323859a.tar.bz2 |
Fixes #1425, Fixes #1341: Upgraded htmlawed to latest. Altered the htmlawed attribute filtering function to return <attr="val"> for proper linking in parse_urls(). Added background-color as a non-filtered style attribute.
git-svn-id: http://code.elgg.org/elgg/trunk@3862 36083f99-b078-4883-b0ff-0f9b5a30f544
Diffstat (limited to 'mod/htmlawed/vendors/htmLawed')
-rw-r--r-- | mod/htmlawed/vendors/htmLawed/htmLawed.php | 14 | ||||
-rw-r--r-- | mod/htmlawed/vendors/htmLawed/htmLawedTest.php | 25 | ||||
-rw-r--r-- | mod/htmlawed/vendors/htmLawed/htmLawed_README.htm | 105 | ||||
-rw-r--r-- | mod/htmlawed/vendors/htmLawed/htmLawed_README.txt | 67 | ||||
-rw-r--r-- | mod/htmlawed/vendors/htmLawed/htmLawed_TESTCASE.txt | 5 |
5 files changed, 176 insertions, 40 deletions
diff --git a/mod/htmlawed/vendors/htmLawed/htmLawed.php b/mod/htmlawed/vendors/htmLawed/htmLawed.php index 7f9a43a92..2556fdcf2 100644 --- a/mod/htmlawed/vendors/htmLawed/htmLawed.php +++ b/mod/htmlawed/vendors/htmLawed/htmLawed.php @@ -1,7 +1,7 @@ <?php /* -htmLawed 1.1.8, 23 April 2009 +htmLawed 1.1.9, 22 December 2009 Copyright Santosh Patnaik GPL v3 license A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -37,7 +37,7 @@ else{ $C['elements'] =& $e; // config attrs $x = !empty($C['deny_attribute']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute']) : ''; -$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. ($C['safe'] == 1 ? ',on*' : ''))); +$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. (!empty($C['safe']) ? ',on*' : ''))); if(isset($x['on*'])){ unset($x['on*']); $x += array('onblur'=>1, 'onchange'=>1, 'onclick'=>1, 'ondblclick'=>1, 'onfocus'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onreset'=>1, 'onselect'=>1, 'onsubmit'=>1); @@ -419,10 +419,7 @@ if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ return (($C['keep_bad']%2) ? 
str_replace(array('<', '>'), array('<', '>'), $t) : ''); } // attr string -$a = str_replace(array("\xad", "\n", "\r", "\t"), ' ', trim($m[3])); -if(strpos($a, '&') !== false){ - str_replace(array('­', '­', '­'), ' ', $a); -} +$a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3])); // tag transform static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated if($C['make_tag_strict'] && isset($eD[$e])){ @@ -506,6 +503,7 @@ foreach($aA as $k=>$v){ $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v); $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ + $v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v)); $v = hl_prot($v, $k); if($k == 'href'){ // X-spam if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ @@ -690,7 +688,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array( function hl_version(){ // rel -return '1.1.8'; +return '1.1.9'; // eof } @@ -702,8 +700,6 @@ foreach($h as $k=>$v){ $C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0; $C['keep_bad'] = 1; $C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*'; -print_r($C['elements']); -exit; $C['hook'] = 'kses_hook'; $C['schemes'] = '*:'. 
implode(',', $p); return htmLawed($t, $C, $h); diff --git a/mod/htmlawed/vendors/htmLawed/htmLawedTest.php b/mod/htmlawed/vendors/htmLawed/htmLawedTest.php index c2caaff50..160bd012d 100644 --- a/mod/htmlawed/vendors/htmLawed/htmLawedTest.php +++ b/mod/htmlawed/vendors/htmLawed/htmLawedTest.php @@ -1,8 +1,8 @@ <?php /* -htmLawedTest.php, 23 April 2009 -htmLawed 1.1.8, 23 April 2009 +htmLawedTest.php, 16 July 2009 +htmLawed 1.1.9, 22 December 2009 Copyright Santosh Patnaik GPL v3 license A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -132,7 +132,7 @@ function hexdump($d){ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html lang="en" xml:lang="en"> <head> -<meta http-equiv="Content-Type" content="text/html; charset=<?php echo htmlspecialchars($_POST['enc']); ?>" /> +<meta http-equiv="content-type" content="text/html; charset=utf-8" /> <meta name="description" content="htmLawed <?php echo hl_version();?> test page" /> <style type="text/css"><!--/*--><![CDATA[/*><!--*/ a, a.resizer{text-decoration:none;} @@ -542,7 +542,7 @@ if($do){ $st = microtime(); $out = htmLawed($_POST['text'], $cfg, str_replace(array('$', '{'), '', $_POST['spec'])); $et = microtime(); - echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code »</span></a> <span class="help" title="tags estimated as half of total > and < chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2), '</big> tags</small> </span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? 
' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary » </span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized settings » </span></a> <div id="settingF" style="display: none;">', str_replace(array(' ', "\t", ' '), array(' ', ' ', ' '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output »</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). 
'</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>'; + echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code »</span></a> <span class="help" title="tags estimated as half of total > and < chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2), '</big> tags</small> </span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary » </span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). 
'</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings » </span></a> <div id="settingF" style="display: none;">', str_replace(array(' ', "\t", ' '), array(' ', ' ', ' '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output »</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). '</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>'; if($_w3c_validate && $validation) { ?> @@ -559,22 +559,23 @@ else{ <br /> -<div class="help">Use with a Javascript- and cookie-enabled, relatively new version of a common browser. +<div class="help">Use with a Javascript- and cookie-enabled, relatively new version of a common browser. <em>Submitted input will also be HTML-rendered (XHTML 1) after htmLawed-filtering.</em> <?php echo (file_exists('./htmLawed_TESTCASE.txt') ? 
'<br /><br />You can use text from <a href="htmLawed_TESTCASE.txt"><span class="notice">this collection of test-cases</span></a> in the input. Set the character encoding of the browser to Unicode/utf-8 before copying.' : ''); ?> -<br /><br />For more about the anti-XSS capability of htmLawed, see <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm"><span class="notice">this page</span></a>. -<br /><br /><em>Submitted input will also be HTML-rendered (XHTML 1) after htmLawed-filtering.</em> -<br /><br />Change <em>Encoding</em> to reflect the character encoding of the input text. Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important. -<br /><br />Refer to the htmLawed documentation (<a href="htmLawed_README.htm"><span class="notice">htm</span></a>/<a href="htmLawed_README.txt"><span class="notice">txt</span></a>) for details about <em>Settings</em>, and htmLawed's behavior and limitations. -<br /><br />For <em>Settings</em>, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the <em>Spec</em> field. -<br /><br />Hovering the mouse over some of the text can provide additional information in some browsers. +<br /><br />For anti-XSS tests, try the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawedSafeModeTest.php"><span class="notice">special test-page</span></a> or see <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm"><span class="notice">these results</span></a>. + +<br /><br /><small>Change <em>Encoding</em> to reflect the character encoding of the input text. 
Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important. +<br /><br />Refer to the htmLawed documentation (<a href="htmLawed_README.htm"><span class="notice">htm</span></a>/<a href="htmLawed_README.txt"><span class="notice">txt</span></a>) for details about <em>Settings</em>, and htmLawed's behavior and limitations. For <em>Settings</em>, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the <em>Spec</em> field. + + +<br /><br />Hovering the mouse over some of the text can provide additional information in some browsers.</small> <?php if($_w3c_validate){ ?> -<br /><br />Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. Local applications like the <em>HTML Validator</em> Firefox browser add-on may be useful in such cases. +<small><br /><br />Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. 
Local applications like the <em>HTML Validator</em> Firefox browser add-on may be useful in such cases.</small> <?php } diff --git a/mod/htmlawed/vendors/htmLawed/htmLawed_README.htm b/mod/htmlawed/vendors/htmLawed/htmLawed_README.htm index 131838ade..7138ee9c0 100644 --- a/mod/htmlawed/vendors/htmLawed/htmLawed_README.htm +++ b/mod/htmlawed/vendors/htmLawed/htmLawed_README.htm @@ -64,7 +64,7 @@ span.totop a, span.totop a:visited {color: #6699cc;}   <span class="toc-item"><a href="#s2.6"><span class="item-no">2.6</span>  Use without modifying old <span class="term">kses()</span> code</a></span><br />   <span class="toc-item"><a href="#s2.7"><span class="item-no">2.7</span>  Tolerance for ill-written HTML</a></span><br />   <span class="toc-item"><a href="#s2.8"><span class="item-no">2.8</span>  Limitations & work-arounds</a></span><br /> -  <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>  Examples</a></span><br /> +  <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>  Examples of usage</a></span><br /> <span class="toc-item"><a href="#s3"><span class="item-no">3</span>  Details</a></span><br />   <span class="toc-item"><a href="#s3.1"><span class="item-no">3.1</span>  Invalid/dangerous characters</a></span><br />   <span class="toc-item"><a href="#s3.2"><span class="item-no">3.2</span>  Character references/entities</a></span><br /> @@ -110,8 +110,8 @@ span.totop a, span.totop a:visited {color: #6699cc;} <div id="body"> <br /> -<div class="comment">htmLawed_README.txt, 23 April 2009<br /> -htmLawed 1.1.8, 23 April 2009<br /> +<div class="comment">htmLawed_README.txt, 22 December 2009<br /> +htmLawed 1.1.9, 22 December 2009<br /> Copyright Santosh Patnaik<br /> GPL v3 license<br /> A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a> </div> @@ -180,7 +180,7 @@ A PHP Labware internal utility - 
<a href="http://www.bioinformatics.org/phpl <br />   *  remove <strong>null</strong> characters  *<br />   *  neutralize potentially dangerous proprietary Netscape <strong>Javascript entities</strong>  *<br /> -  *  replace potentially dangerous <strong>soft-hyphen</strong> character in attribute values with spaces  *<br /> +  *  replace potentially dangerous <strong>soft-hyphen</strong> character in URL-accepting attribute values with spaces  *<br /> <br />   *  remove common <strong>invalid characters</strong> not allowed in HTML or XML  ^`<br />   *  replace <strong>characters from Microsoft applications</strong> like <span class="term">Word</span> that are discouraged in HTML or XML  ^~`<br /> @@ -726,9 +726,92 @@ A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phpl </div> <div class="sub-section"><h3> -<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>  Examples +<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>  Examples of usage </h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> <br /> +  Safest, allowing only <em>safe</em> HTML markup --<br /> +<br /> + +<code class="code">    $config = array('safe'=>1);</code> +<br /> + +<code class="code">    $out = htmLawed($in);</code> +<br /> +<br /> +  Simplest, allowing all valid HTML markup except <span class="term">javascript:</span> --<br /> +<br /> + +<code class="code">    $out = htmLawed($in);</code> +<br /> +<br /> +  Allowing all valid HTML markup including <span class="term">javascript:</span> --<br /> +<br /> + +<code class="code">    $config = array('schemes'=>'*:*');</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Allowing only <span class="term">safe</span> HTML and the elements <span class="term">a</span>, <span class="term">em</span>, and <span class="term">strong</span> --<br /> +<br /> + +<code class="code">    $config = array('safe'=>1, 'elements'=>'a, em, 
strong');</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Not allowing elements <span class="term">script</span> and <span class="term">object</span> --<br /> +<br /> + +<code class="code">    $config = array('elements'=>'* -script -object');</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Not allowing attributes <span class="term">id</span> and <span class="term">style</span> --<br /> +<br /> + +<code class="code">    $config = array('deny_attribute'=>'id, style');</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Permitting only attributes <span class="term">title</span> and <span class="term">href</span> --<br /> +<br /> + +<code class="code">    $config = array('deny_attribute'=>'* -title -href');</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Remove bad/disallowed tags altogether instead of converting them to entities --<br /> +<br /> + +<code class="code">    $config = array('keep_bad'=>0);</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config);</code> +<br /> +<br /> +  Allowing attribute <span class="term">title</span> only in <span class="term">a</span> and not allowing attributes <span class="term">id</span>, <span class="term">style</span>, or scriptable <em>on*</em> attributes like <span class="term">onclick</span> --<br /> +<br /> + +<code class="code">    $config = array('deny_attribute'=>'title, id, style, on*');</code> +<br /> + +<code class="code">    $spec = 'a=title';</code> +<br /> + +<code class="code">    $out = htmLawed($in, $config, $spec);</code> +<br /> +<br /> +  Some case-studies are presented below.<br /> +<br />   <strong>1.</strong> A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span> and <span 
class="term">u</span> in comments, but needs <span class="term">strike</span> and <span class="term">u</span> transformed to <span class="term">span</span> for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span> links to be to <span class="term">http</span> or <span class="term">https</span> resources:<br /> <br /> @@ -772,14 +855,14 @@ A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phpl <br />   The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br /> <br /> -  The <span class="term">$config["clean_ms_char"]</span> parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the <span class="term">Windows 1252</span> or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br /> +  The <span class="term">$config["clean_ms_char"]</span> parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span> (<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>. 
Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br /> </div> <div class="sub-section"><h3> <a name="s3.2" id="s3.2"></a><span class="item-no">3.2</span>  Character references/entities </h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> <br /> -  Valid character entities take the form <span class="term">&*;</span> where <span class="term">*</span> is <span class="term">#x</span> followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&#xA0;</span> for non-breaking space), or alphanumeric like <span class="term">gt</span> (external or named entity; like <span class="term">&nbsp;</span> for non-breaking space), or <span class="term">#</span> followed by a number (decimal numeric entity; like <span class="term">&#160;</span> for non-breaking space). Character entities referring to the soft-hyphen character (the <span class="term">&shy;</span> or <span class="term">\xad</span> character; hexadecimal code-point <span class="term">ad</span> [decimal <span class="term">173</span>]) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br /> +  Valid character entities take the form <span class="term">&*;</span> where <span class="term">*</span> is <span class="term">#x</span> followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&#xA0;</span> for non-breaking space), or alphanumeric like <span class="term">gt</span> (external or named entity; like <span class="term">&nbsp;</span> for non-breaking space), or <span class="term">#</span> followed by a number (decimal numeric entity; like <span class="term">&#160;</span> for non-breaking space). 
Character entities referring to the soft-hyphen character (the <span class="term">&shy;</span> or <span class="term">\xad</span> character; hexadecimal code-point <span class="term">ad</span> [decimal <span class="term">173</span>]) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br /> <br />   htmLawed (function <span class="term">hl_ent()</span>):<br /> <br /> @@ -1605,6 +1688,10 @@ A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phpl <br />   <em>Version number - Release date. Notes</em><br /> <br /> +  1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values<br /> +<br /> +  1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice<br /> +<br />   1.1.8 - 23 April 2009. Parameter <span class="term">deny_attribute</span> now accepts the wild-card <span class="term">*</span>, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting <span class="term">$spec</span><br /> <br />   1.1.7 - 11-12 March 2009. Attributes globally denied through <span class="term">deny_attribute</span> can be allowed element-specifically through <span class="term">$spec</span>; <span class="term">$config["style_pass"]</span> allowing letting through any <span class="term">style</span> value introduced; altered logic to catch certain types of dynamic crafted CSS expressions<br /> @@ -1658,7 +1745,7 @@ A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phpl <a name="s4.6" id="s4.6"></a><span class="item-no">4.6</span>  Comparison with <span class="term">HTMLPurifier</span> </h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> <br /> -  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. 
Compared to htmLawed, it:<br /> +  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of mid-2009):<br /> <br />   *  does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)<br /> <br /> @@ -1970,7 +2057,7 @@ A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phpl </div> </div> <br /> -<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 23 Apr, 2009 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span> +<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 22 Dec, 2009 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span> </div><!-- ended div body --> </div><!-- ended div top --> </body> diff --git a/mod/htmlawed/vendors/htmLawed/htmLawed_README.txt b/mod/htmlawed/vendors/htmLawed/htmLawed_README.txt index 4e3afba92..48a67009b 100644 --- a/mod/htmlawed/vendors/htmLawed/htmLawed_README.txt +++ b/mod/htmlawed/vendors/htmLawed/htmLawed_README.txt @@ -1,6 +1,6 @@ /* -htmLawed_README.txt, 23 April 2009 -htmLawed 1.1.8, 23 April 2009 +htmLawed_README.txt, 22 December 2009 +htmLawed 1.1.9, 22 December 2009 Copyright Santosh Patnaik GPL v3 license A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -25,7 +25,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 2.6 Use without modifying old 'kses()' code 2.7 Tolerance for ill-written HTML 2.8 Limitations & work-arounds - 2.9 Examples + 2.9 Examples of usage 3 Details 3.1 Invalid/dangerous characters 3.2 Character references/entities @@ -131,7 +131,7 @@ A PHP Labware internal utility - 
http://www.bioinformatics.org/phplabware/intern * remove *null* characters * * neutralize potentially dangerous proprietary Netscape *Javascript entities* * - * replace potentially dangerous *soft-hyphen* character in attribute values with spaces * + * replace potentially dangerous *soft-hyphen* character in URL-accepting attribute values with spaces * * remove common *invalid characters* not allowed in HTML or XML ^` * replace *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~` @@ -618,9 +618,56 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. --- 2.9 Examples ---------------------------------------------------o +-- 2.9 Examples of usage -------------------------------------------o + Safest, allowing only `safe` HTML markup -- + + $config = array('safe'=>1); + $out = htmLawed($in, $config); + + Simplest, allowing all valid HTML markup except 'javascript:' -- + + $out = htmLawed($in); + + Allowing all valid HTML markup including 'javascript:' -- + + $config = array('schemes'=>'*:*'); + $out = htmLawed($in, $config); + + Allowing only 'safe' HTML and the elements 'a', 'em', and 'strong' -- + + $config = array('safe'=>1, 'elements'=>'a, em, strong'); + $out = htmLawed($in, $config); + + Not allowing elements 'script' and 'object' -- + + $config = array('elements'=>'* -script -object'); + $out = htmLawed($in, $config); + + Not allowing attributes 'id' and 'style' -- + + $config = array('deny_attribute'=>'id, style'); + $out = htmLawed($in, $config); + + Permitting only attributes 'title' and 'href' -- + + $config = array('deny_attribute'=>'* -title -href'); + $out = htmLawed($in, $config); + + Remove bad/disallowed tags altogether instead of converting them to entities -- + + $config = array('keep_bad'=>0); + $out = htmLawed($in,
$config); + + Allowing attribute 'title' only in 'a' and not allowing attributes 'id', 'style', or scriptable `on*` attributes like 'onclick' -- + + $config = array('deny_attribute'=>'title, id, style, on*'); + $spec = 'a=title'; + $out = htmLawed($in, $config, $spec); + + Some case-studies are presented below. + *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to be to 'http' or 'https' resources: $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); @@ -656,13 +703,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text. - The '$config["clean_ms_char"]' parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the 'Windows 1252' or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up. + The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251'. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up. 
-- 3.2 Character references/entities ------------------------------o - Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. + Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. htmLawed (function 'hl_ent()'): @@ -1241,6 +1288,10 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern `Version number - Release date. Notes` + 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values + + 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice + 1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec' 1.1.7 - 11-12 March 2009.
Attributes globally denied through 'deny_attribute' can be allowed element-specifically through '$spec'; '$config["style_pass"]' allowing letting through any 'style' value introduced; altered logic to catch certain types of dynamic crafted CSS expressions @@ -1291,7 +1342,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 4.6 Comparison with 'HTMLPurifier' -----------------------------o - The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it: + The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of mid-2009): * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) diff --git a/mod/htmlawed/vendors/htmLawed/htmLawed_TESTCASE.txt b/mod/htmlawed/vendors/htmLawed/htmLawed_TESTCASE.txt index 302a4b28c..ea24b1839 100644 --- a/mod/htmlawed/vendors/htmLawed/htmLawed_TESTCASE.txt +++ b/mod/htmlawed/vendors/htmLawed/htmLawed_TESTCASE.txt @@ -1,6 +1,6 @@ /* -htmLawed_TESTCASE.txt, 23 April 2009 -htmLawed 1.1.8, 23 April 2009 +htmLawed_TESTCASE.txt, 22 December 2009 +htmLawed 1.1.9, 22 December 2009 Copyright Santosh Patnaik GPL v3 license A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -247,6 +247,7 @@ Invalid: <em <!-- check -->>comment in tag content</em>, <!--check--> Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br /> გთხოვთ ახლავე გაიაროთ რეგისტრაცია<br /> večjezično računalništvo<br /> +<a title="อ.อ่าง">อ.อ่าง</a><br /> <a title="הירשמו כעת לכנס ">Зарегистрируйтесь сейчас на Десятую Международную Конференцию по</a><br /> |