diff options
Diffstat (limited to 'models/Auth/OpenID/Parse.php')
-rw-r--r-- | models/Auth/OpenID/Parse.php | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/models/Auth/OpenID/Parse.php b/models/Auth/OpenID/Parse.php new file mode 100644 index 000000000..6c2e72169 --- /dev/null +++ b/models/Auth/OpenID/Parse.php @@ -0,0 +1,377 @@ +<?php + +/** + * This module implements a VERY limited parser that finds <link> tags + * in the head of HTML or XHTML documents and parses out their + * attributes according to the OpenID spec. It is a liberal parser, + * but it requires these things from the data in order to work: + * + * - There must be an open <html> tag + * + * - There must be an open <head> tag inside of the <html> tag + * + * - Only <link>s that are found inside of the <head> tag are parsed + * (this is by design) + * + * - The parser follows the OpenID specification in resolving the + * attributes of the link tags. This means that the attributes DO + * NOT get resolved as they would by an XML or HTML parser. In + * particular, only certain entities get replaced, and href + * attributes do not get resolved relative to a base URL. + * + * From http://openid.net/specs.bml: + * + * - The openid.server URL MUST be an absolute URL. OpenID consumers + * MUST NOT attempt to resolve relative URLs. + * + * - The openid.server URL MUST NOT include entities other than &, + * <, >, and ". + * + * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds + * of quoting are allowed for attributes. + * + * The parser deals with invalid markup in these ways: + * + * - Tag names are not case-sensitive + * + * - The <html> tag is accepted even when it is not at the top level + * + * - The <head> tag is accepted even when it is not a direct child of + * the <html> tag, but a <html> tag must be an ancestor of the + * <head> tag + * + * - <link> tags are accepted even when they are not direct children + * of the <head> tag, but a <head> tag must be an ancestor of the + * <link> tag + * + * - If there is no closing tag for an open <html> or <head> tag, the + * remainder of the document is viewed as being inside of the + * tag. If there is no closing tag for a <link> tag, the link tag is + * treated as a short tag. Exceptions to this rule are that <html> + * closes <html> and <body> or <head> closes <head> + * + * - Attributes of the <link> tag are not required to be quoted. + * + * - In the case of duplicated attribute names, the attribute coming + * last in the tag will be the value returned. + * + * - Any text that does not parse as an attribute within a link tag + * will be ignored. (e.g. <link pumpkin rel='openid.server' /> will + * ignore pumpkin) + * + * - If there are more than one <html> or <head> tag, the parser only + * looks inside of the first one. + * + * - The contents of <script> tags are ignored entirely, except + * unclosed <script> tags. Unclosed <script> tags are ignored. + * + * - Any other invalid markup is ignored, including unclosed SGML + * comments and unclosed <![CDATA[blocks. + * + * PHP versions 4 and 5 + * + * LICENSE: See the COPYING file included in this distribution. + * + * @access private + * @package OpenID + * @author JanRain, Inc. <openid@janrain.com> + * @copyright 2005-2008 Janrain, Inc. + * @license http://www.apache.org/licenses/LICENSE-2.0 Apache + */ + +/** + * Require Auth_OpenID::arrayGet(). + */ +require_once "Auth/OpenID.php"; + +class Auth_OpenID_Parse { + + /** + * Specify some flags for use with regex matching. + */ + var $_re_flags = "si"; + + /** + * Stuff to remove before we start looking for tags + */ + var $_removed_re = + "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>"; + + /** + * Starts with the tag name at a word boundary, where the tag name + * is not a namespace + */ + var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*)(?:<\/?%s\s*>|\Z))"; + + var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)'; + + var $_open_tag_expr = "<%s\b"; + var $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>"; + + function Auth_OpenID_Parse() + { + $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s", + $this->_re_flags); + + $this->_entity_replacements = array( + 'amp' => '&', + 'lt' => '<', + 'gt' => '>', + 'quot' => '"' + ); + + $this->_attr_find = sprintf("/%s/%s", + $this->_attr_find, + $this->_re_flags); + + $this->_removed_re = sprintf("/%s/%s", + $this->_removed_re, + $this->_re_flags); + + $this->_ent_replace = + sprintf("&(%s);", implode("|", + $this->_entity_replacements)); + } + + /** + * Returns a regular expression that will match a given tag in an + * SGML string. + */ + function tagMatcher($tag_name, $close_tags = null) + { + $expr = $this->_tag_expr; + + if ($close_tags) { + $options = implode("|", array_merge(array($tag_name), $close_tags)); + $closer = sprintf("(?:%s)", $options); + } else { + $closer = $tag_name; + } + + $expr = sprintf($expr, $tag_name, $closer); + return sprintf("/%s/%s", $expr, $this->_re_flags); + } + + function openTag($tag_name) + { + $expr = sprintf($this->_open_tag_expr, $tag_name); + return sprintf("/%s/%s", $expr, $this->_re_flags); + } + + function closeTag($tag_name) + { + $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name); + return sprintf("/%s/%s", $expr, $this->_re_flags); + } + + function htmlBegin($s) + { + $matches = array(); + $result = preg_match($this->openTag('html'), $s, + $matches, PREG_OFFSET_CAPTURE); + if ($result === false || !$matches) { + return false; + } + // Return the offset of the first match. + return $matches[0][1]; + } + + function htmlEnd($s) + { + $matches = array(); + $result = preg_match($this->closeTag('html'), $s, + $matches, PREG_OFFSET_CAPTURE); + if ($result === false || !$matches) { + return false; + } + // Return the offset of the first match. + return $matches[count($matches) - 1][1]; + } + + function headFind() + { + return $this->tagMatcher('head', array('body', 'html')); + } + + function replaceEntities($str) + { + foreach ($this->_entity_replacements as $old => $new) { + $str = preg_replace(sprintf("/&%s;/", $old), $new, $str); + } + return $str; + } + + function removeQuotes($str) + { + $matches = array(); + $double = '/^"(.*)"$/'; + $single = "/^\'(.*)\'$/"; + + if (preg_match($double, $str, $matches)) { + return $matches[1]; + } else if (preg_match($single, $str, $matches)) { + return $matches[1]; + } else { + return $str; + } + } + + function match($regexp, $text, &$match) + { + if (!is_callable('mb_ereg_search_init')) { + return preg_match($regexp, $text, $match); + } + + $regexp = substr($regexp, 1, strlen($regexp) - 2 - strlen($this->_re_flags)); + mb_ereg_search_init($text); + if (!mb_ereg_search($regexp)) { + return false; + } + list($match) = mb_ereg_search_getregs(); + return true; + } + + /** + * Find all link tags in a string representing a HTML document and + * return a list of their attributes. + * + * @todo This is quite ineffective and may fail with the default + * pcre.backtrack_limit of 100000 in PHP 5.2, if $html is big. + * It should rather use stripos (in PHP5) or strpos()+strtoupper() + * in PHP4 to manage this. + * + * @param string $html The text to parse + * @return array $list An array of arrays of attributes, one for each + * link tag + */ + function parseLinkAttrs($html) + { + $stripped = preg_replace($this->_removed_re, + "", + $html); + + $html_begin = $this->htmlBegin($stripped); + $html_end = $this->htmlEnd($stripped); + + if ($html_begin === false) { + return array(); + } + + if ($html_end === false) { + $html_end = strlen($stripped); + } + + $stripped = substr($stripped, $html_begin, + $html_end - $html_begin); + + // Workaround to prevent PREG_BACKTRACK_LIMIT_ERROR: + $old_btlimit = ini_set( 'pcre.backtrack_limit', -1 ); + + // Try to find the <HEAD> tag. + $head_re = $this->headFind(); + $head_match = ''; + if (!$this->match($head_re, $stripped, $head_match)) { + ini_set( 'pcre.backtrack_limit', $old_btlimit ); + return array(); + } + + $link_data = array(); + $link_matches = array(); + + if (!preg_match_all($this->_link_find, $head_match, + $link_matches)) { + ini_set( 'pcre.backtrack_limit', $old_btlimit ); + return array(); + } + + foreach ($link_matches[0] as $link) { + $attr_matches = array(); + preg_match_all($this->_attr_find, $link, $attr_matches); + $link_attrs = array(); + foreach ($attr_matches[0] as $index => $full_match) { + $name = $attr_matches[1][$index]; + $value = $this->replaceEntities( + $this->removeQuotes($attr_matches[2][$index])); + + $link_attrs[strtolower($name)] = $value; + } + $link_data[] = $link_attrs; + } + + ini_set( 'pcre.backtrack_limit', $old_btlimit ); + return $link_data; + } + + function relMatches($rel_attr, $target_rel) + { + // Does this target_rel appear in the rel_str? + // XXX: TESTME + $rels = preg_split("/\s+/", trim($rel_attr)); + foreach ($rels as $rel) { + $rel = strtolower($rel); + if ($rel == $target_rel) { + return 1; + } + } + + return 0; + } + + function linkHasRel($link_attrs, $target_rel) + { + // Does this link have target_rel as a relationship? + // XXX: TESTME + $rel_attr = Auth_OpeniD::arrayGet($link_attrs, 'rel', null); + return ($rel_attr && $this->relMatches($rel_attr, + $target_rel)); + } + + function findLinksRel($link_attrs_list, $target_rel) + { + // Filter the list of link attributes on whether it has + // target_rel as a relationship. + // XXX: TESTME + $result = array(); + foreach ($link_attrs_list as $attr) { + if ($this->linkHasRel($attr, $target_rel)) { + $result[] = $attr; + } + } + + return $result; + } + + function findFirstHref($link_attrs_list, $target_rel) + { + // Return the value of the href attribute for the first link + // tag in the list that has target_rel as a relationship. + // XXX: TESTME + $matches = $this->findLinksRel($link_attrs_list, + $target_rel); + if (!$matches) { + return null; + } + $first = $matches[0]; + return Auth_OpenID::arrayGet($first, 'href', null); + } +} + +function Auth_OpenID_legacy_discover($html_text, $server_rel, + $delegate_rel) +{ + $p = new Auth_OpenID_Parse(); + + $link_attrs = $p->parseLinkAttrs($html_text); + + $server_url = $p->findFirstHref($link_attrs, + $server_rel); + + if ($server_url === null) { + return false; + } else { + $delegate_url = $p->findFirstHref($link_attrs, + $delegate_rel); + return array($delegate_url, $server_url); + } +} + |