| [ Index ] |
PHP Cross Reference of Limb3 |
[Summary view] [Print] [Text view]
<?php
/**
 *  base include file for SimpleTest
 *  @package    SimpleTest
 *  @subpackage MockObjects
 *  @version    $Id: parser.php 5999 2007-06-18 13:13:08Z pachanga $
 */

/**#@+
 * Lexer mode stack constants
 */
if (! defined('LEXER_ENTER')) {
    define('LEXER_ENTER', 1);
}
if (! defined('LEXER_MATCHED')) {
    define('LEXER_MATCHED', 2);
}
if (! defined('LEXER_UNMATCHED')) {
    define('LEXER_UNMATCHED', 3);
}
if (! defined('LEXER_EXIT')) {
    define('LEXER_EXIT', 4);
}
if (! defined('LEXER_SPECIAL')) {
    define('LEXER_SPECIAL', 5);
}
/**#@-*/

/**
 *    Compounded regular expression. Any of
 *    the contained patterns could match and
 *    when one does, its label is returned.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class ParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     *    Constructor. Starts with no patterns.
     *    @param boolean $case    True for case sensitive, false
     *                            for insensitive.
     *    @access public
     */
    function ParallelRegex($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     *    Adds a pattern with an optional label. Discards
     *    the cached compound regex so that it is rebuilt
     *    on the next match.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $label        Label of regex to be returned
     *                                on a match.
     *    @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        $this->_regex = null;
    }

    /**
     *    Attempts to match all patterns at once against
     *    a string.
     *    @param string $subject      String to match against.
     *    @param string $match        First matched portion of
     *                                subject. Set to '' when the
     *                                compound regex does not match.
     *    @return mixed               False if there are no patterns or
     *                                nothing matched, otherwise the label
     *                                of the pattern that matched (true
     *                                when no captured group is non-empty).
     *    @access public
     */
    function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = '';
            return false;
        }
        $match = $matches[0];
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            // Compare against '' rather than relying on truthiness: the
            // string "0" is falsy in PHP, so a token consisting of "0"
            // would otherwise lose its label.
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     *    Compounds the patterns into a single
     *    regular expression separated with the
     *    "or" operator. Caches the regex.
     *    Will automatically escape (, ) and / tokens.
     *    The stored patterns are left untouched so that
     *    the regex can be rebuilt after further
     *    addPattern() calls without escaping them twice.
     *    @return string        Compound regex with delimiters
     *                          and flags.
     *    @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                // Each pattern becomes its own capture group; the group
                // index is what match() maps back to a label.
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     *    Accessor for perl regex mode flags to use.
     *    @return string       Perl regex flags.
     *    @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}

/**
 *    States for a stack machine.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleStateStack {
    var $_stack;

    /**
     *    Constructor. Starts in named state.
     *    @param string $start        Starting state name.
     *    @access public
     */
    function SimpleStateStack($start) {
        $this->_stack = array($start);
    }

    /**
     *    Accessor for current state.
     *    @return string       State.
     *    @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     *    Adds a state to the stack and sets it
     *    to be the current state.
     *    @param string $state        New state.
     *    @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     *    Leaves the current state and reverts
     *    to the previous one.
     *    @return boolean    False if we drop off
     *                       the bottom of the list.
     *    @access public
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}

/**
 *    Accepts text and breaks it into tokens.
 *    Some optimisation to make sure the
 *    content is only scanned by the PHP regex
 *    parser once. Lexer modes must not start
 *    with leading underscores.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleLexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     *    Sets up the lexer in case insensitive matching
     *    by default.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function SimpleLexer(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        // Plain assignment: "=& new" is a parse error from PHP 7 onwards.
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     *    Adds a token search pattern for a particular
     *    parsing mode. The pattern does not change the
     *    current mode.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @access public
     */
    function addPattern($pattern, $mode = "accept") {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     *    Adds a pattern that will enter a new parsing
     *    mode. Useful for entering parenthesis, strings,
     *    tags, etc.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $new_mode     Change parsing to this new
     *                                nested mode.
     *    @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
        if (! isset($this->_mode_handlers[$new_mode])) {
            $this->_mode_handlers[$new_mode] = $new_mode;
        }
    }

    /**
     *    Adds a pattern that will exit the current mode
     *    and re-enter the previous one.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Mode to leave.
     *    @access public
     */
    function addExitPattern($pattern, $mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     *    Adds a pattern that has a special mode. Acts as an entry
     *    and exit pattern in one go, effectively calling a special
     *    parser handler for this token only.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $special      Use this mode for this one token.
     *    @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // The leading underscore marks the label as a single token mode.
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
        if (! isset($this->_mode_handlers[$special])) {
            $this->_mode_handlers[$special] = $special;
        }
    }

    /**
     *    Adds a mapping from a mode to another handler.
     *    @param string $mode        Mode to be remapped.
     *    @param string $handler     New target handler.
     *    @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     *    Splits the page text into tokens. Will fail
     *    if the handlers report an error or if no
     *    content is consumed. If successful then each
     *    unparsed and parsed token invokes a call to the
     *    held listener.
     *    @param string $raw        Raw HTML text.
     *    @return boolean           True on success, else false.
     *    @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // A match that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Whatever is left over matched nothing in the current mode.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     *    Sends the matched token and any leading unmatched
     *    text to the parser changing the lexer to a new
     *    mode if one is listed.
     *    @param string $unmatched    Unmatched leading portion.
     *    @param string $matched      Actual token match.
     *    @param string $mode         Mode after match. A boolean
     *                                false mode causes no change.
     *    @return boolean             False if there was any error
     *                                from the parser.
     *    @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     *    Tests to see if the new mode is actually to leave
     *    the current mode and pop an item from the matching
     *    mode stack.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is the exit mode.
     *    @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     *    Test to see if the mode is one where this mode
     *    is entered for this token only and automatically
     *    leaves immediately afterwards.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is a single token
     *                           (special) mode, that is the name
     *                           starts with an underscore.
     *    @access private
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     *    Strips the magic underscore marking single token
     *    modes.
     *    @param string $mode    Mode to decode.
     *    @return string         Underlying mode name.
     *    @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     *    Calls the parser method named after the current
     *    mode. Empty content will be ignored. The lexer
     *    has a parser handler for each mode in the lexer.
     *    @param string $content        Text parsed.
     *    @param integer $is_match      One of the LEXER_* constants
     *                                  describing how the content
     *                                  was recognised.
     *    @return boolean               True for ignored empty content,
     *                                  otherwise whatever the parser
     *                                  handler returns.
     *    @access private
     */
    function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     *    Tries to match a chunk of text and if successful
     *    removes the recognised chunk and any leading
     *    unparsed data. Empty strings will not be matched.
     *    @param string $raw         The subject to parse. This is the
     *                               content that will be eaten.
     *    @return array/boolean      Four item list of the remaining
     *                               content, the unparsed leading
     *                               content, the recognised token and
     *                               finally the action the parser is
     *                               to take. True if nothing matched,
     *                               including when the current mode
     *                               has no patterns at all.
     *    @access private
     */
    function _reduce($raw) {
        $mode = $this->_mode->getCurrent();
        // A mode without any registered pattern cannot match anything;
        // report "no match" instead of calling match() on a missing entry.
        if (! isset($this->_regexes[$mode])) {
            return true;
        }
        if ($action = $this->_regexes[$mode]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}

/**
 *    Breaks HTML into SAX events.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleHtmlLexer extends SimpleLexer {

    /**
     *    Sets up the lexer with case insensitive matching
     *    and adds the HTML handlers.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    function SimpleHtmlLexer(&$parser) {
        $this->SimpleLexer($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     *    List of parsed tags. Others are ignored.
     *    @return array        List of searched for tags.
     *    @access private
     */
    function _getParsedTags() {
        return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     *    The lexer has to skip certain sections such
     *    as server code, client code and styles.
     *    @access private
     */
    function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     *    Pattern matches to start and end a tag.
     *    @param string $tag          Name of tag to scan for.
     *    @access private
     */
    function _addTag($tag) {
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        $this->addEntryPattern("<$tag", 'text', 'tag');
    }

    /**
     *    Pattern matches to parse the inside of a tag
     *    including the attributes and their quoting.
     *    @access private
     */
    function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /*