| [ Index ] |
PHP Cross Reference of Limb3 |
[Summary view] [Print] [Text view]
<?php
/*
 * Limb PHP Framework
 *
 * @link http://limb-project.com
 * @copyright  Copyright © 2004-2007 BIT(http://bit-creative.com)
 * @license    LGPL http://www.gnu.org/copyleft/lesser.html
 */
lmb_require('limb/web_spider/src/lmbUriFilter.class.php');
lmb_require('limb/web_spider/src/lmbContentTypeFilter.class.php');
lmb_require('limb/web_spider/src/lmbUriExtractor.class.php');
lmb_require('limb/web_spider/src/lmbUriNormalizer.class.php');
lmb_require('limb/web_spider/src/lmbUriContentReader.class.php');

/**
 * class lmbWebSpider.
 *
 * Recursively walks the links reachable from a starting URI. Each URI is
 * normalized, checked against an in-memory cache of already seen URIs,
 * passed through a URI filter and a content type filter, reported to the
 * registered observers and finally scanned for further links.
 *
 * Every collaborator (extractor, reader, filters, normalizer) can be injected
 * through its setter; otherwise a default instance is created on first use.
 *
 * @package web_spider
 * @version $Id: lmbWebSpider.class.php 5945 2007-06-06 08:31:43Z pachanga $
 */
class lmbWebSpider
{
  /** @var object|null extracts links from fetched content, see getUriExtractor() */
  protected $uri_extractor;
  /** @var object|null opens URIs and exposes their content, see getUriContentReader() */
  protected $content_reader;
  /** @var object|null decides which URIs may be crawled, see getUriFilter() */
  protected $uri_filter;
  /** @var object|null extra URI normalization step, see getUriNormalizer() */
  protected $uri_normalizer;
  /**
   * Declared explicitly: it used to be created implicitly as a dynamic
   * property by getContentTypeFilter()/setContentTypeFilter().
   *
   * @var object|null decides which content types are processed
   */
  protected $content_type_filter;

  /** @var array observers notified for every accepted document */
  protected $observers = array();

  /** @var array map of "URI string => 1" for every URI already visited */
  protected $uri_cache = array();

  /**
   * Starts crawling from the given URI.
   *
   * @param object $uri starting point; it is modified in place while being normalized
   * @return bool false when the URI has an empty host (nothing is crawled), true otherwise
   */
  function crawl($uri)
  {
    // A start URI without a host has no context to be resolved against.
    if($uri->getHost() == '')
      return false;

    $this->_crawlRecursive($uri, $uri);

    return true;
  }

  /**
   * Processes a single URI and recurses into every link found in its content.
   *
   * @param object $uri URI to process; it is modified in place while being normalized
   * @param object $context_uri URI of the document the link was found in
   * @return void
   */
  function _crawlRecursive($uri, $context_uri)
  {
    $this->_normalizeUriUsingContext($uri, $context_uri);

    $this->getUriNormalizer()->process($uri);

    if($this->_isCacheHit($uri))
      return;

    // Marked before filtering, so a rejected URI is not evaluated again either.
    $this->_markCached($uri);

    if(!$this->getUriFilter()->canPass($uri))
      return;

    $reader = $this->getUriContentReader();
    $reader->open($uri);

    if(!$this->getContentTypeFilter()->canPass($reader->getContentType()))
      return;

    $this->_notifyObservers();

    $links = $this->getUriExtractor()->extract($reader->getContent());

    // The current URI becomes the context for the links it contains.
    foreach($links as $link)
    {
      $this->_crawlRecursive($link, $uri);
    }
  }

  /**
   * Completes a URI with the parts it lacks, taken from the context URI.
   *
   * A missing host is copied from the context; if the URI is also relative
   * and the context has a path, the context path up to and including its
   * last slash is prepended to the URI path. A missing protocol is copied
   * from the context. The anchor is always cleared and the path normalized.
   *
   * @param object $uri URI to complete, modified in place
   * @param object $context_uri URI supplying the missing parts
   * @return void
   */
  function _normalizeUriUsingContext($uri, $context_uri)
  {
    if(!$uri->getHost())
    {
      $uri->setHost($context_uri->getHost());

      if(($path = $context_uri->getPath()) && $uri->isRelative())
      {
        // Drop the last path segment of the context, keeping the trailing slash.
        $path = preg_replace('~(.*)(/[^/]*)$~', '$1/', $path);
        $uri->setPath($path . $uri->getPath());
      }
    }

    if(!$uri->getProtocol())
      $uri->setProtocol($context_uri->getProtocol());

    $uri->setAnchor('');

    $uri->normalizePath();
  }

  /**
   * Tells whether the URI has already been marked as visited.
   *
   * @param object $uri
   * @return bool
   */
  function _isCacheHit($uri)
  {
    return isset($this->uri_cache[$uri->toString()]);
  }

  /**
   * Marks the URI as visited, keyed by its string form.
   *
   * @param object $uri
   * @return void
   */
  function _markCached($uri)
  {
    $this->uri_cache[$uri->toString()] = 1;
  }

  /**
   * Calls notify() on every registered observer, passing the content reader
   * currently held by the spider.
   *
   * @return void
   */
  function _notifyObservers()
  {
    foreach($this->observers as $observer)
      $observer->notify($this->content_reader);
  }

  /**
   * Adds an observer to the list notified by _notifyObservers().
   *
   * @param object $observer object providing a notify() method; stored by reference
   * @return void
   */
  function registerObserver(&$observer)
  {
    $this->observers[] =& $observer;
  }

  /**
   * Returns the URI extractor, creating a default lmbUriExtractor on first use.
   *
   * @return object
   */
  function getUriExtractor()
  {
    if(is_object($this->uri_extractor))
      return $this->uri_extractor;

    include_once(dirname(__FILE__) . '/lmbUriExtractor.class.php');
    $this->uri_extractor = new lmbUriExtractor();
    return $this->uri_extractor;
  }

  /**
   * Replaces the URI extractor.
   *
   * @param object $extractor
   * @return void
   */
  function setUriExtractor($extractor)
  {
    $this->uri_extractor = $extractor;
  }

  /**
   * Returns the content reader, creating a default lmbUriContentReader on first use.
   *
   * @return object
   */
  function getUriContentReader()
  {
    if(is_object($this->content_reader))
      return $this->content_reader;

    include_once(dirname(__FILE__) . '/lmbUriContentReader.class.php');
    $this->content_reader = new lmbUriContentReader();
    return $this->content_reader;
  }

  /**
   * Replaces the content reader.
   *
   * @param object $reader
   * @return void
   */
  function setUriContentReader($reader)
  {
    $this->content_reader = $reader;
  }

  /**
   * Returns the content type filter, creating a default lmbContentTypeFilter on first use.
   *
   * @return object
   */
  function getContentTypeFilter()
  {
    if(is_object($this->content_type_filter))
      return $this->content_type_filter;

    include_once(dirname(__FILE__) . '/lmbContentTypeFilter.class.php');
    $this->content_type_filter = new lmbContentTypeFilter();
    return $this->content_type_filter;
  }

  /**
   * Replaces the content type filter.
   *
   * @param object $filter
   * @return void
   */
  function setContentTypeFilter($filter)
  {
    $this->content_type_filter = $filter;
  }

  /**
   * Replaces the URI filter.
   *
   * @param object $filter
   * @return void
   */
  function setUriFilter($filter)
  {
    $this->uri_filter = $filter;
  }

  /**
   * Returns the URI filter, creating a default lmbUriFilter on first use.
   *
   * @return object
   */
  function getUriFilter()
  {
    if(is_object($this->uri_filter))
      return $this->uri_filter;

    include_once(dirname(__FILE__) . '/lmbUriFilter.class.php');
    $this->uri_filter = new lmbUriFilter();
    return $this->uri_filter;
  }

  /**
   * Replaces the URI normalizer.
   *
   * @param object $normalizer
   * @return void
   */
  function setUriNormalizer($normalizer)
  {
    $this->uri_normalizer = $normalizer;
  }

  /**
   * Returns the URI normalizer, creating a default lmbUriNormalizer on first use.
   *
   * @return object
   */
  function getUriNormalizer()
  {
    if(is_object($this->uri_normalizer))
      return $this->uri_normalizer;

    include_once(dirname(__FILE__) . '/lmbUriNormalizer.class.php');
    $this->uri_normalizer = new lmbUriNormalizer();
    return $this->uri_normalizer;
  }
}

?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Sun Oct 12 04:41:30 2008 | Cross-referenced by PHPXref 0.7 |