[ Index ]

PHP Cross Reference of Limb3

title

Body

[close]

/web_spider/src/ -> lmbWebSpider.class.php (source)

   1  <?php
   2  /*
   3   * Limb PHP Framework
   4   *
   5   * @link http://limb-project.com 
   6   * @copyright  Copyright &copy; 2004-2007 BIT(http://bit-creative.com)
   7   * @license    LGPL http://www.gnu.org/copyleft/lesser.html 
   8   */
   9  lmb_require('limb/web_spider/src/lmbUriFilter.class.php');
  10  lmb_require('limb/web_spider/src/lmbContentTypeFilter.class.php');
  11  lmb_require('limb/web_spider/src/lmbUriExtractor.class.php');
  12  lmb_require('limb/web_spider/src/lmbUriNormalizer.class.php');
  13  lmb_require('limb/web_spider/src/lmbUriContentReader.class.php');
  14  
  /**
   * Recursive web crawler.
   *
   * Starting from a seed URI the spider normalizes every URI it meets, skips the ones
   * it has already seen or that the URI filter rejects, opens the remaining ones with
   * the content reader and, when the content type filter accepts the response,
   * notifies the registered observers and follows every link the URI extractor
   * returns for the content.
   *
   * Every collaborator can be injected through the matching set*() method; each
   * get*() accessor lazily creates the default lmb* implementation when nothing
   * was injected.
   *
   * @package web_spider
   * @version $Id: lmbWebSpider.class.php 5945 2007-06-06 08:31:43Z pachanga $
   */
  class lmbWebSpider
  {
    protected $uri_extractor;
    protected $content_reader;
    protected $uri_filter;
    protected $uri_normalizer;
    // Declared explicitly like its siblings: the accessors below read and write it,
    // and an undeclared property triggers an "undefined property" notice on first
    // read and a dynamic-property deprecation on PHP 8.2+.
    protected $content_type_filter;

    /** Observers notified for every page that passes both filters. */
    protected $observers = array();

    /** Set of already visited URIs, keyed by the URI's toString() value. */
    protected $uri_cache = array();

    /**
     * Starts crawling from the given seed URI.
     *
     * @param object $uri seed URI object; it is modified in place by normalization
     * @return bool false when the seed has no host (nothing is crawled), true otherwise
     */
    function crawl($uri)
    {
      // Loose comparison on purpose: any empty-ish host value is treated as "no host".
      if($uri->getHost() == '')
        return false;

      // The seed serves as its own context.
      $this->_crawlRecursive($uri, $uri);

      return true;
    }

    /**
     * Processes one URI and recurses into every link found in its content.
     *
     * @param object $uri URI to visit; normalized in place
     * @param object $context_uri URI of the page the link was found on
     * @return void
     */
    function _crawlRecursive($uri, $context_uri)
    {
      $this->_normalizeUriUsingContext($uri, $context_uri);

      $this->getUriNormalizer()->process($uri);

      if($this->_isCacheHit($uri))
        return;

      // Marked before filtering, so a rejected URI is not evaluated again either.
      $this->_markCached($uri);

      if(!$this->getUriFilter()->canPass($uri))
        return;

      $reader = $this->getUriContentReader();
      $reader->open($uri);

      if(!$this->getContentTypeFilter()->canPass($reader->getContentType()))
        return;

      $this->_notifyObservers();

      // Links are extracted before recursing because the same reader instance is
      // reopened by every nested call.
      $links = $this->getUriExtractor()->extract($reader->getContent());

      // An extractor that returns a non-array (e.g. null when nothing was found)
      // must not break the crawl: array_keys() rejects such a value.
      if(!is_array($links))
        return;

      foreach(array_keys($links) as $key)
      {
        $this->_crawlRecursive($links[$key], $uri);
      }
    }

    /**
     * Completes a possibly partial URI with the parts of the page it was found on.
     *
     * The URI object is changed in place: a missing host and protocol are copied
     * from the context, a relative path is resolved against the context's directory,
     * the anchor is dropped and the path is normalized.
     *
     * @param object $uri URI to complete
     * @param object $context_uri URI supplying the missing parts
     * @return void
     */
    function _normalizeUriUsingContext($uri, $context_uri)
    {
      if(!$uri->getHost())
      {
        $uri->setHost($context_uri->getHost());

        if(($path = $context_uri->getPath()) && $uri->isRelative())
        {
          // Keep the context path up to and including its last slash, i.e. drop
          // the trailing file segment, then append the relative path.
          $path = preg_replace('~(.*)(/[^/]*)$~', '$1/', $path);
          $uri->setPath($path . $uri->getPath());
        }
      }

      if(!$uri->getProtocol())
        $uri->setProtocol($context_uri->getProtocol());

      // Anchors are cleared so URIs differing only by anchor share one cache key.
      $uri->setAnchor('');

      $uri->normalizePath();
    }

    /**
     * Tells whether the URI was already visited.
     *
     * @param object $uri
     * @return bool
     */
    function _isCacheHit($uri)
    {
      return isset($this->uri_cache[$uri->toString()]);
    }

    /**
     * Remembers the URI as visited.
     *
     * @param object $uri
     * @return void
     */
    function _markCached($uri)
    {
      $this->uri_cache[$uri->toString()] = 1;
    }

    /**
     * Calls notify() on every registered observer, passing the content reader
     * stored on the spider.
     *
     * @return void
     */
    function _notifyObservers()
    {
      foreach(array_keys($this->observers) as $key)
        $this->observers[$key]->notify($this->content_reader);
    }

    /**
     * Adds an observer; its notify($reader) method is called for each accepted page.
     *
     * The by-reference signature is kept for compatibility with existing callers
     * and subclasses, so a variable has to be passed.
     *
     * @param object $observer
     * @return void
     */
    function registerObserver(&$observer)
    {
      $this->observers[] =& $observer;
    }

    /**
     * Returns the link extractor, creating a default lmbUriExtractor on first use.
     *
     * @return object
     */
    function getUriExtractor()
    {
      if(is_object($this->uri_extractor))
        return $this->uri_extractor;

      include_once(dirname(__FILE__) . '/lmbUriExtractor.class.php');
      $this->uri_extractor = new lmbUriExtractor();
      return $this->uri_extractor;
    }

    /**
     * Injects the link extractor.
     *
     * @param object $extractor must provide extract($content)
     * @return void
     */
    function setUriExtractor($extractor)
    {
      $this->uri_extractor = $extractor;
    }

    /**
     * Returns the content reader, creating a default lmbUriContentReader on first use.
     *
     * @return object
     */
    function getUriContentReader()
    {
      if(is_object($this->content_reader))
        return $this->content_reader;

      include_once(dirname(__FILE__) . '/lmbUriContentReader.class.php');
      $this->content_reader = new lmbUriContentReader();
      return $this->content_reader;
    }

    /**
     * Injects the content reader.
     *
     * @param object $reader must provide open($uri), getContentType() and getContent()
     * @return void
     */
    function setUriContentReader($reader)
    {
      $this->content_reader = $reader;
    }

    /**
     * Returns the content type filter, creating a default lmbContentTypeFilter
     * on first use.
     *
     * @return object
     */
    function getContentTypeFilter()
    {
      if(is_object($this->content_type_filter))
        return $this->content_type_filter;

      include_once(dirname(__FILE__) . '/lmbContentTypeFilter.class.php');
      $this->content_type_filter = new lmbContentTypeFilter();
      return $this->content_type_filter;
    }

    /**
     * Injects the content type filter.
     *
     * @param object $filter must provide canPass($content_type)
     * @return void
     */
    function setContentTypeFilter($filter)
    {
      $this->content_type_filter = $filter;
    }

    /**
     * Injects the URI filter.
     *
     * @param object $filter must provide canPass($uri)
     * @return void
     */
    function setUriFilter($filter)
    {
      $this->uri_filter = $filter;
    }

    /**
     * Returns the URI filter, creating a default lmbUriFilter on first use.
     *
     * @return object
     */
    function getUriFilter()
    {
      if(is_object($this->uri_filter))
        return $this->uri_filter;

      include_once(dirname(__FILE__) . '/lmbUriFilter.class.php');
      $this->uri_filter = new lmbUriFilter();
      return $this->uri_filter;
    }

    /**
     * Injects the URI normalizer.
     *
     * @param object $normalizer must provide process($uri)
     * @return void
     */
    function setUriNormalizer($normalizer)
    {
      $this->uri_normalizer = $normalizer;
    }

    /**
     * Returns the URI normalizer, creating a default lmbUriNormalizer on first use.
     *
     * @return object
     */
    function getUriNormalizer()
    {
      if(is_object($this->uri_normalizer))
        return $this->uri_normalizer;

      include_once(dirname(__FILE__) . '/lmbUriNormalizer.class.php');
      $this->uri_normalizer = new lmbUriNormalizer();
      return $this->uri_normalizer;
    }
  }
 189  
 190  ?>


Generated: Sun Oct 12 04:41:30 2008 Cross-referenced by PHPXref 0.7