#!/usr/bin/env php
.
*/
/**
* \file dev/tools/spider.php
* \brief Script to spider Dolibarr app.
*
* To use it:
* - Disable the "bookmark" module
* - Exclude the parameters optioncss, token, sortfield and sortorder
*/
// Every URL discovered so far; followLink() reads and extends this list through the global scope.
$crawledLinks = [];

// followLink() stops recursing once its depth argument exceeds this value.
const MAX_DEPTH = 2;
/**
 * Fetch a page, record every link on it that has not been seen yet, then crawl
 * those new links recursively.
 *
 * Links already present in the global $crawledLinks list are skipped. Recursion
 * stops, with a message, once $depth exceeds MAX_DEPTH. Pages that cannot be
 * fetched or parsed are skipped silently.
 *
 * @param string $url   URL of the page to fetch
 * @param int    $depth Current recursion depth (0 for the start page)
 * @return void
 */
function followLink($url, $depth = 0)
{
    global $crawledLinks;

    if ($depth > MAX_DEPTH) {
        echo "\nThe Crawler is giving up!\n";
        return;
    }

    $options = array(
        'http' => array(
            'method' => 'GET',
            // The HTTP context option is named "user_agent"; a "user-agent" key is not read by PHP.
            'user_agent' => 'gfgBot/0.1',
        ),
    );
    $context = stream_context_create($options);

    // Fetch warnings were already suppressed before; the failure is now handled explicitly,
    // because handing false/'' to DOMDocument::loadHTML() throws a ValueError on PHP 8.
    $html = @file_get_contents($url, false, $context);
    if ($html === false || $html === '') {
        return;
    }

    // Collect libxml parse warnings (real-world HTML is rarely valid) instead of printing them,
    // then restore the previous libxml error mode.
    $doc = new DOMDocument();
    $previousMode = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
    if ($loaded === false) {
        return;
    }

    // Title and metadata describe the page being parsed ($url), not the pages it links to.
    $pageTitle = getDocTitle($doc, $url);
    $metaData = getDocMetaData($doc);

    $crawling = array();
    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $link = $anchor->getAttribute('href');
        if (ignoreLink($link)) {
            continue;
        }

        $link = convertLink($url, $link);
        if (in_array($link, $crawledLinks, true)) {
            continue;
        }

        $crawledLinks[] = $link;
        $crawling[] = $link;
        insertIntoDatabase($link, $pageTitle, $metaData, $depth);
    }

    // Recurse only after the whole page has been scanned, so every link found on this page is
    // registered in $crawledLinks before any deeper page is parsed.
    foreach ($crawling as $crawlURL) {
        followLink($crawlURL, $depth + 1);
    }
}
/**
 * Turn an href value found on a page into a URL the crawler can fetch.
 *
 * - "http://..." / "https://..." links and links starting with "www." are returned unchanged.
 * - Protocol-relative links ("//host/path") get the scheme of $site.
 * - Root-relative links ("/path") are resolved against the scheme, host and port of $site.
 * - Anything else is appended to $site with exactly one "/" in between.
 *
 * NOTE(review): "./", "../" and a query string on $site are not normalised; the link is
 * simply appended, as it was before.
 *
 * @param string $site URL of the page on which the link was found
 * @param string $path Raw href value
 * @return string URL to crawl
 */
function convertLink($site, $path)
{
    // Already absolute; a bare "www." host is passed through untouched, as before.
    if (strncasecmp($path, 'http://', 7) === 0
        || strncasecmp($path, 'https://', 8) === 0
        || strncmp($path, 'www.', 4) === 0
    ) {
        return $path;
    }

    $parts = parse_url($site);
    // Fall back to plain http when $site carries no scheme, instead of raising an undefined index notice.
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

    // Protocol-relative link: the ":" separator was previously missing ("http//host/...").
    if (strncmp($path, '//', 2) === 0) {
        return $scheme.':'.$path;
    }

    // Root-relative link: it replaces the whole path of $site rather than being appended to it.
    if (strncmp($path, '/', 1) === 0 && isset($parts['host'])) {
        $origin = $scheme.'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }
        return $origin.$path;
    }

    // Plain relative link: avoid a doubled slash when $site already ends with "/".
    return rtrim($site, '/').'/'.ltrim($path, '/');
}
/**
 * Tell whether an href value should be skipped by the crawler.
 *
 * @param string $url Raw href attribute value
 * @return boolean True for an empty value, a fragment link ("#...") or a "javascript:" link
 */
function ignoreLink($url)
{
    $url = (string) $url;

    // An empty href has no first character to inspect (reading $url[0] raises a warning)
    // and refers to the current document, so there is nothing new to crawl.
    if ($url === '') {
        return true;
    }

    if ($url[0] === '#') {
        return true;
    }

    // URL schemes are case-insensitive, so "JavaScript:" must be caught as well.
    return strncasecmp($url, 'javascript:', 11) === 0;
}
/**
 * Print the record for a newly discovered link.
 *
 * Despite its name, this function only echoes one line; nothing is stored.
 *
 * @param string $link     URL of the discovered link
 * @param string $title    Title to print with the record
 * @param array  $metaData Metadata array; its 'description' and 'keywords' entries are printed
 * @param int    $depth    Crawl depth (accepted but not used here)
 * @return void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
    $description = $metaData['description'];
    $keywords = $metaData['keywords'];

    echo sprintf(
        "Inserting new record {URL= %s, Title = '%s', Description = '%s', Keywords = ' %s'}\n",
        $link,
        $title,
        $description,
        $keywords
    );
}
/**
 * Return the title of a parsed document, with line breaks removed.
 *
 * Falls back to $url when the document has no <title> element or when the
 * title is empty.
 *
 * @param DOMDocument $doc Parsed document
 * @param string      $url URL of the document, used as fallback
 * @return string Title, or $url when no usable title exists
 */
function getDocTitle(&$doc, $url)
{
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length === 0) {
        return $url;
    }

    // Use the documented DOMNodeList API (length / item()) rather than array access.
    $first = $titleNodes->item(0);
    if ($first === null || !isset($first->nodeValue)) {
        return $url;
    }

    // The former str_replace('', '\n', ...) searched for an empty string and therefore
    // changed nothing; strip real CR/LF characters so the title stays on one line.
    $title = str_replace(array("\r", "\n"), '', $first->nodeValue);

    return ($title === '') ? $url : $title;
}
/**
 * Extract the "keywords" and "description" meta values of a parsed document.
 *
 * Meta names are matched case-insensitively and line breaks are removed from
 * both values. When several <meta> tags share a name, the last one wins.
 *
 * @param DOMDocument $doc Parsed document
 * @return array Array with exactly two string entries: 'keywords' ('' when absent) and
 *               'description' ('No Description Available' when absent)
 */
function getDocMetaData(&$doc)
{
    $metaData = array();
    foreach ($doc->getElementsByTagName('meta') as $node) {
        // <meta name> values are compared case-insensitively ("Description" == "description").
        $name = strtolower($node->getAttribute('name'));
        if ($name === '') {
            // No name attribute (charset, http-equiv, property, ...): nothing to index.
            continue;
        }
        $metaData[$name] = $node->getAttribute('content');
    }

    $description = isset($metaData['description']) ? $metaData['description'] : 'No Description Available';
    $keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

    // The former str_replace('', '\n', ...) searched for an empty string and therefore
    // changed nothing; strip real CR/LF characters so each value stays on one line.
    $lineBreaks = array("\r", "\n");

    return array(
        'keywords' => str_replace($lineBreaks, '', $keywords),
        'description' => str_replace($lineBreaks, '', $description),
    );
}
// Entry point: start crawling at depth 0 from the local Dolibarr development instance.
followLink('http://localhost/dolibarr_dev/htdocs');