#!/usr/bin/env php
.
*/
/**
* \file dev/tools/spider.php
* \brief Script to spider Dolibarr app.
*
* To use it:
* - Disable module "bookmark"
* - Exclude param optioncss, token, sortfield, sortorder
*/
// Deepest recursion level followLink() is allowed to reach before giving up.
const MAX_DEPTH = 2;

// Every link reported so far; shared with followLink() through "global".
$crawledLinks = array();
/**
 * Crawl one page and recurse into the links it contains.
 *
 * The page at $url is downloaded and parsed as HTML. Every <a href> that is not
 * ignored and has not been seen before is appended to the global $crawledLinks
 * list, reported through insertIntoDatabase(), and then followed with
 * $depth + 1. Recursion stops once $depth exceeds MAX_DEPTH.
 *
 * @param string $url   URL of the page to fetch
 * @param int    $depth Current recursion depth (0 for the start page)
 * @return void
 */
function followLink($url, $depth = 0)
{
    global $crawledLinks;

    if ($depth > MAX_DEPTH) {
        echo "\nThe Crawler is giving up!\n";
        return;
    }

    $options = array(
        'http' => array(
            'method' => "GET",
            // The HTTP context option is named "user_agent"; a "user-agent" key is
            // silently ignored, so the bot never identified itself.
            'user_agent' => "gfgBot/0.1",
        ),
    );
    $context = stream_context_create($options);

    // A page that cannot be fetched is skipped. Passing false or '' on to
    // loadHTML() throws a ValueError on PHP 8, which would abort the whole crawl.
    $html = @file_get_contents($url, false, $context);
    if ($html === false || $html === '') {
        return;
    }

    // Real-world HTML is rarely valid: collect libxml parse errors internally
    // instead of emitting warnings, then restore the previous setting.
    $doc = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $pageTitle = getDocTitle($doc, $url);
    $metaData = getDocMetaData($doc);

    // Links discovered on this page that still have to be visited.
    $crawling = array();
    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $link = $anchor->getAttribute('href');
        if (ignoreLink($link)) {
            continue;
        }

        $link = convertLink($url, $link);
        if (!in_array($link, $crawledLinks, true)) {
            $crawledLinks[] = $link;
            $crawling[] = $link;
            // NOTE(review): the title and meta data recorded here belong to the page
            // that contains the link, not to the linked page - confirm this is intended.
            insertIntoDatabase($link, $pageTitle, $metaData, $depth);
        }
    }

    foreach ($crawling as $crawlURL) {
        followLink($crawlURL, $depth + 1);
    }
}
/**
 * Turn the href found on a page into a URL that can be fetched.
 *
 * - "//host/path" (protocol-relative) gets the scheme of $site, e.g. "http://host/path".
 * - Links starting with "http://", "https://" or "www." are returned unchanged.
 * - "/path" (root-relative) is resolved against scheme://host[:port] of $site.
 * - Anything else is appended to $site with exactly one "/" in between.
 *
 * @param string $site URL of the page the link was found on
 * @param string $path Raw value of the href attribute
 * @return string Converted URL
 */
function convertLink($site, $path)
{
    $base = parse_url($site);
    // parse_url() may return false or omit the scheme; fall back to plain http.
    $scheme = (is_array($base) && isset($base['scheme'])) ? $base['scheme'] : 'http';

    // strncmp() is safe on strings shorter than the prefix, unlike substr_compare().
    if (strncmp($path, '//', 2) === 0) {
        // The ":" was missing before, producing "http//host/path".
        return $scheme.':'.$path;
    }

    if (strncmp($path, 'http://', 7) === 0
        || strncmp($path, 'https://', 8) === 0
        || strncmp($path, 'www.', 4) === 0
    ) {
        // NOTE(review): a "www." link is returned without a scheme, as before.
        return $path;
    }

    if (strncmp($path, '/', 1) === 0 && is_array($base) && isset($base['host'])) {
        // Root-relative link: it replaces the whole path of the current page.
        $authority = $scheme.'://'.$base['host'];
        if (isset($base['port'])) {
            $authority .= ':'.$base['port'];
        }
        return $authority.$path;
    }

    // Relative link: avoid doubling the separator when either side already has one.
    return rtrim($site, '/').'/'.ltrim($path, '/');
}
/**
 * Tell whether a link found on a page must be skipped by the crawler.
 *
 * Skipped links are: an empty href, a fragment-only link ("#...") and a
 * "javascript:" pseudo link (matched case-insensitively).
 *
 * @param string $url Raw value of the href attribute
 * @return boolean True if the link must not be followed
 */
function ignoreLink($url)
{
    // An <a> without href yields ''. Reading $url[0] on it raises
    // "Uninitialized string offset", and the empty link would then be
    // crawled as "$site/".
    if (!is_string($url) || $url === '') {
        return true;
    }

    return $url[0] === '#' || strncasecmp($url, 'javascript:', 11) === 0;
}
/**
 * Report a newly discovered link.
 *
 * Despite its name, this function only prints one record line to standard
 * output. It does not modify $crawledLinks; followLink() does that itself
 * before calling it.
 *
 * @param string $link     URL of the discovered link
 * @param string $title    Title to print for the record
 * @param array  $metaData Array with the keys 'description' and 'keywords'
 * @param int    $depth    Crawl depth (accepted but not used)
 * @return void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
    printf(
        "Inserting new record {URL= %s, Title = '%s', Description = '%s', Keywords = ' %s'}\n",
        $link,
        $title,
        $metaData['description'],
        $metaData['keywords']
    );
}
/**
 * Return the title of a parsed document on a single line.
 *
 * The text of the first <title> element is returned with line breaks removed.
 * When the document has no <title>, or the title is empty, $url is returned
 * instead.
 *
 * @param DOMDocument $doc Parsed document
 * @param string      $url URL of the document, used as fallback
 * @return string Title of the document, or $url
 */
function getDocTitle(&$doc, $url)
{
    $titleNodes = $doc->getElementsByTagName('title');
    // length / item() are the documented DOMNodeList accessors.
    if ($titleNodes->length === 0) {
        return $url;
    }

    // The previous str_replace('', '\n', ...) searched for an empty string, so it
    // never changed anything; the search and replace arguments were swapped.
    $title = str_replace(array("\r", "\n"), '', (string) $titleNodes->item(0)->nodeValue);

    return ($title === '') ? $url : $title;
}
/**
 * Extract the description and keywords of a parsed document.
 *
 * All <meta name="..." content="..."> elements are read; names are compared
 * case-insensitively and elements without a name are skipped. Line breaks are
 * removed from both returned values.
 *
 * @param DOMDocument $doc Parsed document
 * @return array Array with the keys 'keywords' (default '') and
 *               'description' (default 'No Description Available')
 */
function getDocMetaData(&$doc)
{
    $metaData = array();
    foreach ($doc->getElementsByTagName('meta') as $node) {
        $name = strtolower(trim($node->getAttribute('name')));
        if ($name === '') {
            // A meta element without a name can be neither description nor keywords.
            continue;
        }
        $metaData[$name] = $node->getAttribute('content');
    }

    $description = isset($metaData['description']) ? $metaData['description'] : 'No Description Available';
    $keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

    // The previous str_replace('', '\n', ...) searched for an empty string, so it
    // never changed anything; the search and replace arguments were swapped.
    $lineBreaks = array("\r", "\n");

    return array(
        'keywords' => str_replace($lineBreaks, '', $keywords),
        'description' => str_replace($lineBreaks, '', $description),
    );
}
// Entry point: start the crawl at depth 0 from the hard-coded local URL.
followLink('http://localhost/dolibarr_dev/htdocs');