forked from Wavyzz/dolibarr
147 lines
3.8 KiB
PHP
147 lines
3.8 KiB
PHP
#!/usr/bin/env php
|
|
<?php
|
|
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
|
|
* \file dev/tools/spider.php
|
|
* \brief Script to spider Dolibarr app.
|
|
*
|
|
* To use it:
|
|
* - Disable module "bookmark"
|
|
* - Exclude param optioncss, token, sortfield, sortorder
|
|
*/
|
|
|
|
// Links already recorded by the crawler; followLink() consults this list so a link is only handled once.
$crawledLinks = [];

// Deepest recursion level followLink() accepts before it gives up.
define('MAX_DEPTH', 2);
|
|
|
|
|
|
/**
 * Fetch a page, record every not-yet-seen link found on it, then crawl those links recursively.
 *
 * Links already present in the global $crawledLinks list are skipped. Recursion stops
 * (with a message) once $depth exceeds MAX_DEPTH. A page that cannot be downloaded is
 * skipped silently.
 *
 * @param   string  $url    URL of the page to fetch
 * @param   int     $depth  Current recursion depth (0 for the start page)
 * @return  void
 */
function followLink($url, $depth = 0)
{
    global $crawledLinks;

    if ($depth > MAX_DEPTH) {
        echo "<div style='color:red;'>The Crawler is giving up!</div>";
        return;
    }

    $options = array(
        'http' => array(
            'method' => "GET",
            // The HTTP context option is spelled "user_agent"; a "user-agent" key is ignored.
            'user_agent' => "gfgBot/0.1",
        ),
    );
    $context = stream_context_create($options);

    // A failed download returns false, and DOMDocument::loadHTML() rejects an empty source,
    // so stop here instead of parsing nothing.
    $html = @file_get_contents($url, false, $context);
    if ($html === false || $html === '') {
        return;
    }

    // Real-world HTML is rarely valid: collect libxml parse errors internally instead of emitting warnings.
    $doc = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // Title and meta data describe the page being parsed; they are reported with each link found on it.
    $pageTitle = getDocTitle($doc, $url);
    $metaData = getDocMetaData($doc);

    // Links discovered on this page that still have to be visited.
    $crawling = array();
    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $link = $anchor->getAttribute('href');
        if (ignoreLink($link)) {
            continue;
        }
        $link = convertLink($url, $link);
        if (!in_array($link, $crawledLinks, true)) {
            $crawledLinks[] = $link;
            $crawling[] = $link;
            insertIntoDatabase($link, $pageTitle, $metaData, $depth);
        }
    }

    foreach ($crawling as $crawlURL) {
        followLink($crawlURL, $depth + 1);
    }
}
|
|
|
|
/**
 * Turn the href found on a page into a URL that can be fetched.
 *
 * - "//host/path" (protocol-relative) gets the scheme of $site.
 * - "http://", "https://" and "www." links are returned unchanged.
 * - "/path" (root-relative) is attached to the scheme, host and port of $site.
 * - Anything else is appended to $site with exactly one "/" in between.
 *
 * @param   string  $site   URL of the page the link was found on
 * @param   string  $path   Value of the href attribute
 * @return  string          URL to crawl
 */
function convertLink($site, $path)
{
    // Protocol-relative link: scheme and link must be joined with ":" ("http" + ":" + "//host/x").
    if (strncmp($path, '//', 2) === 0) {
        $scheme = parse_url($site, PHP_URL_SCHEME);
        return ($scheme ? $scheme : 'http').':'.$path;
    }

    // Already absolute, or a bare "www." host: keep as is.
    if (strncmp($path, 'http://', 7) === 0
        || strncmp($path, 'https://', 8) === 0
        || strncmp($path, 'www.', 4) === 0
    ) {
        return $path;
    }

    // Root-relative link: resolve against the origin of $site, not against its full path.
    if (strncmp($path, '/', 1) === 0) {
        $parts = parse_url($site);
        if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
            $origin = $parts['scheme'].'://'.$parts['host'];
            if (isset($parts['port'])) {
                $origin .= ':'.$parts['port'];
            }
            return $origin.$path;
        }
        // $site has no usable origin: fall back to plain concatenation without doubling the slash.
        return rtrim($site, '/').$path;
    }

    // Relative link: $site is treated as a directory, as before; only a doubled "/" is avoided.
    return rtrim($site, '/').'/'.$path;
}
|
|
|
|
/**
 * Tell whether a href must be skipped by the crawler.
 *
 * Skipped: empty links, in-page anchors ("#...") and links using a scheme that
 * cannot be fetched as a page (javascript:, mailto:, tel:), whatever their letter case.
 *
 * @param   string  $url    Value of the href attribute
 * @return  boolean         True if the link must not be crawled
 */
function ignoreLink($url)
{
    $url = trim((string) $url);

    // Test emptiness first: reading offset 0 of an empty string raises a warning.
    if ($url === '' || $url[0] === '#') {
        return true;
    }

    foreach (array('javascript:', 'mailto:', 'tel:') as $scheme) {
        if (strncasecmp($url, $scheme, strlen($scheme)) === 0) {
            return true;
        }
    }

    return false;
}
|
|
|
|
/**
 * Report a newly discovered link.
 *
 * Despite its name, this function only prints the record; nothing is stored.
 *
 * @param   string  $link       URL that was discovered
 * @param   string  $title      Title reported with the link
 * @param   array   $metaData   Meta data; keys 'description' and 'keywords' are read, a missing key is printed as empty
 * @param   int     $depth      Depth at which the link was found (currently not used)
 * @return  void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
    // Read defensively so an incomplete array does not raise an undefined-index warning.
    $description = isset($metaData['description']) ? $metaData['description'] : '';
    $keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

    echo "Inserting new record {URL= ".$link.", Title = '".$title."', Description = '".$description."', Keywords = ' ".$keywords."'}<br/><br/><br/>";
}
|
|
|
|
/**
 * Return the title of a parsed HTML document as a single line.
 *
 * Line breaks inside the <title> element are replaced by spaces and the result is trimmed.
 * When the document has no title, or an empty one, $url is returned instead.
 *
 * @param   DOMDocument $doc    Parsed document
 * @param   string      $url    URL of the document, used as fallback
 * @return  string              Title, or $url when no usable title exists
 */
function getDocTitle(&$doc, $url)
{
    // item(0) returns null when there is no <title> element.
    $titleNode = $doc->getElementsByTagName('title')->item(0);
    if ($titleNode === null) {
        return $url;
    }

    // The search and replace arguments were previously such that nothing was ever replaced
    // (empty search string); search for the actual line-break characters.
    $title = trim(str_replace(array("\r\n", "\r", "\n"), ' ', (string) $titleNode->nodeValue));

    return ($title === '') ? $url : $title;
}
|
|
|
|
/**
 * Extract the "keywords" and "description" meta tags of a parsed HTML document.
 *
 * Meta names are matched case-insensitively; when a name appears several times the last
 * one wins. Line breaks inside the values are replaced by spaces.
 *
 * @param   DOMDocument $doc    Parsed document
 * @return  array               array('keywords' => string, 'description' => string);
 *                              description defaults to 'No Description Available', keywords to ''
 */
function getDocMetaData(&$doc)
{
    $metaData = array(
        'description' => 'No Description Available',
        'keywords' => '',
    );

    foreach ($doc->getElementsByTagName('meta') as $node) {
        // Only the two names that are returned matter; metas without a name attribute
        // (property, http-equiv, charset) are ignored.
        $name = strtolower(trim($node->getAttribute('name')));
        if ($name === 'description' || $name === 'keywords') {
            $metaData[$name] = $node->getAttribute('content');
        }
    }

    // The search and replace arguments were previously such that nothing was ever replaced
    // (empty search string); search for the actual line-break characters.
    $lineBreaks = array("\r\n", "\r", "\n");

    return array(
        'keywords' => str_replace($lineBreaks, ' ', $metaData['keywords']),
        'description' => str_replace($lineBreaks, ' ', $metaData['description']),
    );
}
|
|
|
|
|
|
// Start URL: first command-line argument when given, otherwise the default local development instance.
$startUrl = (isset($argv[1]) && $argv[1] !== '') ? $argv[1] : "http://localhost/dolibarr_dev/htdocs";
followLink($startUrl);
|