#!/usr/bin/env php
<?php
/*
 * NOTE(review): the original license/copyright header of this file could not
 * be recovered from the reviewed source (only its closing lines survived).
 * Restore it from the upstream repository before committing.
 *
 * Provenance: commit b10eaf08f186bd4e28f6e474dcb0e2c7b7c13d84
 * "Experiment a spider" by Laurent Destailleur, Tue, 10 Aug 2021 12:47:06 +0200,
 * which creates dev/tools/spider.php.
 */

/**
 *      \file       dev/tools/spider.php
 *      \brief      Script to spider Dolibarr app.
 *
 * Usage: spider.php [start-url]
 * Without argument the crawl starts at http://localhost/dolibarr_dev/htdocs.
 */

// Set of URLs already seen, stored as array keys (url => true) for O(1) lookups.
$crawledLinks = array();

// Deepest recursion level for which pages are still fetched (the start page is depth 0).
const MAX_DEPTH = 2;


/**
 * Fetch a page, record every new link found on it, then crawl those links recursively.
 *
 * NOTE(review): each new link is recorded together with the title and meta data of
 * the page it was FOUND on, not of the page it points to. This is kept as in the
 * original script; confirm whether it is intended.
 *
 * @param	string	$url		Absolute URL of the page to crawl
 * @param	int		$depth		Current recursion depth (0 for the start page)
 * @return	void
 */
function followLink($url, $depth = 0)
{
	global $crawledLinks;

	if (!is_array($crawledLinks)) {
		$crawledLinks = array();
	}

	if ($depth > MAX_DEPTH) {
		echo "\nThe Crawler is giving up!\n";
		return;
	}

	// Mark the page itself as seen so that a link pointing back to it is not crawled again.
	$crawledLinks[$url] = true;

	$html = fetchPage($url);
	if ($html === false || $html === '') {
		// DOMDocument::loadHTML() rejects an empty source, so stop here.
		echo "Could not fetch ".$url."\n";
		return;
	}

	// Real-world HTML is rarely valid: collect libxml parse errors instead of emitting warnings.
	$previousSetting = libxml_use_internal_errors(true);
	$doc = new DOMDocument();
	$doc->loadHTML($html);
	libxml_clear_errors();
	libxml_use_internal_errors($previousSetting);

	$pageTitle = getDocTitle($doc, $url);
	$metaData = getDocMetaData($doc);

	$crawling = array();
	foreach ($doc->getElementsByTagName('a') as $anchor) {
		$link = trim($anchor->getAttribute('href'));
		if (ignoreLink($link)) {
			continue;
		}
		$link = convertLink($url, $link);
		if (!isset($crawledLinks[$link])) {
			$crawledLinks[$link] = true;
			$crawling[] = $link;
			insertIntoDatabase($link, $pageTitle, $metaData, $depth);
		}
	}

	foreach ($crawling as $crawlURL) {
		followLink($crawlURL, $depth + 1);
	}
}

/**
 * Download the content of a URL with the crawler user agent.
 *
 * @param	string			$url	Absolute URL to download
 * @return	string|false			Body of the response, or false if the request failed
 */
function fetchPage($url)
{
	$context = stream_context_create(array(
		'http' => array(
			'method' => 'GET',
			// The option name is "user_agent"; "user-agent" is not a known option.
			'user_agent' => 'gfgBot/0.1',
		),
	));

	// file_get_contents() raises a warning on network errors; the failure is reported
	// to the caller through the false return value instead.
	set_error_handler(function () {
		return true;
	});
	$content = file_get_contents($url, false, $context);
	restore_error_handler();

	return $content;
}

/**
 * Turn the href found on a page into an absolute URL.
 *
 * NOTE(review): plain relative links are appended to the full page URL, not resolved
 * against its directory, and the query string of the page is not stripped. This
 * matches the original behaviour and is only exact when $site is a directory URL.
 *
 * @param	string	$site	Absolute URL of the page containing the link
 * @param	string	$path	Value of the href attribute
 * @return	string			Absolute URL
 */
function convertLink($site, $path)
{
	$parts = parse_url($site);
	if (!is_array($parts)) {
		$parts = array();
	}
	$scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

	// Protocol-relative link ("//host/path"): reuse the scheme of the current page.
	if (strncmp($path, '//', 2) === 0) {
		return $scheme.':'.$path;
	}

	// Link that is already absolute.
	if (strncasecmp($path, 'http://', 7) === 0 || strncasecmp($path, 'https://', 8) === 0) {
		return $path;
	}

	// Host written without scheme: add one, otherwise it would be opened as a local file.
	if (strncasecmp($path, 'www.', 4) === 0) {
		return $scheme.'://'.$path;
	}

	// Root-relative link ("/path"): attach it to the origin of the page, not to its path.
	if ($path !== '' && $path[0] === '/') {
		if (isset($parts['host'])) {
			$origin = $scheme.'://'.$parts['host'];
			if (isset($parts['port'])) {
				$origin .= ':'.$parts['port'];
			}
			return $origin.$path;
		}
		return rtrim($site, '/').$path;
	}

	// Relative link: append to the page URL, avoiding a doubled slash.
	return rtrim($site, '/').'/'.$path;
}

/**
 * Tell whether a href must not be crawled.
 *
 * @param	string	$url	Value of the href attribute
 * @return	boolean			True for empty links, in-page anchors and javascript:, mailto: or tel: links
 */
function ignoreLink($url)
{
	$url = trim((string) $url);

	// Anchors without href give an empty string; "#..." only targets the current page.
	if ($url === '' || $url[0] === '#') {
		return true;
	}

	foreach (array('javascript:', 'mailto:', 'tel:') as $prefix) {
		if (strncasecmp($url, $prefix, strlen($prefix)) === 0) {
			return true;
		}
	}

	return false;
}

/**
 * Report a newly discovered link. Despite its name, this function only prints
 * the record; nothing is stored in a database.
 *
 * @param	string	$link		URL that was discovered
 * @param	string	$title		Title to show for the record
 * @param	array	$metaData	Array with the keys 'description' and 'keywords'
 * @param	int		$depth		Depth at which the link was found (currently not used)
 * @return	void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
	$description = isset($metaData['description']) ? $metaData['description'] : '';
	$keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

	echo "Inserting new record {URL= ".$link.", Title = '$title', Description = '".$description."', Keywords = ' ".$keywords."'}\n\n\n";
}

/**
 * Collapse every run of whitespace (including line breaks) into one space and trim the result.
 *
 * @param	string	$text	Text to flatten
 * @return	string			Text on a single line
 */
function singleLine($text)
{
	return trim(preg_replace('/\s+/', ' ', (string) $text));
}

/**
 * Get the title of a parsed page.
 *
 * @param	DOMDocument	$doc	Parsed document
 * @param	string		$url	URL of the document, used when there is no usable title
 * @return	string				Title on a single line, or $url if the title is missing or empty
 */
function getDocTitle(&$doc, $url)
{
	$titleNodes = $doc->getElementsByTagName('title');
	if ($titleNodes->length == 0) {
		return $url;
	}

	$node = $titleNodes->item(0);
	if ($node === null || !isset($node->nodeValue)) {
		return $url;
	}

	// The original call str_replace('', '\n', ...) had an empty search string and
	// therefore changed nothing; flatten the title to one line instead.
	$title = singleLine($node->nodeValue);

	return ($title === '') ? $url : $title;
}

/**
 * Get the description and keywords declared in the meta tags of a parsed page.
 *
 * @param	DOMDocument	$doc	Parsed document
 * @return	array				Array with the keys 'keywords' and 'description'
 */
function getDocMetaData(&$doc)
{
	$metaData = array(
		'keywords' => '',
		'description' => 'No Description Available',
	);

	foreach ($doc->getElementsByTagName('meta') as $node) {
		// Meta names are matched case-insensitively ("Description" and "description" are the same).
		$name = strtolower(trim($node->getAttribute('name')));
		if ($name === 'keywords' || $name === 'description') {
			$metaData[$name] = singleLine($node->getAttribute('content'));
		}
	}

	return $metaData;
}


// Start URL: first command line argument when it is an http(s) URL, else the local development instance.
$startUrl = 'http://localhost/dolibarr_dev/htdocs';
if (isset($argv[1]) && preg_match('#^https?://#i', $argv[1])) {
	$startUrl = $argv[1];
}

followLink($startUrl);