From 5ce90555eec3eb8905dd0157d8d7ec5698195e4e Mon Sep 17 00:00:00 2001 From: Laurent Destailleur Date: Thu, 21 Dec 2023 18:29:55 +0100 Subject: [PATCH] Removed useless file #yogosha19667 --- dev/tools/spider.php | 156 ------------------------------------------- 1 file changed, 156 deletions(-) delete mode 100644 dev/tools/spider.php diff --git a/dev/tools/spider.php b/dev/tools/spider.php deleted file mode 100644 index 28871122e2c..00000000000 --- a/dev/tools/spider.php +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env php -. - */ - -/** - * \file dev/tools/spider.php - * \brief Script to spider Dolibarr app. - * - * To use it: - * - Disable module "bookmark" - * - Exclude param optioncss, token, sortfield, sortorder - */ - -$crawledLinks = array(); -const MAX_DEPTH = 2; - - -/** - * @param string $url URL - * @param string $depth Depth - * @return string String - */ -function followLink($url, $depth = 0) -{ - global $crawledLinks; - $crawling = array(); - if ($depth > MAX_DEPTH) { - echo "
The Crawler is giving up!
"; - return; - } - $options = array( - 'http' => array( - 'method' => "GET", - 'user-agent' => "gfgBot/0.1\n" - ) - ); - $context = stream_context_create($options); - $doc = new DomDocument(); - @$doc->loadHTML(file_get_contents($url, false, $context)); - $links = $doc->getElementsByTagName('a'); - $pageTitle = getDocTitle($doc, $url); - $metaData = getDocMetaData($doc); - foreach ($links as $i) { - $link = $i->getAttribute('href'); - if (ignoreLink($link)) { - continue; - } - $link = convertLink($url, $link); - if (!in_array($link, $crawledLinks)) { - $crawledLinks[] = $link; - $crawling[] = $link; - insertIntoDatabase($link, $pageTitle, $metaData, $depth); - } - } - foreach ($crawling as $crawlURL) { - followLink($crawlURL, $depth + 1); - } -} - -/** - * @param string $site Site - * @param string $path Path - * @return string String - */ -function convertLink($site, $path) -{ - if (substr_compare($path, "//", 0, 2) == 0) { - return parse_url($site)['scheme'].$path; - } elseif (substr_compare($path, "http://", 0, 7) == 0 - or substr_compare($path, "https://", 0, 8) == 0 - or substr_compare($path, "www.", 0, 4) == 0 - ) { - return $path; - } else { - return $site.'/'.$path; - } -} - -/** - * @param string $url URL - * @return boolean - */ -function ignoreLink($url) -{ - return $url[0] == "#" or substr($url, 0, 11) == "javascript:"; -} - -/** - * @param string $link URL - * @param string $title Title - * @param string $metaData Array - * @param int $depth Depth - * @return void - */ -function insertIntoDatabase($link, $title, &$metaData, $depth) -{ - //global $crawledLinks; - - echo "Inserting new record {URL= ".$link.", Title = '$title', Description = '".$metaData['description']."', Keywords = ' ".$metaData['keywords']."'}


"; - - //²$crawledLinks[]=$link; -} - -/** - * @param string $doc Doc - * @param string $url URL - * @return string URL/Title - */ -function getDocTitle(&$doc, $url) -{ - $titleNodes = $doc->getElementsByTagName('title'); - if (count($titleNodes) == 0 or !isset($titleNodes[0]->nodeValue)) { - return $url; - } - $title = str_replace('', '\n', $titleNodes[0]->nodeValue); - return (strlen($title) < 1) ? $url : $title; -} - -/** - * @param string $doc Doc - * @return array Array - */ -function getDocMetaData(&$doc) -{ - $metaData = array(); - $metaNodes = $doc->getElementsByTagName('meta'); - foreach ($metaNodes as $node) { - $metaData[$node->getAttribute("name")] = $node->getAttribute("content"); - } - if (!isset($metaData['description'])) { - $metaData['description'] = 'No Description Available'; - } - if (!isset($metaData['keywords'])) { - $metaData['keywords'] = ''; - } - return array( - 'keywords' => str_replace('', '\n', $metaData['keywords']), - 'description' => str_replace('', '\n', $metaData['description']) - ); -} - - -followLink("http://localhost/dolibarr_dev/htdocs");