#!/usr/bin/env php
<?php
// NOTE(review): the original license/copyright comment appears to be truncated in
// this copy (only its closing part survived) - restore it from version control.

/**
 * \file    dev/tools/spider.php
 * \brief   Script to spider Dolibarr app.
 *
 * To use it:
 * - Disable module "bookmark"
 * - Exclude param optioncss, token, sortfield, sortorder
 */

// Every link discovered so far; shared by all recursive followLink() calls.
$crawledLinks = array();

// Deepest recursion level followLink() accepts (the start page is depth 0).
const MAX_DEPTH = 2;

/**
 * Fetch a page, report every new link found on it, then crawl those links recursively.
 *
 * Each new link is reported together with the title and meta data of the page
 * on which it was found (not of the page it points to).
 *
 * @param   string  $url    URL of the page to fetch
 * @param   int     $depth  Current recursion depth (0 for the start page)
 * @return  void
 */
function followLink($url, $depth = 0)
{
    global $crawledLinks;

    if ($depth > MAX_DEPTH) {
        echo "\nThe Crawler is giving up!\n";
        return;
    }

    $options = array(
        'http' => array(
            'method' => "GET",
            // The context option is spelled "user_agent"; the former "user-agent"
            // key was silently ignored by the HTTP stream wrapper.
            'user_agent' => "gfgBot/0.1",
        ),
    );
    $context = stream_context_create($options);

    $html = file_get_contents($url, false, $context);
    if ($html === false || $html === '') {
        // Unreachable or empty page: file_get_contents() already emitted a warning.
        // Skip it, because DOMDocument::loadHTML() rejects an empty source.
        return;
    }

    $doc = new DOMDocument();
    // Real-world HTML is rarely valid: parser warnings are silenced on purpose.
    @$doc->loadHTML($html);

    $pageTitle = getDocTitle($doc, $url);
    $metaData = getDocMetaData($doc);

    // Links found on this page that were never seen before.
    $crawling = array();
    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $href = trim($anchor->getAttribute('href'));
        if (ignoreLink($href)) {
            continue;
        }

        $link = convertLink($url, $href);
        if (!in_array($link, $crawledLinks, true)) {
            $crawledLinks[] = $link;
            $crawling[] = $link;
            insertIntoDatabase($link, $pageTitle, $metaData, $depth);
        }
    }

    foreach ($crawling as $crawlURL) {
        followLink($crawlURL, $depth + 1);
    }
}

/**
 * Turn the href found on a page into a URL that can be fetched.
 *
 * - "//host/path" (protocol-relative) gets the scheme of $site ("http" when $site has none).
 * - "http://", "https://" and "www." links are returned unchanged.
 * - "/path" (root-relative) is resolved against the scheme/host/port of $site.
 * - Anything else is appended to $site, separated by a single "/".
 *
 * @param   string  $site   URL of the page on which the link was found
 * @param   string  $path   Value of the href attribute
 * @return  string          URL to crawl
 */
function convertLink($site, $path)
{
    if (strncmp($path, '//', 2) === 0) {
        $scheme = parse_url($site, PHP_URL_SCHEME);
        // The ':' was missing before, producing URLs such as "http//host/path".
        return ($scheme ? $scheme : 'http').':'.$path;
    }

    if (strncasecmp($path, 'http://', 7) === 0
        || strncasecmp($path, 'https://', 8) === 0
        || strncasecmp($path, 'www.', 4) === 0
    ) {
        return $path;
    }

    if (strncmp($path, '/', 1) === 0) {
        $parts = parse_url($site);
        if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
            $origin = $parts['scheme'].'://'.$parts['host'];
            if (isset($parts['port'])) {
                $origin .= ':'.$parts['port'];
            }
            return $origin.$path;
        }
    }

    // Relative link (or unparsable $site): append, avoiding a doubled slash.
    return rtrim($site, '/').'/'.ltrim($path, '/');
}

/**
 * Tell whether a href must be skipped by the crawler.
 *
 * Skipped: empty values, in-page fragments ("#...") and the non-fetchable
 * "javascript:", "mailto:" and "tel:" pseudo-links (matched case-insensitively).
 *
 * @param   string  $url    Value of the href attribute
 * @return  boolean         True when the link must not be crawled
 */
function ignoreLink($url)
{
    $url = trim((string) $url);
    // Check emptiness first: reading $url[0] on '' raises a warning.
    if ($url === '' || $url[0] === '#') {
        return true;
    }

    foreach (array('javascript:', 'mailto:', 'tel:') as $prefix) {
        if (strncasecmp($url, $prefix, strlen($prefix)) === 0) {
            return true;
        }
    }

    return false;
}

/**
 * Report a newly discovered link on standard output.
 *
 * Despite its name, this function does not touch any database: it only prints.
 *
 * @param   string  $link       URL of the discovered link
 * @param   string  $title      Title reported with the link
 * @param   array   $metaData   Array with the keys 'description' and 'keywords'
 * @param   int     $depth      Depth at which the link was found (currently not printed)
 * @return  void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
    echo "Inserting new record {URL= ".$link.", Title = '$title', Description = '".$metaData['description']."', Keywords = ' ".$metaData['keywords']."'}\n\n\n";
}

/**
 * Return the title of a document, or its URL when it has no usable title.
 *
 * Line breaks are removed from the title. The previous
 * str_replace('', '\n', ...) call had its arguments swapped and never changed anything.
 *
 * @param   DOMDocument $doc    Parsed document
 * @param   string      $url    URL of the document, used as fallback
 * @return  string              Title, or $url when the title is missing or empty
 */
function getDocTitle(&$doc, $url)
{
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length === 0) {
        return $url;
    }

    $title = str_replace(array("\r", "\n"), '', (string) $titleNodes->item(0)->nodeValue);

    return ($title === '') ? $url : $title;
}

/**
 * Extract the "keywords" and "description" meta tags of a document.
 *
 * Meta names are compared case-insensitively, meta tags without a name are
 * ignored, and line breaks are removed from both values.
 *
 * @param   DOMDocument $doc    Parsed document
 * @return  array               Array with the keys 'keywords' (default '') and
 *                              'description' (default 'No Description Available')
 */
function getDocMetaData(&$doc)
{
    $metaData = array(
        'keywords' => '',
        'description' => 'No Description Available',
    );

    foreach ($doc->getElementsByTagName('meta') as $node) {
        $name = strtolower(trim($node->getAttribute('name')));
        if ($name === 'keywords' || $name === 'description') {
            $metaData[$name] = $node->getAttribute('content');
        }
    }

    $lineBreaks = array("\r", "\n");

    return array(
        'keywords' => str_replace($lineBreaks, '', $metaData['keywords']),
        'description' => str_replace($lineBreaks, '', $metaData['description']),
    );
}

followLink("http://localhost/dolibarr_dev/htdocs");