forked from Wavyzz/dolibarr
The website grabber rocks
This commit is contained in:
@@ -180,6 +180,8 @@ if ($action == 'addsite')
|
|||||||
// Add page
|
// Add page
|
||||||
if ($action == 'add')
|
if ($action == 'add')
|
||||||
{
|
{
|
||||||
|
dol_mkdir($pathofwebsite);
|
||||||
|
|
||||||
$db->begin();
|
$db->begin();
|
||||||
|
|
||||||
$objectpage->fk_website = $object->id;
|
$objectpage->fk_website = $object->id;
|
||||||
@@ -240,20 +242,25 @@ if ($action == 'add')
|
|||||||
}
|
}
|
||||||
|
|
||||||
$objectpage->content = $tmp['content'];
|
$objectpage->content = $tmp['content'];
|
||||||
$objectpage->content = preg_replace('/^.*<body[^>]*>/ims', '', $objectpage->content);
|
$objectpage->content = preg_replace('/^.*<body(\s[^>]*)*>/ims', '', $objectpage->content);
|
||||||
$objectpage->content = preg_replace('/<\/body[^>]*>.*$/ims', '', $objectpage->content);
|
$objectpage->content = preg_replace('/<\/body(\s[^>]*)*>.*$/ims', '', $objectpage->content);
|
||||||
|
|
||||||
|
$absoluteurlinaction=$urltograbdirwithoutslash;
|
||||||
|
// TODO Replace 'action="$urltograbdirwithoutslash' into action="/"
|
||||||
|
// TODO Replace 'action="$urltograbdirwithoutslash..."' into action="..."
|
||||||
|
// TODO Replace 'a href="$urltograbdirwithoutslash' into a href="/"
|
||||||
|
// TODO Replace 'a href="$urltograbdirwithoutslash..."' into a href="..."
|
||||||
|
|
||||||
// Now loop to fetch all css files. Include them inline into header of page
|
// Now loop to fetch all css files. Include them inline into header of page
|
||||||
$objectpage->htmlheader = $tmp['content'];
|
$objectpage->htmlheader = $tmp['content'];
|
||||||
$objectpage->htmlheader = preg_replace('/^.*<head[^>]*>/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/^.*<head(\s[^>]*)*>/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<\/head[^>]*>.*$/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<\/head(\s[^>]*)*>.*$/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<base[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<base(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<meta name="robot[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<meta name="robot(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<meta name="keywords[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<meta name="keywords(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<meta name="title[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<meta name="title(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<meta name="description[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<meta name="description(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<meta name="generator[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<meta name="generator(\s[^>]*)*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
//$objectpage->htmlheader = preg_replace('/<meta name="verify-v1[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
//$objectpage->htmlheader = preg_replace('/<meta name="verify-v1[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
//$objectpage->htmlheader = preg_replace('/<meta name="msvalidate.01[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
//$objectpage->htmlheader = preg_replace('/<meta name="msvalidate.01[^>]*>\n*/ims', '', $objectpage->htmlheader);
|
||||||
$objectpage->htmlheader = preg_replace('/<title>[^<]*<\/title>\n*/ims', '', $objectpage->htmlheader);
|
$objectpage->htmlheader = preg_replace('/<title>[^<]*<\/title>\n*/ims', '', $objectpage->htmlheader);
|
||||||
@@ -262,12 +269,14 @@ if ($action == 'add')
|
|||||||
// Now loop to fetch JS
|
// Now loop to fetch JS
|
||||||
$tmp = $objectpage->htmlheader;
|
$tmp = $objectpage->htmlheader;
|
||||||
|
|
||||||
preg_match_all('/<script([^\.]+)src="([^>"]+)"([^>]*)><\/script>/i', $objectpage->htmlheader, $regs);
|
preg_match_all('/<script([^\.>]+)src=["\']([^"\'>]+)["\']([^>]*)><\/script>/i', $objectpage->htmlheader, $regs);
|
||||||
foreach ($regs[0] as $key => $val)
|
foreach ($regs[0] as $key => $val)
|
||||||
{
|
{
|
||||||
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
dol_syslog("We will grab the resource ".$regs[2][$key]);
|
||||||
|
|
||||||
$linkwithoutdomain = $regs[2][$key];
|
$linkwithoutdomain = $regs[2][$key];
|
||||||
|
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
||||||
|
|
||||||
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
||||||
if (preg_match('/^http/', $regs[2][$key]))
|
if (preg_match('/^http/', $regs[2][$key]))
|
||||||
{
|
{
|
||||||
@@ -275,6 +284,15 @@ if ($action == 'add')
|
|||||||
$linkwithoutdomain = preg_replace('/^https?:\/\/[^\/]+\//i', '', $regs[2][$key]);
|
$linkwithoutdomain = preg_replace('/^https?:\/\/[^\/]+\//i', '', $regs[2][$key]);
|
||||||
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//print $domaintograb.' - '.$domaintograbbis.' - '.$urltograbdirwithoutslash.' - ';
|
||||||
|
//print $linkwithoutdomain.' - '.$urltograbbis."<br>\n";
|
||||||
|
|
||||||
|
// Skip this resource if its domain differs from the domain of the grabbed website: external resources are not loaded
|
||||||
|
$domaintograb = getDomainFromURL($urltograbdirwithoutslash);
|
||||||
|
$domaintograbbis = getDomainFromURL($urltograbbis);
|
||||||
|
if ($domaintograb != $domaintograbbis) continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
$tmpgeturl = getURLContent($urltograbbis);
|
$tmpgeturl = getURLContent($urltograbbis);
|
||||||
if ($tmpgeturl['curl_error_no'])
|
if ($tmpgeturl['curl_error_no'])
|
||||||
@@ -293,9 +311,9 @@ if ($action == 'add')
|
|||||||
if (! empty($conf->global->MAIN_UMASK))
|
if (! empty($conf->global->MAIN_UMASK))
|
||||||
@chmod($file, octdec($conf->global->MAIN_UMASK));
|
@chmod($file, octdec($conf->global->MAIN_UMASK));
|
||||||
}
|
}
|
||||||
|
|
||||||
$filename = 'image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
//$filename = 'image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
||||||
$tmp = preg_replace('/'.preg_quote($regs[0][$key],'/').'/i', '', $tmp);
|
$tmp = preg_replace('/'.preg_quote($regs[0][$key],'/').'/i', '', $tmp);
|
||||||
}
|
}
|
||||||
$objectpage->htmlheader = trim($tmp);
|
$objectpage->htmlheader = trim($tmp);
|
||||||
@@ -304,11 +322,14 @@ if ($action == 'add')
|
|||||||
// Now loop to fetch CSS
|
// Now loop to fetch CSS
|
||||||
$pagecsscontent = "\n".'<style>'."\n";
|
$pagecsscontent = "\n".'<style>'."\n";
|
||||||
|
|
||||||
preg_match_all('/<link([^\.]+)href="([^>"]+\.css)"([^>]*)>/i', $objectpage->htmlheader, $regs);
|
preg_match_all('/<link([^\.>]+)href=["\']([^"\'>]+\.css[^"\'>]*)["\']([^>]*)>/i', $objectpage->htmlheader, $regs);
|
||||||
foreach ($regs[0] as $key => $val)
|
foreach ($regs[0] as $key => $val)
|
||||||
{
|
{
|
||||||
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
dol_syslog("We will grab the resource ".$regs[2][$key]);
|
||||||
|
|
||||||
$linkwithoutdomain = $regs[2][$key];
|
$linkwithoutdomain = $regs[2][$key];
|
||||||
|
$urltograbbis = $urltograbdirwithoutslash.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
||||||
|
|
||||||
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
|
||||||
if (preg_match('/^http/', $regs[2][$key]))
|
if (preg_match('/^http/', $regs[2][$key]))
|
||||||
{
|
{
|
||||||
@@ -317,6 +338,14 @@ if ($action == 'add')
|
|||||||
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
//$filetosave = $conf->medias->multidir_output[$conf->entity].'/css/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//print $domaintograb.' - '.$domaintograbbis.' - '.$urltograbdirwithoutslash.' - ';
|
||||||
|
//print $linkwithoutdomain.' - '.$urltograbbis."<br>\n";
|
||||||
|
|
||||||
|
// Skip this resource if its domain differs from the domain of the grabbed website: external resources are not loaded
|
||||||
|
$domaintograb = getDomainFromURL($urltograbdirwithoutslash);
|
||||||
|
$domaintograbbis = getDomainFromURL($urltograbbis);
|
||||||
|
if ($domaintograb != $domaintograbbis) continue;
|
||||||
|
|
||||||
$tmpgeturl = getURLContent($urltograbbis);
|
$tmpgeturl = getURLContent($urltograbbis);
|
||||||
if ($tmpgeturl['curl_error_no'])
|
if ($tmpgeturl['curl_error_no'])
|
||||||
{
|
{
|
||||||
@@ -448,6 +477,16 @@ if ($action == 'add')
|
|||||||
|
|
||||||
// To generate the CSS, robot and htmlheader file.
|
// To generate the CSS, robot and htmlheader file.
|
||||||
|
|
||||||
|
// Check that the symlink to the medias directory exists inside the website directory, and create it if it is missing
|
||||||
|
$pathtomedias=DOL_DATA_ROOT.'/medias';
|
||||||
|
$pathtomediasinwebsite=$pathofwebsite.'/medias';
|
||||||
|
if (! is_link(dol_osencode($pathtomediasinwebsite)))
|
||||||
|
{
|
||||||
|
dol_syslog("Create symlink for ".$pathtomedias." into name ".$pathtomediasinwebsite);
|
||||||
|
dol_mkdir(dirname($pathtomediasinwebsite)); // To be sure dir for website exists
|
||||||
|
$result = symlink($pathtomedias, $pathtomediasinwebsite);
|
||||||
|
}
|
||||||
|
|
||||||
if (! dol_is_file($filehtmlheader))
|
if (! dol_is_file($filehtmlheader))
|
||||||
{
|
{
|
||||||
$htmlheadercontent = "<!-- HTML header content (common for all pages) -->";
|
$htmlheadercontent = "<!-- HTML header content (common for all pages) -->";
|
||||||
|
|||||||
Reference in New Issue
Block a user