forked from Wavyzz/dolibarr
Can index with docling
This commit is contained in:
@@ -2193,23 +2193,50 @@ function addFileIntoDatabaseIndex($dir, $file, $fullpathorig = '', $mode = 'uplo
|
||||
|
||||
// TODO Develop a native PHP parser using sample code in https://github.com/adeel/php-pdf-parser
|
||||
|
||||
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
|
||||
$utils = new Utils($db);
|
||||
$outputfile = $conf->admin->dir_temp.'/tmppdttotext.'.$user->id.'.out'; // File used with popen method
|
||||
// Use the pdftotext method to generate HTML output
|
||||
if (preg_match('/pdftotext/i', $useFullTextIndexation)) {
|
||||
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
|
||||
$utils = new Utils($db);
|
||||
$outputfile = $conf->admin->dir_temp.'/tmppdftotext.'.$user->id.'.out'; // File used with popen method
|
||||
|
||||
// We also exclude '/temp/' dir and 'documents/admin/documents'
|
||||
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
|
||||
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_PDFTOTEXT', 'pdftotext')." -htmlmeta '".escapeshellcmd($filetoprocess)."' - ";
|
||||
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
|
||||
// We also exclude '/temp/' dir and 'documents/admin/documents'
|
||||
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
|
||||
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_PDFTOTEXT', 'pdftotext')." -htmlmeta '".escapeshellcmd($filetoprocess)."' - ";
|
||||
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
|
||||
|
||||
if (!$result['error']) {
|
||||
$txt = $result['output'];
|
||||
$matches = array();
|
||||
if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
|
||||
$keywords = $matches[1];
|
||||
if (!$result['error']) {
|
||||
$txt = $result['output'];
|
||||
$matches = array();
|
||||
if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
|
||||
$keywords = $matches[1];
|
||||
}
|
||||
if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
|
||||
$textforfulltextindex = dol_string_nospecial($matches[1]);
|
||||
}
|
||||
}
|
||||
if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
|
||||
$textforfulltextindex = dol_string_nospecial($matches[1]);
|
||||
}
|
||||
|
||||
// Use the docling method to generate plain text output (https://ds4sd.github.io/docling/)
|
||||
if (preg_match('/docling/i', $useFullTextIndexation)) {
|
||||
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
|
||||
$utils = new Utils($db);
|
||||
$outputfile = $conf->admin->dir_temp.'/tmpdocling.'.$user->id.'.out'; // File used with popen method
|
||||
|
||||
// We also exclude '/temp/' dir and 'documents/admin/documents'
|
||||
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
|
||||
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_DOCLING', 'docling')." '".escapeshellcmd($filetoprocess)."' --to text ";
|
||||
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
|
||||
|
||||
if (!$result['error']) {
|
||||
$txt = $result['output'];
|
||||
//$matches = array();
|
||||
//if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
|
||||
// $keywords = $matches[1];
|
||||
//}
|
||||
//if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
|
||||
// $textforfulltextindex = dol_string_nospecial($matches[1]);
|
||||
//}
|
||||
$textforfulltextindex = $txt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user