2
0
forked from Wavyzz/dolibarr

Can index with docling

This commit is contained in:
Laurent Destailleur
2024-11-09 03:20:15 +01:00
parent ec7ea921d4
commit e1a7681565
2 changed files with 51 additions and 17 deletions

View File

@@ -2193,23 +2193,50 @@ function addFileIntoDatabaseIndex($dir, $file, $fullpathorig = '', $mode = 'uplo
// TODO Develop a native PHP parser using sample code in https://github.com/adeel/php-pdf-parser
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
$utils = new Utils($db);
$outputfile = $conf->admin->dir_temp.'/tmppdttotext.'.$user->id.'.out'; // File used with popen method
// Use the pdftotext method to generate HTML output
if (preg_match('/pdftotext/i', $useFullTextIndexation)) {
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
$utils = new Utils($db);
$outputfile = $conf->admin->dir_temp.'/tmppdftotext.'.$user->id.'.out'; // File used with popen method
// We also exclude '/temp/' dir and 'documents/admin/documents'
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_PDFTOTEXT', 'pdftotext')." -htmlmeta '".escapeshellcmd($filetoprocess)."' - ";
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
// We also exclude '/temp/' dir and 'documents/admin/documents'
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_PDFTOTEXT', 'pdftotext')." -htmlmeta '".escapeshellcmd($filetoprocess)."' - ";
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
if (!$result['error']) {
$txt = $result['output'];
$matches = array();
if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
$keywords = $matches[1];
if (!$result['error']) {
$txt = $result['output'];
$matches = array();
if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
$keywords = $matches[1];
}
if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
$textforfulltextindex = dol_string_nospecial($matches[1]);
}
}
if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
$textforfulltextindex = dol_string_nospecial($matches[1]);
}
// Use the docling method to convert the file to plain text (https://ds4sd.github.io/docling/)
if (preg_match('/docling/i', $useFullTextIndexation)) {
include_once DOL_DOCUMENT_ROOT.'/core/class/utils.class.php';
$utils = new Utils($db);
$outputfile = $conf->admin->dir_temp.'/tmpdocling.'.$user->id.'.out'; // File used with popen method
// We also exclude '/temp/' dir and 'documents/admin/documents'
// We do the escaping here and call executeCLI without escaping because we don't want to have the '*.log' escaped.
$cmd = getDolGlobalString('MAIN_USE_FULL_TEXT_INDEXATION_DOCLING', 'docling')." '".escapeshellcmd($filetoprocess)."' --to text ";
$result = $utils->executeCLI($cmd, $outputfile, 0, null, 1);
if (!$result['error']) {
$txt = $result['output'];
//$matches = array();
//if (preg_match('/<meta name="Keywords" content="([^\/]+)"\s*\/>/i', $txt, $matches)) {
// $keywords = $matches[1];
//}
//if (preg_match('/<pre>(.*)<\/pre>/si', $txt, $matches)) {
// $textforfulltextindex = dol_string_nospecial($matches[1]);
//}
$textforfulltextindex = $txt;
}
}
}