From c0a0acf1290ef20974a3f7c0e4d02aa0fc8f5cb5 Mon Sep 17 00:00:00 2001 From: ldestailleur Date: Tue, 15 Jul 2025 21:51:50 +0200 Subject: [PATCH] FIX #34746 --- htdocs/core/lib/functions.lib.php | 37 ++++++++++++++++++++++--------- test/phpunit/.gitignore | 1 + test/phpunit/SecurityTest.php | 27 +++++++++++++++++++++- 3 files changed, 54 insertions(+), 11 deletions(-) create mode 100644 test/phpunit/.gitignore diff --git a/htdocs/core/lib/functions.lib.php b/htdocs/core/lib/functions.lib.php index a0da95da416..51d9f061f30 100644 --- a/htdocs/core/lib/functions.lib.php +++ b/htdocs/core/lib/functions.lib.php @@ -8530,14 +8530,21 @@ function dol_string_onlythesehtmlattributes($stringtoclean, $allowed_attributes { if (is_null($allowed_attributes)) { $allowed_attributes = array( - "allow", "allowfullscreen", "alt", "async", "class", "content", "contenteditable", "crossorigin", "data-html", "frameborder", "height", "href", "id", "name", "property", "rel", "src", "style", "target", "title", "type", "width", + "allow", "allowfullscreen", "alt", "async", "class", "contenteditable", "crossorigin", "data-html", "frameborder", "height", "href", "id", "name", "property", "rel", "src", "style", "target", "title", "type", "width", // HTML5 "header", "footer", "nav", "section", "menu", "menuitem" ); } + // Always add content and http-equiv for meta tags, required to force encoding and keep html content in utf8 by load/saveHTML functions. + if (!in_array("content", $allowed_attributes)) { + $allowed_attributes[] = "content"; + } + if (!in_array("http-equiv", $allowed_attributes)) { + $allowed_attributes[] = "http-equiv"; + } if (class_exists('DOMDocument') && !empty($stringtoclean)) { - $stringtoclean = ''.$stringtoclean.''; + $stringtoclean = ''.$stringtoclean.''; // Warning: loadHTML does not support HTML5 on old libxml versions. $dom = new DOMDocument('', 'UTF-8'); @@ -8588,12 +8595,15 @@ function dol_string_onlythesehtmlattributes($stringtoclean, $allowed_attributes } } + $dom->encoding = 'UTF-8'; + $return = $dom->saveHTML(); // This may add a LF at end of lines, so we will trim later //$return = 'aaaa

bb

ssdd

'."\n

aaa

aa

bb

"; $return = preg_replace('/^'.preg_quote('', '/').'/', '', $return); - $return = preg_replace('/^'.preg_quote('', '/').'/', '', $return); - $return = preg_replace('/'.preg_quote('', '/').'$/', '', $return); + $return = preg_replace('/^'.preg_quote('<', '/').'[^<>]*'.preg_quote('>', '/').'/', '', $return); + $return = preg_replace('/'.preg_quote('', '/').'$/', '', trim($return)); + return trim($return); } else { return $stringtoclean; @@ -8765,17 +8775,24 @@ function dol_htmlwithnojs($stringtoencode, $nouseofiframesandbox = 0, $check = ' // like 'abc' that wrongly ends up, without the trick, with '

abc

' if (dol_textishtml($out)) { - $out = '
'.$out.'
'; + $out = '
'.$out.'
'; } else { - $out = '
'.dol_nl2br($out).'
'; + $out = '
'.dol_nl2br($out).'
'; } $dom->loadHTML($out, LIBXML_HTML_NODEFDTD | LIBXML_ERR_NONE | LIBXML_HTML_NOIMPLIED | LIBXML_NONET | LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NOXMLDECL); + + $dom->encoding = 'UTF-8'; + $out = trim($dom->saveHTML()); - // Remove the trick added to solve pb with text without parent tag - $out = preg_replace('/^<\?xml encoding="UTF-8">
/', '', $out); - $out = preg_replace('/<\/div>$/', '', $out); + // Remove the trick added to solve pb with text in utf8 and text without parent tag + $out = preg_replace('/^'.preg_quote('', '/').'/', '', $out); + $out = preg_replace('/^'.preg_quote('<', '/').'[^<>]+'.preg_quote('>
', '/').'/', '', $out); + $out = preg_replace('/'.preg_quote('
', '/').'$/', '', trim($out)); + // $out = preg_replace('/^<\?xml encoding="UTF-8">
/', '', $out); + // $out = preg_replace('/<\/div>$/', '', $out); + // var_dump('rrrrrrrrrrrrrrrrrrrrrrrrrrrrr'.$out); } catch (Exception $e) { // If error, invalid HTML string with no way to clean it //print $e->getMessage(); @@ -8889,7 +8906,7 @@ function dol_htmlwithnojs($stringtoencode, $nouseofiframesandbox = 0, $check = ' $out = preg_replace('/on(repeat|begin|finish|beforeinput)[a-z]*\s*=/i', '', $out); } while ($oldstringtoclean != $out); - // Check the limit of external links that are automatically executed in a Rich text content. We count: + // Check the limit of external links that are automatically executed in a Rich text content. We count: // ', we can only accept " diff --git a/test/phpunit/.gitignore b/test/phpunit/.gitignore new file mode 100644 index 00000000000..647a82afb84 --- /dev/null +++ b/test/phpunit/.gitignore @@ -0,0 +1 @@ +/DemoTest.php diff --git a/test/phpunit/SecurityTest.php b/test/phpunit/SecurityTest.php index 3fdef280fed..1d34886db18 100644 --- a/test/phpunit/SecurityTest.php +++ b/test/phpunit/SecurityTest.php @@ -387,7 +387,8 @@ class SecurityTest extends CommonClassTest { $stringtotest = 'eée'; $decodedstring = dol_string_onlythesehtmlattributes($stringtotest); - $this->assertEquals('eée', $decodedstring, 'Function did not sanitize correctly with test 1'); + //$this->assertEquals('eée', $decodedstring, 'Function did not sanitize correctly with test 1'); + $this->assertEquals('eée', $decodedstring, 'Function did not sanitize correctly with test 1'); $stringtotest = '
abc
'; $decodedstring = dol_string_onlythesehtmlattributes($stringtotest); @@ -1296,6 +1297,30 @@ class SecurityTest extends CommonClassTest { global $conf; + // Test on a string in hindi with MAIN_RESTRICTHTML_REMOVE_ALSO_BAD_ATTRIBUTES because + // in past this case was losing the UTF8. + $conf->global->MAIN_RESTRICTHTML_REMOVE_ALSO_BAD_ATTRIBUTES = 0; + + $result = dol_htmlwithnojs('String in Hindi लेखाकर्म', 0, 'restricthtml'); + print __METHOD__." result=".$result."\n"; + $this->assertEquals('String in Hindi लेखाकर्म', $result, 'Test js sanitizing a Hindi string is ko'); + + $conf->global->MAIN_RESTRICTHTML_REMOVE_ALSO_BAD_ATTRIBUTES = 1; + + $result = dol_htmlwithnojs('String in Hindi लेखाकर्म', 0, 'restricthtml'); + print __METHOD__." result=".$result."\n"; + $this->assertEquals('String in Hindi लेखाकर्म', $result, 'Test js sanitizing a Hindi string is ko'); + + $conf->global->MAIN_RESTRICTHTML_REMOVE_ALSO_BAD_ATTRIBUTES = 1; + $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = 1; + $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = 1; + + $result = dol_htmlwithnojs('String in Hindi लेखाकर्म', 0, 'restricthtml'); + print __METHOD__." result=".$result."\n"; + $this->assertEquals('String in Hindi लेखाकर्म', $result, 'Test js sanitizing a Hindi string is ko'); + + + $conf->global->MAIN_RESTRICTHTML_REMOVE_ALSO_BAD_ATTRIBUTES = 0; // If we set this to 1, it will also convert emoticon in htmlentities, so tests must be modified.