From 3bd4d9a1dd32fe05fe32a5e452096e1076fd9e78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20B=C3=BCrk?=
Date: Mon, 17 Jul 2023 14:59:37 +0200
Subject: [PATCH] [BUGFIX] Respect language based style names on reading Word files

Microsoft Office saves Office documents with language-based style
mappings for the default styles. For example, if a German-based Word
version is used, it writes the following to `word/styles.xml` in the
container archive (*.docx):

```
<w:style w:type="paragraph" w:styleId="berschrift1">
    <w:name w:val="heading 1"/>
    ....
</w:style>
```

versus for an English-based version it would be:

```
<w:style w:type="paragraph" w:styleId="Heading1">
    <w:name w:val="heading 1"/>
    ...
</w:style>
```

The value of `<w:name>` defines the internal native code identifier,
whereas the `w:styleId` attribute on the outer `<w:style>` tag
describes the virtual or alias name.

Later parsing of the document structure, for example the paragraphs,
references the alias (`w:styleId`) name of a style. The reader code
matches style names with hardcoded, case-insensitive regular
expressions built on the English variant (`Heading\s*(\d)`) and applies
them to the language-based alias, which does not match at all.

Therefore, multiple tasks need to be done and are contained in this
change:

* An alias map is implemented and used to register title aliases.
  Along with this, a corresponding lookup method is added.
* Use the lookup method to resolve the alias wherever the hardcoded
  English RegEx needs to be used.
* Gather all style alias names for all possible styles while reading
  the styles settings of the Word file.
---
 src/PhpWord/Reader/Word2007/AbstractPart.php | 15 ++++--
 src/PhpWord/Reader/Word2007/Styles.php       | 29 ++++++++---
 src/PhpWord/Style.php                        | 54 ++++++++++++++++++++
 3 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/src/PhpWord/Reader/Word2007/AbstractPart.php b/src/PhpWord/Reader/Word2007/AbstractPart.php
index 95799387ed..b64c405a01 100644
--- a/src/PhpWord/Reader/Word2007/AbstractPart.php
+++ b/src/PhpWord/Reader/Word2007/AbstractPart.php
@@ -28,6 +28,7 @@
 use PhpOffice\PhpWord\Element\TrackChange;
 use PhpOffice\PhpWord\PhpWord;
 use PhpOffice\PhpWord\Shared\XMLReader;
+use PhpOffice\PhpWord\Style;
 
 /**
  * Abstract part reader.
@@ -290,14 +291,20 @@ protected function readParagraph(XMLReader $xmlReader, DOMElement $domNode, $par
     private function getHeadingDepth(?array $paragraphStyle = null)
     {
         if (is_array($paragraphStyle) && isset($paragraphStyle['styleName'])) {
-            if ('Title' === $paragraphStyle['styleName']) {
+            // Title styles have a special handling in the styles.xml loading and registration, therefore we need to
+            // resolve the (possibly localized) style id through the alias map before checking the style name.
+            /** @see Style::addTitleStyle() */
+            /** @see Styles::read() */
+            $checkStyleName = Style::findAliasForStyleName($paragraphStyle['styleName']);
+            // Title does not have a depth, early return.
+            if ('Title' === $checkStyleName) {
                 return 0;
             }
 
-            $headingMatches = [];
-            preg_match('/Heading(\d)/', $paragraphStyle['styleName'], $headingMatches);
+            // We need to support multiple variants here: 'Heading 1', 'Heading_1', 'Heading1'
+            preg_match('/Heading([_\s]*)(\d)/', $checkStyleName, $headingMatches);
             if (!empty($headingMatches)) {
-                return $headingMatches[1];
+                return $headingMatches[2];
             }
         }
 
diff --git a/src/PhpWord/Reader/Word2007/Styles.php b/src/PhpWord/Reader/Word2007/Styles.php
index 760adf9493..188f31b2c3 100644
--- a/src/PhpWord/Reader/Word2007/Styles.php
+++ b/src/PhpWord/Reader/Word2007/Styles.php
@@ -19,6 +19,7 @@
 
 use PhpOffice\PhpWord\PhpWord;
 use PhpOffice\PhpWord\Shared\XMLReader;
+use PhpOffice\PhpWord\Style;
 use PhpOffice\PhpWord\Style\Language;
 
 /**
@@ -65,8 +66,17 @@ public function read(PhpWord $phpWord): void
             foreach ($nodes as $node) {
                 $type = $xmlReader->getAttribute('w:type', $node);
                 $name = $xmlReader->getAttribute('w:val', $node, 'w:name');
-                if (null === $name) {
-                    $name = $xmlReader->getAttribute('w:styleId', $node);
+                $alias = $xmlReader->getAttribute('w:styleId', $node);
+                if (null === $name && null === $alias) {
+                    // Neither name nor alias: skip it, as matching would not be possible otherwise.
+                    continue;
+                }
+                if (null === $name && null !== $alias) {
+                    // Fully custom style, use the alias as name.
+                    $name = $alias;
+                }
+                if (null !== $name && null === $alias) {
+                    $alias = $name;
                 }
                 $headingMatches = [];
                 preg_match('/Heading\s*(\d)/i', $name, $headingMatches);
@@ -76,14 +86,17 @@ public function read(PhpWord $phpWord): void
                         $paragraphStyle = $this->readParagraphStyle($xmlReader, $node);
                         $fontStyle = $this->readFontStyle($xmlReader, $node);
                         if (!empty($headingMatches)) {
-                            $phpWord->addTitleStyle($headingMatches[1], $fontStyle, $paragraphStyle);
+                            $titleStyleName = $phpWord->addTitleStyle($headingMatches[1], $fontStyle, $paragraphStyle)->getStyleName();
+                            Style::addStyleNameAlias($alias, $titleStyleName);
                         } else {
                             if (empty($fontStyle)) {
                                 if (is_array($paragraphStyle)) {
-                                    $phpWord->addParagraphStyle($name, $paragraphStyle);
+                                    $paragraphStyleName = $phpWord->addParagraphStyle($name, $paragraphStyle)->getStyleName();
+                                    Style::addStyleNameAlias($alias, $paragraphStyleName);
                                 }
                             } else {
-                                $phpWord->addFontStyle($name, $fontStyle, $paragraphStyle);
+                                $fontStyleName = $phpWord->addFontStyle($name, $fontStyle, $paragraphStyle)->getStyleName();
+                                Style::addStyleNameAlias($alias, $fontStyleName);
                             }
                         }
 
@@ -91,14 +104,16 @@ public function read(PhpWord $phpWord): void
                     case 'character':
                         $fontStyle = $this->readFontStyle($xmlReader, $node);
                         if (!empty($fontStyle)) {
-                            $phpWord->addFontStyle($name, $fontStyle);
+                            $fontStyleName = $phpWord->addFontStyle($name, $fontStyle)->getStyleName();
+                            Style::addStyleNameAlias($alias, $fontStyleName);
                         }
 
                         break;
                     case 'table':
                         $tStyle = $this->readTableStyle($xmlReader, $node);
                         if (!empty($tStyle)) {
-                            $phpWord->addTableStyle($name, $tStyle);
+                            $tableStyleName = $phpWord->addTableStyle($name, $tStyle)->getStyleName();
+                            Style::addStyleNameAlias($alias, $tableStyleName);
                         }
 
                         break;
diff --git a/src/PhpWord/Style.php b/src/PhpWord/Style.php
index ea039622df..4683773dd7 100644
--- a/src/PhpWord/Style.php
+++ b/src/PhpWord/Style.php
@@ -35,6 +35,13 @@ class Style
      */
     private static $styles = [];
 
+    /**
+     * Mapping of a style alias (e.g. a localized `w:styleId`) to the registered style name.
+     *
+     * @var array
+     */
+    private static $nameToIdentifierMapping = [];
+
     /**
      * Add paragraph style.
      *
@@ -124,6 +131,52 @@ public static function addTableStyle($styleName, $styleTable, $styleFirstRow = n
         return self::setStyleValues($styleName, new Table($styleTable, $styleFirstRow), null);
     }
 
+    /**
+     * Register $alias as an alternative name that resolves to the registered $styleName.
+     *
+     * @param string $alias
+     * @param string $styleName
+     *
+     * @see self::findStyleNameForAlias()
+     */
+    public static function addStyleNameAlias($alias, $styleName): void
+    {
+        self::$nameToIdentifierMapping[$alias] = $styleName;
+    }
+
+    /**
+     * Find the style name for a specified $alias. If $alias is itself a registered style name, it is returned.
+     * Otherwise the style name mapped to the alias is returned, or an empty string if nothing could be found.
+     *
+     * @param string $alias
+     *
+     * @return string
+     */
+    public static function findStyleNameForAlias($alias)
+    {
+        // A value of the map is already a registered style name: hand it back unchanged.
+        if (in_array($alias, self::$nameToIdentifierMapping, true)) {
+            return $alias;
+        }
+
+        // Otherwise resolve the alias; unknown aliases yield an empty string.
+        return self::$nameToIdentifierMapping[$alias] ?? '';
+    }
+
+    /**
+     * Resolves $styleName through the alias map; returns $styleName itself if no non-empty mapping exists.
+     *
+     * @param string $styleName
+     *
+     * @return string
+     */
+    public static function findAliasForStyleName($styleName)
+    {
+        return !empty(self::$nameToIdentifierMapping[$styleName])
+            ? self::$nameToIdentifierMapping[$styleName]
+            : $styleName;
+    }
+
     /**
      * Count styles.
      *
@@ -144,6 +197,7 @@ public static function countStyles()
     public static function resetStyles(): void
     {
         self::$styles = [];
+        self::$nameToIdentifierMapping = [];
     }
 
     /**