Skip to content

Commit

Permalink
[BUGFIX] Respect language based style names on reading Word files
Browse files Browse the repository at this point in the history
Microsoft Office saves Office documents with language-based style
mappings for default styles. For example, if a German-language Word
version is used, it writes the following to `word/styles.xml` in
the container archive (*.docx):

```
<w:style w:type="paragraph" w:styleId="berschrift1">
  <w:name w:val="heading 1"/>
  ....
  </w:style>
```

whereas for an English-language version it would be:

```
<w:style w:type="paragraph" w:styleId="Heading1">
  <w:name w:val="heading 1"/>
  ...
</w:style>
```

The value of `<w:name />` defines the internal native code
identifier, whereas the `w:styleId` attribute on the outer
`<w:style />` tag would describe the virtual or alias name.

Later parsing of the document structure, for example the
paragraphs, references the alias (`w:styleId`) name of a
style. The reader code uses hardcoded RegEx matching in
a case-insensitive manner, but applies the English
variant (`Heading\s*(\d)`) to the language-based alias, which
would not match at all.

Therefore, multiple tasks need to be done and contained
in this change:

* An alias map is implemented and used to register title
  aliases. Along with this, a corresponding lookup method is
  added.
* Use the lookup method to resolve the alias wherever the
  hardcoded English RegEx needs to be applied.
* Gather all style alias names while reading the
  Word file's style settings, for all possible styles.
  • Loading branch information
sbuerk committed Apr 2, 2024
1 parent 8b891bb commit 3bd4d9a
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 11 deletions.
15 changes: 11 additions & 4 deletions src/PhpWord/Reader/Word2007/AbstractPart.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
use PhpOffice\PhpWord\Element\TrackChange;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Shared\XMLReader;
use PhpOffice\PhpWord\Style;

/**
* Abstract part reader.
Expand Down Expand Up @@ -290,14 +291,20 @@ protected function readParagraph(XMLReader $xmlReader, DOMElement $domNode, $par
private function getHeadingDepth(?array $paragraphStyle = null)
{
if (is_array($paragraphStyle) && isset($paragraphStyle['styleName'])) {
if ('Title' === $paragraphStyle['styleName']) {
// Title styles have special handling in the styles.xml loading and registration, therefore we need to
// use the alias for it here to properly check for the correct styling.
/** @see Style::addTitleStyle() */
/** @see Styles::read() */
$checkStyleName = Style::findAliasForStyleName($paragraphStyle['styleName']);
// Title does not have a depth, early return.
if ('Title' === $checkStyleName) {
return 0;
}

$headingMatches = [];
preg_match('/Heading(\d)/', $paragraphStyle['styleName'], $headingMatches);
// We need to support here multiple variants: 'Heading 1' , 'Heading_1', 'Heading1'
preg_match('/Heading([_\s]*)(\d)/', $checkStyleName, $headingMatches);
if (!empty($headingMatches)) {
return $headingMatches[1];
return $headingMatches[2];
}
}

Expand Down
29 changes: 22 additions & 7 deletions src/PhpWord/Reader/Word2007/Styles.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Shared\XMLReader;
use PhpOffice\PhpWord\Style;
use PhpOffice\PhpWord\Style\Language;

/**
Expand Down Expand Up @@ -65,8 +66,17 @@ public function read(PhpWord $phpWord): void
foreach ($nodes as $node) {
$type = $xmlReader->getAttribute('w:type', $node);
$name = $xmlReader->getAttribute('w:val', $node, 'w:name');
if (null === $name) {
$name = $xmlReader->getAttribute('w:styleId', $node);
$alias = $xmlReader->getAttribute('w:styleId', $node);
if (null === $name && null === $alias) {
// no name or alias, skip it as matching would not be possible otherwise.
continue;
}
if (null === $name && null !== $alias) {
// fully custom style, use alias as name.
$name = $alias;
}
if (null !== $name && null === $alias) {
$alias = $name;
}
$headingMatches = [];
preg_match('/Heading\s*(\d)/i', $name, $headingMatches);
Expand All @@ -76,29 +86,34 @@ public function read(PhpWord $phpWord): void
$paragraphStyle = $this->readParagraphStyle($xmlReader, $node);
$fontStyle = $this->readFontStyle($xmlReader, $node);
if (!empty($headingMatches)) {
$phpWord->addTitleStyle($headingMatches[1], $fontStyle, $paragraphStyle);
$titleStyleName = $phpWord->addTitleStyle($headingMatches[1], $fontStyle, $paragraphStyle)->getStyleName();
Style::addStyleNameAlias($alias, $titleStyleName);
} else {
if (empty($fontStyle)) {
if (is_array($paragraphStyle)) {
$phpWord->addParagraphStyle($name, $paragraphStyle);
$paragraphStyleName = $phpWord->addParagraphStyle($name, $paragraphStyle)->getStyleName();
Style::addStyleNameAlias($alias, $paragraphStyleName);
}
} else {
$phpWord->addFontStyle($name, $fontStyle, $paragraphStyle);
$fontStyleName = $phpWord->addFontStyle($name, $fontStyle, $paragraphStyle)->getStyleName();
Style::addStyleNameAlias($alias, $fontStyleName);
}
}

break;
case 'character':
$fontStyle = $this->readFontStyle($xmlReader, $node);
if (!empty($fontStyle)) {
$phpWord->addFontStyle($name, $fontStyle);
$fontStyleName = $phpWord->addFontStyle($name, $fontStyle)->getStyleName();
Style::addStyleNameAlias($alias, $fontStyleName);
}

break;
case 'table':
$tStyle = $this->readTableStyle($xmlReader, $node);
if (!empty($tStyle)) {
$phpWord->addTableStyle($name, $tStyle);
$tableStyleName = $phpWord->addTableStyle($name, $tStyle)->getStyleName();
Style::addStyleNameAlias($alias, $tableStyleName);
}

break;
Expand Down
54 changes: 54 additions & 0 deletions src/PhpWord/Style.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ class Style
*/
private static $styles = [];

/**
* Mapping style name to internal code identifier.
*
* @var array<string, string>
*/
private static $nameToIdentifierMapping = [];

/**
* Add paragraph style.
*
Expand Down Expand Up @@ -124,6 +131,52 @@ public static function addTableStyle($styleName, $styleTable, $styleFirstRow = n
return self::setStyleValues($styleName, new Table($styleTable, $styleFirstRow), null);
}

/**
* Register an alias for a style name in the alias map.
*
* Stores $styleName under the key $alias. Registering the same alias a second time overwrites the
* previously stored style name. The Word2007 styles reader passes the `w:styleId` attribute as $alias
* and the name of the style it just registered as $styleName.
*
* @param string $alias Key under which the style name is stored
* @param string $styleName Style name stored for that key
*
* @see self::findAliasForStyleName()
* @see self::findStyleNameForAlias()
*/
public static function addStyleNameAlias($alias, $styleName): void
{
self::$nameToIdentifierMapping[$alias] = $styleName;
}

/**
 * Find the style name for a given $alias.
 *
 * Resolution order:
 *  1. If $alias is itself a registered style name (a value of the alias map), it is returned unchanged.
 *  2. Otherwise, if $alias is a registered alias (a key of the alias map), the mapped style name is returned.
 *  3. Otherwise an empty string is returned.
 *
 * @param string $alias
 *
 * @return string
 */
public static function findStyleNameForAlias($alias)
{
    // Strict comparison so that e.g. the alias '1' never matches an unrelated non-string value.
    if (in_array($alias, self::$nameToIdentifierMapping, true)) {
        return $alias;
    }

    // Do not iterate with `foreach (... as $alias => ...)` here: that would overwrite the parameter
    // with every map key and make the lookup independent of the caller's input.
    return self::$nameToIdentifierMapping[$alias] ?? '';
}

/**
 * Resolve $styleName through the alias map, falling back to $styleName itself.
 *
 * The argument is used as a key of the map filled by addStyleNameAlias(). When an entry exists and its
 * value is not considered empty by PHP's empty(), that value is returned; in every other case the
 * argument is handed back untouched.
 *
 * @param string $styleName
 *
 * @return string
 */
public static function findAliasForStyleName($styleName)
{
    $mapped = self::$nameToIdentifierMapping[$styleName] ?? null;

    // A missing entry and an "empty" entry are treated alike: keep the caller's value.
    if (empty($mapped)) {
        return $styleName;
    }

    return $mapped;
}

/**
* Count styles.
*
Expand All @@ -144,6 +197,7 @@ public static function countStyles()
public static function resetStyles(): void
{
    // Drop every registered alias together with the styles, so no alias outlives the style it maps to.
    self::$nameToIdentifierMapping = [];
    // Forget all registered styles.
    self::$styles = [];
}

/**
Expand Down

0 comments on commit 3bd4d9a

Please sign in to comment.