Skip to content

Commit

Permalink
Handle alternate Unicode name representation cXXX and fix #943 (#944)
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd authored Nov 24, 2024
1 parent bcc8cce commit 2080424
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
7 changes: 7 additions & 0 deletions src/UglyToad.PdfPig.Fonts/GlyphList.cs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,13 @@ public string NameToUnicode(string name)

unicode = char.ConvertFromUtf32(codePoint);
}
else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
{
// name representation cXXX
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture);
System.Diagnostics.Debug.Assert(codePoint > 0);
unicode = char.ConvertFromUtf32(codePoint);
}
else
{
return null;
Expand Down
23 changes: 23 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,32 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Content;
using DocumentLayoutAnalysis.PageSegmenter;
using DocumentLayoutAnalysis.WordExtractor;

public class GithubIssuesTests
{
[Fact]
public void Issue943()
{
    // Regression test for GitHub issue 943: opens the sample document, runs word
    // extraction and page segmentation on page 1, then checks the text of the
    // first two lines of the first block.
    var pdfPath = IntegrationHelpers.GetDocumentPath("MOZILLA-10225-0.pdf");

    using (var pdf = PdfDocument.Open(pdfPath))
    {
        var firstPage = pdf.GetPage(1);
        Assert.NotNull(firstPage);
        Assert.NotNull(firstPage.Letters);

        // Letters -> words -> blocks, using the shared extractor/segmenter instances.
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords);

        // Only the first block is inspected; its first two lines must match exactly.
        var leadingBlock = textBlocks[0];
        Assert.Equal("Rocket and Spacecraft Propulsion", leadingBlock.TextLines[0].Text);
        Assert.Equal("Principles, Practice and New Developments (Second Edition)", leadingBlock.TextLines[1].Text);
    }
}

[Fact]
public void Issue736()
{
Expand Down

0 comments on commit 2080424

Please sign in to comment.