Skip to content

Commit

Permalink
Fixed problem with UTF-8 BOM
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinboone committed Apr 13, 2022
1 parent 973ffba commit 02f69e6
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
build
epub2txt
*.o
*.epub
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION := 2.03
VERSION := 2.04
CC := gcc
CFLAGS := -Wall -fPIC -fPIE
LDFLAGS := -pie -s
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# epub2txt -- Extract text from EPUB documents

Version 2.03, January 2022
Version 2.04, April 2022

## What is this?

Expand Down Expand Up @@ -232,6 +232,7 @@ even approximately, in ASCII.

Date | Change
-----|-------
2.04, Apr 2022 | Improved handling of UTF-8 BOMs
2.03, Jan 2022 | Fixed a buffer overrun bug
2.02, May 2020 | Updated XML parser
2.01, January 2019 | Various bug fixes
Expand Down
4 changes: 2 additions & 2 deletions man1/epub2txt.1
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
.\" Copyright (C) 2013-19 Kevin Boone
.\" Copyright (C) 2013-22 Kevin Boone
.\" Permission is granted to any individual or institution to use, copy, or
.\" redistribute this software so long as all of the original files are
.\" included, and that this copyright notice is retained.
.\"
.TH epub2txt 1 "May 2020"
.TH epub2txt 1 "April 2022"
.SH NAME
epub2txt \- Extract text from EPUB documents
.SH SYNOPSIS
Expand Down
2 changes: 1 addition & 1 deletion src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ int main (int argc, char **argv)
if (show_version)
{
printf (APPNAME " version " VERSION "\n");
printf ("Copyright (c)2013-2020 Kevin Boone and contributors\n");
printf ("Copyright (c)2013-2022 Kevin Boone and contributors\n");
printf ("Distributed under the terms of the GNU Public Licence, v3.0\n");
exit (0);
}
Expand Down
16 changes: 14 additions & 2 deletions src/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,21 @@ BOOL string_create_from_utf8_file (const char *filename,
fstat (f, &sb);
int64_t size = sb.st_size;
char *buff = malloc (size + 2);
read (f, buff, size);
self->str = buff;
self->str[size] = 0;

// Read the first three bytes, to check for a UTF-8 byte-order mark (EF BB BF)
read (f, buff, 3);
if (buff[0] == (char)0xEF && buff[1] == (char)0xBB && buff[2] == (char)0xBF)
{
read (f, buff, size - 3);
self->str[size - 3] = 0;
}
else
{
read (f, buff + 3, size - 3);
self->str[size] = 0;
}

*result = self;
ok = TRUE;
}
Expand Down
7 changes: 6 additions & 1 deletion src/wstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ BOOL wstring_create_from_utf8_file (const char *filename,
close (f);
buff[n] = 0;

self->str = wstring_convert_utf8_to_utf32 (buff);
// Skip a leading UTF-8 BOM (EF BB BF), if present, before converting
if (buff[0] == (char)0xEF && buff[1] == (char)0xBB && buff[2] == (char)0xBF)
self->str = wstring_convert_utf8_to_utf32 (buff + 3);
else
self->str = wstring_convert_utf8_to_utf32 (buff);

free (buff);

*result = self;
Expand Down

0 comments on commit 02f69e6

Please sign in to comment.