Skip to content

Commit

Permalink
make .epub files reproducible for testing by using `EBOOK_CREATION_TI…
Browse files Browse the repository at this point in the history
…ME` env to fake time
  • Loading branch information
ilius committed Jan 8, 2025
1 parent ac4df1d commit 7a07f8c
Show file tree
Hide file tree
Showing 4 changed files with 285 additions and 1 deletion.
3 changes: 2 additions & 1 deletion pyglossary/ebook_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from typing import TYPE_CHECKING, cast

from pyglossary.os_utils import indir, rmtree
from pyglossary.repro_zipfile.repro_zipfile import ReproducibleZipFile

if TYPE_CHECKING:
import io
Expand Down Expand Up @@ -430,7 +431,7 @@ def open(self, filename: str) -> None:
self._filename = filename

def _doZip(self) -> None:
with zipfile.ZipFile(
with ReproducibleZipFile(
self._filename,
mode="w",
compression=zipfile.ZIP_DEFLATED,
Expand Down
81 changes: 81 additions & 0 deletions pyglossary/repro_zipfile/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
Unless otherwise indicated, this software is copyright of DrivenData and
licensed under the MIT License. Some portions of this software are copied and
modified from Python 3.11, which is copyright of the Python Software Foundation
and licensed under the Python Software Foundation License Version 2.

==============================================================================

MIT License

Copyright (c) 2023 DrivenData Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

==============================================================================

PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2

Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
All Rights Reserved

1. This LICENSE AGREEMENT is between the Python Software Foundation
("PSF"), and the Individual or Organization ("Licensee") accessing and
otherwise using this software ("Python") in source or binary form and
its associated documentation.

2. Subject to the terms and conditions of this License Agreement, PSF hereby
grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
analyze, test, perform and/or display publicly, prepare derivative works,
distribute, and otherwise use Python alone or in any derivative version,
provided, however, that PSF's License Agreement and PSF's notice of copyright,
i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
All Rights Reserved" are retained in Python alone or in any derivative version
prepared by Licensee.

3. In the event Licensee prepares a derivative work that is based on
or incorporates Python or any part thereof, and wants to make
the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to Python.

4. PSF is making Python available to Licensee on an "AS IS"
basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.

5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.

6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.

7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between PSF and
Licensee. This License Agreement does not grant permission to use PSF
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.

8. By copying, installing or otherwise using Python, Licensee
agrees to be bound by the terms and conditions of this License
Agreement.
201 changes: 201 additions & 0 deletions pyglossary/repro_zipfile/repro_zipfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
from copy import copy
import os
import shutil
import sys
import time
from typing import Tuple, Union
from zipfile import ZIP_LZMA, ZipFile, ZipInfo

try:
from zipfile import _MASK_COMPRESS_OPTION_1 # type: ignore[attr-defined]
except ImportError:
_MASK_COMPRESS_OPTION_1 = 0x02

__version__ = "0.3.1"


def date_time() -> Union[time.struct_time, Tuple[int, int, int, int, int, int]]:
    """Return the date_time value forced onto every ZipInfo object.

    Defaults to 1980-01-01 00:00:00. Set the environment variable
    SOURCE_DATE_EPOCH to an integer number of seconds since the Unix epoch to
    choose a different (UTC) timestamp.

    The result is always a 6-tuple ``(year, month, day, hour, minute, second)``.
    The ZIP format stores dates in MS-DOS form, which can only represent years
    1980 through 2107, so values outside that range are clamped to the nearest
    representable timestamp instead of failing later when the header is packed.

    Raises:
        ValueError: if SOURCE_DATE_EPOCH is set but is not an integer.
    """
    source_date_epoch = os.environ.get("SOURCE_DATE_EPOCH", None)
    if source_date_epoch is None:
        return (1980, 1, 1, 0, 0, 0)

    # Keep only the six fields ZipInfo.date_time uses, so both code paths
    # return the same shape.
    stamp = tuple(time.gmtime(int(source_date_epoch))[:6])
    if stamp[0] < 1980:
        # e.g. SOURCE_DATE_EPOCH=0 (1970) is not encodable as a DOS date.
        return (1980, 1, 1, 0, 0, 0)
    if stamp[0] > 2107:
        return (2107, 12, 31, 23, 59, 59)
    return stamp


def file_mode() -> int:
    """Return the permission bits forced onto every regular-file ZipInfo.

    The value comes from the environment variable REPRO_ZIPFILE_FILE_MODE,
    parsed as an octal string (e.g. '644'). When the variable is unset the
    result is 0o644 (rw-r--r--).
    """
    raw_mode = os.environ.get("REPRO_ZIPFILE_FILE_MODE")
    if raw_mode is None:
        return 0o644
    return int(raw_mode, 8)


def dir_mode() -> int:
    """Return the permission bits forced onto every directory ZipInfo.

    The value comes from the environment variable REPRO_ZIPFILE_DIR_MODE,
    parsed as an octal string (e.g. '755'). When the variable is unset the
    result is 0o755 (rwxr-xr-x).
    """
    raw_mode = os.environ.get("REPRO_ZIPFILE_DIR_MODE")
    if raw_mode is None:
        return 0o755
    return int(raw_mode, 8)


class ReproducibleZipFile(ZipFile):
    """Open a ZIP file, where file can be a path to a file (a string), a file-like object or a
    path-like object.

    This is a replacement for the Python standard library zipfile.ZipFile that overwrites
    file-modified timestamps and file/directory permissions modes in write mode in order to create
    a reproducible ZIP archive. Other than overwriting these values, it works the same way as
    zipfile.ZipFile. For documentation on use, see the Python documentation for zipfile:
    https://docs.python.org/3/library/zipfile.html
    """

    @staticmethod
    def _reproducible_copy(zinfo):
        """Return a copy of ``zinfo`` with date_time and external_attr overwritten.

        The caller's ZipInfo is never mutated. The timestamp comes from
        ``date_time()``; the permission bits come from ``dir_mode()`` for
        directory entries and from ``file_mode()`` for everything else.
        """
        zinfo = copy(zinfo)
        zinfo.date_time = date_time()
        if zinfo.is_dir():
            zinfo.external_attr = (0o40000 | dir_mode()) << 16
            zinfo.external_attr |= 0x10  # MS-DOS directory flag
        else:
            zinfo.external_attr = file_mode() << 16
        return zinfo

    # Following method modified from Python 3.11
    # https://github.com/python/cpython/blob/202efe1a3bcd499f3bf17bd953c6d36d47747e78/Lib/zipfile.py#L1763-L1794
    # Copyright Python Software Foundation, licensed under PSF License Version 2
    # See LICENSE file for full license agreement and notice of copyright
    def write(self, filename, arcname=None, compress_type=None, compresslevel=None):
        """Put the bytes from filename into the archive under the name arcname.

        The entry's timestamp and permission bits are replaced with the
        reproducible values before anything is written. ``compress_type`` and
        ``compresslevel`` override the archive-wide defaults when given.

        Raises:
            ValueError: if the archive is closed or another write handle is open.
        """
        if not self.fp:
            raise ValueError("Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError("Can't write to ZIP archive while an open writing handle exists")

        zinfo = ZipInfo.from_file(filename, arcname, strict_timestamps=self._strict_timestamps)

        ## repro-zipfile ADDED ##
        # Overwrite date_time and external_attr (permissions mode)
        zinfo = self._reproducible_copy(zinfo)
        #########################

        if zinfo.is_dir():
            zinfo.compress_size = 0
            zinfo.CRC = 0
            self.mkdir(zinfo)
        else:
            if compress_type is not None:
                zinfo.compress_type = compress_type
            else:
                zinfo.compress_type = self.compression

            if compresslevel is not None:
                zinfo._compresslevel = compresslevel
            else:
                zinfo._compresslevel = self.compresslevel

            with open(filename, "rb") as src, self.open(zinfo, "w") as dest:
                shutil.copyfileobj(src, dest, 1024 * 8)

    # Following method modified from Python 3.11
    # https://github.com/python/cpython/blob/202efe1a3bcd499f3bf17bd953c6d36d47747e78/Lib/zipfile.py#L1796-L1835
    # Copyright Python Software Foundation, licensed under PSF License Version 2
    # See LICENSE file for full license agreement and notice of copyright
    def writestr(self, zinfo_or_arcname, data, compress_type=None, compresslevel=None):
        """Write a file into the archive. The contents is 'data', which may be either a 'str' or
        a 'bytes' instance; if it is a 'str', it is encoded as UTF-8 first. 'zinfo_or_arcname' is
        either a ZipInfo instance or the name of the file in the archive.

        Whatever timestamp and permission bits the entry starts with are
        replaced with the reproducible values before it is written.

        Raises:
            ValueError: if the archive is closed or another write handle is open.
        """
        if isinstance(data, str):
            data = data.encode("utf-8")
        if not isinstance(zinfo_or_arcname, ZipInfo):
            zinfo = ZipInfo(filename=zinfo_or_arcname, date_time=time.localtime(time.time())[:6])
            zinfo.compress_type = self.compression
            zinfo._compresslevel = self.compresslevel
            if zinfo.filename.endswith("/"):
                zinfo.external_attr = 0o40775 << 16  # drwxrwxr-x
                zinfo.external_attr |= 0x10  # MS-DOS directory flag
            else:
                zinfo.external_attr = 0o600 << 16  # ?rw-------
        else:
            zinfo = zinfo_or_arcname

        ## repro-zipfile ADDED ##
        # Overwrite date_time and external_attr (permissions mode)
        zinfo = self._reproducible_copy(zinfo)
        #########################

        if not self.fp:
            raise ValueError("Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError("Can't write to ZIP archive while an open writing handle exists.")

        if compress_type is not None:
            zinfo.compress_type = compress_type

        if compresslevel is not None:
            zinfo._compresslevel = compresslevel

        zinfo.file_size = len(data)  # Uncompressed size
        with self._lock:
            with self.open(zinfo, mode="w") as dest:
                dest.write(data)

    # ZipFile.mkdir only exists in the standard library from Python 3.11 on,
    # so a backport is defined for older interpreters; write() relies on it.
    if sys.version_info < (3, 11):
        # Following method modified from Python 3.11
        # https://github.com/python/cpython/blob/202efe1a3bcd499f3bf17bd953c6d36d47747e78/Lib/zipfile.py#L1837-L1870
        # Copyright Python Software Foundation, licensed under PSF License Version 2
        # See LICENSE file for full license agreement and notice of copyright
        def mkdir(self, zinfo_or_directory_name, mode=511):
            """Creates a directory inside the zip archive.

            'zinfo_or_directory_name' is either a ZipInfo describing a
            directory or a directory name (a trailing "/" is appended when
            missing). 'mode' is only used to build the initial entry for a
            name; the stored permission bits are then replaced with the
            reproducible directory mode.

            Raises:
                ValueError: if a ZipInfo is given that is not a directory.
                TypeError: if the argument is neither a str nor a ZipInfo.
            """
            if isinstance(zinfo_or_directory_name, ZipInfo):
                zinfo = zinfo_or_directory_name
                if not zinfo.is_dir():
                    raise ValueError("The given ZipInfo does not describe a directory")
            elif isinstance(zinfo_or_directory_name, str):
                directory_name = zinfo_or_directory_name
                if not directory_name.endswith("/"):
                    directory_name += "/"
                zinfo = ZipInfo(directory_name)
                zinfo.compress_size = 0
                zinfo.CRC = 0
                zinfo.external_attr = ((0o40000 | mode) & 0xFFFF) << 16
                zinfo.file_size = 0
                zinfo.external_attr |= 0x10
            else:
                raise TypeError("Expected type str or ZipInfo")

            ## repro-zipfile ADDED ##
            # Overwrite date_time and external_attr (permissions mode).
            # zinfo is always a directory entry here, so the helper applies
            # the directory mode and the MS-DOS directory flag.
            zinfo = self._reproducible_copy(zinfo)
            #########################

            with self._lock:
                if self._seekable:
                    self.fp.seek(self.start_dir)
                zinfo.header_offset = self.fp.tell()  # Start of header bytes
                if zinfo.compress_type == ZIP_LZMA:
                    # Compressed data includes an end-of-stream (EOS) marker
                    zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1

                self._writecheck(zinfo)
                self._didModify = True

                self.filelist.append(zinfo)
                self.NameToInfo[zinfo.filename] = zinfo
                self.fp.write(zinfo.FileHeader(False))
                self.start_dir = self.fp.tell()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ target-version = "py310"

# Exclude a variety of commonly ignored directories.
exclude = [
"pyglossary/repro_zipfile/repro_zipfile.py",
"whitelist.py", # for vulture
"pyglossary/plugins/babylon_bgl/bgl_gzip.py",
"pyglossary/plugins/testformat",
Expand Down

0 comments on commit 7a07f8c

Please sign in to comment.