From b992031fa2c0acc7924cd2442d9f77da1cfdf4a3 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Fri, 14 Jan 2022 16:16:38 +0800 Subject: [PATCH 1/2] add defusedxml support --- docs/api_elan.rst | 8 +++++++- docs/updates.rst | 10 +++++++++- speach/elan.py | 15 +++++++++++---- test/test_elan.py | 4 ++++ 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/docs/api_elan.rst b/docs/api_elan.rst index fb7bb92..252464b 100644 --- a/docs/api_elan.rst +++ b/docs/api_elan.rst @@ -5,6 +5,12 @@ ELAN module ``speach`` supports reading and manipulating multi-tier transcriptions from ELAN directly. +.. note:: + + For better security, ``speach`` will use the package ``defusedxml`` automatically if available to parse XML streams (instead of Python's default parser). + When ``defusedxml`` is available, the flag ``speach.elan.SAFE_MODE`` will be set to True. + + For common code samples to processing ELAN, see :ref:`recipe_elan` page. .. contents:: Table of Contents @@ -55,4 +61,4 @@ or an annotation sequence in the case of symbolic subdivision tiers. .. 
autoclass:: TimeSlot :members: - :member-order: groupwise \ No newline at end of file + :member-order: groupwise diff --git a/docs/updates.rst b/docs/updates.rst index fdd7de0..0ca181c 100644 --- a/docs/updates.rst +++ b/docs/updates.rst @@ -3,7 +3,15 @@ Speach Changelog ================ -Speach 01.a12 +Speach 0.1a13 +------------- + +- 2022-01-14 + + - Use ``defusedxml`` automatically when available to parse XML for better security + +Speach 0.1a12 +------------- - 2021-11-03 diff --git a/speach/elan.py b/speach/elan.py index 3f574e4..51b195e 100644 --- a/speach/elan.py +++ b/speach/elan.py @@ -15,6 +15,13 @@ from collections import defaultdict as dd from typing import List, Tuple import xml.etree.ElementTree as etree +try: + import defusedxml.ElementTree as best_parser + SAFE_MODE = True +except ModuleNotFoundError as e: + best_parser = etree + SAFE_MODE = False + import warnings from chirptext import DataObject @@ -848,7 +855,7 @@ def parse_stream(cls, ecv_stream, *args, **kwargs): :param ecv_stream: ECV text input stream :rtype: speach.elan.ExternalControlledVocabResource """ - _root = etree.fromstring(ecv_stream.read()) + _root = best_parser.fromstring(ecv_stream.read()) ecv = ExternalControlledVocabResource(xml_node=_root, **kwargs) return ecv @@ -1176,8 +1183,8 @@ def to_xml_bin(self, encoding='utf-8', default_namespace=None, short_empty_eleme :returns: EAF content :rtype: bytes """ - _content = etree.tostring(self.__xml_root, encoding=encoding, method="xml", - short_empty_elements=short_empty_elements, *args, **kwargs) + _content = best_parser.tostring(self.__xml_root, encoding=encoding, method="xml", + short_empty_elements=short_empty_elements, *args, **kwargs) return _content def to_xml_str(self, encoding='utf-8', *args, **kwargs): @@ -1248,7 +1255,7 @@ def parse_eaf_stream(cls, eaf_stream, *args, **kwargs): :param eaf_stream: EAF text input stream :rtype: speach.elan.Doc """ - _root = etree.fromstring(eaf_stream.read()) + _root = 
best_parser.fromstring(eaf_stream.read()) _doc = Doc() _doc.__xml_root = _root _doc._update_info_xml(_root) diff --git a/test/test_elan.py b/test/test_elan.py index 37bb14a..f333eba 100644 --- a/test/test_elan.py +++ b/test/test_elan.py @@ -39,6 +39,10 @@ def read_eaf(): class TestELAN(unittest.TestCase): + def test_safe_mode(self): + print(f"ELAN safe mode: {elan.SAFE_MODE}") + print(f"ELAN XML parser: {elan.best_parser.__name__}") + def test_read_elan(self): eaf = read_eaf() expected_tiernames = ['Person1 (Utterance)', 'marker', 'Person1 (Chunk)', 'Person1 (ChunkLanguage)', From 98d00d3ebaa8754997a6c09543927ab4e4aca48e Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Fri, 14 Jan 2022 16:16:51 +0800 Subject: [PATCH 2/2] bump version to 0.1a13 --- speach/__version__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speach/__version__.py b/speach/__version__.py index a85a1fa..503c86a 100644 --- a/speach/__version__.py +++ b/speach/__version__.py @@ -14,6 +14,6 @@ __issue__ = "https://github.com/neocl/speach/issues/" __maintainer__ = "Le Tuan Anh" __version_major__ = "0.1" # follow PEP-0440 -__version__ = "{}a12".format(__version_major__) -__version_long__ = "{} - Alpha 12".format(__version_major__) +__version__ = "{}a13".format(__version_major__) +__version_long__ = "{} - Alpha 13".format(__version_major__) __status__ = "3 - Alpha"