Skip to content

Commit

Permalink
Merge pull request #31 from openscilab/feature/pptx
Browse files Browse the repository at this point in the history
add support for .pptx, .xlsx format
  • Loading branch information
sadrasabouri authored Jul 22, 2024
2 parents 4bf23ab + 7328b69 commit 050cb99
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 74 deletions.
19 changes: 14 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,27 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
### Added
- `pptx` and `xlsx` support
- `get_microsoft_format` function in `util.py`
### Changed
- `extract` function in `util.py`
- `remove_format` function in `util.py`
- `clear` function in `functions.py`
- `clear_all` function in `functions.py`
- `update` function in `functions.py`
- `update_all` function in `functions.py`
- `extract_namespaces` function in `util.py`
## [0.1] - 2024-06-19
### Added
- `CLI` handler
- `main` function in `__main__.py`
- `README.md`
- `clear` function `functions.py`
- `clear` function in `functions.py`
- `clear_all` function in `functions.py`
- `update` function `functions.py`
- `update_all` function `functions.py`
- `run_dmeta` function `functions.py`
- `dmeta_help` function `functions.py`
- `update` function in `functions.py`
- `update_all` function in `functions.py`
- `run_dmeta` function in `functions.py`
- `dmeta_help` function in `functions.py`
- `extract_namespaces` function in `util.py`
- `remove_format` function in `util.py`
- `extract_docx` function in `util.py`
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ dmeta --update-all --config "./config.json"
## Supported files
| File format | support |
| ---------------- | ---------------- |
| Microsoft word office(.docx) | ✅ |
| Microsoft Word (.docx) | ✅ |
| Microsoft PowerPoint (.pptx) | ✅ |
| Microsoft Excel (.xlsx) | ✅ |


## Issues & bug reports

Expand Down
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
art==6.2
defusedxml==0.7.1
lxml>=5.2.2
setuptools>=40.8.0
vulture>=1.0
bandit>=1.5.1
Expand Down
97 changes: 49 additions & 48 deletions dmeta/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,75 @@
import shutil
import zipfile
from art import tprint
from .util import remove_format, extract_docx, read_json
import defusedxml.ElementTree as ET
from .util import get_microsoft_format, extract, read_json
import defusedxml.lxml as lxml


from .params import CORE_XML_MAP, APP_XML_MAP, OVERVIEW, DMETA_VERSION


def clear(docx_file_name):
def clear(microsoft_file_name):
"""
Clear all the editable metadata in the given .docx file.
Clear all the editable metadata in the given microsoft file.
:param docx_file_name: name of .docx file
:type docx_file_name: str
:param microsoft_file_name: name of microsoft file
:type microsoft_file_name: str
:return: None
"""
docx_file_name = remove_format(docx_file_name)
unzipped_dir, source_file = extract_docx(docx_file_name)
microsoft_format = get_microsoft_format(microsoft_file_name)
unzipped_dir, source_file = extract(microsoft_file_name)
doc_props_dir = os.path.join(unzipped_dir, "docProps")
core_xml_path = os.path.join(doc_props_dir, "core.xml")
app_xml_path = os.path.join(doc_props_dir, "app.xml")

if os.path.exists(core_xml_path):
e_core = ET.parse(core_xml_path)
e_core = lxml.parse(core_xml_path)
for xml_element in e_core.iter():
for personal_field in CORE_XML_MAP.keys():
associated_xml_tag = CORE_XML_MAP[personal_field]
if (associated_xml_tag in xml_element.tag):
for personal_field in CORE_XML_MAP.values():
if (personal_field in xml_element.tag):
xml_element.text = ""
e_core.write(core_xml_path, "utf-8", True, None, "xml")
e_core.write(core_xml_path)

if os.path.exists(app_xml_path):
e_app = ET.parse(app_xml_path)
e_app = lxml.parse(app_xml_path)
for xml_element in e_app.iter():
for personal_field in APP_XML_MAP.keys():
associated_xml_tag = APP_XML_MAP[personal_field]
if (associated_xml_tag in xml_element.tag):
for personal_field in APP_XML_MAP.values():
if (personal_field in xml_element.tag):
xml_element.text = ""
e_app.write(app_xml_path, "utf-8", True, None, "xml")
e_app.write(app_xml_path)

modified_docx = docx_file_name + "_cleared"
with zipfile.ZipFile(modified_docx + ".docx", "w") as docx:
modified = microsoft_file_name[:microsoft_file_name.rfind('.')] + "_cleared"
with zipfile.ZipFile(modified + "." + microsoft_format, "w") as file:
for file_name in source_file.namelist():
docx.write(os.path.join(unzipped_dir, file_name), file_name)
docx.close()
file.write(os.path.join(unzipped_dir, file_name), file_name)
file.close()
shutil.rmtree(unzipped_dir)


def clear_all():
"""
Clear all the editable metadata in any .docx file in the current directory.
Clear all the editable metadata in every supported Microsoft file (.docx, .pptx, .xlsx) in the current directory.
:return: None
"""
path = os.getcwd()
dir_list = os.listdir(path)
docx_files = []
microsoft_files = []
for item in dir_list:
if ".docx" in item:
docx_files.append(item)
for docx_file in docx_files:
clear(docx_file)
if get_microsoft_format(item) is not None:
microsoft_files.append(item)
for microsoft_file in microsoft_files:
clear(microsoft_file)


def update(config_file_name, docx_file_name):
def update(config_file_name, microsoft_file_name):
"""
Update all the editable metadata in the given .docx file according to the given config file.
Update all the editable metadata in the given microsoft file according to the given config file.
:param config_file_name: name of .json config file
:type config_file_name: str
:param docx_file_name: name of .docx file
:type docx_file_name: str
:param microsoft_file_name: name of microsoft file
:type microsoft_file_name: str
:return: None
"""
config = read_json(config_file_name)
Expand All @@ -86,55 +86,56 @@ def update(config_file_name, docx_file_name):
print("There isn't any chosen personal field to remove")
return

docx_file_name = remove_format(docx_file_name)
unzipped_dir, source_file = extract_docx(docx_file_name)
microsoft_format = get_microsoft_format(microsoft_file_name)
unzipped_dir, source_file = extract(microsoft_file_name)
doc_props_dir = os.path.join(unzipped_dir, "docProps")
core_xml_path = os.path.join(doc_props_dir, "core.xml")
app_xml_path = os.path.join(doc_props_dir, "app.xml")

if has_core_tags:
if os.path.exists(core_xml_path):
e_core = ET.parse(core_xml_path)
e_core = lxml.parse(core_xml_path)
for xml_element in e_core.iter():
for personal_field in personal_fields_core_xml:
associated_xml_tag = CORE_XML_MAP[personal_field]
if (associated_xml_tag in xml_element.tag):
xml_element.text = config[personal_field]
e_core.write(core_xml_path, "utf-8", True, None, "xml")
e_core.write(core_xml_path)

if has_app_tags:
if os.path.exists(app_xml_path):
e_app = ET.parse(app_xml_path)
e_app = lxml.parse(app_xml_path)
for xml_element in e_app.iter():
for personal_field in personal_fields_app_xml:
associated_xml_tag = APP_XML_MAP[personal_field]
if (associated_xml_tag in xml_element.tag):
xml_element.text = config[personal_field]
e_app.write(app_xml_path, "utf-8", True, None, "xml")
e_app.write(app_xml_path)

modified_docx = docx_file_name + "_updated"
with zipfile.ZipFile(modified_docx + ".docx", "w") as docx:
for filename in source_file.namelist():
docx.write(os.path.join(unzipped_dir, filename), filename)
modified = microsoft_file_name[:microsoft_file_name.rfind('.')] + "_updated"
with zipfile.ZipFile(modified + "." + microsoft_format, "w") as file:
for file_name in source_file.namelist():
file.write(os.path.join(unzipped_dir, file_name), file_name)
file.close()
shutil.rmtree(unzipped_dir)


def update_all(config_file_name):
"""
Update all the editable metadata in any .docx file in the current directory according to the given config file.
Update all the editable metadata in every supported Microsoft file (.docx, .pptx, .xlsx) in the current directory according to the given config file.
:param config_file_name: name of .json config file
:type config_file_name: str
:return: None
"""
path = os.getcwd()
dir_list = os.listdir(path)
docx_files = []
microsoft_files = []
for item in dir_list:
if ".docx" in item:
docx_files.append(item)
for docx_file in docx_files:
update(config_file_name, docx_file)
if get_microsoft_format(item) is not None:
microsoft_files.append(item)
for microsoft_file in microsoft_files:
update(config_file_name, microsoft_file)


def dmeta_help():
Expand Down
5 changes: 5 additions & 0 deletions dmeta/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,8 @@
"company": "Company",
"manager": "Manager"
}
SUPPORTED_MICROSOFT_FORMATS = [
"docx",
"pptx",
"xlsx"
]
37 changes: 19 additions & 18 deletions dmeta/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from shutil import rmtree
from zipfile import ZipFile
import defusedxml.ElementTree as ET
from .params import SUPPORTED_MICROSOFT_FORMATS


def extract_namespaces(xml_file_path):
Expand All @@ -18,43 +19,43 @@ def extract_namespaces(xml_file_path):
namespaces = {}
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Extract namespaces from the root element
for key, value in root.attrib.items():
if key.startswith('xmlns:'):
_, _, cropped_name = key.partition(':')
namespaces[cropped_name] = value
elif key == 'xmlns':
namespaces['xmlns'] = value

return namespaces


def remove_format(docx_file_name):
def get_microsoft_format(file_name):
"""
Remove the format from the end of the .docx file name.
Extract format from the end of the given microsoft file name.
:param docx_file_name: name of .docx file
:type docx_file_name: str
:return: str (the .docx file name without format at the end)
:param file_name: name of the microsoft file
:type file_name: str
:return: str (the format, e.g. "docx"), or None if the name has no extension or the extension is not supported
"""
last_dot_index = docx_file_name.rfind('.')
if (last_dot_index != -1):
docx_file_name = docx_file_name[:last_dot_index]
return docx_file_name
last_dot_index = file_name.rfind('.')
if (last_dot_index == -1):
return None
format = file_name[last_dot_index + 1:]
if format not in SUPPORTED_MICROSOFT_FORMATS:
return None
return format


def extract_docx(docx_file_name):
def extract(file_name):
"""
Zip and extract the .docx file.
Open the microsoft file as a zip archive and extract its content.
:param docx_file_name: name of .docx file
:type docx_file_name: str
:param file_name: name of microsoft file
:type file_name: str
:return: (str, ZipFile) as (unzipped directory, ZipFile instance to work with the extracted content)
"""
docx_file_name = remove_format(docx_file_name)
source_file = ZipFile(docx_file_name + ".docx")
unzipped_dir = os.path.join(docx_file_name + "_unzipped")
source_file = ZipFile(file_name)
unzipped_dir = os.path.join(file_name[:file_name.rfind(".")] + "_unzipped")
rmtree(unzipped_dir, ignore_errors=True)
os.mkdir(unzipped_dir)
source_file.extractall(unzipped_dir)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
art>=1.8
defusedxml>=0.5.0
defusedxml>=0.7.1
lxml>=5.2.2
Binary file added tests/test_a.pptx
Binary file not shown.
Binary file added tests/test_a.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_dmeta.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dmeta.functions import update, update_all, clear, clear_all
import os
import os

TESTS_DIR_PATH = os.path.join(os.getcwd(), "tests")

Expand Down

0 comments on commit 050cb99

Please sign in to comment.