-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdocx.py
102 lines (90 loc) · 3.59 KB
/
docx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#################################################
# #
# Requires lxml to be installed #
# #
# Credit to 'gtozzi' for some code at #
# https://github.com/mikemaccana/python-docx #
# #
#################################################
import io
import zipfile

from lxml import etree

import panscan
from logger import Logger
from scan import Scanner
# Short prefix -> XML namespace URI lookup for the namespaces used when
# working with the XML parts of a .docx package.
nsprefixes = {
    # --- Text content ---
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o': 'urn:schemas-microsoft-com:office:office',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v': 'urn:schemas-microsoft-com:vml',
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',

    # --- Drawing ---
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',

    # --- Properties (core and extended) ---
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',

    # --- Content types ('ct' is a locally chosen shorthand prefix) ---
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',

    # --- Package relationships ('pr' is a locally chosen shorthand prefix) ---
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
}
def _paragraph_texts(document):
    """Return the text of every non-empty ``w:p`` paragraph under *document*.

    The text of a paragraph is the concatenation of the text of all ``w:t``
    elements found anywhere inside it. Paragraphs with no text are omitted.
    """
    para_tag = '{' + nsprefixes['w'] + '}p'
    text_tag = '{' + nsprefixes['w'] + '}t'
    texts = []
    for para in document.iter(para_tag):
        text = u''.join(node.text for node in para.iter(text_tag) if node.text)
        if text:
            texts.append(text)
    return texts


def plugin(file, data=None):
    """Scan the paragraphs of a .docx document for PANs and log each hit.

    Args:
        file: Path (or file-like object) of the document. It is opened as a
            zip archive only when *data* is None, and is always the name
            passed to ``Logger().log_pan`` for reporting.
        data: Optional raw bytes of the document. When given, the archive is
            read from these bytes instead of from *file*.

    Returns:
        None. Results are reported through ``Logger().log_pan``; failures to
        open or parse the document are reported through
        ``Logger().log_error`` and end the scan early.
    """
    if data is not None:
        try:
            # zipfile needs a seekable binary file-like object.
            source = io.BytesIO(data)
        except Exception as e:
            Logger().log_error(e)
            return
    else:
        source = file

    try:
        mydoc = zipfile.ZipFile(source)
    except (zipfile.BadZipfile, zipfile.LargeZipFile) as e:
        Logger().log_error(e)
        return

    try:
        xmlcontent = mydoc.read('word/document.xml')
    except (KeyError, IOError, RuntimeError, zipfile.BadZipfile) as e:
        # KeyError: member is missing from the archive;
        # RuntimeError: member is encrypted and no password was supplied.
        Logger().log_error(e)
        return
    finally:
        # Release the archive handle whether or not the read succeeded.
        mydoc.close()

    try:
        document = etree.fromstring(xmlcontent)
    except Exception as e:
        # Best effort: any parse failure is logged and the file is skipped.
        Logger().log_error(e)
        return

    # Scan and log. Numbering starts at 1 and counts non-empty paragraphs
    # only, because empty paragraphs are dropped before scanning.
    for para_num, text in enumerate(_paragraph_texts(document), 1):
        for pan in panscan.panscan(text):
            Logger().log_pan(
                file, pan, "Paragraph number " + str(para_num) + ".")