-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathregex_utils.py
90 lines (71 loc) · 3.29 KB
/
regex_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import regex
import re
from enum import Enum
'''Module to support choice of regex engine at runtime (re or regex)'''
# Regex engine modules, indexed by EngineType value (RE=0 -> re, REGEX=1 -> regex).
_regex_engine=[re,regex]
class EngineType(Enum):
    """Selector for the regular-expression backend used by this module.

    Each member's value is the position of the matching module in the
    module-level ``_regex_engine`` list.
    """

    RE = 0  # standard-library ``re``
    REGEX = 1  # third-party ``regex``
def to_engine_type(s):
    """Look up the ``EngineType`` member whose name is *s*.

    Args:
        s: Member name, e.g. ``"RE"`` or ``"REGEX"`` (case-sensitive).

    Returns:
        The matching ``EngineType`` member.

    Raises:
        NameError: If *s* is not the name of an ``EngineType`` member. The
            offending value is included in the message and stored on the
            exception's ``name`` attribute.
    """
    try:
        return EngineType[s]
    except (KeyError, TypeError) as err:
        # Build the error without the ``name=`` keyword: NameError only
        # accepts it from Python 3.10, and on older interpreters that call
        # raises TypeError instead of the intended NameError.
        error = NameError(f"Unrecognized regex engine type: {s!r}")
        error.name = s
        raise error from err
def to_regex_engine(etype):
    """Return the regex module selected by *etype*.

    Args:
        etype: An object whose ``value`` attribute indexes the module-level
            ``_regex_engine`` list (normally an ``EngineType`` member).

    Returns:
        The module stored at that index of ``_regex_engine``.

    Raises:
        NameError: If *etype* has no ``value`` attribute or the value is not
            a valid index into ``_regex_engine``.
    """
    try:
        return _regex_engine[etype.value]
    except (AttributeError, IndexError, TypeError) as err:
        # AttributeError: no ``.value``; IndexError / TypeError: the value is
        # out of range or not usable as a list index. Anything else (e.g.
        # KeyboardInterrupt) is deliberately left to propagate.
        raise NameError(f'Unrecognized regex engine index {etype}') from err
def flags_from_array(flist, etype):
    """Combine one or more flag names into a single flags value.

    Args:
        flist: Either a list of flag names or a single flag name as a string.
        etype: Engine selector passed through to ``flag_from_string``.

    Returns:
        The bitwise OR of ``flag_from_string(name, etype)`` for every name,
        or ``0`` when the list is empty.

    Raises:
        TypeError: If *flist* is neither a list nor a string.
    """
    if isinstance(flist, str):
        # A lone name is treated as a one-element list.
        names = [flist]
    elif isinstance(flist, list):
        names = flist
    else:
        raise TypeError("ERROR: regular expression 'flag' values should be lists or single strings.")

    combined = 0
    for name in names:
        combined |= flag_from_string(name, etype)
    return combined
def flag_from_string(s, etype):
    """Translate a flag name into the selected engine's flag constant.

    Args:
        s: Long or one-letter flag name: ``ASCII``/``A``, ``IGNORECASE``/``I``,
            ``MULTILINE``/``M``, ``DOTALL``/``S``, ``VERBOSE``/``X`` or
            ``LOCALE``/``L``.
        etype: Engine selector passed to ``to_regex_engine``.

    Returns:
        The attribute with the long flag name on the engine module, or ``0``
        when *s* is not one of the recognised names.
    """
    # (long name, one-letter alias) pairs, checked in this order.
    known_flags = (
        ("ASCII", "A"),
        ("IGNORECASE", "I"),
        ("MULTILINE", "M"),
        ("DOTALL", "S"),
        ("VERBOSE", "X"),
        ("LOCALE", "L"),
    )
    for long_name, alias in known_flags:
        if s == long_name or s == alias:
            # The engine is only resolved once a name has matched, so an
            # unrecognised name yields 0 without touching ``etype``.
            return getattr(to_regex_engine(etype), long_name)
    return 0
def recursive_sub(s, find, replace, flags, etype):
    """Apply a substitution repeatedly until the text stops changing.

    Args:
        s: Text to process.
        find: Pattern passed to the engine's ``sub``.
        replace: Replacement passed to the engine's ``sub``.
        flags: Flags value passed to the engine's ``sub``.
        etype: Engine selector passed to ``to_regex_engine``.

    Returns:
        The first result that is equal to the text it was produced from.

    Note:
        The loop only ends when a pass leaves the text unchanged, so a
        substitution that alters its input on every pass never returns.
    """
    # Resolve the engine once; it does not change between passes.
    engine = to_regex_engine(etype)
    current = s
    while True:
        result = engine.sub(find, replace, current, flags=flags)
        if result == current:
            return result
        current = result
def search(find, s, flags, etype):
    """Run the selected engine's ``search`` and add context to any failure.

    Args:
        find: Pattern to search for.
        s: Text to search in.
        flags: Flags value passed to the engine's ``search``.
        etype: Engine selector passed to ``to_regex_engine``.

    Returns:
        Whatever the engine's ``search`` returns.

    Raises:
        Exception: If resolving the engine or running the search fails. The
            message records the engine, pattern, text and their types, and
            the original error is attached as ``__cause__``.
    """
    try:
        return to_regex_engine(etype).search(find, s, flags=flags)
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying error as the
        # direct cause of this wrapper.
        raise Exception(f"Regex search error. etype={str(etype)}, regex='{find}', s='{s}', type(regex)={type(find)}, type(s)={type(s)}. Underlying error ={e}") from e
def sub(s, find, replace, flags, etype):
    """Run a single substitution pass with the selected engine.

    Args:
        s: Text to process.
        find: Pattern passed to the engine's ``sub``.
        replace: Replacement passed to the engine's ``sub``.
        flags: Flags value passed to the engine's ``sub``.
        etype: Engine selector passed to ``to_regex_engine``.

    Returns:
        The result of the engine's ``sub`` call.
    """
    engine = to_regex_engine(etype)
    return engine.sub(find, replace, s, flags=flags)
def compile(s, flags, etype):
    """Compile a pattern with the selected engine.

    Args:
        s: Pattern source to compile.
        flags: Flags value passed to the engine's ``compile``.
        etype: Engine selector passed to ``to_regex_engine``.

    Returns:
        The compiled pattern object produced by the engine.
    """
    engine = to_regex_engine(etype)
    compiled = engine.compile(s, flags=flags)
    return compiled
def list_to_regex(regex_set):
    """Combine regular expressions into one alternation.

    Each element is wrapped in its own parenthesised group, the groups are
    joined with ``|``, and the whole alternation is wrapped in an outer group,
    e.g. ``["a", "b"]`` becomes ``"((a)|(b))"``.

    Args:
        regex_set: Iterable of regular-expression strings.

    Returns:
        The combined pattern string. An empty iterable yields ``"(())"``.
    """
    return "((" + ")|(".join(regex_set) + "))"
def compile_set(regex_set, pre_regex='', post_regex='', single_regex=True, flags=0, etype=EngineType.REGEX):
    """Compile a list of phrases or regular expressions ready for redaction.

    ``pre_regex`` and ``post_regex`` are added to the top and tail of the
    pattern(s) before compiling.

    Args:
        regex_set: Phrases or regular expressions to compile.
        pre_regex: Text prepended to each pattern that is compiled.
        post_regex: Text appended to each pattern that is compiled.
        single_regex: If true, the members are combined with
            ``list_to_regex`` into one pipe-separated pattern and
            ``pre_regex``/``post_regex`` wrap that combined pattern once.
            If false, every member is wrapped and compiled separately.
            The combined form is a lot more efficient; only disable it if
            the size of the combined regular expression is a problem.
        flags: Flags value passed to ``compile``.
        etype: Engine selector passed to ``compile``.

    Returns:
        A list of compiled patterns: a single-element list when
        ``single_regex`` is true, otherwise one entry per member of
        ``regex_set``.
    """
    if single_regex:
        combined = pre_regex + list_to_regex(regex_set) + post_regex
        return [compile(combined, flags, etype)]
    return [compile(pre_regex + r + post_regex, flags, etype) for r in regex_set]