-
Notifications
You must be signed in to change notification settings - Fork 23
/
07_code.py
254 lines (200 loc) · 8.96 KB
/
07_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# -*- coding: utf-8 -*-
"""07_code.py
Author -- Michael Widrich
Contact -- [email protected]
Date -- 01.10.2019
###############################################################################
The following copyright statement applies to all code within this file.
Copyright statement:
This material, no matter whether in printed or electronic form, may be used for
personal and non-commercial educational use only. Any reproduction of this
manuscript, no matter whether as a whole or in parts, no matter whether in
printed or in electronic form, requires explicit prior acceptance of the
authors.
###############################################################################
In this file we will look into how to use the re module to search for
more complex patterns in strings via regular expressions ("regex").
"""
###############################################################################
# re - searching for complicated patterns in text via regular expressions
###############################################################################
# The re module allows you to search for complex patterns in text. If you are
# only looking for simple patterns, the native python string functions (e.g.
# 'stringwithsubstring'.find('substring')) are simpler and faster and should
# be preferred.
# Regex syntax can become quite complex. You can find the documentation at
# https://docs.python.org/3/library/re.html .
# You can use tools like https://www.debuggex.com/r/gj5buG9fdS-UJQHu to
# debug a regex.
# If you are struggling to build a specific pattern, it might pay off to
# google it first. Chances are that someone already created a similar pattern.
import re
#
# Searching for one occurrence of a pattern
#
# There are two important functions for finding a pattern or a group of
# patterns in a string: re.search() and re.match().
# re.search() will search for the first occurrence of a pattern and return a
# MatchObject object if it found the pattern. If the pattern is not found,
# None will be returned.
# Strings are scanned from start (left) to end (right).
# Look for the literal pattern anywhere in the text and show the result.
pattern = 'Elm Street'
text = 'Ross McFluff: 155 Elm Street'
matchobject = re.search(pattern, text)
print(f'{text} + {pattern} -> {matchobject}')
# A MatchObject always evaluates to True in conditions, whereas None (no
# pattern found) evaluates to False. We can therefore use the result directly
# in an if condition to check if the pattern was found:
if matchobject:
    # This will only be executed if the pattern was found
    print(f'{text} + {pattern} -> {matchobject}')
# re.match() will search for patterns only at the beginning of the string (even
# if it is a multi-line string).
# The same text is tried against two patterns. 'Ross Mc' sits at the very
# beginning of the text and is therefore found by re.match(). 'Elm Street'
# only occurs further to the right, so re.match() returns None for it.
text = 'Ross McFluff: 155 Elm Street'
for pattern in ('Ross Mc', 'Elm Street'):
    matchobject = re.match(pattern, text)
    print(f'{text} + {pattern} -> {matchobject}')
#
# Returning groups within patterns
#
# You can use groups to only return sub-patterns within a search-pattern.
# Groups are created using brackets ().
# This will match the string 'Elm Street' and return 'Elm' and 'Str'
# separately in groups:
text = 'Ross McFluff: 155 Elm Street'
pattern = '(Elm) (Str)eet'
matchobject = re.search(pattern, text)
print(f'{text} + {pattern} -> {matchobject}')
# The found pattern(s) are available via MatchObject.group(i), where i is the
# number of the group to retrieve:
#   MatchObject.group() or MatchObject.group(0) -> the complete pattern
#   MatchObject.group(1)                        -> the first group
#   MatchObject.group(2)                        -> the second group, etc.
# MatchObject.groups() returns all found pattern groups at once.
print(
    f'{text} + {pattern} -> .groups() -> {matchobject.groups()}',
    f'{text} + {pattern} -> .group() -> {matchobject.group()}',
    f'{text} + {pattern} -> .group(0) -> {matchobject.group(0)}',
    f'{text} + {pattern} -> .group(1) -> {matchobject.group(1)}',
    f'{text} + {pattern} -> .group(2) -> {matchobject.group(2)}',
    sep='\n',
)
# You can also nest groups:
text = 'Ross McFluff: 155 Elm Street'
# Group 1 is the outer bracket pair ('Street'), group 2 the inner one ('Str').
pattern = 'Elm ((Str)eet)'
matchobject = re.search(pattern, text)
print(
    f'{text} + {pattern} -> .groups() -> {matchobject.groups()}',
    f'{text} + {pattern} -> .group() -> {matchobject.group()}',
    f'{text} + {pattern} -> .group(0) -> {matchobject.group(0)}',
    f'{text} + {pattern} -> .group(1) -> {matchobject.group(1)}',
    f'{text} + {pattern} -> .group(2) -> {matchobject.group(2)}',
    sep='\n',
)
#
# Getting additional data from MatchObject objects
#
# Besides the found pattern itself, a MatchObject also provides positional
# information for each individual group: the start position, the end position
# and the span, i.e. the tuple (start, end).
print(f'{text} + {pattern} -> .group(1) -> {matchobject.group(1)}\n'
      f' start, group 1: {matchobject.start(1)}\n'
      f' end, group 1: {matchobject.end(1)}\n'
      f' span, group 1: {matchobject.span(1)}')
# The end position is the index of the last matched character + 1, so that
# start and end can be used directly for slicing:
print(
    f'{text[matchobject.start(1):matchobject.end(1)]}',
    f'{text[matchobject.start(1)]}',
    f'{text[matchobject.end(1) - 1]}',
    sep='\n',
)
#
# Searching for (non-overlapping) multiple occurrences of a pattern
#
# re.findall() will search for all occurrences of a pattern in the text and
# return them as a list. re.finditer() also does this but one item at a time:
# it returns an iterator that yields one MatchObject per occurrence.
# Strings are scanned from start (left) to end (right).
pattern = 'bla'
text = 'blablablibla'
match_list = re.findall(pattern, text)
# Note that re.findall() will return an empty list (not None) if no pattern
# was found
print(f'{text} + {pattern} -> {match_list}')
# re.finditer() yields one MatchObject per occurrence, which also gives access
# to the position of each found pattern:
for i, p in enumerate(re.finditer(pattern, text)):
    print(f'{i}. pattern: {p.group()} start: {p.start()} end: {p.end()}')
# You can again use groups to return sub-patterns:
text = 'blablablibla'
pattern = '(bl)(a)'
# With several groups in the pattern, every list entry is a tuple holding the
# groups of one occurrence. If no pattern was found, the list is empty.
match_list = re.compile(pattern).findall(text)
print(f'{text} + {pattern} -> {match_list}')
#
# Making a pattern flexible
#
# Patterns can include meta-characters in regex syntax to search for flexible
# patterns. The regex syntax uses the special characters {}[]()^$.|*+?
# If you want to use them as normal characters in a string, you need to escape
# their special function with a preceding backslash "\". For example "\?".
# Each entry pairs a pattern with the text it is applied to.
_flexible_examples = (
    # [] will specify a set of characters to match.
    ('[cbr]at', 'cat bat rat dog'),
    # Ranges can be used within []: [0-2] will match the digits from 0 to 2.
    # [0-9a-fA-F] will e.g. match a hexadecimal digit (all digits from 0 to 9
    # and all characters from a to f, both upper and lower case).
    ('[0-5a-c]at', 'cat bat rat dog 3at 7at'),
    # A leading ^ negates the character set. [^0-9] will match all
    # non-numerical characters.
    ('[^0-9]', 'a1b2c3'),
    # There exist predefined groups of characters, such as \d for numerical
    # or \D for non-numerical characters. Important: the \ has to end up in
    # the string, meaning you need to escape it ('\\d') or use a raw string.
    (r'\d', 'a1b2c3'),
)
for pattern, text in _flexible_examples:
    match_list = re.findall(pattern, text)
    print(f'{text} + {pattern} -> {match_list}')
# See https://docs.python.org/3/library/re.html for an exhaustive list of
# special characters and their meaning.
#
# Searching for alternative patterns
#
# The | character can be used to search for alternative patterns
# Either side of the | may match: one of bat/cat/rat, or dog.
pattern, text = '[bcr]at|dog', 'cat bat rat dog'
match_list = re.compile(pattern).findall(text)
print(f'{text} + {pattern} -> {match_list}')
# Brackets () restrict the alternatives to a part of the search-pattern. As
# the pattern then contains a group, only the group content is returned:
text = 'this matches ai student and ml students and returns "ai" or "ml"'
pattern = '(ai|ml) student'
match_list = re.compile(pattern).findall(text)
print(f'{text} + {pattern} -> {match_list}')
#
# Repetitions in patterns
#
# * will match 0 or more repetitions and is by default greedy (i.e. searches
# for the longest possible match).
# + will match 1 or more repetitions and is also by default greedy.
text = 'catcat batbat ratratrat dog'
_repetition_patterns = (
    # greedy, 0 or more repetitions
    '([bcr]at|dog)*',
    # greedy, 1 or more repetitions
    '([bcr]at|dog)+',
    # To be non-greedy, you need to add the suffix ?
    '([bcr]at|dog)+?',
)
for pattern in _repetition_patterns:
    matchobject = re.search(pattern, text)
    print(f'{text} + {pattern} -> {matchobject.group()}')
# findall will only report the captured groups (here for the last, non-greedy
# pattern):
match_list = re.findall(pattern, text)
print(f'{text} + {pattern} -> {match_list}')
#
# Substituting and splitting strings
#
# There are many more functions available via the re module, such as split
# and sub (substitution).
# Please see https://docs.python.org/3/library/re.html for more information.