-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathemail_parser.py
140 lines (110 loc) · 3.27 KB
/
email_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.4'
# jupytext_version: 1.2.1
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# Extract fields to csv file for data review
#
#
# - parse email .msg files body
# - parse email subject
# - compile regex for all email data features
# - add groups to named tuple to match features
# - append to one list of named tuples
# - write final csv
import extract_msg # for msg parsing
import re
from pathlib import Path
from typing import NamedTuple
from collections import namedtuple
# named tuple email data container
class EmailData(NamedTuple):
    """Container for the fields extracted from a single email.

    Every field holds text: either the raw regex capture from the email body
    or the fallback value chosen by the corresponding parser helper, so all
    fields are annotated as ``str``.
    """

    ref_num: str  # reference number text, or the email subject as fallback
    phone_num: str  # kept as text; the capture is never converted to a number
    first_name: str
    last_name: str
    address: str
    email: str
# +
# Create one regex per feature
# TODO improve regex
def _ref_num(email: str, subject: str) -> str:
"""
Returns ref numer if found else settles for email suject
"""
patt = re.search(r" number:(?P<ref_number>.*)\n", email)
if patt:
return patt.group(1)
else:
return subject
def _phone(email: str) -> str:
patt = re.search(r"Telephone:(?P<phone>.*)\n", email)
if patt:
return patt.group(1)
else:
return "n/a"
def _first_name(email: str) -> str:
patt = re.search(r"First name:(?P<f_name>.*)\n", email)
if patt:
return patt.group(1)
else:
return "n/a"
def _last_name(email: str) -> str:
patt = re.search(r"Last name:(?P<l_name>.*)\n", email)
if patt:
return patt.group(1)
else:
return "n/a"
def _address(email: str) -> str:
patt = re.search(r"Address:(?P<address>.*)\n", email)
if patt:
return patt.group(1)
else:
return "n/a"
def _email(email: str) -> str:
patt = re.search(r"Email:(?P<email>.*)\n", email)
if patt:
return patt.group(1).strip(r"\r")
else:
return "n/a"
def email_regex(email: str, subject: str) -> NamedTuple:
    """Run every field parser over one email body and bundle the results.

    *subject* is passed only to the reference-number parser, which uses it
    as its fallback value; the other parsers look at *email* alone.
    Returns an ``EmailData`` record.
    """
    return EmailData(
        ref_num=_ref_num(email, subject),
        phone_num=_phone(email),
        first_name=_first_name(email),
        last_name=_last_name(email),
        address=_address(email),
        email=_email(email),
    )
# +
# Lazily enumerate every .msg file in the test folder
emails = Path(r"emails_test/").glob("*.msg")

# Collect one EmailData record per message file
emails_data = []
for email_path in emails:
    # The context manager closes the message once its fields are parsed
    with extract_msg.Message(email_path) as msg:
        msg_body, msg_subject = msg.body, msg.subject
        emails_data.append(email_regex(msg_body, msg_subject))
# -
def strip_string(feature: str) -> str:
    """Return *feature* without leading or trailing whitespace.

    Carriage returns and newlines count as whitespace, so a single
    ``strip()`` removes them together with spaces and tabs.
    """
    cleaned = feature.strip()
    return cleaned
# +
import csv

# Write one header row followed by one cleaned row per parsed email.
# newline='' is what the csv module expects for files it writes: the writer
# emits its own line terminators, and without it platforms that translate
# "\n" on write end up with an extra "\r" and blank rows between records.
with open('test_email_data.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    headers = ['ref_number', 'phone_num', 'first_name', 'last_name', 'address', 'email']
    csv_writer.writerow(headers)
    for email in emails_data:
        # Each EmailData record iterates over its fields in declaration order
        features = [strip_string(feature) for feature in email]
        csv_writer.writerow(features)