-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
27 lines (22 loc) · 1.04 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pandas as pd
# Before any real work, some unwanted string pieces have to be removed, in order to correctly identify
# routes that share stops. The biggest cause of headache is the fact that if a metro station is called
# "Sesame street" then the corresponding bus station is called "Sesame street M" with no id or
# parent station that can connect the two
# I also did some cleaning by hand for rare occurrences, that are extremely hard to catch otherwise
unwanted = [' M+H',' M', ' H',
' [A]',' [B]',' [C]',
' [D]',' [E]',' [F]',
' [G]',' [H]',' [I]',
' [J]',' [K]',' [L]',
' [M]',' [N]', ' [2]',
' [3]',' [4]',' [5]',
' [6]',' [7]',' [8]',
' [9]',' [10]'' [11]',
' P+R']
df = pd.read_csv('stops.txt',header=0)
for i in range(len(df)):
for u in unwanted:
if u in df.iloc[i,1]:
df.iloc[i,1] = df.iloc[i,1].replace(u,'') # Replace unwanted strings with nothing
df.to_csv('stops.csv',index=False)