-
Notifications
You must be signed in to change notification settings - Fork 1
/
location_subtotals.py
153 lines (130 loc) · 8.64 KB
/
location_subtotals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import re
from copy import deepcopy
from os.path import join, dirname
from glob import glob
import pandas as pd
MACRO_LOCATIONS = ['intersticial', 'arcabouço', 'arcabouço e intersticial']

# Default workbook mapping detailed diagenetic locations to macro locations.
# The path is relative to the current working directory.
DIAGENESE_MAPPING_PATH = './subtotals_instructive_tables/Categorias de Localização Diagenética revDeRos.xlsx'

# Number of ' - '-separated attributes that identifies each compositional type.
_COMPOSITIONAL_TYPE_BY_ATTRIBUTE_COUNT = {3: 'primary', 7: 'diagenetic', 6: 'porosity'}


def _add_diagenetic_subtotal(result_dataset, feature_name, replaced_attribute, macro_location, feature_values):
    """Accumulate one diagenetic column into its macro-location subtotal column (in place)."""
    # The subtotal name keeps every attribute except the replaced one and
    # appends the macro location as the last attribute.
    attributes = feature_name.drop(labels=replaced_attribute)
    column = '[diagenetic-subtotal]' + ' - '.join(list(attributes) + [macro_location])
    if column not in result_dataset:
        result_dataset[column] = pd.Series(0, index=result_dataset.index)
    result_dataset[column] += feature_values


def calculate_subtotals(target_path, idiom, diagenese_mapping=None):
    """Compute location subtotals for the dataset stored at ``target_path``.

    Column names are lower-cased and stripped of bracketed text, then classified
    by their number of ' - '-separated attributes: 3 -> primary, 7 -> diagenetic,
    6 -> porosity. Any other column is left unclassified.

    Args:
        target_path: Path of a comma-separated file; its first column is used
            as the index.
        idiom: Suffix selecting the ``'VALUE_' + idiom`` column of the mapping
            table (the script entry point passes ``'PTBR'``).
        diagenese_mapping: Optional DataFrame holding the ``'VALUE_' + idiom``
            and ``'location'`` columns. When omitted, the table is read from
            ``DIAGENESE_MAPPING_PATH``. All of its values are compared as
            lower-cased strings.

    Returns:
        A DataFrame indexed like the input, holding the primary, diagenetic and
        porosity subtotal columns, the aggregated macro-location columns and
        whichever of 'petrofacie', 'porosity', 'grain_size' and
        'phi stdev sorting' exist in the input.

    Raises:
        ValueError: If a diagenetic column has neither a location nor a
            paragenetic relation constituent location, if no macro location can
            be found for it in the mapping table, or if any subtotal is NaN.
    """
    dataset = pd.read_csv(target_path, delimiter=',', index_col=0)
    feature_names = [re.sub(r'\[.*\]', '', column.lower()) for column in dataset.columns.values]
    dataset.columns = feature_names

    result_dataset = pd.DataFrame(index=dataset.index)

    def extract_compositional_type(s):
        return _COMPOSITIONAL_TYPE_BY_ATTRIBUTE_COUNT.get(s.count(' - ') + 1, '')

    compositional_types = [extract_compositional_type(feature_name) for feature_name in feature_names]

    def attributes_of(wanted_type, attribute_names):
        # One row per column of the wanted type, one field per attribute.
        rows = [feature_name.split(' - ')
                for feature_name, compositional_type in zip(feature_names, compositional_types)
                if compositional_type == wanted_type]
        return pd.DataFrame(rows, columns=attribute_names)

    # ==================================================================================================================
    # PRIMARY SUBTOTALS
    primary_attributes = attributes_of('primary', ['constituent', 'location', 'modification'])
    for name, _ in primary_attributes.groupby(['constituent', 'location']):
        group_name = ' - '.join(name)
        # Escape the group name so metacharacters in it (e.g. parentheses) match literally.
        # NOTE(review): the pattern has no '^' anchor (unlike the porosity one), so it also
        # matches columns that merely contain the group name - confirm this is intended.
        # NOTE(review): every primary subtotal is labelled ' - framework' regardless of its
        # location - confirm.
        pattern = f'{re.escape(group_name)}.*-[^-]*'
        result_dataset['[primary-subtotal]' + group_name + ' - framework'] = \
            dataset.filter(regex=pattern).sum(axis=1)
    # ==================================================================================================================

    # ==================================================================================================================
    # DIAGENETIC SUBTOTALS
    if diagenese_mapping is None:
        diagenese_mapping = pd.read_excel(DIAGENESE_MAPPING_PATH)
    diagenese_mapping = diagenese_mapping.apply(lambda x: x.astype(str).str.lower())
    value_column = 'VALUE_' + idiom
    relation_location_key = 'paragenetic relation constituent location'

    # 'consituent' (sic) is only an internal label; it never reaches the output.
    diagenetic_attributes = attributes_of(
        'diagenetic',
        ['consituent', 'habit', 'location', 'modification', 'paragenetic relation',
         'paragenetic relation constituents', relation_location_key])

    for _, feature_name in diagenetic_attributes.iterrows():
        full_name = ' - '.join(feature_name)
        feature_values = dataset[full_name]
        location = feature_name['location']
        relation_location = feature_name[relation_location_key]

        if location == '' and relation_location == '':
            raise ValueError(f'Line:\n{full_name} is not complete enough on file {target_path}.\n'
                             f'At least its LOCATION or PARAGENETIC RELATION CONSTITUENT LOCATION have to be filled '
                             f'properly.')

        if relation_location in MACRO_LOCATIONS:
            # The attribute already is a macro location, so it is used directly. The previous
            # code read `query` here, which is unassigned on the first feature and stale on
            # later ones.
            _add_diagenetic_subtotal(result_dataset, feature_name, relation_location_key,
                                     relation_location, feature_values)
            continue

        # Look the macro location up by the paragenetic relation constituent location first
        # and fall back to the feature's own location.
        replaced_attribute = relation_location_key
        query = diagenese_mapping[diagenese_mapping[value_column] == relation_location]
        if query.empty:
            replaced_attribute = 'location'
            query = diagenese_mapping[diagenese_mapping[value_column] == location]
        if query.empty:
            raise ValueError(f'Could not define macro location for {full_name} in file {target_path}')

        _add_diagenetic_subtotal(result_dataset, feature_name, replaced_attribute,
                                 query['location'].values[0], feature_values)
    # ==================================================================================================================

    # ==================================================================================================================
    # POROSITY SUBTOTALS
    porosity_attributes = attributes_of(
        'porosity',
        ['porosity', 'location', 'modification', 'paragenetic relation',
         'paragenetic relation constituents', relation_location_key])
    for name, _ in porosity_attributes.groupby(['porosity', 'location']):
        group_name = ' - '.join(list(name))
        pattern = f'^{re.escape(group_name)}.*-.*-.*-.*-[^-]*'
        result_dataset['[porosity-subtotal]' + group_name] = dataset.filter(regex=pattern).sum(axis=1)
    # ==================================================================================================================

    if any(result_dataset.isna().any().values):
        print(result_dataset.isna().any())
        raise ValueError('There should not be any NaN values inside the subtotals data frame!')

    # ==================================================================================================================
    # OTHER SUBTOTALS
    # NOTE(review): these filters run on `dataset`, whose column names had their bracketed
    # text removed above, so the '^\[' patterns look unable to match and would produce
    # all-zero columns. `result_dataset` may have been intended - confirm before changing.
    for compositional_type in ('porosity', 'primary', 'diagenetic'):
        for macro_location in MACRO_LOCATIONS:
            result_dataset[f'[{macro_location}-{compositional_type}-subtotal]'] = \
                dataset.filter(regex=rf'^\[{compositional_type}.*{macro_location}[^-]*$').sum(axis=1)

    result_dataset['[framework-subtotal]'] = dataset.filter(regex='.*framework[^-]*$').sum(axis=1)
    result_dataset['[interstitial-subtotal]'] = dataset.filter(regex='.*interstitial[^-]*$').sum(axis=1)
    # ==================================================================================================================

    # Carry over the non-compositional columns that exist in the input.
    for column in ('petrofacie', 'porosity', 'grain_size', 'phi stdev sorting'):
        if column in dataset.columns:
            result_dataset[column] = dataset[column]

    return result_dataset
if __name__ == '__main__':
    # Suffix of the mapping-table value column handed to calculate_subtotals.
    idiom = 'PTBR'

    # target_paths = glob('datasets/*/dataset.csv')
    target_paths = ['dataset_julia.csv']

    for target_path in target_paths:
        print(f'processing {target_path}')
        # The subtotals are written next to the dataset they were computed from.
        output_path = join(dirname(target_path), 'subtotals_dataset.csv')
        subtotals_df = calculate_subtotals(target_path, idiom)
        subtotals_df.to_csv(output_path)
    print('DONE')