-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistics_helpers.py
706 lines (663 loc) · 42.1 KB
/
statistics_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
from pathlib import Path
import cv2
import matplotlib
from math import ceil, sqrt
from typing import Any
from matplotlib.colors import LinearSegmentedColormap
import mne
import numpy as np
import pandas as pd
import scipy
from cliffs_delta import cliffs_delta
from matplotlib import cm as cm
from matplotlib import pyplot as plt
from scipy.stats import norm, shapiro
from tqdm.notebook import tqdm
from utils.eeg_helpers import (get_erp_description, load_all_erp_averages,
round_time_EEG, statistics_distribution)
from utils.eeg_settings import *
from utils.file_settings import SEPARATOR, SNIPPET_COLUMN_VARIANT
from utils.json_helpers import JSONFileBuffers, JSONTypesEncoder
from utils.path_helpers import (get_erp_fixation_analysis_path,
get_erp_statistics_path)
from utils.snippet_settings import (CONDITION, CONDITION_CLEAN,
CONDITION_COLORS, CONDITION_CONFUSING,
CONDITION_DIFF, SNIPPET_CLEAN, SNIPPET_OBF)
from utils.statistics_settings import (ALPHA, ALTERNATIVE_HYPOTHESIS,
ALTERNATIVE_HYPOTHESIS_GREATER,
ALTERNATIVE_HYPOTHESIS_LESS, EFFECT_STRENGTH_LARGE, EFFECT_STRENGTH_MEDIUM, EFFECT_STRENGTH_NEGLIGIBLE, EFFECT_STRENGTH_SMALL, MEAN,
MEASURE,
PERMUTATION_CONFIDENCE_INTERVAL,
PERMUTATION_COUNT,
PERMUTATION_COUNT_MAX,
PERMUTATION_COUNT_START,
PERMUTATION_MAX_STEP_FREQUENCY_FACTOR, PERMUTATION_PLOT_PIXEL_X, PERMUTATION_PLOT_PIXEL_Y,
PERMUTATION_TEST_COLORS,
PERMUTATION_TEST_PARAMETERS, PVALUE,
STATISTIC_INFER_H1_ACCEPTED,
STATISTIC_INFER_H1_NON_ACCEPTED, STD,
TEST, TEST_FUNCTION,
TEST_KIND_INDEPENDENT,
TEST_KIND_NON_PARAMETRIC,
TEST_KIND_PAIRED, TEST_KIND_PARAMETRIC,
TEST_MANNWHITNEYU, TEST_T_INDEPENDENT,
TEST_T_PAIRED, TEST_WILCOXON)
from utils.textconstants import CHANNEL, PARTICIPANT
from utils.visual_settings import (FIXATION_SELECTION_ALGORITHM,
FIXATION_SELECTION_ALGORITHMS)
def statistical_analysis_for_subjectwise_averages(analysis_data: pd.DataFrame, alternative_hypotheses_per_measure: dict[str, str],
                                                  additional_groupers: list = [], stest_kind: str = TEST_KIND_INDEPENDENT, log: bool = True, independent_variable: str = CONDITION, independent_factors: tuple[str, str] = (CONDITION_CONFUSING, CONDITION_CLEAN, )) -> pd.DataFrame:
    '''Test every measure for a significant difference between the two independent factors.

    The data is first aggregated to subject-wise means (and standard deviations) per factor and
    per value combination of the additional groupers. For every such group and every measure the
    normality is checked (paired: on the pairwise differences, independent: on both samples) and
    the parametric or non-parametric test registered in TEST for the test kind is performed against
    the given alternative hypothesis. If the alternative hypothesis is not accepted (null kept),
    the reversed hypothesis is tested as well and stored as an additional row.

    Arguments:
    * analysis_data: contains the measured data with PARTICIPANT, the independent variable, measures and additional groupers
    * alternative_hypotheses_per_measure: contains the primary alternative hypothesis per measure
    * additional_groupers: statistical analysis for each value combination of these columns (never mutated here)
    * stest_kind: TEST_KIND_PAIRED or TEST_KIND_INDEPENDENT; a group without pairwise data falls back to
      the independent test for that group only
    * log: whether intermediate results are printed
    * independent_variable: column holding the two factors that are compared
    * independent_factors: exactly the two values present in the independent variable column

    returns: a dataframe containing one row per performed statistical test
    raises: AssertionError on invalid input, Exception on an unknown test kind
    '''
    assert independent_variable in analysis_data.columns, \
        f'There must be a marker for the independent variable in the analysis_data: {independent_variable}'
    assert len(independent_factors) == 2, \
        f'There can only be 2 independent factors, given {independent_factors}'
    assert set(independent_factors) == set(analysis_data[independent_variable].unique()), (
        f'The independent factors must only and all be present in the independent variable. '
        f'Factors: {independent_factors}, '
        f'values in independent variable column: {analysis_data[independent_variable].unique()}')
    assert PARTICIPANT in analysis_data.columns, f'{PARTICIPANT} must be in the analysis_data'

    subjectwise_averages = calculate_subjectwise_averages(
        analysis_data, alternative_hypotheses_per_measure.keys(), additional_groupers+[independent_variable])

    results = pd.DataFrame([], columns=[*additional_groupers, 'independent variable', MEASURE,
                                        f'normality {independent_factors[0]}',
                                        f'normality {independent_factors[0]} {PVALUE}',
                                        f'normality {independent_factors[1]}',
                                        f'normality {independent_factors[1]} {PVALUE}',
                                        'test', ALTERNATIVE_HYPOTHESIS, PVALUE, 'alpha', 'decision',
                                        'effect size', 'effect strength'])
    # negative offset of the decision column; it addresses the decision inside the
    # list returned by perform_single_test, which fills the trailing result columns
    decision_position = list(results.columns).index(
        'decision')-results.shape[1]

    test_settings = subjectwise_averages[additional_groupers].drop_duplicates(
        keep='first')
    # default is 1 group: without groupers the frame has no columns, a single
    # dummy row makes the loop below run exactly once on all data
    if test_settings.empty:
        test_settings.index = [1]

    # test per group
    for _, setting in test_settings.iterrows():
        setting_data = subjectwise_averages
        if log:
            print(setting.to_dict())
        for grouper, setting_value in setting.items():
            setting_data = setting_data[setting_data[grouper] == setting_value]

        # get data, ordered by participant so that paired samples line up
        factor0_data = setting_data[setting_data[independent_variable]
                                    == independent_factors[0]].sort_values(PARTICIPANT)
        factor1_data = setting_data[setting_data[independent_variable]
                                    == independent_factors[1]].sort_values(PARTICIPANT)

        # check test kind paired or independent; the fallback is decided per group so that
        # one unpaired group does not silently downgrade the test of all following groups
        group_test_kind = stest_kind
        if group_test_kind == TEST_KIND_PAIRED:
            has_pairs = factor0_data[PARTICIPANT].reset_index(drop=True).equals(
                factor1_data[PARTICIPANT].reset_index(drop=True))
            if not has_pairs:
                print(
                    'For paired test there must be pairwise data. Selecting independent test instead.')
                group_test_kind = TEST_KIND_INDEPENDENT

        # per measure on data group
        for measure in alternative_hypotheses_per_measure:
            alternative_hypothesis = alternative_hypotheses_per_measure[measure]

            # test for normality
            if group_test_kind == TEST_KIND_PAIRED:
                diff_text = f'diff_{independent_factors[0]}-{independent_factors[1]}'
                diff_normal, diff_normal_p = is_normally_distributed(
                    factor0_data[measure].reset_index(drop=True)-factor1_data[measure].reset_index(drop=True), f'{measure} {diff_text}', log)
                # the first factor's normality columns carry the result for the differences,
                # the second factor's columns carry descriptive labels instead
                normality_data = [measure, diff_normal,
                                  diff_normal_p, diff_text, f'{diff_text} pvalue']
                is_normal = diff_normal
            elif group_test_kind == TEST_KIND_INDEPENDENT:
                factor0_normal, factor0_normal_p = is_normally_distributed(
                    factor0_data[measure], f'{measure} {independent_factors[0]}', log)
                factor1_normal, factor1_normal_p = is_normally_distributed(
                    factor1_data[measure], f'{measure} {independent_factors[1]}', log)
                normality_data = [measure, factor0_normal,
                                  factor0_normal_p, factor1_normal, factor1_normal_p]
                is_normal = factor1_normal and factor0_normal
            else:
                raise Exception(
                    f'statistical test kind {group_test_kind} not known')

            # decide for parametric or nonparametric test based on normality
            if is_normal:
                test = TEST[TEST_KIND_PARAMETRIC][group_test_kind]
            else:
                test = TEST[TEST_KIND_NON_PARAMETRIC][group_test_kind]

            # perform test in given direction
            test_data = perform_single_test(
                test, factor0_data[measure], factor1_data[measure], measure, alternative_hypothesis, log)
            results.loc[results.shape[0]] = setting.values.tolist() + [{independent_variable: independent_factors}] +\
                normality_data+test_data

            if test_data[decision_position] == STATISTIC_INFER_H1_NON_ACCEPTED:
                # if not accepted, test out reverse hypothesis
                alternative_hypothesis = get_reversed_alternative_hypothesis(
                    alternative_hypothesis)
                test_data = perform_single_test(
                    test, factor0_data[measure], factor1_data[measure], measure, alternative_hypothesis, log)
                results.loc[results.shape[0]] = setting.values.tolist() + [{independent_variable: independent_factors}] +\
                    normality_data+test_data
    return results
def calculate_subjectwise_averages(data: pd.DataFrame, categories: list[str], additional_groupers: list = []) -> pd.DataFrame:
    '''Aggregate the data to the mean and standard deviation per subject and condition.

    Arguments:
    * data: measured data containing PARTICIPANT, the categories and either CONDITION or SNIPPET_COLUMN_VARIANT
    * categories: the measure columns to aggregate (any iterable of column names)
    * additional_groupers: further columns to group by (never mutated here)

    returns: a dataframe with the grouping columns, one column named like each category holding
    its mean and one column named '<category> <STD>' holding its standard deviation
    '''
    # materialise once: the categories are iterated twice and may be a one-shot iterable
    categories = list(categories)
    condition_column = CONDITION if CONDITION in data.columns else SNIPPET_COLUMN_VARIANT
    # de-duplicate while keeping the insertion order; a plain set would make the order of the
    # grouping columns (and thus of the output columns and rows) vary between interpreter runs
    group_columns = list(dict.fromkeys(
        [condition_column, PARTICIPANT, *additional_groupers]))

    # subject-condition-wise mean and the standard deviation
    subjectwise_averages = data.groupby(group_columns).agg(
        {category: [MEAN, STD] for category in categories})
    # flatten the (category, aggregate) column pairs; the mean keeps the plain category name
    subjectwise_averages.columns = [
        (category if aggregate == MEAN else f'{category} {aggregate}')
        for category in categories for aggregate in [MEAN, STD]]
    return subjectwise_averages.reset_index()
def is_normally_distributed(data: pd.Series, description: str, log: bool = True) -> tuple[bool, float]:
    '''Judge with the Shapiro-Wilk test whether the data might be normally distributed.

    Arguments:
    * data: the sample to test
    * description: label of the sample used in the log output
    * log: whether the verdict is printed

    returns: (verdict, p-value); the verdict is False when the p-value is below ALPHA.
    When the test cannot judge the sample (it raises a ValueError or yields an undefined
    NaN p-value, e.g. for too few samples) the result is (False, 1).
    '''
    try:
        result = shapiro(data)
    except ValueError:
        result = None
    # a NaN p-value compares False against ALPHA and would otherwise be
    # reported as "might be normally distributed"
    if result is None or np.isnan(result.pvalue):
        if log:
            print(
                f'\t{description} there are too few samples to judge the distribution')
        return False, 1
    if result.pvalue < ALPHA:
        if log:
            print(
                f'\t{description} is not normally distributed due to {round(result.pvalue, 3)} less than alpha {ALPHA}; Result {result}')
        return False, result.pvalue
    if log:
        print(
            f'\t{description} might be normally distributed due to {round(result.pvalue, 3)} greater equals to alpha {ALPHA}; Result {result}')
    return True, result.pvalue
def perform_single_test(statistical_test: str, confusing_data: pd.Series, clean_data: pd.Series, measure: str, alternative_hypothesis: str, log: bool = True) -> list:
    '''Run one statistical test between two samples and infer the decision on H1.

    Arguments:
    * statistical_test: key of the test in TEST_FUNCTION (wilcoxon, mann-whitney-u or one of the parametric tests)
    * confusing_data: first sample
    * clean_data: second sample
    * measure: name of the measure, used for logging only
    * alternative_hypothesis: direction of the alternative hypothesis handed to the test
    * log: whether the outcome is printed

    returns: [test, alternative hypothesis text, p-value, alpha, decision, effect size, effect strength];
    the effect size is only calculated when H1 is accepted, otherwise it is 0 with strength 'None'
    raises: ValueError if the statistical test is not known
    '''
    if statistical_test == TEST_WILCOXON:
        parameters = {'x': confusing_data,
                      'y': clean_data,
                      'alternative': alternative_hypothesis,
                      'correction': True,
                      }
    elif statistical_test == TEST_MANNWHITNEYU:
        parameters = {'x': confusing_data,
                      'y': clean_data,
                      'alternative': alternative_hypothesis,
                      'use_continuity': True
                      }
    elif statistical_test in TEST[TEST_KIND_PARAMETRIC].values():
        parameters = {'a': confusing_data,
                      'b': clean_data,
                      'alternative': alternative_hypothesis,
                      'nan_policy': 'raise'
                      }
    else:
        # fail with a clear message instead of an unbound `parameters` below
        raise ValueError(
            f'statistical test {statistical_test} not known')
    result = TEST_FUNCTION[statistical_test](**parameters)

    alternative_hypothesis_text = f'confusing {alternative_hypothesis} than clean'
    decision = (STATISTIC_INFER_H1_ACCEPTED if result.pvalue <
                ALPHA else STATISTIC_INFER_H1_NON_ACCEPTED)
    if log:
        print(
            f'\tFor {measure}, the {statistical_test}\'s stance on the H1 `{alternative_hypothesis_text}` indicates with alpha={ALPHA} and the p-value of {round(result.pvalue, 3)}: {decision}; Result {result}')
    final = [statistical_test,
             alternative_hypothesis_text, result.pvalue, ALPHA, decision]

    # the effect size is only meaningful for an accepted alternative hypothesis
    if decision == STATISTIC_INFER_H1_ACCEPTED:
        effect_size, effect_strength = calculate_effect_size(
            statistical_test, confusing_data, clean_data)
        if log:
            print(f'\t\tEffect size {effect_size} {effect_strength}')
    else:
        effect_size, effect_strength = 0, 'None'
    return final+[effect_size, effect_strength]
def calculate_effect_size(statistical_test: str, confusing_data: pd.Series, clean_data: pd.Series):
    '''Calculate the effect size between two samples and classify its strength.

    Parametric tests use cohen's d (thresholds 0.2 / 0.5 / 0.8), non-parametric tests use
    cliff's delta (thresholds 0.147 / 0.33 / 0.474).

    Arguments:
    * statistical_test: the test that was performed, selects the effect size measure
    * confusing_data: first sample
    * clean_data: second sample

    returns: (effect size, effect strength)
    raises: ValueError if no effect size is defined for the statistical test
    '''
    if statistical_test in TEST[TEST_KIND_PARAMETRIC].values():
        # calculate effect size using cohen's d
        clean_mean, clean_var = clean_data.agg([MEAN, 'var'])
        confusing_mean, confusing_var = confusing_data.agg([MEAN, 'var'])
        if statistical_test == TEST_T_PAIRED:
            # d=(m1−m2)/√((Var1+Var2)/2)
            pooled_std = sqrt((confusing_var+clean_var)/2)
        elif statistical_test == TEST_T_INDEPENDENT:
            # d=(m1−m2)/√(((n1-1)*Var1+(n2-1)*Var2)/(n1+n2-2))
            clean_len_corr = clean_data.shape[0]-1
            confusing_len_corr = confusing_data.shape[0]-1
            pooled_std = sqrt((confusing_len_corr*confusing_var +
                               clean_len_corr*clean_var)/(confusing_len_corr+clean_len_corr))
        else:
            raise ValueError(
                f'no effect size defined for parametric test {statistical_test}')
        effect_size = (confusing_mean-clean_mean)/pooled_std
        return effect_size, _classify_effect_strength(effect_size, 0.2, 0.5, 0.8)
    if statistical_test in TEST[TEST_KIND_NON_PARAMETRIC].values():
        # calculate effect size using cliff's delta; the strength is classified here
        # rather than taken from the library's own label
        effect_size, _ = cliffs_delta(confusing_data, clean_data)
        return effect_size, _classify_effect_strength(effect_size, 0.147, 0.33, 0.474)
    raise ValueError(
        f'no effect size defined for statistical test {statistical_test}')


def _classify_effect_strength(effect_size: float, small_from: float, medium_from: float, large_from: float) -> str:
    '''Map the absolute effect size onto negligible / small / medium / large via the lower bounds.'''
    magnitude = abs(effect_size)
    if magnitude < small_from:
        return EFFECT_STRENGTH_NEGLIGIBLE
    if magnitude < medium_from:
        return EFFECT_STRENGTH_SMALL
    if magnitude < large_from:
        return EFFECT_STRENGTH_MEDIUM
    return EFFECT_STRENGTH_LARGE
def get_reversed_alternative_hypothesis(alternative_hypothesis: str):
    '''Return the opposite direction of a one-sided alternative hypothesis.

    returns: ALTERNATIVE_HYPOTHESIS_LESS for ALTERNATIVE_HYPOTHESIS_GREATER and vice versa
    raises: Exception for any other value
    '''
    opposites = (
        (ALTERNATIVE_HYPOTHESIS_GREATER, ALTERNATIVE_HYPOTHESIS_LESS),
        (ALTERNATIVE_HYPOTHESIS_LESS, ALTERNATIVE_HYPOTHESIS_GREATER),
    )
    for direction, reversed_direction in opposites:
        if alternative_hypothesis == direction:
            return reversed_direction
    raise Exception()
def test_significance_average_window(subjectwise_averages: dict[str, dict[str, mne.Evoked]], electrode: str, tmin: float, tmax: float, log: bool = True):
    '''Test the mean amplitude of one electrode within a time window between the conditions.

    For every participant and condition the evoked data of the electrode is averaged over the
    window [tmin, tmax] in microvolt. The resulting table is handed to the paired subject-wise
    analysis with the alternative hypothesis ALTERNATIVE_HYPOTHESIS_GREATER.

    Arguments:
    * subjectwise_averages: participant -> condition -> evoked average
    * electrode: the channel to analyse
    * tmin, tmax: bounds of the time window, tmin must be smaller than tmax
    * log: whether intermediate results are printed

    returns: the results dataframe extended by the columns CHANNEL, 'tmin' and 'tmax'
    '''
    assert tmin < tmax, f'tmin {tmin} must be smaller than tmax {tmax}'

    window_means = pd.DataFrame(
        [], index=[], columns=[PARTICIPANT, CONDITION, EEG_MEAN_AMPLITUDE])
    for participant, condition_averages in subjectwise_averages.items():
        for condition, evoked in condition_averages.items():
            # first (and only) picked channel: one value per time point of the window
            window_signal = evoked.get_data(
                electrode, tmin=tmin, tmax=tmax, units='uV')[0]
            # collapse the time dimension to the mean amplitude in microvolt
            window_means.loc[len(window_means)] = [
                participant, condition, np.mean(window_signal)]

    hypotheses = {EEG_MEAN_AMPLITUDE: ALTERNATIVE_HYPOTHESIS_GREATER}
    results = statistical_analysis_for_subjectwise_averages(
        window_means, hypotheses, [], TEST_KIND_PAIRED, log)

    # annotate the results with the analysed window
    for column, value in ((CHANNEL, electrode), ('tmin', tmin), ('tmax', tmax)):
        results[column] = value
    return results
def perform_cluster_permutation_test_diff(erp_frp: bool | str, snippet_group: str, correct_data_only: bool, epoch_interval: tuple[float, float], participants: list[str], subinterval: tuple[float, float] = (0, None), downsample_to: int = None, ignore_channels: list[str] = [], each_channel: bool = True, channel_combo: bool = True) -> None:
    '''Run one-sample cluster permutation tests on the subject-wise difference averages.

    The subject-wise difference averages are loaded, optionally resampled and cut to the
    subinterval. For every channel to iterate, an array subjects x times (x electrodes) is built
    and tested once per entry of PERMUTATION_TEST_PARAMETERS[TEST_KIND_PAIRED]. All results are
    written to one JSON file buffer; the results containing positive clusters (as reported by
    perform_permutation_cluster_1samp_test) are written to a second buffer and, in addition,
    plotted and stored as png and csv files.

    Arguments:
    * erp_frp: True or one of FIXATION_SELECTION_ALGORITHMS; forwarded to the loading and path helpers
    * snippet_group: forwarded to the loading and path helpers and stored in the results
    * correct_data_only: forwarded to the loading and description helpers and stored in the results
    * epoch_interval: (start, end) of the epochs, start must be smaller than end
    * participants: forwarded to the loading helper and stored in the results
    * subinterval: (start, end) of the tested time range; None or values outside of the epoch
      interval are replaced by the epoch bounds. The default is labelled 'allalong' in file names
    * downsample_to: None or a frequency greater than 0 and below EEG_FREQUENCY to resample to
    * ignore_channels: channels of EEG_CHANNELS that are excluded (never mutated here)
    * each_channel: whether every remaining channel is tested on its own
    * channel_combo: whether all remaining channels are tested together

    returns: nothing, the results are written to files
    '''
    assert (erp_frp is True or erp_frp in FIXATION_SELECTION_ALGORITHMS), erp_frp
    assert (downsample_to is None or (downsample_to > 0 and downsample_to < EEG_FREQUENCY)
            ), f'downsample_to must be either None or an integer between 0 and {EEG_FREQUENCY}'
    assert (all(channel in EEG_CHANNELS for channel in ignore_channels)
            ), f'all channels to ignore {ignore_channels} must be found in {EEG_CHANNELS}'
    description = get_erp_description(
        erp_frp, correct_data_only, epoch_interval)
    print(description)

    # set subinterval values
    # the description is derived from the subinterval as given, before it is clipped below
    if subinterval == (0, None):
        interval_description = 'allalong'
    else:
        interval_description = f'''{str(subinterval[0]).replace(
            ".", "")}-{str(subinterval[1]).replace(".", "")}'''
    # clip the subinterval to the epoch interval and replace open (None) bounds
    if subinterval[0] is None or epoch_interval[0] > subinterval[0]:
        subinterval = epoch_interval[0], subinterval[1]
    if subinterval[1] is None or epoch_interval[1] < subinterval[1]:
        subinterval = subinterval[0], epoch_interval[1]
    assert (epoch_interval[0] < epoch_interval[1])
    assert (subinterval[0] < subinterval[1])

    # load subjectwise diff data
    subjectwise_diff: dict[str, dict[str, mne.Evoked]] = load_all_erp_averages(erp_frp, snippet_group, correct_data_only, epoch_interval,
                                                                                 subjectwise=True, participants=participants, grand=False, diff=True, conditional=False)

    # downsample if required
    # the first subject serves as example for the sampling frequency and channel info
    example = list(subjectwise_diff.values())[0][CONDITION_DIFF]
    example_info = example.info
    assert example_info[MNE_KEY_FREQUENCY] == EEG_FREQUENCY, example_info[MNE_KEY_FREQUENCY]
    if not downsample_to is None:
        for subject, diff in subjectwise_diff.items():
            subjectwise_diff[subject][CONDITION_DIFF] = diff[CONDITION_DIFF].resample(
                downsample_to)
    else:
        # from here on downsample_to holds the frequency that is actually used
        downsample_to = EEG_FREQUENCY
    # fetch the example again so that times and info are read after the resampling step
    example = list(subjectwise_diff.values())[0][CONDITION_DIFF]
    example_info = example.info
    # times = np.array([round_time_EEG(time) for time in times])
    # keep the time points whose rounded value lies within the subinterval (bounds included)
    times = np.asarray([time for time in example.times if subinterval[0]
                        <= round_time_EEG(time) <= subinterval[1]])
    # times = np.asarray(times[np.where(times <= subinterval[1])])
    # subinterval=subinterval[0]-sys.float_info.min, subinterval[1]+sys.float_info.min

    # temp write file
    # both buffers get the same path arguments except for the trailing flag of the path helper
    all_file_buffer = JSONFileBuffers(get_erp_statistics_path(
        erp_frp, snippet_group, description, f'permutation swise diff {interval_description} {downsample_to}Hz', 'json', False), True)
    positive_file_buffer = JSONFileBuffers(get_erp_statistics_path(
        erp_frp, snippet_group, description, f'permutation swise diff {interval_description} {downsample_to}Hz', 'json', True), True)
    all_file_buffer.start()
    positive_file_buffer.start()

    # test parameters
    # channel-> test-> [cluster_count, positive_cluster_count, positive_clusters -> (positive_p_value, positive_cluster_range/array)]
    channels_to_iterate = []
    possible_channels = [
        channel for channel in EEG_CHANNELS if channel not in ignore_channels]
    if each_channel:
        channels_to_iterate += possible_channels
    if channel_combo:
        # the combination is added as one list entry, which marks the multi-channel case below
        channels_to_iterate += [possible_channels]

    for channel in tqdm(channels_to_iterate):
        if isinstance(channel, list):
            channel_description = 'multiple channels' if ignore_channels else 'all channels'
        else:
            channel_description = channel
        print(channel_description)

        # create ndarray with dimensions (subjects x times x electrodes), pairwise via diff values
        subjectwise_diff_data = []
        for participant in sorted(subjectwise_diff.keys()):
            evoked = subjectwise_diff[participant][CONDITION_DIFF]
            # recomputed per participant; the times of the last participant are used afterwards
            times = np.asarray([time for time in evoked.times if subinterval[0]
                                <= round_time_EEG(time) <= subinterval[1]])
            data = evoked.get_data(channel,units='uV')
            # cut the columns from the first to the last selected time point (inclusive)
            new_data = data[:, list(evoked.times).index(
                times[0]):list(evoked.times).index(times[-1])+1]
            if isinstance(channel, list):
                # multiple channels: times x electrodes
                data = np.transpose(new_data)
            else:
                # single channel: times only
                data = new_data[0]
            # print(data)
            subjectwise_diff_data.append(data)
        subjectwise_diff_data = np.asarray(subjectwise_diff_data)
        print('\tsubjects x times (x electrodes)', subjectwise_diff_data.shape)
        assert (subjectwise_diff_data.shape[1] == times.shape[0]
                ), f'({subjectwise_diff_data.shape[1]}, {times}) ({times.shape[0]}, {times})'

        # set up channel adjacency
        channel_adjacency, _ = get_channel_adjacency(
            channel, example_info)

        # perform permutation tests
        test_clusters = {}
        for test_side in PERMUTATION_TEST_PARAMETERS[TEST_KIND_PAIRED]:
            print('\tTest', test_side)
            # the incremental increase of the permutation count is disabled (see the commented
            # block below): the loop runs once with the maximum count and always breaks
            permutation_count = PERMUTATION_COUNT_MAX
            test_clusters[test_side] = []
            while True:
                print('\t\tPermutation count', permutation_count)
                T_obs, clusters, cluster_p_values, H0, positive_clusters = perform_permutation_cluster_1samp_test(
                    subjectwise_diff_data, channel_adjacency, **PERMUTATION_TEST_PARAMETERS[TEST_KIND_PAIRED][test_side], n_permutations=permutation_count,
                    max_step=calculate_max_step(downsample_to))
                results = {ERP_PERM_PARAMETER_CHANNEL_DESCRIPTION: channel_description, ERP_PARAMETER_SNIPPET_GROUP: snippet_group, ALTERNATIVE_HYPOTHESIS: test_side,
                           ERP_PERM_PARAMETER_CHANNELS: channel, ERP_PERM_PARAMETER_PERMUTATION_COUNT: H0.shape[0],
                           ERP_PERM_PARAMETER_CLUSTER_COUNT: len(clusters), ERP_PERM_PARAMETER_POSITIVE_CLUSTER_COUNT: len(positive_clusters), ERP_PERM_PARAMETER_POSITIVE_CLUSTERS: [],
                           ERP_PERM_PARAMETER_NEGATIVE_CLUSTER_COUNT: len(clusters)-len(positive_clusters), ERP_PERM_PARAMETER_NEGATIVE_CLUSTERS: [],
                           ERP_PARAMETER_ERP_FRP: erp_frp, ERP_PARAMETER_EPOCH_INTERVAL: list(epoch_interval), ERP_PERM_PARAMETER_SUBINTERVAL: list(subinterval),
                           ERP_PARAMETER_CORRECT_TRIALS_ONLY: correct_data_only, ERP_PARAMETER_PARTICIPANTS: participants,
                           ERP_PERM_PARAMETER_T_STATISTICS: T_obs, ERP_PERM_PARAMETER_H0: H0, ERP_PERM_PARAMETER_P_VALUES_CONFIDENT: False, ERP_PERM_PARAMETER_FREQUENCY: downsample_to}
                # smallest p value above alpha, only available if a non-positive cluster exists
                if results[ERP_PERM_PARAMETER_CLUSTER_COUNT] > results[ERP_PERM_PARAMETER_POSITIVE_CLUSTER_COUNT]:
                    results[ERP_PERM_PARAMETER_NEXT_P_VALUE] = cluster_p_values[np.where(
                        cluster_p_values > ALPHA)].min()
                else:
                    results[ERP_PERM_PARAMETER_NEXT_P_VALUE] = 'No cluster not accepted'

                # add per cluster p value and range / array
                for cluster_index, cluster in enumerate(clusters):
                    cluster_data = {
                        ERP_PERM_CLUSTER_PARAMETER_P_VALUE: cluster_p_values[cluster_index]}
                    if isinstance(channel, list):
                        # rows of (time index, channel index) for every entry of the cluster mask
                        cluster_range = np.argwhere(cluster)
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_TIMES] = [round_time_EEG(
                            times[time]) for time in np.unique(cluster_range[:, 0])]
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_CHANNELS] = [channel[c]
                                                                             for c in np.unique(cluster_range[:, 1])]
                        # (min time, min channel, max time, max channel) ...
                        cluster_range = * \
                            np.min(cluster_range, axis=0), * \
                            np.max(cluster_range, axis=0)
                        # ... regrouped to [min time, max time], [min channel, max channel]
                        cluster_range = [cluster_range[0], cluster_range[2]], [
                            cluster_range[1], cluster_range[3]]
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_TIME_RANGE] = [round_time_EEG(
                            times[cluster_range[0][0]]), round_time_EEG(times[cluster_range[0][1]])]
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_CHANNEL_RANGE] = [
                            channel[cluster_range[1][0]], channel[cluster_range[1][1]]]
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_CLUSTER] = cluster
                    else:
                        # single channel: the mask only has the time dimension
                        cluster_range = np.argwhere(cluster).reshape((-1,))
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_TIMES] = [round_time_EEG(
                            times[time]) for time in cluster_range]
                        cluster_range = [
                            np.min(cluster_range), np.max(cluster_range)]
                        cluster_data[ERP_PERM_CLUSTER_PARAMETER_TIME_RANGE] = [round_time_EEG(
                            times[cluster_range[0]]), round_time_EEG(times[cluster_range[1]])]
                    results[ERP_PERM_PARAMETER_POSITIVE_CLUSTERS if cluster_index in positive_clusters else ERP_PERM_PARAMETER_NEGATIVE_CLUSTERS].append(
                        cluster_data)
                test_clusters[test_side].append(results)

                # get the permutation count for which p is a confident result
                confident_permutation_counts = [
                    calculate_required_permutation_count(p) for p in cluster_p_values]
                results[ERP_PERM_PARAMETER_REQUIRED_PERMUTATION_COUNTS] = confident_permutation_counts
                if len(clusters) == 0:
                    results[ERP_PERM_PARAMETER_P_VALUES_CONFIDENT] = True
                    break
                # if exact test (perm count > shape of H0 portraying the actual permutation count)
                if permutation_count > H0.shape[0]:
                    print(
                        '\t\t\t', f'Stop after exact test results, as {permutation_count} exceeds possible {H0.shape[0]}')
                    results[ERP_PERM_PARAMETER_P_VALUES_CONFIDENT] = True
                    break
                # disabled: adaptive increase of the permutation count
                # # select the highest permutation count required for the p values
                # new_permutation_count = ceil(
                #     (max(confident_permutation_counts)+1)/100+1)*100
                # # if lower, then we can be confident and stop
                # if new_permutation_count <= permutation_count:
                #     print(
                #         '\t\t\t', f'Achieved satisfaction in permutation_count, as {permutation_count} exceeds or is equal to required {new_permutation_count}')
                #     results[ERP_PERM_PARAMETER_P_VALUES_CONFIDENT] = True
                #     break
                # # if too high, check the distance
                # elif new_permutation_count > PERMUTATION_COUNT_MAX:
                #     # if distance low, redo with max
                #     if abs(new_permutation_count-permutation_count) > PERMUTATION_COUNT_MAX/10:
                #         permutation_count = PERMUTATION_COUNT_MAX
                #         continue
                #     # otherwise stop
                #     else:
                #         print(
                #             '\t\t\t', f'No satisfaction in permutation_count achievable, as {new_permutation_count} exceeds maximum {PERMUTATION_COUNT_MAX}')
                #         break
                # permutation_count = new_permutation_count
                break
        all_file_buffer.data_chunk(
            test_clusters, channel_description, JSONTypesEncoder)

        # for all positive cases, take the last entry only if that one is positive
        positive_test_clusters = {test_side: test_results[-1] for test_side, test_results in test_clusters.items(
        ) if test_results[-1][ERP_PERM_PARAMETER_POSITIVE_CLUSTER_COUNT] > 0}
        positive_file_buffer.data_chunk(
            positive_test_clusters, channel_description, JSONTypesEncoder)

        # plot results
        if positive_test_clusters:
            # the plotted data is the mean across subjects
            fig, fig_dataframes = plot_permutation_clusters(positive_test_clusters, [round_time_EEG(time) for time in times], channel, {
                CONDITION_DIFF: subjectwise_diff_data.mean(axis=0)})
            fig_path = get_erp_statistics_path(
                erp_frp, snippet_group, description,
                f'permutation swise diff {interval_description} {downsample_to}Hz', 'png', True)
            # fig_path = fig_path.with_stem(fig_path.stem+'_'+channel_description)

            # create and store contours
            contours_only_image = add_store_contours(fig)
            # plot
            # the contour image is drawn onto the third axis, if the figure has one
            if len(fig.get_axes())>2:
                fig.get_axes()[2].imshow(contours_only_image)
            plt.show()

            # save all plots
            fig.savefig(fig_path.with_stem(
                fig_path.stem+'_special_contours'))
            plt.close(fig)
            matplotlib.image.imsave(fig_path.with_stem(
                fig_path.stem+'_only_contours'), contours_only_image.copy(order='C'))
            fig_data_path = get_erp_statistics_path(
                erp_frp, snippet_group, description,
                f'permutation swise diff {interval_description} {downsample_to}Hz', 'csv', True)
            # the first dataframe is stored as amplitudes, every further one as a cluster mask;
            # NOTE(review): the file stems are fixed, so every plotted channel entry writes to
            # the same csv paths - confirm that overwriting is intended
            fig_dataframes[0].to_csv(fig_data_path.with_stem('Data Figure2c amplitudes'), sep=SEPARATOR)
            for i, dataframe in enumerate(fig_dataframes[1:]):
                dataframe.to_csv(fig_data_path.with_stem(f'Data Figure2c cluster{i} mask'), sep=SEPARATOR)

    # finish file buffer
    all_file_buffer.end()
    positive_file_buffer.end()
def add_store_contours(fig:plt.figure):
    '''Create an image that only contains the outlines of the black areas of the cluster plot.

    The colour data of the second image on the second axis of the figure is scaled up to
    PERMUTATION_PLOT_PIXEL_Y x PERMUTATION_PLOT_PIXEL_X pixels per entry. The contours around all
    purely black pixels are drawn in black onto a white image.

    Arguments:
    * fig: figure whose second axis holds at least two images

    returns: the vertically flipped RGBA contour image; the alpha channel is 255 minus the
    red channel, so white pixels are transparent and the black contours are opaque
    '''
    # Extract the color data for all entries from the 2nd axis
    second_axis = fig.get_axes()[1]
    entry_colors = second_axis.images[1].make_image(
        renderer=None, unsampled=True)[0]

    # transform the color data into pixels of image resolution
    pixel_block = np.ones(
        (PERMUTATION_PLOT_PIXEL_Y, PERMUTATION_PLOT_PIXEL_X, 1)).astype(np.uint8)
    rgba_pixels = np.kron(entry_colors, pixel_block).astype(np.uint8)
    print(rgba_pixels.shape)

    # Convert the image to a format suitable for OpenCV
    rgb_pixels = cv2.cvtColor(rgba_pixels, cv2.COLOR_RGBA2RGB)
    black = (0, 0, 0)
    black_mask = cv2.inRange(rgb_pixels, black, black)

    # Find contours in the mask, including small holes
    found_contours, _ = cv2.findContours(
        black_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # white canvas to draw the contours on
    outline = np.full_like(rgb_pixels, 255)
    cv2.drawContours(outline, found_contours, -1, black, 2)

    # correct contours at the borders: overwrite the outermost two pixel columns and rows
    # with their inner neighbours (columns first, then rows)
    outline[:, :2] = outline[:, 2:4]
    outline[:, -2:] = outline[:, -4:-2]
    outline[:2, :] = outline[2:4, :]
    outline[-2:, :] = outline[-4:-2, :]

    # add alpha channel
    alpha = 255-outline[:, :, :1]
    outline = np.concatenate((outline, alpha), axis=2)

    # flip for correct orientation
    return np.flipud(outline)
def get_channel_adjacency(channels, info):
    '''Set up the sparse channel adjacency matrix for a single channel or a list of channels.

    Arguments:
    * channels: a single channel or a list of channels
    * info: the measurement info handed to mne.channels.find_ch_adjacency (only used for a list)

    returns: (adjacency, channels as list). A single channel yields a 1x1 matrix. A list that
    differs from the channels reported by mne yields the adjacency restricted to the given channels.
    NOTE(review): the restricted matrix is built from a boolean mask and therefore follows the
    channel order reported by mne - presumably `channels` is given in that same order; confirm.
    '''
    if not isinstance(channels, list):
        # a single channel is only adjacent to itself
        return scipy.sparse.csr_matrix(np.ones((1, 1))), [channels]

    channel_adjacency, all_channels = mne.channels.find_ch_adjacency(
        info, 'eeg')
    if channels != all_channels:
        channel_indices = [all_channels.index(
            channel) for channel in channels]
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.x
        keep_col = np.isin(
            np.arange(channel_adjacency.shape[1]), channel_indices, assume_unique=True)
        # an entry is kept if both its row and its column belong to a requested channel
        keep = np.logical_and(keep_col[:, np.newaxis], keep_col[np.newaxis, :])
        channel_adjacency = scipy.sparse.csr_matrix(
            channel_adjacency[np.where(keep)].reshape((len(channels), len(channels))))
    return channel_adjacency, channels
def calculate_max_step(eeg_frequency: int = EEG_FREQUENCY):
    '''Derive the max_step of the permutation test from the sampling frequency.

    returns: PERMUTATION_MAX_STEP_FREQUENCY_FACTOR times the frequency, truncated to an integer,
    but at least 1 so that adjacent time points stay connected
    '''
    scaled_step = int(PERMUTATION_MAX_STEP_FREQUENCY_FACTOR*eeg_frequency)
    return scaled_step if scaled_step > 1 else 1
def perform_permutation_cluster_1samp_test(data, adjacency, tail, n_permutations=PERMUTATION_COUNT, max_step=int(PERMUTATION_MAX_STEP_FREQUENCY_FACTOR*EEG_FREQUENCY)):
    '''Run mne's one-sample cluster permutation test and determine the positive clusters.

    Warnings raised during the test are suppressed. The warning filters that were active before
    the call are restored afterwards instead of being replaced process-wide.

    Arguments:
    * data: the observations handed to mne.stats.permutation_cluster_1samp_test
    * adjacency: the adjacency handed to the test
    * tail: the tail of the test
    * n_permutations: number of permutations
    * max_step: max_step handed to the test

    returns: (T_obs, clusters, cluster_p_values, H0, positive_clusters), where the clusters are
    masks and positive_clusters holds the indices of the clusters with a p value of at most ALPHA
    '''
    import warnings
    # scope the suppression to the test itself; catch_warnings restores the previous filters
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        T_obs, clusters, cluster_p_values, H0 = mne.stats.permutation_cluster_1samp_test(data, n_permutations=n_permutations, threshold=None,
                                                                                         tail=tail, adjacency=adjacency, out_type='mask', verbose=True, max_step=max_step)
    positive_clusters = analyze_clusters(clusters, cluster_p_values)
    return T_obs, clusters, cluster_p_values, H0, positive_clusters
def perform_permutation_cluster_test(data, adjacency, tail, n_permutations=PERMUTATION_COUNT, max_step=int(PERMUTATION_MAX_STEP_FREQUENCY_FACTOR*EEG_FREQUENCY)):
    '''Run mne's cluster permutation test and determine the positive clusters.

    Arguments:
    * data: the observations handed to mne.stats.permutation_cluster_test
    * adjacency: the adjacency handed to the test
    * tail: the tail of the test
    * n_permutations: number of permutations
    * max_step: max_step handed to the test

    returns: (T_obs, clusters, cluster_p_values, H0, positive_clusters), where the clusters are
    masks and positive_clusters is the result of analyze_clusters
    '''
    test_options = {
        'n_permutations': n_permutations,
        'threshold': None,
        'tail': tail,
        'adjacency': adjacency,
        'out_type': 'mask',
        'verbose': True,
        'max_step': max_step,
    }
    statistics, found_clusters, p_values, null_distribution = mne.stats.permutation_cluster_test(
        data, **test_options)
    significant = analyze_clusters(found_clusters, p_values)
    return statistics, found_clusters, p_values, null_distribution, significant
def analyze_clusters(clusters, cluster_p_values):
    '''Determine the indices of the clusters whose p value is at most ALPHA.

    Arguments:
    * clusters: the found clusters, only their count is inspected
    * cluster_p_values: array with one p value per cluster

    returns: an empty list if there are no clusters, otherwise an array of cluster indices;
    the p values and the selected indices are printed
    '''
    if not len(clusters):
        return []
    log_indent = '\t\t\t'
    print(log_indent, 'P values', cluster_p_values)
    significant_indices = np.nonzero(cluster_p_values <= ALPHA)[0]
    print(log_indent, 'positive clusters', significant_indices)
    return significant_indices
def calculate_required_permutation_count(p_value: float, confidence=PERMUTATION_CONFIDENCE_INTERVAL):
    '''Calculate the permutation count for which the p value is a confident result.

    Computes p*(1-p) / ((ALPHA-p)/z)^2 with z = norm.ppf(confidence).

    Arguments:
    * p_value: the observed p value
    * confidence: the probability whose normal quantile is used as z

    returns: the required permutation count as float
    '''
    z_quantile = norm.ppf(confidence)
    # a p value (almost) equal to alpha would divide by zero, so move it slightly below alpha
    if abs(p_value-ALPHA) < 1e-8:
        p_value = ALPHA-1e-8
    spread = p_value*(1-p_value)
    standardized_distance = (ALPHA-p_value)/z_quantile
    return spread/(standardized_distance**2)
def plot_permutation_clusters(test_clusters: dict[str, dict[str, Any]], times: np.ndarray, channels: list[str], channel_data: dict[str, np.ndarray] = None):
    """Plot the significant clusters of a permutation test and collect the plotted data.

    Nothing is drawn (and ``None`` is returned) when no entry of
    ``test_clusters`` reports a positive cluster count.

    Args:
        test_clusters: result dict per test side. The entries read here are
            ``ERP_PERM_PARAMETER_POSITIVE_CLUSTER_COUNT``,
            ``ERP_PERM_PARAMETER_CHANNELS``,
            ``ERP_PERM_PARAMETER_POSITIVE_CLUSTERS`` and (single-channel case)
            ``ERP_PERM_PARAMETER_FREQUENCY``.
        times: time points used as x-axis / DataFrame index.
        channels: channel names used as heatmap rows / DataFrame columns.
        channel_data: arrays per condition; must contain ``CONDITION_DIFF``
            whenever there is something to plot.

    Returns:
        ``None`` if there is no positive cluster, otherwise a tuple
        ``(fig, frames)`` where ``frames`` is a list starting with the plotted
        data followed by one pandas object per positive cluster.
    """
    if not any(test_results[ERP_PERM_PARAMETER_POSITIVE_CLUSTER_COUNT] > 0
               for test_results in test_clusters.values()):
        return None
    any_hypothesis = next(iter(test_clusters))
    tested_channels = test_clusters[any_hypothesis][ERP_PERM_PARAMETER_CHANNELS]
    all_plot = channel_data[CONDITION_DIFF]
    if isinstance(tested_channels, list):
        return _plot_multi_channel_clusters(test_clusters, times, channels, all_plot)
    return _plot_single_channel_clusters(
        test_clusters, times, channel_data, all_plot, tested_channels)


def _plot_multi_channel_clusters(test_clusters, times, channels, all_plot):
    """Draw the heatmaps for a test over several channels and return ``(fig, frames)``."""
    cluster_masks = [
        positive_cluster[ERP_PERM_CLUSTER_PARAMETER_CLUSTER]
        for test_results in test_clusters.values()
        for positive_cluster in test_results[ERP_PERM_PARAMETER_POSITIVE_CLUSTERS]]
    # Union of all significant masks over every test side. Reducing over the
    # collected masks (instead of seeding from the first test side's first
    # cluster) also works when only a later test side has positive clusters.
    all_positive_clusters = np.logical_or.reduce(cluster_masks)
    cluster_plot = all_plot.copy()
    cluster_plot[~all_positive_clusters] = np.nan
    # NOTE(review): three axes are created but only the first two are drawn on.
    fig, ax = plt.subplots(3, 1, figsize=(12, 12), layout='constrained')
    max_value_plot = abs(all_plot.T).max(axis=None)
    plot_2d_cluster_heatmap(
        fig, ax[0], all_plot, cluster_plot, times, channels, max_value_plot)
    plot_2d_cluster_heatmap(
        fig, ax[1], all_plot, cluster_plot, times, channels, max_value_plot, color_diff=True)
    all_dataframe = pd.DataFrame(all_plot, index=times, columns=channels)
    cluster_dataframes = [pd.DataFrame(mask, index=times, columns=channels)
                          for mask in cluster_masks]
    return fig, [all_dataframe, *cluster_dataframes]


def _plot_single_channel_clusters(test_clusters, times, channel_data, all_plot, channel_name):
    """Draw the condition curves and cluster spans for one channel and return ``(fig, frames)``."""
    ms_times = [int(round(time*1000, 0)) for time in times]
    fig, (ax, ax2) = plt.subplots(2, 1, figsize=(8, 4))
    ax.set_title('Channel : ' + channel_name)
    for condition in channel_data:
        ax.plot(times, channel_data[condition],
                label=condition, c=CONDITION_COLORS[condition])
    ax.set_ylabel('EEG microV')
    ax.legend()
    legend_handles, legend_labels = [], []
    for test_side, test_results in test_clusters.items():
        # Track the span per test side so a side without any drawn span never
        # reuses (or misses) a handle from a previous iteration.
        side_handle = None
        for positive_cluster in test_results[ERP_PERM_PARAMETER_POSITIVE_CLUSTERS]:
            for time in positive_cluster[ERP_PERM_CLUSTER_PARAMETER_TIMES]:
                side_handle = ax2.axvspan(
                    time, time+1/test_results[ERP_PERM_PARAMETER_FREQUENCY],
                    color=PERMUTATION_TEST_COLORS[test_side], alpha=0.3)
        if side_handle is not None:
            legend_handles.append(side_handle)
            legend_labels.append(test_side)
    # Drawn on the lower axes explicitly (previously via the implicit current
    # axes of pyplot, which is the last subplot created).
    # NOTE(review): this curve uses millisecond x-values while the spans above
    # and the upper axes use the raw ``times`` values - confirm the intended unit.
    ax2.plot(ms_times, all_plot, 'k')
    ax2.legend(legend_handles, legend_labels)
    ax.set_xlabel('Time (ms)')
    ax2.set_ylabel('f-values')
    # pandas has no top-level ``from_dict``; build the frame directly.
    condition_frame = pd.DataFrame(channel_data, index=times)
    cluster_times = [
        pd.Series(positive_cluster[ERP_PERM_CLUSTER_PARAMETER_TIMES])
        for test_results in test_clusters.values()
        for positive_cluster in test_results[ERP_PERM_PARAMETER_POSITIVE_CLUSTERS]]
    return fig, [condition_frame, *cluster_times]
def plot_2d_cluster_heatmap(fig, ax, all_plot: np.ndarray, cluster_plot: np.ndarray,
                            times: np.ndarray, channels: list[str], max_value: float,
                            color_diff: bool = False, color_all: bool = True):
    """Draw a time-by-channel heatmap, optionally overlaying the cluster values.

    Args:
        fig: figure that receives the colorbars.
        ax: axes to draw on.
        all_plot: values for the background image; transposed and flipped
            before drawing.
        cluster_plot: values for the overlay image (drawn only when
            ``color_diff`` is set); transposed and flipped the same way.
        times: time points; one x tick per entry.
        channels: channel names; shown in reversed order as y tick labels.
        max_value: the colour scale of both images runs from ``-max_value``
            to ``max_value``.
        color_diff: draw the overlay with the ``'BrBG_r'`` colormap.
        color_all: draw the background with the ``'RdBu_r'`` colormap; when
            unset the background is rendered in uniform grey.

    Raises:
        AssertionError: if neither ``color_all`` nor ``color_diff`` is set.
    """
    assert color_all or color_diff, 'at least one aspect must be colored'
    electrode_labels = list(reversed(channels))

    def symmetric_norm():
        # every image gets its own norm instance spanning [-max_value, max_value]
        return plt.Normalize(vmin=-max_value, vmax=max_value)

    if color_all:
        background_cmap = 'RdBu_r'
    else:
        # grey background: keep the alpha channel, overwrite RGB everywhere
        grey_colors = plt.get_cmap('bwr')(np.linspace(0, 1, 256))
        grey_colors[:, :3] = 0.5
        background_cmap = LinearSegmentedColormap.from_list('bwr_dull', grey_colors)
    background = ax.imshow(np.flip(all_plot.T, axis=0), cmap=background_cmap,
                           aspect='auto', origin='lower', norm=symmetric_norm(),)

    if color_diff:
        overlay = ax.imshow(np.flip(cluster_plot.T, axis=0), cmap='BrBG_r', aspect='auto',
                            origin='lower', norm=symmetric_norm())

    if color_all:
        background_bar = fig.colorbar(
            background, ax=ax, label='amplitude differences (ambiguous - unambiguous)', )
        background_bar.ax.set_title('µV')
    if color_diff:
        overlay_bar = fig.colorbar(
            overlay, ax=ax, label='amplitude differences in significant cluster')
        overlay_bar.ax.set_title('µV')

    if len(times) <= 20:
        # few time points: label every tick
        tick_labels = [int(round_time_EEG(moment*1000)) for moment in times]
    else:
        # many time points: only label a tick once at least `tick_spacing`
        # has passed since the previously labelled one
        tick_spacing = 0.05
        last_labelled = times[0]
        tick_labels = [int(round(times[0]*1000, 0))]
        for moment in times[1:]:
            rounded = round_time_EEG(moment)
            if rounded >= round_time_EEG(last_labelled+tick_spacing):
                tick_labels.append(int(round(rounded*1000, 0)))
                last_labelled = rounded
            else:
                tick_labels.append('')
    ax.set_xticks(np.arange(0, len(times)), tick_labels)

    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Electrodes')
    ax.set_yticks(np.arange(0, len(electrode_labels)), electrode_labels)
def analyze_statistics_distribution(erp_frp: bool | str, snippet_group: str, description: str, fixation_analysis_data: pd.DataFrame, analysis_topic: str, analysis_column: str):
    """Test the per-participant means of ``analysis_column`` and store the result as CSV.

    The per-participant output of ``statistics_distribution`` is reduced to
    its ``'mean'`` column (renamed to ``analysis_column``), handed to
    ``statistical_analysis_for_subjectwise_averages`` as a paired test with the
    ``ALTERNATIVE_HYPOTHESIS_GREATER`` alternative, and the resulting table is
    written to the path given by ``get_erp_fixation_analysis_path``.

    Args:
        erp_frp, snippet_group, description: forwarded to
            ``statistics_distribution`` and used to build the output path.
        fixation_analysis_data: data forwarded to ``statistics_distribution``.
        analysis_topic: forwarded to ``statistics_distribution``; also the
            prefix of the output file name (``<analysis_topic>_stat_test``).
        analysis_column: name of the analysed column.

    Returns:
        The statistics table that was written to disk.
    """
    _, per_participant = statistics_distribution(
        erp_frp, snippet_group, description, fixation_analysis_data,
        analysis_topic, analysis_column)

    participant_means = per_participant[['mean']]
    participant_means.columns = [analysis_column]
    participant_means = participant_means.reset_index()

    hypotheses = {analysis_column: ALTERNATIVE_HYPOTHESIS_GREATER}
    stat_results = statistical_analysis_for_subjectwise_averages(
        participant_means, hypotheses, [], TEST_KIND_PAIRED)

    output_path = get_erp_fixation_analysis_path(
        erp_frp, snippet_group,
        f'{description}_participant_statistics', analysis_topic+'_stat_test')
    stat_results.to_csv(output_path, index=False, sep=SEPARATOR)
    return stat_results