diff --git a/datasets/Wilming/etdb_v1.0.hdf5 b/datasets/Wilming/etdb_v1.0.hdf5 new file mode 100644 index 0000000..5845c8c Binary files /dev/null and b/datasets/Wilming/etdb_v1.0.hdf5 differ diff --git a/emip_toolkit.py b/emip_toolkit.py index 7dfec24..64e45f4 100644 --- a/emip_toolkit.py +++ b/emip_toolkit.py @@ -1,46 +1,55 @@ """ The EMIP Toolkit (EMTK) can be used under the CC 4.0 license (https://creativecommons.org/licenses/by/4.0/) + Authors: Naser Al Madi (nsalmadi@colby.edu) Ricky Peng (siyuan.peng@colby.edu) + """ import math import os import statistics - import numpy as np import pandas as pd from matplotlib import pyplot as plt from PIL import Image, ImageDraw, ImageEnhance, ImageFont import requests, zipfile +import h5py # Dictionary for datasets Key = dataset_name, Value = [url, is_zipped, citation] -data_dictionary = {'EMIP' : ['https://osf.io/j6vt3/download', False, 'https://dl.acm.org/doi/abs/10.1145/3448018.3457425'], -'AlMadi2018' : ['https://github.com/nalmadi/EMIP-Toolkit/raw/main/datasets/AlMadi2018.zip',False,'https://dl.acm.org/doi/10.1145/3448018.345742']} +data_dictionary = {'EMIP' : ['https://osf.io/j6vt3/download', False, 'https://dl.acm.org/doi/abs/10.1145/3448018.3457425']} class Fixation: """ Basic container for storing Fixation data """ def __init__(self, trial_id, participant_id, timestamp, duration, x_cord, y_cord, token, pupil): """Initializes the basic data for each fixation + Parameters ---------- trial_id : int trial id that the fixation belongs to + participant_id : str participant id that the fixation belongs to + timestamp : int fixation time stamp + duration : int fixation duration in milliseconds + x_cord : float fixation x coordinates + y_cord : float fixation y coordinates + token : str the source code token which the fixation is on + pupil : float pupil size of the fixation """ @@ -56,6 +65,7 @@ def __init__(self, trial_id, participant_id, timestamp, duration, x_cord, y_cord def get_fixation(self): """Returns fixation 
attributes as a list + Returns ------- list @@ -77,10 +87,12 @@ def get_fixation(self): def sample_offset(self, x_offset, y_offset): """Returns the x and y coordinate of the fixation + Parameters ---------- x_offset : float offset to be applied on all fixations in the x-axis + y_offset : float offset to be applied on all fixations in the y-axis """ @@ -89,6 +101,7 @@ def sample_offset(self, x_offset, y_offset): def __str__(self): """Returns string information of fixation + Returns ------- str @@ -102,26 +115,36 @@ class Saccade: def __init__(self, trial_id, participant_id, timestamp, duration, x_cord, y_cord, x1_cord, y1_cord, amplitude, peak_velocity): """Initializes the basic data for each fixation + Parameters ---------- trial_id : int trial id that the fixation belongs to + participant_id : str participant id that the fixation belongs to + timestamp : int saccade start time stamp + duration : int saccade duration in milliseconds + x_cord : float saccade start point x coordinate + y_cord : float saccade start point y coordinate + x1_cord : float saccade end point x coordinate + y1_cord : float saccade end point y coordinate + amplitude : float amplitude for saccade + peak_velocity : int peak velocity during saccade """ @@ -139,6 +162,7 @@ def __init__(self, trial_id, participant_id, timestamp, duration, x_cord, y_cord def get_saccade(self): """Returns saccade attributes as a list + Returns ------- list @@ -158,10 +182,12 @@ def get_saccade(self): def sample_offset(self, x_offset, y_offset): """Returns the x and y coordinate of the saccade + Parameters ---------- x_offset : float offset to be applied on all fixations in the x-axis + y_offset : float offset to be applied on all fixations in the y-axis """ @@ -172,6 +198,7 @@ def sample_offset(self, x_offset, y_offset): def __str__(self): """Returns string information of saccade + Returns ------- str @@ -184,14 +211,18 @@ def __str__(self): class Blink: def __init__(self, trial_id, participant_id, timestamp, 
duration): """Initializes the basic data for each blink + Parameters ---------- trial_id : int trial id that the blink belongs to + participant_id : str participant id that the blink belongs to + timestamp : int blink time stamp + duration : int blink duration in milliseconds """ @@ -202,6 +233,7 @@ def __init__(self, trial_id, participant_id, timestamp, duration): def get_blink(self): """Returns blink attributes as a list + Returns ------- list @@ -215,6 +247,7 @@ def get_blink(self): def __str__(self): """Returns string information of blink + Returns ------- str @@ -232,22 +265,30 @@ def __init__(self, trial_id: int, participant_id: str, image: str, fixations: di samples: list, eye_tracker: str): """Initializes attributes for storing trial data, fixations, saccades, blinks, and stores image name + Parameters ---------- trial_id : int id of this trial + participant_id : str id of this participant + image : str image path for this trial + fixations : dict dictionary that stores fixations as values, order of eye movement in the trial as key + saccades : dict dictionary that stores saccades as values, order of eye movement in the trial as key + blinks : dict dictionary that stores blinks as values, order of eye movement in the trial as key + samples : list list of raw data samples + eye_tracker : str type of eye tracker """ @@ -265,6 +306,7 @@ def __init__(self, trial_id: int, participant_id: str, image: str, fixations: di def get_trial_id(self): """Returns the trial id + Returns ------- int @@ -274,6 +316,7 @@ def get_trial_id(self): def get_subject_id(self): """Returns the participant id + Returns ------- str @@ -283,6 +326,7 @@ def get_subject_id(self): def get_trial_image(self): """Returns the image filename associated with the trial + Returns ------- str @@ -292,6 +336,7 @@ def get_trial_image(self): def get_fixations(self): """Returns the fixations in the trial + Returns ------- dict @@ -301,6 +346,7 @@ def get_fixations(self): def get_fixation_number(self): 
"""Returns the number of fixations in the trial + Returns ------- int @@ -310,6 +356,7 @@ def get_fixation_number(self): def get_saccades(self): """Returns the saccades in the trial + Returns ------- dict @@ -319,6 +366,7 @@ def get_saccades(self): def get_saccade_number(self): """Returns the number of saccades in the trial + Returns ------- int @@ -328,6 +376,7 @@ def get_saccade_number(self): def get_blinks(self): """Returns the blinks in the trial + Returns ------- dict @@ -337,6 +386,7 @@ def get_blinks(self): def get_blink_number(self): """Returns the number of blinks in the trial + Returns ------- int @@ -346,6 +396,7 @@ def get_blink_number(self): def get_eye_movement_number(self): """Returns the total number of eye movement in the trial + Returns ------- int @@ -355,6 +406,7 @@ def get_eye_movement_number(self): def get_samples(self): """Returns the raw sample in a list + Returns ------- list @@ -364,6 +416,7 @@ def get_samples(self): def get_sample_number(self): """Returns the total number of eye movement in the trial + Returns ------- int @@ -373,6 +426,7 @@ def get_sample_number(self): def get_offset(self): """Returns total offset applied by adding all offsets in offset history + Returns ------- tuple @@ -394,10 +448,12 @@ def reset_offset(self): def sample_offset(self, x_offset, y_offset): """Moves samples +X and +Y pixels across the viewing window to correct fixation shift or other shifting problems manually + Parameters ---------- x_offset : int offset to be applied on all fixations in the x-axis + y_offset : int offset to be applied on all fixations in the y-axis """ @@ -422,6 +478,7 @@ def sample_offset(self, x_offset, y_offset): def __draw_raw_data(self, draw): """Private method that draws raw sample data + Parameters ---------- draw : PIL.ImageDraw.Draw @@ -448,10 +505,12 @@ def __draw_raw_data(self, draw): def __draw_fixation(self, draw, draw_number=False): """Private method that draws the fixation, also allow user to draw eye movement order + 
Parameters ---------- draw : PIL.ImageDraw.Draw a Draw object imposed on the image + draw_number : bool whether user wants to draw the eye movement number """ @@ -479,12 +538,15 @@ def __draw_fixation(self, draw, draw_number=False): def __draw_aoi(self, draw, aoi, bg_color): """Private method to draw the Area of Interest on the image + Parameters ---------- draw : PIL.ImageDraw.Draw a Draw object imposed on the image + aoi : pandas.DataFrame a DataFrame that contains the area of interest bounds + bg_color : str background color """ @@ -504,10 +566,12 @@ def __draw_aoi(self, draw, aoi, bg_color): def __draw_saccade(self, draw, draw_number=False): """ + Parameters ---------- draw : PIL.ImageDraw.Draw a Draw object imposed on the image + draw_number : bool whether user wants to draw the eye movement number """ @@ -533,18 +597,25 @@ def draw_trial(self, image_path, draw_raw_data=False, draw_fixation=True, draw_s draw_aoi=None, save_image=None): """Draws the trial image and raw-data/fixations over the image circle size indicates fixation duration + image_path : str path for trial image file. + draw_raw_data : bool, optional whether user wants raw data drawn. + draw_fixation : bool, optional whether user wants filtered fixations drawn + draw_saccade : bool, optional whether user wants saccades drawn + draw_number : bool, optional whether user wants to draw eye movement number + draw_aoi : pandas.DataFrame, optional Area of Interests + save_image : str, optional path to save the image, image is saved to this path if it parameter exists """ @@ -612,10 +683,12 @@ class Experiment: def __init__(self, trial: list, eye_tracker: str, filetype: str): """Initialize each experiment with raw data file This method splits data into a bunch of trials based on JPG + Parameters ---------- trial: list raw data TSV file. 
+ eye_tracker: str type of eye tracker used @@ -629,6 +702,7 @@ def __init__(self, trial: list, eye_tracker: str, filetype: str): def get_number_of_trials(self): """Returns the number of trials in the experiment + Returns ------- int @@ -637,6 +711,7 @@ def get_number_of_trials(self): def get_eye_tracker(self): """Returns the name of eye tracker in the experiment + Returns ------- str @@ -648,22 +723,28 @@ def get_eye_tracker(self): def idt_classifier(raw_fixations, minimum_duration=50, sample_duration=4, maximum_dispersion=25): """I-DT classifier based on page 296 of eye tracker manual: https://psychologie.unibas.ch/fileadmin/user_upload/psychologie/Forschung/N-Lab/SMI_iView_X_Manual.pdf + Notes: remember that some data is MSG for mouse clicks. some records are invalid with value -1. read right eye data only. + Parameters ---------- raw_fixations : list a list of fixations information containing timestamp, x_cord, and y_cord + minimum_duration : int, optional minimum duration for a fixation in milliseconds, less than minimum is considered noise. set to 50 milliseconds by default + sample_duration : int, optional Sample duration in milliseconds, this is 4 milliseconds based on this eye tracker + maximum_dispersion : int, optional maximum distance from a group of samples to be considered a single fixation. Set to 25 pixels by default + Returns ------- list @@ -714,20 +795,26 @@ def idt_classifier(raw_fixations, minimum_duration=50, sample_duration=4, maximu def read_SMIRed250(filename, filetype, minimum_duration=50, sample_duration=4, maximum_dispersion=25): """Read tsv file from SMI Red 250 eye tracker + Parameters ---------- filename : str name of the tsv file + filetype : str type of the file, e.g. "tsv" + minimum_duration : int, optional minimum duration for a fixation in milliseconds, less than minimum is considered noise. set to 50 milliseconds by default. 
+ sample_duration : int, optional Sample duration in milliseconds, this is 4 milliseconds based on this eye tracker. + maximum_dispersion : int, optional maximum distance from a group of samples to be considered a single fixation. Set to 25 pixels by default. + Returns ------- Experiment @@ -845,6 +932,7 @@ def read_SMIRed250(filename, filetype, minimum_duration=50, sample_duration=4, m def read_EyeLink1000(filename, filetype): """Read asc file from Eye Link 1000 eye tracker + Parameters ---------- filename : str @@ -894,7 +982,7 @@ def read_EyeLink1000(filename, filetype): # Read image location index = str(int(trial_id) + 1) experiment = participant_id.split('/')[-1] - location = 'datasets/AlMadi2018/AlMadi2018/runtime/dataviewer/' + experiment + '/graphics/VC_' + index + '.vcl' + location = 'runtime/dataviewer/' + experiment + '/graphics/VC_' + index + '.vcl' with open(location, 'r') as file: image = file.readlines()[1].split()[-3].split('/')[-1] @@ -971,7 +1059,7 @@ def read_EyeLink1000(filename, filetype): # Read image location index = str(int(trial_id) + 1) experiment = participant_id.split('/')[-1] - location = 'datasets/AlMadi2018/AlMadi2018/runtime/dataviewer/' + experiment + '/graphics/VC_' + index + '.vcl' + location = 'runtime/dataviewer/' + experiment + '/graphics/VC_' + index + '.vcl' with open(location, 'r') as file: image = file.readlines()[1].split()[-3].split('/')[-1] @@ -992,10 +1080,12 @@ def read_EyeLink1000(filename, filetype): def find_background_color(img): """Private function that identifies the background color of the image + Parameters ---------- img : PIL.Image a PIL (pillow fork) Image object + Returns ------- str @@ -1026,20 +1116,27 @@ def find_background_color(img): def find_aoi(image=None, image_path=None, img=None, level="sub-line", margin_height=4, margin_width=7): """Find Area of Interest in the given image and store the aoi attributes in a Pandas Dataframe + Parameters ---------- image : str filename for the image, e.g. 
"vehicle_java.jpg" + image_path : str path for all images, e.g. "emip_dataset/stimuli/" + img : PIL.Image, optional PIL.Image object if user chooses to input an PIL image object + level : str, optional level of detection in AOIs, "line" for each line as an AOI or "sub-line" for each token as an AOI + margin_height : int, optional marginal height when finding AOIs, use smaller number for tight text layout + margin_width : int, optional marginal width when finding AOIs, use smaller number for tight text layout + Returns ------- pandas.DataFrame @@ -1165,14 +1262,18 @@ def find_aoi(image=None, image_path=None, img=None, level="sub-line", margin_hei def draw_aoi(aoi, image, image_path): """Draws AOI rectangles on to an image. + Parameters ---------- aoi : pandas.DataFrame a pandas DataFrame containing rectangle attributes representing areas of interest (AOIs) + image : str filename for the image where AOI rectangles will be imposed, e.g. "vehicle_java.jpg" + image_path : str path for all images, e.g. "emip_dataset/stimuli/" + Returns ------- PIL.Image @@ -1204,12 +1305,15 @@ def draw_aoi(aoi, image, image_path): def add_tokens_to_AOIs(file_path, aois_raw): """Adds tokens from code files to aois dataframe and returns it. + Parameters ---------- file_path : str path to directory where code files are stored. In EMIP this is "emip_stimulus_programs" + aois_raw : pandas.Dataframe the dataframe where AOIs are stored. + Returns ------- pandas.DataFrame @@ -1282,13 +1386,17 @@ def add_tokens_to_AOIs(file_path, aois_raw): def add_srcml_to_AOIs(aois_raw, srcML_path): """Adds srcML tags to AOIs dataframe and returns it. Check https://www.srcml.org/ for more information about srcML + The files: rectangle.tsv and vehicle.tsv should be in the same directory as the code. 
+ Parameters ---------- aois_raw : pandas.Dataframe the dataframe where AOIs are stored + srcML_path : string the path of the srcML tags file + Returns ------- pandas.DataFrame @@ -1356,15 +1464,19 @@ def add_srcml_to_AOIs(aois_raw, srcML_path): def overlap(fix, AOI, radius=25): """Checks if fixation is within radius distance or over an AOI. Returns True/False. + Parameters ---------- fix : Fixation A single fixation in a trial being considered for overlapping with the AOI + AOI : pandas.DataFrame contains AOI #kind name x y width height local_id image token + radius : int, optional radius around AOI to consider fixations in it within the AOI. default is 25 pixel since the fixation filter groups samples within 25 pixels. + Returns ------- bool @@ -1382,15 +1494,19 @@ def overlap(fix, AOI, radius=25): def hit_test(trial, aois_tokens, radius=25): """Checks if fixations are within AOI with a fixation radius of 25 px (since each fix is a sum of samples within 25px) + Parameters ---------- trial : Trial contains fixations and other metadata (trial#, participant, code_file, code_language) - fixation includes timestamp, duration, x_cord, y_cord + aois_tokens : pandas.Dataframe contains each AOI location and dimension and token text + radius : int, optional radius of circle using in hit test + Returns ------- pandas.DataFrame @@ -1446,12 +1562,15 @@ def hit_test(trial, aois_tokens, radius=25): def EMIP_dataset(path, sample_size=216): """Import the EMIP dataset + Parameters ---------- path : str path to EMIP dataset raw data directory, e.g. 
def Wilming_dataset(filename, sample_size=216):
    """Import the Wilming dataset

    Rows of the "Age study" group are walked in file order. Every run of
    consecutive rows that share the same subject, trial and fixation index
    becomes one Fixation (built from the last row of the run), fixations are
    grouped into one Trial per (subject, trial) run, and trials are grouped
    into one Experiment per subject.

    Parameters
    ----------
    filename : str
        path to Wilming dataset .hdf5 file, e.g. 'datasets/Wilming/etdb_v1.0.hdf5'

    sample_size : int, optional
        the maximum number of subjects to be processed, the default is 216

    Returns
    -------
    dict
        a dictionary of experiments where the key is the subject ID
        (the value stored in the "SUBJECTINDEX" column); empty when the
        dataset has no rows or sample_size is not positive
    """
    subject = {}

    if sample_size <= 0:
        return subject

    eye_tracker = "EyeLink2"

    with h5py.File(filename, 'r') as file:
        print("parsing file:", filename)
        data = file["Age study"]

        # Read every column once; indexing an HDF5 dataset element by element
        # issues a separate read for each access.
        subject_ids = data["SUBJECTINDEX"][:]
        trial_ids = data["trial"][:]
        fixation_ids = data["fix"][:]
        image_numbers = data["on_image"][:]
        starts = data["start"][:]
        ends = data["end"][:]
        xs = data["x"][:]
        ys = data["y"][:]

        n_rows = len(subject_ids)
        if n_rows == 0:
            return subject

        trials = []
        fixations = {}
        count = 0

        # `row` is the first row of the *next* run, `prev` the last row of the
        # current one. row == n_rows is a virtual row past the end of the data
        # that closes whatever is still open.
        for row in range(1, n_rows + 1):
            prev = row - 1
            at_end = row == n_rows

            # Boundaries nest: a new subject implies a new trial, and a new
            # trial implies a new fixation, even when the "trial"/"fix"
            # values happen to repeat across the boundary.
            new_subject = at_end or subject_ids[row] != subject_ids[prev]
            new_trial = new_subject or trial_ids[row] != trial_ids[prev]
            new_fixation = new_trial or fixation_ids[row] != fixation_ids[prev]

            if not new_fixation:
                continue

            participant_id = subject_ids[prev]
            trial_id = trial_ids[prev]

            # Close the fixation first so it lands in the trial it belongs to.
            fixations[fixation_ids[prev]] = Fixation(trial_id=trial_id,
                                                     participant_id=participant_id,
                                                     timestamp=starts[prev],
                                                     duration=ends[prev] - starts[prev],
                                                     x_cord=xs[prev],
                                                     y_cord=ys[prev],
                                                     token="",
                                                     pupil=None)

            if new_trial:
                # NOTE(review): the image category is taken from the "trial"
                # column, as in the original implementation - confirm this is
                # the intended column for the Stimuli_<category> folder name.
                image_category = trial_ids[prev]
                image_number = image_numbers[prev]
                # Empty containers (not None) match the dict/dict/list types
                # documented by Trial.__init__ for data this file lacks.
                trials.append(Trial(trial_id=trial_id,
                                    participant_id=participant_id,
                                    image=f"Stimuli_{image_category}/{image_category}/{image_number}.png",
                                    fixations=fixations,
                                    saccades={},
                                    blinks={},
                                    samples=[],
                                    eye_tracker=eye_tracker))
                fixations = {}

            if new_subject:
                # Close the subject last so its final trial is included.
                subject[participant_id] = Experiment(trial=trials,
                                                     eye_tracker=eye_tracker,
                                                     filetype="hdf")
                trials = []
                count += 1
                if count >= sample_size:
                    break

    return subject
'../../dataset_name/' + url : str link to the data @@ -1563,13 +1808,14 @@ def download(dataset_name): citation : str link to the paper where the dataset originates from + """ url, is_zipped, citation = data_dictionary[dataset_name] # Check if dataset has already been downloaded if not check_downloaded(dataset_name): print('Downloading...') - + #creates a zip file of the data if unzipped if is_zipped == False: @@ -1587,4 +1833,4 @@ def download(dataset_name): print('Please cite this paper: ', citation) - return './datasets/' + dataset_name + return './datasets/' + dataset_name \ No newline at end of file