Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added OpenEphys to KWD conversion #44

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions klusta_pipeline/openephys_to_kwd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# Converts open ephys format files to kwd files
import os
import glob
import numpy as np
import h5py as h5

# OpenEphys constants
NUM_HEADER_BYTES = 1024
SAMPLES_PER_RECORD = 1024
BYTES_PER_SAMPLE = 2
RECORD_SIZE = 4 + 8 + SAMPLES_PER_RECORD*BYTES_PER_SAMPLE + 10
RECORD_RECORDING_OFFSET = 8+2
RECORD_TIMESTAMP_OFFSET = 0


def find_continuous_files(data_dir):
return glob.glob(os.path.join(data_dir, '*.continuous'))

def find_events_files(data_dir):
return glob.glob(os.path.join(data_dir, '*.events'))

def get_n_channels(continuous_files):
return len(continuous_files)

def initialize_kwd_file(data_dir, experiment_name):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

have you looked at from klusta_pipeline.dataio import save_info, save_recording, save_chanlist, save_probe, save_parameters
there might be a bunch of code you can reuse in there
Also, are we going to have averaging options here? as in klusta_pipeline.utils? options to ignore dead channels?

# Create an empty KWD file with the appropriate structure
kwd_filename = experiment_name + '.raw.kwd'
kwd_file = h5.File(os.path.join(data_dir, kwd_filename), "w-")
kwd_file.create_group("recordings")

continuous_files = find_continuous_files(data_dir)
unique_recording_numbers = get_openephys_unique_recording_numbers(continuous_files)
for rec_num in unique_recording_numbers:
kwd_file.create_group("/recordings/{}".format(rec_num))
return kwd_file

def read_openephys_header(oe_file):
# return a dictionary of open ephys header values
oe_header = {}
oe_file.seek(0)
h = oe_file.read(1024).decode().replace('\n', '').replace('header.','')
for ind, header_item in enumerate(h.split(';')):
if '=' in header_item:
header_key = header_item.split(' = ')[0]
header_value = header_item.split(' = ')[1]
oe_header[header_key] = header_value
return oe_header

def calculate_openephys_continuous_sizes(oe_file):
# Calculate: Number of Records, Number of Samples
n_file_bytes = os.fstat(oe_file.fileno()).st_size
n_record_bytes = n_file_bytes - NUM_HEADER_BYTES

# Check if file is consistent
if n_record_bytes % RECORD_SIZE != 0:
raise Exception("File size inconsistent: possible corrupt data file")

n_records = n_record_bytes // RECORD_SIZE
n_samples = n_records * SAMPLES_PER_RECORD
return (n_records, n_samples)

def load_openephys_continuous_record(oe_file, record_num):
# Loads a single record from an openephys continous file

# Calculate start of record in bytes
record_start = 1024 + record_num*RECORD_SIZE
# Move to start of record
oe_file.seek(record_start)
# Get Timestamp
record_timestamp = np.fromfile(oe_file, np.dtype('<i8'), 1)
# Get Number of samples
record_n_samples = np.fromfile(oe_file, np.dtype('<u2'), 1)[0]
if record_n_samples != SAMPLES_PER_RECORD:
raise Exception('Found corrupted record in ' + str(record_num))
# Get recording number
record_recording_number = np.fromfile(oe_file, np.dtype('>u2'), 1)[0]
# Get raw record data
record_raw_data = np.fromfile(oe_file, np.dtype('>i2'), record_n_samples)
# Ignore record marker
oe_file.read(10)
# Make array of timesamples
sample_numbers = np.arange(record_timestamp,
record_timestamp + record_n_samples)
# Make data array
data = np.zeros((record_n_samples, 2))
data[:, 0] = sample_numbers
data[:, 1] = record_raw_data
recording_start_sample = int(data[0, 0])

return (record_num, record_recording_number, recording_start_sample, data)

def get_openephys_continuous_recording_numbers(oe_file):
# Returns an array of recording numbers within a .continuous file
recordings = []
(n_records, n_samples) = calculate_openephys_continuous_sizes(oe_file)
for record in range(n_records):
record_offset = 1024 + record*RECORD_SIZE
recording_offset = record_offset + RECORD_RECORDING_OFFSET
oe_file.seek(recording_offset)
record_recording_number = np.fromfile(oe_file, np.dtype('>u2'), 1)[0]
recordings.append(record_recording_number)
return recordings

def get_openephys_continuous_timestamps(oe_file):
# Returns an array of recording numbers within a .continuous file
timestamps = []
(n_records, n_samples) = calculate_openephys_continuous_sizes(oe_file)
for record in range(n_records):
record_offset = 1024 + record*RECORD_SIZE
timestamp_offset = record_offset + RECORD_TIMESTAMP_OFFSET
oe_file.seek(timestamp_offset)
record_timestamp = np.fromfile(oe_file, np.dtype('<i8'), 1)
timestamps.append(record_timestamp)
return timestamps

def get_openephys_continuous_record_metadata(oe_file):
# Returns an array of recording numbers within a .continuous file
recordings = []
timestamps = []
(n_records, n_samples) = calculate_openephys_continuous_sizes(oe_file)
for record in range(n_records):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this record/record offset code is a bit of a repeated motif throughout several functions, it might be clearer to say

for record_offset in gen_record_offsets(n_records):
    timestamps.append(seek_and_read(oe_file, record_offset + RECORD_TIMESTAMP_OFFSET, '<i8'))
    recordings.append(seek_and_read(oe_file, record_offset + RECORD_RECORDING_OFFSET, '>u2')[0])

but that's just my style

record_offset = 1024 + record*RECORD_SIZE
timestamp_offset = record_offset + RECORD_TIMESTAMP_OFFSET
recording_offset = record_offset + RECORD_RECORDING_OFFSET
oe_file.seek(timestamp_offset)
record_timestamp = np.fromfile(oe_file, np.dtype('<i8'), 1)
timestamps.append(record_timestamp)
oe_file.seek(recording_offset)
record_recording_number = np.fromfile(oe_file, np.dtype('>u2'), 1)[0]
recordings.append(record_recording_number)
return (recordings, timestamps)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you don't need parenthesis here either


def get_openephys_continuous_record_recording_number(oe_file, record):
offset = 1024 + record*RECORD_SIZE + RECORD_RECORDING_OFFSET
oe_file.seek(offset)
return np.fromfile(oe_file, np.dtype('>u2'), 1)[0]

def get_openephys_continuous_record_timestamp(oe_file, record):
offset = 1024 + record*RECORD_SIZE + RECORD_TIMESTAMP_OFFSET
oe_file.seek(offset)
return np.fromfile(oe_file, np.dtype('<i8'), 1)

def get_openephys_continuous_recording_timestamps(oe_file, recording):
(recordings, timestamps) = get_openephys_continuous_record_metadata(oe_file)
return timestamps[recordings == recording]

def get_openephys_unique_recording_numbers(continuous_files):
recordings = []
for cf in continuous_files:
with open(cf, 'rb') as f:
cf_unique_recordings = get_openephys_continuous_recording_numbers(f)
recordings = recordings + cf_unique_recordings
return list(set(recordings))

def check_n_samples_consistency(continuous_files):
n_samples_list = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not make this a set to begin with?

for cf in continuous_files:
with open(cf, "rb") as oe_file:
n_samples = calculate_openephys_continuous_sizes(oe_file)[1]
n_samples_list.append(n_samples)
if len(set(n_samples_list)) != 1:
raise Exception("Inconsistent number of samples across channels")
return list(set(n_samples_list))[0]

def get_openephys_channel_number(continuous_file):
cf_filename = os.path.split(continuous_file)[1]
cf_filename = cf_filename.replace('.continuous', '')
if 'CH' in cf_filename:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you're going to have these kinds of magic numbers can you do some asserts to make sure you're getting the data you expected.

also maybe elifs and a else case that raises an error?

channel_number = int(cf_filename.split('CH')[-1]) - 1
if 'ADC' in cf_filename:
channel_number = int(cf_filename.split('ADC')[-1]) - 12
if 'AUX' in cf_filename:
channel_number = int(cf_filename.split('AUX')[-1]) - 4
return channel_number

def build_kwd_recording(kwd_file, continuous_files, recording_number):
n_channels = get_n_channels(continuous_files)
n_samples = check_n_samples_consistency(continuous_files)
dset = kwd_file.create_dataset("/recordings/{}/data".format(recording_number),
(n_samples, n_channels), dtype='int16')

channel_bit_volts = np.zeros(n_channels)
channel_sample_rates = np.zeros(n_channels)
channel_timestamps = []

for cf in continuous_files:
# get channel number
channel = get_openephys_channel_number(cf)
print("Storing channel {} of {}".format(channel, n_channels))
with open(cf, "rb") as oe_file:
cf_header = read_openephys_header(oe_file)
channel_sample_rates[channel] = int(cf_header['sampleRate'])
channel_bit_volts[channel] = float(cf_header['bitVolts'])

(n_records, n_oe_file_samples) = calculate_openephys_continuous_sizes(oe_file)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you don't need the parenthesis around (n_records, n_oe_file_samples)

(recordings, timestamps) = get_openephys_continuous_record_metadata(oe_file)
recordings = np.array(recordings)
timestamps = np.array(timestamps)
channel_timestamps.append(timestamps)

# Select records corresponding to this recording
recording_records = np.arange(n_records)[recordings==recording_number]

# For each record, save the samples in the kwd recording dataset
for record_ind, record in enumerate(recording_records):
record_data_list = load_openephys_continuous_record(oe_file, record)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe change this to
_, _, _. record_data = load_openephys_continuous_record(oe_file, record) or
record_data = load_openephys_continuous_record(oe_file, record)[3]

record_data = record_data_list[3]
record_sample_lo = record_ind*SAMPLES_PER_RECORD
record_sample_hi = record_ind*SAMPLES_PER_RECORD + SAMPLES_PER_RECORD
#print(record_ind, record, record_sample_lo, record_sample_hi, n_records, n_samples)
dset[record_sample_lo:record_sample_hi, channel] = record_data[:, 1]

# Save recording metadata
kwd_file.create_dataset("/recordings/{}/application_data/channel_sample_rates".format(recording_number), data=channel_sample_rates)
kwd_file.create_dataset("/recordings/{}/application_data/channel_bit_volts".format(recording_number), data=channel_bit_volts)
kwd_file.create_dataset("/recordings/{}/application_data/timestamps".format(recording_number), data=np.array(channel_timestamps))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we set it up to have a an if name == main so we can run it as a script