From 337b0533f46a35fc141cf8b1a3d12d26b82158a3 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 19 Sep 2023 16:29:33 -0700 Subject: [PATCH 01/26] transfer_learn --- setup.cfg | 44 +- src/baskerville/HY_helper.py | 74 +++ src/baskerville/layers.py | 49 ++ src/baskerville/scripts/hound_train.py | 138 ++++- .../scripts/westminster_train_folds_copy.py | 509 ++++++++++++++++++ src/baskerville/trainer.py | 79 ++- 6 files changed, 853 insertions(+), 40 deletions(-) create mode 100644 src/baskerville/HY_helper.py create mode 100755 src/baskerville/scripts/westminster_train_folds_copy.py diff --git a/setup.cfg b/setup.cfg index 43fda39..0f1e198 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,31 +19,31 @@ package_dir = packages = find: python_requires = >=3.8, <3.11 install_requires = - h5py~=3.7.0 - intervaltree~=3.1.0 - joblib~=1.1.1 - matplotlib~=3.7.1 - google-cloud-storage~=2.0.0 - natsort~=7.1.1 - networkx~=2.8.4 - numpy~=1.24.3 - pandas~=1.5.3 - pybigwig~=0.3.18 - pysam~=0.21.0 - pybedtools~=0.9.0 - qnorm~=0.8.1 - seaborn~=0.12.2 - scikit-learn~=1.2.2 - scipy~=1.9.1 - statsmodels~=0.13.5 - tabulate~=0.8.10 - tensorflow~=2.12.0 - tqdm~=4.65.0 + h5py>=3.7.0 + intervaltree>=3.1.0 + joblib>=1.1.1 + matplotlib>=3.7.1 + google-cloud-storage>=2.0.0 + natsort>=7.1.1 + networkx>=2.8.4 + numpy>=1.24.3 + pandas>=1.5.3 + pybigwig>=0.3.18 + pysam>=0.21.0 + pybedtools>=0.9.0 + qnorm>=0.8.1 + seaborn>=0.12.2 + scikit-learn>=1.2.2 + scipy>=1.9.1 + statsmodels>=0.13.5 + tabulate>=0.8.10 + tensorflow>=2.12.0 + tqdm>=4.65.0 [options.extras_require] dev = - black==22.3.0 - pytest==7.1.2 + black>=22.3.0 + pytest>=7.1.2 [options.packages.find] where = src diff --git a/src/baskerville/HY_helper.py b/src/baskerville/HY_helper.py new file mode 100644 index 0000000..f4f7878 --- /dev/null +++ b/src/baskerville/HY_helper.py @@ -0,0 +1,74 @@ +import numpy as np +from basenji import dna_io +import pysam +import pyBigWig + +def make_seq_1hot(genome_open, chrm, start, end, seq_len): + if start < 0: + 
seq_dna = 'N'*(-start) + genome_open.fetch(chrm, 0, end) + else: + seq_dna = genome_open.fetch(chrm, start, end) + + #Extend to full length + if len(seq_dna) < seq_len: + seq_dna += 'N'*(seq_len-len(seq_dna)) + + seq_1hot = dna_io.dna_1hot(seq_dna) + return seq_1hot + +#Helper function to get (padded) one-hot +def process_sequence(fasta_file, chrom, start, end, seq_len=524288) : + + fasta_open = pysam.Fastafile(fasta_file) + seq_len_actual = end - start + + #Pad sequence to input window size + start -= (seq_len - seq_len_actual) // 2 + end += (seq_len - seq_len_actual) // 2 + + #Get one-hot + sequence_one_hot = make_seq_1hot(fasta_open, chrom, start, end, seq_len) + + return sequence_one_hot.astype('float32') + +def compute_cov(seqnn_model, chr, start, end): + seq_len = seqnn_model.model.layers[0].input.shape[1] + seq1hot = process_sequence('/home/yuanh/programs/genomes/hg38/hg38.fa', chr, start, end, seq_len=seq_len) + out = seqnn_model.model(seq1hot[None, ]) + return out.numpy() + +def write_bw(bw_file, chr, start, end, values, span=32): + bw_out = pyBigWig.open(bw_file, 'w') + header = [] + header.append((chr, end+1)) + bw_out.addHeader(header) + bw_out.addEntries(chr, start, values=values, span=span, step=span) + bw_out.close() + +def transform(seq_cov, clip=384, clip_soft=320, scale=0.3): + seq_cov = scale * seq_cov # scale + seq_cov = -1 + np.sqrt(1+seq_cov) # variant stabilize + clip_mask = (seq_cov > clip_soft) # soft clip + seq_cov[clip_mask] = clip_soft-1 + np.sqrt(seq_cov[clip_mask] - clip_soft+1) + seq_cov = np.clip(seq_cov, -clip, clip) # hard clip + return seq_cov + +def untransform(cov, scale=0.3, clip_soft=320, pool_width=32): + + # undo clip_soft + cov_unclipped = (cov - clip_soft + 1)**2 + clip_soft - 1 + unclip_mask = (cov > clip_soft) + cov[unclip_mask] = cov_unclipped[unclip_mask] + + # undo sqrt + cov = (cov +1)**2 - 1 + + # undo scale + cov = cov / scale + + # undo sum + cov = cov / pool_width + + return cov + + diff --git 
a/src/baskerville/layers.py b/src/baskerville/layers.py index acde2e5..1d28c2c 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -23,6 +23,55 @@ for device in gpu_devices: tf.config.experimental.set_memory_growth(device, True) +##################### +# transfer learning # +##################### +class AdapterHoulsby(tf.keras.layers.Layer): + ### Houlsby et al. 2019 implementation + + def __init__( + self, + latent_size, + activation=tf.keras.layers.ReLU(), + **kwargs): + super(AdapterHoulsby, self).__init__(**kwargs) + self.latent_size = latent_size + self.activation = activation + + def build(self, input_shape): + self.down_project = tf.keras.layers.Dense( + units=self.latent_size, + activation="linear", + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + bias_initializer="zeros", + name='adapter_down' + ) + + self.up_project = tf.keras.layers.Dense( + units=input_shape[-1], + activation="linear", + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + bias_initializer="zeros", + name='adapter_up' + ) + + def call(self, inputs): + projected_down = self.down_project(inputs) + activated = self.activation(projected_down) + projected_up = self.up_project(activated) + output = projected_up + inputs + return output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "latent_size": self.latent_size, + "activation": self.activation + } + ) + return config + ############################################################ # Basic ############################################################ diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index e7ec150..d5a754d 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -17,6 +17,7 @@ import json import os import shutil +import re import numpy as np import pandas as pd @@ -26,6 +27,7 @@ from baskerville import dataset from baskerville import 
seqnn from baskerville import trainer +from baskerville import layers """ hound_train.py @@ -33,7 +35,6 @@ Train Hound model using given parameters and data. """ - def main(): parser = argparse.ArgumentParser(description="Train a model.") parser.add_argument( @@ -67,6 +68,17 @@ def main(): default=False, help="Restore only model trunk [Default: %(default)s]", ) + parser.add_argument( + "--transfer_mode", + default="full", + help="transfer method. [full, linear, adapter]", + ) + parser.add_argument( + "--latent", + type=int, + default=16, + help="adapter latent size.", + ) parser.add_argument( "--tfr_train", default=None, @@ -131,31 +143,65 @@ def main(): tfr_pattern=args.tfr_eval, ) ) - + params_model["strand_pair"] = strand_pairs if args.mixed_precision: - mixed_precision.set_global_policy("mixed_float16") - + policy = mixed_precision.Policy('mixed_float16') + mixed_precision.set_global_policy(policy) + if params_train.get("num_gpu", 1) == 1: ######################################## # one GPU # initialize model seqnn_model = seqnn.SeqNN(params_model) - + # restore if args.restore: seqnn_model.restore(args.restore, trunk=args.trunk) + # transfer learning strategies + if args.transfer_mode=='full': + seqnn_model.model.trainable=True + + elif args.transfer_mode=='batch_norm': + seqnn_model.model_trunk.trainable=False + for l in seqnn_model.model.layers: + if l.name.startswith("batch_normalization"): + l.trainable=True + seqnn_model.model.summary() + + elif args.transfer_mode=='linear': + seqnn_model.model_trunk.trainable=False + seqnn_model.model.summary() + + elif args.transfer_mode=='adapterHoulsby': + seqnn_model.model_trunk.trainable=False + strand_pair = strand_pairs[0] + adapter_model = make_adapter_model(seqnn_model.model, strand_pair, args.latent) + seqnn_model.model = adapter_model + seqnn_model.models[0] = seqnn_model.model + seqnn_model.model_trunk = None + seqnn_model.model.summary() + # initialize trainer seqnn_trainer = trainer.Trainer( params_train, 
train_data, eval_data, args.out_dir ) - + # compile model seqnn_trainer.compile(seqnn_model) + # train model + if args.keras_fit: + seqnn_trainer.fit_keras(seqnn_model) + else: + if len(args.data_dirs) == 1: + seqnn_trainer.fit_tape(seqnn_model) + else: + seqnn_trainer.fit2(seqnn_model) + else: ######################################## # multi GPU @@ -163,6 +209,7 @@ def main(): strategy = tf.distribute.MirroredStrategy() with strategy.scope(): + if not args.keras_fit: # distribute data for di in range(len(args.data_dirs)): @@ -190,16 +237,81 @@ def main(): # compile model seqnn_trainer.compile(seqnn_model) - # train model - if args.keras_fit: - seqnn_trainer.fit_keras(seqnn_model) - else: - if len(args.data_dirs) == 1: - seqnn_trainer.fit_tape(seqnn_model) + # train model + if args.keras_fit: + seqnn_trainer.fit_keras(seqnn_model) else: - seqnn_trainer.fit2(seqnn_model) + if len(args.data_dirs) == 1: + seqnn_trainer.fit_tape(seqnn_model) + else: + seqnn_trainer.fit2(seqnn_model) +def make_adapter_model(input_model, strand_pair, latent_size=16): + # take seqnn_model as input + # output a new seqnn_model object + # only the adapter, and layer_norm are trainable + + model = tf.keras.Model(inputs=input_model.input, + outputs=input_model.layers[-2].output) # remove the switch_reverse layer + + # save current graph + layer_parent_dict_old = {} # the parent layers of each layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict_old: + layer_parent_dict_old.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict_old[layer_name]: + layer_parent_dict_old[layer_name].append(layer.name) + + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({model.layers[0].name: model.input}) + + # remove switch_reverse + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] 
+ for i in to_fix: + del layer_parent_dict_old[i] + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + + for layer in model.layers[1:]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + + # insert adapter: + elif re.match('add', layer.name): + if any([re.match('dropout', i) for i in parent_layers]): + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) + x = layer([layer_input[0], x]) + else: + x = layer(layer_input) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) + + # set layer_norm layers to trainable + for l in model_adapter.layers: + if re.match('layer_normalization', l.name): l.trainable = True + return model_adapter ################################################################################ # __main__ ################################################################################ diff --git a/src/baskerville/scripts/westminster_train_folds_copy.py b/src/baskerville/scripts/westminster_train_folds_copy.py new file mode 100755 index 0000000..6f27ec5 --- /dev/null +++ b/src/baskerville/scripts/westminster_train_folds_copy.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python +# Copyright 2019 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from optparse import OptionParser, OptionGroup +import glob +import json +import os +import pdb +import shutil + +from natsort import natsorted + +import slurm + +""" +westminster_train_folds.py + +Train baskerville model replicates on cross folds using given parameters and data. +""" + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] ...' 
+ parser = OptionParser(usage) + + # train + train_options = OptionGroup(parser, 'houndtrain.py options') + train_options.add_option('-k', dest='keras_fit', + default=False, action='store_true', + help='Train with Keras fit method [Default: %default]') + train_options.add_option('-m', dest='mixed_precision', + default=False, action='store_true', + help='Train with mixed precision [Default: %default]') + train_options.add_option('-o', dest='out_dir', + default='train_out', + help='Training output directory [Default: %default]') + train_options.add_option('--restore', dest='restore', + help='Restore model and continue training, from existing fold train dir [Default: %default]') + train_options.add_option('--trunk', dest='trunk', + default=False, action='store_true', + help='Restore only model trunk [Default: %default]') + train_options.add_option('--tfr_train', dest='tfr_train_pattern', + default=None, + help='Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]') + train_options.add_option('--tfr_eval', dest='tfr_eval_pattern', + default=None, + help='Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]') + parser.add_option_group(train_options) + + # eval + eval_options = OptionGroup(parser, 'hound_eval.py options') + eval_options.add_option('--rank', dest='rank_corr', + default=False, action='store_true', + help='Compute Spearman rank correlation [Default: %default]') + eval_options.add_option('--rc', dest='rc', + default=False, action='store_true', + help='Average forward and reverse complement predictions [Default: %default]') + eval_options.add_option('--shifts', dest='shifts', + default='0', type='str', + help='Ensemble prediction shifts [Default: %default]') + parser.add_option('--step', dest='step', + default=1, type='int', + help='Spatial step for specificity/spearmanr [Default: %default]') + parser.add_option_group(eval_options) + + # multi + rep_options = 
OptionGroup(parser, 'replication options') + rep_options.add_option('-c', dest='crosses', + default=1, type='int', + help='Number of cross-fold rounds [Default:%default]') + rep_options.add_option('--checkpoint', dest='checkpoint', + default=False, action='store_true', + help='Restart training from checkpoint [Default: %default]') + rep_options.add_option('-e', dest='conda_env', + default='tf12', + help='Anaconda environment [Default: %default]') + rep_options.add_option('-f', dest='fold_subset', + default=None, type='int', + help='Run a subset of folds [Default:%default]') + rep_options.add_option('--name', dest='name', + default='fold', help='SLURM name prefix [Default: %default]') + rep_options.add_option('-p', dest='processes', + default=None, type='int', + help='Number of processes, passed by multi script') + rep_options.add_option('-q', dest='queue', + default='titan_rtx', + help='SLURM queue on which to run the jobs [Default: %default]') + rep_options.add_option('-r', '--restart', dest='restart', + default=False, action='store_true') + rep_options.add_option('--setup', dest='setup', + default=False, action='store_true', + help='Setup folds data directory only [Default: %default]') + rep_options.add_option('--spec_off', dest='spec_off', + default=False, action='store_true') + rep_options.add_option('--eval_off', dest='eval_off', + default=False, action='store_true') + rep_options.add_option('--eval_train_off', dest='eval_train_off', + default=False, action='store_true') + parser.add_option_group(rep_options) + + (options, args) = parser.parse_args() + + if len(args) < 2: + parser.error('Must provide parameters and data directory.') + else: + params_file = os.path.abspath(args[0]) + data_dirs = [os.path.abspath(arg) for arg in args[1:]] + + ####################################################### + # prep work + + if not options.restart and os.path.isdir(options.out_dir): + print('Output directory %s exists. Please remove.' 
% options.out_dir) + exit(1) + os.makedirs(options.out_dir, exist_ok=True) + + # read model parameters + with open(params_file) as params_open: + params = json.load(params_open) + params_train = params['train'] + + # copy params into output directory + shutil.copy(params_file, '%s/params.json' % options.out_dir) + + # read data parameters + num_data = len(data_dirs) + data_stats_file = '%s/statistics.json' % data_dirs[0] + with open(data_stats_file) as data_stats_open: + data_stats = json.load(data_stats_open) + + # count folds + num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')]) + + # subset folds + if options.fold_subset is not None: + num_folds = min(options.fold_subset, num_folds) + + if options.queue == 'standard': + num_cpu = 8 + num_gpu = 0 + time_base = 64 + else: + num_cpu = 2 + num_gpu = 1 + time_base = 24 + + # arrange data + for ci in range(options.crosses): + for fi in range(num_folds): + rep_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) + os.makedirs(rep_dir, exist_ok=True) + + # make data directories + for di in range(num_data): + rep_data_dir = '%s/data%d' % (rep_dir, di) + if not os.path.isdir(rep_data_dir): + make_rep_data(data_dirs[di], rep_data_dir, fi, ci) + + if options.setup: + exit(0) + + cmd_source = 'source /home/yuanh/.bashrc;' + hound_train = '/home/yuanh/programs/source/python_packages/baskerville/scripts/hound_train.py' + ####################################################### + # train + + jobs = [] + + for ci in range(options.crosses): + for fi in range(num_folds): + rep_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) + + train_dir = '%s/train' % rep_dir + if options.restart and not options.checkpoint and os.path.isdir(train_dir): + print('%s found and skipped.' 
% rep_dir) + + else: + # collect data directories + rep_data_dirs = [] + for di in range(num_data): + rep_data_dirs.append('%s/data%d' % (rep_dir, di)) + + # if options.checkpoint: + # os.rename('%s/train.out' % rep_dir, '%s/train1.out' % rep_dir) + + # train command + cmd = cmd_source + cmd += ' conda activate %s;' % options.conda_env + cmd += ' echo $HOSTNAME;' + + cmd += ' %s' %hound_train + cmd += ' %s' % options_string(options, train_options, rep_dir) + cmd += ' %s %s' % (params_file, ' '.join(rep_data_dirs)) + + name = '%s-train-f%dc%d' % (options.name, fi, ci) + sbf = os.path.abspath('%s/train.sb' % rep_dir) + outf = os.path.abspath('%s/train.%%j.out' % rep_dir) + errf = os.path.abspath('%s/train.%%j.err' % rep_dir) + + j = slurm.Job(cmd, name, + outf, errf, sbf, + queue=options.queue, + cpu=4, + gpu=params_train.get('num_gpu',1), + mem=30000, time='60-0:0:0') + jobs.append(j) + + slurm.multi_run(jobs, max_proc=options.processes, verbose=True, + launch_sleep=10, update_sleep=60) + + + ####################################################### + # evaluate training set + + jobs = [] + + if not options.eval_train_off: + for ci in range(options.crosses): + for fi in range(num_folds): + it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) + + for di in range(num_data): + if num_data == 1: + out_dir = '%s/eval_train' % it_dir + model_file = '%s/train/model_check.h5' % it_dir + else: + out_dir = '%s/eval%d_train' % (it_dir, di) + model_file = '%s/train/model%d_check.h5' % (it_dir, di) + + # check if done + acc_file = '%s/acc.txt' % out_dir + if os.path.isfile(acc_file): + print('%s already generated.' 
% acc_file) + else: + # hound evaluate + cmd = cmd_source + cmd += ' conda activate %s;' % options.conda_env + cmd += ' echo $HOSTNAME;' + cmd += ' hound_eval.py' + cmd += ' --head %d' % di + cmd += ' -o %s' % out_dir + if options.rc: + cmd += ' --rc' + if options.shifts: + cmd += ' --shifts %s' % options.shifts + cmd += ' --split train' + cmd += ' %s' % params_file + cmd += ' %s' % model_file + cmd += ' %s/data%d' % (it_dir, di) + + name = '%s-evaltr-f%dc%d' % (options.name, fi, ci) + job = slurm.Job(cmd, + name=name, + out_file='%s.out'%out_dir, + err_file='%s.err'%out_dir, + queue=options.queue, + cpu=num_cpu, gpu=num_gpu, + mem=30000, + time='%d:00:00' % (3*time_base)) + jobs.append(job) + + + ####################################################### + # evaluate test set + + if not options.eval_off: + for ci in range(options.crosses): + for fi in range(num_folds): + it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) + + for di in range(num_data): + if num_data == 1: + out_dir = '%s/eval' % it_dir + model_file = '%s/train/model_best.h5' % it_dir + else: + out_dir = '%s/eval%d' % (it_dir, di) + model_file = '%s/train/model%d_best.h5' % (it_dir, di) + + # check if done + acc_file = '%s/acc.txt' % out_dir + if os.path.isfile(acc_file): + print('%s already generated.' 
% acc_file) + else: + cmd = cmd_source + cmd += ' conda activate %s;' % options.conda_env + cmd += ' echo $HOSTNAME;' + cmd += ' hound_eval.py' + cmd += ' --head %d' % di + cmd += ' -o %s' % out_dir + if options.rc: + cmd += ' --rc' + if options.shifts: + cmd += ' --shifts %s' % options.shifts + if options.rank_corr: + cmd += ' --rank' + cmd += ' --step %d' % options.step + cmd += ' %s' % params_file + cmd += ' %s' % model_file + cmd += ' %s/data%d' % (it_dir, di) + + name = '%s-eval-f%dc%d' % (options.name, fi, ci) + job = slurm.Job(cmd, + name=name, + out_file='%s.out'%out_dir, + err_file='%s.err'%out_dir, + queue=options.queue, + cpu=num_cpu, gpu=num_gpu, + mem=30000, + time='%d:00:00' % time_base) + jobs.append(job) + + ####################################################### + # evaluate test specificity + + if not options.spec_off: + for ci in range(options.crosses): + for fi in range(num_folds): + it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) + + for di in range(num_data): + if num_data == 1: + out_dir = '%s/eval_spec' % it_dir + model_file = '%s/train/model_best.h5' % it_dir + else: + out_dir = '%s/eval%d_spec' % (it_dir, di) + model_file = '%s/train/model%d_best.h5' % (it_dir, di) + + # check if done + acc_file = '%s/acc.txt' % out_dir + if os.path.isfile(acc_file): + print('%s already generated.' 
% acc_file) + else: + cmd = cmd_source + cmd += ' conda activate %s;' % options.conda_env + cmd += ' echo $HOSTNAME;' + cmd += ' hound_eval_spec.py' + cmd += ' --head %d' % di + cmd += ' -o %s' % out_dir + cmd += ' --step %d' % options.step + if options.rc: + cmd += ' --rc' + if options.shifts: + cmd += ' --shifts %s' % options.shifts + cmd += ' %s' % params_file + cmd += ' %s' % model_file + cmd += ' %s/data%d' % (it_dir, di) + + name = '%s-spec-f%dc%d' % (options.name, fi, ci) + job = slurm.Job(cmd, + name=name, + out_file='%s.out'%out_dir, + err_file='%s.err'%out_dir, + queue=options.queue, + cpu=num_cpu, gpu=num_gpu, + mem=150000, + time='%d:00:00' % (5*time_base)) + jobs.append(job) + + slurm.multi_run(jobs, max_proc=options.processes, verbose=True, + launch_sleep=10, update_sleep=60) + + +def make_rep_data(data_dir, rep_data_dir, fi, ci): + # read data parameters + data_stats_file = '%s/statistics.json' % data_dir + with open(data_stats_file) as data_stats_open: + data_stats = json.load(data_stats_open) + + # sequences per fold + fold_seqs = [] + dfi = 0 + while 'fold%d_seqs'%dfi in data_stats: + fold_seqs.append(data_stats['fold%d_seqs'%dfi]) + del data_stats['fold%d_seqs'%dfi] + dfi += 1 + num_folds = dfi + + # split folds into train/valid/test + test_fold = fi + valid_fold = (fi+1+ci) % num_folds + train_folds = [fold for fold in range(num_folds) if fold not in [valid_fold,test_fold]] + + # clear existing directory + if os.path.isdir(rep_data_dir): + shutil.rmtree(rep_data_dir) + + # make data directory + os.makedirs(rep_data_dir, exist_ok=True) + + # dump data stats + data_stats['test_seqs'] = fold_seqs[test_fold] + data_stats['valid_seqs'] = fold_seqs[valid_fold] + data_stats['train_seqs'] = sum([fold_seqs[tf] for tf in train_folds]) + with open('%s/statistics.json'%rep_data_dir, 'w') as data_stats_open: + json.dump(data_stats, data_stats_open, indent=4) + + # set sequence tvt + try: + seqs_bed_out = open('%s/sequences.bed'%rep_data_dir, 'w') + for line 
in open('%s/sequences.bed'%data_dir): + a = line.split() + sfi = int(a[-1].replace('fold','')) + if sfi == test_fold: + a[-1] = 'test' + elif sfi == valid_fold: + a[-1] = 'valid' + else: + a[-1] = 'train' + print('\t'.join(a), file=seqs_bed_out) + seqs_bed_out.close() + except (ValueError, FileNotFoundError): + pass + + # copy targets + shutil.copy('%s/targets.txt'%data_dir, '%s/targets.txt'%rep_data_dir) + + # sym link tfrecords + rep_tfr_dir = '%s/tfrecords' % rep_data_dir + os.mkdir(rep_tfr_dir) + + # test tfrecords + ti = 0 + test_tfrs = natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, test_fold))) + for test_tfr in test_tfrs: + test_tfr = os.path.abspath(test_tfr) + test_rep_tfr = '%s/test-%d.tfr' % (rep_tfr_dir, ti) + os.symlink(test_tfr, test_rep_tfr) + ti += 1 + + # valid tfrecords + ti = 0 + valid_tfrs = natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, valid_fold))) + for valid_tfr in valid_tfrs: + valid_tfr = os.path.abspath(valid_tfr) + valid_rep_tfr = '%s/valid-%d.tfr' % (rep_tfr_dir, ti) + os.symlink(valid_tfr, valid_rep_tfr) + ti += 1 + + # train tfrecords + ti = 0 + train_tfrs = [] + for tfi in train_folds: + train_tfrs += natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, tfi))) + for train_tfr in train_tfrs: + train_tfr = os.path.abspath(train_tfr) + train_rep_tfr = '%s/train-%d.tfr' % (rep_tfr_dir, ti) + os.symlink(train_tfr, train_rep_tfr) + ti += 1 + + +def options_string(options, train_options, rep_dir): + options_str = '' + + for opt in train_options.option_list: + opt_str = opt.get_opt_string() + opt_value = options.__dict__[opt.dest] + + # wrap askeriks in "" + if type(opt_value) == str and opt_value.find('*') != -1: + opt_value = '"%s"' % opt_value + + # no value for bools + elif type(opt_value) == bool: + if not opt_value: + opt_str = '' + opt_value = '' + + # skip Nones + elif opt_value is None: + opt_str = '' + opt_value = '' + + # modify + elif opt.dest == 'out_dir': + opt_value = '%s/train' % rep_dir + 
+ # find matching restore + elif opt.dest == 'restore': + fold_dir_mid = rep_dir.split('/')[-1] + if options.trunk: + opt_value = '%s/%s/train/model_trunk.h5' % (opt_value, fold_dir_mid) + else: + opt_value = '%s/%s/train/model_best.h5' % (opt_value, fold_dir_mid) + + options_str += ' %s %s' % (opt_str, opt_value) + + return options_str + + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 5c55f52..6503815 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # ========================================================================= +# modified fit2 to: +# show progress bar during training +# save gpu memory information + import time import pdb @@ -19,6 +23,7 @@ import tensorflow as tf from baskerville import metrics +from tensorflow.keras import mixed_precision def parse_loss( @@ -53,6 +58,14 @@ def parse_loss( loss_fn = metrics.PoissonMultinomial( total_weight, reduction=tf.keras.losses.Reduction.NONE ) + elif loss_label == "poisson_kl": + loss_fn = metrics.PoissonKL( + spec_weight, reduction=tf.keras.losses.Reduction.NONE + ) + elif loss_label == "mse_udot": + loss_fn = metrics.MeanSquaredErrorUDot( + spec_weight, reduction=tf.keras.losses.Reduction.NONE + ) else: loss_fn = tf.keras.losses.Poisson(reduction=tf.keras.losses.Reduction.NONE) else: @@ -94,6 +107,7 @@ def __init__( strategy=None, num_gpu: int = 1, keras_fit: bool = False, + loss_scale: bool = False, ): self.params = params self.train_data = train_data @@ -107,6 +121,7 @@ def __init__( self.num_gpu = num_gpu self.batch_size = self.train_data[0].batch_size self.compiled = False + self.loss_scale = 
loss_scale # early stopping self.patience = self.params.get("patience", 20) @@ -133,7 +148,7 @@ def __init__( ) # optimizer - self.make_optimizer() + self.make_optimizer(loss_scale=loss_scale) def compile(self, seqnn_model): for model in seqnn_model.models: @@ -396,6 +411,11 @@ def eval_step1_distr(xd, yd): ################################################################ # training loop + gpu_memory_callback = GPUMemoryUsageCallback() + file_path='%s/gpu_mem.txt' % self.out_dir + with open(file_path, 'w') as file: + file.write('epoch\tbatch\tgpu_mem(GB)\n') + first_step = True for ei in range(epoch_start, self.train_epochs_max): if ei >= self.train_epochs_min and np.min(unimproved) > self.patience: @@ -406,10 +426,11 @@ def eval_step1_distr(xd, yd): # get iterators train_data_iters = [iter(td.dataset) for td in self.train_data] - + # train t0 = time.time() - for di in self.dataset_indexes: + prog_bar = tf.keras.utils.Progbar(len(self.dataset_indexes)) # Create Keras Progbar + for didx, di in enumerate(self.dataset_indexes): x, y = safe_next(train_data_iters[di]) if self.strategy is None: if di == 0: @@ -424,7 +445,13 @@ def eval_step1_distr(xd, yd): if first_step: print("Successful first step!", flush=True) first_step = False - + prog_bar.add(1) + + if (ei == epoch_start) and (didx < 1000) and (didx%100 == 1): + mem=gpu_memory_callback.on_batch_end() + file = open(file_path, 'a') + file.write("%d\t%d\t%.2f\n"%(ei, didx, mem)) + print("Epoch %d - %ds" % (ei, (time.time() - t0))) for di in range(self.num_datasets): print(" Data %d" % di, end="") @@ -486,6 +513,7 @@ def eval_step1_distr(xd, yd): valid_r[di].reset_states() valid_r2[di].reset_states() + def fit_tape(self, seqnn_model): """Train the model using a custom tf.GradientTape loop.""" if not self.compiled: @@ -588,6 +616,11 @@ def eval_step_distr(xd, yd): unimproved = 0 # training loop + gpu_memory_callback = GPUMemoryUsageCallback() + file_path='%s/gpu_mem.txt' % self.out_dir + with open(file_path, 'w') as 
file: + file.write('epoch\tbatch\tgpu_mem(GB)\n') + for ei in range(epoch_start, self.train_epochs_max): if ei >= self.train_epochs_min and unimproved > self.patience: break @@ -604,6 +637,12 @@ def eval_step_distr(xd, yd): if ei == epoch_start and si == 0: print("Successful first step!", flush=True) + # print gpu memory usage + if (ei == epoch_start) and (si < 1000) and (si%100 == 1): + mem=gpu_memory_callback.on_batch_end() + with open(file_path, 'a') as file: + file.write("%d\t%d\t%.2f\n"%(ei, si, mem)) + # evaluate for x, y in self.eval_data[0].dataset: if self.strategy is not None: @@ -660,7 +699,7 @@ def eval_step_distr(xd, yd): valid_r.reset_states() valid_r2.reset_states() - def make_optimizer(self): + def make_optimizer(self, loss_scale=False): """Make optimizer object from given parameters.""" cyclical1 = True for lrs_param in [ @@ -715,12 +754,17 @@ def make_optimizer(self): # optimizer optimizer_type = self.params.get("optimizer", "sgd").lower() if optimizer_type == "adam": + if loss_scale: + epsilon_value = 1e-04 + else: + epsilon_value = 1e-07 self.optimizer = tf.keras.optimizers.Adam( learning_rate=lr_schedule, beta_1=self.params.get("adam_beta1", 0.9), beta_2=self.params.get("adam_beta2", 0.999), clipnorm=clip_norm, global_clipnorm=global_clipnorm, + epsilon=epsilon_value, amsgrad=False, ) # reduces performance in my experience @@ -747,6 +791,9 @@ def make_optimizer(self): print("Cannot recognize optimization algorithm %s" % optimizer_type) exit(1) + if loss_scale: + self.optimizer = mixed_precision.LossScaleOptimizer(self.optimizer) + ################################################################ # AGC @@ -964,3 +1011,25 @@ def safe_next(data_iter, retry=5, sleep=10): d = next(data_iter) return d + + +def CheckGradientNA(gradients): + for grad in gradients: + if grad is not None: + if tf.reduce_any(tf.math.is_nan(grad)): + raise ValueError("NaN gradient detected.") + +# Define a custom callback class to track GPU memory usage +class 
GPUMemoryUsageCallback(tf.keras.callbacks.Callback): + def on_train_begin(self, logs=None): + # Enable memory growth to avoid GPU memory allocation issues + physical_devices = tf.config.experimental.list_physical_devices('GPU') + if physical_devices: + for device in physical_devices: + tf.config.experimental.set_memory_growth(device, True) + + def on_batch_end(self, logs=None): + gpu_memory = tf.config.experimental.get_memory_info('GPU:0') + current_memory = gpu_memory['peak'] / 1e9 # Convert to GB + return current_memory + From 6e79986a117afbe94968c71ccf65a4c1d706c3e6 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 19 Sep 2023 17:19:00 -0700 Subject: [PATCH 02/26] create_model_with_adapter_by_specify_json --- src/baskerville/blocks.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py index 2490f2e..d71bb3e 100644 --- a/src/baskerville/blocks.py +++ b/src/baskerville/blocks.py @@ -1188,6 +1188,8 @@ def transformer( qkv_width=1, mha_initializer="he_normal", kernel_initializer="he_normal", + adapter=None, + latent=16, **kwargs, ): """Construct a transformer block. 
@@ -1225,6 +1227,10 @@ def transformer( if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) + # add houlsby-adapter + if adapter=='houlsby': + current = layers.AdapterHoulsby(latent_size=latent)(current) + # residual current = tf.keras.layers.Add()([inputs, current]) @@ -1232,7 +1238,7 @@ def transformer( final = current else: final = transformer_dense( - current, out_size, dense_expansion, l2_scale, dropout, kernel_initializer + current, out_size, dense_expansion, l2_scale, dropout, kernel_initializer, adapter, latent ) return final @@ -1344,7 +1350,8 @@ def transformer_split( def transformer_dense( - inputs, out_size, dense_expansion, l2_scale, dropout, kernel_initializer + inputs, out_size, dense_expansion, l2_scale, dropout, kernel_initializer, + adapter=None, latent=16 ): """Transformer block dense portion.""" # layer norm @@ -1376,6 +1383,9 @@ def transformer_dense( if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) + if adapter=='houlsby': + current = layers.AdapterHoulsby(latent_size=latent)(current) + # residual final = tf.keras.layers.Add()([inputs, current]) From ae31d2fe433ecdcf101dd0ac6e5a1e254587fab3 Mon Sep 17 00:00:00 2001 From: hy395 Date: Thu, 21 Sep 2023 16:15:03 -0700 Subject: [PATCH 03/26] lora --- src/baskerville/layers.py | 69 ++++++++++++++++++- src/baskerville/scripts/hound_train.py | 31 ++++++++- .../scripts/westminster_train_folds_copy.py | 25 ++++++- 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index 1d28c2c..028a6a5 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -26,8 +26,75 @@ ##################### # transfer learning # ##################### + +class Lora(tf.keras.layers.Layer): + # https://arxiv.org/abs/2106.09685 + # adapted from: + # https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/ + # https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py + + def 
__init__(self, + original_layer, + rank=8, + alpha=16, + trainable=True, + **kwargs): + + # keep the name of this layer the same as the original dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.output_dim = original_layer_config["units"] + + if rank > self.output_dim: + raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + + self.rank = rank + self.alpha = alpha + self.scale = alpha / rank + self.original_layer = original_layer + self.original_layer.trainable = False + + # Note: the original paper mentions that normal distribution was + # used for initialization. However, the official LoRA implementation + # uses "Kaiming/He Initialization". + self.down_layer = tf.keras.layers.Dense( + units=rank, + use_bias=False, + kernel_initializer=tf.keras.initializers.HeUniform(), + #kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), + trainable=trainable, + name="lora_a" + ) + + self.up_layer = tf.keras.layers.Dense( + units=self.output_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Zeros(), + trainable=trainable, + name="lora_b" + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + lora_output = self.up_layer(self.down_layer(inputs)) * self.scale + return original_output + lora_output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "rank": self.rank, + "alpha": self.alpha + } + ) + return config + class AdapterHoulsby(tf.keras.layers.Layer): - ### Houlsby et al. 
2019 implementation + # https://arxiv.org/abs/1902.00751 + # adapted from: https://github.com/jain-harshil/Adapter-BERT def __init__( self, diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index d5a754d..871ff51 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -71,7 +71,7 @@ def main(): parser.add_argument( "--transfer_mode", default="full", - help="transfer method. [full, linear, adapter]", + help="transfer method. [full, linear, adapterHoulsby, lora, lora_full]", ) parser.add_argument( "--latent", @@ -185,6 +185,16 @@ def main(): seqnn_model.model_trunk = None seqnn_model.model.summary() + elif args.transfer_mode=='lora': + seqnn_model.model_trunk.trainable=False + add_lora(seqnn_model.model, rank=args.latent, mode='default') + seqnn_model.model.summary() + + elif args.transfer_mode=='lora_full': + seqnn_model.model_trunk.trainable=False + add_lora(seqnn_model.model, rank=args.latent, mode='full') + seqnn_model.model.summary() + # initialize trainer seqnn_trainer = trainer.Trainer( params_train, train_data, eval_data, args.out_dir @@ -312,6 +322,25 @@ def make_adapter_model(input_model, strand_pair, latent_size=16): if re.match('layer_normalization', l.name): l.trainable = True return model_adapter + +def add_lora(input_model, rank=8, alpha=16, mode='default'): + # take seqnn.model as input + # replace _q_layer, _v_layer in multihead_attention + # optionally replace _k_layer, _embedding_layer + if mode not in ['default','full']: + raise ValueError("mode must be default or full") + + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + # default loRA + layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha) + layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha) + # full loRA + if mode=='full': + layer._k_layer = layers.Lora(layer._k_layer, rank=rank, alpha=alpha) + layer._embedding_layer = 
layers.Lora(layer._embedding_layer, rank=rank, alpha=alpha) + + ################################################################################ # __main__ ################################################################################ diff --git a/src/baskerville/scripts/westminster_train_folds_copy.py b/src/baskerville/scripts/westminster_train_folds_copy.py index 6f27ec5..777d784 100755 --- a/src/baskerville/scripts/westminster_train_folds_copy.py +++ b/src/baskerville/scripts/westminster_train_folds_copy.py @@ -62,6 +62,18 @@ def main(): help='Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]') parser.add_option_group(train_options) + # transfer options + transfer_options = OptionGroup(parser, 'transfer options') + transfer_options.add_option('--transfer', dest='transfer', + default=False, action='store_true', + help='whether to do transfer learning.') + transfer_options.add_option('--pretrain', dest='pretrain', + default=None, help='path to pretrained model trunk.') + transfer_options.add_option('--transfer_mode', dest='transfer_mode', + default='linear', help='transfer method.') + transfer_options.add_option('--latent', dest='latent', type='int', + default=0, help='latent size. 
') + # eval eval_options = OptionGroup(parser, 'hound_eval.py options') eval_options.add_option('--rank', dest='rank_corr', @@ -87,7 +99,7 @@ def main(): default=False, action='store_true', help='Restart training from checkpoint [Default: %default]') rep_options.add_option('-e', dest='conda_env', - default='tf12', + default='tf2.12', help='Anaconda environment [Default: %default]') rep_options.add_option('-f', dest='fold_subset', default=None, type='int', @@ -175,7 +187,7 @@ def main(): exit(0) cmd_source = 'source /home/yuanh/.bashrc;' - hound_train = '/home/yuanh/programs/source/python_packages/baskerville/scripts/hound_train.py' + hound_train = 'hound_train.py' ####################################################### # train @@ -205,6 +217,15 @@ def main(): cmd += ' %s' %hound_train cmd += ' %s' % options_string(options, train_options, rep_dir) + + # transfer learning options + if options.transfer: + cmd += ' --restore %s/f%dc%d.h5' % (options.pretrain, fi, ci) + cmd += ' --trunk' + cmd += ' --transfer_mode %s' % options.transfer_mode + if options.latent!=0: + cmd += ' --latent %d' % options.latent + cmd += ' %s %s' % (params_file, ' '.join(rep_data_dirs)) name = '%s-train-f%dc%d' % (options.name, fi, ci) From 0ba9122081a9b85be2a51c709734727280e626e2 Mon Sep 17 00:00:00 2001 From: hy395 Date: Mon, 2 Oct 2023 23:34:44 -0700 Subject: [PATCH 04/26] update lora ia3 --- src/baskerville/layers.py | 48 ++- src/baskerville/scripts/hound_train.py | 169 +-------- src/baskerville/scripts/hound_transfer.py | 418 ++++++++++++++++++++++ 3 files changed, 478 insertions(+), 157 deletions(-) create mode 100755 src/baskerville/scripts/hound_transfer.py diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index 028a6a5..6996ce1 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -26,10 +26,52 @@ ##################### # transfer learning # ##################### +class IA3(tf.keras.layers.Layer): + # activation-rescale adapter: + # 
https://arxiv.org/pdf/2205.05638.pdf + + def __init__(self, + original_layer, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.output_dim = original_layer_config["units"] + + self.original_layer = original_layer + self.original_layer.trainable = False + + # IA3 weights. Make it a dense layer to control trainable + self._ia3_layer = tf.keras.layers.Dense( + units=self.output_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Ones(), + trainable=True, + name="ia3" + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + return original_output * scaler + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "size": self.output_dim, + } + ) + return config class Lora(tf.keras.layers.Layer): - # https://arxiv.org/abs/2106.09685 # adapted from: + # https://arxiv.org/abs/2106.09685 # https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/ # https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py @@ -37,7 +79,7 @@ def __init__(self, original_layer, rank=8, alpha=16, - trainable=True, + trainable=False, **kwargs): # keep the name of this layer the same as the original dense layer. 
@@ -554,12 +596,14 @@ def __init__( shape=[1, self._num_heads, 1, self._key_size], initializer=self._initializer, dtype=tf.float32, + trainable=True, ) self._r_r_bias = self.add_weight( "%s/r_r_bias" % self.name, shape=[1, self._num_heads, 1, self._key_size], initializer=self._initializer, dtype=tf.float32, + trainable=True, ) def _multihead_output(self, linear_layer, inputs): diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index 871ff51..032d6f0 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -17,7 +17,6 @@ import json import os import shutil -import re import numpy as np import pandas as pd @@ -27,7 +26,6 @@ from baskerville import dataset from baskerville import seqnn from baskerville import trainer -from baskerville import layers """ hound_train.py @@ -35,6 +33,7 @@ Train Hound model using given parameters and data. """ + def main(): parser = argparse.ArgumentParser(description="Train a model.") parser.add_argument( @@ -68,17 +67,6 @@ def main(): default=False, help="Restore only model trunk [Default: %(default)s]", ) - parser.add_argument( - "--transfer_mode", - default="full", - help="transfer method. 
[full, linear, adapterHoulsby, lora, lora_full]", - ) - parser.add_argument( - "--latent", - type=int, - default=16, - help="adapter latent size.", - ) parser.add_argument( "--tfr_train", default=None, @@ -143,75 +131,31 @@ def main(): tfr_pattern=args.tfr_eval, ) ) - + params_model["strand_pair"] = strand_pairs if args.mixed_precision: - policy = mixed_precision.Policy('mixed_float16') - mixed_precision.set_global_policy(policy) - + mixed_precision.set_global_policy("mixed_float16") + if params_train.get("num_gpu", 1) == 1: ######################################## # one GPU # initialize model seqnn_model = seqnn.SeqNN(params_model) - + # restore if args.restore: seqnn_model.restore(args.restore, trunk=args.trunk) - # transfer learning strategies - if args.transfer_mode=='full': - seqnn_model.model.trainable=True - - elif args.transfer_mode=='batch_norm': - seqnn_model.model_trunk.trainable=False - for l in seqnn_model.model.layers: - if l.name.startswith("batch_normalization"): - l.trainable=True - seqnn_model.model.summary() - - elif args.transfer_mode=='linear': - seqnn_model.model_trunk.trainable=False - seqnn_model.model.summary() - - elif args.transfer_mode=='adapterHoulsby': - seqnn_model.model_trunk.trainable=False - strand_pair = strand_pairs[0] - adapter_model = make_adapter_model(seqnn_model.model, strand_pair, args.latent) - seqnn_model.model = adapter_model - seqnn_model.models[0] = seqnn_model.model - seqnn_model.model_trunk = None - seqnn_model.model.summary() - - elif args.transfer_mode=='lora': - seqnn_model.model_trunk.trainable=False - add_lora(seqnn_model.model, rank=args.latent, mode='default') - seqnn_model.model.summary() - - elif args.transfer_mode=='lora_full': - seqnn_model.model_trunk.trainable=False - add_lora(seqnn_model.model, rank=args.latent, mode='full') - seqnn_model.model.summary() - # initialize trainer seqnn_trainer = trainer.Trainer( params_train, train_data, eval_data, args.out_dir ) - + # compile model 
seqnn_trainer.compile(seqnn_model) - # train model - if args.keras_fit: - seqnn_trainer.fit_keras(seqnn_model) - else: - if len(args.data_dirs) == 1: - seqnn_trainer.fit_tape(seqnn_model) - else: - seqnn_trainer.fit2(seqnn_model) - else: ######################################## # multi GPU @@ -219,7 +163,6 @@ def main(): strategy = tf.distribute.MirroredStrategy() with strategy.scope(): - if not args.keras_fit: # distribute data for di in range(len(args.data_dirs)): @@ -247,102 +190,18 @@ def main(): # compile model seqnn_trainer.compile(seqnn_model) - # train model - if args.keras_fit: - seqnn_trainer.fit_keras(seqnn_model) - else: - if len(args.data_dirs) == 1: - seqnn_trainer.fit_tape(seqnn_model) - else: - seqnn_trainer.fit2(seqnn_model) - -def make_adapter_model(input_model, strand_pair, latent_size=16): - # take seqnn_model as input - # output a new seqnn_model object - # only the adapter, and layer_norm are trainable - - model = tf.keras.Model(inputs=input_model.input, - outputs=input_model.layers[-2].output) # remove the switch_reverse layer - - # save current graph - layer_parent_dict_old = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = node.outbound_layer.name - if layer_name not in layer_parent_dict_old: - layer_parent_dict_old.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict_old[layer_name]: - layer_parent_dict_old[layer_name].append(layer.name) - - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({model.layers[0].name: model.input}) - - # remove switch_reverse - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - - # Iterate over all layers after the input - model_outputs = [] - reverse_bool = None - - for layer in model.layers[1:]: - - # parent layers - parent_layers = 
layer_parent_dict_old[layer.name] - - # layer inputs - layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - - # insert adapter: - elif re.match('add', layer.name): - if any([re.match('dropout', i) for i in parent_layers]): - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) - x = layer([layer_input[0], x]) - else: - x = layer(layer_input) - + # train model + if args.keras_fit: + seqnn_trainer.fit_keras(seqnn_model) + else: + if len(args.data_dirs) == 1: + seqnn_trainer.fit_tape(seqnn_model) else: - x = layer(layer_input) - - # save the output tensor of every layer - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) - - # set layer_norm layers to trainable - for l in model_adapter.layers: - if re.match('layer_normalization', l.name): l.trainable = True - - return model_adapter - -def add_lora(input_model, rank=8, alpha=16, mode='default'): - # take seqnn.model as input - # replace _q_layer, _v_layer in multihead_attention - # optionally replace _k_layer, _embedding_layer - if mode not in ['default','full']: - raise ValueError("mode must be default or full") - - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - # default loRA - layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha) - layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha) - # full loRA - if mode=='full': - layer._k_layer = layers.Lora(layer._k_layer, rank=rank, alpha=alpha) - layer._embedding_layer = layers.Lora(layer._embedding_layer, rank=rank, alpha=alpha) + seqnn_trainer.fit2(seqnn_model) 
################################################################################ # __main__ ################################################################################ if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py new file mode 100755 index 0000000..36dbab8 --- /dev/null +++ b/src/baskerville/scripts/hound_transfer.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python +# Copyright 2023 Calico LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +import argparse +import json +import os +import shutil +import re + +import numpy as np +import pandas as pd +import tensorflow as tf +from tensorflow.keras import mixed_precision + +from baskerville import dataset +from baskerville import seqnn +from baskerville import trainer +from baskerville import layers + +""" +hound_transfer.py + +Modified from hound_train.py. +Additional argument to allow for transfer learning from existing Hound model. 
+""" + +def main(): + parser = argparse.ArgumentParser(description="Train a model.") + parser.add_argument( + "-k", + "--keras_fit", + action="store_true", + default=False, + help="Train with Keras fit method [Default: %(default)s]", + ) + parser.add_argument( + "-m", + "--mixed_precision", + action="store_true", + default=False, + help="Train with mixed precision [Default: %(default)s]", + ) + parser.add_argument( + "-o", + "--out_dir", + default="train_out", + help="Output directory [Default: %(default)s]", + ) + parser.add_argument( + "--restore", + default=None, + help="Restore model and continue training [Default: %(default)s]", + ) + parser.add_argument( + "--trunk", + action="store_true", + default=False, + help="Restore only model trunk [Default: %(default)s]", + ) + parser.add_argument( + "--transfer_mode", + default="full", + help="transfer method. [full, linear, adapterHoulsby, lora, lora_full, ia3]", + ) + parser.add_argument( + "--latent", + type=int, + default=16, + help="adapter latent size.", + ) + parser.add_argument( + "--tfr_train", + default=None, + help="Training TFR pattern string appended to data_dir/tfrecords [Default: %(default)s]", + ) + parser.add_argument( + "--tfr_eval", + default=None, + help="Evaluation TFR pattern string appended to data_dir/tfrecords [Default: %(default)s]", + ) + + parser.add_argument("params_file", help="JSON file with model parameters") + parser.add_argument( + "data_dirs", nargs="+", help="Train/valid/test data directorie(s)" + ) + args = parser.parse_args() + + if args.keras_fit and len(args.data_dirs) > 1: + print("Cannot use keras fit method with multi-genome training.") + exit() + + os.makedirs(args.out_dir, exist_ok=True) + if args.params_file != "%s/params.json" % args.out_dir: + shutil.copy(args.params_file, "%s/params.json" % args.out_dir) + + # read model parameters + with open(args.params_file) as params_open: + params = json.load(params_open) + params_model = params["model"] + params_train = 
params["train"] + + # read datasets + train_data = [] + eval_data = [] + strand_pairs = [] + + for data_dir in args.data_dirs: + # set strand pairs + targets_df = pd.read_csv("%s/targets.txt" % data_dir, sep="\t", index_col=0) + if "strand_pair" in targets_df.columns: + strand_pairs.append(np.array(targets_df.strand_pair)) + + # load train data + train_data.append( + dataset.SeqDataset( + data_dir, + split_label="train", + batch_size=params_train["batch_size"], + shuffle_buffer=params_train.get("shuffle_buffer", 128), + mode="train", + tfr_pattern=args.tfr_train, + ) + ) + + # load eval data + eval_data.append( + dataset.SeqDataset( + data_dir, + split_label="valid", + batch_size=params_train["batch_size"], + mode="eval", + tfr_pattern=args.tfr_eval, + ) + ) + + params_model["strand_pair"] = strand_pairs + + if args.mixed_precision: + policy = mixed_precision.Policy('mixed_float16') + mixed_precision.set_global_policy(policy) + + if params_train.get("num_gpu", 1) == 1: + ######################################## + # one GPU + + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + + # restore + if args.restore: + seqnn_model.restore(args.restore, trunk=args.trunk) + + # transfer learning strategies + if args.transfer_mode=='full': + seqnn_model.model.trainable=True + + elif args.transfer_mode=='batch_norm': + seqnn_model.model_trunk.trainable=False + for l in seqnn_model.model.layers: + if l.name.startswith("batch_normalization"): + l.trainable=True + seqnn_model.model.summary() + + elif args.transfer_mode=='linear': + seqnn_model.model_trunk.trainable=False + seqnn_model.model.summary() + + elif args.transfer_mode=='adapterHoulsby': + seqnn_model.model_trunk.trainable=False + strand_pair = strand_pairs[0] + adapter_model = make_adapter_model(seqnn_model.model, strand_pair, args.latent) + seqnn_model.model = adapter_model + seqnn_model.models[0] = seqnn_model.model + seqnn_model.model_trunk = None + seqnn_model.model.summary() + + elif 
args.transfer_mode=='lora': + add_lora(seqnn_model.model, rank=args.latent, mode='default') + seqnn_model.model.summary() + + elif args.transfer_mode=='lora_full': + add_lora(seqnn_model.model, rank=args.latent, mode='full') + seqnn_model.model.summary() + + elif args.transfer_mode=='ia3': + add_ia3(seqnn_model.model) + seqnn_model.model.summary() + + # initialize trainer + seqnn_trainer = trainer.Trainer( + params_train, train_data, eval_data, args.out_dir + ) + + # compile model + seqnn_trainer.compile(seqnn_model) + + # train model + if args.keras_fit: + seqnn_trainer.fit_keras(seqnn_model) + else: + if len(args.data_dirs) == 1: + seqnn_trainer.fit_tape(seqnn_model) + else: + seqnn_trainer.fit2(seqnn_model) + + else: + ######################################## + # multi GPU + + strategy = tf.distribute.MirroredStrategy() + + with strategy.scope(): + + if not args.keras_fit: + # distribute data + for di in range(len(args.data_dirs)): + train_data[di].distribute(strategy) + eval_data[di].distribute(strategy) + + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + + # restore + if args.restore: + seqnn_model.restore(args.restore, args.trunk) + + # initialize trainer + seqnn_trainer = trainer.Trainer( + params_train, + train_data, + eval_data, + args.out_dir, + strategy, + params_train["num_gpu"], + args.keras_fit, + ) + + # compile model + seqnn_trainer.compile(seqnn_model) + + # train model + if args.keras_fit: + seqnn_trainer.fit_keras(seqnn_model) + else: + if len(args.data_dirs) == 1: + seqnn_trainer.fit_tape(seqnn_model) + else: + seqnn_trainer.fit2(seqnn_model) + +def make_adapter_model(input_model, strand_pair, latent_size=16): + # take seqnn_model as input + # output a new seqnn_model object + # only the adapter, and layer_norm are trainable + + model = tf.keras.Model(inputs=input_model.input, + outputs=input_model.layers[-2].output) # remove the switch_reverse layer + + # save current graph + layer_parent_dict_old = {} # the parent layers of each 
layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict_old: + layer_parent_dict_old.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict_old[layer_name]: + layer_parent_dict_old[layer_name].append(layer.name) + + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({model.layers[0].name: model.input}) + + # remove switch_reverse + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + + for layer in model.layers[1:]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + + # insert adapter: + elif re.match('add', layer.name): + if any([re.match('dropout', i) for i in parent_layers]): + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) + x = layer([layer_input[0], x]) + else: + x = layer(layer_input) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) + + # set layer_norm layers to trainable + for l in model_adapter.layers: + if re.match('layer_normalization', l.name): l.trainable = True + + return model_adapter + +def add_lora(input_model, rank=8, alpha=16, mode='default'): + ###################### + # inject lora layers # + 
###################### + # take seqnn.model as input + # replace _q_layer, _v_layer in multihead_attention + # optionally replace _k_layer, _embedding_layer + if mode not in ['default','full']: + raise ValueError("mode must be default or full") + + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + # default loRA + layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) + layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) + # full loRA + if mode=='full': + layer._k_layer = layers.Lora(layer._k_layer, rank=rank, alpha=alpha, trainable=True) + layer._embedding_layer = layers.Lora(layer._embedding_layer, rank=rank, alpha=alpha, trainable=True) + + input_model(input_model.input) # initialize new variables + + ################# + # freeze params # + ################# + # freeze all params but lora + for layer in input_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name in ["lora_a", "lora_b"]: + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + input_model.layers[-2].trainable=True + + +def add_ia3(input_model): + ##################### + # inject ia3 layers # + ##################### + # take seqnn.model as input + # replace _k_layer, _v_layer, _embedding_layer in multihead_attention + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._k_layer = layers.IA3(layer._k_layer, trainable=True) + layer._v_layer = layers.IA3(layer._v_layer, trainable=True) + layer._embedding_layer = 
layers.IA3(layer._embedding_layer, trainable=True) + input_model(input_model.input) # instantiate model to initialize new variables + + ################# + # freeze params # + ################# + # set ia3 to trainable + for layer in input_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name =='ia3': + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + input_model.layers[-2].trainable=True + +def param_count(model): + trainable = int(sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)) + non_trainable = int(sum(tf.keras.backend.count_params(w) for w in model.non_trainable_weights)) + print('total params:%d' %(trainable + non_trainable)) + print('trainable params:%d' %trainable) + print('non-trainable params:%d' %non_trainable) + +################################################################################ +# __main__ +################################################################################ +if __name__ == "__main__": + main() From 9a459893c28a1810766ae6f77f9d0c5b3ca4c65d Mon Sep 17 00:00:00 2001 From: hy395 Date: Sat, 11 Nov 2023 02:08:12 -0800 Subject: [PATCH 05/26] implement lora/ia3 merge weight --- src/baskerville/HY_helper.py | 2 + src/baskerville/layers.py | 43 +- src/baskerville/scripts/hound_transfer.py | 341 ++++++----- .../scripts/westminster_train_folds_copy.py | 530 ------------------ src/baskerville/trainer.py | 6 +- 5 files changed, 184 insertions(+), 738 deletions(-) delete mode 100755 src/baskerville/scripts/westminster_train_folds_copy.py diff --git a/src/baskerville/HY_helper.py 
b/src/baskerville/HY_helper.py index f4f7878..2d8b665 100644 --- a/src/baskerville/HY_helper.py +++ b/src/baskerville/HY_helper.py @@ -3,6 +3,8 @@ import pysam import pyBigWig + + def make_seq_1hot(genome_open, chrm, start, end, seq_len): if start < 0: seq_dna = 'N'*(-start) + genome_open.fetch(chrm, 0, end) diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index 6996ce1..8f6af73 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -748,11 +748,14 @@ def get_config(self): class SqueezeExcite(tf.keras.layers.Layer): def __init__( self, - activation="relu", + activation='relu', additive=False, bottleneck_ratio=8, norm_type=None, bn_momentum=0.9, + use_bias=True, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', ): super(SqueezeExcite, self).__init__() self.activation = activation @@ -760,6 +763,9 @@ def __init__( self.norm_type = norm_type self.bn_momentum = bn_momentum self.bottleneck_ratio = bottleneck_ratio + self.kernel_initializer=kernel_initializer + self.bias_initializer=bias_initializer + self.use_bias=use_bias def build(self, input_shape): self.num_channels = input_shape[-1] @@ -778,26 +784,24 @@ def build(self, input_shape): exit(1) self.dense1 = tf.keras.layers.Dense( - units=self.num_channels // self.bottleneck_ratio, activation="relu" + units=self.num_channels // self.bottleneck_ratio, + activation="relu", + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer, + bias_initializer=self.bias_initializer, ) - self.dense2 = tf.keras.layers.Dense(units=self.num_channels, activation=None) - - # normalize - # if self.norm_type == 'batch-sync': - # self.norm = tf.keras.layers.experimental.SyncBatchNormalization( - # momentum=self.bn_momentum, gamma_initializer='zeros') - # elif self.norm_type == 'batch': - # self.norm = tf.keras.layers.BatchNormalization( - # momentum=self.bn_momentum, gamma_initializer='zeros') - # elif self.norm_type == 'layer': - # self.norm = 
tf.keras.layers.LayerNormalization( - # gamma_initializer='zeros') - # else: - # self.norm = None + + self.dense2 = tf.keras.layers.Dense( + units=self.num_channels, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer, + bias_initializer=self.bias_initializer, + activation=None) def call(self, x): # activate - x = activate(x, self.activation) + if self.activation is not None: + x = activate(x, self.activation) # squeeze squeeze = self.gap(x) @@ -805,8 +809,6 @@ def call(self, x): # excite excite = self.dense1(squeeze) excite = self.dense2(excite) - # if self.norm is not None: - # excite = self.norm(excite) # scale if self.one_or_two == "one": @@ -828,14 +830,15 @@ def get_config(self): { "activation": self.activation, "additive": self.additive, + "use_bias":self.use_bias, "norm_type": self.norm_type, "bn_momentum": self.bn_momentum, "bottleneck_ratio": self.bottleneck_ratio, + 'bottleneck_size': self.num_channels // self.bottleneck_ratio, } ) return config - class GlobalContext(tf.keras.layers.Layer): def __init__(self): super(GlobalContext, self).__init__() diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 36dbab8..0af7997 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -28,6 +28,7 @@ from baskerville import seqnn from baskerville import trainer from baskerville import layers +from baskerville import transfer_helper """ hound_transfer.py @@ -72,13 +73,32 @@ def main(): parser.add_argument( "--transfer_mode", default="full", - help="transfer method. [full, linear, adapterHoulsby, lora, lora_full, ia3]", + help="transfer method. 
[full, linear, adapter]", ) parser.add_argument( - "--latent", + "--att_adapter", + default=None, + type=str, + help="attention layer module [adapterHoulsby, lora, lora_full, ia3]", + ) + parser.add_argument( + "--att_latent", type=int, default=16, - help="adapter latent size.", + help="attention adapter latent size.", + ) + parser.add_argument( + "--conv_adapter", + default=None, + type=str, + help="conv layer module [conv, batch_norm, squez_excit]", + ) + + parser.add_argument( + "--se_ratio", + type=int, + default=16, + help="se bottleneck ratio.", ) parser.add_argument( "--tfr_train", @@ -105,6 +125,9 @@ def main(): if args.params_file != "%s/params.json" % args.out_dir: shutil.copy(args.params_file, "%s/params.json" % args.out_dir) + if args.transfer_mode not in ['full','linear','sparse']: + raise ValueError("transfer mode must be one of full, linear, sparse") + # read model parameters with open(args.params_file) as params_open: params = json.load(params_open) @@ -156,48 +179,112 @@ def main(): # one GPU # initialize model + params_model['verbose']=False seqnn_model = seqnn.SeqNN(params_model) # restore if args.restore: seqnn_model.restore(args.restore, trunk=args.trunk) - # transfer learning strategies + # head params + print('params in new head: %d' %transfer_helper.param_count(seqnn_model.model.layers[-2])) + + #################### + # transfer options # + #################### if args.transfer_mode=='full': seqnn_model.model.trainable=True - elif args.transfer_mode=='batch_norm': - seqnn_model.model_trunk.trainable=False - for l in seqnn_model.model.layers: - if l.name.startswith("batch_normalization"): - l.trainable=True - seqnn_model.model.summary() - elif args.transfer_mode=='linear': seqnn_model.model_trunk.trainable=False - seqnn_model.model.summary() - - elif args.transfer_mode=='adapterHoulsby': - seqnn_model.model_trunk.trainable=False - strand_pair = strand_pairs[0] - adapter_model = make_adapter_model(seqnn_model.model, strand_pair, args.latent) - 
seqnn_model.model = adapter_model - seqnn_model.models[0] = seqnn_model.model - seqnn_model.model_trunk = None - seqnn_model.model.summary() - - elif args.transfer_mode=='lora': - add_lora(seqnn_model.model, rank=args.latent, mode='default') - seqnn_model.model.summary() - - elif args.transfer_mode=='lora_full': - add_lora(seqnn_model.model, rank=args.latent, mode='full') - seqnn_model.model.summary() - - elif args.transfer_mode=='ia3': - add_ia3(seqnn_model.model) - seqnn_model.model.summary() - + + ############ + # adapters # + ############ + elif args.transfer_mode=='sparse': + + # attention adapter + if args.att_adapter is not None: + if args.att_adapter=='adapterHoulsby': + seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, + strand_pairs[0], + latent_size=args.att_latent) + elif args.att_adapter=='lora': + transfer_helper.add_lora(seqnn_model.model, + rank=args.att_latent, + mode='default') + + elif args.att_adapter=='lora_full': + transfer_helper.add_lora(seqnn_model.model, + rank=args.att_latent, + mode='full') + + elif args.att_adapter=='ia3': + transfer_helper.add_ia3(seqnn_model.model) + + # conv adapter + # assume seqnn_model is appropriately frozen + if args.conv_adapter is not None: + if args.conv_adapter=='conv': + params_added = 0 + for l in seqnn_model.model.layers: + if l.name.startswith("conv1d"): + l.trainable=True + params_added += transfer_helper.param_count(l, type='trainable') + print('params added/unfrozen by conv: %d'%params_added) + + if args.conv_adapter=='conv_all': + params_added = 0 + for l in seqnn_model.model.layers: + if l.name.startswith(("conv1d","separable_conv1d")): + l.trainable=True + params_added += transfer_helper.param_count(l, type='trainable') + print('params added/unfrozen by conv_all: %d'%params_added) + + elif args.conv_adapter=='batch_norm': + params_added = 0 + for l in seqnn_model.model.layers: + if l.name.startswith("batch_normalization"): + l.trainable=True + params_added += 
transfer_helper.param_count(l, type='trainable') + print('params added/unfrozen by batch_norm: %d'%params_added) + + ################## + # squeeze-excite # + ################## + elif args.conv_adapter=='se': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=False) + + elif args.conv_adapter=='se_bn': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=True) + + elif args.conv_adapter=='se_all': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + bottleneck_ratio=args.se_ratio, + insert_mode='all', + unfreeze_bn=False) + + elif args.conv_adapter=='se_all_bn': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + bottleneck_ratio=args.se_ratio, + insert_mode='all', + unfreeze_bn=True) + + ################# + # final summary # + ################# + seqnn_model.model.summary() + # initialize trainer seqnn_trainer = trainer.Trainer( params_train, train_data, eval_data, args.out_dir @@ -214,6 +301,41 @@ def main(): seqnn_trainer.fit_tape(seqnn_model) else: seqnn_trainer.fit2(seqnn_model) + + ############################# + # post-training adjustments # + ############################# + if args.transfer_mode=='sparse': + + # Houlsby adapter requires architecture change, overwrite params.json file with new one + if args.att_adapter=='adapterHoulsby': + transfer_helper.modify_json(input_json=args.params_file, + output_json=args.out_dir, + adapter='houlsby', + latent_size=args.att_latent) + + # merge lora weights to original, save weight to: model_best.mergeW.h5 + # use original params.json + if args.att_adapter=='lora': + seqnn_model.model.load_weights('%s/model_best.h5' % args.out_dir) + transfer_helper.merge_lora(seqnn_model.model, mode='default') + 
seqnn_model.save('%s/model_best.mergeW.h5' % args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5' % args.out_dir) + + if args.att_adapter=='lora_full': + seqnn_model.model.load_weights('%s/model_best.h5' % args.out_dir) + transfer_helper.merge_lora(seqnn_model.model, mode='full') + seqnn_model.save('%s/model_best.mergeW.h5' % args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5' % args.out_dir) + + # merge ia3 weights to original, save weight to: model_best.mergeW.h5 + if args.att_adapter=='ia3': + seqnn_model.model.load_weights('%s/model_best.h5' % args.out_dir) + transfer_helper.merge_ia3(seqnn_model.model) + seqnn_model.save('%s/model_best.mergeW.h5' % args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5' % args.out_dir) + + else: ######################################## @@ -259,157 +381,6 @@ def main(): else: seqnn_trainer.fit2(seqnn_model) -def make_adapter_model(input_model, strand_pair, latent_size=16): - # take seqnn_model as input - # output a new seqnn_model object - # only the adapter, and layer_norm are trainable - - model = tf.keras.Model(inputs=input_model.input, - outputs=input_model.layers[-2].output) # remove the switch_reverse layer - - # save current graph - layer_parent_dict_old = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = node.outbound_layer.name - if layer_name not in layer_parent_dict_old: - layer_parent_dict_old.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict_old[layer_name]: - layer_parent_dict_old[layer_name].append(layer.name) - - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({model.layers[0].name: model.input}) - - # remove switch_reverse - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - - # Iterate over all layers after the input - 
model_outputs = [] - reverse_bool = None - - for layer in model.layers[1:]: - - # parent layers - parent_layers = layer_parent_dict_old[layer.name] - - # layer inputs - layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - - # insert adapter: - elif re.match('add', layer.name): - if any([re.match('dropout', i) for i in parent_layers]): - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) - x = layer([layer_input[0], x]) - else: - x = layer(layer_input) - - else: - x = layer(layer_input) - - # save the output tensor of every layer - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) - - # set layer_norm layers to trainable - for l in model_adapter.layers: - if re.match('layer_normalization', l.name): l.trainable = True - - return model_adapter - -def add_lora(input_model, rank=8, alpha=16, mode='default'): - ###################### - # inject lora layers # - ###################### - # take seqnn.model as input - # replace _q_layer, _v_layer in multihead_attention - # optionally replace _k_layer, _embedding_layer - if mode not in ['default','full']: - raise ValueError("mode must be default or full") - - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - # default loRA - layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) - layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) - # full loRA - if mode=='full': - layer._k_layer = layers.Lora(layer._k_layer, rank=rank, alpha=alpha, trainable=True) - layer._embedding_layer = layers.Lora(layer._embedding_layer, rank=rank, 
alpha=alpha, trainable=True) - - input_model(input_model.input) # initialize new variables - - ################# - # freeze params # - ################# - # freeze all params but lora - for layer in input_model._flatten_layers(): - lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: - if layer.name in ["lora_a", "lora_b"]: - layer.trainable = True - else: - layer.trainable = False - - ### bias terms need to be frozen separately - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) - - # set final head to be trainable - input_model.layers[-2].trainable=True - - -def add_ia3(input_model): - ##################### - # inject ia3 layers # - ##################### - # take seqnn.model as input - # replace _k_layer, _v_layer, _embedding_layer in multihead_attention - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._k_layer = layers.IA3(layer._k_layer, trainable=True) - layer._v_layer = layers.IA3(layer._v_layer, trainable=True) - layer._embedding_layer = layers.IA3(layer._embedding_layer, trainable=True) - input_model(input_model.input) # instantiate model to initialize new variables - - ################# - # freeze params # - ################# - # set ia3 to trainable - for layer in input_model._flatten_layers(): - lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: - if layer.name =='ia3': - layer.trainable = True - else: - layer.trainable = False - - ### bias terms need to be frozen separately - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, 
name=layer._r_r_bias.name) - - # set final head to be trainable - input_model.layers[-2].trainable=True - -def param_count(model): - trainable = int(sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)) - non_trainable = int(sum(tf.keras.backend.count_params(w) for w in model.non_trainable_weights)) - print('total params:%d' %(trainable + non_trainable)) - print('trainable params:%d' %trainable) - print('non-trainable params:%d' %non_trainable) ################################################################################ # __main__ diff --git a/src/baskerville/scripts/westminster_train_folds_copy.py b/src/baskerville/scripts/westminster_train_folds_copy.py deleted file mode 100755 index 777d784..0000000 --- a/src/baskerville/scripts/westminster_train_folds_copy.py +++ /dev/null @@ -1,530 +0,0 @@ -#!/usr/bin/env python -# Copyright 2019 Calico LLC - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# https://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ========================================================================= - -from optparse import OptionParser, OptionGroup -import glob -import json -import os -import pdb -import shutil - -from natsort import natsorted - -import slurm - -""" -westminster_train_folds.py - -Train baskerville model replicates on cross folds using given parameters and data. 
-""" - -################################################################################ -# main -################################################################################ -def main(): - usage = 'usage: %prog [options] ...' - parser = OptionParser(usage) - - # train - train_options = OptionGroup(parser, 'houndtrain.py options') - train_options.add_option('-k', dest='keras_fit', - default=False, action='store_true', - help='Train with Keras fit method [Default: %default]') - train_options.add_option('-m', dest='mixed_precision', - default=False, action='store_true', - help='Train with mixed precision [Default: %default]') - train_options.add_option('-o', dest='out_dir', - default='train_out', - help='Training output directory [Default: %default]') - train_options.add_option('--restore', dest='restore', - help='Restore model and continue training, from existing fold train dir [Default: %default]') - train_options.add_option('--trunk', dest='trunk', - default=False, action='store_true', - help='Restore only model trunk [Default: %default]') - train_options.add_option('--tfr_train', dest='tfr_train_pattern', - default=None, - help='Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]') - train_options.add_option('--tfr_eval', dest='tfr_eval_pattern', - default=None, - help='Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]') - parser.add_option_group(train_options) - - # transfer options - transfer_options = OptionGroup(parser, 'transfer options') - transfer_options.add_option('--transfer', dest='transfer', - default=False, action='store_true', - help='whether to do transfer learning.') - transfer_options.add_option('--pretrain', dest='pretrain', - default=None, help='path to pretrained model trunk.') - transfer_options.add_option('--transfer_mode', dest='transfer_mode', - default='linear', help='transfer method.') - transfer_options.add_option('--latent', dest='latent', 
type='int', - default=0, help='latent size. ') - - # eval - eval_options = OptionGroup(parser, 'hound_eval.py options') - eval_options.add_option('--rank', dest='rank_corr', - default=False, action='store_true', - help='Compute Spearman rank correlation [Default: %default]') - eval_options.add_option('--rc', dest='rc', - default=False, action='store_true', - help='Average forward and reverse complement predictions [Default: %default]') - eval_options.add_option('--shifts', dest='shifts', - default='0', type='str', - help='Ensemble prediction shifts [Default: %default]') - parser.add_option('--step', dest='step', - default=1, type='int', - help='Spatial step for specificity/spearmanr [Default: %default]') - parser.add_option_group(eval_options) - - # multi - rep_options = OptionGroup(parser, 'replication options') - rep_options.add_option('-c', dest='crosses', - default=1, type='int', - help='Number of cross-fold rounds [Default:%default]') - rep_options.add_option('--checkpoint', dest='checkpoint', - default=False, action='store_true', - help='Restart training from checkpoint [Default: %default]') - rep_options.add_option('-e', dest='conda_env', - default='tf2.12', - help='Anaconda environment [Default: %default]') - rep_options.add_option('-f', dest='fold_subset', - default=None, type='int', - help='Run a subset of folds [Default:%default]') - rep_options.add_option('--name', dest='name', - default='fold', help='SLURM name prefix [Default: %default]') - rep_options.add_option('-p', dest='processes', - default=None, type='int', - help='Number of processes, passed by multi script') - rep_options.add_option('-q', dest='queue', - default='titan_rtx', - help='SLURM queue on which to run the jobs [Default: %default]') - rep_options.add_option('-r', '--restart', dest='restart', - default=False, action='store_true') - rep_options.add_option('--setup', dest='setup', - default=False, action='store_true', - help='Setup folds data directory only [Default: %default]') - 
rep_options.add_option('--spec_off', dest='spec_off', - default=False, action='store_true') - rep_options.add_option('--eval_off', dest='eval_off', - default=False, action='store_true') - rep_options.add_option('--eval_train_off', dest='eval_train_off', - default=False, action='store_true') - parser.add_option_group(rep_options) - - (options, args) = parser.parse_args() - - if len(args) < 2: - parser.error('Must provide parameters and data directory.') - else: - params_file = os.path.abspath(args[0]) - data_dirs = [os.path.abspath(arg) for arg in args[1:]] - - ####################################################### - # prep work - - if not options.restart and os.path.isdir(options.out_dir): - print('Output directory %s exists. Please remove.' % options.out_dir) - exit(1) - os.makedirs(options.out_dir, exist_ok=True) - - # read model parameters - with open(params_file) as params_open: - params = json.load(params_open) - params_train = params['train'] - - # copy params into output directory - shutil.copy(params_file, '%s/params.json' % options.out_dir) - - # read data parameters - num_data = len(data_dirs) - data_stats_file = '%s/statistics.json' % data_dirs[0] - with open(data_stats_file) as data_stats_open: - data_stats = json.load(data_stats_open) - - # count folds - num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')]) - - # subset folds - if options.fold_subset is not None: - num_folds = min(options.fold_subset, num_folds) - - if options.queue == 'standard': - num_cpu = 8 - num_gpu = 0 - time_base = 64 - else: - num_cpu = 2 - num_gpu = 1 - time_base = 24 - - # arrange data - for ci in range(options.crosses): - for fi in range(num_folds): - rep_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) - os.makedirs(rep_dir, exist_ok=True) - - # make data directories - for di in range(num_data): - rep_data_dir = '%s/data%d' % (rep_dir, di) - if not os.path.isdir(rep_data_dir): - make_rep_data(data_dirs[di], rep_data_dir, fi, ci) - - if options.setup: - 
exit(0) - - cmd_source = 'source /home/yuanh/.bashrc;' - hound_train = 'hound_train.py' - ####################################################### - # train - - jobs = [] - - for ci in range(options.crosses): - for fi in range(num_folds): - rep_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) - - train_dir = '%s/train' % rep_dir - if options.restart and not options.checkpoint and os.path.isdir(train_dir): - print('%s found and skipped.' % rep_dir) - - else: - # collect data directories - rep_data_dirs = [] - for di in range(num_data): - rep_data_dirs.append('%s/data%d' % (rep_dir, di)) - - # if options.checkpoint: - # os.rename('%s/train.out' % rep_dir, '%s/train1.out' % rep_dir) - - # train command - cmd = cmd_source - cmd += ' conda activate %s;' % options.conda_env - cmd += ' echo $HOSTNAME;' - - cmd += ' %s' %hound_train - cmd += ' %s' % options_string(options, train_options, rep_dir) - - # transfer learning options - if options.transfer: - cmd += ' --restore %s/f%dc%d.h5' % (options.pretrain, fi, ci) - cmd += ' --trunk' - cmd += ' --transfer_mode %s' % options.transfer_mode - if options.latent!=0: - cmd += ' --latent %d' % options.latent - - cmd += ' %s %s' % (params_file, ' '.join(rep_data_dirs)) - - name = '%s-train-f%dc%d' % (options.name, fi, ci) - sbf = os.path.abspath('%s/train.sb' % rep_dir) - outf = os.path.abspath('%s/train.%%j.out' % rep_dir) - errf = os.path.abspath('%s/train.%%j.err' % rep_dir) - - j = slurm.Job(cmd, name, - outf, errf, sbf, - queue=options.queue, - cpu=4, - gpu=params_train.get('num_gpu',1), - mem=30000, time='60-0:0:0') - jobs.append(j) - - slurm.multi_run(jobs, max_proc=options.processes, verbose=True, - launch_sleep=10, update_sleep=60) - - - ####################################################### - # evaluate training set - - jobs = [] - - if not options.eval_train_off: - for ci in range(options.crosses): - for fi in range(num_folds): - it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) - - for di in range(num_data): - if num_data 
== 1: - out_dir = '%s/eval_train' % it_dir - model_file = '%s/train/model_check.h5' % it_dir - else: - out_dir = '%s/eval%d_train' % (it_dir, di) - model_file = '%s/train/model%d_check.h5' % (it_dir, di) - - # check if done - acc_file = '%s/acc.txt' % out_dir - if os.path.isfile(acc_file): - print('%s already generated.' % acc_file) - else: - # hound evaluate - cmd = cmd_source - cmd += ' conda activate %s;' % options.conda_env - cmd += ' echo $HOSTNAME;' - cmd += ' hound_eval.py' - cmd += ' --head %d' % di - cmd += ' -o %s' % out_dir - if options.rc: - cmd += ' --rc' - if options.shifts: - cmd += ' --shifts %s' % options.shifts - cmd += ' --split train' - cmd += ' %s' % params_file - cmd += ' %s' % model_file - cmd += ' %s/data%d' % (it_dir, di) - - name = '%s-evaltr-f%dc%d' % (options.name, fi, ci) - job = slurm.Job(cmd, - name=name, - out_file='%s.out'%out_dir, - err_file='%s.err'%out_dir, - queue=options.queue, - cpu=num_cpu, gpu=num_gpu, - mem=30000, - time='%d:00:00' % (3*time_base)) - jobs.append(job) - - - ####################################################### - # evaluate test set - - if not options.eval_off: - for ci in range(options.crosses): - for fi in range(num_folds): - it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) - - for di in range(num_data): - if num_data == 1: - out_dir = '%s/eval' % it_dir - model_file = '%s/train/model_best.h5' % it_dir - else: - out_dir = '%s/eval%d' % (it_dir, di) - model_file = '%s/train/model%d_best.h5' % (it_dir, di) - - # check if done - acc_file = '%s/acc.txt' % out_dir - if os.path.isfile(acc_file): - print('%s already generated.' 
% acc_file) - else: - cmd = cmd_source - cmd += ' conda activate %s;' % options.conda_env - cmd += ' echo $HOSTNAME;' - cmd += ' hound_eval.py' - cmd += ' --head %d' % di - cmd += ' -o %s' % out_dir - if options.rc: - cmd += ' --rc' - if options.shifts: - cmd += ' --shifts %s' % options.shifts - if options.rank_corr: - cmd += ' --rank' - cmd += ' --step %d' % options.step - cmd += ' %s' % params_file - cmd += ' %s' % model_file - cmd += ' %s/data%d' % (it_dir, di) - - name = '%s-eval-f%dc%d' % (options.name, fi, ci) - job = slurm.Job(cmd, - name=name, - out_file='%s.out'%out_dir, - err_file='%s.err'%out_dir, - queue=options.queue, - cpu=num_cpu, gpu=num_gpu, - mem=30000, - time='%d:00:00' % time_base) - jobs.append(job) - - ####################################################### - # evaluate test specificity - - if not options.spec_off: - for ci in range(options.crosses): - for fi in range(num_folds): - it_dir = '%s/f%dc%d' % (options.out_dir, fi, ci) - - for di in range(num_data): - if num_data == 1: - out_dir = '%s/eval_spec' % it_dir - model_file = '%s/train/model_best.h5' % it_dir - else: - out_dir = '%s/eval%d_spec' % (it_dir, di) - model_file = '%s/train/model%d_best.h5' % (it_dir, di) - - # check if done - acc_file = '%s/acc.txt' % out_dir - if os.path.isfile(acc_file): - print('%s already generated.' 
% acc_file) - else: - cmd = cmd_source - cmd += ' conda activate %s;' % options.conda_env - cmd += ' echo $HOSTNAME;' - cmd += ' hound_eval_spec.py' - cmd += ' --head %d' % di - cmd += ' -o %s' % out_dir - cmd += ' --step %d' % options.step - if options.rc: - cmd += ' --rc' - if options.shifts: - cmd += ' --shifts %s' % options.shifts - cmd += ' %s' % params_file - cmd += ' %s' % model_file - cmd += ' %s/data%d' % (it_dir, di) - - name = '%s-spec-f%dc%d' % (options.name, fi, ci) - job = slurm.Job(cmd, - name=name, - out_file='%s.out'%out_dir, - err_file='%s.err'%out_dir, - queue=options.queue, - cpu=num_cpu, gpu=num_gpu, - mem=150000, - time='%d:00:00' % (5*time_base)) - jobs.append(job) - - slurm.multi_run(jobs, max_proc=options.processes, verbose=True, - launch_sleep=10, update_sleep=60) - - -def make_rep_data(data_dir, rep_data_dir, fi, ci): - # read data parameters - data_stats_file = '%s/statistics.json' % data_dir - with open(data_stats_file) as data_stats_open: - data_stats = json.load(data_stats_open) - - # sequences per fold - fold_seqs = [] - dfi = 0 - while 'fold%d_seqs'%dfi in data_stats: - fold_seqs.append(data_stats['fold%d_seqs'%dfi]) - del data_stats['fold%d_seqs'%dfi] - dfi += 1 - num_folds = dfi - - # split folds into train/valid/test - test_fold = fi - valid_fold = (fi+1+ci) % num_folds - train_folds = [fold for fold in range(num_folds) if fold not in [valid_fold,test_fold]] - - # clear existing directory - if os.path.isdir(rep_data_dir): - shutil.rmtree(rep_data_dir) - - # make data directory - os.makedirs(rep_data_dir, exist_ok=True) - - # dump data stats - data_stats['test_seqs'] = fold_seqs[test_fold] - data_stats['valid_seqs'] = fold_seqs[valid_fold] - data_stats['train_seqs'] = sum([fold_seqs[tf] for tf in train_folds]) - with open('%s/statistics.json'%rep_data_dir, 'w') as data_stats_open: - json.dump(data_stats, data_stats_open, indent=4) - - # set sequence tvt - try: - seqs_bed_out = open('%s/sequences.bed'%rep_data_dir, 'w') - for line 
in open('%s/sequences.bed'%data_dir): - a = line.split() - sfi = int(a[-1].replace('fold','')) - if sfi == test_fold: - a[-1] = 'test' - elif sfi == valid_fold: - a[-1] = 'valid' - else: - a[-1] = 'train' - print('\t'.join(a), file=seqs_bed_out) - seqs_bed_out.close() - except (ValueError, FileNotFoundError): - pass - - # copy targets - shutil.copy('%s/targets.txt'%data_dir, '%s/targets.txt'%rep_data_dir) - - # sym link tfrecords - rep_tfr_dir = '%s/tfrecords' % rep_data_dir - os.mkdir(rep_tfr_dir) - - # test tfrecords - ti = 0 - test_tfrs = natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, test_fold))) - for test_tfr in test_tfrs: - test_tfr = os.path.abspath(test_tfr) - test_rep_tfr = '%s/test-%d.tfr' % (rep_tfr_dir, ti) - os.symlink(test_tfr, test_rep_tfr) - ti += 1 - - # valid tfrecords - ti = 0 - valid_tfrs = natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, valid_fold))) - for valid_tfr in valid_tfrs: - valid_tfr = os.path.abspath(valid_tfr) - valid_rep_tfr = '%s/valid-%d.tfr' % (rep_tfr_dir, ti) - os.symlink(valid_tfr, valid_rep_tfr) - ti += 1 - - # train tfrecords - ti = 0 - train_tfrs = [] - for tfi in train_folds: - train_tfrs += natsorted(glob.glob('%s/tfrecords/fold%d-*.tfr' % (data_dir, tfi))) - for train_tfr in train_tfrs: - train_tfr = os.path.abspath(train_tfr) - train_rep_tfr = '%s/train-%d.tfr' % (rep_tfr_dir, ti) - os.symlink(train_tfr, train_rep_tfr) - ti += 1 - - -def options_string(options, train_options, rep_dir): - options_str = '' - - for opt in train_options.option_list: - opt_str = opt.get_opt_string() - opt_value = options.__dict__[opt.dest] - - # wrap askeriks in "" - if type(opt_value) == str and opt_value.find('*') != -1: - opt_value = '"%s"' % opt_value - - # no value for bools - elif type(opt_value) == bool: - if not opt_value: - opt_str = '' - opt_value = '' - - # skip Nones - elif opt_value is None: - opt_str = '' - opt_value = '' - - # modify - elif opt.dest == 'out_dir': - opt_value = '%s/train' % rep_dir - 
- # find matching restore - elif opt.dest == 'restore': - fold_dir_mid = rep_dir.split('/')[-1] - if options.trunk: - opt_value = '%s/%s/train/model_trunk.h5' % (opt_value, fold_dir_mid) - else: - opt_value = '%s/%s/train/model_best.h5' % (opt_value, fold_dir_mid) - - options_str += ' %s %s' % (opt_str, opt_value) - - return options_str - - -################################################################################ -# __main__ -################################################################################ -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 6503815..d7c048e 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -723,8 +723,8 @@ def make_optimizer(self, loss_scale=False): else: # schedule (currently OFF) initial_learning_rate = self.params.get("learning_rate", 0.01) - if False: - lr_schedule = keras.optimizers.schedules.ExponentialDecay( + if self.params.get("decay_steps"): + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( initial_learning_rate, decay_steps=self.params.get("decay_steps", 100000), decay_rate=self.params.get("decay_rate", 0.96), @@ -778,7 +778,7 @@ def make_optimizer(self, loss_scale=False): global_clipnorm=global_clipnorm, amsgrad=False, ) # reduces performance in my experience - + elif optimizer_type in ["sgd", "momentum"]: self.optimizer = tf.keras.optimizers.SGD( learning_rate=lr_schedule, From 9db213626f7fb54182ab30626c38d44dd8185fe7 Mon Sep 17 00:00:00 2001 From: hy395 Date: Sat, 11 Nov 2023 02:08:32 -0800 Subject: [PATCH 06/26] implement lora/ia3 merge weight --- src/baskerville/transfer_helper.py | 392 +++++++++++++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100644 src/baskerville/transfer_helper.py diff --git a/src/baskerville/transfer_helper.py b/src/baskerville/transfer_helper.py new file mode 100644 index 0000000..d6cd851 --- /dev/null +++ b/src/baskerville/transfer_helper.py 
@@ -0,0 +1,392 @@ +import argparse +import json +import os +import shutil +import re +import h5py + +import numpy as np +import pandas as pd +import tensorflow as tf +from tensorflow.keras import mixed_precision + +from baskerville import dataset +from baskerville import seqnn +from baskerville import trainer +from baskerville import layers + +def param_count(layer, type='all'): + if type not in ['all','trainable','non_trainable']: + raise ValueError("TYPE must be one of all, trainable, non_trainable") + output = 0 + if type=='all': + output = int(sum(tf.keras.backend.count_params(w) for w in layer.weights)) + elif type=='trainable': + output = int(sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights)) + else: + output = int(sum(tf.keras.backend.count_params(w) for w in layer.non_trainable_weights)) + return output + +def param_summary(model): + trainable = param_count(model, type='trainable') + non_trainable = param_count(model, type='non_trainable') + print('total params:%d' %(trainable + non_trainable)) + print('trainable params:%d' %trainable) + print('non-trainable params:%d' %non_trainable) + + +###################### +# add houlsby layers # +###################### +def add_houlsby(input_model, strand_pair, latent_size=16): + # take seqnn_model as input + # output a new seqnn_model object + # only the adapter, and layer_norm are trainable + + model = tf.keras.Model(inputs=input_model.input, + outputs=input_model.layers[-2].output) # remove the switch_reverse layer + + # save current graph + layer_parent_dict_old = {} # the parent layers of each layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict_old: + layer_parent_dict_old.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict_old[layer_name]: + layer_parent_dict_old[layer_name].append(layer.name) + + layer_output_dict_new = {} # the output tensor 
of each layer in the new graph + layer_output_dict_new.update({model.layers[0].name: model.input}) + + # remove switch_reverse + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + + for layer in model.layers[1:]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + + # insert adapter: + elif re.match('add', layer.name): + if any([re.match('dropout', i) for i in parent_layers]): + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) + x = layer([layer_input[0], x]) + else: + x = layer(layer_input) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) + + # set trainable + for l in model_adapter.layers[:-2]: # trunk + if re.match('layer_normalization|adapter_houlsby', l.name): + l.trainable = True + else: + l.trainable = False + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_adapter.layers: + if l.name.startswith("adapter_houlsby"): + params_added += param_count(l) + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type='trainable') + print('params added/unfrozen by adapter_houlsby: %d'%params_added) + + return model_adapter + +# save Houlsby json +def modify_json(input_json, output_json, adapter, latent=None): + + with open(input_json) as params_open: + 
params = json.load(params_open) + + params["model"]["trunk"][2]['adapter']= adapter + params["model"]["trunk"][2]['latent']= latent + + ### output + with open(output_json, 'w') as params_open: + json.dump(params, params_open, indent=4) + +################### +# add lora layers # +################### +def add_lora(input_model, rank=8, alpha=16, mode='default'): + # take seqnn.model as input + # replace _q_layer, _v_layer in multihead_attention + # optionally replace _k_layer, _embedding_layer + if mode not in ['default','full']: + raise ValueError("mode must be default or full") + + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + # default loRA + layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) + layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) + # full loRA + if mode=='full': + layer._k_layer = layers.Lora(layer._k_layer, rank=rank, alpha=alpha, trainable=True) + layer._embedding_layer = layers.Lora(layer._embedding_layer, rank=rank, alpha=alpha, trainable=True) + + input_model(input_model.input) # initialize new variables + + # freeze all params but lora + for layer in input_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name in ["lora_a", "lora_b"]: + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + input_model.layers[-2].trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in input_model.layers: + if re.match('multihead_attention', l.name): + params_added += 
param_count(l._q_layer.down_layer)
+            params_added += param_count(l._q_layer.up_layer)
+            params_added += param_count(l._v_layer.down_layer)
+            params_added += param_count(l._v_layer.up_layer)
+            if mode=='full':
+                params_added += param_count(l._k_layer.down_layer)
+                params_added += param_count(l._k_layer.up_layer)
+                params_added += param_count(l._embedding_layer.down_layer)
+                params_added += param_count(l._embedding_layer.up_layer)
+
+    print('params added/unfrozen by lora: %d'%params_added)
+
+# merge lora weights
+def merge_lora_layer(lora_layer):
+    down_weights = lora_layer.down_layer.kernel
+    up_weights = lora_layer.up_layer.kernel
+    increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale
+    lora_layer.original_layer.kernel.assign_add(increment_weights)
+    return lora_layer.original_layer
+
+def merge_lora(input_model, mode='default'):
+    for layer in input_model.layers:
+        if 'multihead_attention' in layer.name:
+            # default loRA
+            layer._q_layer = merge_lora_layer(layer._q_layer)
+            layer._v_layer = merge_lora_layer(layer._v_layer)
+            if mode=='full':
+                layer._k_layer = merge_lora_layer(layer._k_layer)
+                layer._embedding_layer = merge_lora_layer(layer._embedding_layer)
+    input_model(input_model.input)
+
+# correct weights.h5 weight order
+def var_reorder(weight_h5):
+    # assumes weight_h5 model saved with seqnn_model.save()
+    # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order.
+    # model.load_weights() loads weights sequentially, assuming layer weights are in the right order.
+    # When inserting lora/ia3, multihead_attention layer weights order changed.
+    # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs
+    # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention.
+ var_init_order = ['r_w_bias:0:0', + 'r_r_bias:0:0', + 'q_layer/kernel:0', + 'k_layer/kernel:0', + 'v_layer/kernel:0', + 'embedding_layer/kernel:0', + 'embedding_layer/bias:0', + 'r_k_layer/kernel:0'] + + f = h5py.File(weight_h5, 'r+') + layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + for l_name in layers: + new_name_order = [l_name+'/'+i for i in var_init_order] + f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + f.close() + +################## +# add ia3 layers # +################## +def add_ia3(input_model): + # take seqnn.model as input + # replace _k_layer, _v_layer, _embedding_layer in multihead_attention + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._k_layer = layers.IA3(layer._k_layer, trainable=True) + layer._v_layer = layers.IA3(layer._v_layer, trainable=True) + layer._embedding_layer = layers.IA3(layer._embedding_layer, trainable=True) + input_model(input_model.input) # instantiate model to initialize new variables + + # freeze params: + for layer in input_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name =='ia3': + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + input_model.layers[-2].trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in input_model.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._k_layer._ia3_layer) + params_added += param_count(l._v_layer._ia3_layer) + params_added += 
param_count(l._embedding_layer._ia3_layer)
+
+    print('params added/unfrozen by ia3: %d'%params_added)
+
+# merge ia3 weights
+def merge_ia3_layer(ia3_layer, type='kv'):
+    scaler = ia3_layer._ia3_layer.kernel[0]
+    ia3_layer.original_layer.kernel.assign(ia3_layer.original_layer.kernel * scaler)
+    if type=='embedding':
+        ia3_layer.original_layer.bias.assign(ia3_layer.original_layer.bias * scaler)
+    return ia3_layer.original_layer
+
+def merge_ia3(input_model):
+    for layer in input_model.layers:
+        if 'multihead_attention' in layer.name:
+            layer._k_layer = merge_ia3_layer(layer._k_layer, type='kv')
+            layer._v_layer = merge_ia3_layer(layer._v_layer, type='kv')
+            layer._embedding_layer = merge_ia3_layer(layer._embedding_layer, type='embedding')
+    input_model(input_model.input)
+
+
+######################
+# add squeeze excite #
+######################
+def add_se(input_model, strand_pair, bottleneck_ratio=8, insert_mode='pre_att', unfreeze_bn=False):
+    # add squeeze-excitation blocks after conv
+    # input_model should be properly frozen
+    # pre_att: add se_block to pre-attention conv1d
+    # all: add se_block to pre-attention conv1d and post-attention separable_conv1d
+
+    if insert_mode not in ['pre_att','all']:
+        raise ValueError("insert_mode must be pre_att or all")
+
+    model = tf.keras.Model(inputs=input_model.input,
+                           outputs=input_model.layers[-2].output) # remove the switch_reverse layer
+
+    # save current graph
+    layer_parent_dict_old = {} # the parent layers of each layer in the old graph
+    for layer in model.layers:
+        for node in layer._outbound_nodes:
+            layer_name = node.outbound_layer.name
+            if layer_name not in layer_parent_dict_old:
+                layer_parent_dict_old.update({layer_name: [layer.name]})
+            else:
+                if layer.name not in layer_parent_dict_old[layer_name]:
+                    layer_parent_dict_old[layer_name].append(layer.name)
+
+    layer_output_dict_new = {} # the output tensor of each layer in the new graph
+    layer_output_dict_new.update({model.layers[0].name: model.input})
+
+    
# remove switch_reverse + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + + for layer in model.layers[1:]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if layer.name.startswith("stochastic_reverse_complement"): + x, reverse_bool = layer(layer_input) + + # insert squeeze-excite layer: + elif layer.name.startswith("conv1d"): + se_layer = layers.SqueezeExcite( + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + bottleneck_ratio=bottleneck_ratio, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3) # near-zero weight initialization + ) + x = layer(layer_input) + x = x + se_layer(x) + + elif layer.name.startswith("separable_conv1d"): + if insert_mode=='all': + se_layer = layers.SqueezeExcite( + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + bottleneck_ratio=bottleneck_ratio, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3) # near-zero weight initialization + ) + x = layer(layer_input) + x = x + se_layer(x) + else: + x = layer(layer_input) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) + model_final = tf.keras.Model(inputs=model.inputs, outputs=final) + + # unfreeze layers + for l in model_final.layers: # set trunk + if l.name.startswith("squeeze_excite"): l.trainable = True + + if 
unfreeze_bn: + for l in model_final.layers: + if l.name.startswith("batch_normalization"): l.trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_final.layers: + if l.name.startswith("squeeze_excite"): + params_added += param_count(l) + elif l.name.startswith("batch_normalization"): + if unfreeze_bn: params_added += param_count(l, type='trainable') + print('params added/unfrozen by se_block: %d'%params_added) + + return model_final From 6502e5285c0d1c6100fc3aabef16b6ba424a9d40 Mon Sep 17 00:00:00 2001 From: hy395 Date: Mon, 29 Jan 2024 09:53:54 -0800 Subject: [PATCH 07/26] add gene eval --- src/baskerville/blocks.py | 54 +- src/baskerville/layers.py | 15 +- src/baskerville/pygene.py | 324 +++++++++++ src/baskerville/scripts/borzoi_test_genes.py | 550 +++++++++++++++++++ src/baskerville/scripts/hound_eval_spec.py | 39 +- src/baskerville/scripts/hound_transfer.py | 150 +++-- src/baskerville/transfer_helper.py | 300 +++++++--- 7 files changed, 1283 insertions(+), 149 deletions(-) create mode 100755 src/baskerville/pygene.py create mode 100755 src/baskerville/scripts/borzoi_test_genes.py diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py index 82e9066..19e64ce 100644 --- a/src/baskerville/blocks.py +++ b/src/baskerville/blocks.py @@ -149,6 +149,8 @@ def conv_dna( conv_type="standard", kernel_initializer="he_normal", padding="same", + transfer_se=False, + se_ratio=16, ): """Construct a single convolution block, assumed to be operating on DNA. 
@@ -196,6 +198,18 @@ def conv_dna( kernel_regularizer=tf.keras.regularizers.l2(l2_scale), )(current) + # squeeze-excite for transfer + if transfer_se: + se_out = squeeze_excite(current, + activation=None, + additive=False, + bottleneck_ratio=se_ratio, + use_bias=False, + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + scale_fun='tanh' + ) + current = current + se_out + # squeeze-excite if se: current = squeeze_excite(current) @@ -267,6 +281,8 @@ def conv_nac( kernel_initializer="he_normal", padding="same", se=False, + transfer_se=False, + se_ratio=16, ): """Construct a single convolution block. @@ -326,6 +342,18 @@ def conv_nac( kernel_regularizer=tf.keras.regularizers.l2(l2_scale), )(current) + # squeeze-excite for transfer + if transfer_se: + se_out = squeeze_excite(current, + activation=None, + additive=False, + bottleneck_ratio=se_ratio, + use_bias=False, + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + scale_fun='tanh' + ) + current = current + se_out + # squeeze-excite if se: current = squeeze_excite(current) @@ -456,6 +484,8 @@ def fpn_unet( bn_momentum=0.99, kernel_size=1, kernel_initializer="he_normal", + transfer_se=False, + se_ratio=16, ): """Construct a feature pyramid network block. 
@@ -529,6 +559,17 @@ def fpn_unet( kernel_initializer=kernel_initializer, )(current) + if transfer_se: + se_out = squeeze_excite(current, + activation=None, + additive=False, + bottleneck_ratio=se_ratio, + use_bias=False, + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + scale_fun='tanh' + ) + current = current + se_out + # dropout if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) @@ -1528,11 +1569,20 @@ def squeeze_excite( additive=False, norm_type=None, bn_momentum=0.9, + kernel_initializer='glorot_uniform', + use_bias=True, + scale_fun='sigmoid', **kwargs, ): return layers.SqueezeExcite( - activation, additive, bottleneck_ratio, norm_type, bn_momentum - )(inputs) + activation=activation, + additive=additive, + bottleneck_ratio=bottleneck_ratio, + norm_type=norm_type, + bn_momentum=bn_momentum, + kernel_initializer=kernel_initializer, + scale_fun=scale_fun, + use_bias=use_bias)(inputs) def wheeze_excite(inputs, pool_size, **kwargs): diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index 8f6af73..d0513dc 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -756,6 +756,7 @@ def __init__( use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', + scale_fun='sigmoid', ): super(SqueezeExcite, self).__init__() self.activation = activation @@ -766,6 +767,7 @@ def __init__( self.kernel_initializer=kernel_initializer self.bias_initializer=bias_initializer self.use_bias=use_bias + self.scale_fun=scale_fun def build(self, input_shape): self.num_channels = input_shape[-1] @@ -783,6 +785,17 @@ def build(self, input_shape): ) exit(1) + if self.scale_fun=='sigmoid': + self.scale_f = tf.keras.activations.sigmoid + elif self.scale_fun=='tanh': # set to tanh for transfer + self.scale_f = tf.keras.activations.tanh + else: + print( + "scale function must be sigmoid or tanh", + file=sys.stderr, + ) + exit(1) + self.dense1 = tf.keras.layers.Dense( units=self.num_channels // 
self.bottleneck_ratio, activation="relu", @@ -819,7 +832,7 @@ def call(self, x): if self.additive: xs = x + excite else: - excite = tf.keras.activations.sigmoid(excite) + excite = self.scale_f(excite) xs = x * excite return xs diff --git a/src/baskerville/pygene.py b/src/baskerville/pygene.py new file mode 100755 index 0000000..86cae4f --- /dev/null +++ b/src/baskerville/pygene.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python +from optparse import OptionParser + +import gzip +import pdb + +''' +pygene + +Classes and methods to manage genes in GTF format. +''' + +################################################################################ +# Classes +################################################################################ +class GenomicInterval: + def __init__(self, start, end, chrom=None, strand=None): + self.start = start + self.end = end + self.chrom = chrom + self.strand = strand + + def __eq__(self, other): + return self.start == other.start + + def __lt__(self, other): + return self.start < other.start + + def __cmp__(self, x): + if self.start < x.start: + return -1 + elif self.start > x.start: + return 1 + else: + return 0 + + def __str__(self): + if self.chrom is None: + label = '[%d-%d]' % (self.start, self.end) + else: + label = '%s:%d-%d' % (self.chrom, self.start, self.end) + return label + + +class Transcript: + def __init__(self, chrom, strand, kv): + self.chrom = chrom + self.strand = strand + self.kv = kv + self.exons = [] + self.cds = [] + self.utrs3 = [] + self.utrs5 = [] + self.sorted = False + self.utrs_defined = False + + def add_cds(self, start, end): + self.cds.append(GenomicInterval(start,end)) + + def add_exon(self, start, end): + self.exons.append(GenomicInterval(start,end)) + + def define_utrs(self): + self.utrs_defined = True + + if len(self.cds) == 0: + self.utrs3 = self.exons + + else: + assert(self.sorted) + + # reset UTR lists + self.utrs5 = [] + self.utrs3 = [] + + # match up exons and CDS + ci = 0 + for ei in 
range(len(self.exons)): + # left initial + if self.exons[ei].end < self.cds[ci].start: + utr = GenomicInterval(self.exons[ei].start, self.exons[ei].end) + if self.strand == '+': + self.utrs5.append(utr) + else: + self.utrs3.append(utr) + + # right initial + elif self.cds[ci].end < self.exons[ei].start: + utr = GenomicInterval(self.exons[ei].start, self.exons[ei].end) + if self.strand == '+': + self.utrs3.append(utr) + else: + self.utrs5.append(utr) + + # overlap + else: + # left overlap + if self.exons[ei].start < self.cds[ci].start: + utr = GenomicInterval(self.exons[ei].start, self.cds[ci].start-1) + if self.strand == '+': + self.utrs5.append(utr) + else: + self.utrs3.append(utr) + + # right overlap + if self.cds[ci].end < self.exons[ei].end: + utr = GenomicInterval(self.cds[ci].end+1, self.exons[ei].end) + if self.strand == '+': + self.utrs3.append(utr) + else: + self.utrs5.append(utr) + + # increment up to last + ci = min(ci+1, len(self.cds)-1) + + def fasta_cds(self, fasta_open, stranded=False): + assert(self.sorted) + gene_seq = '' + for exon in self.cds: + exon_seq = fasta_open.fetch(self.chrom, exon.start-1, exon.end) + gene_seq += exon_seq + if stranded and self.strand == '-': + gene_seq = rc(gene_seq) + return gene_seq + + def fasta_exons(self, fasta_open, stranded=False): + assert(self.sorted) + gene_seq = '' + for exon in self.exons: + exon_seq = fasta_open.fetch(self.chrom, exon.start-1, exon.end) + gene_seq += exon_seq + if stranded and self.strand == '-': + gene_seq = rc(gene_seq) + return gene_seq + + def sort_exons(self): + self.sorted = True + if len(self.exons) > 1: + self.exons.sort() + if len(self.cds) > 1: + self.cds.sort() + + def span(self): + exon_starts = [exon.start for exon in self.exons] + exon_ends = [exon.end for exon in self.exons] + return min(exon_starts), max(exon_ends) + + def tss(self): + if self.strand == '-': + return self.exons[-1].end + else: + return self.exons[0].start + + def write_gtf(self, gtf_out, write_cds=False, 
write_utrs=False): + for ex in self.exons: + cols = [self.chrom, 'pygene', 'exon', str(ex.start), str(ex.end)] + cols += ['.', self.strand, '.', kv_gtf(self.kv)] + print('\t'.join(cols), file=gtf_out) + if write_cds: + for cds in self.cds: + cols = [self.chrom, 'pygene', 'CDS', str(cds.start), str(cds.end)] + cols += ['.', self.strand, '.', kv_gtf(self.kv)] + print('\t'.join(cols), file=gtf_out) + if write_utrs: + assert(self.utrs_defined) + for utr in self.utrs5: + cols = [self.chrom, 'pygene', '5\'UTR', str(utr.start), str(utr.end)] + cols += ['.', self.strand, '.', kv_gtf(self.kv)] + print('\t'.join(cols), file=gtf_out) + for utr in self.utrs3: + cols = [self.chrom, 'pygene', '3\'UTR', str(utr.start), str(utr.end)] + cols += ['.', self.strand, '.', kv_gtf(self.kv)] + print('\t'.join(cols), file=gtf_out) + + def __str__(self): + return '%s %s %s %s' % (self.chrom, self.strand, kv_gtf(self.kv), ','.join([ex.__str__() for ex in self.exons])) + + +class Gene: + def __init__(self): + self.transcripts = {} + self.chrom = None + self.strand = None + self.start = None + self.end = None + + def add_transcript(self, tx_id, tx): + self.transcripts[tx_id] = tx + self.chrom = tx.chrom + self.strand = tx.strand + self.kv = tx.kv + + def span(self): + tx_spans = [tx.span() for tx in self.transcripts.values()] + tx_starts, tx_ends = zip(*tx_spans) + self.start = min(tx_starts) + self.end = max(tx_ends) + return self.start, self.end + + +class GTF: + def __init__(self, gtf_file, trim_dot=False): + self.gtf_file = gtf_file + self.genes = {} + self.transcripts = {} + self.utrs_defined = False + self.trim_dot = trim_dot + + self.read_gtf() + + def define_utrs(self): + self.utrs_defined = True + for tx in self.transcripts.values(): + tx.define_utrs() + + def read_gtf(self): + if self.gtf_file[-3:] == '.gz': + gtf_in = gzip.open(self.gtf_file, 'rt') + else: + gtf_in = open(self.gtf_file) + + # ignore header + line = gtf_in.readline() + while line[0] == '#': + line = gtf_in.readline() 
+ + while line: + a = line.split('\t') + if a[2] in ['exon','CDS']: + chrom = a[0] + interval_type = a[2] + start = int(a[3]) + end = int(a[4]) + strand = a[6] + kv = gtf_kv(a[8]) + + # add/get transcript + tx_id = kv['transcript_id'] + if self.trim_dot: + tx_id = trim_dot(tx_id) + if not tx_id in self.transcripts: + self.transcripts[tx_id] = Transcript(chrom, strand, kv) + tx = self.transcripts[tx_id] + + # add/get gene + gene_id = kv['gene_id'] + if self.trim_dot: + gene_id = trim_dot(gene_id) + if not gene_id in self.genes: + self.genes[gene_id] = Gene() + self.genes[gene_id].add_transcript(tx_id, tx) + + # add exons + if interval_type == 'exon': + tx.add_exon(start, end) + elif interval_type == 'CDS': + tx.add_cds(start, end) + + line = gtf_in.readline() + + gtf_in.close() + + # sort transcript exons + for tx in self.transcripts.values(): + tx.sort_exons() + + def write_gtf(self, out_gtf_file, write_cds=False, write_utrs=False): + if write_utrs and not self.utrs_defined: + self.define_utrs() + + gtf_out = open(out_gtf_file, 'w') + for tx in self.transcripts.values(): + tx.write_gtf(gtf_out, write_cds, write_utrs) + gtf_out.close() + + +################################################################################ +# Methods +################################################################################ +def gtf_kv(s): + """Convert the last gtf section of key/value pairs into a dict.""" + d = {} + + a = s.split(';') + for key_val in a: + if key_val.strip(): + eq_i = key_val.find('=') + if eq_i != -1 and key_val[eq_i-1] != '"': + kvs = key_val.split('=') + else: + kvs = key_val.split() + + key = kvs[0] + if kvs[1][0] == '"' and kvs[-1][-1] == '"': + val = (' '.join(kvs[1:]))[1:-1].strip() + else: + val = (' '.join(kvs[1:])).strip() + + d[key] = val + + return d + +def kv_gtf(d): + """Convert a kv hash to str gtf representation.""" + s = '' + + if 'gene_id' in d.keys(): + s += '%s "%s"; ' % ('gene_id',d['gene_id']) + + if 'transcript_id' in d.keys(): + s += 
'%s "%s"; ' % ('transcript_id',d['transcript_id']) + + for key in sorted(d.keys()): + if key not in ['gene_id','transcript_id']: + s += '%s "%s"; ' % (key,d[key]) + + return s + +def trim_dot(gene_id): + """Trim the final dot suffix off a gene_id.""" + dot_i = gene_id.rfind('.') + if dot_i != -1: + gene_id = gene_id[:dot_i] + return gene_id \ No newline at end of file diff --git a/src/baskerville/scripts/borzoi_test_genes.py b/src/baskerville/scripts/borzoi_test_genes.py new file mode 100755 index 0000000..1e2b853 --- /dev/null +++ b/src/baskerville/scripts/borzoi_test_genes.py @@ -0,0 +1,550 @@ +#!/usr/bin/env python +# Copyright 2021 Calico LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +from optparse import OptionParser +import gc +import json +import os +import time + +from intervaltree import IntervalTree +import numpy as np +import pandas as pd +import pybedtools +import pyranges as pr +from qnorm import quantile_normalize +from scipy.stats import pearsonr +from sklearn.metrics import explained_variance_score + +from baskerville import pygene +from baskerville import dataset +from baskerville import seqnn + +""" +borzoi_test_genes.py + +Measure accuracy at gene-level. 
+""" + +################################################################################ +# main +################################################################################ +def main(): + usage = "usage: %prog [options] " + parser = OptionParser(usage) + parser.add_option( + "--head", + dest="head_i", + default=0, + type="int", + help="Parameters head [Default: %default]", + ) + parser.add_option( + "-o", + dest="out_dir", + default="testg_out", + help="Output directory for predictions [Default: %default]", + ) + parser.add_option( + "--rc", + dest="rc", + default=False, + action="store_true", + help="Average the fwd and rc predictions [Default: %default]", + ) + parser.add_option( + "--shifts", + dest="shifts", + default="0", + help="Ensemble prediction shifts [Default: %default]", + ) + parser.add_option( + "--span", + dest="span", + default=False, + action="store_true", + help="Aggregate entire gene span [Default: %default]", + ) + parser.add_option( + "-t", + dest="targets_file", + default=None, + type="str", + help="File specifying target indexes and labels in table format", + ) + parser.add_option( + "--split", + dest="split_label", + default="test", + help="Dataset split label for eg TFR pattern [Default: %default]", + ) + parser.add_option( + "--tfr", + dest="tfr_pattern", + default=None, + help="TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]", + ) + parser.add_option( + "-u", + dest="untransform_old", + default=False, + action="store_true", + help="Untransform old models [Default: %default]", + ) + (options, args) = parser.parse_args() + + if len(args) != 4: + parser.error("Must provide parameters, model, data directory, and genes GTF") + else: + params_file = args[0] + model_file = args[1] + data_dir = args[2] + genes_gtf_file = args[3] + + if not os.path.isdir(options.out_dir): + os.mkdir(options.out_dir) + + # parse shifts to integers + options.shifts = [int(shift) for shift in options.shifts.split(",")] + + 
####################################################### + # inputs + + # read targets + if options.targets_file is None: + options.targets_file = "%s/targets.txt" % data_dir + targets_df = pd.read_csv(options.targets_file, index_col=0, sep="\t") + + # prep strand + targets_strand_df = dataset.targets_prep_strand(targets_df) + num_targets = targets_df.shape[0] + num_targets_strand = targets_strand_df.shape[0] + + # read model parameters + with open(params_file) as params_open: + params = json.load(params_open) + params_model = params["model"] + params_train = params["train"] + + # set strand pairs (using new indexing) + orig_new_index = dict(zip(targets_df.index, np.arange(targets_df.shape[0]))) + targets_strand_pair = np.array( + [orig_new_index[ti] for ti in targets_df.strand_pair] + ) + params_model["strand_pair"] = [targets_strand_pair] + + # construct eval data + eval_data = dataset.SeqDataset( + data_dir, + split_label=options.split_label, + batch_size=params_train["batch_size"], + mode="eval", + tfr_pattern=options.tfr_pattern, + ) + + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file, options.head_i) + seqnn_model.build_slice(targets_df.index) + seqnn_model.build_ensemble(options.rc, options.shifts) + + ####################################################### + # sequence intervals + + # read data parameters + with open("%s/statistics.json" % data_dir) as data_open: + data_stats = json.load(data_open) + crop_bp = data_stats["crop_bp"] + pool_width = data_stats["pool_width"] + + # read sequence positions + seqs_df = pd.read_csv( + "%s/sequences.bed" % data_dir, + sep="\t", + names=["Chromosome", "Start", "End", "Name"], + ) + seqs_df = seqs_df[seqs_df.Name == options.split_label] + seqs_pr = pr.PyRanges(seqs_df) + + ####################################################### + # make gene BED + + t0 = time.time() + print("Making gene BED...", end="") + genes_bed_file = "%s/genes.bed" % options.out_dir + if options.span: 
+ make_genes_span(genes_bed_file, genes_gtf_file, options.out_dir) + else: + make_genes_exon(genes_bed_file, genes_gtf_file, options.out_dir) + + genes_pr = pr.read_bed(genes_bed_file) + print("DONE in %ds" % (time.time() - t0)) + + # count gene normalization lengths + gene_lengths = {} + gene_strand = {} + for line in open(genes_bed_file): + a = line.rstrip().split("\t") + gene_id = a[3] + gene_seg_len = int(a[2]) - int(a[1]) + gene_lengths[gene_id] = gene_lengths.get(gene_id, 0) + gene_seg_len + gene_strand[gene_id] = a[5] + + ####################################################### + # intersect genes w/ preds, targets + + # intersect seqs, genes + t0 = time.time() + print("Intersecting sequences w/ genes...", end="") + seqs_genes_pr = seqs_pr.join(genes_pr) + print("DONE in %ds" % (time.time() - t0), flush=True) + + # hash preds/targets by gene_id + gene_preds_dict = {} + gene_targets_dict = {} + + si = 0 + for x, y in eval_data.dataset: + # predict only if gene overlaps + yh = None + y = y.numpy()[..., targets_df.index] + + t0 = time.time() + print("Sequence %d..." % si, end="") + for bsi in range(x.shape[0]): + seq = seqs_df.iloc[si + bsi] + + cseqs_genes_df = seqs_genes_pr[seq.Chromosome].df + if cseqs_genes_df.shape[0] == 0: + # empty. 
no genes on this chromosome + seq_genes_df = cseqs_genes_df + else: + seq_genes_df = cseqs_genes_df[cseqs_genes_df.Start == seq.Start] + + for _, seq_gene in seq_genes_df.iterrows(): + gene_id = seq_gene.Name_b + gene_start = seq_gene.Start_b + gene_end = seq_gene.End_b + seq_start = seq_gene.Start + + # clip boundaries + gene_seq_start = max(0, gene_start - seq_start) + gene_seq_end = max(0, gene_end - seq_start) + + # requires >50% overlap + bin_start = int(np.round(gene_seq_start / pool_width)) + bin_end = int(np.round(gene_seq_end / pool_width)) + + # predict + if yh is None: + yh = seqnn_model(x) + + # slice gene region + yhb = yh[bsi, bin_start:bin_end].astype("float16") + yb = y[bsi, bin_start:bin_end].astype("float16") + + if len(yb) > 0: + gene_preds_dict.setdefault(gene_id, []).append(yhb) + gene_targets_dict.setdefault(gene_id, []).append(yb) + + # advance sequence table index + si += x.shape[0] + print("DONE in %ds" % (time.time() - t0), flush=True) + if si % 128 == 0: + gc.collect() + + # aggregate gene bin values into arrays + gene_targets = [] + gene_preds = [] + gene_ids = sorted(gene_targets_dict.keys()) + gene_within = [] + gene_wvar = [] + + for gene_id in gene_ids: + gene_preds_gi = np.concatenate(gene_preds_dict[gene_id], axis=0).astype( + "float32" + ) + gene_targets_gi = np.concatenate(gene_targets_dict[gene_id], axis=0).astype( + "float32" + ) + + # slice strand + if gene_strand[gene_id] == "+": + gene_strand_mask = (targets_df.strand != "-").to_numpy() + else: + gene_strand_mask = (targets_df.strand != "+").to_numpy() + gene_preds_gi = gene_preds_gi[:, gene_strand_mask] + gene_targets_gi = gene_targets_gi[:, gene_strand_mask] + + if gene_targets_gi.shape[0] == 0: + print(gene_id, gene_targets_gi.shape, gene_preds_gi.shape) + + # untransform + if options.untransform_old: + gene_preds_gi = dataset.untransform_preds1(gene_preds_gi, targets_strand_df) + gene_targets_gi = dataset.untransform_preds1(gene_targets_gi, targets_strand_df) + else: + 
gene_preds_gi = dataset.untransform_preds(gene_preds_gi, targets_strand_df) + gene_targets_gi = dataset.untransform_preds(gene_targets_gi, targets_strand_df) + + # compute within gene correlation before dropping length axis + gene_corr_gi = np.zeros(num_targets_strand) + for ti in range(num_targets_strand): + if ( + gene_preds_gi[:, ti].var() > 1e-6 + and gene_targets_gi[:, ti].var() > 1e-6 + ): + preds_log = np.log2(gene_preds_gi[:, ti] + 1) + targets_log = np.log2(gene_targets_gi[:, ti] + 1) + gene_corr_gi[ti] = pearsonr(preds_log, targets_log)[0] + # gene_corr_gi[ti] = pearsonr(gene_preds_gi[:,ti], gene_targets_gi[:,ti])[0] + else: + gene_corr_gi[ti] = np.nan + gene_within.append(gene_corr_gi) + gene_wvar.append(gene_targets_gi.var(axis=0)) + + # TEMP: save gene preds/targets + # os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True) + # np.save('%s/gene_within/%s_preds.npy' % (options.out_dir, gene_id), gene_preds_gi.astype('float16')) + # np.save('%s/gene_within/%s_targets.npy' % (options.out_dir, gene_id), gene_targets_gi.astype('float16')) + + # mean coverage + gene_preds_gi = gene_preds_gi.mean(axis=0) + gene_targets_gi = gene_targets_gi.mean(axis=0) + + # scale by gene length + gene_preds_gi *= gene_lengths[gene_id] + gene_targets_gi *= gene_lengths[gene_id] + + gene_preds.append(gene_preds_gi) + gene_targets.append(gene_targets_gi) + + gene_targets = np.array(gene_targets) + gene_preds = np.array(gene_preds) + gene_within = np.array(gene_within) + gene_wvar = np.array(gene_wvar) + + # log2 transform + gene_targets = np.log2(gene_targets + 1) + gene_preds = np.log2(gene_preds + 1) + + # save values + genes_targets_df = pd.DataFrame( + gene_targets, index=gene_ids, columns=targets_strand_df.identifier + ) + genes_targets_df.to_csv("%s/gene_targets.tsv" % options.out_dir, sep="\t") + genes_preds_df = pd.DataFrame( + gene_preds, index=gene_ids, columns=targets_strand_df.identifier + ) + genes_preds_df.to_csv("%s/gene_preds.tsv" % options.out_dir, 
sep="\t") + genes_within_df = pd.DataFrame( + gene_within, index=gene_ids, columns=targets_strand_df.identifier + ) + genes_within_df.to_csv("%s/gene_within.tsv" % options.out_dir, sep="\t") + genes_var_df = pd.DataFrame( + gene_wvar, index=gene_ids, columns=targets_strand_df.identifier + ) + genes_var_df.to_csv("%s/gene_var.tsv" % options.out_dir, sep="\t") + + # quantile and mean normalize + gene_targets_norm = quantile_normalize(gene_targets, ncpus=2) + gene_targets_norm = gene_targets_norm - gene_targets_norm.mean( + axis=-1, keepdims=True + ) + gene_preds_norm = quantile_normalize(gene_preds, ncpus=2) + gene_preds_norm = gene_preds_norm - gene_preds_norm.mean(axis=-1, keepdims=True) + + ####################################################### + # accuracy stats + + wvar_t = np.percentile(gene_wvar, 80, axis=0) + + acc_pearsonr = [] + acc_r2 = [] + acc_npearsonr = [] + acc_nr2 = [] + acc_wpearsonr = [] + for ti in range(num_targets_strand): + r_ti = pearsonr(gene_targets[:, ti], gene_preds[:, ti])[0] + acc_pearsonr.append(r_ti) + r2_ti = explained_variance_score(gene_targets[:, ti], gene_preds[:, ti]) + acc_r2.append(r2_ti) + nr_ti = pearsonr(gene_targets_norm[:, ti], gene_preds_norm[:, ti])[0] + acc_npearsonr.append(nr_ti) + nr2_ti = explained_variance_score( + gene_targets_norm[:, ti], gene_preds_norm[:, ti] + ) + acc_nr2.append(nr2_ti) + var_mask = gene_wvar[:, ti] > wvar_t[ti] + wr_ti = gene_within[var_mask].mean() + acc_wpearsonr.append(wr_ti) + + acc_df = pd.DataFrame( + { + "identifier": targets_strand_df.identifier, + "pearsonr": acc_pearsonr, + "r2": acc_r2, + "pearsonr_norm": acc_npearsonr, + "r2_norm": acc_nr2, + "pearsonr_gene": acc_wpearsonr, + "description": targets_strand_df.description, + } + ) + acc_df.to_csv("%s/acc.txt" % options.out_dir, sep="\t") + + print("%d genes" % gene_targets.shape[0]) + print("Overall PearsonR: %.4f" % np.mean(acc_df.pearsonr)) + print("Overall R2: %.4f" % np.mean(acc_df.r2)) + print("Normalized PearsonR: %.4f" % 
np.mean(acc_df.pearsonr_norm)) + print("Normalized R2: %.4f" % np.mean(acc_df.r2_norm)) + print("Within-gene PearsonR: %.4f" % np.mean(acc_df.pearsonr_gene)) + + +def genes_aggregate(genes_bed_file, values_bedgraph): + """Aggregate values across genes. + + Args: + genes_bed_file (str): BED file of genes. + values_bedgraph (str): BedGraph file of values. + + Returns: + gene_values (dict): Dictionary of gene values. + """ + values_bt = pybedtools.BedTool(values_bedgraph) + genes_bt = pybedtools.BedTool(genes_bed_file) + + gene_values = {} + + for overlap in genes_bt.intersect(values_bt, wo=True): + gene_id = overlap[3] + value = overlap[7] + gene_values[gene_id] = gene_values.get(gene_id, 0) + value + + return gene_values + + +def make_genes_exon(genes_bed_file: str, genes_gtf_file: str, out_dir: str): + """Make a BED file with each genes' exons, excluding exons overlapping + across genes. + + Args: + genes_bed_file (str): Output BED file of genes. + genes_gtf_file (str): Input GTF file of genes. + out_dir (str): Output directory for temporary files. 
+ """ + # read genes + genes_gtf = pygene.GTF(genes_gtf_file) + + # write gene exons + agenes_bed_file = "%s/genes_all.bed" % out_dir + agenes_bed_out = open(agenes_bed_file, "w") + for gene_id, gene in genes_gtf.genes.items(): + # collect exons + gene_intervals = IntervalTree() + for tx_id, tx in gene.transcripts.items(): + for exon in tx.exons: + gene_intervals[exon.start - 1 : exon.end] = True + + # union + gene_intervals.merge_overlaps() + + # write + for interval in sorted(gene_intervals): + cols = [ + gene.chrom, + str(interval.begin), + str(interval.end), + gene_id, + ".", + gene.strand, + ] + print("\t".join(cols), file=agenes_bed_out) + agenes_bed_out.close() + + # find overlapping exons + genes1_bt = pybedtools.BedTool(agenes_bed_file) + genes2_bt = pybedtools.BedTool(agenes_bed_file) + overlapping_exons = set() + for overlap in genes1_bt.intersect(genes2_bt, s=True, wo=True): + gene1_id = overlap[3] + gene1_start = int(overlap[1]) + gene1_end = int(overlap[2]) + overlapping_exons.add((gene1_id, gene1_start, gene1_end)) + + gene2_id = overlap[9] + gene2_start = int(overlap[7]) + gene2_end = int(overlap[8]) + overlapping_exons.add((gene2_id, gene2_start, gene2_end)) + + # filter for nonoverlapping exons + genes_bed_out = open(genes_bed_file, "w") + for line in open(agenes_bed_file): + a = line.split() + start = int(a[1]) + end = int(a[2]) + gene_id = a[-1] + if (gene_id, start, end) not in overlapping_exons: + print(line, end="", file=genes_bed_out) + genes_bed_out.close() + + +def make_genes_span( + genes_bed_file: str, genes_gtf_file: str, out_dir: str, stranded: bool = True +): + """Make a BED file with the span of each gene. + + Args: + genes_bed_file (str): Output BED file of genes. + genes_gtf_file (str): Input GTF file of genes. + out_dir (str): Output directory for temporary files. + stranded (bool): Perform stranded intersection. 
+ """ + # read genes + genes_gtf = pygene.GTF(genes_gtf_file) + + # write all gene spans + agenes_bed_file = "%s/genes_all.bed" % out_dir + agenes_bed_out = open(agenes_bed_file, "w") + for gene_id, gene in genes_gtf.genes.items(): + start, end = gene.span() + cols = [gene.chrom, str(start - 1), str(end), gene_id, ".", gene.strand] + print("\t".join(cols), file=agenes_bed_out) + agenes_bed_out.close() + + # find overlapping genes + genes1_bt = pybedtools.BedTool(agenes_bed_file) + genes2_bt = pybedtools.BedTool(agenes_bed_file) + overlapping_genes = set() + for overlap in genes1_bt.intersect(genes2_bt, s=stranded, wo=True): + gene1_id = overlap[3] + gene2_id = overlap[7] + if gene1_id != gene2_id: + overlapping_genes.add(gene1_id) + overlapping_genes.add(gene2_id) + + # filter for nonoverlapping genes + genes_bed_out = open(genes_bed_file, "w") + for line in open(agenes_bed_file): + gene_id = line.split()[-1] + if gene_id not in overlapping_genes: + print(line, end="", file=genes_bed_out) + genes_bed_out.close() + + +################################################################################ +# __main__ +################################################################################ +if __name__ == "__main__": + main() diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 43d908c..ad66fa3 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -45,7 +45,7 @@ def main(): parser.add_option( "-c", dest="class_min", - default=100, + default=5, type="int", help="Minimum target class size to consider [Default: %default]", ) @@ -97,6 +97,13 @@ def main(): type="str", help="File specifying target indexes and labels in table format", ) + parser.add_option( + "--target_classes", + dest="target_classes", + default=None, + type="str", + help="comma separated string of target classes", + ) parser.add_option( "--split", dest="split_label", @@ -142,19 +149,25 @@ def 
main(): # classify target_classes = [] - for ti in range(num_targets): - description = targets_df.iloc[ti].description - if description.find(":") == -1: - tc = "*" - else: - desc_split = description.split(":") - if desc_split[0] == "CHIP": - tc = "/".join(desc_split[:2]) + + if options.target_classes is None: + for ti in range(num_targets): + description = targets_df.iloc[ti].description + if description.find(":") == -1: + tc = "*" else: - tc = desc_split[0] - target_classes.append(tc) - targets_df["class"] = target_classes - target_classes = sorted(set(target_classes)) + desc_split = description.split(":") + if desc_split[0] == "CHIP": + tc = "/".join(desc_split[:2]) + else: + tc = desc_split[0] + target_classes.append(tc) + targets_df["class"] = target_classes + target_classes = sorted(set(target_classes)) + else: + targets_df["class"] = targets_df['description'].str.replace(':.*','',regex=True) + target_classes = options.target_classes.split(',') + print(target_classes) ####################################################### diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 0af7997..592a846 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -91,7 +91,7 @@ def main(): "--conv_adapter", default=None, type=str, - help="conv layer module [conv, batch_norm, squez_excit]", + help="conv layer module [conv, bn, conv_bn, squez_excit]", ) parser.add_argument( @@ -206,9 +206,12 @@ def main(): # attention adapter if args.att_adapter is not None: if args.att_adapter=='adapterHoulsby': - seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, - strand_pairs[0], - latent_size=args.att_latent) + if args.conv_adapter not in ['se', 'se_bn', 'se_all','se_all_bn']: + # when att_adapter=='Houlsby' and conv_adapter=='se', do nothing. + # see conv_adapter section. 
+ seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, + strand_pairs[0], + latent_size=args.att_latent) elif args.att_adapter=='lora': transfer_helper.add_lora(seqnn_model.model, rank=args.att_latent, @@ -228,57 +231,89 @@ def main(): if args.conv_adapter=='conv': params_added = 0 for l in seqnn_model.model.layers: - if l.name.startswith("conv1d"): + if l.name.startswith(("conv1d","separable_conv1d")): l.trainable=True params_added += transfer_helper.param_count(l, type='trainable') print('params added/unfrozen by conv: %d'%params_added) - if args.conv_adapter=='conv_all': + elif args.conv_adapter=='conv_bn': params_added = 0 for l in seqnn_model.model.layers: - if l.name.startswith(("conv1d","separable_conv1d")): + if l.name.startswith(("conv1d","separable_conv1d","batch_normalization")): l.trainable=True params_added += transfer_helper.param_count(l, type='trainable') - print('params added/unfrozen by conv_all: %d'%params_added) + print('params added/unfrozen by conv_bn: %d'%params_added) - elif args.conv_adapter=='batch_norm': + elif args.conv_adapter=='bn': params_added = 0 for l in seqnn_model.model.layers: if l.name.startswith("batch_normalization"): l.trainable=True params_added += transfer_helper.param_count(l, type='trainable') - print('params added/unfrozen by batch_norm: %d'%params_added) + print('params added/unfrozen by bn: %d'%params_added) ################## # squeeze-excite # ################## - elif args.conv_adapter=='se': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=False) - - elif args.conv_adapter=='se_bn': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=True) - - elif args.conv_adapter=='se_all': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], 
- bottleneck_ratio=args.se_ratio, - insert_mode='all', - unfreeze_bn=False) - - elif args.conv_adapter=='se_all_bn': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - bottleneck_ratio=args.se_ratio, - insert_mode='all', - unfreeze_bn=True) + elif args.conv_adapter in ['se','se_bn','se_all','se_all_bn']: + if args.att_adapter=='adapterHoulsby': + if args.conv_adapter=='se': + seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=False) + elif args.conv_adapter=='se_bn': + seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=True) + elif args.conv_adapter=='se_all': + seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='all', + unfreeze_bn=False) + elif args.conv_adapter=='se_all_bn': + seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='all', + unfreeze_bn=True) + else: + if args.conv_adapter=='se': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=False) + elif args.conv_adapter=='se_bn': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=True) + elif args.conv_adapter=='se_all': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + 
strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='all', + unfreeze_bn=False) + elif args.conv_adapter=='se_all_bn': + seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + strand_pair=strand_pairs[0], + houlsby_latent=args.att_latent, + bottleneck_ratio=args.se_ratio, + insert_mode='pre_att', + unfreeze_bn=True) ################# # final summary # @@ -307,36 +342,37 @@ def main(): ############################# if args.transfer_mode=='sparse': - # Houlsby adapter requires architecture change, overwrite params.json file with new one - if args.att_adapter=='adapterHoulsby': - transfer_helper.modify_json(input_json=args.params_file, - output_json=args.out_dir, - adapter='houlsby', - latent_size=args.att_latent) - - # merge lora weights to original, save weight to: model_best.mergeW.h5 - # use original params.json + # overwrite json file when needed + # for: adapterHoulsby and squeeze-excite + transfer_helper.modify_json(input_json=args.params_file, + output_json='%s/params.json'%args.out_dir, + adapter=args.att_adapter, + latent=args.att_latent, + conv=args.conv_adapter, + se_ratio=args.se_ratio) + + # merge weights when needed + # for: lora and ia3 + # save weight to: model_best.mergeW.h5 if args.att_adapter=='lora': - seqnn_model.model.load_weights('%s/model_best.h5'args.out_dir) + seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) transfer_helper.merge_lora(seqnn_model.model, mode='default') - seqnn_model.save('%s/model_best.mergeW.h5'args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'args.out_dir) + seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) if args.att_adapter=='lora_full': - seqnn_model.model.load_weights('%s/model_best.h5'args.out_dir) + seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) transfer_helper.merge_lora(seqnn_model.model, mode='full') - 
seqnn_model.save('%s/model_best.mergeW.h5'args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'args.out_dir) + seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) # merge ia3 weights to original, save weight to: model_best_mergeweight.h5 if args.att_adapter=='ia3': - seqnn_model.model.load_weights('%s/model_best.h5'args.out_dir) + seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) transfer_helper.merge_ia3(seqnn_model.model) - seqnn_model.save('%s/model_best.mergeW.h5'args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'args.out_dir) - + seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) + transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) - else: ######################################## # multi GPU diff --git a/src/baskerville/transfer_helper.py b/src/baskerville/transfer_helper.py index d6cd851..401cd3a 100644 --- a/src/baskerville/transfer_helper.py +++ b/src/baskerville/transfer_helper.py @@ -34,7 +34,6 @@ def param_summary(model): print('trainable params:%d' %trainable) print('non-trainable params:%d' %non_trainable) - ###################### # add houlsby layers # ###################### @@ -117,19 +116,6 @@ def add_houlsby(input_model, strand_pair, latent_size=16): return model_adapter -# save Houlsby json -def modify_json(input_json, output_json, adapter, latent=None): - - with open(input_json) as params_open: - params = json.load(params_open) - - params["model"]["trunk"][2]['adapter']= adapter - params["model"]["trunk"][2]['latent']= latent - - ### output - with open(output_json, 'w') as params_open: - json.dump(params, params_open, indent=4) - ################### # add lora layers # ################### @@ -186,49 +172,6 @@ def add_lora(input_model, rank=8, alpha=16, mode='default'): print('params added/unfrozen by lora: %d'%params_added) -# merge lora weights -def merge_lora_layer(lora_layer): - down_weights = 
lora_layer.down_layer.kernel - up_weights = lora_layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale - lora_layer.original_layer.kernel.assign_add(increment_weights) - return lora_layer.original_layer - -def merge_lora(input_model, mode='default'): - for layer in input_model.layers: - if 'multihead_attention' in layer.name: - # default loRA - layer._q_layer = merge_lora_layer(layer._q_layer) - layer._v_layer = merge_lora_layer(layer._v_layer) - if mode=='full': - layer._k_layer = merge_lora_layer(layer._k_layer) - layer._embedding_layer = merge_lora_layer(layer._embedding_layer) - input_model(input_model.input) - -# correct weights.h5 weight order -def var_reorder(weight_h5): - # assumes weight_h5 model saved with seqnn_model.save() - # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. - # model.load_weights() load weights sequencially, assuming layer weights are in the right order. - # When inserting lora/ia3, multihead_attention layer weights order changed. - # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs - # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. 
- var_init_order = ['r_w_bias:0:0', - 'r_r_bias:0:0', - 'q_layer/kernel:0', - 'k_layer/kernel:0', - 'v_layer/kernel:0', - 'embedding_layer/kernel:0', - 'embedding_layer/bias:0', - 'r_k_layer/kernel:0'] - - f = h5py.File(weight_h5, 'r+') - layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] - for l_name in layers: - new_name_order = [l_name+'/'+i for i in var_init_order] - f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) - f.close() - ################## # add ia3 layers # ################## @@ -270,23 +213,6 @@ def add_ia3(input_model): print('params added/unfrozen by ia3: %d'%params_added) -# merge lora weights -def merge_ia3_layer(ia3_layer, type='kv'): - scaler = ia3_layer._ia3_layer.kernel[0] - ia3_layer.original_layer.kernel.assign(ia3_layer.original_layer.kernel * scaler) - if type=='embedding': - ia3_layer.original_layer.bias.assign(ia3_layer.original_layer.bias * scaler) - return ia3_layer.original_layer - -def merge_ia3(input_model): - for layer in input_model.layers: - if 'multihead_attention' in layer.name: - layer._k_layer = merge_ia3_layer(layer._k_layer, type='kv') - layer._v_layer = merge_ia3_layer(layer._v_layer, type='kv') - layer._embedding_layer = merge_ia3_layer(layer._embedding_layer, type='embedding') - input_model(input_model.input) - - ###################### # add squeeze excite # ###################### @@ -344,7 +270,8 @@ def add_se(input_model, strand_pair, bottleneck_ratio=8, insert_mode='pre_att', additive=False, # use sigmoid multiplicative scaling bottleneck_ratio=bottleneck_ratio, # bottleneck ratio use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3) # near-zero weight initialization + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization + scale_fun='tanh' ) x = layer(layer_input) x = x + se_layer(x) @@ -356,7 +283,8 @@ def add_se(input_model, strand_pair, 
bottleneck_ratio=8, insert_mode='pre_att', additive=False, # use sigmoid multiplicative scaling bottleneck_ratio=bottleneck_ratio, # bottleneck ratio use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3) # near-zero weight initialization + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization + scale_fun='tanh' ) x = layer(layer_input) x = x + se_layer(x) @@ -390,3 +318,223 @@ def add_se(input_model, strand_pair, bottleneck_ratio=8, insert_mode='pre_att', print('params added/unfrozen by se_block: %d'%params_added) return model_final + + +def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio=8, insert_mode='pre_att', unfreeze_bn=False): + # add squeeze-excitation blocks after conv + # input_model should be properly frozen + # pre_att: add se_block to pre-attention conv1d + # all: add se_block to pre-attention conv1d and post-attention separable_conv1d + + if insert_mode not in ['pre_att','all']: + raise ValueError("insert_mode must be pre_att or all") + + model = tf.keras.Model(inputs=input_model.input, + outputs=input_model.layers[-2].output) # remove the switch_reverse layer + + # save current graph + layer_parent_dict_old = {} # the parent layers of each layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict_old: + layer_parent_dict_old.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict_old[layer_name]: + layer_parent_dict_old[layer_name].append(layer.name) + + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({model.layers[0].name: model.input}) + + # remove switch_reverse + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # Iterate over all layers after 
the input + model_outputs = [] + reverse_bool = None + + for layer in model.layers[1:]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if layer.name.startswith("stochastic_reverse_complement"): + x, reverse_bool = layer(layer_input) + + # insert houlsby: + elif re.match('add', layer.name): + if any([re.match('dropout', i) for i in parent_layers]): + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) + x = layer([layer_input[0], x]) + else: + x = layer(layer_input) + + # insert squeeze-excite layer: + elif layer.name.startswith("conv1d"): + se_layer = layers.SqueezeExcite( + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + bottleneck_ratio=bottleneck_ratio, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization + scale_fun='tanh' + ) + x = layer(layer_input) + x = x + se_layer(x) + + elif layer.name.startswith("separable_conv1d"): + if insert_mode=='all': + se_layer = layers.SqueezeExcite( + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + bottleneck_ratio=bottleneck_ratio, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization + scale_fun='tanh' + ) + x = layer(layer_input) + x = x + se_layer(x) + else: + x = layer(layer_input) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) + model_final = 
tf.keras.Model(inputs=model.inputs, outputs=final) + + # set trainable + for l in model_final.layers[:-2]: # trunk + if re.match('layer_normalization|adapter_houlsby', l.name): + l.trainable = True + else: + l.trainable = False + + for l in model_final.layers: # set trunk + if l.name.startswith("squeeze_excite"): l.trainable = True + + if unfreeze_bn: + for l in model_final.layers: + if l.name.startswith("batch_normalization"): l.trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_final.layers: + if l.name.startswith("squeeze_excite"): + params_added += param_count(l) + elif l.name.startswith("batch_normalization"): + if unfreeze_bn: params_added += param_count(l, type='trainable') + elif l.name.startswith("adapter_houlsby"): + params_added += param_count(l) + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type='trainable') + print('params added/unfrozen by se_block: %d'%params_added) + + return model_final + +############### +# modify json # +############### +# houlsby and squeeze-excite +def modify_json(input_json, output_json, adapter='adapterHoulsby', latent=None, conv=None, se_ratio=None): + + with open(input_json) as params_open: + params = json.load(params_open) + + # houlsby # + if adapter=='adapterHoulsby': + params["model"]["trunk"][2]['adapter']= 'houlsby' + params["model"]["trunk"][2]['latent']= latent + + # squeeze-excite # + if conv=='se_all' or conv=='se_all_bn': + for i in [0, 1, 3, 4]: + params['model']['trunk'][i]['transfer_se']=True + params['model']['trunk'][i]['se_ratio']=se_ratio + + elif conv=='se' or conv=='se_bn': + for i in [0, 1]: + params['model']['trunk'][i]['transfer_se']=True + params['model']['trunk'][i]['se_ratio']=se_ratio + + else: + pass + + ### output + with open(output_json, 'w') as params_open: + json.dump(params, params_open, indent=4) + + +###################### +# merge lora weights # +###################### +def 
merge_lora_layer(lora_layer): + down_weights = lora_layer.down_layer.kernel + up_weights = lora_layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + lora_layer.original_layer.kernel.assign_add(increment_weights) + return lora_layer.original_layer + +def merge_lora(input_model, mode='default'): + for layer in input_model.layers: + if 'multihead_attention' in layer.name: + # default loRA + layer._q_layer = merge_lora_layer(layer._q_layer) + layer._v_layer = merge_lora_layer(layer._v_layer) + if mode=='full': + layer._k_layer = merge_lora_layer(layer._k_layer) + layer._embedding_layer = merge_lora_layer(layer._embedding_layer) + input_model(input_model.input) + +# correct weights.h5 weight order +def var_reorder(weight_h5): + # assumes weight_h5 model saved with seqnn_model.save() + # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. + # model.load_weights() load weights sequencially, assuming layer weights are in the right order. + # When inserting lora/ia3, multihead_attention layer weights order changed. + # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs + # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. 
+ var_init_order = ['r_w_bias:0:0', + 'r_r_bias:0:0', + 'q_layer/kernel:0', + 'k_layer/kernel:0', + 'v_layer/kernel:0', + 'embedding_layer/kernel:0', + 'embedding_layer/bias:0', + 'r_k_layer/kernel:0'] + + f = h5py.File(weight_h5, 'r+') + layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + for l_name in layers: + new_name_order = [l_name+'/'+i for i in var_init_order] + f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + f.close() + +##################### +# merge ia3 weights # +##################### +def merge_ia3_layer(ia3_layer, type='kv'): + scaler = ia3_layer._ia3_layer.kernel[0] + ia3_layer.original_layer.kernel.assign(ia3_layer.original_layer.kernel * scaler) + if type=='embedding': + ia3_layer.original_layer.bias.assign(ia3_layer.original_layer.bias * scaler) + return ia3_layer.original_layer + +def merge_ia3(input_model): + for layer in input_model.layers: + if 'multihead_attention' in layer.name: + layer._k_layer = merge_ia3_layer(layer._k_layer, type='kv') + layer._v_layer = merge_ia3_layer(layer._v_layer, type='kv') + layer._embedding_layer = merge_ia3_layer(layer._embedding_layer, type='embedding') + input_model(input_model.input) + From 21271c957d5dc0cc06eaa9f55d371457362ec8da Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 23 Apr 2024 18:36:28 -0700 Subject: [PATCH 08/26] add --f16 for float16 inference --- src/baskerville/HY_helper.py | 3 +-- src/baskerville/scripts/borzoi_test_genes.py | 23 ++++++++++++++++++-- src/baskerville/scripts/hound_eval.py | 23 +++++++++++++++++--- src/baskerville/scripts/hound_eval_spec.py | 23 ++++++++++++++++++-- src/baskerville/seqnn.py | 6 +++++ 5 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/baskerville/HY_helper.py b/src/baskerville/HY_helper.py index 2d8b665..d4de926 100644 --- a/src/baskerville/HY_helper.py +++ b/src/baskerville/HY_helper.py @@ -4,7 +4,6 @@ import pyBigWig - def make_seq_1hot(genome_open, chrm, start, end, 
seq_len): if start < 0: seq_dna = 'N'*(-start) + genome_open.fetch(chrm, 0, end) @@ -18,7 +17,7 @@ def make_seq_1hot(genome_open, chrm, start, end, seq_len): seq_1hot = dna_io.dna_1hot(seq_dna) return seq_1hot -#Helper function to get (padded) one-hot +# Helper function to get (padded) one-hot def process_sequence(fasta_file, chrom, start, end, seq_len=524288) : fasta_open = pysam.Fastafile(fasta_file) diff --git a/src/baskerville/scripts/borzoi_test_genes.py b/src/baskerville/scripts/borzoi_test_genes.py index 1e2b853..83f1dec 100755 --- a/src/baskerville/scripts/borzoi_test_genes.py +++ b/src/baskerville/scripts/borzoi_test_genes.py @@ -27,6 +27,7 @@ from qnorm import quantile_normalize from scipy.stats import pearsonr from sklearn.metrics import explained_variance_score +from tensorflow.keras import mixed_precision from baskerville import pygene from baskerville import dataset @@ -77,6 +78,13 @@ def main(): action="store_true", help="Aggregate entire gene span [Default: %default]", ) + parser.add_option( + "--f16", + dest="f16", + default=False, + action="store_true", + help="use mixed precision for inference", + ) parser.add_option( "-t", dest="targets_file", @@ -155,8 +163,19 @@ def main(): ) # initialize model - seqnn_model = seqnn.SeqNN(params_model) - seqnn_model.restore(model_file, options.head_i) + ################### + # mixed precision # + ################### + if options.f16: + mixed_precision.set_global_policy('mixed_float16') # first set global policy + seqnn_model = seqnn.SeqNN(params_model) # then create model + seqnn_model.restore(model_file, options.head_i) + seqnn_model.append_activation() # add additional activation to cast float16 output to float32 + else: + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file, options.head_i) + seqnn_model.build_slice(targets_df.index) seqnn_model.build_ensemble(options.rc, options.shifts) diff --git a/src/baskerville/scripts/hound_eval.py 
b/src/baskerville/scripts/hound_eval.py index 8db7851..b7fca0d 100755 --- a/src/baskerville/scripts/hound_eval.py +++ b/src/baskerville/scripts/hound_eval.py @@ -23,6 +23,7 @@ from scipy.stats import spearmanr import tensorflow as tf from tqdm import tqdm +from tensorflow.keras import mixed_precision from baskerville import bed from baskerville import dataset @@ -85,6 +86,12 @@ def main(): type=int, help="Step across positions [Default: %(default)s]", ) + parser.add_argument( + "--f16", + default=False, + action="store_true", + help="use mixed precision for inference", + ) parser.add_argument( "-t", "--targets_file", @@ -140,9 +147,19 @@ def main(): tfr_pattern=args.tfr_pattern, ) - # initialize model - seqnn_model = seqnn.SeqNN(params_model) - seqnn_model.restore(args.model_file, args.head_i) + ################### + # mixed precision # + ################### + if args.f16: + mixed_precision.set_global_policy('mixed_float16') # first set global policy + seqnn_model = seqnn.SeqNN(params_model) # then create model + seqnn_model.restore(args.model_file, args.head_i) + seqnn_model.append_activation() # add additional activation to cast float16 output to float32 + else: + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(args.model_file, args.head_i) + seqnn_model.build_ensemble(args.rc, args.shifts) ####################################################### diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 0732da6..2a4608f 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -25,6 +25,7 @@ from qnorm import quantile_normalize from scipy.stats import pearsonr import tensorflow as tf +from tensorflow.keras import mixed_precision from baskerville import dataset from baskerville import seqnn @@ -74,6 +75,13 @@ def main(): type="int", help="Step across positions [Default: %default]", ) + parser.add_option( + "--f16", + dest="f16", + 
default=False, + action="store_true", + help="use mixed precision for inference", + ) parser.add_option( "--save", dest="save", @@ -190,8 +198,19 @@ def main(): ) # initialize model - seqnn_model = seqnn.SeqNN(params_model) - seqnn_model.restore(model_file, options.head_i) + ################### + # mixed precision # + ################### + if options.f16: + mixed_precision.set_global_policy('mixed_float16') # set global policy + seqnn_model = seqnn.SeqNN(params_model) # create model + seqnn_model.restore(model_file, options.head_i) + seqnn_model.append_activation() # add additional activation to cast float16 output to float32 + else: + # initialize model + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file, options.head_i) + seqnn_model.build_slice(targets_df.index) if options.step > 1: seqnn_model.step(options.step) diff --git a/src/baskerville/seqnn.py b/src/baskerville/seqnn.py index 48aa300..1ffca86 100644 --- a/src/baskerville/seqnn.py +++ b/src/baskerville/seqnn.py @@ -219,6 +219,12 @@ def build_embed(self, conv_layer_i: int, batch_norm: bool = True): inputs=self.model.inputs, outputs=conv_layer.output ) + def append_activation(self): + """add additional activation to convert float16 output to float32, required for mixed precision""" + model_0 = self.model + new_outputs = tf.keras.layers.Activation('linear', dtype='float32')(model_0.layers[-1].output) + self.model = tf.keras.Model(inputs=model_0.layers[0].input, outputs=new_outputs) + def build_ensemble(self, ensemble_rc: bool = False, ensemble_shifts=[0]): """Build ensemble of models computing on augmented input sequences.""" shift_bool = len(ensemble_shifts) > 1 or ensemble_shifts[0] != 0 From e1be2c895dd2e5f1a588e3fd3895ac827b9632f1 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 14 May 2024 12:59:08 -0700 Subject: [PATCH 09/26] make the model take variable size input --- src/baskerville/blocks.py | 2 ++ src/baskerville/layers.py | 26 +++++++++++++++++++------- 2 files changed, 21 
insertions(+), 7 deletions(-) diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py index 527dd27..8e74e31 100644 --- a/src/baskerville/blocks.py +++ b/src/baskerville/blocks.py @@ -1152,6 +1152,7 @@ def transformer( kernel_initializer="he_normal", adapter=None, latent=16, + seqlen_train=None, **kwargs, ): """Construct a transformer block. @@ -1183,6 +1184,7 @@ def transformer( initializer=mha_initializer, l2_scale=mha_l2_scale, qkv_width=qkv_width, + seqlen_train=seqlen_train )(current) # dropout diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index d0513dc..e1e974d 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -448,6 +448,7 @@ def __init__( initializer="he_normal", l2_scale=0, qkv_width=1, + seqlen_train=None ): """Creates a MultiheadAttention module. Original version written by Ziga Avsec. @@ -480,6 +481,7 @@ def __init__( self._gated = gated self._relative_position_symmetric = relative_position_symmetric self._relative_position_functions = relative_position_functions + self.seqlen_train = seqlen_train if num_position_features is None: # num_position_features needs to be divisible by the number of # relative positional functions *2 (for symmetric & asymmetric version). @@ -641,13 +643,23 @@ def call(self, inputs, training=False): else: # Project positions to form relative keys. 
distances = tf.range(-seq_len + 1, seq_len, dtype=tf.float32)[tf.newaxis] - positional_encodings = positional_features( - positions=distances, - feature_size=self._num_position_features, - seq_length=seq_len, - symmetric=self._relative_position_symmetric, - ) - # [1, 2T-1, Cr] + + if self.seqlen_train is None: + positional_encodings = positional_features( + positions=distances, + feature_size=self._num_position_features, + seq_length=seq_len, + symmetric=self._relative_position_symmetric, + ) + # [1, 2T-1, Cr] + else: + positional_encodings = positional_features( + positions=distances, + feature_size=self._num_position_features, + seq_length=self.seqlen_train, + symmetric=self._relative_position_symmetric, + ) + # [1, 2T-1, Cr] if training: positional_encodings = tf.nn.dropout( From cdf36348bdbc40e4a47008f22c8a280f93e496b5 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 18 Jun 2024 15:32:20 -0700 Subject: [PATCH 10/26] fix ia3 --- src/baskerville/layers.py | 44 ++- src/baskerville/scripts/hound_transfer.py | 18 +- src/baskerville/transfer_helper.py | 318 ++++++++++++---------- tests/data/transfer/params.json | 77 ++++++ tests/data/transfer/targets.txt | 69 +++++ tests/test_transfer/test_ia3.ipynb | 195 +++++++++++++ 6 files changed, 577 insertions(+), 144 deletions(-) create mode 100644 tests/data/transfer/params.json create mode 100644 tests/data/transfer/targets.txt create mode 100644 tests/test_transfer/test_ia3.ipynb diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index e1e974d..2bfb5cc 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -27,8 +27,8 @@ # transfer learning # ##################### class IA3(tf.keras.layers.Layer): - # activation-rescale adapter: # https://arxiv.org/pdf/2205.05638.pdf + # ia3 module for attention layer, scale output. 
def __init__(self, original_layer, @@ -69,6 +69,48 @@ def get_config(self): ) return config +class IA3_ff(tf.keras.layers.Layer): + # https://arxiv.org/pdf/2205.05638.pdf + # ia3 module for down-projection ff layer, scale input. + + def __init__(self, + original_layer, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.input_dim = original_layer.input_shape[-1] + + self.original_layer = original_layer + self.original_layer.trainable = False + + # IA3 weights. Make it a dense layer to control trainable + self._ia3_layer = tf.keras.layers.Dense( + units=self.input_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Ones(), + trainable=True, + name="ia3_ff" + ) + + def call(self, inputs): + scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + return self.original_layer(inputs * scaler) + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "size": self.input_dim + } + ) + return config + class Lora(tf.keras.layers.Layer): # adapted from: # https://arxiv.org/abs/2106.09685 diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 592a846..85864f3 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -223,8 +223,9 @@ def main(): mode='full') elif args.att_adapter=='ia3': - transfer_helper.add_ia3(seqnn_model.model) + seqnn_model.model = transfer_helper.add_ia3(seqnn_model.model, strand_pairs[0]) + ''' # conv adapter # assume seqnn_model is appropriately frozen if args.conv_adapter is not None: @@ -314,6 +315,7 @@ def main(): bottleneck_ratio=args.se_ratio, insert_mode='pre_att', unfreeze_bn=True) + ''' ################# # final summary # @@ -368,10 +370,16 @@ def main(): # 
merge ia3 weights to original, save weight to: model_best_mergeweight.h5 if args.att_adapter=='ia3': - seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) - transfer_helper.merge_ia3(seqnn_model.model) - seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) + # ia3 model + ia3_model = seqnn_model.model + ia3_model.load_weights('%s/model_best.h5'%args.out_dir) + # original model + seqnn_model2 = seqnn.SeqNN(params_model) + seqnn_model2.restore(args.restore, trunk=args.trunk) + original_model = seqnn_model2.model + # merge weights into original model + transfer_helper.merge_ia3(original_model, ia3_model) + original_model.save('%s/model_best.mergeW.h5'%args.out_dir) else: ######################################## diff --git a/src/baskerville/transfer_helper.py b/src/baskerville/transfer_helper.py index 401cd3a..72acefc 100644 --- a/src/baskerville/transfer_helper.py +++ b/src/baskerville/transfer_helper.py @@ -34,6 +34,18 @@ def param_summary(model): print('trainable params:%d' %trainable) print('non-trainable params:%d' %non_trainable) +def keras2dict(model): + layer_parent_dict = {} # the parent layers of each layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict: + layer_parent_dict.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict[layer_name]: + layer_parent_dict[layer_name].append(layer.name) + return layer_parent_dict + ###################### # add houlsby layers # ###################### @@ -46,15 +58,7 @@ def add_houlsby(input_model, strand_pair, latent_size=16): outputs=input_model.layers[-2].output) # remove the switch_reverse layer # save current graph - layer_parent_dict_old = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = 
node.outbound_layer.name - if layer_name not in layer_parent_dict_old: - layer_parent_dict_old.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict_old[layer_name]: - layer_parent_dict_old[layer_name].append(layer.name) + layer_parent_dict_old = keras2dict(model) layer_output_dict_new = {} # the output tensor of each layer in the new graph layer_output_dict_new.update({model.layers[0].name: model.input}) @@ -98,7 +102,9 @@ def add_houlsby(input_model, strand_pair, latent_size=16): final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) - # set trainable + ################# + # set trainable # + ################# for l in model_adapter.layers[:-2]: # trunk if re.match('layer_normalization|adapter_houlsby', l.name): l.trainable = True @@ -175,44 +181,192 @@ def add_lora(input_model, rank=8, alpha=16, mode='default'): ################## # add ia3 layers # ################## -def add_ia3(input_model): - # take seqnn.model as input - # replace _k_layer, _v_layer, _embedding_layer in multihead_attention +def add_ia3(input_model, strand_pair): + + #################### + # add to kv layers # + #################### for layer in input_model.layers: if re.match('multihead_attention', layer.name): layer._k_layer = layers.IA3(layer._k_layer, trainable=True) layer._v_layer = layers.IA3(layer._v_layer, trainable=True) - layer._embedding_layer = layers.IA3(layer._embedding_layer, trainable=True) - input_model(input_model.input) # instantiate model to initialize new variables + + ################### + # add to ff layer # + ################### + # save old graph to dictionary + layer_parent_dict_old = keras2dict(input_model) + + # remove switch_reverse_layer + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] - # freeze params: - for layer in 
input_model._flatten_layers(): + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + for layer in input_model.layers[1:-1]: + + # get layer inputs + parent_layers = layer_parent_dict_old[layer.name] + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + # construct + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + # transformer ff down-project layer (1536 -> 768): + elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: + x = layers.IA3_ff(layer, trainable=True)(layer_input) + else: + x = layer(layer_input) + + # save layers to dictionary + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) + + ################# + # set trainable # + ################# + for layer in model_adapter._flatten_layers(): lst_of_sublayers = list(layer._flatten_layers()) if len(lst_of_sublayers) == 1: - if layer.name =='ia3': + if layer.name in ['ia3', 'ia3_ff']: layer.trainable = True else: layer.trainable = False ### bias terms need to be frozen separately - for layer in input_model.layers: + for layer in model_adapter.layers: if re.match('multihead_attention', layer.name): layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) # set final head to be trainable - input_model.layers[-2].trainable=True + model_adapter.layers[-2].trainable=True # expected number of trainable params added/unfrozen: params_added = 0 - for l in 
input_model.layers: - if re.match('multihead_attention', l.name): + for l in model_adapter.layers: + if re.match('multihead_attention', l.name): # kv layers params_added += param_count(l._k_layer._ia3_layer) params_added += param_count(l._v_layer._ia3_layer) - params_added += param_count(l._embedding_layer._ia3_layer) + elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers + params_added += param_count(l._ia3_layer) print('params added/unfrozen by ia3: %d'%params_added) + + return model_adapter + + +############### +# modify json # +############### +# houlsby and squeeze-excite +def modify_json(input_json, output_json, adapter='adapterHoulsby', latent=None, conv=None, se_ratio=None): + + with open(input_json) as params_open: + params = json.load(params_open) + + # houlsby # + if adapter=='adapterHoulsby': + params["model"]["trunk"][2]['adapter']= 'houlsby' + params["model"]["trunk"][2]['latent']= latent + + # squeeze-excite # + if conv=='se_all' or conv=='se_all_bn': + for i in [0, 1, 3, 4]: + params['model']['trunk'][i]['transfer_se']=True + params['model']['trunk'][i]['se_ratio']=se_ratio + + elif conv=='se' or conv=='se_bn': + for i in [0, 1]: + params['model']['trunk'][i]['transfer_se']=True + params['model']['trunk'][i]['se_ratio']=se_ratio + + else: + pass + + ### output + with open(output_json, 'w') as params_open: + json.dump(params, params_open, indent=4) + + +###################### +# merge lora weights # +###################### +def merge_lora_layer(lora_layer): + down_weights = lora_layer.down_layer.kernel + up_weights = lora_layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + lora_layer.original_layer.kernel.assign_add(increment_weights) + return lora_layer.original_layer + +def merge_lora(input_model, mode='default'): + for layer in input_model.layers: + if 'multihead_attention' in layer.name: + # default loRA + layer._q_layer = merge_lora_layer(layer._q_layer) + 
layer._v_layer = merge_lora_layer(layer._v_layer) + if mode=='full': + layer._k_layer = merge_lora_layer(layer._k_layer) + layer._embedding_layer = merge_lora_layer(layer._embedding_layer) + input_model(input_model.input) + +# correct weights.h5 weight order +def var_reorder(weight_h5): + # assumes weight_h5 model saved with seqnn_model.save() + # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. + # model.load_weights() load weights sequencially, assuming h5 weights are in the right order. + # When inserting lora/ia3, multihead_attention layer weights order changed. + # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs + # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. + var_init_order = ['r_w_bias:0:0', + 'r_r_bias:0:0', + 'q_layer/kernel:0', + 'k_layer/kernel:0', + 'v_layer/kernel:0', + 'embedding_layer/kernel:0', + 'embedding_layer/bias:0', + 'r_k_layer/kernel:0'] + + f = h5py.File(weight_h5, 'r+') + layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + for l_name in layers: + new_name_order = [l_name+'/'+i for i in var_init_order] + f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + f.close() + +##################### +# merge ia3 weights # +##################### +def merge_ia3(original_model, ia3_model): + # original model contains pre-trained weights + # ia3 model is the fine-tuned ia3 model + for i, layer in enumerate(original_model.layers): + # attention layers + if re.match('multihead_attention', layer.name): + # scale k + k_scaler = ia3_model.layers[i]._k_layer._ia3_layer.kernel[0] + layer._k_layer.kernel.assign(layer._k_layer.kernel * k_scaler) + # scale v + v_scaler = ia3_model.layers[i]._v_layer._ia3_layer.kernel[0] + layer._v_layer.kernel.assign(layer._v_layer.kernel * v_scaler) + # ff layers + elif re.match('dense', 
layer.name) and layer.input_shape[-1]==1536: + ff_scaler = tf.expand_dims(ia3_model.layers[i]._ia3_layer.kernel[0], 1) + layer.kernel.assign(layer.kernel * ff_scaler) + # other layers + else: + layer.set_weights(ia3_model.layers[i].get_weights()) +''' ###################### # add squeeze excite # ###################### @@ -229,15 +383,7 @@ def add_se(input_model, strand_pair, bottleneck_ratio=8, insert_mode='pre_att', outputs=input_model.layers[-2].output) # remove the switch_reverse layer # save current graph - layer_parent_dict_old = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = node.outbound_layer.name - if layer_name not in layer_parent_dict_old: - layer_parent_dict_old.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict_old[layer_name]: - layer_parent_dict_old[layer_name].append(layer.name) + layer_parent_dict_old = keras2dict(model) layer_output_dict_new = {} # the output tensor of each layer in the new graph layer_output_dict_new.update({model.layers[0].name: model.input}) @@ -333,15 +479,7 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio= outputs=input_model.layers[-2].output) # remove the switch_reverse layer # save current graph - layer_parent_dict_old = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = node.outbound_layer.name - if layer_name not in layer_parent_dict_old: - layer_parent_dict_old.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict_old[layer_name]: - layer_parent_dict_old[layer_name].append(layer.name) + layer_parent_dict_old = keras2dict(model) layer_output_dict_new = {} # the output tensor of each layer in the new graph layer_output_dict_new.update({model.layers[0].name: model.input}) @@ -441,100 +579,4 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, 
bottleneck_ratio= print('params added/unfrozen by se_block: %d'%params_added) return model_final - -############### -# modify json # -############### -# houlsby and squeeze-excite -def modify_json(input_json, output_json, adapter='adapterHoulsby', latent=None, conv=None, se_ratio=None): - - with open(input_json) as params_open: - params = json.load(params_open) - - # houlsby # - if adapter=='adapterHoulsby': - params["model"]["trunk"][2]['adapter']= 'houlsby' - params["model"]["trunk"][2]['latent']= latent - - # squeeze-excite # - if conv=='se_all' or conv=='se_all_bn': - for i in [0, 1, 3, 4]: - params['model']['trunk'][i]['transfer_se']=True - params['model']['trunk'][i]['se_ratio']=se_ratio - - elif conv=='se' or conv=='se_bn': - for i in [0, 1]: - params['model']['trunk'][i]['transfer_se']=True - params['model']['trunk'][i]['se_ratio']=se_ratio - - else: - pass - - ### output - with open(output_json, 'w') as params_open: - json.dump(params, params_open, indent=4) - - -###################### -# merge lora weights # -###################### -def merge_lora_layer(lora_layer): - down_weights = lora_layer.down_layer.kernel - up_weights = lora_layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale - lora_layer.original_layer.kernel.assign_add(increment_weights) - return lora_layer.original_layer - -def merge_lora(input_model, mode='default'): - for layer in input_model.layers: - if 'multihead_attention' in layer.name: - # default loRA - layer._q_layer = merge_lora_layer(layer._q_layer) - layer._v_layer = merge_lora_layer(layer._v_layer) - if mode=='full': - layer._k_layer = merge_lora_layer(layer._k_layer) - layer._embedding_layer = merge_lora_layer(layer._embedding_layer) - input_model(input_model.input) - -# correct weights.h5 weight order -def var_reorder(weight_h5): - # assumes weight_h5 model saved with seqnn_model.save() - # [i.name for i in model.layers[30].weights] to check for multihead_attention layer 
weights order. - # model.load_weights() load weights sequencially, assuming layer weights are in the right order. - # When inserting lora/ia3, multihead_attention layer weights order changed. - # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs - # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. - var_init_order = ['r_w_bias:0:0', - 'r_r_bias:0:0', - 'q_layer/kernel:0', - 'k_layer/kernel:0', - 'v_layer/kernel:0', - 'embedding_layer/kernel:0', - 'embedding_layer/bias:0', - 'r_k_layer/kernel:0'] - - f = h5py.File(weight_h5, 'r+') - layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] - for l_name in layers: - new_name_order = [l_name+'/'+i for i in var_init_order] - f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) - f.close() - -##################### -# merge ia3 weights # -##################### -def merge_ia3_layer(ia3_layer, type='kv'): - scaler = ia3_layer._ia3_layer.kernel[0] - ia3_layer.original_layer.kernel.assign(ia3_layer.original_layer.kernel * scaler) - if type=='embedding': - ia3_layer.original_layer.bias.assign(ia3_layer.original_layer.bias * scaler) - return ia3_layer.original_layer - -def merge_ia3(input_model): - for layer in input_model.layers: - if 'multihead_attention' in layer.name: - layer._k_layer = merge_ia3_layer(layer._k_layer, type='kv') - layer._v_layer = merge_ia3_layer(layer._v_layer, type='kv') - layer._embedding_layer = merge_ia3_layer(layer._embedding_layer, type='embedding') - input_model(input_model.input) - +''' diff --git a/tests/data/transfer/params.json b/tests/data/transfer/params.json new file mode 100644 index 0000000..08304e2 --- /dev/null +++ b/tests/data/transfer/params.json @@ -0,0 +1,77 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + 
"total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/targets.txt b/tests/data/transfer/targets.txt new file mode 100644 index 0000000..4b23e23 --- /dev/null +++ b/tests/data/transfer/targets.txt @@ -0,0 +1,69 @@ +identifier file clip clip_soft scale sum_stat strand_pair description +0 hTERT_TP1_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_A-.w5 384 320 0.3 sum_sqrt 1 RNA:hTERT_TP1_A- +1 hTERT_TP1_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_A+.w5 384 320 0.3 sum_sqrt 0 RNA:hTERT_TP1_A+ +2 hTERT_TP1_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_B-.w5 384 320 0.3 sum_sqrt 3 RNA:hTERT_TP1_B- +3 hTERT_TP1_B+ 
/home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_B+.w5 384 320 0.3 sum_sqrt 2 RNA:hTERT_TP1_B+ +4 hTERT_TP1_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_C-.w5 384 320 0.3 sum_sqrt 5 RNA:hTERT_TP1_C- +5 hTERT_TP1_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP1_C+.w5 384 320 0.3 sum_sqrt 4 RNA:hTERT_TP1_C+ +6 hTERT_TP2_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_A-.w5 384 320 0.3 sum_sqrt 7 RNA:hTERT_TP2_A- +7 hTERT_TP2_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_A+.w5 384 320 0.3 sum_sqrt 6 RNA:hTERT_TP2_A+ +8 hTERT_TP2_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_B-.w5 384 320 0.3 sum_sqrt 9 RNA:hTERT_TP2_B- +9 hTERT_TP2_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_B+.w5 384 320 0.3 sum_sqrt 8 RNA:hTERT_TP2_B+ +10 hTERT_TP2_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_C-.w5 384 320 0.3 sum_sqrt 11 RNA:hTERT_TP2_C- +11 hTERT_TP2_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP2_C+.w5 384 320 0.3 sum_sqrt 10 RNA:hTERT_TP2_C+ +12 hTERT_TP4_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_A-.w5 384 320 0.3 sum_sqrt 13 RNA:hTERT_TP4_A- +13 hTERT_TP4_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_A+.w5 384 320 0.3 sum_sqrt 12 RNA:hTERT_TP4_A+ +14 hTERT_TP4_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_B-.w5 384 320 0.3 sum_sqrt 15 RNA:hTERT_TP4_B- +15 hTERT_TP4_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_B+.w5 384 320 0.3 sum_sqrt 14 RNA:hTERT_TP4_B+ +16 hTERT_TP4_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_C-.w5 384 320 0.3 sum_sqrt 17 RNA:hTERT_TP4_C- +17 hTERT_TP4_C+ 
/home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP4_C+.w5 384 320 0.3 sum_sqrt 16 RNA:hTERT_TP4_C+ +18 hTERT_TP5_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_A-.w5 384 320 0.3 sum_sqrt 19 RNA:hTERT_TP5_A- +19 hTERT_TP5_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_A+.w5 384 320 0.3 sum_sqrt 18 RNA:hTERT_TP5_A+ +20 hTERT_TP5_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_B-.w5 384 320 0.3 sum_sqrt 21 RNA:hTERT_TP5_B- +21 hTERT_TP5_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_B+.w5 384 320 0.3 sum_sqrt 20 RNA:hTERT_TP5_B+ +22 hTERT_TP5_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_C-.w5 384 320 0.3 sum_sqrt 23 RNA:hTERT_TP5_C- +23 hTERT_TP5_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP5_C+.w5 384 320 0.3 sum_sqrt 22 RNA:hTERT_TP5_C+ +24 hTERT_TP6_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP6_A-.w5 384 320 0.3 sum_sqrt 25 RNA:hTERT_TP6_A- +25 hTERT_TP6_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP6_A+.w5 384 320 0.3 sum_sqrt 24 RNA:hTERT_TP6_A+ +26 hTERT_TP6_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP6_B-.w5 384 320 0.3 sum_sqrt 27 RNA:hTERT_TP6_B- +27 hTERT_TP6_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP6_B+.w5 384 320 0.3 sum_sqrt 26 RNA:hTERT_TP6_B+ +28 hTERT_TP7_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_A-.w5 384 320 0.3 sum_sqrt 29 RNA:hTERT_TP7_A- +29 hTERT_TP7_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_A+.w5 384 320 0.3 sum_sqrt 28 RNA:hTERT_TP7_A+ +30 hTERT_TP7_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_B-.w5 384 320 0.3 sum_sqrt 31 RNA:hTERT_TP7_B- +31 hTERT_TP7_B+ 
/home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_B+.w5 384 320 0.3 sum_sqrt 30 RNA:hTERT_TP7_B+ +32 hTERT_TP7_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_C-.w5 384 320 0.3 sum_sqrt 33 RNA:hTERT_TP7_C- +33 hTERT_TP7_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/hTERT_TP7_C+.w5 384 320 0.3 sum_sqrt 32 RNA:hTERT_TP7_C+ +34 PDL20_TP1_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_A-.w5 384 320 0.3 sum_sqrt 35 RNA:PDL20_TP1_A- +35 PDL20_TP1_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_A+.w5 384 320 0.3 sum_sqrt 34 RNA:PDL20_TP1_A+ +36 PDL20_TP1_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_B-.w5 384 320 0.3 sum_sqrt 37 RNA:PDL20_TP1_B- +37 PDL20_TP1_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_B+.w5 384 320 0.3 sum_sqrt 36 RNA:PDL20_TP1_B+ +38 PDL20_TP1_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_C-.w5 384 320 0.3 sum_sqrt 39 RNA:PDL20_TP1_C- +39 PDL20_TP1_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL20_TP1_C+.w5 384 320 0.3 sum_sqrt 38 RNA:PDL20_TP1_C+ +40 PDL25_TP2_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_A-.w5 384 320 0.3 sum_sqrt 41 RNA:PDL25_TP2_A- +41 PDL25_TP2_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_A+.w5 384 320 0.3 sum_sqrt 40 RNA:PDL25_TP2_A+ +42 PDL25_TP2_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_B-.w5 384 320 0.3 sum_sqrt 43 RNA:PDL25_TP2_B- +43 PDL25_TP2_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_B+.w5 384 320 0.3 sum_sqrt 42 RNA:PDL25_TP2_B+ +44 PDL25_TP2_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_C-.w5 384 320 0.3 sum_sqrt 45 RNA:PDL25_TP2_C- +45 PDL25_TP2_C+ 
/home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL25_TP2_C+.w5 384 320 0.3 sum_sqrt 44 RNA:PDL25_TP2_C+ +46 PDL33_TP4_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_A-.w5 384 320 0.3 sum_sqrt 47 RNA:PDL33_TP4_A- +47 PDL33_TP4_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_A+.w5 384 320 0.3 sum_sqrt 46 RNA:PDL33_TP4_A+ +48 PDL33_TP4_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_B-.w5 384 320 0.3 sum_sqrt 49 RNA:PDL33_TP4_B- +49 PDL33_TP4_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_B+.w5 384 320 0.3 sum_sqrt 48 RNA:PDL33_TP4_B+ +50 PDL33_TP4_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_C-.w5 384 320 0.3 sum_sqrt 51 RNA:PDL33_TP4_C- +51 PDL33_TP4_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL33_TP4_C+.w5 384 320 0.3 sum_sqrt 50 RNA:PDL33_TP4_C+ +52 PDL37_TP5_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_A-.w5 384 320 0.3 sum_sqrt 53 RNA:PDL37_TP5_A- +53 PDL37_TP5_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_A+.w5 384 320 0.3 sum_sqrt 52 RNA:PDL37_TP5_A+ +54 PDL37_TP5_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_B-.w5 384 320 0.3 sum_sqrt 55 RNA:PDL37_TP5_B- +55 PDL37_TP5_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_B+.w5 384 320 0.3 sum_sqrt 54 RNA:PDL37_TP5_B+ +56 PDL37_TP5_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_C-.w5 384 320 0.3 sum_sqrt 57 RNA:PDL37_TP5_C- +57 PDL37_TP5_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL37_TP5_C+.w5 384 320 0.3 sum_sqrt 56 RNA:PDL37_TP5_C+ +58 PDL46_TP6_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL46_TP6_A-.w5 384 320 0.3 sum_sqrt 59 RNA:PDL46_TP6_A- +59 PDL46_TP6_A+ 
/home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL46_TP6_A+.w5 384 320 0.3 sum_sqrt 58 RNA:PDL46_TP6_A+ +60 PDL46_TP6_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL46_TP6_B-.w5 384 320 0.3 sum_sqrt 61 RNA:PDL46_TP6_B- +61 PDL46_TP6_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL46_TP6_B+.w5 384 320 0.3 sum_sqrt 60 RNA:PDL46_TP6_B+ +62 PDL50_TP7_A- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_A-.w5 384 320 0.3 sum_sqrt 63 RNA:PDL50_TP7_A- +63 PDL50_TP7_A+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_A+.w5 384 320 0.3 sum_sqrt 62 RNA:PDL50_TP7_A+ +64 PDL50_TP7_B- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_B-.w5 384 320 0.3 sum_sqrt 65 RNA:PDL50_TP7_B- +65 PDL50_TP7_B+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_B+.w5 384 320 0.3 sum_sqrt 64 RNA:PDL50_TP7_B+ +66 PDL50_TP7_C- /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_C-.w5 384 320 0.3 sum_sqrt 67 RNA:PDL50_TP7_C- +67 PDL50_TP7_C+ /home/yuanh/analysis/Borzoi_transfer/public_dataset/hayflick/RNA/w5/PDL50_TP7_C+.w5 384 320 0.3 sum_sqrt 66 RNA:PDL50_TP7_C+ diff --git a/tests/test_transfer/test_ia3.ipynb b/tests/test_transfer/test_ia3.ipynb new file mode 100644 index 0000000..b51fdd0 --- /dev/null +++ b/tests/test_transfer/test_ia3.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "73892ca7-0ef3-42fa-8f58-db1476625022", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-17 16:32:27.975305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler 
flags.\n", + "2024-06-17 16:32:40.663143: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + } + ], + "source": [ + "import re\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "from baskerville import seqnn\n", + "from baskerville import layers\n", + "from baskerville import transfer_helper" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "480898e4-1a79-4133-b88f-ae6e2ea4835e", + "metadata": {}, + "outputs": [], + "source": [ + "# test if trainable param match expectation\n", + "def test_add_ia3(model_final):\n", + "\n", + " # expected trainable\n", + " params_added = 0\n", + " for l in model_final.layers:\n", + " if re.match('multihead_attention', l.name): # kv layers\n", + " params_added += transfer_helper.param_count(l._k_layer._ia3_layer)\n", + " params_added += transfer_helper.param_count(l._v_layer._ia3_layer)\n", + " elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers\n", + " params_added += transfer_helper.param_count(l._ia3_layer)\n", + " \n", + " params_head = transfer_helper.param_count(model_final.layers[-2])\n", + " print('expect params (ia3): %d'%params_added)\n", + " print('expect params (head): %d' % params_head)\n", + " print('expect params (total): %d' % (params_head + params_added))\n", + "\n", + " # observed trainable\n", + " c1 = transfer_helper.param_count(model_final, 'trainable')\n", + " print('trainable count: %d' %c1)\n", + " \n", + " assert c1==(params_head+params_added)\n", + " print(\"assert passed. 
trainable params match expectation.\") \n", + "\n", + "# test at initialization, output is the same\n", + "def test_add_ia3_2(model_final):\n", + " random_input = np.random.rand(1, model_final.input_shape[-2], model_final.input_shape[-1])\n", + " output_original = seqnn_model.model(random_input).numpy()\n", + " output_ia3 = model_final(random_input).numpy()\n", + " \n", + " assert np.allclose(output_original, output_ia3)\n", + " print(\"assert passed. at initialization, ia3 output same as pre-train.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6bab8fb3-981a-43ef-992f-8cb54f991410", + "metadata": {}, + "outputs": [], + "source": [ + "test_data_dir = '/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer'\n", + "params_file = '%s/params.json' %test_data_dir\n", + "targets_file = '%s/targets.txt' %test_data_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "95ec1814-82c3-473e-894c-b9a3d608bb9a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-17 16:32:52.777625: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22807 MB memory: -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:1a:00.0, compute capability: 7.5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "params added/unfrozen by ia3: 20480\n" + ] + } + ], + "source": [ + "###################\n", + "# pre-train model #\n", + "###################\n", + "targets_df = pd.read_csv(targets_file, index_col=0, sep=\"\\t\")\n", + "with open(params_file) as params_open:\n", + " params = json.load(params_open)\n", + "params_model = params[\"model\"]\n", + "params_train = params[\"train\"]\n", + "params_model['verbose'] = False\n", + "\n", + "# set strand pairs\n", + "if \"strand_pair\" in targets_df.columns:\n", + " params_model[\"strand_pair\"] = [np.array(targets_df.strand_pair)]\n", + 
"\n", + "seqnn_model = seqnn.SeqNN(params_model)\n", + "strand_pair = np.array(targets_df.strand_pair)\n", + "\n", + "#############\n", + "# ia3 model #\n", + "#############\n", + "model_final = transfer_helper.add_ia3(seqnn_model.model, strand_pair)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ed3f5c61-8830-4d39-925b-200b01ad1fa5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "expect params (ia3): 20480\n", + "expect params (head): 52292\n", + "expect params (total): 72772\n", + "trainable count: 72772\n", + "assert passed. trainable params match expectation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-17 16:33:00.103821: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "assert passed. at initialization, ia3 same as pre-train.\n" + ] + } + ], + "source": [ + "test_add_ia3(model_final)\n", + "test_add_ia3_2(model_final)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e500e589-ef52-4775-9ade-caca53c42035", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From c5b6724c0a200adb1c9e59b60ba6974861a874f8 Mon Sep 17 00:00:00 2001 From: hy395 Date: Wed, 3 Jul 2024 00:32:52 -0700 Subject: [PATCH 11/26] add se_adapter and locon --- src/baskerville/HY_helper.py | 75 --- src/baskerville/blocks.py | 43 +- .../{ => helpers}/transfer_helper.py | 551 ++++++++++-------- 
src/baskerville/layers.py | 100 +++- src/baskerville/scripts/hound_transfer.py | 228 +++----- src/baskerville/seqnn.py | 24 +- src/baskerville/trainer.py | 48 +- tests/test_transfer/test_ia3.ipynb | 2 +- 8 files changed, 538 insertions(+), 533 deletions(-) delete mode 100644 src/baskerville/HY_helper.py rename src/baskerville/{ => helpers}/transfer_helper.py (61%) diff --git a/src/baskerville/HY_helper.py b/src/baskerville/HY_helper.py deleted file mode 100644 index d4de926..0000000 --- a/src/baskerville/HY_helper.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -from basenji import dna_io -import pysam -import pyBigWig - - -def make_seq_1hot(genome_open, chrm, start, end, seq_len): - if start < 0: - seq_dna = 'N'*(-start) + genome_open.fetch(chrm, 0, end) - else: - seq_dna = genome_open.fetch(chrm, start, end) - - #Extend to full length - if len(seq_dna) < seq_len: - seq_dna += 'N'*(seq_len-len(seq_dna)) - - seq_1hot = dna_io.dna_1hot(seq_dna) - return seq_1hot - -# Helper function to get (padded) one-hot -def process_sequence(fasta_file, chrom, start, end, seq_len=524288) : - - fasta_open = pysam.Fastafile(fasta_file) - seq_len_actual = end - start - - #Pad sequence to input window size - start -= (seq_len - seq_len_actual) // 2 - end += (seq_len - seq_len_actual) // 2 - - #Get one-hot - sequence_one_hot = make_seq_1hot(fasta_open, chrom, start, end, seq_len) - - return sequence_one_hot.astype('float32') - -def compute_cov(seqnn_model, chr, start, end): - seq_len = seqnn_model.model.layers[0].input.shape[1] - seq1hot = process_sequence('/home/yuanh/programs/genomes/hg38/hg38.fa', chr, start, end, seq_len=seq_len) - out = seqnn_model.model(seq1hot[None, ]) - return out.numpy() - -def write_bw(bw_file, chr, start, end, values, span=32): - bw_out = pyBigWig.open(bw_file, 'w') - header = [] - header.append((chr, end+1)) - bw_out.addHeader(header) - bw_out.addEntries(chr, start, values=values, span=span, step=span) - bw_out.close() - -def transform(seq_cov, 
clip=384, clip_soft=320, scale=0.3): - seq_cov = scale * seq_cov # scale - seq_cov = -1 + np.sqrt(1+seq_cov) # variant stabilize - clip_mask = (seq_cov > clip_soft) # soft clip - seq_cov[clip_mask] = clip_soft-1 + np.sqrt(seq_cov[clip_mask] - clip_soft+1) - seq_cov = np.clip(seq_cov, -clip, clip) # hard clip - return seq_cov - -def untransform(cov, scale=0.3, clip_soft=320, pool_width=32): - - # undo clip_soft - cov_unclipped = (cov - clip_soft + 1)**2 + clip_soft - 1 - unclip_mask = (cov > clip_soft) - cov[unclip_mask] = cov_unclipped[unclip_mask] - - # undo sqrt - cov = (cov +1)**2 - 1 - - # undo scale - cov = cov / scale - - # undo sum - cov = cov / pool_width - - return cov - - diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py index 8e74e31..ffbdb75 100644 --- a/src/baskerville/blocks.py +++ b/src/baskerville/blocks.py @@ -149,8 +149,6 @@ def conv_dna( conv_type="standard", kernel_initializer="he_normal", padding="same", - transfer_se=False, - se_ratio=16, ): """Construct a single convolution block, assumed to be operating on DNA. @@ -197,19 +195,7 @@ def conv_dna( kernel_initializer=kernel_initializer, kernel_regularizer=tf.keras.regularizers.l2(l2_scale), )(current) - - # squeeze-excite for transfer - if transfer_se: - se_out = squeeze_excite(current, - activation=None, - additive=False, - bottleneck_ratio=se_ratio, - use_bias=False, - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), - scale_fun='tanh' - ) - current = current + se_out - + # squeeze-excite if se: current = squeeze_excite(current) @@ -281,8 +267,6 @@ def conv_nac( kernel_initializer="he_normal", padding="same", se=False, - transfer_se=False, - se_ratio=16, ): """Construct a single convolution block. 
@@ -342,18 +326,6 @@ def conv_nac( kernel_regularizer=tf.keras.regularizers.l2(l2_scale), )(current) - # squeeze-excite for transfer - if transfer_se: - se_out = squeeze_excite(current, - activation=None, - additive=False, - bottleneck_ratio=se_ratio, - use_bias=False, - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), - scale_fun='tanh' - ) - current = current + se_out - # squeeze-excite if se: current = squeeze_excite(current) @@ -484,8 +456,6 @@ def unet_conv( bn_momentum=0.99, kernel_size=1, kernel_initializer="he_normal", - transfer_se=False, - se_ratio=16, upsample_conv=False, ): """Construct a feature pyramid network block. @@ -561,17 +531,6 @@ def unet_conv( kernel_initializer=kernel_initializer, )(current) - if transfer_se: - se_out = squeeze_excite(current, - activation=None, - additive=False, - bottleneck_ratio=se_ratio, - use_bias=False, - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), - scale_fun='tanh' - ) - current = current + se_out - # dropout if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) diff --git a/src/baskerville/transfer_helper.py b/src/baskerville/helpers/transfer_helper.py similarity index 61% rename from src/baskerville/transfer_helper.py rename to src/baskerville/helpers/transfer_helper.py index 72acefc..aa178b6 100644 --- a/src/baskerville/transfer_helper.py +++ b/src/baskerville/helpers/transfer_helper.py @@ -46,33 +46,92 @@ def keras2dict(model): layer_parent_dict[layer_name].append(layer.name) return layer_parent_dict +# lora requires change model.h5 weight order. +# locon and ia3 don't modify model in place. +def var_reorder(weight_h5): + # assumes weight_h5 model saved with seqnn_model.save() + # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. + # model.load_weights() load weights sequencially, assuming h5 weights are in the right order. + # When inserting lora, multihead_attention layer weights order changed. 
+ # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs + # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. + var_init_order = ['r_w_bias:0:0', + 'r_r_bias:0:0', + 'q_layer/kernel:0', + 'k_layer/kernel:0', + 'v_layer/kernel:0', + 'embedding_layer/kernel:0', + 'embedding_layer/bias:0', + 'r_k_layer/kernel:0'] + + f = h5py.File(weight_h5, 'r+') + layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + for l_name in layers: + new_name_order = [l_name+'/'+i for i in var_init_order] + f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + f.close() + + +# houlsby requires architecture change. +# thus we need to modify json. +def modify_json(input_json, output_json, adapter, latent=8, se_rank=None, conv_select=None): + + with open(input_json) as params_open: + params = json.load(params_open) + + # houlsby + if adapter=='adapterHoulsby': + params["model"]['adapter']= 'houlsby' + params["model"]['adapter_latent']= latent + + # houlsby_se + elif adapter=='houlsby_se': + params["model"]['adapter']= 'houlsby_se' + params["model"]['adapter_latent']= latent + params["model"]['se_rank']= se_rank + params["model"]['conv_select']= conv_select + + else: + raise ValueError("adapter must be adapterHoulsby or houlsby_se") + + ### output + with open(output_json, 'w') as params_open: + json.dump(params, params_open, indent=4) + ###################### # add houlsby layers # ###################### -def add_houlsby(input_model, strand_pair, latent_size=16): +def add_houlsby(input_model, strand_pair, latent_size=8): # take seqnn_model as input # output a new seqnn_model object # only the adapter, and layer_norm are trainable - - model = tf.keras.Model(inputs=input_model.input, - outputs=input_model.layers[-2].output) # remove the switch_reverse layer - - # save current graph - layer_parent_dict_old = 
keras2dict(model) - - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({model.layers[0].name: model.input}) - - # remove switch_reverse + + ################## + # houlsby layers # + ################## + houlsby_layers = [] + for i in range(len(input_model.layers)-1): + layer = input_model.layers[i] + next_layer = input_model.layers[i+1] + if re.match('dropout', layer.name) and re.match('add', next_layer.name): + houlsby_layers += [next_layer.name] + + ################### + # construct model # + ################### + layer_parent_dict_old = keras2dict(input_model) + # remove switch_reverse_layer to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] for i in to_fix: del layer_parent_dict_old[i] - + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) # Iterate over all layers after the input model_outputs = [] reverse_bool = None - - for layer in model.layers[1:]: + + for layer in input_model.layers[1:-1]: # parent layers parent_layers = layer_parent_dict_old[layer.name] @@ -84,14 +143,11 @@ def add_houlsby(input_model, strand_pair, latent_size=16): if re.match('stochastic_reverse_complement', layer.name): x, reverse_bool = layer(layer_input) - # insert adapter: - elif re.match('add', layer.name): - if any([re.match('dropout', i) for i in parent_layers]): - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) - x = layer([layer_input[0], x]) - else: - x = layer(layer_input) + # insert houlsby layer: + elif layer.name in houlsby_layers: + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) + x = layer([layer_input[0], x]) else: x = layer(layer_input) @@ -99,12 +155,10 @@ def add_houlsby(input_model, strand_pair, latent_size=16): # save the 
output tensor of every layer layer_output_dict_new.update({layer.name: x}) - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=model.inputs, outputs=final) + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - ################# # set trainable # - ################# for l in model_adapter.layers[:-2]: # trunk if re.match('layer_normalization|adapter_houlsby', l.name): l.trainable = True @@ -122,10 +176,10 @@ def add_houlsby(input_model, strand_pair, latent_size=16): return model_adapter -################### -# add lora layers # -################### -def add_lora(input_model, rank=8, alpha=16, mode='default'): +############### +# lora layers # +############### +def add_lora(input_model, rank=8, alpha=16, mode='default', report_param=True): # take seqnn.model as input # replace _q_layer, _v_layer in multihead_attention # optionally replace _k_layer, _embedding_layer @@ -175,25 +229,81 @@ def add_lora(input_model, rank=8, alpha=16, mode='default'): params_added += param_count(l._k_layer.up_layer) params_added += param_count(l._embedding_layer.down_layer) params_added += param_count(l._embedding_layer.up_layer) + + if report_param: + print('params added/unfrozen by lora: %d'%params_added) + +############### +# lora layers # +############### +def add_lora_conv(input_model, conv_select=None): + + # add lora layers + add_lora(input_model, rank=8, alpha=16, mode='default', report_param=False) + + # list all conv layers + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + if conv_select is None: + conv_select = len(conv_layers) + if conv_select > len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + + # set conv layers 
trainable + trainable_conv = conv_layers[-conv_select:] + for layer in input_model.layers: + if layer.name in trainable_conv: + layer.trainable=True - print('params added/unfrozen by lora: %d'%params_added) + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in input_model.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._q_layer.down_layer) + params_added += param_count(l._q_layer.up_layer) + params_added += param_count(l._v_layer.down_layer) + params_added += param_count(l._v_layer.up_layer) + elif l.name in trainable_conv: + params_added += param_count(l) + + print('params added/unfrozen by lora_conv: %d'%params_added) -################## -# add ia3 layers # -################## +# merge lora weights # +def merge_lora_layer(lora_layer): + down_weights = lora_layer.down_layer.kernel + up_weights = lora_layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + lora_layer.original_layer.kernel.assign_add(increment_weights) + return lora_layer.original_layer + +def merge_lora(input_model): + for layer in input_model.layers: + if 'multihead_attention' in layer.name: + if isinstance(layer._q_layer, layers.Lora): + layer._q_layer = merge_lora_layer(layer._q_layer) + if isinstance(layer._v_layer, layers.Lora): + layer._v_layer = merge_lora_layer(layer._v_layer) + if isinstance(layer._k_layer, layers.Lora): + layer._k_layer = merge_lora_layer(layer._k_layer) + if isinstance(layer._embedding_layer, layers.Lora): + layer._embedding_layer = merge_lora_layer(layer._embedding_layer) + input_model(input_model.input) + + +############## +# IA3 layers # +############## def add_ia3(input_model, strand_pair): - #################### # add to kv layers # - #################### for layer in input_model.layers: if re.match('multihead_attention', layer.name): layer._k_layer = layers.IA3(layer._k_layer, trainable=True) layer._v_layer = layers.IA3(layer._v_layer, 
trainable=True) - ################### # add to ff layer # - ################### # save old graph to dictionary layer_parent_dict_old = keras2dict(input_model) @@ -231,9 +341,7 @@ def add_ia3(input_model, strand_pair): final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - ################# # set trainable # - ################# for layer in model_adapter._flatten_layers(): lst_of_sublayers = list(layer._flatten_layers()) if len(lst_of_sublayers) == 1: @@ -264,88 +372,6 @@ def add_ia3(input_model, strand_pair): return model_adapter - -############### -# modify json # -############### -# houlsby and squeeze-excite -def modify_json(input_json, output_json, adapter='adapterHoulsby', latent=None, conv=None, se_ratio=None): - - with open(input_json) as params_open: - params = json.load(params_open) - - # houlsby # - if adapter=='adapterHoulsby': - params["model"]["trunk"][2]['adapter']= 'houlsby' - params["model"]["trunk"][2]['latent']= latent - - # squeeze-excite # - if conv=='se_all' or conv=='se_all_bn': - for i in [0, 1, 3, 4]: - params['model']['trunk'][i]['transfer_se']=True - params['model']['trunk'][i]['se_ratio']=se_ratio - - elif conv=='se' or conv=='se_bn': - for i in [0, 1]: - params['model']['trunk'][i]['transfer_se']=True - params['model']['trunk'][i]['se_ratio']=se_ratio - - else: - pass - - ### output - with open(output_json, 'w') as params_open: - json.dump(params, params_open, indent=4) - - -###################### -# merge lora weights # -###################### -def merge_lora_layer(lora_layer): - down_weights = lora_layer.down_layer.kernel - up_weights = lora_layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale - lora_layer.original_layer.kernel.assign_add(increment_weights) - return lora_layer.original_layer - -def merge_lora(input_model, mode='default'): - for layer 
in input_model.layers: - if 'multihead_attention' in layer.name: - # default loRA - layer._q_layer = merge_lora_layer(layer._q_layer) - layer._v_layer = merge_lora_layer(layer._v_layer) - if mode=='full': - layer._k_layer = merge_lora_layer(layer._k_layer) - layer._embedding_layer = merge_lora_layer(layer._embedding_layer) - input_model(input_model.input) - -# correct weights.h5 weight order -def var_reorder(weight_h5): - # assumes weight_h5 model saved with seqnn_model.save() - # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. - # model.load_weights() load weights sequencially, assuming h5 weights are in the right order. - # When inserting lora/ia3, multihead_attention layer weights order changed. - # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs - # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. - var_init_order = ['r_w_bias:0:0', - 'r_r_bias:0:0', - 'q_layer/kernel:0', - 'k_layer/kernel:0', - 'v_layer/kernel:0', - 'embedding_layer/kernel:0', - 'embedding_layer/bias:0', - 'r_k_layer/kernel:0'] - - f = h5py.File(weight_h5, 'r+') - layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] - for l_name in layers: - new_name_order = [l_name+'/'+i for i in var_init_order] - f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) - f.close() - -##################### -# merge ia3 weights # -##################### def merge_ia3(original_model, ia3_model): # original model contains pre-trained weights # ia3 model is the fine-tuned ia3 model @@ -366,134 +392,187 @@ def merge_ia3(original_model, ia3_model): else: layer.set_weights(ia3_model.layers[i].get_weights()) -''' -###################### -# add squeeze excite # -###################### -def add_se(input_model, strand_pair, bottleneck_ratio=8, insert_mode='pre_att', unfreeze_bn=False): - # add 
squeeze-excitation blocks after conv - # input_model should be properly frozen - # pre_att: add se_block to pre-attention conv1d - # all: add se_block to pre-attention conv1d and post-attention separable_conv1d - - if insert_mode not in ['pre_att','all']: - raise ValueError("insert_mode must be pre_att or all") +############# +# add locon # +############# +def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): - model = tf.keras.Model(inputs=input_model.input, - outputs=input_model.layers[-2].output) # remove the switch_reverse layer + # first add lora to attention + add_lora(input_model, report_param=False) - # save current graph - layer_parent_dict_old = keras2dict(model) + # decide: + # 1. whether conv1 is trainable + # 2. which conv layers to add loRA - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({model.layers[0].name: model.input}) + # all conv layers + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + + if conv_select is None: + conv_select = len(conv_layers) + + if conv_select > len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + + locon_layers = [] + conv1_tune = False + if conv_select == len(conv_layers): + locon_layers = conv_layers[1:] + conv1_tune = True + else: + locon_layers = conv_layers[-conv_select:] + + layer_parent_dict_old = keras2dict(input_model) - # remove switch_reverse + # remove switch_reverse_layer to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] for i in to_fix: del layer_parent_dict_old[i] + + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) # Iterate over all layers after the input model_outputs = [] reverse_bool = None + for layer in input_model.layers[1:-1]: - 
for layer in model.layers[1:]: - - # parent layers + # get layer inputs parent_layers = layer_parent_dict_old[layer.name] - - # layer inputs layer_input = [layer_output_dict_new[parent] for parent in parent_layers] if len(layer_input) == 1: layer_input = layer_input[0] - if layer.name.startswith("stochastic_reverse_complement"): + # construct + if re.match('stochastic_reverse_complement', layer.name): x, reverse_bool = layer(layer_input) - - # insert squeeze-excite layer: - elif layer.name.startswith("conv1d"): - se_layer = layers.SqueezeExcite( - activation=None, # no activation before squeezing - additive=False, # use sigmoid multiplicative scaling - bottleneck_ratio=bottleneck_ratio, # bottleneck ratio - use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization - scale_fun='tanh' - ) - x = layer(layer_input) - x = x + se_layer(x) - - elif layer.name.startswith("separable_conv1d"): - if insert_mode=='all': - se_layer = layers.SqueezeExcite( - activation=None, # no activation before squeezing - additive=False, # use sigmoid multiplicative scaling - bottleneck_ratio=bottleneck_ratio, # bottleneck ratio - use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization - scale_fun='tanh' - ) - x = layer(layer_input) - x = x + se_layer(x) - else: - x = layer(layer_input) - + elif layer.name in locon_layers: + x = layers.Locon(layer, trainable=True, rank=rank, alpha=alpha)(layer_input) else: x = layer(layer_input) - # save the output tensor of every layer + # save layers to dictionary layer_output_dict_new.update({layer.name: x}) - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) - model_final = tf.keras.Model(inputs=model.inputs, outputs=final) - - # unfreeze layers - for l in model_final.layers: # set trunk - if l.name.startswith("squeeze_excite"): l.trainable = 
True + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - if unfreeze_bn: - for l in model_final.layers: - if l.name.startswith("batch_normalization"): l.trainable=True + if conv1_tune: + model_adapter.get_layer(name=conv_layers[0]).trainable = True # expected number of trainable params added/unfrozen: params_added = 0 - for l in model_final.layers: - if l.name.startswith("squeeze_excite"): - params_added += param_count(l) - elif l.name.startswith("batch_normalization"): - if unfreeze_bn: params_added += param_count(l, type='trainable') - print('params added/unfrozen by se_block: %d'%params_added) - - return model_final + if conv1_tune: + params_added += param_count(model_adapter.get_layer(name=conv_layers[0])) + for l in model_adapter.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._q_layer.down_layer) + params_added += param_count(l._q_layer.up_layer) + params_added += param_count(l._v_layer.down_layer) + params_added += param_count(l._v_layer.up_layer) + if l.name in locon_layers: + params_added += param_count(l.down_layer) + params_added += param_count(l.up_layer) + + print('params added/unfrozen by lora: %d'%params_added) + return model_adapter -def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio=8, insert_mode='pre_att', unfreeze_bn=False): +#### functions to merge locon +def lora_increment(layer): + down_weights = layer.down_layer.kernel + up_weights = layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * layer.scale + return increment_weights + +def locon_increment(layer): + down_weights = layer.down_layer.kernel + up_weights = layer.up_layer.kernel[0] + increment_weights = tf.einsum("abc,cd->abd", down_weights, up_weights) * layer.scale + return increment_weights + +def merge_locon(original_model, locon_model): + # 
original model contains pre-trained weights + for i, layer in enumerate(original_model.layers): + + # lora layers + if re.match('multihead_attention', layer.name): + q = locon_model.layers[i]._q_layer + k = locon_model.layers[i]._k_layer + v = locon_model.layers[i]._v_layer + e = locon_model.layers[i]._embedding_layer + if isinstance(q, layers.Lora): + increment_weights = lora_increment(q) + layer._q_layer.kernel.assign_add(increment_weights) + if isinstance(v, layers.Lora): + increment_weights = lora_increment(v) + layer._v_layer.kernel.assign_add(increment_weights) + if isinstance(k, layers.Lora): + increment_weights = lora_increment(k) + layer._k_layer.kernel.assign_add(increment_weights) + if isinstance(e, layers.Lora): + increment_weights = lora_increment(e) + layer._embedding_layer.kernel.assign_add(increment_weights) + + # locon layers + elif isinstance(locon_model.layers[i], layers.Locon): + increment_weights = locon_increment(locon_model.layers[i]) + layer.kernel.assign_add(increment_weights) + + else: + layer.set_weights(locon_model.layers[i].get_weights()) + + +############## +# houlsby_se # +############## +def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, se_rank=16): # add squeeze-excitation blocks after conv # input_model should be properly frozen # pre_att: add se_block to pre-attention conv1d # all: add se_block to pre-attention conv1d and post-attention separable_conv1d - - if insert_mode not in ['pre_att','all']: - raise ValueError("insert_mode must be pre_att or all") - model = tf.keras.Model(inputs=input_model.input, - outputs=input_model.layers[-2].output) # remove the switch_reverse layer - - # save current graph - layer_parent_dict_old = keras2dict(model) - - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({model.layers[0].name: model.input}) - - # remove switch_reverse + ################## + # houlsby layers # + ################## + houlsby_layers = 
[] + for i in range(len(input_model.layers)-1): + layer = input_model.layers[i] + next_layer = input_model.layers[i+1] + if re.match('dropout', layer.name) and re.match('add', next_layer.name): + houlsby_layers += [next_layer.name] + + ############# + # SE layers # + ############# + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + if conv_select is None: + se_layers = conv_layers[1:] + if conv_select >= len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + se_layers = conv_layers[-conv_select:] + + ################### + # construct model # + ################### + layer_parent_dict_old = keras2dict(input_model) + # remove switch_reverse_layer to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] for i in to_fix: del layer_parent_dict_old[i] - + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) # Iterate over all layers after the input model_outputs = [] reverse_bool = None - for layer in model.layers[1:]: + for layer in input_model.layers[1:-1]: # parent layers parent_layers = layer_parent_dict_old[layer.name] @@ -505,42 +584,24 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio= if layer.name.startswith("stochastic_reverse_complement"): x, reverse_bool = layer(layer_input) - # insert houlsby: - elif re.match('add', layer.name): - if any([re.match('dropout', i) for i in parent_layers]): - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) - x = layer([layer_input[0], x]) - else: - x = layer(layer_input) + # insert houlsby layer: + elif layer.name in houlsby_layers: + print('adapter added before:%s'%layer.name) + x = layers.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) + x = 
layer([layer_input[0], x]) # insert squeeze-excite layer: - elif layer.name.startswith("conv1d"): + elif layer.name in se_layers: se_layer = layers.SqueezeExcite( activation=None, # no activation before squeezing additive=False, # use sigmoid multiplicative scaling - bottleneck_ratio=bottleneck_ratio, # bottleneck ratio + rank=se_rank, # bottleneck ratio use_bias=False, # ignore bias kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization scale_fun='tanh' ) x = layer(layer_input) x = x + se_layer(x) - - elif layer.name.startswith("separable_conv1d"): - if insert_mode=='all': - se_layer = layers.SqueezeExcite( - activation=None, # no activation before squeezing - additive=False, # use sigmoid multiplicative scaling - bottleneck_ratio=bottleneck_ratio, # bottleneck ratio - use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization - scale_fun='tanh' - ) - x = layer(layer_input) - x = x + se_layer(x) - else: - x = layer(layer_input) else: x = layer(layer_input) @@ -548,8 +609,8 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio= # save the output tensor of every layer layer_output_dict_new.update({layer.name: x}) - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[model.layers[-1].name], reverse_bool]) - model_final = tf.keras.Model(inputs=model.inputs, outputs=final) + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_final = tf.keras.Model(inputs=input_model.inputs, outputs=final) # set trainable for l in model_final.layers[:-2]: # trunk @@ -561,22 +622,14 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, bottleneck_ratio= for l in model_final.layers: # set trunk if l.name.startswith("squeeze_excite"): l.trainable = True - if unfreeze_bn: - for l in model_final.layers: - if 
l.name.startswith("batch_normalization"): l.trainable=True - # expected number of trainable params added/unfrozen: params_added = 0 for l in model_final.layers: - if l.name.startswith("squeeze_excite"): - params_added += param_count(l) - elif l.name.startswith("batch_normalization"): - if unfreeze_bn: params_added += param_count(l, type='trainable') - elif l.name.startswith("adapter_houlsby"): + if re.match('squeeze_excite|adapter_houlsby', l.name): params_added += param_count(l) elif l.name.startswith("layer_normalization"): params_added += param_count(l, type='trainable') - print('params added/unfrozen by se_block: %d'%params_added) + print('params added/unfrozen by houlsby_se: %d'%params_added) return model_final -''' + diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index 2bfb5cc..c8acef3 100644 --- a/src/baskerville/layers.py +++ b/src/baskerville/layers.py @@ -149,7 +149,7 @@ def __init__(self, use_bias=False, kernel_initializer=tf.keras.initializers.HeUniform(), #kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), - trainable=trainable, + trainable=True, name="lora_a" ) @@ -157,7 +157,7 @@ def __init__(self, units=self.output_dim, use_bias=False, kernel_initializer=tf.keras.initializers.Zeros(), - trainable=trainable, + trainable=True, name="lora_b" ) @@ -176,6 +176,83 @@ def get_config(self): ) return config +class Locon(tf.keras.layers.Layer): + # LoRA for conv-layer, adapted from: + # https://arxiv.org/pdf/2309.14859#page=23.84 + # https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/modules/locon.py + # use default alpha and rank for locon + + def __init__(self, + original_layer, + rank=4, + alpha=1, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original conv layer. 
+ original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.input_dim = original_layer.input_shape[-1] + self.output_dim = original_layer_config["filters"] + + if rank > self.output_dim: + raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + + self.rank = rank + self.alpha = alpha + self.scale = alpha / rank + self.original_layer = original_layer + self.original_layer.trainable = False + + input_dim = original_layer.input_shape[-1] + output_dim = original_layer_config["filters"] + kernel_size = original_layer_config['kernel_size'][0] + stride = original_layer_config['strides'][0] + dilation_rate = original_layer_config["dilation_rate"][0] + + # Note: the original paper mentions that normal distribution was + # used for initialization. However, the official LoRA implementation + # uses "Kaiming/He Initialization". + + self.down_layer = tf.keras.layers.Conv1D( + filters=rank, + kernel_size=kernel_size, + strides=stride, + padding="same", + use_bias=False, + dilation_rate=dilation_rate, + kernel_initializer=tf.keras.initializers.HeUniform(), + name='locon_down' + ) + + self.up_layer = tf.keras.layers.Conv1D( + filters=output_dim, + kernel_size=1, + strides=stride, + padding="same", + use_bias=False, + kernel_initializer=tf.keras.initializers.Zeros(), + name='locon_up' + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + lora_output = self.up_layer(self.down_layer(inputs)) * self.scale + return original_output + lora_output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "rank": self.rank, + "alpha": self.alpha + } + ) + return config + class AdapterHoulsby(tf.keras.layers.Layer): # https://arxiv.org/abs/1902.00751 # adapted from: https://github.com/jain-harshil/Adapter-BERT @@ -227,7 +304,6 @@ def get_config(self): # Basic 
############################################################ - class Scale(tf.keras.layers.Layer): """Scale the input by a learned value. @@ -678,7 +754,8 @@ def call(self, inputs, training=False): q *= self._key_size**-0.5 # [B, H, T', T] - content_logits = tf.matmul(q + self._r_w_bias, k, transpose_b=True) + #content_logits = tf.matmul(q + self._r_w_bias, k, transpose_b=True) + content_logits = tf.matmul(q + tf.cast(self._r_w_bias, dtype=inputs.dtype), k, transpose_b=True) if self._num_position_features == 0: logits = content_logits @@ -714,10 +791,12 @@ def call(self, inputs, training=False): # Add shifted relative logits to content logits. if self._content_position_bias: # [B, H, T', 2T-1] - relative_logits = tf.matmul(q + self._r_r_bias, r_k, transpose_b=True) + #relative_logits = tf.matmul(q + self._r_r_bias, r_k, transpose_b=True) + relative_logits = tf.matmul(q + tf.cast(self._r_r_bias, dtype=inputs.dtype), r_k, transpose_b=True) else: # [1, H, 1, 2T-1] - relative_logits = tf.matmul(self._r_r_bias, r_k, transpose_b=True) + #relative_logits = tf.matmul(self._r_r_bias, r_k, transpose_b=True) + relative_logits = tf.matmul(tf.cast(self._r_r_bias, dtype=inputs.dtype), r_k, transpose_b=True) # [1, H, T', 2T-1] relative_logits = tf.broadcast_to( relative_logits, @@ -804,7 +883,7 @@ def __init__( self, activation='relu', additive=False, - bottleneck_ratio=8, + rank=8, norm_type=None, bn_momentum=0.9, use_bias=True, @@ -817,7 +896,7 @@ def __init__( self.additive = additive self.norm_type = norm_type self.bn_momentum = bn_momentum - self.bottleneck_ratio = bottleneck_ratio + self.rank = rank self.kernel_initializer=kernel_initializer self.bias_initializer=bias_initializer self.use_bias=use_bias @@ -851,7 +930,7 @@ def build(self, input_shape): exit(1) self.dense1 = tf.keras.layers.Dense( - units=self.num_channels // self.bottleneck_ratio, + units=self.rank, activation="relu", use_bias=self.use_bias, kernel_initializer=self.kernel_initializer, @@ -900,8 +979,7 @@ def 
get_config(self): "use_bias":self.use_bias, "norm_type": self.norm_type, "bn_momentum": self.bn_momentum, - "bottleneck_ratio": self.bottleneck_ratio, - 'bottleneck_size': self.num_channels // self.bottleneck_ratio, + "rank": self.rank } ) return config diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 85864f3..06f184c 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -28,7 +28,7 @@ from baskerville import seqnn from baskerville import trainer from baskerville import layers -from baskerville import transfer_helper +from baskerville.helpers import transfer_helper """ hound_transfer.py @@ -79,27 +79,38 @@ def main(): "--att_adapter", default=None, type=str, - help="attention layer module [adapterHoulsby, lora, lora_full, ia3]", + help="attention layer module [adapterHoulsby, lora, lora_full, ia3, locon]", ) parser.add_argument( "--att_latent", type=int, - default=16, + default=8, help="attention adapter latent size.", - ) - parser.add_argument( - "--conv_adapter", - default=None, - type=str, - help="conv layer module [conv, bn, conv_bn, squez_excit]", ) - parser.add_argument( - "--se_ratio", + "--lora_alpha", type=int, default=16, - help="se bottleneck ratio.", + help="lora alpha.", ) + parser.add_argument( + "--conv_select", + default=None, + type=int, + help="# of conv layers to insert locon/se.", + ) + parser.add_argument( + "--conv_rank", + type=int, + default=4, + help="locon/se rank.", + ) + parser.add_argument( + "--locon_alpha", + type=int, + default=1, + help="locon_alpha.", + ) parser.add_argument( "--tfr_train", default=None, @@ -171,8 +182,7 @@ def main(): params_model["strand_pair"] = strand_pairs if args.mixed_precision: - policy = mixed_precision.Policy('mixed_float16') - mixed_precision.set_global_policy(policy) + mixed_precision.set_global_policy('mixed_float16') if params_train.get("num_gpu", 1) == 1: 
######################################## @@ -206,127 +216,58 @@ def main(): # attention adapter if args.att_adapter is not None: if args.att_adapter=='adapterHoulsby': - if args.conv_adapter not in ['se', 'se_bn', 'se_all','se_all_bn']: - # when att_adapter=='Houlsby' and conv_adapter=='se', do nothing. - # see conv_adapter section. - seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, - strand_pairs[0], - latent_size=args.att_latent) + seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, + strand_pairs[0], + latent_size=args.att_latent) elif args.att_adapter=='lora': transfer_helper.add_lora(seqnn_model.model, rank=args.att_latent, + alpha=args.lora_alpha, mode='default') elif args.att_adapter=='lora_full': transfer_helper.add_lora(seqnn_model.model, rank=args.att_latent, + alpha=args.lora_alpha, mode='full') elif args.att_adapter=='ia3': - seqnn_model.model = transfer_helper.add_ia3(seqnn_model.model, strand_pairs[0]) - - ''' - # conv adapter - # assume seqnn_model is appropriately frozen - if args.conv_adapter is not None: - if args.conv_adapter=='conv': - params_added = 0 - for l in seqnn_model.model.layers: - if l.name.startswith(("conv1d","separable_conv1d")): - l.trainable=True - params_added += transfer_helper.param_count(l, type='trainable') - print('params added/unfrozen by conv: %d'%params_added) - - elif args.conv_adapter=='conv_bn': - params_added = 0 - for l in seqnn_model.model.layers: - if l.name.startswith(("conv1d","separable_conv1d","batch_normalization")): - l.trainable=True - params_added += transfer_helper.param_count(l, type='trainable') - print('params added/unfrozen by conv_bn: %d'%params_added) - - elif args.conv_adapter=='bn': - params_added = 0 - for l in seqnn_model.model.layers: - if l.name.startswith("batch_normalization"): - l.trainable=True - params_added += transfer_helper.param_count(l, type='trainable') - print('params added/unfrozen by bn: %d'%params_added) - - ################## - # squeeze-excite # 
- ################## - elif args.conv_adapter in ['se','se_bn','se_all','se_all_bn']: - if args.att_adapter=='adapterHoulsby': - if args.conv_adapter=='se': - seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=False) - elif args.conv_adapter=='se_bn': - seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=True) - elif args.conv_adapter=='se_all': - seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='all', - unfreeze_bn=False) - elif args.conv_adapter=='se_all_bn': - seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='all', - unfreeze_bn=True) - else: - if args.conv_adapter=='se': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=False) - elif args.conv_adapter=='se_bn': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=True) - elif args.conv_adapter=='se_all': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, - strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='all', - unfreeze_bn=False) - elif args.conv_adapter=='se_all_bn': - seqnn_model.model = transfer_helper.add_se(seqnn_model.model, + seqnn_model.model = 
transfer_helper.add_ia3(seqnn_model.model, + strand_pairs[0]) + + elif args.att_adapter=='locon': # lora on conv+att + seqnn_model.model = transfer_helper.add_locon(seqnn_model.model, + strand_pairs[0], + conv_select=args.conv_select, + rank=args.conv_rank, + alpha=args.locon_alpha) + + elif args.att_adapter=='lora_conv': # lora on att, unfreeze_conv + transfer_helper.add_lora_conv(seqnn_model.model, conv_select=args.conv_select) + + elif args.att_adapter=='houlsby_se': # adapter on conv+att + seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, strand_pair=strand_pairs[0], - houlsby_latent=args.att_latent, - bottleneck_ratio=args.se_ratio, - insert_mode='pre_att', - unfreeze_bn=True) - ''' + conv_select=args.conv_select, + se_rank=args.conv_rank) ################# # final summary # ################# seqnn_model.model.summary() - - # initialize trainer - seqnn_trainer = trainer.Trainer( - params_train, train_data, eval_data, args.out_dir - ) - + + if args.mixed_precision: + # add additional activation to cast float16 output to float32 + seqnn_model.append_activation() + # run with loss scaling + seqnn_trainer = trainer.Trainer( + params_train, train_data, eval_data, args.out_dir, loss_scale=True + ) + else: + seqnn_trainer = trainer.Trainer( + params_train, train_data, eval_data, args.out_dir + ) + # compile model seqnn_trainer.compile(seqnn_model) @@ -344,31 +285,28 @@ def main(): ############################# if args.transfer_mode=='sparse': - # overwrite json file when needed - # for: adapterHoulsby and squeeze-excite - transfer_helper.modify_json(input_json=args.params_file, - output_json='%s/params.json'%args.out_dir, - adapter=args.att_adapter, - latent=args.att_latent, - conv=args.conv_adapter, - se_ratio=args.se_ratio) - - # merge weights when needed - # for: lora and ia3 - # save weight to: model_best.mergeW.h5 - if args.att_adapter=='lora': - seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) - 
transfer_helper.merge_lora(seqnn_model.model, mode='default') - seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) + # for: adapterHoulsby and houlsby_se, overwrite json file + if args.att_adapter=='adapterHoulsby': + transfer_helper.modify_json(input_json=args.params_file, + output_json='%s/params.json'%args.out_dir, + adapter=args.att_adapter, + latent=args.att_latent) + + if args.att_adapter=='houlsby_se': + transfer_helper.modify_json(input_json=args.params_file, + output_json='%s/params.json'%args.out_dir, + adapter=args.att_adapter, + conv_select=args.conv_select, + se_rank=args.conv_rank + ) - if args.att_adapter=='lora_full': + # for lora, ia3, locon, save weight to: model_best.mergeW.h5 + if args.att_adapter in ['lora', 'lora_full', 'lora_conv']: seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) - transfer_helper.merge_lora(seqnn_model.model, mode='full') + transfer_helper.merge_lora(seqnn_model.model) seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) - - # merge ia3 weights to original, save weight to: model_best_mergeweight.h5 + if args.att_adapter=='ia3': # ia3 model ia3_model = seqnn_model.model @@ -381,6 +319,18 @@ def main(): transfer_helper.merge_ia3(original_model, ia3_model) original_model.save('%s/model_best.mergeW.h5'%args.out_dir) + if args.att_adapter=='locon': + # locon model + locon_model = seqnn_model.model + locon_model.load_weights('%s/model_best.h5'%args.out_dir) + # original model + seqnn_model2 = seqnn.SeqNN(params_model) + seqnn_model2.restore(args.restore, trunk=args.trunk) + original_model = seqnn_model2.model + # merge weights into original model + transfer_helper.merge_locon(original_model, locon_model) + original_model.save('%s/model_best.mergeW.h5'%args.out_dir) + else: ######################################## # multi GPU diff --git a/src/baskerville/seqnn.py 
b/src/baskerville/seqnn.py index 1ffca86..82db788 100644 --- a/src/baskerville/seqnn.py +++ b/src/baskerville/seqnn.py @@ -25,7 +25,7 @@ from baskerville import dataset from baskerville import layers from baskerville import metrics - +from baskerville.helpers import transfer_helper class SeqNN: """Sequence neural network model. @@ -198,6 +198,13 @@ def build_model(self, save_reprs: bool = True): for ho in self.head_output: self.models.append(tf.keras.Model(inputs=sequence, outputs=ho)) self.model = self.models[0] + + # add adapter + if hasattr(self, 'adapter'): + for hi, head in enumerate(self.heads): + self.models[hi] = self.insert_adapter(self.models[hi]) + self.model = self.models[0] + if self.verbose: print(self.model.summary()) @@ -1093,3 +1100,18 @@ def track_sequence(self, sequence): print("model_strides", self.model_strides) print("target_lengths", self.target_lengths) print("target_crops", self.target_crops) + + # method for inserting adapter for transfer learning + def insert_adapter(self, model): + if self.adapter=='houlsby': + output_model = transfer_helper.add_houlsby(model, + self.strand_pair[0], + latent_size=self.adapter_latent) + elif self.adapter=='houlsby_se': + output_model = transfer_helper.add_houlsby_se(model, + self.strand_pair[0], + houlsby_latent=self.adapter_latent, + conv_select=self.conv_select, + se_rank=self.se_rank) + return output_model + diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index d7c048e..3136642 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -531,22 +531,40 @@ def fit_tape(self, seqnn_model): if self.strategy is None: - @tf.function - def train_step(x, y): - with tf.GradientTape() as tape: - pred = model(x, training=True) - loss = self.loss_fn(y, pred) + sum(model.losses) - train_loss(loss) - train_r(y, pred) - train_r2(y, pred) - gradients = tape.gradient(loss, model.trainable_variables) - if self.agc_clip is not None: - gradients = adaptive_clip_grad( - 
model.trainable_variables, gradients, self.agc_clip + if self.loss_scale: + + @tf.function + def train_step(x, y): + with tf.GradientTape() as tape: + pred = model(x, training=True) + loss = self.loss_fn(y, pred) + sum(model.losses) + scaled_loss = self.optimizer.get_scaled_loss(loss) + train_loss(loss) + train_r(y, pred) + train_r2(y, pred) + scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables) + gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) + self.optimizer.apply_gradients( + zip(gradients, model.trainable_variables) + ) + else: + + @tf.function + def train_step(x, y): + with tf.GradientTape() as tape: + pred = model(x, training=True) + loss = self.loss_fn(y, pred) + sum(model.losses) + train_loss(loss) + train_r(y, pred) + train_r2(y, pred) + gradients = tape.gradient(loss, model.trainable_variables) + if self.agc_clip is not None: + gradients = adaptive_clip_grad( + model.trainable_variables, gradients, self.agc_clip + ) + self.optimizer.apply_gradients( + zip(gradients, model.trainable_variables) ) - self.optimizer.apply_gradients( - zip(gradients, model.trainable_variables) - ) @tf.function def eval_step(x, y): diff --git a/tests/test_transfer/test_ia3.ipynb b/tests/test_transfer/test_ia3.ipynb index b51fdd0..7ac40ba 100644 --- a/tests/test_transfer/test_ia3.ipynb +++ b/tests/test_transfer/test_ia3.ipynb @@ -24,7 +24,7 @@ "import tensorflow as tf\n", "from baskerville import seqnn\n", "from baskerville import layers\n", - "from baskerville import transfer_helper" + "from baskerville.helpers import transfer_helper" ] }, { From cd38eee5600180e7a56ad20c3548f2db0f100f9f Mon Sep 17 00:00:00 2001 From: hy395 Date: Wed, 3 Jul 2024 01:30:32 -0700 Subject: [PATCH 12/26] dont specify tf version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 0f1e198..46ad3b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = scipy>=1.9.1 statsmodels>=0.13.5 
tabulate>=0.8.10 - tensorflow>=2.12.0 + tensorflow tqdm>=4.65.0 [options.extras_require] From e39e65f63d6820e4efd3c67f11ec2656077c2862 Mon Sep 17 00:00:00 2001 From: hy395 Date: Wed, 3 Jul 2024 01:36:35 -0700 Subject: [PATCH 13/26] change back tf version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 46ad3b4..0f1e198 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = scipy>=1.9.1 statsmodels>=0.13.5 tabulate>=0.8.10 - tensorflow + tensorflow>=2.12.0 tqdm>=4.65.0 [options.extras_require] From 061ad81c65a9b44e7f61dbfb939367bcabd958d5 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 3 Sep 2024 17:05:18 -0700 Subject: [PATCH 14/26] add log_dir argument --- src/baskerville/scripts/hound_transfer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 06f184c..59edca4 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -59,6 +59,12 @@ def main(): default="train_out", help="Output directory [Default: %(default)s]", ) + parser.add_argument( + "-l", + "--log_dir", + default="log_out", + help="Tensorboard log directory [Default: %(default)s]", + ) parser.add_argument( "--restore", default=None, @@ -261,11 +267,11 @@ def main(): seqnn_model.append_activation() # run with loss scaling seqnn_trainer = trainer.Trainer( - params_train, train_data, eval_data, args.out_dir, loss_scale=True + params_train, train_data, eval_data, args.out_dir, args.log_dir, loss_scale=True ) else: seqnn_trainer = trainer.Trainer( - params_train, train_data, eval_data, args.out_dir + params_train, train_data, eval_data, args.out_dir, args.log_dir ) # compile model From 09cad180eff68f81ef5f0239e32d7c3bc4cff695 Mon Sep 17 00:00:00 2001 From: hy395 Date: Thu, 3 Oct 2024 17:04:35 -0700 Subject: [PATCH 15/26] untrack borzoi_test_gene.py --- 
src/baskerville/adapters.py | 301 +++++++++ src/baskerville/helpers/transfer.py | 636 +++++++++++++++++++ src/baskerville/pygene.py | 324 ---------- src/baskerville/scripts/borzoi_test_genes.py | 569 ----------------- 4 files changed, 937 insertions(+), 893 deletions(-) create mode 100644 src/baskerville/adapters.py create mode 100644 src/baskerville/helpers/transfer.py delete mode 100755 src/baskerville/pygene.py delete mode 100755 src/baskerville/scripts/borzoi_test_genes.py diff --git a/src/baskerville/adapters.py b/src/baskerville/adapters.py new file mode 100644 index 0000000..f05a307 --- /dev/null +++ b/src/baskerville/adapters.py @@ -0,0 +1,301 @@ +# Copyright 2023 Calico LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +import pdb +import sys +from typing import Optional, List + +import numpy as np +import tensorflow as tf + +gpu_devices = tf.config.experimental.list_physical_devices("GPU") +for device in gpu_devices: + tf.config.experimental.set_memory_growth(device, True) + +##################### +# transfer learning # +##################### +class IA3(tf.keras.layers.Layer): + # https://arxiv.org/pdf/2205.05638.pdf + # ia3 module for attention layer, scale output. + + def __init__(self, + original_layer, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original dense layer. 
+ original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.output_dim = original_layer_config["units"] + + self.original_layer = original_layer + self.original_layer.trainable = False + + # IA3 weights. Make it a dense layer to control trainable + self._ia3_layer = tf.keras.layers.Dense( + units=self.output_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Ones(), + trainable=True, + name="ia3" + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + return original_output * scaler + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "size": self.output_dim, + } + ) + return config + +class IA3_ff(tf.keras.layers.Layer): + # https://arxiv.org/pdf/2205.05638.pdf + # ia3 module for down-projection ff layer, scale input. + + def __init__(self, + original_layer, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.input_dim = original_layer.input_shape[-1] + + self.original_layer = original_layer + self.original_layer.trainable = False + + # IA3 weights. 
Make it a dense layer to control trainable + self._ia3_layer = tf.keras.layers.Dense( + units=self.input_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Ones(), + trainable=True, + name="ia3_ff" + ) + + def call(self, inputs): + scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + return self.original_layer(inputs * scaler) + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "size": self.input_dim + } + ) + return config + +class Lora(tf.keras.layers.Layer): + # adapted from: + # https://arxiv.org/abs/2106.09685 + # https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/ + # https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py + + def __init__(self, + original_layer, + rank=8, + alpha=16, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.output_dim = original_layer_config["units"] + + if rank > self.output_dim: + raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + + self.rank = rank + self.alpha = alpha + self.scale = alpha / rank + self.original_layer = original_layer + self.original_layer.trainable = False + + # Note: the original paper mentions that normal distribution was + # used for initialization. However, the official LoRA implementation + # uses "Kaiming/He Initialization". 
+ self.down_layer = tf.keras.layers.Dense( + units=rank, + use_bias=False, + kernel_initializer=tf.keras.initializers.HeUniform(), + #kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), + trainable=True, + name="lora_a" + ) + + self.up_layer = tf.keras.layers.Dense( + units=self.output_dim, + use_bias=False, + kernel_initializer=tf.keras.initializers.Zeros(), + trainable=True, + name="lora_b" + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + lora_output = self.up_layer(self.down_layer(inputs)) * self.scale + return original_output + lora_output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "rank": self.rank, + "alpha": self.alpha + } + ) + return config + +class Locon(tf.keras.layers.Layer): + # LoRA for conv-layer, adapted from: + # https://arxiv.org/pdf/2309.14859#page=23.84 + # https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/modules/locon.py + # use default alpha and rank for locon + + def __init__(self, + original_layer, + rank=4, + alpha=1, + trainable=False, + **kwargs): + + # keep the name of this layer the same as the original conv layer. 
+ original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + kwargs.pop("name", None) + super().__init__(name=name, trainable=trainable, **kwargs) + + self.input_dim = original_layer.input_shape[-1] + self.output_dim = original_layer_config["filters"] + + if rank > self.output_dim: + raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + + self.rank = rank + self.alpha = alpha + self.scale = alpha / rank + self.original_layer = original_layer + self.original_layer.trainable = False + + input_dim = original_layer.input_shape[-1] + output_dim = original_layer_config["filters"] + kernel_size = original_layer_config['kernel_size'][0] + stride = original_layer_config['strides'][0] + dilation_rate = original_layer_config["dilation_rate"][0] + + # Note: the original paper mentions that normal distribution was + # used for initialization. However, the official LoRA implementation + # uses "Kaiming/He Initialization". + + self.down_layer = tf.keras.layers.Conv1D( + filters=rank, + kernel_size=kernel_size, + strides=stride, + padding="same", + use_bias=False, + dilation_rate=dilation_rate, + kernel_initializer=tf.keras.initializers.HeUniform(), + name='locon_down' + ) + + self.up_layer = tf.keras.layers.Conv1D( + filters=output_dim, + kernel_size=1, + strides=stride, + padding="same", + use_bias=False, + kernel_initializer=tf.keras.initializers.Zeros(), + name='locon_up' + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + lora_output = self.up_layer(self.down_layer(inputs)) * self.scale + return original_output + lora_output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "rank": self.rank, + "alpha": self.alpha + } + ) + return config + +class AdapterHoulsby(tf.keras.layers.Layer): + # https://arxiv.org/abs/1902.00751 + # adapted from: https://github.com/jain-harshil/Adapter-BERT + + def __init__( + self, + latent_size, + 
activation=tf.keras.layers.ReLU(), + **kwargs): + super(AdapterHoulsby, self).__init__(**kwargs) + self.latent_size = latent_size + self.activation = activation + + def build(self, input_shape): + self.down_project = tf.keras.layers.Dense( + units=self.latent_size, + activation="linear", + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + bias_initializer="zeros", + name='adapter_down' + ) + + self.up_project = tf.keras.layers.Dense( + units=input_shape[-1], + activation="linear", + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), + bias_initializer="zeros", + name='adapter_up' + ) + + def call(self, inputs): + projected_down = self.down_project(inputs) + activated = self.activation(projected_down) + projected_up = self.up_project(activated) + output = projected_up + inputs + return output + + def get_config(self): + config = super().get_config().copy() + config.update( + { + "latent_size": self.latent_size, + "activation": self.activation + } + ) + return config diff --git a/src/baskerville/helpers/transfer.py b/src/baskerville/helpers/transfer.py new file mode 100644 index 0000000..25b80f5 --- /dev/null +++ b/src/baskerville/helpers/transfer.py @@ -0,0 +1,636 @@ +import argparse +import json +import os +import shutil +import re +import h5py + +import numpy as np +import pandas as pd +import tensorflow as tf +from tensorflow.keras import mixed_precision + +from baskerville import dataset +from baskerville import seqnn +from baskerville import trainer +from baskerville import layers +from baskerville import adapters + +def param_count(layer, type='all'): + if type not in ['all','trainable','non_trainable']: + raise ValueError("TYPE must be one of all, trainable, non_trainable") + output = 0 + if type=='all': + output = int(sum(tf.keras.backend.count_params(w) for w in layer.weights)) + elif type=='trainable': + output = int(sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights)) + else: + output = 
int(sum(tf.keras.backend.count_params(w) for w in layer.non_trainable_weights)) + return output + +def param_summary(model): + trainable = param_count(model, type='trainable') + non_trainable = param_count(model, type='non_trainable') + print('total params:%d' %(trainable + non_trainable)) + print('trainable params:%d' %trainable) + print('non-trainable params:%d' %non_trainable) + +def keras2dict(model): + layer_parent_dict = {} # the parent layers of each layer in the old graph + for layer in model.layers: + for node in layer._outbound_nodes: + layer_name = node.outbound_layer.name + if layer_name not in layer_parent_dict: + layer_parent_dict.update({layer_name: [layer.name]}) + else: + if layer.name not in layer_parent_dict[layer_name]: + layer_parent_dict[layer_name].append(layer.name) + return layer_parent_dict + +# lora requires change model.h5 weight order. +# locon and ia3 don't modify model in place. +def var_reorder(weight_h5): + # assumes weight_h5 model saved with seqnn_model.save() + # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. + # model.load_weights() load weights sequencially, assuming h5 weights are in the right order. + # When inserting lora, multihead_attention layer weights order changed. + # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs + # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. 
+ var_init_order = ['r_w_bias:0:0', + 'r_r_bias:0:0', + 'q_layer/kernel:0', + 'k_layer/kernel:0', + 'v_layer/kernel:0', + 'embedding_layer/kernel:0', + 'embedding_layer/bias:0', + 'r_k_layer/kernel:0'] + + f = h5py.File(weight_h5, 'r+') + layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + for l_name in layers: + new_name_order = [l_name+'/'+i for i in var_init_order] + f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + f.close() + + +# houlsby requires architecture change. +# thus we need to modify json. +def modify_json(input_json, output_json, adapter, latent=8, se_rank=None, conv_select=None): + + with open(input_json) as params_open: + params = json.load(params_open) + + # houlsby + if adapter=='adapterHoulsby': + params["model"]['adapter']= 'houlsby' + params["model"]['adapter_latent']= latent + + # houlsby_se + elif adapter=='houlsby_se': + params["model"]['adapter']= 'houlsby_se' + params["model"]['adapter_latent']= latent + params["model"]['se_rank']= se_rank + params["model"]['conv_select']= conv_select + + else: + raise ValueError("adapter must be adapterHoulsby or houlsby_se") + + ### output + with open(output_json, 'w') as params_open: + json.dump(params, params_open, indent=4) + +###################### +# add houlsby layers # +###################### +def add_houlsby(input_model, strand_pair, latent_size=8): + # take seqnn_model as input + # output a new seqnn_model object + # only the adapter, and layer_norm are trainable + + ################## + # houlsby layers # + ################## + houlsby_layers = [] + for i in range(len(input_model.layers)-1): + layer = input_model.layers[i] + next_layer = input_model.layers[i+1] + if re.match('dropout', layer.name) and re.match('add', next_layer.name): + houlsby_layers += [next_layer.name] + + ################### + # construct model # + ################### + layer_parent_dict_old = keras2dict(input_model) + # remove switch_reverse_layer + 
to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + + for layer in input_model.layers[1:-1]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + + # insert houlsby layer: + elif layer.name in houlsby_layers: + print('adapter added before:%s'%layer.name) + x = adapters.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) + x = layer([layer_input[0], x]) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) + + # set trainable # + for l in model_adapter.layers[:-2]: # trunk + if re.match('layer_normalization|adapter_houlsby', l.name): + l.trainable = True + else: + l.trainable = False + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_adapter.layers: + if l.name.startswith("adapter_houlsby"): + params_added += param_count(l) + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type='trainable') + print('params added/unfrozen by adapter_houlsby: %d'%params_added) + + return model_adapter + +############### +# lora layers # +############### +def add_lora(input_model, rank=8, alpha=16, mode='default', report_param=True): + # take 
seqnn.model as input + # replace _q_layer, _v_layer in multihead_attention + # optionally replace _k_layer, _embedding_layer + if mode not in ['default','full']: + raise ValueError("mode must be default or full") + + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + # default loRA + layer._q_layer = adapters.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) + layer._v_layer = adapters.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) + # full loRA + if mode=='full': + layer._k_layer = adapters.Lora(layer._k_layer, rank=rank, alpha=alpha, trainable=True) + layer._embedding_layer = adapters.Lora(layer._embedding_layer, rank=rank, alpha=alpha, trainable=True) + + input_model(input_model.input) # initialize new variables + + # freeze all params but lora + for layer in input_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name in ["lora_a", "lora_b"]: + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + input_model.layers[-2].trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in input_model.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._q_layer.down_layer) + params_added += param_count(l._q_layer.up_layer) + params_added += param_count(l._v_layer.down_layer) + params_added += param_count(l._v_layer.up_layer) + if mode=='full': + params_added += param_count(l._k_layer.down_layer) + params_added += param_count(l._k_layer.up_layer) + params_added += param_count(l._embedding_layer.down_layer) + 
params_added += param_count(l._embedding_layer.up_layer) + + if report_param: + print('params added/unfrozen by lora: %d'%params_added) + +############### +# lora layers # +############### +def add_lora_conv(input_model, conv_select=None): + + # add lora layers + add_lora(input_model, rank=8, alpha=16, mode='default', report_param=False) + + # list all conv layers + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + if conv_select is None: + conv_select = len(conv_layers) + if conv_select > len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + + # set conv layers trainable + trainable_conv = conv_layers[-conv_select:] + for layer in input_model.layers: + if layer.name in trainable_conv: + layer.trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in input_model.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._q_layer.down_layer) + params_added += param_count(l._q_layer.up_layer) + params_added += param_count(l._v_layer.down_layer) + params_added += param_count(l._v_layer.up_layer) + elif l.name in trainable_conv: + params_added += param_count(l) + + print('params added/unfrozen by lora_conv: %d'%params_added) + +# merge lora weights # +def merge_lora_layer(lora_layer): + down_weights = lora_layer.down_layer.kernel + up_weights = lora_layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + lora_layer.original_layer.kernel.assign_add(increment_weights) + return lora_layer.original_layer + +def merge_lora(input_model): + for layer in input_model.layers: + if 'multihead_attention' in layer.name: + if isinstance(layer._q_layer, adapters.Lora): + layer._q_layer = merge_lora_layer(layer._q_layer) + if isinstance(layer._v_layer, adapters.Lora): + layer._v_layer = merge_lora_layer(layer._v_layer) + if 
isinstance(layer._k_layer, adapters.Lora): + layer._k_layer = merge_lora_layer(layer._k_layer) + if isinstance(layer._embedding_layer, adapters.Lora): + layer._embedding_layer = merge_lora_layer(layer._embedding_layer) + input_model(input_model.input) + + +############## +# IA3 layers # +############## +def add_ia3(input_model, strand_pair): + + # add to kv layers # + for layer in input_model.layers: + if re.match('multihead_attention', layer.name): + layer._k_layer = adapters.IA3(layer._k_layer, trainable=True) + layer._v_layer = adapters.IA3(layer._v_layer, trainable=True) + + # add to ff layer # + # save old graph to dictionary + layer_parent_dict_old = keras2dict(input_model) + + # remove switch_reverse_layer + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + for layer in input_model.layers[1:-1]: + + # get layer inputs + parent_layers = layer_parent_dict_old[layer.name] + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + # construct + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + # transformer ff down-project layer (1536 -> 768): + elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: + x = adapters.IA3_ff(layer, trainable=True)(layer_input) + else: + x = layer(layer_input) + + # save layers to dictionary + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) + + # set trainable # + for 
layer in model_adapter._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + if len(lst_of_sublayers) == 1: + if layer.name in ['ia3', 'ia3_ff']: + layer.trainable = True + else: + layer.trainable = False + + ### bias terms need to be frozen separately + for layer in model_adapter.layers: + if re.match('multihead_attention', layer.name): + layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) + layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + + # set final head to be trainable + model_adapter.layers[-2].trainable=True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_adapter.layers: + if re.match('multihead_attention', l.name): # kv layers + params_added += param_count(l._k_layer._ia3_layer) + params_added += param_count(l._v_layer._ia3_layer) + elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers + params_added += param_count(l._ia3_layer) + + print('params added/unfrozen by ia3: %d'%params_added) + + return model_adapter + +def merge_ia3(original_model, ia3_model): + # original model contains pre-trained weights + # ia3 model is the fine-tuned ia3 model + for i, layer in enumerate(original_model.layers): + # attention layers + if re.match('multihead_attention', layer.name): + # scale k + k_scaler = ia3_model.layers[i]._k_layer._ia3_layer.kernel[0] + layer._k_layer.kernel.assign(layer._k_layer.kernel * k_scaler) + # scale v + v_scaler = ia3_model.layers[i]._v_layer._ia3_layer.kernel[0] + layer._v_layer.kernel.assign(layer._v_layer.kernel * v_scaler) + # ff layers + elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: + ff_scaler = tf.expand_dims(ia3_model.layers[i]._ia3_layer.kernel[0], 1) + layer.kernel.assign(layer.kernel * ff_scaler) + # other layers + else: + layer.set_weights(ia3_model.layers[i].get_weights()) + +############# +# add locon # +############# +def add_locon(input_model, 
strand_pair, conv_select=None, rank=4, alpha=1): + + # first add lora to attention + add_lora(input_model, report_param=False) + + # decide: + # 1. whether conv1 is trainable + # 2. which conv layers to add loRA + + # all conv layers + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + + if conv_select is None: + conv_select = len(conv_layers) + + if conv_select > len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + + locon_layers = [] + conv1_tune = False + if conv_select == len(conv_layers): + locon_layers = conv_layers[1:] + conv1_tune = True + else: + locon_layers = conv_layers[-conv_select:] + + layer_parent_dict_old = keras2dict(input_model) + + # remove switch_reverse_layer + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + + # Iterate over all layers after the input + model_outputs = [] + reverse_bool = None + for layer in input_model.layers[1:-1]: + + # get layer inputs + parent_layers = layer_parent_dict_old[layer.name] + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + # construct + if re.match('stochastic_reverse_complement', layer.name): + x, reverse_bool = layer(layer_input) + elif layer.name in locon_layers: + x = adapters.Locon(layer, trainable=True, rank=rank, alpha=alpha)(layer_input) + else: + x = layer(layer_input) + + # save layers to dictionary + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_adapter = tf.keras.Model(inputs=input_model.inputs, 
outputs=final) + + if conv1_tune: + model_adapter.get_layer(name=conv_layers[0]).trainable = True + + # expected number of trainable params added/unfrozen: + params_added = 0 + if conv1_tune: + params_added += param_count(model_adapter.get_layer(name=conv_layers[0])) + for l in model_adapter.layers: + if re.match('multihead_attention', l.name): + params_added += param_count(l._q_layer.down_layer) + params_added += param_count(l._q_layer.up_layer) + params_added += param_count(l._v_layer.down_layer) + params_added += param_count(l._v_layer.up_layer) + if l.name in locon_layers: + params_added += param_count(l.down_layer) + params_added += param_count(l.up_layer) + + print('params added/unfrozen by lora: %d'%params_added) + + return model_adapter + +#### functions to merge locon +def lora_increment(layer): + down_weights = layer.down_layer.kernel + up_weights = layer.up_layer.kernel + increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * layer.scale + return increment_weights + +def locon_increment(layer): + down_weights = layer.down_layer.kernel + up_weights = layer.up_layer.kernel[0] + increment_weights = tf.einsum("abc,cd->abd", down_weights, up_weights) * layer.scale + return increment_weights + +def merge_locon(original_model, locon_model): + # original model contains pre-trained weights + for i, layer in enumerate(original_model.layers): + + # lora layers + if re.match('multihead_attention', layer.name): + q = locon_model.layers[i]._q_layer + k = locon_model.layers[i]._k_layer + v = locon_model.layers[i]._v_layer + e = locon_model.layers[i]._embedding_layer + if isinstance(q, adapters.Lora): + increment_weights = lora_increment(q) + layer._q_layer.kernel.assign_add(increment_weights) + if isinstance(v, adapters.Lora): + increment_weights = lora_increment(v) + layer._v_layer.kernel.assign_add(increment_weights) + if isinstance(k, adapters.Lora): + increment_weights = lora_increment(k) + layer._k_layer.kernel.assign_add(increment_weights) + if 
isinstance(e, adapters.Lora): + increment_weights = lora_increment(e) + layer._embedding_layer.kernel.assign_add(increment_weights) + + # locon layers + elif isinstance(locon_model.layers[i], adapters.Locon): + increment_weights = locon_increment(locon_model.layers[i]) + layer.kernel.assign_add(increment_weights) + + else: + layer.set_weights(locon_model.layers[i].get_weights()) + + +############## +# houlsby_se # +############## +def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, se_rank=16): + # add squeeze-excitation blocks after conv + # input_model should be properly frozen + # pre_att: add se_block to pre-attention conv1d + # all: add se_block to pre-attention conv1d and post-attention separable_conv1d + + ################## + # houlsby layers # + ################## + houlsby_layers = [] + for i in range(len(input_model.layers)-1): + layer = input_model.layers[i] + next_layer = input_model.layers[i+1] + if re.match('dropout', layer.name) and re.match('add', next_layer.name): + houlsby_layers += [next_layer.name] + + ############# + # SE layers # + ############# + conv_layers = [] + for layer in input_model.layers: + if re.match('conv1d', layer.name): + conv_layers += [layer.name] + if conv_select is None: + se_layers = conv_layers[1:] + if conv_select >= len(conv_layers): + raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + se_layers = conv_layers[-conv_select:] + + ################### + # construct model # + ################### + layer_parent_dict_old = keras2dict(input_model) + # remove switch_reverse_layer + to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + for i in to_fix: + del layer_parent_dict_old[i] + # create new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + # Iterate over all layers after the input + model_outputs = [] + 
reverse_bool = None + + for layer in input_model.layers[1:-1]: + + # parent layers + parent_layers = layer_parent_dict_old[layer.name] + + # layer inputs + layer_input = [layer_output_dict_new[parent] for parent in parent_layers] + if len(layer_input) == 1: layer_input = layer_input[0] + + if layer.name.startswith("stochastic_reverse_complement"): + x, reverse_bool = layer(layer_input) + + # insert houlsby layer: + elif layer.name in houlsby_layers: + print('adapter added before:%s'%layer.name) + x = adapters.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) + x = layer([layer_input[0], x]) + + # insert squeeze-excite layer: + elif layer.name in se_layers: + se_layer = layers.SqueezeExcite( + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + rank=se_rank, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization + scale_fun='tanh' + ) + x = layer(layer_input) + x = x + se_layer(x) + + else: + x = layer(layer_input) + + # save the output tensor of every layer + layer_output_dict_new.update({layer.name: x}) + + final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + model_final = tf.keras.Model(inputs=input_model.inputs, outputs=final) + + # set trainable + for l in model_final.layers[:-2]: # trunk + if re.match('layer_normalization|adapter_houlsby', l.name): + l.trainable = True + else: + l.trainable = False + + for l in model_final.layers: # set trunk + if l.name.startswith("squeeze_excite"): l.trainable = True + + # expected number of trainable params added/unfrozen: + params_added = 0 + for l in model_final.layers: + if re.match('squeeze_excite|adapter_houlsby', l.name): + params_added += param_count(l) + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type='trainable') + print('params added/unfrozen by 
houlsby_se: %d'%params_added) + + return model_final + diff --git a/src/baskerville/pygene.py b/src/baskerville/pygene.py deleted file mode 100755 index 86cae4f..0000000 --- a/src/baskerville/pygene.py +++ /dev/null @@ -1,324 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser - -import gzip -import pdb - -''' -pygene - -Classes and methods to manage genes in GTF format. -''' - -################################################################################ -# Classes -################################################################################ -class GenomicInterval: - def __init__(self, start, end, chrom=None, strand=None): - self.start = start - self.end = end - self.chrom = chrom - self.strand = strand - - def __eq__(self, other): - return self.start == other.start - - def __lt__(self, other): - return self.start < other.start - - def __cmp__(self, x): - if self.start < x.start: - return -1 - elif self.start > x.start: - return 1 - else: - return 0 - - def __str__(self): - if self.chrom is None: - label = '[%d-%d]' % (self.start, self.end) - else: - label = '%s:%d-%d' % (self.chrom, self.start, self.end) - return label - - -class Transcript: - def __init__(self, chrom, strand, kv): - self.chrom = chrom - self.strand = strand - self.kv = kv - self.exons = [] - self.cds = [] - self.utrs3 = [] - self.utrs5 = [] - self.sorted = False - self.utrs_defined = False - - def add_cds(self, start, end): - self.cds.append(GenomicInterval(start,end)) - - def add_exon(self, start, end): - self.exons.append(GenomicInterval(start,end)) - - def define_utrs(self): - self.utrs_defined = True - - if len(self.cds) == 0: - self.utrs3 = self.exons - - else: - assert(self.sorted) - - # reset UTR lists - self.utrs5 = [] - self.utrs3 = [] - - # match up exons and CDS - ci = 0 - for ei in range(len(self.exons)): - # left initial - if self.exons[ei].end < self.cds[ci].start: - utr = GenomicInterval(self.exons[ei].start, self.exons[ei].end) - if self.strand == '+': - 
self.utrs5.append(utr) - else: - self.utrs3.append(utr) - - # right initial - elif self.cds[ci].end < self.exons[ei].start: - utr = GenomicInterval(self.exons[ei].start, self.exons[ei].end) - if self.strand == '+': - self.utrs3.append(utr) - else: - self.utrs5.append(utr) - - # overlap - else: - # left overlap - if self.exons[ei].start < self.cds[ci].start: - utr = GenomicInterval(self.exons[ei].start, self.cds[ci].start-1) - if self.strand == '+': - self.utrs5.append(utr) - else: - self.utrs3.append(utr) - - # right overlap - if self.cds[ci].end < self.exons[ei].end: - utr = GenomicInterval(self.cds[ci].end+1, self.exons[ei].end) - if self.strand == '+': - self.utrs3.append(utr) - else: - self.utrs5.append(utr) - - # increment up to last - ci = min(ci+1, len(self.cds)-1) - - def fasta_cds(self, fasta_open, stranded=False): - assert(self.sorted) - gene_seq = '' - for exon in self.cds: - exon_seq = fasta_open.fetch(self.chrom, exon.start-1, exon.end) - gene_seq += exon_seq - if stranded and self.strand == '-': - gene_seq = rc(gene_seq) - return gene_seq - - def fasta_exons(self, fasta_open, stranded=False): - assert(self.sorted) - gene_seq = '' - for exon in self.exons: - exon_seq = fasta_open.fetch(self.chrom, exon.start-1, exon.end) - gene_seq += exon_seq - if stranded and self.strand == '-': - gene_seq = rc(gene_seq) - return gene_seq - - def sort_exons(self): - self.sorted = True - if len(self.exons) > 1: - self.exons.sort() - if len(self.cds) > 1: - self.cds.sort() - - def span(self): - exon_starts = [exon.start for exon in self.exons] - exon_ends = [exon.end for exon in self.exons] - return min(exon_starts), max(exon_ends) - - def tss(self): - if self.strand == '-': - return self.exons[-1].end - else: - return self.exons[0].start - - def write_gtf(self, gtf_out, write_cds=False, write_utrs=False): - for ex in self.exons: - cols = [self.chrom, 'pygene', 'exon', str(ex.start), str(ex.end)] - cols += ['.', self.strand, '.', kv_gtf(self.kv)] - 
print('\t'.join(cols), file=gtf_out) - if write_cds: - for cds in self.cds: - cols = [self.chrom, 'pygene', 'CDS', str(cds.start), str(cds.end)] - cols += ['.', self.strand, '.', kv_gtf(self.kv)] - print('\t'.join(cols), file=gtf_out) - if write_utrs: - assert(self.utrs_defined) - for utr in self.utrs5: - cols = [self.chrom, 'pygene', '5\'UTR', str(utr.start), str(utr.end)] - cols += ['.', self.strand, '.', kv_gtf(self.kv)] - print('\t'.join(cols), file=gtf_out) - for utr in self.utrs3: - cols = [self.chrom, 'pygene', '3\'UTR', str(utr.start), str(utr.end)] - cols += ['.', self.strand, '.', kv_gtf(self.kv)] - print('\t'.join(cols), file=gtf_out) - - def __str__(self): - return '%s %s %s %s' % (self.chrom, self.strand, kv_gtf(self.kv), ','.join([ex.__str__() for ex in self.exons])) - - -class Gene: - def __init__(self): - self.transcripts = {} - self.chrom = None - self.strand = None - self.start = None - self.end = None - - def add_transcript(self, tx_id, tx): - self.transcripts[tx_id] = tx - self.chrom = tx.chrom - self.strand = tx.strand - self.kv = tx.kv - - def span(self): - tx_spans = [tx.span() for tx in self.transcripts.values()] - tx_starts, tx_ends = zip(*tx_spans) - self.start = min(tx_starts) - self.end = max(tx_ends) - return self.start, self.end - - -class GTF: - def __init__(self, gtf_file, trim_dot=False): - self.gtf_file = gtf_file - self.genes = {} - self.transcripts = {} - self.utrs_defined = False - self.trim_dot = trim_dot - - self.read_gtf() - - def define_utrs(self): - self.utrs_defined = True - for tx in self.transcripts.values(): - tx.define_utrs() - - def read_gtf(self): - if self.gtf_file[-3:] == '.gz': - gtf_in = gzip.open(self.gtf_file, 'rt') - else: - gtf_in = open(self.gtf_file) - - # ignore header - line = gtf_in.readline() - while line[0] == '#': - line = gtf_in.readline() - - while line: - a = line.split('\t') - if a[2] in ['exon','CDS']: - chrom = a[0] - interval_type = a[2] - start = int(a[3]) - end = int(a[4]) - strand = a[6] - 
kv = gtf_kv(a[8]) - - # add/get transcript - tx_id = kv['transcript_id'] - if self.trim_dot: - tx_id = trim_dot(tx_id) - if not tx_id in self.transcripts: - self.transcripts[tx_id] = Transcript(chrom, strand, kv) - tx = self.transcripts[tx_id] - - # add/get gene - gene_id = kv['gene_id'] - if self.trim_dot: - gene_id = trim_dot(gene_id) - if not gene_id in self.genes: - self.genes[gene_id] = Gene() - self.genes[gene_id].add_transcript(tx_id, tx) - - # add exons - if interval_type == 'exon': - tx.add_exon(start, end) - elif interval_type == 'CDS': - tx.add_cds(start, end) - - line = gtf_in.readline() - - gtf_in.close() - - # sort transcript exons - for tx in self.transcripts.values(): - tx.sort_exons() - - def write_gtf(self, out_gtf_file, write_cds=False, write_utrs=False): - if write_utrs and not self.utrs_defined: - self.define_utrs() - - gtf_out = open(out_gtf_file, 'w') - for tx in self.transcripts.values(): - tx.write_gtf(gtf_out, write_cds, write_utrs) - gtf_out.close() - - -################################################################################ -# Methods -################################################################################ -def gtf_kv(s): - """Convert the last gtf section of key/value pairs into a dict.""" - d = {} - - a = s.split(';') - for key_val in a: - if key_val.strip(): - eq_i = key_val.find('=') - if eq_i != -1 and key_val[eq_i-1] != '"': - kvs = key_val.split('=') - else: - kvs = key_val.split() - - key = kvs[0] - if kvs[1][0] == '"' and kvs[-1][-1] == '"': - val = (' '.join(kvs[1:]))[1:-1].strip() - else: - val = (' '.join(kvs[1:])).strip() - - d[key] = val - - return d - -def kv_gtf(d): - """Convert a kv hash to str gtf representation.""" - s = '' - - if 'gene_id' in d.keys(): - s += '%s "%s"; ' % ('gene_id',d['gene_id']) - - if 'transcript_id' in d.keys(): - s += '%s "%s"; ' % ('transcript_id',d['transcript_id']) - - for key in sorted(d.keys()): - if key not in ['gene_id','transcript_id']: - s += '%s "%s"; ' % (key,d[key]) - 
- return s - -def trim_dot(gene_id): - """Trim the final dot suffix off a gene_id.""" - dot_i = gene_id.rfind('.') - if dot_i != -1: - gene_id = gene_id[:dot_i] - return gene_id \ No newline at end of file diff --git a/src/baskerville/scripts/borzoi_test_genes.py b/src/baskerville/scripts/borzoi_test_genes.py deleted file mode 100755 index 83f1dec..0000000 --- a/src/baskerville/scripts/borzoi_test_genes.py +++ /dev/null @@ -1,569 +0,0 @@ -#!/usr/bin/env python -# Copyright 2021 Calico LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ========================================================================= -from optparse import OptionParser -import gc -import json -import os -import time - -from intervaltree import IntervalTree -import numpy as np -import pandas as pd -import pybedtools -import pyranges as pr -from qnorm import quantile_normalize -from scipy.stats import pearsonr -from sklearn.metrics import explained_variance_score -from tensorflow.keras import mixed_precision - -from baskerville import pygene -from baskerville import dataset -from baskerville import seqnn - -""" -borzoi_test_genes.py - -Measure accuracy at gene-level. 
-""" - -################################################################################ -# main -################################################################################ -def main(): - usage = "usage: %prog [options] " - parser = OptionParser(usage) - parser.add_option( - "--head", - dest="head_i", - default=0, - type="int", - help="Parameters head [Default: %default]", - ) - parser.add_option( - "-o", - dest="out_dir", - default="testg_out", - help="Output directory for predictions [Default: %default]", - ) - parser.add_option( - "--rc", - dest="rc", - default=False, - action="store_true", - help="Average the fwd and rc predictions [Default: %default]", - ) - parser.add_option( - "--shifts", - dest="shifts", - default="0", - help="Ensemble prediction shifts [Default: %default]", - ) - parser.add_option( - "--span", - dest="span", - default=False, - action="store_true", - help="Aggregate entire gene span [Default: %default]", - ) - parser.add_option( - "--f16", - dest="f16", - default=False, - action="store_true", - help="use mixed precision for inference", - ) - parser.add_option( - "-t", - dest="targets_file", - default=None, - type="str", - help="File specifying target indexes and labels in table format", - ) - parser.add_option( - "--split", - dest="split_label", - default="test", - help="Dataset split label for eg TFR pattern [Default: %default]", - ) - parser.add_option( - "--tfr", - dest="tfr_pattern", - default=None, - help="TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]", - ) - parser.add_option( - "-u", - dest="untransform_old", - default=False, - action="store_true", - help="Untransform old models [Default: %default]", - ) - (options, args) = parser.parse_args() - - if len(args) != 4: - parser.error("Must provide parameters, model, data directory, and genes GTF") - else: - params_file = args[0] - model_file = args[1] - data_dir = args[2] - genes_gtf_file = args[3] - - if not os.path.isdir(options.out_dir): - 
os.mkdir(options.out_dir) - - # parse shifts to integers - options.shifts = [int(shift) for shift in options.shifts.split(",")] - - ####################################################### - # inputs - - # read targets - if options.targets_file is None: - options.targets_file = "%s/targets.txt" % data_dir - targets_df = pd.read_csv(options.targets_file, index_col=0, sep="\t") - - # prep strand - targets_strand_df = dataset.targets_prep_strand(targets_df) - num_targets = targets_df.shape[0] - num_targets_strand = targets_strand_df.shape[0] - - # read model parameters - with open(params_file) as params_open: - params = json.load(params_open) - params_model = params["model"] - params_train = params["train"] - - # set strand pairs (using new indexing) - orig_new_index = dict(zip(targets_df.index, np.arange(targets_df.shape[0]))) - targets_strand_pair = np.array( - [orig_new_index[ti] for ti in targets_df.strand_pair] - ) - params_model["strand_pair"] = [targets_strand_pair] - - # construct eval data - eval_data = dataset.SeqDataset( - data_dir, - split_label=options.split_label, - batch_size=params_train["batch_size"], - mode="eval", - tfr_pattern=options.tfr_pattern, - ) - - # initialize model - ################### - # mixed precision # - ################### - if options.f16: - mixed_precision.set_global_policy('mixed_float16') # first set global policy - seqnn_model = seqnn.SeqNN(params_model) # then create model - seqnn_model.restore(model_file, options.head_i) - seqnn_model.append_activation() # add additional activation to cast float16 output to float32 - else: - # initialize model - seqnn_model = seqnn.SeqNN(params_model) - seqnn_model.restore(model_file, options.head_i) - - seqnn_model.build_slice(targets_df.index) - seqnn_model.build_ensemble(options.rc, options.shifts) - - ####################################################### - # sequence intervals - - # read data parameters - with open("%s/statistics.json" % data_dir) as data_open: - data_stats = 
json.load(data_open) - crop_bp = data_stats["crop_bp"] - pool_width = data_stats["pool_width"] - - # read sequence positions - seqs_df = pd.read_csv( - "%s/sequences.bed" % data_dir, - sep="\t", - names=["Chromosome", "Start", "End", "Name"], - ) - seqs_df = seqs_df[seqs_df.Name == options.split_label] - seqs_pr = pr.PyRanges(seqs_df) - - ####################################################### - # make gene BED - - t0 = time.time() - print("Making gene BED...", end="") - genes_bed_file = "%s/genes.bed" % options.out_dir - if options.span: - make_genes_span(genes_bed_file, genes_gtf_file, options.out_dir) - else: - make_genes_exon(genes_bed_file, genes_gtf_file, options.out_dir) - - genes_pr = pr.read_bed(genes_bed_file) - print("DONE in %ds" % (time.time() - t0)) - - # count gene normalization lengths - gene_lengths = {} - gene_strand = {} - for line in open(genes_bed_file): - a = line.rstrip().split("\t") - gene_id = a[3] - gene_seg_len = int(a[2]) - int(a[1]) - gene_lengths[gene_id] = gene_lengths.get(gene_id, 0) + gene_seg_len - gene_strand[gene_id] = a[5] - - ####################################################### - # intersect genes w/ preds, targets - - # intersect seqs, genes - t0 = time.time() - print("Intersecting sequences w/ genes...", end="") - seqs_genes_pr = seqs_pr.join(genes_pr) - print("DONE in %ds" % (time.time() - t0), flush=True) - - # hash preds/targets by gene_id - gene_preds_dict = {} - gene_targets_dict = {} - - si = 0 - for x, y in eval_data.dataset: - # predict only if gene overlaps - yh = None - y = y.numpy()[..., targets_df.index] - - t0 = time.time() - print("Sequence %d..." % si, end="") - for bsi in range(x.shape[0]): - seq = seqs_df.iloc[si + bsi] - - cseqs_genes_df = seqs_genes_pr[seq.Chromosome].df - if cseqs_genes_df.shape[0] == 0: - # empty. 
no genes on this chromosome - seq_genes_df = cseqs_genes_df - else: - seq_genes_df = cseqs_genes_df[cseqs_genes_df.Start == seq.Start] - - for _, seq_gene in seq_genes_df.iterrows(): - gene_id = seq_gene.Name_b - gene_start = seq_gene.Start_b - gene_end = seq_gene.End_b - seq_start = seq_gene.Start - - # clip boundaries - gene_seq_start = max(0, gene_start - seq_start) - gene_seq_end = max(0, gene_end - seq_start) - - # requires >50% overlap - bin_start = int(np.round(gene_seq_start / pool_width)) - bin_end = int(np.round(gene_seq_end / pool_width)) - - # predict - if yh is None: - yh = seqnn_model(x) - - # slice gene region - yhb = yh[bsi, bin_start:bin_end].astype("float16") - yb = y[bsi, bin_start:bin_end].astype("float16") - - if len(yb) > 0: - gene_preds_dict.setdefault(gene_id, []).append(yhb) - gene_targets_dict.setdefault(gene_id, []).append(yb) - - # advance sequence table index - si += x.shape[0] - print("DONE in %ds" % (time.time() - t0), flush=True) - if si % 128 == 0: - gc.collect() - - # aggregate gene bin values into arrays - gene_targets = [] - gene_preds = [] - gene_ids = sorted(gene_targets_dict.keys()) - gene_within = [] - gene_wvar = [] - - for gene_id in gene_ids: - gene_preds_gi = np.concatenate(gene_preds_dict[gene_id], axis=0).astype( - "float32" - ) - gene_targets_gi = np.concatenate(gene_targets_dict[gene_id], axis=0).astype( - "float32" - ) - - # slice strand - if gene_strand[gene_id] == "+": - gene_strand_mask = (targets_df.strand != "-").to_numpy() - else: - gene_strand_mask = (targets_df.strand != "+").to_numpy() - gene_preds_gi = gene_preds_gi[:, gene_strand_mask] - gene_targets_gi = gene_targets_gi[:, gene_strand_mask] - - if gene_targets_gi.shape[0] == 0: - print(gene_id, gene_targets_gi.shape, gene_preds_gi.shape) - - # untransform - if options.untransform_old: - gene_preds_gi = dataset.untransform_preds1(gene_preds_gi, targets_strand_df) - gene_targets_gi = dataset.untransform_preds1(gene_targets_gi, targets_strand_df) - else: - 
gene_preds_gi = dataset.untransform_preds(gene_preds_gi, targets_strand_df) - gene_targets_gi = dataset.untransform_preds(gene_targets_gi, targets_strand_df) - - # compute within gene correlation before dropping length axis - gene_corr_gi = np.zeros(num_targets_strand) - for ti in range(num_targets_strand): - if ( - gene_preds_gi[:, ti].var() > 1e-6 - and gene_targets_gi[:, ti].var() > 1e-6 - ): - preds_log = np.log2(gene_preds_gi[:, ti] + 1) - targets_log = np.log2(gene_targets_gi[:, ti] + 1) - gene_corr_gi[ti] = pearsonr(preds_log, targets_log)[0] - # gene_corr_gi[ti] = pearsonr(gene_preds_gi[:,ti], gene_targets_gi[:,ti])[0] - else: - gene_corr_gi[ti] = np.nan - gene_within.append(gene_corr_gi) - gene_wvar.append(gene_targets_gi.var(axis=0)) - - # TEMP: save gene preds/targets - # os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True) - # np.save('%s/gene_within/%s_preds.npy' % (options.out_dir, gene_id), gene_preds_gi.astype('float16')) - # np.save('%s/gene_within/%s_targets.npy' % (options.out_dir, gene_id), gene_targets_gi.astype('float16')) - - # mean coverage - gene_preds_gi = gene_preds_gi.mean(axis=0) - gene_targets_gi = gene_targets_gi.mean(axis=0) - - # scale by gene length - gene_preds_gi *= gene_lengths[gene_id] - gene_targets_gi *= gene_lengths[gene_id] - - gene_preds.append(gene_preds_gi) - gene_targets.append(gene_targets_gi) - - gene_targets = np.array(gene_targets) - gene_preds = np.array(gene_preds) - gene_within = np.array(gene_within) - gene_wvar = np.array(gene_wvar) - - # log2 transform - gene_targets = np.log2(gene_targets + 1) - gene_preds = np.log2(gene_preds + 1) - - # save values - genes_targets_df = pd.DataFrame( - gene_targets, index=gene_ids, columns=targets_strand_df.identifier - ) - genes_targets_df.to_csv("%s/gene_targets.tsv" % options.out_dir, sep="\t") - genes_preds_df = pd.DataFrame( - gene_preds, index=gene_ids, columns=targets_strand_df.identifier - ) - genes_preds_df.to_csv("%s/gene_preds.tsv" % options.out_dir, 
sep="\t") - genes_within_df = pd.DataFrame( - gene_within, index=gene_ids, columns=targets_strand_df.identifier - ) - genes_within_df.to_csv("%s/gene_within.tsv" % options.out_dir, sep="\t") - genes_var_df = pd.DataFrame( - gene_wvar, index=gene_ids, columns=targets_strand_df.identifier - ) - genes_var_df.to_csv("%s/gene_var.tsv" % options.out_dir, sep="\t") - - # quantile and mean normalize - gene_targets_norm = quantile_normalize(gene_targets, ncpus=2) - gene_targets_norm = gene_targets_norm - gene_targets_norm.mean( - axis=-1, keepdims=True - ) - gene_preds_norm = quantile_normalize(gene_preds, ncpus=2) - gene_preds_norm = gene_preds_norm - gene_preds_norm.mean(axis=-1, keepdims=True) - - ####################################################### - # accuracy stats - - wvar_t = np.percentile(gene_wvar, 80, axis=0) - - acc_pearsonr = [] - acc_r2 = [] - acc_npearsonr = [] - acc_nr2 = [] - acc_wpearsonr = [] - for ti in range(num_targets_strand): - r_ti = pearsonr(gene_targets[:, ti], gene_preds[:, ti])[0] - acc_pearsonr.append(r_ti) - r2_ti = explained_variance_score(gene_targets[:, ti], gene_preds[:, ti]) - acc_r2.append(r2_ti) - nr_ti = pearsonr(gene_targets_norm[:, ti], gene_preds_norm[:, ti])[0] - acc_npearsonr.append(nr_ti) - nr2_ti = explained_variance_score( - gene_targets_norm[:, ti], gene_preds_norm[:, ti] - ) - acc_nr2.append(nr2_ti) - var_mask = gene_wvar[:, ti] > wvar_t[ti] - wr_ti = gene_within[var_mask].mean() - acc_wpearsonr.append(wr_ti) - - acc_df = pd.DataFrame( - { - "identifier": targets_strand_df.identifier, - "pearsonr": acc_pearsonr, - "r2": acc_r2, - "pearsonr_norm": acc_npearsonr, - "r2_norm": acc_nr2, - "pearsonr_gene": acc_wpearsonr, - "description": targets_strand_df.description, - } - ) - acc_df.to_csv("%s/acc.txt" % options.out_dir, sep="\t") - - print("%d genes" % gene_targets.shape[0]) - print("Overall PearsonR: %.4f" % np.mean(acc_df.pearsonr)) - print("Overall R2: %.4f" % np.mean(acc_df.r2)) - print("Normalized PearsonR: %.4f" % 
np.mean(acc_df.pearsonr_norm)) - print("Normalized R2: %.4f" % np.mean(acc_df.r2_norm)) - print("Within-gene PearsonR: %.4f" % np.mean(acc_df.pearsonr_gene)) - - -def genes_aggregate(genes_bed_file, values_bedgraph): - """Aggregate values across genes. - - Args: - genes_bed_file (str): BED file of genes. - values_bedgraph (str): BedGraph file of values. - - Returns: - gene_values (dict): Dictionary of gene values. - """ - values_bt = pybedtools.BedTool(values_bedgraph) - genes_bt = pybedtools.BedTool(genes_bed_file) - - gene_values = {} - - for overlap in genes_bt.intersect(values_bt, wo=True): - gene_id = overlap[3] - value = overlap[7] - gene_values[gene_id] = gene_values.get(gene_id, 0) + value - - return gene_values - - -def make_genes_exon(genes_bed_file: str, genes_gtf_file: str, out_dir: str): - """Make a BED file with each genes' exons, excluding exons overlapping - across genes. - - Args: - genes_bed_file (str): Output BED file of genes. - genes_gtf_file (str): Input GTF file of genes. - out_dir (str): Output directory for temporary files. 
- """ - # read genes - genes_gtf = pygene.GTF(genes_gtf_file) - - # write gene exons - agenes_bed_file = "%s/genes_all.bed" % out_dir - agenes_bed_out = open(agenes_bed_file, "w") - for gene_id, gene in genes_gtf.genes.items(): - # collect exons - gene_intervals = IntervalTree() - for tx_id, tx in gene.transcripts.items(): - for exon in tx.exons: - gene_intervals[exon.start - 1 : exon.end] = True - - # union - gene_intervals.merge_overlaps() - - # write - for interval in sorted(gene_intervals): - cols = [ - gene.chrom, - str(interval.begin), - str(interval.end), - gene_id, - ".", - gene.strand, - ] - print("\t".join(cols), file=agenes_bed_out) - agenes_bed_out.close() - - # find overlapping exons - genes1_bt = pybedtools.BedTool(agenes_bed_file) - genes2_bt = pybedtools.BedTool(agenes_bed_file) - overlapping_exons = set() - for overlap in genes1_bt.intersect(genes2_bt, s=True, wo=True): - gene1_id = overlap[3] - gene1_start = int(overlap[1]) - gene1_end = int(overlap[2]) - overlapping_exons.add((gene1_id, gene1_start, gene1_end)) - - gene2_id = overlap[9] - gene2_start = int(overlap[7]) - gene2_end = int(overlap[8]) - overlapping_exons.add((gene2_id, gene2_start, gene2_end)) - - # filter for nonoverlapping exons - genes_bed_out = open(genes_bed_file, "w") - for line in open(agenes_bed_file): - a = line.split() - start = int(a[1]) - end = int(a[2]) - gene_id = a[-1] - if (gene_id, start, end) not in overlapping_exons: - print(line, end="", file=genes_bed_out) - genes_bed_out.close() - - -def make_genes_span( - genes_bed_file: str, genes_gtf_file: str, out_dir: str, stranded: bool = True -): - """Make a BED file with the span of each gene. - - Args: - genes_bed_file (str): Output BED file of genes. - genes_gtf_file (str): Input GTF file of genes. - out_dir (str): Output directory for temporary files. - stranded (bool): Perform stranded intersection. 
- """ - # read genes - genes_gtf = pygene.GTF(genes_gtf_file) - - # write all gene spans - agenes_bed_file = "%s/genes_all.bed" % out_dir - agenes_bed_out = open(agenes_bed_file, "w") - for gene_id, gene in genes_gtf.genes.items(): - start, end = gene.span() - cols = [gene.chrom, str(start - 1), str(end), gene_id, ".", gene.strand] - print("\t".join(cols), file=agenes_bed_out) - agenes_bed_out.close() - - # find overlapping genes - genes1_bt = pybedtools.BedTool(agenes_bed_file) - genes2_bt = pybedtools.BedTool(agenes_bed_file) - overlapping_genes = set() - for overlap in genes1_bt.intersect(genes2_bt, s=stranded, wo=True): - gene1_id = overlap[3] - gene2_id = overlap[7] - if gene1_id != gene2_id: - overlapping_genes.add(gene1_id) - overlapping_genes.add(gene2_id) - - # filter for nonoverlapping genes - genes_bed_out = open(genes_bed_file, "w") - for line in open(agenes_bed_file): - gene_id = line.split()[-1] - if gene_id not in overlapping_genes: - print(line, end="", file=genes_bed_out) - genes_bed_out.close() - - -################################################################################ -# __main__ -################################################################################ -if __name__ == "__main__": - main() From d500a2669a0eb6b547b895126cfa613917e66d79 Mon Sep 17 00:00:00 2001 From: hy395 Date: Fri, 4 Oct 2024 09:09:44 -0700 Subject: [PATCH 16/26] move transfer param to json --- .gitignore | 5 + src/baskerville/adapters.py | 140 ++-- src/baskerville/blocks.py | 37 +- src/baskerville/helpers/transfer.py | 450 +++++++------ src/baskerville/helpers/transfer_helper.py | 635 ------------------ src/baskerville/layers.py | 334 +-------- src/baskerville/metrics.py | 8 +- src/baskerville/scripts/hound_eval.py | 4 +- src/baskerville/scripts/hound_eval_spec.py | 18 +- src/baskerville/scripts/hound_train.py | 2 +- src/baskerville/scripts/hound_transfer.py | 256 ++++--- src/baskerville/seqnn.py | 36 +- src/baskerville/trainer.py | 68 +- 
tests/data/transfer/model_houlsby.json | 79 +++ tests/data/transfer/model_se4.json | 81 +++ .../transfer/transfer_json/params_full.json | 80 +++ .../transfer_json/params_houlsby.json | 82 +++ .../transfer/transfer_json/params_ia3.json | 81 +++ .../transfer/transfer_json/params_linear.json | 80 +++ .../transfer/transfer_json/params_locon4.json | 83 +++ .../transfer/transfer_json/params_lora.json | 82 +++ .../transfer/transfer_json/params_se4.json | 84 +++ tests/test_transfer/test/params.json | 80 +++ tests/test_transfer/test_ia3.ipynb | 195 ------ tests/test_transfer/test_load_houlsby.py | 25 + tests/test_transfer/test_load_se2.py | 25 + tests/test_transfer/test_seqlen.py | 0 tests/test_transfer/test_transfer.sh | 12 + 28 files changed, 1437 insertions(+), 1625 deletions(-) delete mode 100644 src/baskerville/helpers/transfer_helper.py create mode 100644 tests/data/transfer/model_houlsby.json create mode 100644 tests/data/transfer/model_se4.json create mode 100644 tests/data/transfer/transfer_json/params_full.json create mode 100644 tests/data/transfer/transfer_json/params_houlsby.json create mode 100644 tests/data/transfer/transfer_json/params_ia3.json create mode 100644 tests/data/transfer/transfer_json/params_linear.json create mode 100644 tests/data/transfer/transfer_json/params_locon4.json create mode 100644 tests/data/transfer/transfer_json/params_lora.json create mode 100644 tests/data/transfer/transfer_json/params_se4.json create mode 100644 tests/test_transfer/test/params.json delete mode 100644 tests/test_transfer/test_ia3.ipynb create mode 100644 tests/test_transfer/test_load_houlsby.py create mode 100644 tests/test_transfer/test_load_se2.py create mode 100644 tests/test_transfer/test_seqlen.py create mode 100644 tests/test_transfer/test_transfer.sh diff --git a/.gitignore b/.gitignore index 069c8b2..3d0903b 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,8 @@ dmypy.json # Pyre type checker .pyre/ + +# additional untracked files 
+src/baskerville/scripts/borzoi_test_genes.py +src/baskerville/pygene.py +src/baskerville/snps_old.py diff --git a/src/baskerville/adapters.py b/src/baskerville/adapters.py index f05a307..7e86ac6 100644 --- a/src/baskerville/adapters.py +++ b/src/baskerville/adapters.py @@ -23,18 +23,16 @@ for device in gpu_devices: tf.config.experimental.set_memory_growth(device, True) + ##################### # transfer learning # ##################### class IA3(tf.keras.layers.Layer): # https://arxiv.org/pdf/2205.05638.pdf # ia3 module for attention layer, scale output. - - def __init__(self, - original_layer, - trainable=False, - **kwargs): - + + def __init__(self, original_layer, trainable=False, **kwargs): + # keep the name of this layer the same as the original dense layer. original_layer_config = original_layer.get_config() name = original_layer_config["name"] @@ -42,7 +40,7 @@ def __init__(self, super().__init__(name=name, trainable=trainable, **kwargs) self.output_dim = original_layer_config["units"] - + self.original_layer = original_layer self.original_layer.trainable = False @@ -52,12 +50,12 @@ def __init__(self, use_bias=False, kernel_initializer=tf.keras.initializers.Ones(), trainable=True, - name="ia3" + name="ia3", ) def call(self, inputs): original_output = self.original_layer(inputs) - scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + scaler = self._ia3_layer(tf.constant([[1]], dtype="float64"))[0] return original_output * scaler def get_config(self): @@ -69,15 +67,13 @@ def get_config(self): ) return config + class IA3_ff(tf.keras.layers.Layer): # https://arxiv.org/pdf/2205.05638.pdf # ia3 module for down-projection ff layer, scale input. - - def __init__(self, - original_layer, - trainable=False, - **kwargs): - + + def __init__(self, original_layer, trainable=False, **kwargs): + # keep the name of this layer the same as the original dense layer. 
original_layer_config = original_layer.get_config() name = original_layer_config["name"] @@ -85,7 +81,7 @@ def __init__(self, super().__init__(name=name, trainable=trainable, **kwargs) self.input_dim = original_layer.input_shape[-1] - + self.original_layer = original_layer self.original_layer.trainable = False @@ -95,35 +91,27 @@ def __init__(self, use_bias=False, kernel_initializer=tf.keras.initializers.Ones(), trainable=True, - name="ia3_ff" + name="ia3_ff", ) def call(self, inputs): - scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] + scaler = self._ia3_layer(tf.constant([[1]], dtype="float64"))[0] return self.original_layer(inputs * scaler) def get_config(self): config = super().get_config().copy() - config.update( - { - "size": self.input_dim - } - ) + config.update({"size": self.input_dim}) return config - + + class Lora(tf.keras.layers.Layer): - # adapted from: + # adapted from: # https://arxiv.org/abs/2106.09685 # https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/ # https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py - - def __init__(self, - original_layer, - rank=8, - alpha=16, - trainable=False, - **kwargs): - + + def __init__(self, original_layer, rank=8, alpha=16, trainable=False, **kwargs): + # keep the name of this layer the same as the original dense layer. 
original_layer_config = original_layer.get_config() name = original_layer_config["name"] @@ -131,9 +119,11 @@ def __init__(self, super().__init__(name=name, trainable=trainable, **kwargs) self.output_dim = original_layer_config["units"] - + if rank > self.output_dim: - raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + raise ValueError( + f"LoRA rank {rank} must be less or equal than {self.output_dim}" + ) self.rank = rank self.alpha = alpha @@ -148,9 +138,9 @@ def __init__(self, units=rank, use_bias=False, kernel_initializer=tf.keras.initializers.HeUniform(), - #kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), + # kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), trainable=True, - name="lora_a" + name="lora_a", ) self.up_layer = tf.keras.layers.Dense( @@ -158,7 +148,7 @@ def __init__(self, use_bias=False, kernel_initializer=tf.keras.initializers.Zeros(), trainable=True, - name="lora_b" + name="lora_b", ) def call(self, inputs): @@ -168,27 +158,18 @@ def call(self, inputs): def get_config(self): config = super().get_config().copy() - config.update( - { - "rank": self.rank, - "alpha": self.alpha - } - ) + config.update({"rank": self.rank, "alpha": self.alpha}) return config + class Locon(tf.keras.layers.Layer): - # LoRA for conv-layer, adapted from: + # LoRA for conv-layer, adapted from: # https://arxiv.org/pdf/2309.14859#page=23.84 # https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/modules/locon.py # use default alpha and rank for locon - - def __init__(self, - original_layer, - rank=4, - alpha=1, - trainable=False, - **kwargs): - + + def __init__(self, original_layer, rank=4, alpha=1, trainable=False, **kwargs): + # keep the name of this layer the same as the original conv layer. 
original_layer_config = original_layer.get_config() name = original_layer_config["name"] @@ -197,9 +178,11 @@ def __init__(self, self.input_dim = original_layer.input_shape[-1] self.output_dim = original_layer_config["filters"] - + if rank > self.output_dim: - raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") + raise ValueError( + f"LoRA rank {rank} must be less or equal than {self.output_dim}" + ) self.rank = rank self.alpha = alpha @@ -209,14 +192,14 @@ def __init__(self, input_dim = original_layer.input_shape[-1] output_dim = original_layer_config["filters"] - kernel_size = original_layer_config['kernel_size'][0] - stride = original_layer_config['strides'][0] + kernel_size = original_layer_config["kernel_size"][0] + stride = original_layer_config["strides"][0] dilation_rate = original_layer_config["dilation_rate"][0] # Note: the original paper mentions that normal distribution was # used for initialization. However, the official LoRA implementation # uses "Kaiming/He Initialization". 
- + self.down_layer = tf.keras.layers.Conv1D( filters=rank, kernel_size=kernel_size, @@ -225,9 +208,9 @@ def __init__(self, use_bias=False, dilation_rate=dilation_rate, kernel_initializer=tf.keras.initializers.HeUniform(), - name='locon_down' + name="locon_down", ) - + self.up_layer = tf.keras.layers.Conv1D( filters=output_dim, kernel_size=1, @@ -235,7 +218,7 @@ def __init__(self, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.Zeros(), - name='locon_up' + name="locon_up", ) def call(self, inputs): @@ -245,42 +228,34 @@ def call(self, inputs): def get_config(self): config = super().get_config().copy() - config.update( - { - "rank": self.rank, - "alpha": self.alpha - } - ) + config.update({"rank": self.rank, "alpha": self.alpha}) return config + class AdapterHoulsby(tf.keras.layers.Layer): # https://arxiv.org/abs/1902.00751 # adapted from: https://github.com/jain-harshil/Adapter-BERT - - def __init__( - self, - latent_size, - activation=tf.keras.layers.ReLU(), - **kwargs): + + def __init__(self, latent_size, activation=tf.keras.layers.ReLU(), **kwargs): super(AdapterHoulsby, self).__init__(**kwargs) self.latent_size = latent_size self.activation = activation def build(self, input_shape): self.down_project = tf.keras.layers.Dense( - units=self.latent_size, - activation="linear", + units=self.latent_size, + activation="linear", kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), bias_initializer="zeros", - name='adapter_down' + name="adapter_down", ) - + self.up_project = tf.keras.layers.Dense( - units=input_shape[-1], + units=input_shape[-1], activation="linear", kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), bias_initializer="zeros", - name='adapter_up' + name="adapter_up", ) def call(self, inputs): @@ -292,10 +267,5 @@ def call(self, inputs): def get_config(self): config = super().get_config().copy() - config.update( - { - "latent_size": self.latent_size, - "activation": self.activation - } - ) + 
config.update({"latent_size": self.latent_size, "activation": self.activation}) return config diff --git a/src/baskerville/blocks.py b/src/baskerville/blocks.py index ffbdb75..ce939ef 100644 --- a/src/baskerville/blocks.py +++ b/src/baskerville/blocks.py @@ -195,7 +195,7 @@ def conv_dna( kernel_initializer=kernel_initializer, kernel_regularizer=tf.keras.regularizers.l2(l2_scale), )(current) - + # squeeze-excite if se: current = squeeze_excite(current) @@ -1109,8 +1109,6 @@ def transformer( qkv_width=1, mha_initializer="he_normal", kernel_initializer="he_normal", - adapter=None, - latent=16, seqlen_train=None, **kwargs, ): @@ -1143,17 +1141,13 @@ def transformer( initializer=mha_initializer, l2_scale=mha_l2_scale, qkv_width=qkv_width, - seqlen_train=seqlen_train + seqlen_train=seqlen_train, )(current) # dropout if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) - # add houlsby-adapter - if adapter=='houlsby': - current = layers.AdapterHoulsby(latent_size=latent)(current) - # residual current = tf.keras.layers.Add()([inputs, current]) @@ -1161,7 +1155,7 @@ def transformer( final = current else: final = transformer_dense( - current, out_size, dense_expansion, l2_scale, dropout, kernel_initializer, adapter, latent + current, out_size, dense_expansion, l2_scale, dropout, kernel_initializer ) return final @@ -1273,8 +1267,7 @@ def transformer_split( def transformer_dense( - inputs, out_size, dense_expansion, l2_scale, dropout, kernel_initializer, - adapter=None, latent=16 + inputs, out_size, dense_expansion, l2_scale, dropout, kernel_initializer ): """Transformer block dense portion.""" # layer norm @@ -1306,9 +1299,6 @@ def transformer_dense( if dropout > 0: current = tf.keras.layers.Dropout(dropout)(current) - if adapter=='houlsby': - current = layers.AdapterHoulsby(latent_size=latent)(current) - # residual final = tf.keras.layers.Add()([inputs, current]) @@ -1451,20 +1441,21 @@ def squeeze_excite( additive=False, norm_type=None, bn_momentum=0.9, - 
kernel_initializer='glorot_uniform', + kernel_initializer="glorot_uniform", use_bias=True, - scale_fun='sigmoid', + scale_fun="sigmoid", **kwargs, ): return layers.SqueezeExcite( - activation=activation, - additive=additive, - bottleneck_ratio=bottleneck_ratio, - norm_type=norm_type, - bn_momentum=bn_momentum, - kernel_initializer=kernel_initializer, + activation=activation, + additive=additive, + bottleneck_ratio=bottleneck_ratio, + norm_type=norm_type, + bn_momentum=bn_momentum, + kernel_initializer=kernel_initializer, scale_fun=scale_fun, - use_bias=use_bias)(inputs) + use_bias=use_bias, + )(inputs) def wheeze_excite(inputs, pool_size, **kwargs): diff --git a/src/baskerville/helpers/transfer.py b/src/baskerville/helpers/transfer.py index 25b80f5..a52554a 100644 --- a/src/baskerville/helpers/transfer.py +++ b/src/baskerville/helpers/transfer.py @@ -16,27 +16,34 @@ from baskerville import layers from baskerville import adapters -def param_count(layer, type='all'): - if type not in ['all','trainable','non_trainable']: + +def param_count(layer, type="all"): + if type not in ["all", "trainable", "non_trainable"]: raise ValueError("TYPE must be one of all, trainable, non_trainable") output = 0 - if type=='all': + if type == "all": output = int(sum(tf.keras.backend.count_params(w) for w in layer.weights)) - elif type=='trainable': - output = int(sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights)) + elif type == "trainable": + output = int( + sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights) + ) else: - output = int(sum(tf.keras.backend.count_params(w) for w in layer.non_trainable_weights)) + output = int( + sum(tf.keras.backend.count_params(w) for w in layer.non_trainable_weights) + ) return output + def param_summary(model): - trainable = param_count(model, type='trainable') - non_trainable = param_count(model, type='non_trainable') - print('total params:%d' %(trainable + non_trainable)) - print('trainable params:%d' 
%trainable) - print('non-trainable params:%d' %non_trainable) + trainable = param_count(model, type="trainable") + non_trainable = param_count(model, type="non_trainable") + print("total params:%d" % (trainable + non_trainable)) + print("trainable params:%d" % trainable) + print("non-trainable params:%d" % non_trainable) + def keras2dict(model): - layer_parent_dict = {} # the parent layers of each layer in the old graph + layer_parent_dict = {} # the parent layers of each layer in the old graph for layer in model.layers: for node in layer._outbound_nodes: layer_name = node.outbound_layer.name @@ -47,6 +54,7 @@ def keras2dict(model): layer_parent_dict[layer_name].append(layer.name) return layer_parent_dict + # lora requires change model.h5 weight order. # locon and ia3 don't modify model in place. def var_reorder(weight_h5): @@ -56,49 +64,56 @@ def var_reorder(weight_h5): # When inserting lora, multihead_attention layer weights order changed. # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. 
- var_init_order = ['r_w_bias:0:0', - 'r_r_bias:0:0', - 'q_layer/kernel:0', - 'k_layer/kernel:0', - 'v_layer/kernel:0', - 'embedding_layer/kernel:0', - 'embedding_layer/bias:0', - 'r_k_layer/kernel:0'] - - f = h5py.File(weight_h5, 'r+') - layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] + var_init_order = [ + "r_w_bias:0:0", + "r_r_bias:0:0", + "q_layer/kernel:0", + "k_layer/kernel:0", + "v_layer/kernel:0", + "embedding_layer/kernel:0", + "embedding_layer/bias:0", + "r_k_layer/kernel:0", + ] + + f = h5py.File(weight_h5, "r+") + layers = [i for i in list(f["model_weights"].keys()) if "multihead_attention" in i] for l_name in layers: - new_name_order = [l_name+'/'+i for i in var_init_order] - f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) + new_name_order = [l_name + "/" + i for i in var_init_order] + f["model_weights"][l_name].attrs.modify( + name="weight_names", value=new_name_order + ) f.close() # houlsby requires architecture change. # thus we need to modify json. 
-def modify_json(input_json, output_json, adapter, latent=8, se_rank=None, conv_select=None): +def modify_json( + input_json, output_json, adapter, latent=8, se_rank=None, conv_select=None +): with open(input_json) as params_open: params = json.load(params_open) # houlsby - if adapter=='adapterHoulsby': - params["model"]['adapter']= 'houlsby' - params["model"]['adapter_latent']= latent + if adapter == "adapterHoulsby": + params["model"]["adapter"] = "houlsby" + params["model"]["adapter_latent"] = latent # houlsby_se - elif adapter=='houlsby_se': - params["model"]['adapter']= 'houlsby_se' - params["model"]['adapter_latent']= latent - params["model"]['se_rank']= se_rank - params["model"]['conv_select']= conv_select + elif adapter == "houlsby_se": + params["model"]["adapter"] = "houlsby_se" + params["model"]["adapter_latent"] = latent + params["model"]["se_rank"] = se_rank + params["model"]["conv_select"] = conv_select else: raise ValueError("adapter must be adapterHoulsby or houlsby_se") - + ### output - with open(output_json, 'w') as params_open: + with open(output_json, "w") as params_open: json.dump(params, params_open, indent=4) - + + ###################### # add houlsby layers # ###################### @@ -111,57 +126,60 @@ def add_houlsby(input_model, strand_pair, latent_size=8): # houlsby layers # ################## houlsby_layers = [] - for i in range(len(input_model.layers)-1): + for i in range(len(input_model.layers) - 1): layer = input_model.layers[i] - next_layer = input_model.layers[i+1] - if re.match('dropout', layer.name) and re.match('add', next_layer.name): + next_layer = input_model.layers[i + 1] + if re.match("dropout", layer.name) and re.match("add", next_layer.name): houlsby_layers += [next_layer.name] ################### # construct model # - ################### + ################### layer_parent_dict_old = keras2dict(input_model) # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + to_fix 
= [i for i in layer_parent_dict_old if re.match("switch_reverse", i)] for i in to_fix: del layer_parent_dict_old[i] # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) # Iterate over all layers after the input model_outputs = [] reverse_bool = None for layer in input_model.layers[1:-1]: - + # parent layers parent_layers = layer_parent_dict_old[layer.name] - + # layer inputs layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - + if len(layer_input) == 1: + layer_input = layer_input[0] + + if re.match("stochastic_reverse_complement", layer.name): + x, reverse_bool = layer(layer_input) + # insert houlsby layer: elif layer.name in houlsby_layers: - print('adapter added before:%s'%layer.name) + print("adapter added before:%s" % layer.name) x = adapters.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) x = layer([layer_input[0], x]) - + else: x = layer(layer_input) - + # save the output tensor of every layer layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + + final = layers.SwitchReverse(strand_pair)( + [layer_output_dict_new[input_model.layers[-2].name], reverse_bool] + ) model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) # set trainable # - for l in model_adapter.layers[:-2]: # trunk - if re.match('layer_normalization|adapter_houlsby', l.name): + for l in model_adapter.layers[:-2]: # trunk + if re.match("layer_normalization|adapter_houlsby", l.name): l.trainable = True 
else: l.trainable = False @@ -169,70 +187,84 @@ def add_houlsby(input_model, strand_pair, latent_size=8): # expected number of trainable params added/unfrozen: params_added = 0 for l in model_adapter.layers: - if l.name.startswith("adapter_houlsby"): + if l.name.startswith("adapter_houlsby"): params_added += param_count(l) - elif l.name.startswith("layer_normalization"): - params_added += param_count(l, type='trainable') - print('params added/unfrozen by adapter_houlsby: %d'%params_added) + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type="trainable") + print("params added/unfrozen by adapter_houlsby: %d" % params_added) return model_adapter + ############### # lora layers # ############### -def add_lora(input_model, rank=8, alpha=16, mode='default', report_param=True): +def add_lora(input_model, rank=8, alpha=16, mode="default", report_param=True): # take seqnn.model as input # replace _q_layer, _v_layer in multihead_attention # optionally replace _k_layer, _embedding_layer - if mode not in ['default','full']: + if mode not in ["default", "full"]: raise ValueError("mode must be default or full") - + for layer in input_model.layers: - if re.match('multihead_attention', layer.name): + if re.match("multihead_attention", layer.name): # default loRA - layer._q_layer = adapters.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) - layer._v_layer = adapters.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) + layer._q_layer = adapters.Lora( + layer._q_layer, rank=rank, alpha=alpha, trainable=True + ) + layer._v_layer = adapters.Lora( + layer._v_layer, rank=rank, alpha=alpha, trainable=True + ) # full loRA - if mode=='full': - layer._k_layer = adapters.Lora(layer._k_layer, rank=rank, alpha=alpha, trainable=True) - layer._embedding_layer = adapters.Lora(layer._embedding_layer, rank=rank, alpha=alpha, trainable=True) - - input_model(input_model.input) # initialize new variables + if mode == "full": + layer._k_layer = 
adapters.Lora( + layer._k_layer, rank=rank, alpha=alpha, trainable=True + ) + layer._embedding_layer = adapters.Lora( + layer._embedding_layer, rank=rank, alpha=alpha, trainable=True + ) + + input_model(input_model.input) # initialize new variables # freeze all params but lora for layer in input_model._flatten_layers(): lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: + if len(lst_of_sublayers) == 1: if layer.name in ["lora_a", "lora_b"]: layer.trainable = True else: layer.trainable = False - ### bias terms need to be frozen separately + ### bias terms need to be frozen separately for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + if re.match("multihead_attention", layer.name): + layer._r_w_bias = tf.Variable( + layer._r_w_bias, trainable=False, name=layer._r_w_bias.name + ) + layer._r_r_bias = tf.Variable( + layer._r_r_bias, trainable=False, name=layer._r_r_bias.name + ) # set final head to be trainable - input_model.layers[-2].trainable=True + input_model.layers[-2].trainable = True # expected number of trainable params added/unfrozen: params_added = 0 for l in input_model.layers: - if re.match('multihead_attention', l.name): + if re.match("multihead_attention", l.name): params_added += param_count(l._q_layer.down_layer) params_added += param_count(l._q_layer.up_layer) params_added += param_count(l._v_layer.down_layer) params_added += param_count(l._v_layer.up_layer) - if mode=='full': + if mode == "full": params_added += param_count(l._k_layer.down_layer) params_added += param_count(l._k_layer.up_layer) params_added += param_count(l._embedding_layer.down_layer) params_added += param_count(l._embedding_layer.up_layer) if report_param: - print('params added/unfrozen by lora: %d'%params_added) + print("params 
added/unfrozen by lora: %d" % params_added) + ############### # lora layers # @@ -240,28 +272,30 @@ def add_lora(input_model, rank=8, alpha=16, mode='default', report_param=True): def add_lora_conv(input_model, conv_select=None): # add lora layers - add_lora(input_model, rank=8, alpha=16, mode='default', report_param=False) + add_lora(input_model, rank=8, alpha=16, mode="default", report_param=False) # list all conv layers conv_layers = [] for layer in input_model.layers: - if re.match('conv1d', layer.name): + if re.match("conv1d", layer.name): conv_layers += [layer.name] - if conv_select is None: + if conv_select is None: conv_select = len(conv_layers) if conv_select > len(conv_layers): - raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + raise ValueError( + "conv_select must be less than number of conv layers %d." % len(conv_layers) + ) # set conv layers trainable - trainable_conv = conv_layers[-conv_select:] + trainable_conv = conv_layers[-conv_select:] for layer in input_model.layers: if layer.name in trainable_conv: - layer.trainable=True - + layer.trainable = True + # expected number of trainable params added/unfrozen: params_added = 0 for l in input_model.layers: - if re.match('multihead_attention', l.name): + if re.match("multihead_attention", l.name): params_added += param_count(l._q_layer.down_layer) params_added += param_count(l._q_layer.up_layer) params_added += param_count(l._v_layer.down_layer) @@ -269,26 +303,30 @@ def add_lora_conv(input_model, conv_select=None): elif l.name in trainable_conv: params_added += param_count(l) - print('params added/unfrozen by lora_conv: %d'%params_added) + print("params added/unfrozen by lora_conv: %d" % params_added) + # merge lora weights # def merge_lora_layer(lora_layer): down_weights = lora_layer.down_layer.kernel up_weights = lora_layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + increment_weights = ( + 
tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale + ) lora_layer.original_layer.kernel.assign_add(increment_weights) return lora_layer.original_layer + def merge_lora(input_model): for layer in input_model.layers: - if 'multihead_attention' in layer.name: + if "multihead_attention" in layer.name: if isinstance(layer._q_layer, adapters.Lora): layer._q_layer = merge_lora_layer(layer._q_layer) - if isinstance(layer._v_layer, adapters.Lora): + if isinstance(layer._v_layer, adapters.Lora): layer._v_layer = merge_lora_layer(layer._v_layer) - if isinstance(layer._k_layer, adapters.Lora): + if isinstance(layer._k_layer, adapters.Lora): layer._k_layer = merge_lora_layer(layer._k_layer) - if isinstance(layer._embedding_layer, adapters.Lora): + if isinstance(layer._embedding_layer, adapters.Lora): layer._embedding_layer = merge_lora_layer(layer._embedding_layer) input_model(input_model.input) @@ -297,88 +335,96 @@ def merge_lora(input_model): # IA3 layers # ############## def add_ia3(input_model, strand_pair): - + # add to kv layers # for layer in input_model.layers: - if re.match('multihead_attention', layer.name): + if re.match("multihead_attention", layer.name): layer._k_layer = adapters.IA3(layer._k_layer, trainable=True) layer._v_layer = adapters.IA3(layer._v_layer, trainable=True) - + # add to ff layer # # save old graph to dictionary layer_parent_dict_old = keras2dict(input_model) - + # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + to_fix = [i for i in layer_parent_dict_old if re.match("switch_reverse", i)] for i in to_fix: del layer_parent_dict_old[i] # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - + # Iterate over all layers after the input model_outputs = [] reverse_bool = None 
for layer in input_model.layers[1:-1]: - + # get layer inputs parent_layers = layer_parent_dict_old[layer.name] layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] + if len(layer_input) == 1: + layer_input = layer_input[0] # construct - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) + if re.match("stochastic_reverse_complement", layer.name): + x, reverse_bool = layer(layer_input) # transformer ff down-project layer (1536 -> 768): - elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: + elif re.match("dense", layer.name) and layer.input_shape[-1] == 1536: x = adapters.IA3_ff(layer, trainable=True)(layer_input) else: x = layer(layer_input) - + # save layers to dictionary layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + + final = layers.SwitchReverse(strand_pair)( + [layer_output_dict_new[input_model.layers[-2].name], reverse_bool] + ) model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) # set trainable # for layer in model_adapter._flatten_layers(): lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: - if layer.name in ['ia3', 'ia3_ff']: + if len(lst_of_sublayers) == 1: + if layer.name in ["ia3", "ia3_ff"]: layer.trainable = True else: layer.trainable = False - - ### bias terms need to be frozen separately + + ### bias terms need to be frozen separately for layer in model_adapter.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) + if re.match("multihead_attention", layer.name): + layer._r_w_bias = tf.Variable( + layer._r_w_bias, trainable=False, name=layer._r_w_bias.name + ) 
+ layer._r_r_bias = tf.Variable( + layer._r_r_bias, trainable=False, name=layer._r_r_bias.name + ) # set final head to be trainable - model_adapter.layers[-2].trainable=True + model_adapter.layers[-2].trainable = True # expected number of trainable params added/unfrozen: params_added = 0 for l in model_adapter.layers: - if re.match('multihead_attention', l.name): # kv layers + if re.match("multihead_attention", l.name): # kv layers params_added += param_count(l._k_layer._ia3_layer) params_added += param_count(l._v_layer._ia3_layer) - elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers + elif re.match("dense", l.name) and l.input_shape[-1] == 1536: # ff layers params_added += param_count(l._ia3_layer) - - print('params added/unfrozen by ia3: %d'%params_added) - + + print("params added/unfrozen by ia3: %d" % params_added) + return model_adapter + def merge_ia3(original_model, ia3_model): # original model contains pre-trained weights # ia3 model is the fine-tuned ia3 model for i, layer in enumerate(original_model.layers): # attention layers - if re.match('multihead_attention', layer.name): + if re.match("multihead_attention", layer.name): # scale k k_scaler = ia3_model.layers[i]._k_layer._ia3_layer.kernel[0] layer._k_layer.kernel.assign(layer._k_layer.kernel * k_scaler) @@ -386,13 +432,14 @@ def merge_ia3(original_model, ia3_model): v_scaler = ia3_model.layers[i]._v_layer._ia3_layer.kernel[0] layer._v_layer.kernel.assign(layer._v_layer.kernel * v_scaler) # ff layers - elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: + elif re.match("dense", layer.name) and layer.input_shape[-1] == 1536: ff_scaler = tf.expand_dims(ia3_model.layers[i]._ia3_layer.kernel[0], 1) layer.kernel.assign(layer.kernel * ff_scaler) # other layers else: layer.set_weights(ia3_model.layers[i].get_weights()) + ############# # add locon # ############# @@ -400,22 +447,24 @@ def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): # first add 
lora to attention add_lora(input_model, report_param=False) - + # decide: # 1. whether conv1 is trainable # 2. which conv layers to add loRA - + # all conv layers conv_layers = [] for layer in input_model.layers: - if re.match('conv1d', layer.name): + if re.match("conv1d", layer.name): conv_layers += [layer.name] - if conv_select is None: + if conv_select is None: conv_select = len(conv_layers) - + if conv_select > len(conv_layers): - raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + raise ValueError( + "conv_select must be less than number of conv layers %d." % len(conv_layers) + ) locon_layers = [] conv1_tune = False @@ -424,40 +473,45 @@ def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): conv1_tune = True else: locon_layers = conv_layers[-conv_select:] - + layer_parent_dict_old = keras2dict(input_model) - + # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + to_fix = [i for i in layer_parent_dict_old if re.match("switch_reverse", i)] for i in to_fix: del layer_parent_dict_old[i] # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new = {} # the output tensor of each layer in the new graph layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - + # Iterate over all layers after the input model_outputs = [] reverse_bool = None for layer in input_model.layers[1:-1]: - + # get layer inputs parent_layers = layer_parent_dict_old[layer.name] layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] + if len(layer_input) == 1: + layer_input = layer_input[0] # construct - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) + if re.match("stochastic_reverse_complement", layer.name): + x, reverse_bool = layer(layer_input) elif layer.name in 
locon_layers: - x = adapters.Locon(layer, trainable=True, rank=rank, alpha=alpha)(layer_input) + x = adapters.Locon(layer, trainable=True, rank=rank, alpha=alpha)( + layer_input + ) else: x = layer(layer_input) - + # save layers to dictionary layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + + final = layers.SwitchReverse(strand_pair)( + [layer_output_dict_new[input_model.layers[-2].name], reverse_bool] + ) model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) if conv1_tune: @@ -468,7 +522,7 @@ def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): if conv1_tune: params_added += param_count(model_adapter.get_layer(name=conv_layers[0])) for l in model_adapter.layers: - if re.match('multihead_attention', l.name): + if re.match("multihead_attention", l.name): params_added += param_count(l._q_layer.down_layer) params_added += param_count(l._q_layer.up_layer) params_added += param_count(l._v_layer.down_layer) @@ -477,10 +531,11 @@ def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): params_added += param_count(l.down_layer) params_added += param_count(l.up_layer) - print('params added/unfrozen by lora: %d'%params_added) + print("params added/unfrozen by lora: %d" % params_added) return model_adapter + #### functions to merge locon def lora_increment(layer): down_weights = layer.down_layer.kernel @@ -488,22 +543,24 @@ def lora_increment(layer): increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * layer.scale return increment_weights + def locon_increment(layer): down_weights = layer.down_layer.kernel up_weights = layer.up_layer.kernel[0] increment_weights = tf.einsum("abc,cd->abd", down_weights, up_weights) * layer.scale return increment_weights + def merge_locon(original_model, locon_model): # original model contains pre-trained weights for i, layer in 
enumerate(original_model.layers): - + # lora layers - if re.match('multihead_attention', layer.name): + if re.match("multihead_attention", layer.name): q = locon_model.layers[i]._q_layer k = locon_model.layers[i]._k_layer v = locon_model.layers[i]._v_layer - e = locon_model.layers[i]._embedding_layer + e = locon_model.layers[i]._embedding_layer if isinstance(q, adapters.Lora): increment_weights = lora_increment(q) layer._q_layer.kernel.assign_add(increment_weights) @@ -516,20 +573,22 @@ def merge_locon(original_model, locon_model): if isinstance(e, adapters.Lora): increment_weights = lora_increment(e) layer._embedding_layer.kernel.assign_add(increment_weights) - + # locon layers elif isinstance(locon_model.layers[i], adapters.Locon): - increment_weights = locon_increment(locon_model.layers[i]) - layer.kernel.assign_add(increment_weights) - + increment_weights = locon_increment(locon_model.layers[i]) + layer.kernel.assign_add(increment_weights) + else: layer.set_weights(locon_model.layers[i].get_weights()) - + ############## # houlsby_se # ############## -def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, se_rank=16): +def add_houlsby_se( + input_model, strand_pair, houlsby_latent=8, conv_select=None, se_rank=16 +): # add squeeze-excitation blocks after conv # input_model should be properly frozen # pre_att: add se_block to pre-attention conv1d @@ -539,10 +598,10 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, # houlsby layers # ################## houlsby_layers = [] - for i in range(len(input_model.layers)-1): + for i in range(len(input_model.layers) - 1): layer = input_model.layers[i] - next_layer = input_model.layers[i+1] - if re.match('dropout', layer.name) and re.match('add', next_layer.name): + next_layer = input_model.layers[i + 1] + if re.match("dropout", layer.name) and re.match("add", next_layer.name): houlsby_layers += [next_layer.name] ############# @@ -550,12 +609,14 @@ def 
add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, ############# conv_layers = [] for layer in input_model.layers: - if re.match('conv1d', layer.name): + if re.match("conv1d", layer.name): conv_layers += [layer.name] - if conv_select is None: + if conv_select is None: se_layers = conv_layers[1:] if conv_select >= len(conv_layers): - raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) + raise ValueError( + "conv_select must be less than number of conv layers %d." % len(conv_layers) + ) se_layers = conv_layers[-conv_select:] ################### @@ -563,74 +624,79 @@ def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, ################### layer_parent_dict_old = keras2dict(input_model) # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] + to_fix = [i for i in layer_parent_dict_old if re.match("switch_reverse", i)] for i in to_fix: del layer_parent_dict_old[i] # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) + layer_output_dict_new = {} # the output tensor of each layer in the new graph + layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) # Iterate over all layers after the input model_outputs = [] reverse_bool = None - + for layer in input_model.layers[1:-1]: - + # parent layers parent_layers = layer_parent_dict_old[layer.name] - + # layer inputs layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] + if len(layer_input) == 1: + layer_input = layer_input[0] if layer.name.startswith("stochastic_reverse_complement"): - x, reverse_bool = layer(layer_input) - + x, reverse_bool = layer(layer_input) + # insert houlsby layer: elif layer.name in houlsby_layers: - print('adapter added 
before:%s'%layer.name) + print("adapter added before:%s" % layer.name) x = adapters.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) x = layer([layer_input[0], x]) # insert squeeze-excite layer: elif layer.name in se_layers: se_layer = layers.SqueezeExcite( - activation=None, # no activation before squeezing - additive=False, # use sigmoid multiplicative scaling - rank=se_rank, # bottleneck ratio - use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization - scale_fun='tanh' + activation=None, # no activation before squeezing + additive=False, # use sigmoid multiplicative scaling + rank=se_rank, # bottleneck ratio + use_bias=False, # ignore bias + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=1e-3 + ), # near-zero weight initialization + scale_fun="tanh", ) x = layer(layer_input) x = x + se_layer(x) - + else: x = layer(layer_input) - + # save the output tensor of every layer layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) + + final = layers.SwitchReverse(strand_pair)( + [layer_output_dict_new[input_model.layers[-2].name], reverse_bool] + ) model_final = tf.keras.Model(inputs=input_model.inputs, outputs=final) # set trainable - for l in model_final.layers[:-2]: # trunk - if re.match('layer_normalization|adapter_houlsby', l.name): + for l in model_final.layers[:-2]: # trunk + if re.match("layer_normalization|adapter_houlsby", l.name): l.trainable = True else: l.trainable = False - for l in model_final.layers: # set trunk - if l.name.startswith("squeeze_excite"): l.trainable = True + for l in model_final.layers: # set trunk + if l.name.startswith("squeeze_excite"): + l.trainable = True # expected number of trainable params added/unfrozen: params_added = 0 for l in model_final.layers: - if re.match('squeeze_excite|adapter_houlsby', l.name): + if 
re.match("squeeze_excite|adapter_houlsby", l.name): params_added += param_count(l) - elif l.name.startswith("layer_normalization"): - params_added += param_count(l, type='trainable') - print('params added/unfrozen by houlsby_se: %d'%params_added) - - return model_final + elif l.name.startswith("layer_normalization"): + params_added += param_count(l, type="trainable") + print("params added/unfrozen by houlsby_se: %d" % params_added) + return model_final diff --git a/src/baskerville/helpers/transfer_helper.py b/src/baskerville/helpers/transfer_helper.py deleted file mode 100644 index aa178b6..0000000 --- a/src/baskerville/helpers/transfer_helper.py +++ /dev/null @@ -1,635 +0,0 @@ -import argparse -import json -import os -import shutil -import re -import h5py - -import numpy as np -import pandas as pd -import tensorflow as tf -from tensorflow.keras import mixed_precision - -from baskerville import dataset -from baskerville import seqnn -from baskerville import trainer -from baskerville import layers - -def param_count(layer, type='all'): - if type not in ['all','trainable','non_trainable']: - raise ValueError("TYPE must be one of all, trainable, non_trainable") - output = 0 - if type=='all': - output = int(sum(tf.keras.backend.count_params(w) for w in layer.weights)) - elif type=='trainable': - output = int(sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights)) - else: - output = int(sum(tf.keras.backend.count_params(w) for w in layer.non_trainable_weights)) - return output - -def param_summary(model): - trainable = param_count(model, type='trainable') - non_trainable = param_count(model, type='non_trainable') - print('total params:%d' %(trainable + non_trainable)) - print('trainable params:%d' %trainable) - print('non-trainable params:%d' %non_trainable) - -def keras2dict(model): - layer_parent_dict = {} # the parent layers of each layer in the old graph - for layer in model.layers: - for node in layer._outbound_nodes: - layer_name = 
node.outbound_layer.name - if layer_name not in layer_parent_dict: - layer_parent_dict.update({layer_name: [layer.name]}) - else: - if layer.name not in layer_parent_dict[layer_name]: - layer_parent_dict[layer_name].append(layer.name) - return layer_parent_dict - -# lora requires change model.h5 weight order. -# locon and ia3 don't modify model in place. -def var_reorder(weight_h5): - # assumes weight_h5 model saved with seqnn_model.save() - # [i.name for i in model.layers[30].weights] to check for multihead_attention layer weights order. - # model.load_weights() load weights sequencially, assuming h5 weights are in the right order. - # When inserting lora, multihead_attention layer weights order changed. - # multihead_attention layer weights order is saved inside f['model_weights']['multihead_attention'].attrs - # After saving the weight_merged model, we need to go into the weights.h5, and change the attrs in multihead attention. - var_init_order = ['r_w_bias:0:0', - 'r_r_bias:0:0', - 'q_layer/kernel:0', - 'k_layer/kernel:0', - 'v_layer/kernel:0', - 'embedding_layer/kernel:0', - 'embedding_layer/bias:0', - 'r_k_layer/kernel:0'] - - f = h5py.File(weight_h5, 'r+') - layers = [i for i in list(f['model_weights'].keys()) if 'multihead_attention' in i] - for l_name in layers: - new_name_order = [l_name+'/'+i for i in var_init_order] - f['model_weights'][l_name].attrs.modify(name='weight_names', value=new_name_order) - f.close() - - -# houlsby requires architecture change. -# thus we need to modify json. 
-def modify_json(input_json, output_json, adapter, latent=8, se_rank=None, conv_select=None): - - with open(input_json) as params_open: - params = json.load(params_open) - - # houlsby - if adapter=='adapterHoulsby': - params["model"]['adapter']= 'houlsby' - params["model"]['adapter_latent']= latent - - # houlsby_se - elif adapter=='houlsby_se': - params["model"]['adapter']= 'houlsby_se' - params["model"]['adapter_latent']= latent - params["model"]['se_rank']= se_rank - params["model"]['conv_select']= conv_select - - else: - raise ValueError("adapter must be adapterHoulsby or houlsby_se") - - ### output - with open(output_json, 'w') as params_open: - json.dump(params, params_open, indent=4) - -###################### -# add houlsby layers # -###################### -def add_houlsby(input_model, strand_pair, latent_size=8): - # take seqnn_model as input - # output a new seqnn_model object - # only the adapter, and layer_norm are trainable - - ################## - # houlsby layers # - ################## - houlsby_layers = [] - for i in range(len(input_model.layers)-1): - layer = input_model.layers[i] - next_layer = input_model.layers[i+1] - if re.match('dropout', layer.name) and re.match('add', next_layer.name): - houlsby_layers += [next_layer.name] - - ################### - # construct model # - ################### - layer_parent_dict_old = keras2dict(input_model) - # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - # Iterate over all layers after the input - model_outputs = [] - reverse_bool = None - - for layer in input_model.layers[1:-1]: - - # parent layers - parent_layers = layer_parent_dict_old[layer.name] - - # layer inputs - layer_input = [layer_output_dict_new[parent] for 
parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - - # insert houlsby layer: - elif layer.name in houlsby_layers: - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=latent_size)(layer_input[1]) - x = layer([layer_input[0], x]) - - else: - x = layer(layer_input) - - # save the output tensor of every layer - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - - # set trainable # - for l in model_adapter.layers[:-2]: # trunk - if re.match('layer_normalization|adapter_houlsby', l.name): - l.trainable = True - else: - l.trainable = False - - # expected number of trainable params added/unfrozen: - params_added = 0 - for l in model_adapter.layers: - if l.name.startswith("adapter_houlsby"): - params_added += param_count(l) - elif l.name.startswith("layer_normalization"): - params_added += param_count(l, type='trainable') - print('params added/unfrozen by adapter_houlsby: %d'%params_added) - - return model_adapter - -############### -# lora layers # -############### -def add_lora(input_model, rank=8, alpha=16, mode='default', report_param=True): - # take seqnn.model as input - # replace _q_layer, _v_layer in multihead_attention - # optionally replace _k_layer, _embedding_layer - if mode not in ['default','full']: - raise ValueError("mode must be default or full") - - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - # default loRA - layer._q_layer = layers.Lora(layer._q_layer, rank=rank, alpha=alpha, trainable=True) - layer._v_layer = layers.Lora(layer._v_layer, rank=rank, alpha=alpha, trainable=True) - # full loRA - if mode=='full': - layer._k_layer = layers.Lora(layer._k_layer, rank=rank, 
alpha=alpha, trainable=True) - layer._embedding_layer = layers.Lora(layer._embedding_layer, rank=rank, alpha=alpha, trainable=True) - - input_model(input_model.input) # initialize new variables - - # freeze all params but lora - for layer in input_model._flatten_layers(): - lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: - if layer.name in ["lora_a", "lora_b"]: - layer.trainable = True - else: - layer.trainable = False - - ### bias terms need to be frozen separately - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) - - # set final head to be trainable - input_model.layers[-2].trainable=True - - # expected number of trainable params added/unfrozen: - params_added = 0 - for l in input_model.layers: - if re.match('multihead_attention', l.name): - params_added += param_count(l._q_layer.down_layer) - params_added += param_count(l._q_layer.up_layer) - params_added += param_count(l._v_layer.down_layer) - params_added += param_count(l._v_layer.up_layer) - if mode=='full': - params_added += param_count(l._k_layer.down_layer) - params_added += param_count(l._k_layer.up_layer) - params_added += param_count(l._embedding_layer.down_layer) - params_added += param_count(l._embedding_layer.up_layer) - - if report_param: - print('params added/unfrozen by lora: %d'%params_added) - -############### -# lora layers # -############### -def add_lora_conv(input_model, conv_select=None): - - # add lora layers - add_lora(input_model, rank=8, alpha=16, mode='default', report_param=False) - - # list all conv layers - conv_layers = [] - for layer in input_model.layers: - if re.match('conv1d', layer.name): - conv_layers += [layer.name] - if conv_select is None: - conv_select = len(conv_layers) - if conv_select > len(conv_layers): - raise 
ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) - - # set conv layers trainable - trainable_conv = conv_layers[-conv_select:] - for layer in input_model.layers: - if layer.name in trainable_conv: - layer.trainable=True - - # expected number of trainable params added/unfrozen: - params_added = 0 - for l in input_model.layers: - if re.match('multihead_attention', l.name): - params_added += param_count(l._q_layer.down_layer) - params_added += param_count(l._q_layer.up_layer) - params_added += param_count(l._v_layer.down_layer) - params_added += param_count(l._v_layer.up_layer) - elif l.name in trainable_conv: - params_added += param_count(l) - - print('params added/unfrozen by lora_conv: %d'%params_added) - -# merge lora weights # -def merge_lora_layer(lora_layer): - down_weights = lora_layer.down_layer.kernel - up_weights = lora_layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * lora_layer.scale - lora_layer.original_layer.kernel.assign_add(increment_weights) - return lora_layer.original_layer - -def merge_lora(input_model): - for layer in input_model.layers: - if 'multihead_attention' in layer.name: - if isinstance(layer._q_layer, layers.Lora): - layer._q_layer = merge_lora_layer(layer._q_layer) - if isinstance(layer._v_layer, layers.Lora): - layer._v_layer = merge_lora_layer(layer._v_layer) - if isinstance(layer._k_layer, layers.Lora): - layer._k_layer = merge_lora_layer(layer._k_layer) - if isinstance(layer._embedding_layer, layers.Lora): - layer._embedding_layer = merge_lora_layer(layer._embedding_layer) - input_model(input_model.input) - - -############## -# IA3 layers # -############## -def add_ia3(input_model, strand_pair): - - # add to kv layers # - for layer in input_model.layers: - if re.match('multihead_attention', layer.name): - layer._k_layer = layers.IA3(layer._k_layer, trainable=True) - layer._v_layer = layers.IA3(layer._v_layer, trainable=True) - - # add to ff layer # - 
# save old graph to dictionary - layer_parent_dict_old = keras2dict(input_model) - - # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - - # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - - # Iterate over all layers after the input - model_outputs = [] - reverse_bool = None - for layer in input_model.layers[1:-1]: - - # get layer inputs - parent_layers = layer_parent_dict_old[layer.name] - layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - # construct - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - # transformer ff down-project layer (1536 -> 768): - elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: - x = layers.IA3_ff(layer, trainable=True)(layer_input) - else: - x = layer(layer_input) - - # save layers to dictionary - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - - # set trainable # - for layer in model_adapter._flatten_layers(): - lst_of_sublayers = list(layer._flatten_layers()) - if len(lst_of_sublayers) == 1: - if layer.name in ['ia3', 'ia3_ff']: - layer.trainable = True - else: - layer.trainable = False - - ### bias terms need to be frozen separately - for layer in model_adapter.layers: - if re.match('multihead_attention', layer.name): - layer._r_w_bias = tf.Variable(layer._r_w_bias, trainable=False, name=layer._r_w_bias.name) - layer._r_r_bias = tf.Variable(layer._r_r_bias, trainable=False, name=layer._r_r_bias.name) - - # set final head to be trainable - 
model_adapter.layers[-2].trainable=True - - # expected number of trainable params added/unfrozen: - params_added = 0 - for l in model_adapter.layers: - if re.match('multihead_attention', l.name): # kv layers - params_added += param_count(l._k_layer._ia3_layer) - params_added += param_count(l._v_layer._ia3_layer) - elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers - params_added += param_count(l._ia3_layer) - - print('params added/unfrozen by ia3: %d'%params_added) - - return model_adapter - -def merge_ia3(original_model, ia3_model): - # original model contains pre-trained weights - # ia3 model is the fine-tuned ia3 model - for i, layer in enumerate(original_model.layers): - # attention layers - if re.match('multihead_attention', layer.name): - # scale k - k_scaler = ia3_model.layers[i]._k_layer._ia3_layer.kernel[0] - layer._k_layer.kernel.assign(layer._k_layer.kernel * k_scaler) - # scale v - v_scaler = ia3_model.layers[i]._v_layer._ia3_layer.kernel[0] - layer._v_layer.kernel.assign(layer._v_layer.kernel * v_scaler) - # ff layers - elif re.match('dense', layer.name) and layer.input_shape[-1]==1536: - ff_scaler = tf.expand_dims(ia3_model.layers[i]._ia3_layer.kernel[0], 1) - layer.kernel.assign(layer.kernel * ff_scaler) - # other layers - else: - layer.set_weights(ia3_model.layers[i].get_weights()) - -############# -# add locon # -############# -def add_locon(input_model, strand_pair, conv_select=None, rank=4, alpha=1): - - # first add lora to attention - add_lora(input_model, report_param=False) - - # decide: - # 1. whether conv1 is trainable - # 2. 
which conv layers to add loRA - - # all conv layers - conv_layers = [] - for layer in input_model.layers: - if re.match('conv1d', layer.name): - conv_layers += [layer.name] - - if conv_select is None: - conv_select = len(conv_layers) - - if conv_select > len(conv_layers): - raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) - - locon_layers = [] - conv1_tune = False - if conv_select == len(conv_layers): - locon_layers = conv_layers[1:] - conv1_tune = True - else: - locon_layers = conv_layers[-conv_select:] - - layer_parent_dict_old = keras2dict(input_model) - - # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - - # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - - # Iterate over all layers after the input - model_outputs = [] - reverse_bool = None - for layer in input_model.layers[1:-1]: - - # get layer inputs - parent_layers = layer_parent_dict_old[layer.name] - layer_input = [layer_output_dict_new[parent] for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - # construct - if re.match('stochastic_reverse_complement', layer.name): - x, reverse_bool = layer(layer_input) - elif layer.name in locon_layers: - x = layers.Locon(layer, trainable=True, rank=rank, alpha=alpha)(layer_input) - else: - x = layer(layer_input) - - # save layers to dictionary - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) - model_adapter = tf.keras.Model(inputs=input_model.inputs, outputs=final) - - if conv1_tune: - model_adapter.get_layer(name=conv_layers[0]).trainable = True - - # expected number of trainable params added/unfrozen: - params_added = 0 - if conv1_tune: - 
params_added += param_count(model_adapter.get_layer(name=conv_layers[0])) - for l in model_adapter.layers: - if re.match('multihead_attention', l.name): - params_added += param_count(l._q_layer.down_layer) - params_added += param_count(l._q_layer.up_layer) - params_added += param_count(l._v_layer.down_layer) - params_added += param_count(l._v_layer.up_layer) - if l.name in locon_layers: - params_added += param_count(l.down_layer) - params_added += param_count(l.up_layer) - - print('params added/unfrozen by lora: %d'%params_added) - - return model_adapter - -#### functions to merge locon -def lora_increment(layer): - down_weights = layer.down_layer.kernel - up_weights = layer.up_layer.kernel - increment_weights = tf.einsum("ab,bc->ac", down_weights, up_weights) * layer.scale - return increment_weights - -def locon_increment(layer): - down_weights = layer.down_layer.kernel - up_weights = layer.up_layer.kernel[0] - increment_weights = tf.einsum("abc,cd->abd", down_weights, up_weights) * layer.scale - return increment_weights - -def merge_locon(original_model, locon_model): - # original model contains pre-trained weights - for i, layer in enumerate(original_model.layers): - - # lora layers - if re.match('multihead_attention', layer.name): - q = locon_model.layers[i]._q_layer - k = locon_model.layers[i]._k_layer - v = locon_model.layers[i]._v_layer - e = locon_model.layers[i]._embedding_layer - if isinstance(q, layers.Lora): - increment_weights = lora_increment(q) - layer._q_layer.kernel.assign_add(increment_weights) - if isinstance(v, layers.Lora): - increment_weights = lora_increment(v) - layer._v_layer.kernel.assign_add(increment_weights) - if isinstance(k, layers.Lora): - increment_weights = lora_increment(k) - layer._k_layer.kernel.assign_add(increment_weights) - if isinstance(e, layers.Lora): - increment_weights = lora_increment(e) - layer._embedding_layer.kernel.assign_add(increment_weights) - - # locon layers - elif isinstance(locon_model.layers[i], 
layers.Locon): - increment_weights = locon_increment(locon_model.layers[i]) - layer.kernel.assign_add(increment_weights) - - else: - layer.set_weights(locon_model.layers[i].get_weights()) - - -############## -# houlsby_se # -############## -def add_houlsby_se(input_model, strand_pair, houlsby_latent=8, conv_select=None, se_rank=16): - # add squeeze-excitation blocks after conv - # input_model should be properly frozen - # pre_att: add se_block to pre-attention conv1d - # all: add se_block to pre-attention conv1d and post-attention separable_conv1d - - ################## - # houlsby layers # - ################## - houlsby_layers = [] - for i in range(len(input_model.layers)-1): - layer = input_model.layers[i] - next_layer = input_model.layers[i+1] - if re.match('dropout', layer.name) and re.match('add', next_layer.name): - houlsby_layers += [next_layer.name] - - ############# - # SE layers # - ############# - conv_layers = [] - for layer in input_model.layers: - if re.match('conv1d', layer.name): - conv_layers += [layer.name] - if conv_select is None: - se_layers = conv_layers[1:] - if conv_select >= len(conv_layers): - raise ValueError("conv_select must be less than number of conv layers %d."%len(conv_layers)) - se_layers = conv_layers[-conv_select:] - - ################### - # construct model # - ################### - layer_parent_dict_old = keras2dict(input_model) - # remove switch_reverse_layer - to_fix = [i for i in layer_parent_dict_old if re.match('switch_reverse', i)] - for i in to_fix: - del layer_parent_dict_old[i] - # create new graph - layer_output_dict_new = {} # the output tensor of each layer in the new graph - layer_output_dict_new.update({input_model.layers[0].name: input_model.input}) - # Iterate over all layers after the input - model_outputs = [] - reverse_bool = None - - for layer in input_model.layers[1:-1]: - - # parent layers - parent_layers = layer_parent_dict_old[layer.name] - - # layer inputs - layer_input = [layer_output_dict_new[parent] 
for parent in parent_layers] - if len(layer_input) == 1: layer_input = layer_input[0] - - if layer.name.startswith("stochastic_reverse_complement"): - x, reverse_bool = layer(layer_input) - - # insert houlsby layer: - elif layer.name in houlsby_layers: - print('adapter added before:%s'%layer.name) - x = layers.AdapterHoulsby(latent_size=houlsby_latent)(layer_input[1]) - x = layer([layer_input[0], x]) - - # insert squeeze-excite layer: - elif layer.name in se_layers: - se_layer = layers.SqueezeExcite( - activation=None, # no activation before squeezing - additive=False, # use sigmoid multiplicative scaling - rank=se_rank, # bottleneck ratio - use_bias=False, # ignore bias - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), # near-zero weight initialization - scale_fun='tanh' - ) - x = layer(layer_input) - x = x + se_layer(x) - - else: - x = layer(layer_input) - - # save the output tensor of every layer - layer_output_dict_new.update({layer.name: x}) - - final = layers.SwitchReverse(strand_pair)([layer_output_dict_new[input_model.layers[-2].name], reverse_bool]) - model_final = tf.keras.Model(inputs=input_model.inputs, outputs=final) - - # set trainable - for l in model_final.layers[:-2]: # trunk - if re.match('layer_normalization|adapter_houlsby', l.name): - l.trainable = True - else: - l.trainable = False - - for l in model_final.layers: # set trunk - if l.name.startswith("squeeze_excite"): l.trainable = True - - # expected number of trainable params added/unfrozen: - params_added = 0 - for l in model_final.layers: - if re.match('squeeze_excite|adapter_houlsby', l.name): - params_added += param_count(l) - elif l.name.startswith("layer_normalization"): - params_added += param_count(l, type='trainable') - print('params added/unfrozen by houlsby_se: %d'%params_added) - - return model_final - diff --git a/src/baskerville/layers.py b/src/baskerville/layers.py index c8acef3..ec94dd0 100644 --- a/src/baskerville/layers.py +++ 
b/src/baskerville/layers.py @@ -23,287 +23,11 @@ for device in gpu_devices: tf.config.experimental.set_memory_growth(device, True) -##################### -# transfer learning # -##################### -class IA3(tf.keras.layers.Layer): - # https://arxiv.org/pdf/2205.05638.pdf - # ia3 module for attention layer, scale output. - - def __init__(self, - original_layer, - trainable=False, - **kwargs): - - # keep the name of this layer the same as the original dense layer. - original_layer_config = original_layer.get_config() - name = original_layer_config["name"] - kwargs.pop("name", None) - super().__init__(name=name, trainable=trainable, **kwargs) - - self.output_dim = original_layer_config["units"] - - self.original_layer = original_layer - self.original_layer.trainable = False - - # IA3 weights. Make it a dense layer to control trainable - self._ia3_layer = tf.keras.layers.Dense( - units=self.output_dim, - use_bias=False, - kernel_initializer=tf.keras.initializers.Ones(), - trainable=True, - name="ia3" - ) - - def call(self, inputs): - original_output = self.original_layer(inputs) - scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] - return original_output * scaler - - def get_config(self): - config = super().get_config().copy() - config.update( - { - "size": self.output_dim, - } - ) - return config - -class IA3_ff(tf.keras.layers.Layer): - # https://arxiv.org/pdf/2205.05638.pdf - # ia3 module for down-projection ff layer, scale input. - - def __init__(self, - original_layer, - trainable=False, - **kwargs): - - # keep the name of this layer the same as the original dense layer. - original_layer_config = original_layer.get_config() - name = original_layer_config["name"] - kwargs.pop("name", None) - super().__init__(name=name, trainable=trainable, **kwargs) - - self.input_dim = original_layer.input_shape[-1] - - self.original_layer = original_layer - self.original_layer.trainable = False - - # IA3 weights. 
Make it a dense layer to control trainable - self._ia3_layer = tf.keras.layers.Dense( - units=self.input_dim, - use_bias=False, - kernel_initializer=tf.keras.initializers.Ones(), - trainable=True, - name="ia3_ff" - ) - - def call(self, inputs): - scaler = self._ia3_layer(tf.constant([[1]], dtype='float64'))[0] - return self.original_layer(inputs * scaler) - - def get_config(self): - config = super().get_config().copy() - config.update( - { - "size": self.input_dim - } - ) - return config - -class Lora(tf.keras.layers.Layer): - # adapted from: - # https://arxiv.org/abs/2106.09685 - # https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/ - # https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py - - def __init__(self, - original_layer, - rank=8, - alpha=16, - trainable=False, - **kwargs): - - # keep the name of this layer the same as the original dense layer. - original_layer_config = original_layer.get_config() - name = original_layer_config["name"] - kwargs.pop("name", None) - super().__init__(name=name, trainable=trainable, **kwargs) - - self.output_dim = original_layer_config["units"] - - if rank > self.output_dim: - raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") - - self.rank = rank - self.alpha = alpha - self.scale = alpha / rank - self.original_layer = original_layer - self.original_layer.trainable = False - - # Note: the original paper mentions that normal distribution was - # used for initialization. However, the official LoRA implementation - # uses "Kaiming/He Initialization". 
- self.down_layer = tf.keras.layers.Dense( - units=rank, - use_bias=False, - kernel_initializer=tf.keras.initializers.HeUniform(), - #kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank), - trainable=True, - name="lora_a" - ) - - self.up_layer = tf.keras.layers.Dense( - units=self.output_dim, - use_bias=False, - kernel_initializer=tf.keras.initializers.Zeros(), - trainable=True, - name="lora_b" - ) - - def call(self, inputs): - original_output = self.original_layer(inputs) - lora_output = self.up_layer(self.down_layer(inputs)) * self.scale - return original_output + lora_output - - def get_config(self): - config = super().get_config().copy() - config.update( - { - "rank": self.rank, - "alpha": self.alpha - } - ) - return config - -class Locon(tf.keras.layers.Layer): - # LoRA for conv-layer, adapted from: - # https://arxiv.org/pdf/2309.14859#page=23.84 - # https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/modules/locon.py - # use default alpha and rank for locon - - def __init__(self, - original_layer, - rank=4, - alpha=1, - trainable=False, - **kwargs): - - # keep the name of this layer the same as the original conv layer. 
- original_layer_config = original_layer.get_config() - name = original_layer_config["name"] - kwargs.pop("name", None) - super().__init__(name=name, trainable=trainable, **kwargs) - - self.input_dim = original_layer.input_shape[-1] - self.output_dim = original_layer_config["filters"] - - if rank > self.output_dim: - raise ValueError(f"LoRA rank {rank} must be less or equal than {self.output_dim}") - - self.rank = rank - self.alpha = alpha - self.scale = alpha / rank - self.original_layer = original_layer - self.original_layer.trainable = False - - input_dim = original_layer.input_shape[-1] - output_dim = original_layer_config["filters"] - kernel_size = original_layer_config['kernel_size'][0] - stride = original_layer_config['strides'][0] - dilation_rate = original_layer_config["dilation_rate"][0] - - # Note: the original paper mentions that normal distribution was - # used for initialization. However, the official LoRA implementation - # uses "Kaiming/He Initialization". - - self.down_layer = tf.keras.layers.Conv1D( - filters=rank, - kernel_size=kernel_size, - strides=stride, - padding="same", - use_bias=False, - dilation_rate=dilation_rate, - kernel_initializer=tf.keras.initializers.HeUniform(), - name='locon_down' - ) - - self.up_layer = tf.keras.layers.Conv1D( - filters=output_dim, - kernel_size=1, - strides=stride, - padding="same", - use_bias=False, - kernel_initializer=tf.keras.initializers.Zeros(), - name='locon_up' - ) - - def call(self, inputs): - original_output = self.original_layer(inputs) - lora_output = self.up_layer(self.down_layer(inputs)) * self.scale - return original_output + lora_output - - def get_config(self): - config = super().get_config().copy() - config.update( - { - "rank": self.rank, - "alpha": self.alpha - } - ) - return config - -class AdapterHoulsby(tf.keras.layers.Layer): - # https://arxiv.org/abs/1902.00751 - # adapted from: https://github.com/jain-harshil/Adapter-BERT - - def __init__( - self, - latent_size, - 
activation=tf.keras.layers.ReLU(), - **kwargs): - super(AdapterHoulsby, self).__init__(**kwargs) - self.latent_size = latent_size - self.activation = activation - - def build(self, input_shape): - self.down_project = tf.keras.layers.Dense( - units=self.latent_size, - activation="linear", - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), - bias_initializer="zeros", - name='adapter_down' - ) - - self.up_project = tf.keras.layers.Dense( - units=input_shape[-1], - activation="linear", - kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3), - bias_initializer="zeros", - name='adapter_up' - ) - - def call(self, inputs): - projected_down = self.down_project(inputs) - activated = self.activation(projected_down) - projected_up = self.up_project(activated) - output = projected_up + inputs - return output - - def get_config(self): - config = super().get_config().copy() - config.update( - { - "latent_size": self.latent_size, - "activation": self.activation - } - ) - return config - ############################################################ # Basic ############################################################ + class Scale(tf.keras.layers.Layer): """Scale the input by a learned value. @@ -566,7 +290,7 @@ def __init__( initializer="he_normal", l2_scale=0, qkv_width=1, - seqlen_train=None + seqlen_train=None, ): """Creates a MultiheadAttention module. Original version written by Ziga Avsec. 
@@ -754,8 +478,10 @@ def call(self, inputs, training=False): q *= self._key_size**-0.5 # [B, H, T', T] - #content_logits = tf.matmul(q + self._r_w_bias, k, transpose_b=True) - content_logits = tf.matmul(q + tf.cast(self._r_w_bias, dtype=inputs.dtype), k, transpose_b=True) + # content_logits = tf.matmul(q + self._r_w_bias, k, transpose_b=True) + content_logits = tf.matmul( + q + tf.cast(self._r_w_bias, dtype=inputs.dtype), k, transpose_b=True + ) if self._num_position_features == 0: logits = content_logits @@ -791,12 +517,18 @@ def call(self, inputs, training=False): # Add shifted relative logits to content logits. if self._content_position_bias: # [B, H, T', 2T-1] - #relative_logits = tf.matmul(q + self._r_r_bias, r_k, transpose_b=True) - relative_logits = tf.matmul(q + tf.cast(self._r_r_bias, dtype=inputs.dtype), r_k, transpose_b=True) + # relative_logits = tf.matmul(q + self._r_r_bias, r_k, transpose_b=True) + relative_logits = tf.matmul( + q + tf.cast(self._r_r_bias, dtype=inputs.dtype), + r_k, + transpose_b=True, + ) else: # [1, H, 1, 2T-1] - #relative_logits = tf.matmul(self._r_r_bias, r_k, transpose_b=True) - relative_logits = tf.matmul(tf.cast(self._r_r_bias, dtype=inputs.dtype), r_k, transpose_b=True) + # relative_logits = tf.matmul(self._r_r_bias, r_k, transpose_b=True) + relative_logits = tf.matmul( + tf.cast(self._r_r_bias, dtype=inputs.dtype), r_k, transpose_b=True + ) # [1, H, T', 2T-1] relative_logits = tf.broadcast_to( relative_logits, @@ -881,15 +613,15 @@ def get_config(self): class SqueezeExcite(tf.keras.layers.Layer): def __init__( self, - activation='relu', + activation="relu", additive=False, rank=8, norm_type=None, bn_momentum=0.9, use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - scale_fun='sigmoid', + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + scale_fun="sigmoid", ): super(SqueezeExcite, self).__init__() self.activation = activation @@ -897,10 +629,10 @@ def __init__( self.norm_type = 
norm_type self.bn_momentum = bn_momentum self.rank = rank - self.kernel_initializer=kernel_initializer - self.bias_initializer=bias_initializer - self.use_bias=use_bias - self.scale_fun=scale_fun + self.kernel_initializer = kernel_initializer + self.bias_initializer = bias_initializer + self.use_bias = use_bias + self.scale_fun = scale_fun def build(self, input_shape): self.num_channels = input_shape[-1] @@ -918,9 +650,9 @@ def build(self, input_shape): ) exit(1) - if self.scale_fun=='sigmoid': + if self.scale_fun == "sigmoid": self.scale_f = tf.keras.activations.sigmoid - elif self.scale_fun=='tanh': # set to tanh for transfer + elif self.scale_fun == "tanh": # set to tanh for transfer self.scale_f = tf.keras.activations.tanh else: print( @@ -930,19 +662,20 @@ def build(self, input_shape): exit(1) self.dense1 = tf.keras.layers.Dense( - units=self.rank, + units=self.rank, activation="relu", use_bias=self.use_bias, kernel_initializer=self.kernel_initializer, bias_initializer=self.bias_initializer, ) - + self.dense2 = tf.keras.layers.Dense( - units=self.num_channels, + units=self.num_channels, use_bias=self.use_bias, kernel_initializer=self.kernel_initializer, bias_initializer=self.bias_initializer, - activation=None) + activation=None, + ) def call(self, x): # activate @@ -976,14 +709,15 @@ def get_config(self): { "activation": self.activation, "additive": self.additive, - "use_bias":self.use_bias, + "use_bias": self.use_bias, "norm_type": self.norm_type, "bn_momentum": self.bn_momentum, - "rank": self.rank + "rank": self.rank, } ) return config + class GlobalContext(tf.keras.layers.Layer): def __init__(self): super(GlobalContext, self).__init__() diff --git a/src/baskerville/metrics.py b/src/baskerville/metrics.py index b540086..72cef09 100644 --- a/src/baskerville/metrics.py +++ b/src/baskerville/metrics.py @@ -128,7 +128,7 @@ def poisson_multinomial( rescale (bool): Rescale loss after re-weighting. 
""" seq_len = y_true.shape[1] - + if weight_range < 1: raise ValueError("Poisson Multinomial weight_range must be >=1") elif weight_range == 1: @@ -147,8 +147,8 @@ def poisson_multinomial( y_pred = tf.math.multiply(y_pred, position_weights) # sum across lengths - s_true = tf.math.reduce_sum(y_true, axis=-2) # B x T - s_pred = tf.math.reduce_sum(y_pred, axis=-2) # B x T + s_true = tf.math.reduce_sum(y_true, axis=-2) # B x T + s_pred = tf.math.reduce_sum(y_pred, axis=-2) # B x T # total count poisson loss, mean across targets poisson_term = poisson(s_true, s_pred) # B x T @@ -159,7 +159,7 @@ def poisson_multinomial( y_pred += epsilon # normalize to sum to one - p_pred = y_pred / tf.expand_dims(s_pred, axis=-2) # B x L x T + p_pred = y_pred / tf.expand_dims(s_pred, axis=-2) # B x L x T # multinomial loss pl_pred = tf.math.log(p_pred) # B x L x T diff --git a/src/baskerville/scripts/hound_eval.py b/src/baskerville/scripts/hound_eval.py index a82199c..2115e17 100755 --- a/src/baskerville/scripts/hound_eval.py +++ b/src/baskerville/scripts/hound_eval.py @@ -150,8 +150,8 @@ def main(): # mixed precision # ################### if args.f16: - mixed_precision.set_global_policy('mixed_float16') # first set global policy - seqnn_model = seqnn.SeqNN(params_model) # then create model + mixed_precision.set_global_policy("mixed_float16") # first set global policy + seqnn_model = seqnn.SeqNN(params_model) # then create model seqnn_model.restore(args.model_file, args.head_i) seqnn_model.append_activation() # add additional activation to cast float16 output to float32 else: diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 2a4608f..4212a42 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -43,7 +43,7 @@ def main(): parser.add_option( "-c", dest="class_min", - default=5, + default=80, type="int", help="Minimum target class size to consider [Default: %default]", ) @@ -77,7 
+77,7 @@ def main(): ) parser.add_option( "--f16", - dest="f16", + dest="f16", default=False, action="store_true", help="use mixed precision for inference", @@ -170,9 +170,11 @@ def main(): targets_df["class"] = target_classes target_classes = sorted(set(target_classes)) else: - targets_df["class"] = targets_df['description'].str.replace(':.*','',regex=True) - target_classes = options.target_classes.split(',') - + targets_df["class"] = targets_df["description"].str.replace( + ":.*", "", regex=True + ) + target_classes = options.target_classes.split(",") + print(target_classes) ####################################################### @@ -202,10 +204,10 @@ def main(): # mixed precision # ################### if options.f16: - mixed_precision.set_global_policy('mixed_float16') # set global policy - seqnn_model = seqnn.SeqNN(params_model) # create model + mixed_precision.set_global_policy("mixed_float16") # set global policy + seqnn_model = seqnn.SeqNN(params_model) # create model seqnn_model.restore(model_file, options.head_i) - seqnn_model.append_activation() # add additional activation to cast float16 output to float32 + seqnn_model.append_activation() # add additional activation to cast float16 output to float32 else: # initialize model seqnn_model = seqnn.SeqNN(params_model) diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index cec1dcf..40bf142 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -211,4 +211,4 @@ def main(): # __main__ ################################################################################ if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 59edca4..c4d3a85 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -18,6 +18,7 @@ import os import shutil import re +import sys 
import numpy as np import pandas as pd @@ -28,7 +29,7 @@ from baskerville import seqnn from baskerville import trainer from baskerville import layers -from baskerville.helpers import transfer_helper +from baskerville.helpers import transfer """ hound_transfer.py @@ -37,6 +38,7 @@ Additional argument to allow for transfer learning from existing Hound model. """ + def main(): parser = argparse.ArgumentParser(description="Train a model.") parser.add_argument( @@ -64,7 +66,7 @@ def main(): "--log_dir", default="log_out", help="Tensorboard log directory [Default: %(default)s]", - ) + ) parser.add_argument( "--restore", default=None, @@ -76,47 +78,6 @@ def main(): default=False, help="Restore only model trunk [Default: %(default)s]", ) - parser.add_argument( - "--transfer_mode", - default="full", - help="transfer method. [full, linear, adapter]", - ) - parser.add_argument( - "--att_adapter", - default=None, - type=str, - help="attention layer module [adapterHoulsby, lora, lora_full, ia3, locon]", - ) - parser.add_argument( - "--att_latent", - type=int, - default=8, - help="attention adapter latent size.", - ) - parser.add_argument( - "--lora_alpha", - type=int, - default=16, - help="lora alpha.", - ) - parser.add_argument( - "--conv_select", - default=None, - type=int, - help="# of conv layers to insert locon/se.", - ) - parser.add_argument( - "--conv_rank", - type=int, - default=4, - help="locon/se rank.", - ) - parser.add_argument( - "--locon_alpha", - type=int, - default=1, - help="locon_alpha.", - ) parser.add_argument( "--tfr_train", default=None, @@ -142,15 +103,28 @@ def main(): if args.params_file != "%s/params.json" % args.out_dir: shutil.copy(args.params_file, "%s/params.json" % args.out_dir) - if args.transfer_mode not in ['full','linear','sparse']: - raise ValueError("transfer mode must be one of full, linear, sparse") - - # read model parameters + # model parameters with open(args.params_file) as params_open: params = json.load(params_open) + params_model = 
params["model"] + + # train parameters params_train = params["train"] + # transfer parameters + params_transfer = params["transfer"] + transfer_mode = params_transfer.get("mode", "full") + transfer_adapter = params_transfer.get("adapter", None) + transfer_latent = params_transfer.get("latent", 8) + transfer_conv_select = params_transfer.get("conv_select", 4) + transfer_conv_rank = params_transfer.get("conv_latent", 4) + transfer_lora_alpha = params_transfer.get("lora_alpha", 16) + transfer_locon_alpha = params_transfer.get("locon_alpha", 1) + + if transfer_mode not in ["full", "linear", "adapter"]: + raise ValueError("transfer mode must be one of full, linear, adapter") + # read datasets train_data = [] eval_data = [] @@ -184,79 +158,93 @@ def main(): tfr_pattern=args.tfr_eval, ) ) - + params_model["strand_pair"] = strand_pairs if args.mixed_precision: - mixed_precision.set_global_policy('mixed_float16') - + mixed_precision.set_global_policy("mixed_float16") + if params_train.get("num_gpu", 1) == 1: ######################################## # one GPU # initialize model - params_model['verbose']=False + params_model["verbose"] = False seqnn_model = seqnn.SeqNN(params_model) - + # restore if args.restore: seqnn_model.restore(args.restore, trunk=args.trunk) # head params - print('params in new head: %d' %transfer_helper.param_count(seqnn_model.model.layers[-2])) + print( + "params in new head: %d" + % transfer.param_count(seqnn_model.model.layers[-2]) + ) #################### # transfer options # #################### - if args.transfer_mode=='full': - seqnn_model.model.trainable=True - - elif args.transfer_mode=='linear': - seqnn_model.model_trunk.trainable=False + if transfer_mode == "full": + seqnn_model.model.trainable = True + + elif transfer_mode == "linear": + seqnn_model.model_trunk.trainable = False ############ # adapters # ############ - elif args.transfer_mode=='sparse': + elif transfer_mode == "adapter": # attention adapter - if args.att_adapter is not 
None: - if args.att_adapter=='adapterHoulsby': - seqnn_model.model = transfer_helper.add_houlsby(seqnn_model.model, - strand_pairs[0], - latent_size=args.att_latent) - elif args.att_adapter=='lora': - transfer_helper.add_lora(seqnn_model.model, - rank=args.att_latent, - alpha=args.lora_alpha, - mode='default') - - elif args.att_adapter=='lora_full': - transfer_helper.add_lora(seqnn_model.model, - rank=args.att_latent, - alpha=args.lora_alpha, - mode='full') - - elif args.att_adapter=='ia3': - seqnn_model.model = transfer_helper.add_ia3(seqnn_model.model, - strand_pairs[0]) - - elif args.att_adapter=='locon': # lora on conv+att - seqnn_model.model = transfer_helper.add_locon(seqnn_model.model, - strand_pairs[0], - conv_select=args.conv_select, - rank=args.conv_rank, - alpha=args.locon_alpha) - - elif args.att_adapter=='lora_conv': # lora on att, unfreeze_conv - transfer_helper.add_lora_conv(seqnn_model.model, conv_select=args.conv_select) - - elif args.att_adapter=='houlsby_se': # adapter on conv+att - seqnn_model.model = transfer_helper.add_houlsby_se(seqnn_model.model, - strand_pair=strand_pairs[0], - conv_select=args.conv_select, - se_rank=args.conv_rank) - + if transfer_adapter is not None: + if transfer_adapter == "houlsby": + seqnn_model.model = transfer.add_houlsby( + seqnn_model.model, strand_pairs[0], latent_size=transfer_latent + ) + elif transfer_adapter == "lora": + transfer.add_lora( + seqnn_model.model, + rank=transfer_latent, + alpha=transfer_lora_alpha, + mode="default", + ) + + elif transfer_adapter == "lora_full": + transfer.add_lora( + seqnn_model.model, + rank=transfer_latent, + alpha=transfer_lora_alpha, + mode="full", + ) + + elif transfer_adapter == "ia3": + seqnn_model.model = transfer.add_ia3( + seqnn_model.model, strand_pairs[0] + ) + + elif transfer_adapter == "locon": # lora on conv+att + seqnn_model.model = transfer.add_locon( + seqnn_model.model, + strand_pairs[0], + conv_select=transfer_conv_select, + rank=transfer_conv_rank, + 
alpha=transfer_locon_alpha, + ) + + elif transfer_adapter == "lora_conv": # lora on att, unfreeze_conv + transfer.add_lora_conv( + seqnn_model.model, conv_select=transfer_conv_select + ) + + elif transfer_adapter == "houlsby_se": # adapter on conv+att + seqnn_model.model = transfer.add_houlsby_se( + seqnn_model.model, + strand_pair=strand_pairs[0], + conv_select=transfer_conv_select, + se_rank=transfer_conv_rank, + ) + ################# # final summary # ################# @@ -267,13 +255,18 @@ def main(): seqnn_model.append_activation() # run with loss scaling seqnn_trainer = trainer.Trainer( - params_train, train_data, eval_data, args.out_dir, args.log_dir, loss_scale=True + params_train, + train_data, + eval_data, + args.out_dir, + args.log_dir, + loss_scale=True, ) else: seqnn_trainer = trainer.Trainer( params_train, train_data, eval_data, args.out_dir, args.log_dir ) - + # compile model seqnn_trainer.compile(seqnn_model) @@ -287,55 +280,58 @@ def main(): seqnn_trainer.fit2(seqnn_model) ############################# - # post-training adjustments # + # post-training adjustments # ############################# - if args.transfer_mode=='sparse': - - # for: adapterHoulsby and houlsby_se, overwrite json file - if args.att_adapter=='adapterHoulsby': - transfer_helper.modify_json(input_json=args.params_file, - output_json='%s/params.json'%args.out_dir, - adapter=args.att_adapter, - latent=args.att_latent) - - if args.att_adapter=='houlsby_se': - transfer_helper.modify_json(input_json=args.params_file, - output_json='%s/params.json'%args.out_dir, - adapter=args.att_adapter, - conv_select=args.conv_select, - se_rank=args.conv_rank - ) - + if transfer_mode == "adapter": + + # for: houlsby and houlsby_se, overwrite json file + if transfer_adapter == "houlsby": + transfer.modify_json( + input_json=args.params_file, + output_json="%s/params.json" % args.out_dir, + adapter=transfer_adapter, + latent=transfer_latent, + ) + + if transfer_adapter == "houlsby_se": + 
transfer.modify_json( + input_json=args.params_file, + output_json="%s/params.json" % args.out_dir, + adapter=transfer_adapter, + conv_select=transfer_conv_select, + se_rank=transfer_conv_rank, + ) + # for lora, ia3, locon, save weight to: model_best.mergeW.h5 - if args.att_adapter in ['lora', 'lora_full', 'lora_conv']: - seqnn_model.model.load_weights('%s/model_best.h5'%args.out_dir) - transfer_helper.merge_lora(seqnn_model.model) - seqnn_model.save('%s/model_best.mergeW.h5'%args.out_dir) - transfer_helper.var_reorder('%s/model_best.mergeW.h5'%args.out_dir) - - if args.att_adapter=='ia3': + if transfer_adapter in ["lora", "lora_full", "lora_conv"]: + seqnn_model.model.load_weights("%s/model_best.h5" % args.out_dir) + transfer.merge_lora(seqnn_model.model) + seqnn_model.save("%s/model_best.mergeW.h5" % args.out_dir) + transfer.var_reorder("%s/model_best.mergeW.h5" % args.out_dir) + + if transfer_adapter == "ia3": # ia3 model ia3_model = seqnn_model.model - ia3_model.load_weights('%s/model_best.h5'%args.out_dir) + ia3_model.load_weights("%s/model_best.h5" % args.out_dir) # original model seqnn_model2 = seqnn.SeqNN(params_model) seqnn_model2.restore(args.restore, trunk=args.trunk) original_model = seqnn_model2.model # merge weights into original model - transfer_helper.merge_ia3(original_model, ia3_model) - original_model.save('%s/model_best.mergeW.h5'%args.out_dir) + transfer.merge_ia3(original_model, ia3_model) + original_model.save("%s/model_best.mergeW.h5" % args.out_dir) - if args.att_adapter=='locon': + if transfer_adapter == "locon": # locon model locon_model = seqnn_model.model - locon_model.load_weights('%s/model_best.h5'%args.out_dir) + locon_model.load_weights("%s/model_best.h5" % args.out_dir) # original model seqnn_model2 = seqnn.SeqNN(params_model) seqnn_model2.restore(args.restore, trunk=args.trunk) original_model = seqnn_model2.model # merge weights into original model - transfer_helper.merge_locon(original_model, locon_model) - 
original_model.save('%s/model_best.mergeW.h5'%args.out_dir) + transfer.merge_locon(original_model, locon_model) + original_model.save("%s/model_best.mergeW.h5" % args.out_dir) else: ######################################## diff --git a/src/baskerville/seqnn.py b/src/baskerville/seqnn.py index 87aa223..35cbea2 100644 --- a/src/baskerville/seqnn.py +++ b/src/baskerville/seqnn.py @@ -25,7 +25,8 @@ from baskerville import dataset from baskerville import layers from baskerville import metrics -from baskerville.helpers import transfer_helper +from baskerville.helpers import transfer + class SeqNN: """Sequence neural network model. @@ -198,9 +199,9 @@ def build_model(self, save_reprs: bool = True): for ho in self.head_output: self.models.append(tf.keras.Model(inputs=sequence, outputs=ho)) self.model = self.models[0] - + # add adapter - if hasattr(self, 'adapter'): + if hasattr(self, "adapter"): for hi, head in enumerate(self.heads): self.models[hi] = self.insert_adapter(self.models[hi]) self.model = self.models[0] @@ -229,9 +230,11 @@ def build_embed(self, conv_layer_i: int, batch_norm: bool = True): def append_activation(self): """add additional activation to convert float16 output to float32, required for mixed precision""" model_0 = self.model - new_outputs = tf.keras.layers.Activation('linear', dtype='float32')(model_0.layers[-1].output) + new_outputs = tf.keras.layers.Activation("linear", dtype="float32")( + model_0.layers[-1].output + ) self.model = tf.keras.Model(inputs=model_0.layers[0].input, outputs=new_outputs) - + def build_ensemble(self, ensemble_rc: bool = False, ensemble_shifts=[0]): """Build ensemble of models computing on augmented input sequences.""" shift_bool = len(ensemble_shifts) > 1 or ensemble_shifts[0] != 0 @@ -1108,15 +1111,16 @@ def track_sequence(self, sequence): # method for inserting adapter for transfer learning def insert_adapter(self, model): - if self.adapter=='houlsby': - output_model = transfer_helper.add_houlsby(model, - 
self.strand_pair[0], - latent_size=self.adapter_latent) - elif self.adapter=='houlsby_se': - output_model = transfer_helper.add_houlsby_se(model, - self.strand_pair[0], - houlsby_latent=self.adapter_latent, - conv_select=self.conv_select, - se_rank=self.se_rank) + if self.adapter == "houlsby": + output_model = transfer.add_houlsby( + model, self.strand_pair[0], latent_size=self.adapter_latent + ) + elif self.adapter == "houlsby_se": + output_model = transfer.add_houlsby_se( + model, + self.strand_pair[0], + houlsby_latent=self.adapter_latent, + conv_select=self.conv_select, + se_rank=self.se_rank, + ) return output_model - diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 69aa96f..1bd26e4 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ========================================================================= -# modified fit2 to: -# show progress bar during training -# save gpu memory information - import time import pdb @@ -441,9 +437,9 @@ def eval_step1_distr(xd, yd): # training loop gpu_memory_callback = GPUMemoryUsageCallback() - file_path='%s/gpu_mem.txt' % self.out_dir - with open(file_path, 'w') as file: - file.write('epoch\tbatch\tgpu_mem(GB)\n') + file_path = "%s/gpu_mem.txt" % self.out_dir + with open(file_path, "w") as file: + file.write("epoch\tbatch\tgpu_mem(GB)\n") first_step = True # set up summary writer @@ -461,10 +457,12 @@ def eval_step1_distr(xd, yd): # get iterators train_data_iters = [iter(td.dataset) for td in self.train_data] - + # train t0 = time.time() - prog_bar = tf.keras.utils.Progbar(len(self.dataset_indexes)) # Create Keras Progbar + prog_bar = tf.keras.utils.Progbar( + len(self.dataset_indexes) + ) # Create Keras Progbar for didx, di in enumerate(self.dataset_indexes): x, y = safe_next(train_data_iters[di]) if self.strategy is None: @@ -481,12 +479,12 @@ def 
eval_step1_distr(xd, yd): print("Successful first step!", flush=True) first_step = False prog_bar.add(1) - - if (ei == epoch_start) and (didx < 1000) and (didx%100 == 1): - mem=gpu_memory_callback.on_batch_end() - file = open(file_path, 'a') - file.write("%d\t%d\t%.2f\n"%(ei, didx, mem)) - + + if (ei == epoch_start) and (didx < 1000) and (didx % 100 == 1): + mem = gpu_memory_callback.on_batch_end() + file = open(file_path, "a") + file.write("%d\t%d\t%.2f\n" % (ei, didx, mem)) + print("Epoch %d - %ds" % (ei, (time.time() - t0))) for di in range(self.num_datasets): print(" Data %d" % di, end="") @@ -567,7 +565,6 @@ def eval_step1_distr(xd, yd): valid_r[di].reset_states() valid_r2[di].reset_states() - def fit_tape(self, seqnn_model): """Train the model using a custom tf.GradientTape loop.""" if not self.compiled: @@ -586,23 +583,26 @@ def fit_tape(self, seqnn_model): if self.strategy is None: if self.loss_scale: - + @tf.function def train_step(x, y): with tf.GradientTape() as tape: pred = model(x, training=True) loss = self.loss_fn(y, pred) + sum(model.losses) - scaled_loss = self.optimizer.get_scaled_loss(loss) + scaled_loss = self.optimizer.get_scaled_loss(loss) train_loss(loss) train_r(y, pred) train_r2(y, pred) - scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables) + scaled_gradients = tape.gradient( + scaled_loss, model.trainable_variables + ) gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) self.optimizer.apply_gradients( zip(gradients, model.trainable_variables) - ) + ) + else: - + @tf.function def train_step(x, y): with tf.GradientTape() as tape: @@ -695,9 +695,9 @@ def eval_step_distr(xd, yd): # training loop gpu_memory_callback = GPUMemoryUsageCallback() - file_path='%s/gpu_mem.txt' % self.out_dir - with open(file_path, 'w') as file: - file.write('epoch\tbatch\tgpu_mem(GB)\n') + file_path = "%s/gpu_mem.txt" % self.out_dir + with open(file_path, "w") as file: + file.write("epoch\tbatch\tgpu_mem(GB)\n") for ei in 
range(epoch_start, self.train_epochs_max): if ei >= self.train_epochs_min and unimproved > self.patience: @@ -716,10 +716,10 @@ def eval_step_distr(xd, yd): print("Successful first step!", flush=True) # print gpu memory usage - if (ei == epoch_start) and (si < 1000) and (si%100 == 1): - mem=gpu_memory_callback.on_batch_end() - with open(file_path, 'a') as file: - file.write("%d\t%d\t%.2f\n"%(ei, si, mem)) + if (ei == epoch_start) and (si < 1000) and (si % 100 == 1): + mem = gpu_memory_callback.on_batch_end() + with open(file_path, "a") as file: + file.write("%d\t%d\t%.2f\n" % (ei, si, mem)) # evaluate for x, y in self.eval_data[0].dataset: @@ -875,7 +875,7 @@ def make_optimizer(self, loss_scale=False): global_clipnorm=global_clipnorm, amsgrad=False, ) # reduces performance in my experience - + elif optimizer_type in ["sgd", "momentum"]: self.optimizer = tf.keras.optimizers.SGD( learning_rate=lr_schedule, @@ -1114,19 +1114,19 @@ def CheckGradientNA(gradients): for grad in gradients: if grad is not None: if tf.reduce_any(tf.math.is_nan(grad)): - raise ValueError("NaN gradient detected.") + raise ValueError("NaN gradient detected.") + # Define a custom callback class to track GPU memory usage class GPUMemoryUsageCallback(tf.keras.callbacks.Callback): def on_train_begin(self, logs=None): # Enable memory growth to avoid GPU memory allocation issues - physical_devices = tf.config.experimental.list_physical_devices('GPU') + physical_devices = tf.config.experimental.list_physical_devices("GPU") if physical_devices: for device in physical_devices: tf.config.experimental.set_memory_growth(device, True) def on_batch_end(self, logs=None): - gpu_memory = tf.config.experimental.get_memory_info('GPU:0') - current_memory = gpu_memory['peak'] / 1e9 # Convert to GB + gpu_memory = tf.config.experimental.get_memory_info("GPU:0") + current_memory = gpu_memory["peak"] / 1e9 # Convert to GB return current_memory - diff --git a/tests/data/transfer/model_houlsby.json 
b/tests/data/transfer/model_houlsby.json new file mode 100644 index 0000000..6b0507c --- /dev/null +++ b/tests/data/transfer/model_houlsby.json @@ -0,0 +1,79 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1e-06, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1e-08, + "l2_scale": 1e-08, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + }, + "adapter": "houlsby", + "adapter_latent": 8 + } +} \ No newline at end of file diff --git a/tests/data/transfer/model_se4.json b/tests/data/transfer/model_se4.json new file mode 100644 index 0000000..a9436ae --- /dev/null +++ b/tests/data/transfer/model_se4.json @@ -0,0 +1,81 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + 
"global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1e-06, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1e-08, + "l2_scale": 1e-08, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + }, + "adapter": "houlsby_se", + "adapter_latent": 8, + "se_rank": 16, + "conv_select": 4 + } +} \ No newline at end of file diff --git a/tests/data/transfer/transfer_json/params_full.json b/tests/data/transfer/transfer_json/params_full.json new file mode 100644 index 0000000..8ba67e0 --- /dev/null +++ b/tests/data/transfer/transfer_json/params_full.json @@ -0,0 +1,80 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "full" + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + 
"activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_houlsby.json b/tests/data/transfer/transfer_json/params_houlsby.json new file mode 100644 index 0000000..b2f0885 --- /dev/null +++ b/tests/data/transfer/transfer_json/params_houlsby.json @@ -0,0 +1,82 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "adapter", + "adapter": "houlsby", + "adapter_latent": 8 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { 
+ "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_ia3.json b/tests/data/transfer/transfer_json/params_ia3.json new file mode 100644 index 0000000..24b38f6 --- /dev/null +++ b/tests/data/transfer/transfer_json/params_ia3.json @@ -0,0 +1,81 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "adapter", + "adapter": "ia3" + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 
1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_linear.json b/tests/data/transfer/transfer_json/params_linear.json new file mode 100644 index 0000000..c4128de --- /dev/null +++ b/tests/data/transfer/transfer_json/params_linear.json @@ -0,0 +1,80 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "linear" + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + 
"activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_locon4.json b/tests/data/transfer/transfer_json/params_locon4.json new file mode 100644 index 0000000..fcaa113 --- /dev/null +++ b/tests/data/transfer/transfer_json/params_locon4.json @@ -0,0 +1,83 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "adapter", + "adapter": "locon", + "adapter_latent": 8, + "conv_select": 4 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_lora.json b/tests/data/transfer/transfer_json/params_lora.json new file mode 100644 index 0000000..57e5e70 --- /dev/null +++ 
b/tests/data/transfer/transfer_json/params_lora.json @@ -0,0 +1,82 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "adapter", + "adapter": "lora", + "adapter_latent": 8 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/data/transfer/transfer_json/params_se4.json b/tests/data/transfer/transfer_json/params_se4.json new file mode 100644 index 0000000..f59d61d --- /dev/null +++ b/tests/data/transfer/transfer_json/params_se4.json @@ -0,0 +1,84 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 
0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "adapter", + "adapter": "houlsby_se", + "adapter_latent": 8, + "conv_select": 4, + "conv_latent": 16 + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/test_transfer/test/params.json b/tests/test_transfer/test/params.json new file mode 100644 index 0000000..c4128de --- /dev/null +++ b/tests/test_transfer/test/params.json @@ -0,0 +1,80 @@ +{ + "train": { + "batch_size": 2, + "shuffle_buffer": 224, + "optimizer": "adam", + "learning_rate": 0.0001, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 10, + "train_epochs_max": 50 + }, + "transfer": { + "mode": "linear" + }, + "model": { + "seq_length": 393216, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch", + "bn_momentum": 
0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 1.0e-6, + "trunk": [ + { + "name": "conv_dna", + "filters": 320, + "kernel_size": 11, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 384, + "filters_end": 768, + "divisible_by": 16, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 4, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1.0e-8, + "l2_scale": 1.0e-8, + "kernel_initializer": "he_normal", + "repeat": 8 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 3072 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} diff --git a/tests/test_transfer/test_ia3.ipynb b/tests/test_transfer/test_ia3.ipynb deleted file mode 100644 index 7ac40ba..0000000 --- a/tests/test_transfer/test_ia3.ipynb +++ /dev/null @@ -1,195 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "73892ca7-0ef3-42fa-8f58-db1476625022", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-06-17 16:32:27.975305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-06-17 16:32:40.663143: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], - "source": [ - "import re\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "from baskerville import seqnn\n", - "from baskerville import 
layers\n", - "from baskerville.helpers import transfer_helper" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "480898e4-1a79-4133-b88f-ae6e2ea4835e", - "metadata": {}, - "outputs": [], - "source": [ - "# test if trainable param match expectation\n", - "def test_add_ia3(model_final):\n", - "\n", - " # expected trainable\n", - " params_added = 0\n", - " for l in model_final.layers:\n", - " if re.match('multihead_attention', l.name): # kv layers\n", - " params_added += transfer_helper.param_count(l._k_layer._ia3_layer)\n", - " params_added += transfer_helper.param_count(l._v_layer._ia3_layer)\n", - " elif re.match('dense', l.name) and l.input_shape[-1]==1536: # ff layers\n", - " params_added += transfer_helper.param_count(l._ia3_layer)\n", - " \n", - " params_head = transfer_helper.param_count(model_final.layers[-2])\n", - " print('expect params (ia3): %d'%params_added)\n", - " print('expect params (head): %d' % params_head)\n", - " print('expect params (total): %d' % (params_head + params_added))\n", - "\n", - " # observed trainable\n", - " c1 = transfer_helper.param_count(model_final, 'trainable')\n", - " print('trainable count: %d' %c1)\n", - " \n", - " assert c1==(params_head+params_added)\n", - " print(\"assert passed. trainable params match expectation.\") \n", - "\n", - "# test at initialization, output is the same\n", - "def test_add_ia3_2(model_final):\n", - " random_input = np.random.rand(1, model_final.input_shape[-2], model_final.input_shape[-1])\n", - " output_original = seqnn_model.model(random_input).numpy()\n", - " output_ia3 = model_final(random_input).numpy()\n", - " \n", - " assert np.allclose(output_original, output_ia3)\n", - " print(\"assert passed. 
at initialization, ia3 output same as pre-train.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6bab8fb3-981a-43ef-992f-8cb54f991410", - "metadata": {}, - "outputs": [], - "source": [ - "test_data_dir = '/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer'\n", - "params_file = '%s/params.json' %test_data_dir\n", - "targets_file = '%s/targets.txt' %test_data_dir" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "95ec1814-82c3-473e-894c-b9a3d608bb9a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-06-17 16:32:52.777625: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22807 MB memory: -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:1a:00.0, compute capability: 7.5\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "params added/unfrozen by ia3: 20480\n" - ] - } - ], - "source": [ - "###################\n", - "# pre-train model #\n", - "###################\n", - "targets_df = pd.read_csv(targets_file, index_col=0, sep=\"\\t\")\n", - "with open(params_file) as params_open:\n", - " params = json.load(params_open)\n", - "params_model = params[\"model\"]\n", - "params_train = params[\"train\"]\n", - "params_model['verbose'] = False\n", - "\n", - "# set strand pairs\n", - "if \"strand_pair\" in targets_df.columns:\n", - " params_model[\"strand_pair\"] = [np.array(targets_df.strand_pair)]\n", - "\n", - "seqnn_model = seqnn.SeqNN(params_model)\n", - "strand_pair = np.array(targets_df.strand_pair)\n", - "\n", - "#############\n", - "# ia3 model #\n", - "#############\n", - "model_final = transfer_helper.add_ia3(seqnn_model.model, strand_pair)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ed3f5c61-8830-4d39-925b-200b01ad1fa5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"expect params (ia3): 20480\n", - "expect params (head): 52292\n", - "expect params (total): 72772\n", - "trainable count: 72772\n", - "assert passed. trainable params match expectation.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-06-17 16:33:00.103821: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "assert passed. at initialization, ia3 same as pre-train.\n" - ] - } - ], - "source": [ - "test_add_ia3(model_final)\n", - "test_add_ia3_2(model_final)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e500e589-ef52-4775-9ade-caca53c42035", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tests/test_transfer/test_load_houlsby.py b/tests/test_transfer/test_load_houlsby.py new file mode 100644 index 0000000..4823b6c --- /dev/null +++ b/tests/test_transfer/test_load_houlsby.py @@ -0,0 +1,25 @@ +import json +import numpy as np +import pandas as pd +from baskerville import seqnn + +model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/train/model_best.h5" +targets_file = "/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer/targets.txt" +params_file = "model_houlsby.json" + +# model params +with open(params_file) as params_open: + params = json.load(params_open) +params_model = params["model"] +params_model["verbose"] = False + +# set strand pairs +targets_df = 
pd.read_csv(targets_file, index_col=0, sep="\t") +if "strand_pair" in targets_df.columns: + params_model["strand_pair"] = [np.array(targets_df.strand_pair)] +strand_pair = np.array(targets_df.strand_pair) + +seqnn_model = seqnn.SeqNN(params_model) +seqnn_model.restore(model_file) + +print("load model success!") diff --git a/tests/test_transfer/test_load_se2.py b/tests/test_transfer/test_load_se2.py new file mode 100644 index 0000000..dbd4938 --- /dev/null +++ b/tests/test_transfer/test_load_se2.py @@ -0,0 +1,25 @@ +import json +import numpy as np +import pandas as pd +from baskerville import seqnn + +model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby_se/se2/train/f0c0/train/model_best.h5" +targets_file = "/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer/targets.txt" +params_file = "model_se2.json" + +# model params +with open(params_file) as params_open: + params = json.load(params_open) +params_model = params["model"] +params_model["verbose"] = False + +# set strand pairs +targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") +if "strand_pair" in targets_df.columns: + params_model["strand_pair"] = [np.array(targets_df.strand_pair)] +strand_pair = np.array(targets_df.strand_pair) + +seqnn_model = seqnn.SeqNN(params_model) +seqnn_model.restore(model_file) + +print("load model success!") diff --git a/tests/test_transfer/test_seqlen.py b/tests/test_transfer/test_seqlen.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_transfer/test_transfer.sh b/tests/test_transfer/test_transfer.sh new file mode 100644 index 0000000..6c54f33 --- /dev/null +++ b/tests/test_transfer/test_transfer.sh @@ -0,0 +1,12 @@ +pretrain_model='/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/westminster_no_gtex_trunk/trained_trunks/f0c0.h5' +data='/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/data0' + +# test each script +# modify hound_transfer.py to exit after compile 
+hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_full.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_linear.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_houlsby.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_lora.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_ia3.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_locon4.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_se4.json ${data} From 101346cd065631128395a178548f1d840bb512ab Mon Sep 17 00:00:00 2001 From: hy395 Date: Mon, 14 Oct 2024 16:52:34 -0700 Subject: [PATCH 17/26] add transfer tutorial --- docs/transfer/make_tfr.sh | 24 +++ .../test => docs/transfer}/params.json | 44 +++-- docs/transfer/targets.txt | 5 + docs/transfer/transfer.md | 181 ++++++++++++++++++ src/baskerville/scripts/hound_eval_spec.py | 2 +- src/baskerville/scripts/utils/bw_w5.py | 146 ++++++++++++++ .../transfer/{ => json}/model_houlsby.json | 0 tests/data/transfer/{ => json}/model_se4.json | 0 tests/data/transfer/{ => json}/params.json | 0 .../transfer/json/params_262k_adj_seqlen.json | 85 ++++++++ .../{transfer_json => json}/params_full.json | 0 .../params_houlsby.json | 0 .../{transfer_json => json}/params_ia3.json | 0 .../params_linear.json | 0 .../params_locon4.json | 0 .../{transfer_json => json}/params_lora.json | 0 .../{transfer_json => json}/params_se4.json | 0 tests/test_transfer/test_load_houlsby.py | 6 +- ...t_load_se2.py => test_load_houlsby_se4.py} | 6 +- tests/test_transfer/test_seqlen.py | 0 tests/test_transfer/test_transfer.sh | 14 +- 21 files changed, 481 insertions(+), 32 
deletions(-) create mode 100644 docs/transfer/make_tfr.sh rename {tests/test_transfer/test => docs/transfer}/params.json (64%) create mode 100644 docs/transfer/targets.txt create mode 100644 docs/transfer/transfer.md create mode 100755 src/baskerville/scripts/utils/bw_w5.py rename tests/data/transfer/{ => json}/model_houlsby.json (100%) rename tests/data/transfer/{ => json}/model_se4.json (100%) rename tests/data/transfer/{ => json}/params.json (100%) create mode 100644 tests/data/transfer/json/params_262k_adj_seqlen.json rename tests/data/transfer/{transfer_json => json}/params_full.json (100%) rename tests/data/transfer/{transfer_json => json}/params_houlsby.json (100%) rename tests/data/transfer/{transfer_json => json}/params_ia3.json (100%) rename tests/data/transfer/{transfer_json => json}/params_linear.json (100%) rename tests/data/transfer/{transfer_json => json}/params_locon4.json (100%) rename tests/data/transfer/{transfer_json => json}/params_lora.json (100%) rename tests/data/transfer/{transfer_json => json}/params_se4.json (100%) rename tests/test_transfer/{test_load_se2.py => test_load_houlsby_se4.py} (80%) delete mode 100644 tests/test_transfer/test_seqlen.py diff --git a/docs/transfer/make_tfr.sh b/docs/transfer/make_tfr.sh new file mode 100644 index 0000000..139bf8f --- /dev/null +++ b/docs/transfer/make_tfr.sh @@ -0,0 +1,24 @@ +#! /bin/bash + +conda activate baskerville + +# files +data_path='/home/yuanh/analysis/Borzoi_transfer/tutorial/data' +OUT=${data_path}/tfr +HG38=${data_path}/hg38 +CONTIGDATA==${data_path}/trainsplit +FASTA_HUMAN=$HG38/hg38.ml.fa +UMAP_HUMAN=$HG38/umap_k36_t10_l32.bed +BLACK_HUMAN=$HG38/blacklist_hg38_all.bed + +# params +LENGTH=524288 +CROP=163840 +WIDTH=32 +FOLDS=8 +DOPTS="-c $CROP -d 2 -f $FOLDS -l $LENGTH -p 32 -r 256 --umap_clip 0.5 -w $WIDTH" + +# copy sequence contigs, mappability and train/val/test split. 
+mkdir $OUT +cp ${CONTIGDATA}/* $OUT +hound_data.py --restart $DOPTS -b $BLACK_HUMAN -o $OUT $FASTA_HUMAN -u $OUT/umap_human.bed targets.txt \ No newline at end of file diff --git a/tests/test_transfer/test/params.json b/docs/transfer/params.json similarity index 64% rename from tests/test_transfer/test/params.json rename to docs/transfer/params.json index c4128de..1d4113b 100644 --- a/tests/test_transfer/test/params.json +++ b/docs/transfer/params.json @@ -1,13 +1,13 @@ { "train": { - "batch_size": 2, - "shuffle_buffer": 224, + "batch_size": 1, + "shuffle_buffer": 256, "optimizer": "adam", - "learning_rate": 0.0001, + "learning_rate": 0.00006, "loss": "poisson_mn", "total_weight": 0.2, "warmup_steps": 20000, - "global_clipnorm": 0.1, + "global_clipnorm": 0.15, "adam_beta1": 0.9, "adam_beta2": 0.999, "patience": 5, @@ -15,31 +15,34 @@ "train_epochs_max": 50 }, "transfer": { - "mode": "linear" + "mode": "adapter", + "adapter": "locon", + "adapter_latent": 8, + "conv_select": 4 }, "model": { - "seq_length": 393216, + "seq_length": 524288, "augment_rc": true, "augment_shift": 3, "activation": "gelu", - "norm_type": "batch", + "norm_type": "batch-sync", "bn_momentum": 0.9, "kernel_initializer": "lecun_normal", - "l2_scale": 1.0e-6, + "l2_scale": 2.0e-8, "trunk": [ { "name": "conv_dna", - "filters": 320, - "kernel_size": 11, + "filters": 512, + "kernel_size": 15, "norm_type": null, "activation": "linear", "pool_size": 2 }, { "name": "res_tower", - "filters_init": 384, - "filters_end": 768, - "divisible_by": 16, + "filters_init": 608, + "filters_end": 1536, + "divisible_by": 32, "kernel_size": 5, "num_convs": 1, "pool_size": 2, @@ -48,7 +51,7 @@ { "name": "transformer_tower", "key_size": 64, - "heads": 4, + "heads": 8, "num_position_features": 32, "dropout": 0.2, "mha_l2_scale": 1.0e-8, @@ -59,21 +62,26 @@ { "name": "unet_conv", "kernel_size": 3, - "upsample_conv": true + "upsample_conv": true }, { "name": "unet_conv", "kernel_size": 3, - "upsample_conv": true + 
"upsample_conv": true }, { "name": "Cropping1D", - "cropping": 3072 + "cropping": 5120 + }, + { + "name": "conv_nac", + "filters": 1920, + "dropout": 0.1 } ], "head_human": { "name": "final", - "units": 68, + "units": 4, "activation": "softplus" } } diff --git a/docs/transfer/targets.txt b/docs/transfer/targets.txt new file mode 100644 index 0000000..fb251f7 --- /dev/null +++ b/docs/transfer/targets.txt @@ -0,0 +1,5 @@ +identifier file clip clip_soft scale sum_stat strand_pair description +0 PDL_TP1_A+ /home/yuanh/analysis/Borzoi_transfer/tutorial/data/w5/PDL20_TP1_A.filter+.w5 768 384 0.3 sum_sqrt 1 RNA:PDL_TP1_A+ +1 PDL_TP1_A- /home/yuanh/analysis/Borzoi_transfer/tutorial/data/w5/PDL20_TP1_A.filter-.w5 768 384 0.3 sum_sqrt 0 RNA:PDL_TP1_A- +2 PDL_TP7_C+ /home/yuanh/analysis/Borzoi_transfer/tutorial/data/w5/PDL50_TP7_C.filter+.w5 768 384 0.3 sum_sqrt 3 RNA:PDL_TP7_C+ +3 PDL_TP7_C- /home/yuanh/analysis/Borzoi_transfer/tutorial/data/w5/PDL50_TP7_C.filter-.w5 768 384 0.3 sum_sqrt 2 RNA:PDL_TP7_C- \ No newline at end of file diff --git a/docs/transfer/transfer.md b/docs/transfer/transfer.md new file mode 100644 index 0000000..be068e8 --- /dev/null +++ b/docs/transfer/transfer.md @@ -0,0 +1,181 @@ +## Transfer Learning Tutorial + +### Required Software +- baskerville +- bamCoverage from [deepTools](https://github.com/deeptools/deepTools/tree/master) is required to make BigWig files. 
+ +### Download Tutorial Data + + +Set data_path to your preferred directory: + +```bash +data_path='/home/yuanh/analysis/Borzoi_transfer/tutorial/data' +bam_folder=${data_path}/bam +bw_folder=${data_path}/bw +w5_folder=${data_path}/w5 + +mkdir -p ${data_path} +``` + +Download Borzoi pre-trained model weights: + +```bash +gsutil cp -r gs://scbasset_tutorial_data/baskerville_transfer/pretrain_trunks/ ${data_path} +``` + +Download hg38 reference information, and train-validation-test-split information: +```bash +gsutil cp -r gs://scbasset_tutorial_data/baskerville_transfer/hg38/ ${data_path} +gsutil cp -r gs://scbasset_tutorial_data/baskerville_transfer/trainsplit/ ${data_path} +gunzip ${data_path}/hg38/hg38.ml.fa.gz +``` + +Follow `Step 1` to generate BigWig from BAM files. Or, for the purpose of this tutorial, download CPM normalized stranded BigWig files for wild-type (PDL20_TP1_A) and senescent (PDL50_TP7_C) WI38 cell RNA-seq directly, and skip `Step 1`. + +```bash +gsutil cp -r gs://scbasset_tutorial_data/baskerville_transfer/bw/ ${data_path} +``` + +### Step 1 (Optional): Convert BAM to BigWig Files + +When you start from BAM files, you can first create stranded/unstranded BigWig files depending on whether the RNA-seq protocol is stranded or not: + +```bash +for file in ${bam_folder}/*.bam +do + bam=`basename ${file}` + bamCoverage --filterRNAstrand forward --binSize 1 --normalizeUsing CPM --skipNAs -p 16 -b ${bam_folder}/${bam} -o ${bw_folder}/${bam/.bam/}+.bw + bamCoverage --filterRNAstrand reverse --binSize 1 --normalizeUsing CPM --skipNAs -p 16 -b ${bam_folder}/${bam} -o ${bw_folder}/${bam/.bam/}-.bw + echo ${bam} +done +``` +`Note`: when working with 10x scRNA data, the strands are flipped. Now `--filterRNAstrand forward` refers to the reverse strand, and `--filterRNAstrand reverse` refers to the forward strand. 
+ +Or create unstranded BigWig files: +```bash +for file in ${bam_folder}/*.bam +do + bam=`basename ${file}` + bamCoverage --binSize 1 --normalizeUsing CPM --skipNAs -p 16 -b ${bam_folder}/${bam} -o ${bw_folder}/${bam/.bam/}+.bw + echo ${bam} +done +``` + +### Step 2. Convert BigWig Files to Compressed hdf5 Format (w5) Files + +Convert BigWig files to compressed hdf5 format (.w5). + +```bash +mkdir ${w5_folder} +for file in ${bw_folder}/*.bw +do + bw=$(basename "${file}") + scripts/utils/bw_w5.py ${bw_folder}/${bw} ${w5_folder}/${bw/.bw/.w5} + echo ${bw} +done +``` + +`Note:` if your BAM/BigWig file chromosome names are 1, 2, 3, etc (instead of chr1, chr2, chr3, etc), make sure to run the bw_w5.py script with the --chr_prepend option. This will prepend 'chr' to the chromosome names before converting the files to .w5 format. + +### Step 3. Make Target File + +We have provided the target file for this tutorial example. + +Create *targets.txt*: +- (unnamed) => integer index of each track (must start from 0 when training a new model). +- 'identifier' => unique identifier of each experiment (and strand). +- 'file' => local file path to .w5 file. +- 'clip' => hard clipping threshold to be applied to each bin, after soft-clipping (default: 768). +- 'clip_soft' => soft clipping (squashing) threshold (default: 384). +- 'scale' => scale value applied to each bp-level position before clipping (see more detail below). +- 'sum_stat' => type of bin-level pooling operation (default: 'sum_sqrt', sum and square-root). +- 'strand_pair' => integer index of the other stranded track of an experiment (same index as current row if unstranded). +- 'description' => text description of experiment. + +**Note on 'scale':** A scaling factor is applied when creating the TFRecord data. Borzoi models use Poisson and multinomial losses. Input BigWig/W5 tracks are scaled so that one fragment is counted as one event, with each bp position of the fragment contributing 1/(frag length). 
As a result, the total coverage across the genome should sum to the read depth of the sample. + +- If you start with BAM files, you can make BigWig files with option `--normalizeUsing None` in `bamCoverage`. Find out the fragment length by `samtools stats x.bam|grep "average length"`. Then set the scaling factor to 1/(frag length). +- For standard BigWig tracks that are TPM normalized, it sums up to (frag length) * 1e6. When fragment length and library size are unknown for your RNA-seq data (e.g. when you only have RPM normalized BigWig data), we typically assume fragment length of 100, and library size of 33 million reads. Thus, for RPM normalized BigWig files, we set a scaling factor of 33/100 = 0.3. + + +### Step 4. Create TFRecords + +```bash +./make_tfr.sh +``` + +### Step 5. Parameter Json File + +Similar to Borzoi training, arguments for transfer learning are also specified in the params.json file. Add an additional `transfer` section in the parameter json file to allow transfer learning. For transfer learning rate, we suggest lowering the lr to 1e-5 for fine-tuning, and keeping the original lr for other methods. For batch size, we suggest a batch size of 1 to reduce GPU memory for linear probing or adapter-based methods. Here are the `transfer` arguments for different transfer methods. You can also find the params.json file for Locon4 in the `data/params.json`. 
+ +**Full fine-tuning**: +``` + "transfer": { + "mode": "full" + }, +``` + +**Linear probing**: +``` + "transfer": { + "mode": "linear" + }, +``` + +**LoRA**: +``` + "transfer": { + "mode": "adapter", + "adapter": "lora", + "adapter_latent": 8 + }, +``` + +**Locon4**: +``` + "transfer": { + "mode": "adapter", + "adapter": "locon", + "adapter_latent": 8, + "conv_select": 4 + }, +``` + +**Houlsby**: +``` + "transfer": { + "mode": "adapter", + "adapter": "houlsby", + "adapter_latent": 8 + }, +``` + +**Houlsby_se4**: +``` + "transfer": { + "mode": "adapter", + "adapter": "houlsby_se", + "adapter_latent": 8, + "conv_select": 4, + "conv_latent": 16 + }, +``` + +### Step 6. Train model + +Use westminster_train_folds.py with `--transfer` option to perform transfer learning on the dataset. + +```bash +westminster_train_folds.py -e 'tf2.12' \ + -q nvidia_geforce_rtx_4090 \ + --name "locon" \ + --rc --shifts "0,1" -o train -f 4 --step 8 --eval_train_off \ + --restore ${data_path}/pretrain_trunks \ + --trunk \ + --transfer \ + --train_f3 \ + --weight_file model_best.mergeW.h5 \ + params.json \ + ${data_path}/tfr +``` diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 4212a42..1956387 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -43,7 +43,7 @@ def main(): parser.add_option( "-c", dest="class_min", - default=80, + default=10, type="int", help="Minimum target class size to consider [Default: %default]", ) diff --git a/src/baskerville/scripts/utils/bw_w5.py b/src/baskerville/scripts/utils/bw_w5.py new file mode 100755 index 0000000..ee247aa --- /dev/null +++ b/src/baskerville/scripts/utils/bw_w5.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +from optparse import OptionParser +import sys + +import h5py +import numpy as np +import pyBigWig +import scipy.interpolate + +''' +bw_w5.py + +Convert a BigWig to wigHDF5. 
+''' + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] ' + parser = OptionParser(usage) + parser.add_option('-c', '--chr_strip', dest='chr_strip', + default=False, action='store_true') + parser.add_option('-p', '--chr_prepend', dest='chr_prepend', + default=False, action='store_true') + parser.add_option('-i', dest='interp_nan', + default=False, action='store_true', + help='Interpolate NaNs [Default: %default]') + parser.add_option('-m', dest='min_norm', + default=False, action='store_true', + help='Normalize the minimum nonzero value to 1 [Default: %default]') + parser.add_option('-s', dest='scale', + default=1.0, type='float', + help='Scale all values (e.g. to undo normalization) [Default: %default]') + parser.add_option('-v', dest='verbose', + default=False, action='store_true') + parser.add_option('-z', dest='clip_zero', + default=False, action='store_true', + help='Clip negative values at zero [Default: %default]') + (options,args) = parser.parse_args() + + if len(args) != 2: + parser.error('Must provide input BigWig and output HDF5.') + else: + bw_files = args[0] + hdf5_file = args[1] + + # open files + bw_files = bw_files.split(',') + bw_ins = [pyBigWig.open(bw_file) for bw_file in bw_files] + h5_out = h5py.File(hdf5_file, 'w') + + # process chromosomes in length order + chrom_lengths = bw_ins[0].chroms() + chroms = sorted(chrom_lengths.keys()) + length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] + length_chroms = sorted(length_chroms)[::-1] + min_factor = None + + # for each chromosome + for clength, chrom in length_chroms: + if options.verbose: + print(chrom) + + # read values + x = bw_ins[0].values(chrom, 0, clength, numpy=True) + for bw_in in bw_ins[1:]: + x += bw_in.values(chrom, 0, clength, numpy=True) + + # scale + if options.scale != 1: + x = x*options.scale + + # 
normalize min to 1 + # (a simple strategy to undo normalization) + if options.min_norm: + if min_factor is None: + min_factor = x[x>0].min() + print('Min normalization factor: %f' % min_factor, file=sys.stderr) + x /= min_factor + + # interpolate NaN + if options.interp_nan: + x = interp_nan(x) + else: + x = np.nan_to_num(x) + + # clip negative values + if options.clip_zero: + x = np.clip(x, 0, np.inf) + + # clip float16 min/max + x = np.clip(x, np.finfo(np.float16).min, np.finfo(np.float16).max) + x = x.astype('float16') + + # strip "chr" + if options.chr_strip: + chrom = chrom.replace('chr','') + + # prepend "chr" + if options.chr_prepend: + chrom = 'chr' + chrom + + # write gzipped into HDF5 + h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True) + + # close files + h5_out.close() + for bw_in in bw_ins: + bw_in.close() + + +def interp_nan(x, kind='linear'): + '''Linearly interpolate to fill NaN.''' + + # pad zeroes + xp = np.zeros(len(x)+2) + xp[1:-1] = x + + # find NaN + x_nan = np.isnan(xp) + + if np.sum(x_nan) == 0: + # unnecessary + return x + + else: + # interpolate + inds = np.arange(len(xp)) + interpolator = scipy.interpolate.interp1d( + inds[~x_nan], + xp[~x_nan], + kind=kind, + bounds_error=False) + + loc = np.where(x_nan) + xp[loc] = interpolator(loc) + + # slice off pad + return xp[1:-1] + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/tests/data/transfer/model_houlsby.json b/tests/data/transfer/json/model_houlsby.json similarity index 100% rename from tests/data/transfer/model_houlsby.json rename to tests/data/transfer/json/model_houlsby.json diff --git a/tests/data/transfer/model_se4.json b/tests/data/transfer/json/model_se4.json similarity index 100% rename from tests/data/transfer/model_se4.json rename to 
tests/data/transfer/json/model_se4.json diff --git a/tests/data/transfer/params.json b/tests/data/transfer/json/params.json similarity index 100% rename from tests/data/transfer/params.json rename to tests/data/transfer/json/params.json diff --git a/tests/data/transfer/json/params_262k_adj_seqlen.json b/tests/data/transfer/json/params_262k_adj_seqlen.json new file mode 100644 index 0000000..4406a71 --- /dev/null +++ b/tests/data/transfer/json/params_262k_adj_seqlen.json @@ -0,0 +1,85 @@ +{ + "train": { + "batch_size": 1, + "shuffle_buffer": 256, + "optimizer": "adam", + "learning_rate": 6e-05, + "loss": "poisson_mn", + "total_weight": 0.2, + "warmup_steps": 20000, + "global_clipnorm": 0.15, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "patience": 5, + "train_epochs_min": 5, + "train_epochs_max": 50 + }, + "model": { + "seq_length": 262144, + "augment_rc": true, + "augment_shift": 3, + "activation": "gelu", + "norm_type": "batch-sync", + "bn_momentum": 0.9, + "kernel_initializer": "lecun_normal", + "l2_scale": 2e-08, + "trunk": [ + { + "name": "conv_dna", + "filters": 512, + "kernel_size": 15, + "norm_type": null, + "activation": "linear", + "pool_size": 2 + }, + { + "name": "res_tower", + "filters_init": 608, + "filters_end": 1536, + "divisible_by": 32, + "kernel_size": 5, + "num_convs": 1, + "pool_size": 2, + "repeat": 6 + }, + { + "name": "transformer_tower", + "key_size": 64, + "heads": 8, + "num_position_features": 32, + "dropout": 0.2, + "mha_l2_scale": 1e-08, + "l2_scale": 1e-08, + "kernel_initializer": "he_normal", + "repeat": 8, + "adapter": "houlsby", + "latent": 8, + "seqlen_train": 4096 + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "unet_conv", + "kernel_size": 3, + "upsample_conv": true + }, + { + "name": "Cropping1D", + "cropping": 2560 + }, + { + "name": "conv_nac", + "filters": 1920, + "dropout": 0.1 + } + ], + "head_human": { + "name": "final", + "units": 68, + "activation": "softplus" + } + } +} \ 
No newline at end of file diff --git a/tests/data/transfer/transfer_json/params_full.json b/tests/data/transfer/json/params_full.json similarity index 100% rename from tests/data/transfer/transfer_json/params_full.json rename to tests/data/transfer/json/params_full.json diff --git a/tests/data/transfer/transfer_json/params_houlsby.json b/tests/data/transfer/json/params_houlsby.json similarity index 100% rename from tests/data/transfer/transfer_json/params_houlsby.json rename to tests/data/transfer/json/params_houlsby.json diff --git a/tests/data/transfer/transfer_json/params_ia3.json b/tests/data/transfer/json/params_ia3.json similarity index 100% rename from tests/data/transfer/transfer_json/params_ia3.json rename to tests/data/transfer/json/params_ia3.json diff --git a/tests/data/transfer/transfer_json/params_linear.json b/tests/data/transfer/json/params_linear.json similarity index 100% rename from tests/data/transfer/transfer_json/params_linear.json rename to tests/data/transfer/json/params_linear.json diff --git a/tests/data/transfer/transfer_json/params_locon4.json b/tests/data/transfer/json/params_locon4.json similarity index 100% rename from tests/data/transfer/transfer_json/params_locon4.json rename to tests/data/transfer/json/params_locon4.json diff --git a/tests/data/transfer/transfer_json/params_lora.json b/tests/data/transfer/json/params_lora.json similarity index 100% rename from tests/data/transfer/transfer_json/params_lora.json rename to tests/data/transfer/json/params_lora.json diff --git a/tests/data/transfer/transfer_json/params_se4.json b/tests/data/transfer/json/params_se4.json similarity index 100% rename from tests/data/transfer/transfer_json/params_se4.json rename to tests/data/transfer/json/params_se4.json diff --git a/tests/test_transfer/test_load_houlsby.py b/tests/test_transfer/test_load_houlsby.py index 4823b6c..3f25da0 100644 --- a/tests/test_transfer/test_load_houlsby.py +++ b/tests/test_transfer/test_load_houlsby.py @@ -4,8 +4,8 @@ 
from baskerville import seqnn model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/train/model_best.h5" -targets_file = "/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer/targets.txt" -params_file = "model_houlsby.json" +targets_file = "tests/data/transfer/targets.txt" +params_file = "tests/data/transfer/json/model_houlsby.json" # model params with open(params_file) as params_open: @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") +print("load model success!") \ No newline at end of file diff --git a/tests/test_transfer/test_load_se2.py b/tests/test_transfer/test_load_houlsby_se4.py similarity index 80% rename from tests/test_transfer/test_load_se2.py rename to tests/test_transfer/test_load_houlsby_se4.py index dbd4938..97aadd1 100644 --- a/tests/test_transfer/test_load_se2.py +++ b/tests/test_transfer/test_load_houlsby_se4.py @@ -4,8 +4,8 @@ from baskerville import seqnn model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby_se/se2/train/f0c0/train/model_best.h5" -targets_file = "/home/yuanh/programs/source/python_packages/baskerville/tests/data/transfer/targets.txt" -params_file = "model_se2.json" +targets_file = "tests/data/transfer/targets.txt" +params_file = "tests/data/transfer/json/model_se4.json" # model params with open(params_file) as params_open: @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") +print("load model success!") \ No newline at end of file diff --git a/tests/test_transfer/test_seqlen.py b/tests/test_transfer/test_seqlen.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_transfer/test_transfer.sh b/tests/test_transfer/test_transfer.sh index 6c54f33..e2e98ec 100644 --- a/tests/test_transfer/test_transfer.sh +++ b/tests/test_transfer/test_transfer.sh @@ -3,10 +3,10 @@ 
data='/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_ # test each script # modify hound_transfer.py to exit after compile -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_full.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_linear.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_houlsby.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_lora.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_ia3.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_locon4.json ${data} -hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/transfer_json/params_se4.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_full.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_linear.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_houlsby.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_lora.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_ia3.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_locon4.json ${data} +hound_transfer.py -o test --restore ${pretrain_model} --trunk ../data/transfer/json/params_se4.json ${data} From 1963ce31e438876ea9b72d267e0fbdf5586ca7d2 Mon Sep 17 00:00:00 2001 From: hy395 Date: Mon, 14 Oct 2024 16:53:47 -0700 Subject: [PATCH 18/26] add transfer tutorial --- src/baskerville/scripts/hound_eval_spec.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 1956387..4212a42 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -43,7 +43,7 @@ def main(): parser.add_option( "-c", dest="class_min", - default=10, + default=80, type="int", help="Minimum target class size to consider [Default: %default]", ) From 9559c7627841ee9498e6f47d0491fbcdded947cc Mon Sep 17 00:00:00 2001 From: hy395 Date: Fri, 18 Oct 2024 11:25:30 -0700 Subject: [PATCH 19/26] move transfer.py out of helper --- src/baskerville/scripts/hound_eval_spec.py | 2 +- src/baskerville/scripts/hound_transfer.py | 2 +- src/baskerville/seqnn.py | 2 +- src/baskerville/{helpers => }/transfer.py | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename src/baskerville/{helpers => }/transfer.py (100%) diff --git a/src/baskerville/scripts/hound_eval_spec.py b/src/baskerville/scripts/hound_eval_spec.py index 4212a42..1956387 100755 --- a/src/baskerville/scripts/hound_eval_spec.py +++ b/src/baskerville/scripts/hound_eval_spec.py @@ -43,7 +43,7 @@ def main(): parser.add_option( "-c", dest="class_min", - default=80, + default=10, type="int", help="Minimum target class size to consider [Default: %default]", ) diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index c4d3a85..88ca008 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -29,7 +29,7 @@ from baskerville import seqnn from baskerville import trainer from baskerville import layers -from baskerville.helpers import transfer +from baskerville import transfer """ hound_transfer.py diff --git a/src/baskerville/seqnn.py b/src/baskerville/seqnn.py index 35cbea2..8887f50 100644 --- a/src/baskerville/seqnn.py +++ b/src/baskerville/seqnn.py @@ -25,7 +25,7 @@ from baskerville import dataset from baskerville import layers from baskerville 
import metrics -from baskerville.helpers import transfer +from baskerville import transfer class SeqNN: diff --git a/src/baskerville/helpers/transfer.py b/src/baskerville/transfer.py similarity index 100% rename from src/baskerville/helpers/transfer.py rename to src/baskerville/transfer.py From 902ad299e5c5aef9973b4b9fc8f7fb7fb8e24307 Mon Sep 17 00:00:00 2001 From: David Kelley Date: Sun, 27 Oct 2024 11:51:06 -0700 Subject: [PATCH 20/26] black format --- src/baskerville/scripts/utils/bw_w5.py | 103 +++++++++++-------- tests/test_transfer/test_load_houlsby.py | 2 +- tests/test_transfer/test_load_houlsby_se4.py | 2 +- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/src/baskerville/scripts/utils/bw_w5.py b/src/baskerville/scripts/utils/bw_w5.py index ee247aa..6d52a77 100755 --- a/src/baskerville/scripts/utils/bw_w5.py +++ b/src/baskerville/scripts/utils/bw_w5.py @@ -7,53 +7,71 @@ import pyBigWig import scipy.interpolate -''' +""" bw_w5.py Convert a BigWig to wigHDF5. -''' +""" + ################################################################################ # main ################################################################################ def main(): - usage = 'usage: %prog [options] ' + usage = "usage: %prog [options] " parser = OptionParser(usage) - parser.add_option('-c', '--chr_strip', dest='chr_strip', - default=False, action='store_true') - parser.add_option('-p', '--chr_prepend', dest='chr_prepend', - default=False, action='store_true') - parser.add_option('-i', dest='interp_nan', - default=False, action='store_true', - help='Interpolate NaNs [Default: %default]') - parser.add_option('-m', dest='min_norm', - default=False, action='store_true', - help='Normalize the minimum nonzero value to 1 [Default: %default]') - parser.add_option('-s', dest='scale', - default=1.0, type='float', - help='Scale all values (e.g. 
to undo normalization) [Default: %default]') - parser.add_option('-v', dest='verbose', - default=False, action='store_true') - parser.add_option('-z', dest='clip_zero', - default=False, action='store_true', - help='Clip negative values at zero [Default: %default]') - (options,args) = parser.parse_args() + parser.add_option( + "-c", "--chr_strip", dest="chr_strip", default=False, action="store_true" + ) + parser.add_option( + "-p", "--chr_prepend", dest="chr_prepend", default=False, action="store_true" + ) + parser.add_option( + "-i", + dest="interp_nan", + default=False, + action="store_true", + help="Interpolate NaNs [Default: %default]", + ) + parser.add_option( + "-m", + dest="min_norm", + default=False, + action="store_true", + help="Normalize the minimum nonzero value to 1 [Default: %default]", + ) + parser.add_option( + "-s", + dest="scale", + default=1.0, + type="float", + help="Scale all values (e.g. to undo normalization) [Default: %default]", + ) + parser.add_option("-v", dest="verbose", default=False, action="store_true") + parser.add_option( + "-z", + dest="clip_zero", + default=False, + action="store_true", + help="Clip negative values at zero [Default: %default]", + ) + (options, args) = parser.parse_args() if len(args) != 2: - parser.error('Must provide input BigWig and output HDF5.') + parser.error("Must provide input BigWig and output HDF5.") else: bw_files = args[0] hdf5_file = args[1] # open files - bw_files = bw_files.split(',') + bw_files = bw_files.split(",") bw_ins = [pyBigWig.open(bw_file) for bw_file in bw_files] - h5_out = h5py.File(hdf5_file, 'w') + h5_out = h5py.File(hdf5_file, "w") # process chromosomes in length order chrom_lengths = bw_ins[0].chroms() chroms = sorted(chrom_lengths.keys()) - length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] + length_chroms = [(chrom_lengths[chrm], chrm) for chrm in chroms] length_chroms = sorted(length_chroms)[::-1] min_factor = None @@ -69,14 +87,14 @@ def main(): # scale if 
options.scale != 1: - x = x*options.scale + x = x * options.scale # normalize min to 1 # (a simple strategy to undo normalization) if options.min_norm: if min_factor is None: - min_factor = x[x>0].min() - print('Min normalization factor: %f' % min_factor, file=sys.stderr) + min_factor = x[x > 0].min() + print("Min normalization factor: %f" % min_factor, file=sys.stderr) x /= min_factor # interpolate NaN @@ -91,18 +109,20 @@ def main(): # clip float16 min/max x = np.clip(x, np.finfo(np.float16).min, np.finfo(np.float16).max) - x = x.astype('float16') + x = x.astype("float16") # strip "chr" if options.chr_strip: - chrom = chrom.replace('chr','') + chrom = chrom.replace("chr", "") # prepend "chr" if options.chr_prepend: - chrom = 'chr' + chrom - + chrom = "chr" + chrom + # write gzipped into HDF5 - h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True) + h5_out.create_dataset( + chrom, data=x, dtype="float16", compression="gzip", shuffle=True + ) # close files h5_out.close() @@ -110,11 +130,11 @@ def main(): bw_in.close() -def interp_nan(x, kind='linear'): - '''Linearly interpolate to fill NaN.''' +def interp_nan(x, kind="linear"): + """Linearly interpolate to fill NaN.""" # pad zeroes - xp = np.zeros(len(x)+2) + xp = np.zeros(len(x) + 2) xp[1:-1] = x # find NaN @@ -128,10 +148,8 @@ def interp_nan(x, kind='linear'): # interpolate inds = np.arange(len(xp)) interpolator = scipy.interpolate.interp1d( - inds[~x_nan], - xp[~x_nan], - kind=kind, - bounds_error=False) + inds[~x_nan], xp[~x_nan], kind=kind, bounds_error=False + ) loc = np.where(x_nan) xp[loc] = interpolator(loc) @@ -139,8 +157,9 @@ def interp_nan(x, kind='linear'): # slice off pad return xp[1:-1] + ################################################################################ # __main__ ################################################################################ -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git 
a/tests/test_transfer/test_load_houlsby.py b/tests/test_transfer/test_load_houlsby.py index 3f25da0..fa55252 100644 --- a/tests/test_transfer/test_load_houlsby.py +++ b/tests/test_transfer/test_load_houlsby.py @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") \ No newline at end of file +print("load model success!") diff --git a/tests/test_transfer/test_load_houlsby_se4.py b/tests/test_transfer/test_load_houlsby_se4.py index 97aadd1..5ceb318 100644 --- a/tests/test_transfer/test_load_houlsby_se4.py +++ b/tests/test_transfer/test_load_houlsby_se4.py @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") \ No newline at end of file +print("load model success!") From ebbb7894540ff2e75e1839cd18b7e66c74ba3d49 Mon Sep 17 00:00:00 2001 From: David Kelley Date: Sun, 27 Oct 2024 12:54:17 -0700 Subject: [PATCH 21/26] setting aside nfs-dependent tests --- .gitignore | 3 ++ tests/conftest.py | 19 +++++++ .../test_transfer.sh | 0 tests/integration/test_transfer_load.py | 49 +++++++++++++++++++ tests/test_transfer/test_load_houlsby.py | 25 ---------- tests/test_transfer/test_load_houlsby_se4.py | 25 ---------- tests/{ => unit}/test_dna.py | 0 tests/{ => unit}/test_eval.py | 0 tests/{ => unit}/test_ism.py | 0 tests/{ => unit}/test_snp.py | 0 tests/{ => unit}/test_train.py | 0 11 files changed, 71 insertions(+), 50 deletions(-) create mode 100644 tests/conftest.py rename tests/{test_transfer => integration}/test_transfer.sh (100%) create mode 100644 tests/integration/test_transfer_load.py delete mode 100644 tests/test_transfer/test_load_houlsby.py delete mode 100644 tests/test_transfer/test_load_houlsby_se4.py rename tests/{ => unit}/test_dna.py (100%) rename tests/{ => unit}/test_eval.py (100%) rename tests/{ => unit}/test_ism.py (100%) rename tests/{ => unit}/test_snp.py (100%) rename tests/{ => unit}/test_train.py (100%) diff --git a/.gitignore 
b/.gitignore index 3d0903b..6fd128e 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,6 @@ dmypy.json src/baskerville/scripts/borzoi_test_genes.py src/baskerville/pygene.py src/baskerville/snps_old.py + +# backup +**/*.py~ \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..2cccbf2 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,19 @@ +import pytest +from pathlib import Path + +def pytest_collection_modifyitems(config, items): + if not config.getoption("--integration"): + skip_integration = pytest.mark.skip(reason="Integration test - use --integration to run") + for item in items: + test_path = Path(item.fspath) + if "integration" in test_path.parts: + item.add_marker(skip_integration) + +def pytest_addoption(parser): + print("Adding custom option") # Debug print + parser.addoption( + "--integration", + action="store_true", + default=False, + help="run integration tests" + ) \ No newline at end of file diff --git a/tests/test_transfer/test_transfer.sh b/tests/integration/test_transfer.sh similarity index 100% rename from tests/test_transfer/test_transfer.sh rename to tests/integration/test_transfer.sh diff --git a/tests/integration/test_transfer_load.py b/tests/integration/test_transfer_load.py new file mode 100644 index 0000000..6a2f0a6 --- /dev/null +++ b/tests/integration/test_transfer_load.py @@ -0,0 +1,49 @@ +import json +import numpy as np +import pandas as pd +from baskerville import seqnn + +def test_housby(): + model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/train/model_best.h5" + targets_file = "tests/data/transfer/targets.txt" + params_file = "tests/data/transfer/json/model_houlsby.json" + + # model params + with open(params_file) as params_open: + params = json.load(params_open) + params_model = params["model"] + params_model["verbose"] = False + + # set strand pairs + targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") + 
if "strand_pair" in targets_df.columns: + params_model["strand_pair"] = [np.array(targets_df.strand_pair)] + strand_pair = np.array(targets_df.strand_pair) + + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file) + + print("load model success!") + + +def test_se4(): + model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby_se/se2/train/f0c0/train/model_best.h5" + targets_file = "tests/data/transfer/targets.txt" + params_file = "tests/data/transfer/json/model_se4.json" + + # model params + with open(params_file) as params_open: + params = json.load(params_open) + params_model = params["model"] + params_model["verbose"] = False + + # set strand pairs + targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") + if "strand_pair" in targets_df.columns: + params_model["strand_pair"] = [np.array(targets_df.strand_pair)] + strand_pair = np.array(targets_df.strand_pair) + + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file) + + print("load model success!") diff --git a/tests/test_transfer/test_load_houlsby.py b/tests/test_transfer/test_load_houlsby.py deleted file mode 100644 index fa55252..0000000 --- a/tests/test_transfer/test_load_houlsby.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import numpy as np -import pandas as pd -from baskerville import seqnn - -model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/train/model_best.h5" -targets_file = "tests/data/transfer/targets.txt" -params_file = "tests/data/transfer/json/model_houlsby.json" - -# model params -with open(params_file) as params_open: - params = json.load(params_open) -params_model = params["model"] -params_model["verbose"] = False - -# set strand pairs -targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") -if "strand_pair" in targets_df.columns: - params_model["strand_pair"] = [np.array(targets_df.strand_pair)] -strand_pair = np.array(targets_df.strand_pair) - -seqnn_model = 
seqnn.SeqNN(params_model) -seqnn_model.restore(model_file) - -print("load model success!") diff --git a/tests/test_transfer/test_load_houlsby_se4.py b/tests/test_transfer/test_load_houlsby_se4.py deleted file mode 100644 index 5ceb318..0000000 --- a/tests/test_transfer/test_load_houlsby_se4.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import numpy as np -import pandas as pd -from baskerville import seqnn - -model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby_se/se2/train/f0c0/train/model_best.h5" -targets_file = "tests/data/transfer/targets.txt" -params_file = "tests/data/transfer/json/model_se4.json" - -# model params -with open(params_file) as params_open: - params = json.load(params_open) -params_model = params["model"] -params_model["verbose"] = False - -# set strand pairs -targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") -if "strand_pair" in targets_df.columns: - params_model["strand_pair"] = [np.array(targets_df.strand_pair)] -strand_pair = np.array(targets_df.strand_pair) - -seqnn_model = seqnn.SeqNN(params_model) -seqnn_model.restore(model_file) - -print("load model success!") diff --git a/tests/test_dna.py b/tests/unit/test_dna.py similarity index 100% rename from tests/test_dna.py rename to tests/unit/test_dna.py diff --git a/tests/test_eval.py b/tests/unit/test_eval.py similarity index 100% rename from tests/test_eval.py rename to tests/unit/test_eval.py diff --git a/tests/test_ism.py b/tests/unit/test_ism.py similarity index 100% rename from tests/test_ism.py rename to tests/unit/test_ism.py diff --git a/tests/test_snp.py b/tests/unit/test_snp.py similarity index 100% rename from tests/test_snp.py rename to tests/unit/test_snp.py diff --git a/tests/test_train.py b/tests/unit/test_train.py similarity index 100% rename from tests/test_train.py rename to tests/unit/test_train.py From 6672d05ea0efda027d243ff51c0f2a40e7ea99c9 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 5 Nov 2024 13:50:57 -0800 Subject: [PATCH 
22/26] make gpumemorycallback cpu compatible --- src/baskerville/scripts/utils/bw_w5.py | 103 +++++++++++-------- src/baskerville/trainer.py | 19 ++-- tests/test_transfer/test_load_houlsby.py | 2 +- tests/test_transfer/test_load_houlsby_se4.py | 2 +- 4 files changed, 74 insertions(+), 52 deletions(-) diff --git a/src/baskerville/scripts/utils/bw_w5.py b/src/baskerville/scripts/utils/bw_w5.py index ee247aa..6d52a77 100755 --- a/src/baskerville/scripts/utils/bw_w5.py +++ b/src/baskerville/scripts/utils/bw_w5.py @@ -7,53 +7,71 @@ import pyBigWig import scipy.interpolate -''' +""" bw_w5.py Convert a BigWig to wigHDF5. -''' +""" + ################################################################################ # main ################################################################################ def main(): - usage = 'usage: %prog [options] ' + usage = "usage: %prog [options] " parser = OptionParser(usage) - parser.add_option('-c', '--chr_strip', dest='chr_strip', - default=False, action='store_true') - parser.add_option('-p', '--chr_prepend', dest='chr_prepend', - default=False, action='store_true') - parser.add_option('-i', dest='interp_nan', - default=False, action='store_true', - help='Interpolate NaNs [Default: %default]') - parser.add_option('-m', dest='min_norm', - default=False, action='store_true', - help='Normalize the minimum nonzero value to 1 [Default: %default]') - parser.add_option('-s', dest='scale', - default=1.0, type='float', - help='Scale all values (e.g. 
to undo normalization) [Default: %default]') - parser.add_option('-v', dest='verbose', - default=False, action='store_true') - parser.add_option('-z', dest='clip_zero', - default=False, action='store_true', - help='Clip negative values at zero [Default: %default]') - (options,args) = parser.parse_args() + parser.add_option( + "-c", "--chr_strip", dest="chr_strip", default=False, action="store_true" + ) + parser.add_option( + "-p", "--chr_prepend", dest="chr_prepend", default=False, action="store_true" + ) + parser.add_option( + "-i", + dest="interp_nan", + default=False, + action="store_true", + help="Interpolate NaNs [Default: %default]", + ) + parser.add_option( + "-m", + dest="min_norm", + default=False, + action="store_true", + help="Normalize the minimum nonzero value to 1 [Default: %default]", + ) + parser.add_option( + "-s", + dest="scale", + default=1.0, + type="float", + help="Scale all values (e.g. to undo normalization) [Default: %default]", + ) + parser.add_option("-v", dest="verbose", default=False, action="store_true") + parser.add_option( + "-z", + dest="clip_zero", + default=False, + action="store_true", + help="Clip negative values at zero [Default: %default]", + ) + (options, args) = parser.parse_args() if len(args) != 2: - parser.error('Must provide input BigWig and output HDF5.') + parser.error("Must provide input BigWig and output HDF5.") else: bw_files = args[0] hdf5_file = args[1] # open files - bw_files = bw_files.split(',') + bw_files = bw_files.split(",") bw_ins = [pyBigWig.open(bw_file) for bw_file in bw_files] - h5_out = h5py.File(hdf5_file, 'w') + h5_out = h5py.File(hdf5_file, "w") # process chromosomes in length order chrom_lengths = bw_ins[0].chroms() chroms = sorted(chrom_lengths.keys()) - length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] + length_chroms = [(chrom_lengths[chrm], chrm) for chrm in chroms] length_chroms = sorted(length_chroms)[::-1] min_factor = None @@ -69,14 +87,14 @@ def main(): # scale if 
options.scale != 1: - x = x*options.scale + x = x * options.scale # normalize min to 1 # (a simple strategy to undo normalization) if options.min_norm: if min_factor is None: - min_factor = x[x>0].min() - print('Min normalization factor: %f' % min_factor, file=sys.stderr) + min_factor = x[x > 0].min() + print("Min normalization factor: %f" % min_factor, file=sys.stderr) x /= min_factor # interpolate NaN @@ -91,18 +109,20 @@ def main(): # clip float16 min/max x = np.clip(x, np.finfo(np.float16).min, np.finfo(np.float16).max) - x = x.astype('float16') + x = x.astype("float16") # strip "chr" if options.chr_strip: - chrom = chrom.replace('chr','') + chrom = chrom.replace("chr", "") # prepend "chr" if options.chr_prepend: - chrom = 'chr' + chrom - + chrom = "chr" + chrom + # write gzipped into HDF5 - h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True) + h5_out.create_dataset( + chrom, data=x, dtype="float16", compression="gzip", shuffle=True + ) # close files h5_out.close() @@ -110,11 +130,11 @@ def main(): bw_in.close() -def interp_nan(x, kind='linear'): - '''Linearly interpolate to fill NaN.''' +def interp_nan(x, kind="linear"): + """Linearly interpolate to fill NaN.""" # pad zeroes - xp = np.zeros(len(x)+2) + xp = np.zeros(len(x) + 2) xp[1:-1] = x # find NaN @@ -128,10 +148,8 @@ def interp_nan(x, kind='linear'): # interpolate inds = np.arange(len(xp)) interpolator = scipy.interpolate.interp1d( - inds[~x_nan], - xp[~x_nan], - kind=kind, - bounds_error=False) + inds[~x_nan], xp[~x_nan], kind=kind, bounds_error=False + ) loc = np.where(x_nan) xp[loc] = interpolator(loc) @@ -139,8 +157,9 @@ def interp_nan(x, kind='linear'): # slice off pad return xp[1:-1] + ################################################################################ # __main__ ################################################################################ -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/baskerville/trainer.py 
b/src/baskerville/trainer.py index 1bd26e4..50c6d32 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -1117,16 +1117,19 @@ def CheckGradientNA(gradients): raise ValueError("NaN gradient detected.") -# Define a custom callback class to track GPU memory usage class GPUMemoryUsageCallback(tf.keras.callbacks.Callback): + def __init__(self): + super().__init__() + self.gpu_available = tf.config.experimental.list_physical_devices("GPU") + def on_train_begin(self, logs=None): - # Enable memory growth to avoid GPU memory allocation issues - physical_devices = tf.config.experimental.list_physical_devices("GPU") - if physical_devices: - for device in physical_devices: + if self.gpu_available: + for device in self.gpu_available: tf.config.experimental.set_memory_growth(device, True) def on_batch_end(self, logs=None): - gpu_memory = tf.config.experimental.get_memory_info("GPU:0") - current_memory = gpu_memory["peak"] / 1e9 # Convert to GB - return current_memory + if self.gpu_available: + gpu_memory = tf.config.experimental.get_memory_info("GPU:0") + current_memory = gpu_memory["peak"] / 1e9 # Convert to GB + return current_memory + return 0 # No GPU, return 0 diff --git a/tests/test_transfer/test_load_houlsby.py b/tests/test_transfer/test_load_houlsby.py index 3f25da0..fa55252 100644 --- a/tests/test_transfer/test_load_houlsby.py +++ b/tests/test_transfer/test_load_houlsby.py @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") \ No newline at end of file +print("load model success!") diff --git a/tests/test_transfer/test_load_houlsby_se4.py b/tests/test_transfer/test_load_houlsby_se4.py index 97aadd1..5ceb318 100644 --- a/tests/test_transfer/test_load_houlsby_se4.py +++ b/tests/test_transfer/test_load_houlsby_se4.py @@ -22,4 +22,4 @@ seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file) -print("load model success!") \ No newline at end of file +print("load model 
success!") From 527ba32d4f0f5e90bc5e020c9983d7378cece46f Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 5 Nov 2024 13:56:27 -0800 Subject: [PATCH 23/26] Untrack tests/test_transfer --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6fd128e..5844da9 100644 --- a/.gitignore +++ b/.gitignore @@ -135,6 +135,8 @@ dmypy.json src/baskerville/scripts/borzoi_test_genes.py src/baskerville/pygene.py src/baskerville/snps_old.py +tests/test_transfer/ + # backup -**/*.py~ \ No newline at end of file +**/*.py~ From 44cf223a11bbe8109abae48188d7dc92d700c753 Mon Sep 17 00:00:00 2001 From: hy395 Date: Tue, 5 Nov 2024 13:58:26 -0800 Subject: [PATCH 24/26] black --- tests/conftest.py | 12 ++++++++---- tests/integration/test_transfer_load.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2cccbf2..fefa87e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,23 @@ import pytest from pathlib import Path -def pytest_collection_modifyitems(config, items): + +def pytest_collection_modifyitems(config, items): if not config.getoption("--integration"): - skip_integration = pytest.mark.skip(reason="Integration test - use --integration to run") + skip_integration = pytest.mark.skip( + reason="Integration test - use --integration to run" + ) for item in items: test_path = Path(item.fspath) if "integration" in test_path.parts: item.add_marker(skip_integration) + def pytest_addoption(parser): print("Adding custom option") # Debug print parser.addoption( "--integration", action="store_true", default=False, - help="run integration tests" - ) \ No newline at end of file + help="run integration tests", + ) diff --git a/tests/integration/test_transfer_load.py b/tests/integration/test_transfer_load.py index 6a2f0a6..89900d5 100644 --- a/tests/integration/test_transfer_load.py +++ b/tests/integration/test_transfer_load.py @@ -3,6 +3,7 @@ import pandas as pd from 
baskerville import seqnn + def test_housby(): model_file = "/home/yuanh/analysis/Borzoi_transfer/exp_10_10_23/hayflick/houlsby/latent_8/train/f0c0/train/model_best.h5" targets_file = "tests/data/transfer/targets.txt" From 913280758c0340179eb83fab322cb3396e183f28 Mon Sep 17 00:00:00 2001 From: hy395 Date: Thu, 19 Dec 2024 17:36:28 -0800 Subject: [PATCH 25/26] fix bug on json param --- docs/transfer/transfer.md | 1 - src/baskerville/scripts/hound_transfer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/transfer/transfer.md b/docs/transfer/transfer.md index be068e8..32ff94b 100644 --- a/docs/transfer/transfer.md +++ b/docs/transfer/transfer.md @@ -137,7 +137,6 @@ Similar to Borzoi training, arguments for training learning is also indicated in "transfer": { "mode": "adapter", "adapter": "locon", - "adapter_latent": 8, "conv_select": 4 }, ``` diff --git a/src/baskerville/scripts/hound_transfer.py b/src/baskerville/scripts/hound_transfer.py index 88ca008..886bb02 100755 --- a/src/baskerville/scripts/hound_transfer.py +++ b/src/baskerville/scripts/hound_transfer.py @@ -116,7 +116,7 @@ def main(): params_transfer = params["transfer"] transfer_mode = params_transfer.get("mode", "full") transfer_adapter = params_transfer.get("adapter", None) - transfer_latent = params_transfer.get("latent", 8) + transfer_latent = params_transfer.get("adapter_latent", 8) transfer_conv_select = params_transfer.get("conv_select", 4) transfer_conv_rank = params_transfer.get("conv_latent", 4) transfer_lora_alpha = params_transfer.get("lora_alpha", 16) From a636082145e4555cb8833616f27c7315499d5a06 Mon Sep 17 00:00:00 2001 From: hy395 Date: Fri, 20 Dec 2024 15:32:28 -0800 Subject: [PATCH 26/26] update tutorial --- docs/transfer/params.json | 1 - docs/transfer/transfer.md | 12 ++++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/transfer/params.json b/docs/transfer/params.json index 1d4113b..89e8df7 100644 --- a/docs/transfer/params.json +++ 
b/docs/transfer/params.json @@ -17,7 +17,6 @@ "transfer": { "mode": "adapter", "adapter": "locon", - "adapter_latent": 8, "conv_select": 4 }, "model": { diff --git a/docs/transfer/transfer.md b/docs/transfer/transfer.md index 32ff94b..ba67b82 100644 --- a/docs/transfer/transfer.md +++ b/docs/transfer/transfer.md @@ -163,17 +163,25 @@ Similar to Borzoi training, arguments for training learning is also indicated in ### Step 6. Train model +Run hound_transfer.py on one fold: + +```bash +hound_transfer.py -o train/f0c0/train \ + --restore ${data_path}/pretrain_trunks/f0c0.h5 \ + --trunk params.json \ + train/f0c0/data0 +``` + Use westminster_train_folds.py with `--transfer` option to perform transfer learning on the dataset. ```bash westminster_train_folds.py -e 'tf2.12' \ -q nvidia_geforce_rtx_4090 \ --name "locon" \ - --rc --shifts "0,1" -o train -f 4 --step 8 --eval_train_off \ + -o train -f 4 \ --restore ${data_path}/pretrain_trunks \ --trunk \ --transfer \ - --train_f3 \ --weight_file model_best.mergeW.h5 \ params.json \ ${data_path}/tfr