From 95a98308cddf2f134b017ab36329c4e52d5889c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=99drzej=20Kubica?= Date: Thu, 31 Aug 2023 10:59:26 +0000 Subject: [PATCH 1/2] added function to filter transcripts in clusters --- src/isocomp/Compare/compare_isoforms_in_cluster.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/isocomp/Compare/compare_isoforms_in_cluster.py b/src/isocomp/Compare/compare_isoforms_in_cluster.py index 75cb793..d1c9432 100644 --- a/src/isocomp/Compare/compare_isoforms_in_cluster.py +++ b/src/isocomp/Compare/compare_isoforms_in_cluster.py @@ -108,6 +108,9 @@ def compare_isoforms_in_cluster( # TODO consider doing this as a view of the isoform_library cluster_gtf = isoform_library.get_cluster(cluster) + # drop duplicated transcripts + cluster_gtf = cluster_gtf.drop_duplicate_positions() + # note that the score attribute stores the number of isoforms in # the window cluster_window = isoform_library.get_cluster_coord(cluster) @@ -145,7 +148,7 @@ def compare_isoforms_in_cluster( isoform1_id = cross_isoforms['V1'][i] isoform2_id = cross_isoforms['V2'][i] - # create window ojects which describe the location of the isoforms + # create window objects which describe the location of the isoforms # according to the gtf isoform1_window = isoform_library\ .get_isoform_coord(unique_id=isoform1_id) From 13202d672da50a7eef4e379b0b4dd9d6fc84728b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=99drzej=20Kubica?= Date: Thu, 31 Aug 2023 21:29:44 +0000 Subject: [PATCH 2/2] added function to find unique isoforms --- .../Compare/compare_isoforms_in_cluster.py | 91 ++++++++++--------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/src/isocomp/Compare/compare_isoforms_in_cluster.py b/src/isocomp/Compare/compare_isoforms_in_cluster.py index d1c9432..c68c42d 100644 --- a/src/isocomp/Compare/compare_isoforms_in_cluster.py +++ b/src/isocomp/Compare/compare_isoforms_in_cluster.py @@ -108,9 +108,6 @@ def compare_isoforms_in_cluster( # TODO consider doing this as a view of the isoform_library cluster_gtf = isoform_library.get_cluster(cluster) - # drop duplicated transcripts - cluster_gtf = cluster_gtf.drop_duplicate_positions() - # note that the score attribute stores the number of isoforms in # the window cluster_window = isoform_library.get_cluster_coord(cluster) @@ -128,43 +125,53 @@ def compare_isoforms_in_cluster( # TODO parameterize the cases in which isoforms are compared --eg, # same strand, overlap threshold, different subjects else: - # this produces a cartesian product of sorts... looks something - # like this: - # vector_crosser(['tx_1','tx_2','tx_3'],['tx_1','tx_2','tx_3']) - # {'V1': ['tx_2', 'tx_2', 'tx_1'], 'V2': ['tx_1', 'tx_3', 'tx_3']} - # the V1 and V2 lists will be the same length, so if you iterate over - # the length of either list and compare the elements at the same index, - - cross_isoforms = vector_crosser( - cluster_gtf.unique_id, - cluster_gtf.unique_id) - - # iterate over the comparisons produced by vector_crosser() and - # conduct the sequence alignments - for i in range(len(cross_isoforms['V1'])): - - # get the unique_id corresponding to two comparisons in the - # cross_isoforms dict - isoform1_id = cross_isoforms['V1'][i] - isoform2_id = cross_isoforms['V2'][i] - - # create window objects which describe the location of the isoforms - # according to the gtf - isoform1_window = isoform_library\ - .get_isoform_coord(unique_id=isoform1_id) - isoform2_window = isoform_library\ - .get_isoform_coord(unique_id=isoform2_id) - - # compare the isoform sequences - aln = align_isoforms( - isoform_library.get_isoform_seq(unique_id=isoform1_id), - isoform_library.get_isoform_seq(unique_id=isoform2_id)) - - # append the compare_dict as an element to the list out - out.append(__output_dict(cluster, - cluster_window.chr, - isoform1_window, - isoform2_window, - aln)) - + # group transcripts by coordinates; return unique + cluster_gtf_grouped = cluster_gtf.df.groupby(by=['Start', 'End', 'Strand'], as_index=True) + + for group, cluster_gtf_unique in cluster_gtf_grouped: + if len(cluster_gtf_unique) > 1: + # this produces a cartesian product of sorts... looks something + # like this: + # vector_crosser(['tx_1','tx_2','tx_3'],['tx_1','tx_2','tx_3']) + # {'V1': ['tx_2', 'tx_2', 'tx_1'], 'V2': ['tx_1', 'tx_3', 'tx_3']} + # the V1 and V2 lists will be the same length, so if you iterate over + # the length of either list and compare the elements at the same index, + + cross_isoforms = vector_crosser( + cluster_gtf_unique.unique_id, + cluster_gtf_unique.unique_id) + + # iterate over the comparisons produced by vector_crosser() and + # conduct the sequence alignments + for i in range(len(cross_isoforms['V1'])): + + # get the unique_id corresponding to two comparisons in the + # cross_isoforms dict + isoform1_id = cross_isoforms['V1'][i] + isoform2_id = cross_isoforms['V2'][i] + + # create window objects which describe the location of the isoforms + # according to the gtf + isoform1_window = isoform_library\ + .get_isoform_coord(unique_id=isoform1_id) + isoform2_window = isoform_library\ + .get_isoform_coord(unique_id=isoform2_id) + + # compare the isoform sequences + aln = align_isoforms( + isoform_library.get_isoform_seq(unique_id=isoform1_id), + isoform_library.get_isoform_seq(unique_id=isoform2_id)) + + # append the compare_dict as an element to the list out + out.append(__output_dict(cluster, + cluster_window.chr, + isoform1_window, + isoform2_window, + aln)) + else: + tx_id = cluster_gtf_unique['unique_id'].iloc[0] + isoform1_window = isoform_library.get_isoform_coord(unique_id=tx_id) + out.append(__output_dict(cluster, + cluster_window.chr, + isoform1_window)) return out