Merge pull request #37 from jjjk123/develop

added function to filter transcripts in clusters
collaborativebioinformatics · Aug 31, 2023 · 9d8e2c8 · 9d8e2c8
2 parents c193533 + 13202d6
commit 9d8e2c8
Showing 1 changed file with 49 additions and 39 deletions.
diff --git a/src/isocomp/Compare/compare_isoforms_in_cluster.py b/src/isocomp/Compare/compare_isoforms_in_cluster.py
@@ -125,43 +125,53 @@ def compare_isoforms_in_cluster(
     # TODO parameterize the cases in which isoforms are compared --eg, 
     # same strand, overlap threshold, different subjects
     else:
-        # this produces a cartesian product of sorts... looks something
-        # like this:
-        # vector_crosser(['tx_1','tx_2','tx_3'],['tx_1','tx_2','tx_3'])
-        # {'V1': ['tx_2', 'tx_2', 'tx_1'], 'V2': ['tx_1', 'tx_3', 'tx_3']}
-        # the V1 and V2 lists will be the same length, so if you iterate over
-        # the length of either list and compare the elements at the same index,
-
-        cross_isoforms = vector_crosser(
-            cluster_gtf.unique_id,
-            cluster_gtf.unique_id)
-
-        # iterate over the comparisons produced by vector_crosser() and
-        # conduct the sequence alignments
-        for i in range(len(cross_isoforms['V1'])):
-
-            # get the unique_id corresponding to two comparisons in the
-            # cross_isoforms dict
-            isoform1_id = cross_isoforms['V1'][i]
-            isoform2_id = cross_isoforms['V2'][i]
-
-            # create window ojects which describe the location of the isoforms
-            # according to the gtf
-            isoform1_window = isoform_library\
-                .get_isoform_coord(unique_id=isoform1_id)
-            isoform2_window = isoform_library\
-                .get_isoform_coord(unique_id=isoform2_id)
-
-            # compare the isoform sequences
-            aln = align_isoforms(
-                isoform_library.get_isoform_seq(unique_id=isoform1_id),
-                isoform_library.get_isoform_seq(unique_id=isoform2_id))
-
-            # append the compare_dict as an element to the list out
-            out.append(__output_dict(cluster,
-                                     cluster_window.chr,
-                                     isoform1_window,
-                                     isoform2_window,
-                                     aln))
-
+        # group transcripts by coordinates; return unique
+        cluster_gtf_grouped = cluster_gtf.df.groupby(by=['Start', 'End', 'Strand'], as_index=True)
+
+        for group, cluster_gtf_unique in cluster_gtf_grouped:
+            if len(cluster_gtf_unique) > 1:        
+                # this produces a cartesian product of sorts... looks something
+                # like this:
+                # vector_crosser(['tx_1','tx_2','tx_3'],['tx_1','tx_2','tx_3'])
+                # {'V1': ['tx_2', 'tx_2', 'tx_1'], 'V2': ['tx_1', 'tx_3', 'tx_3']}
+                # the V1 and V2 lists will be the same length, so if you iterate over
+                # the length of either list and compare the elements at the same index,
+
+                cross_isoforms = vector_crosser(
+                    cluster_gtf_unique.unique_id,
+                    cluster_gtf_unique.unique_id)
+
+                # iterate over the comparisons produced by vector_crosser() and
+                # conduct the sequence alignments
+                for i in range(len(cross_isoforms['V1'])):
+
+                    # get the unique_id corresponding to two comparisons in the
+                    # cross_isoforms dict
+                    isoform1_id = cross_isoforms['V1'][i]
+                    isoform2_id = cross_isoforms['V2'][i]
+
+                    # create window objects which describe the location of the isoforms
+                    # according to the gtf
+                    isoform1_window = isoform_library\
+                        .get_isoform_coord(unique_id=isoform1_id)
+                    isoform2_window = isoform_library\
+                        .get_isoform_coord(unique_id=isoform2_id)
+
+                    # compare the isoform sequences
+                    aln = align_isoforms(
+                        isoform_library.get_isoform_seq(unique_id=isoform1_id),
+                        isoform_library.get_isoform_seq(unique_id=isoform2_id))
+
+                    # append the compare_dict as an element to the list out
+                    out.append(__output_dict(cluster,
+                                            cluster_window.chr,
+                                            isoform1_window,
+                                            isoform2_window,
+                                            aln))
+            else:
+                tx_id = cluster_gtf_unique['unique_id'].iloc[0]
+                isoform1_window = isoform_library.get_isoform_coord(unique_id=tx_id)
+                out.append(__output_dict(cluster,
+                                            cluster_window.chr,
+                                            isoform1_window))
     return out