Added ROSE workflow

daisybio · Jan 6, 2024 · ef90358 · ef90358
1 parent 8c6c89e
commit ef90358
Show file tree

Hide file tree

Showing 14 changed files with 898 additions and 3 deletions.
diff --git a/bin/bed_to_gff.py b/bin/bed_to_gff.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import argparse
+
+def convert_bed_to_gff(bed_filename, gff_filename):
+    """Convert a BED file into a minimal nine-column GFF file.
+
+    Every BED record yields one GFF feature with source ``bed2gff`` and type
+    ``region``; score, strand, phase and attributes are all written as ``.``.
+    The 0-based BED start is shifted by one to give the 1-based GFF start.
+    The end field is copied verbatim: an exclusive 0-based end and an
+    inclusive 1-based end are the same number.
+
+    Lines with fewer than three whitespace-separated fields (including blank
+    lines), ``#`` comments and ``track``/``browser`` header lines are skipped.
+
+    Args:
+        bed_filename: Path of the BED file to read.
+        gff_filename: Path of the GFF file to write; an existing file is
+            overwritten.
+
+    Raises:
+        ValueError: If the start field of a data line is not an integer.
+    """
+    # Fixed values for the GFF columns that are not derived from the BED record
+    # (column names as in https://en.wikipedia.org/wiki/General_feature_format)
+    source = 'bed2gff'
+    feature_type = 'region'
+    score = '.'
+    strand = '.'
+    phase = '.'
+    attributes = '.'
+
+    # First tokens that mark BED header lines rather than genomic records
+    header_keywords = ('track', 'browser')
+
+    gff_lines = []
+
+    with open(bed_filename, 'r') as bed:
+        for line_number, line in enumerate(bed, start=1):
+            parts = line.strip().split()
+            if len(parts) < 3:
+                # skip invalid lines
+                continue
+            if parts[0].startswith('#') or parts[0] in header_keywords:
+                # header/comment lines may have three or more tokens but carry no coordinates
+                continue
+
+            seqid, end = parts[0], parts[2]
+            try:
+                # adjust for 0-based to 1-based coordinate
+                start = int(parts[1]) + 1
+            except ValueError as err:
+                raise ValueError(
+                    f"{bed_filename}:{line_number}: start coordinate {parts[1]!r} is not an integer"
+                ) from err
+
+            gff_lines.append(
+                f"{seqid}\t{source}\t{feature_type}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{attributes}"
+            )
+
+    # The output is only opened after the input has been read completely, so a
+    # malformed input never leaves a half-written GFF behind.
+    with open(gff_filename, 'w') as gff:
+        # Terminate every record (including the last one) with a newline
+        gff.writelines(f"{gff_line}\n" for gff_line in gff_lines)
+
+
+def main(argv=None):
+    """Parse the command line and run the BED-to-GFF conversion.
+
+    Both ``--bed``/``-i`` and ``--gff``/``-o`` are mandatory; when one is
+    missing argparse prints a usage message and exits instead of a ``None``
+    path being handed to the converter.
+
+    Args:
+        argv: Argument list to parse. ``None`` (the default) makes argparse
+            read ``sys.argv[1:]``.
+    """
+    parser = argparse.ArgumentParser(description='Convert BED file to GFF format.')
+    parser.add_argument('--bed', '-i', type=str, required=True, help='Input BED file')
+    parser.add_argument('--gff', '-o', type=str, required=True, help='Output GFF file')
+
+    args = parser.parse_args(argv)
+
+    convert_bed_to_gff(args.bed, args.gff)
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/reformat_gff.py b/bin/reformat_gff.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import argparse
+
+
+def reformat_gff(path_input, path_output):
+    """Rewrite a nine-column GFF file into the column layout written for ROSE.
+
+    The output is tab-separated, has no header, and contains per row:
+    seqname, enhancer id, empty, start, end, empty, strand, empty, enhancer id.
+    Enhancer ids are ``enhancer_<n>`` with ``n`` counting rows from 0.
+    Sequence names that do not already start with ``chr`` get that prefix.
+
+    Args:
+        path_input: Path of the tab-separated GFF file to read. Lines starting
+            with ``#`` are ignored.
+        path_output: Path of the reformatted file to write.
+    """
+    gff = pd.read_csv(path_input,
+                      sep="\t",
+                      names=["seqname", "source", "feature1", "start", "end", "score", "strand", "dot", "feature2"],
+                      index_col=False,
+                      # keep purely numeric chromosome names ("1", "2", ...) as text
+                      dtype={"seqname": str},
+                      # skip GFF comment/directive lines instead of parsing them as rows
+                      comment="#")
+    gff = gff.drop(columns=["score", "feature1", "dot", "feature2"])
+
+    # Prefix row by row so names that already carry "chr" are left untouched
+    gff['seqname'] = [chrom if chrom.startswith('chr') else 'chr' + chrom
+                      for chrom in gff['seqname'].astype(str)]
+
+    enhancer_ids = ['enhancer_' + str(num) for num in range(gff.shape[0])]
+    gff['source'] = enhancer_ids
+    gff['id2'] = enhancer_ids
+    gff['empty1'] = ''
+    gff['empty2'] = ''
+    gff['empty3'] = ''
+    gff = gff[['seqname', 'source', 'empty1', 'start', 'end', 'empty2', 'strand', 'empty3', 'id2']]
+
+    gff.to_csv(path_output, sep="\t", header=False, index=False)
+
+
+def main():
+    """Parse ``-i/--input`` and ``-o/--output`` and run the reformatting."""
+    parser = argparse.ArgumentParser(prog="GFF to ROSE-GFF",
+                                        description="Takes a standard GFF as input and reformats it into a ROSE input GFF")
+
+    parser.add_argument("-i", "--input", required=True)
+    parser.add_argument("-o", "--output", required=True)
+
+    args = parser.parse_args()
+    reformat_gff(args.input, args.output)
+
+
+if __name__ == "__main__":
+    main()