From f58fa4be3670ee488ad7a50fb24d07bc2cdc568f Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 15 May 2024 11:02:58 +0100
Subject: [PATCH 1/9] Rewrote the split function -- still need to add the
 header cleaning function, but at present this works

---
 src/main.rs           |   9 +++-
 src/split_by_count.rs | 119 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 106 insertions(+), 22 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index aedc212..9b8ebf4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -30,6 +30,7 @@ mod exclude_seq;
 use crate::exclude_seq::exclude_seq_mod::filter_fasta;
 
 fn main() -> Result<(), Error> {
+    let split_options = ["pep", "cds", "cdna", "rna", "other"];
     let match_result = command!()
     .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project")
     .subcommand(
@@ -73,7 +74,13 @@ fn main() -> Result<(), Error> {
                     .aliases(["out"])
                     .required(false)
                     .default_value("./")
-                    .help("The output directory that files will be placed in")
+                    .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa")
+            )
+            .arg(
+                Arg::new("data_type")
+                    .short('d')
+                    .value_parser(clap::builder::PossibleValuesParser::new(&split_options))
+                    .help("The data type of the input data")
             )
             .arg(
                 Arg::new("count")
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index 495f5af..742cd95 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -1,12 +1,65 @@
 pub mod split_by_count_mod {
     use clap::ArgMatches;
+    use compare::{natural, Compare};
+    use noodles::fasta::record::Definition;
+    use noodles::fasta::{self, Record};
+    use std::cmp::Ordering::{self, Equal};
+    use std::fs::OpenOptions;
     use std::{
-        fs::File,
-        io::{BufRead, BufReader},
+        fs::{create_dir_all, File},
+        io::{stdout, BufRead, BufReader, Write},
+        path::Path,
     };
 
+    fn sanitise_headers(head: &Definition) -> String {
+        return head.to_string();
+    }
+
+    fn fix_head(records: Record, sanitise: bool) -> Record {
+        let clean_headers = true;
+        if clean_headers {
+            let header = sanitise_headers(records.definition());
+
+            let definition = fasta::record::Definition::new(header, None);
+            let seq = records.sequence().to_owned();
+            return fasta::Record::new(definition, seq);
+        } else {
+            return records.to_owned();
+        };
+    }
+
+    fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
+        println!("{}", outdir);
+
+        let _data_file = File::create(&outdir);
+        let mut file = OpenOptions::new()
+            .write(true)
+            .append(true)
+            .open(outdir)
+            .expect("creation failed");
+
+        let mut writer = fasta::Writer::new(file);
+        for i in fasta_record {
+            writer.write_record(&i).unwrap();
+        }
+    }
+
     pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
         let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
+        let path_obj = Path::new(fasta_file);
+        let grab_name = path_obj.file_name().unwrap();
+        let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect();
+        let actual_name = actual_list[0];
+
+        let data_type = arguments.unwrap().get_one::<String>("data_type").unwrap();
+
+        let outpath = arguments
+            .unwrap()
+            .get_one::<String>("output-directory")
+            .unwrap();
+
+        let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
+        create_dir_all(new_outpath.clone()).unwrap();
         let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
         println!("Fasta file for processing: {:?}", fasta_file);
         println!("{:?}", &fasta_count);
@@ -15,27 +68,51 @@ pub mod split_by_count_mod {
             fasta_count
         );
 
-        let chunk_val = *fasta_count;
-        let mut counter = 0;
-        let mut global_counter = 0;
-
-        let input = File::open(fasta_file).expect("CANT OPEN FASTA");
-        let buffered = BufReader::new(input);
-
-        for line in buffered.lines() {
-            if counter != chunk_val {
-                if line.expect("NO LINES IN FASTA").starts_with('>') {
-                    println!("header");
-                } else {
-                    println!("Sequence");
-                    counter += 1;
-                    global_counter += 1;
-                }
-            } else {
+        let mut counter: u16 = 0;
+        let mut file_counter = 1;
+        let clean_headers = true;
+
+        let file_name: Vec<&str> = actual_name.split(".").collect();
+
+        let mut reader = File::open(fasta_file)
+            .map(BufReader::new)
+            .map(fasta::Reader::new)
+            .unwrap();
+
+        let mut record_list: Vec<Record> = Vec::new();
+        for result in reader.records() {
+            let record = result.unwrap();
+            counter += 1;
+
+            let final_rec = fix_head(record, clean_headers);
+            record_list.push(final_rec);
+
+            let cmp = natural();
+            let compared = cmp.compare(&counter, fasta_count);
+            if compared == Ordering::Equal {
+                let full_outpath = format!(
+                    "{}{}_f{}_c{}-a{}.fa",
+                    new_outpath,
+                    file_name[0],
+                    file_counter,
+                    &fasta_count,
+                    &record_list.len()
+                );
+
+                write_fasta(&full_outpath, &record_list);
+                file_counter += 1;
                 counter = 0;
-                println!("CHUNK");
+                record_list = Vec::new();
             }
         }
-        println!("Total number of pairs: {:?}", global_counter);
+        let full_outpath = format!(
+            "{}{}_f{}_c{}-a{}.fa",
+            new_outpath,
+            file_name[0],
+            file_counter,
+            &fasta_count,
+            &record_list.len()
+        );
+        write_fasta(&full_outpath, &record_list);
     }
 }

From 09c627368383793610f591e848c5a4ad8dabdcb0 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 15 May 2024 11:03:18 +0100
Subject: [PATCH 2/9] Updates

---
 Cargo.lock | 7 +++++++
 Cargo.toml | 1 +
 2 files changed, 8 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 6249461..f5b6709 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -236,6 +236,12 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "compare"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3"
+
 [[package]]
 name = "const_format"
 version = "0.2.32"
@@ -409,6 +415,7 @@ version = "0.1.2"
 dependencies = [
  "clap",
  "colored",
+ "compare",
  "csv",
  "io",
  "noodles",
diff --git a/Cargo.toml b/Cargo.toml
index ad77b8f..ed8c798 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ edition = "2021"
 [dependencies]
 clap = { version = "4.4.4", features = ["cargo"] }
 colored = "2.0.4"
+compare = "0.1.0"
 csv = "1.3.0"
 io = "0.0.2"
 noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }

From 9a3dde8f1d02821d4de180bc0c9347826131c92c Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 15 May 2024 11:14:06 +0100
Subject: [PATCH 3/9] Updated to match new function

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ae3e739..79c7001 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,9 @@ Currently, this program has the following arguments:
     This command will generate a directory of files made up of a user given number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner.
     The count will be the upper limit, as there will be a left over number of records.
 
-    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}`
+    This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa`
+
+    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
 
 -   split_by_size (NOT YET WRITTEN)
 
@@ -59,5 +61,10 @@ Currently, this program has the following arguments:
 
     -   GC percentage per scaffold + counts
     -   GC percentage whole genome
+    -   N50 and N90
+    -   L50
+    -   GAP count and length (summary with average length)
+
+    `profile -f input.fasta -o outdir`
 
 If there are other options that would be useful to any other teams, leave a message or issue.

From 8ac5d6dc966fa5a7bddf2828643a37c08b0c8e00 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 15 May 2024 12:23:33 +0100
Subject: [PATCH 4/9] Updates to start working on the header sanitisation

---
 src/generics.rs       |  5 +++++
 src/main.rs           |  7 +++++++
 src/split_by_count.rs | 11 ++++-------
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index 2811f50..efdc8af 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -1,4 +1,5 @@
 use noodles::fasta;
+use noodles::fasta::record::Definition;
 use std::error::Error;
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
@@ -41,3 +42,7 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
     // Take a HashMap and return a Key only Vec
     map.into_iter().map(|(k, _v)| k)
 }
+
+pub fn sanitise_header(old_header: &Definition) -> std::string::String {
+    return "THIS".to_string();
+}
diff --git a/src/main.rs b/src/main.rs
index 9b8ebf4..dc6fc18 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,6 +82,13 @@ fn main() -> Result<(), Error> {
                     .value_parser(clap::builder::PossibleValuesParser::new(&split_options))
                     .help("The data type of the input data")
             )
+            .arg(
+                Arg::new("sanitise")
+                    .short('s')
+                    .value_parser(clap::value_parser!(bool))
+                    .default_value("false")
+                    .help("Do we need to sanitise the headers of the input fasta")
+            )
             .arg(
                 Arg::new("count")
                     .short('c')
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index 742cd95..bded683 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -1,4 +1,5 @@
 pub mod split_by_count_mod {
+    use crate::generics::sanitise_header;
     use clap::ArgMatches;
     use compare::{natural, Compare};
     use noodles::fasta::record::Definition;
@@ -11,14 +12,10 @@ pub mod split_by_count_mod {
         path::Path,
     };
 
-    fn sanitise_headers(head: &Definition) -> String {
-        return head.to_string();
-    }
-
     fn fix_head(records: Record, sanitise: bool) -> Record {
         let clean_headers = true;
         if clean_headers {
-            let header = sanitise_headers(records.definition());
+            let header = sanitise_header(records.definition());
 
             let definition = fasta::record::Definition::new(header, None);
             let seq = records.sequence().to_owned();
@@ -45,6 +42,7 @@ pub mod split_by_count_mod {
     }
 
     pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
+        let sanitise: &bool = arguments.unwrap().get_one::<bool>("sanitise").unwrap();
         let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
         let path_obj = Path::new(fasta_file);
         let grab_name = path_obj.file_name().unwrap();
@@ -70,7 +68,6 @@ pub mod split_by_count_mod {
 
         let mut counter: u16 = 0;
         let mut file_counter = 1;
-        let clean_headers = true;
 
         let file_name: Vec<&str> = actual_name.split(".").collect();
 
@@ -84,7 +81,7 @@ pub mod split_by_count_mod {
             let record = result.unwrap();
             counter += 1;
 
-            let final_rec = fix_head(record, clean_headers);
+            let final_rec = fix_head(record, *sanitise);
             record_list.push(final_rec);
 
             let cmp = natural();

From fbc1f822fa22e79f9d13c09be434966a0ec4b9a4 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Fri, 17 May 2024 14:27:15 +0100
Subject: [PATCH 5/9] Minor refactoring of commands to remove unnecessary lines

---
 src/generics.rs       | 46 +++++++++++++++++++++++++--
 src/main.rs           | 73 ++++++++++++++-----------------------------
 src/split_by_count.rs | 19 +++++------
 3 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index efdc8af..fbe788d 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -1,5 +1,6 @@
 use noodles::fasta;
 use noodles::fasta::record::Definition;
+use regex::{Captures, Regex};
 use std::error::Error;
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
@@ -43,6 +44,47 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
     map.into_iter().map(|(k, _v)| k)
 }
 
-pub fn sanitise_header(old_header: &Definition) -> std::string::String {
-    return "THIS".to_string();
+fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
+    let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
+
+    let first_run = re.captures(&header).ok_or("None")?;
+
+    if first_run[0] == "None".to_owned() {
+        let re = Regex::new(r"symbol:(\S+)").unwrap();
+        let second_run = re.captures(&header).ok_or("None")?;
+        if second_run[0] == "None".to_owned() {
+            let re = Regex::new(r"(\(\S+\)) gene").unwrap();
+            let third_run = re.captures(&header).ok_or("None")?;
+            if third_run[0] == "None".to_owned() {
+                Ok("NOCAPTUREDRESULT".to_string())
+            } else {
+                Ok(third_run[0].to_string())
+            }
+        } else {
+            Ok(second_run[0].to_string())
+        }
+    } else {
+        Ok(first_run[0].to_string())
+    }
+}
+
+fn get_ens_code(header: String) {
+    // Dont know if we will even need this one as our curators want minimal
+    // information for the jbrowse instance
+    let re = Regex::new(r"GeneID:([1-9])\w+").unwrap();
+
+    let matches = re.captures(&header).unwrap();
+}
+
+pub fn sanitise_header(old_header: &Definition) -> String {
+    let x = get_gene_symbol(old_header.to_string());
+
+    // Yeah i dont know either...
+    match x {
+        Ok(c) => c,
+        Err(e) => {
+            format!("Regex isnt good enough to capture header id: {}", e)
+        }
+    }
+    //let ens_code = get_ens_code(old_header.to_string());
 }
diff --git a/src/main.rs b/src/main.rs
index dc6fc18..229f55f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -46,14 +46,11 @@ fn main() -> Result<(), Error> {
                     .short('v')
                     .value_parser(clap::value_parser!(bool))
                     .default_value("false")
-                    .required(false)
                     .help("Print explainers as to why validation fails, if it does fail")
             )
             .arg(
                 Arg::new("output")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("Output the log to file")
             )
@@ -64,15 +61,12 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa")
             )
@@ -86,14 +80,11 @@ fn main() -> Result<(), Error> {
                 Arg::new("sanitise")
                     .short('s')
                     .value_parser(clap::value_parser!(bool))
-                    .default_value("false")
                     .help("Do we need to sanitise the headers of the input fasta")
             )
             .arg(
                 Arg::new("count")
                     .short('c')
-                    .long("file-count")
-                    .aliases(["count"])
                     .value_parser(clap::value_parser!(u16))
                     .help("How many sequences per file")
             )
@@ -104,14 +95,12 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("mem-size")
                     .short('s')
-                    .long("mem-size")
                     .required(true)
                     .value_parser(clap::value_parser!(u16))
                     .help("Size in MB that a fasta file is to be chunked into")
@@ -119,35 +108,45 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("The output directory that files will be placed in")
             )
     )
+    .subcommand(
+        Command::new("geneset_csvs")
+            .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal")
+            .arg(
+                Arg::new("geneset_dir")
+                    .short('d')
+                    .required(true)
+                    .help("The path to the top level directory of your geneset directory.")
+            )
+            .arg(
+                Arg::new("specifiy_clade")
+                    .short('c')
+                    .required(true)
+                    .default_value("ALL")
+                    .help("Specify the clade folder to refresh")
+            )
+    )
     .subcommand(
         Command::new("mapheaders")
             .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers")
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
             )
             .arg(
                 Arg::new("replace-with")
                     .short('r')
-                    .aliases(["replacement"])
-                    .required(false)
                     .default_value("FMMH")
                     .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'")
             )
@@ -158,22 +157,18 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./new")
                     .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
             )
             .arg(
                 Arg::new("map-file")
                     .short('m')
-                    .aliases(["mapped-header-file"])
                     .required(true)
                     .help("The original mapped header field, a TSV of old-header, new-header")
             )
@@ -184,15 +179,13 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-file")
                 .short('f')
-                .aliases(["fsata"])
                 .required(true)
                 .help("The input fasta file for profiling")
         )
         .arg(
             Arg::new("output-dir")
                 .short('o')
-                .aliases(["outdir"])
-                .default_value("FASTMAN-out")
+                .default_value("FasMan-out")
                 .help("The input fasta file for profiling")
         )
     )
@@ -202,21 +195,18 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta")
                 .short('f')
-                .aliases(["fasta"])
                 .required(true)
                 .help("The input fasta file for re-organising")
         )
         .arg(
             Arg::new("tpf")
                 .short('t')
-                .aliases(["tpf file"])
                 .required(true)
                 .help("The TPF file used to re-organise the input fasta")
         )
         .arg(
             Arg::new("sort")
                 .short('s')
-                .required(false)
                 .value_parser(clap::value_parser!(bool))
                 .default_value("false")
                 .help("Size sort the output or leave as order in AGP")
@@ -224,14 +214,11 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("output")
                 .short('o')
-                .aliases(["out"])
-                .required(false)
                 .default_value("new.fasta")
                 .help("The output name of the new fasta file")
         )
         .arg(
             Arg::new("n_length")
-                .aliases(["n_len"])
                 .value_parser(clap::value_parser!(usize))
                 .default_value("200")
                 .help("Length that the N (gap) string should be.")
@@ -243,7 +230,6 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-file")
                 .short('f')
-                .aliases(["fsata"])
                 .required(true)
                 .help("The input fasta file for profiling")
         )
@@ -251,8 +237,6 @@ fn main() -> Result<(), Error> {
             Arg::new("random")
                 .short('r')
                 .value_parser(clap::value_parser!(bool))
-                .default_value("false")
-                .aliases(["random"])
                 .help("Random subset of input file. Default skims the first X given percent")
         )
         .arg(
@@ -260,7 +244,6 @@ fn main() -> Result<(), Error> {
                 .short('p')
                 .value_parser(clap::value_parser!(u16))
                 .default_value("50")
-                .aliases(["proportion"])
                 .help("Percentage of the original file entries that should be retained")
         )
     )
@@ -276,14 +259,12 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("output")
                     .short('o')
-                    .required(false)
                     .default_value("FiilteredFasta.fa")
                     .help("The outfile naming")
             )
             .arg(
                 Arg::new("filter_list")
                     .short('l')
-                    .required(false)
                     .help("A string comma-separated list of sequence names to exclude from the final fasta")
             )
     )
@@ -293,30 +274,24 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-1")
                 .short('p')
-                .aliases(["primary-fasta"])
                 .required(true)
                 .help("The input fasta file for re-organising")
         )
         .arg(
             Arg::new("fasta-2")
                 .short('s')
-                .aliases(["secondary-fasta"])
                 .required(true)
                 .help("The second input fasta file")
         )
         .arg(
             Arg::new("naming")
                 .short('s')
-                .aliases(["naming"])
-                .required(false)
                 .default_value("PRI/HAP")
                 .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output")
         )
         .arg(
             Arg::new("output")
                 .short('o')
-                .aliases(["output"])
-                .required(false)
                 .default_value("merged")
                 .help("Output file prefix")
         )
@@ -324,17 +299,15 @@ fn main() -> Result<(), Error> {
     .get_matches();
 
     println! {
-        "{}\n{}\n{}",
+        "{}\n{}\n{}\n{}\n-- {}\n{}\n-- {}",
         "WELCOME TO Fasta Manipulator".bold(),
         "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(),
-        "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple()
-    };
-
-    println!(
-        "RUNNING : {:?} : SUBCOMMAND\nRUNNING ON: {:?}",
+        "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(),
+        "RUNNING SUBCOMMAND:",
         match_result.subcommand_name().unwrap(),
+        "RUNNING ON:",
         env::consts::OS
-    );
+    };
 
     match match_result.subcommand_name() {
         Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")),
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index bded683..37359fe 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -2,21 +2,18 @@ pub mod split_by_count_mod {
     use crate::generics::sanitise_header;
     use clap::ArgMatches;
     use compare::{natural, Compare};
-    use noodles::fasta::record::Definition;
     use noodles::fasta::{self, Record};
-    use std::cmp::Ordering::{self, Equal};
+    use std::cmp::Ordering;
     use std::fs::OpenOptions;
     use std::{
         fs::{create_dir_all, File},
-        io::{stdout, BufRead, BufReader, Write},
+        io::BufReader,
         path::Path,
     };
 
     fn fix_head(records: Record, sanitise: bool) -> Record {
-        let clean_headers = true;
-        if clean_headers {
+        if sanitise {
             let header = sanitise_header(records.definition());
-
             let definition = fasta::record::Definition::new(header, None);
             let seq = records.sequence().to_owned();
             return fasta::Record::new(definition, seq);
@@ -29,7 +26,7 @@ pub mod split_by_count_mod {
         println!("{}", outdir);
 
         let _data_file = File::create(&outdir);
-        let mut file = OpenOptions::new()
+        let file = OpenOptions::new()
             .write(true)
             .append(true)
             .open(outdir)
@@ -59,15 +56,13 @@ pub mod split_by_count_mod {
         let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
         create_dir_all(new_outpath.clone()).unwrap();
         let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
-        println!("Fasta file for processing: {:?}", fasta_file);
-        println!("{:?}", &fasta_count);
         println!(
-            "Number of sequence-header pairs per file: {:?}",
-            fasta_count
+            "Fasta file for processing: {:?}\nNumber of records per file: {:?}",
+            fasta_file, fasta_count
         );
 
         let mut counter: u16 = 0;
-        let mut file_counter = 1;
+        let mut file_counter: u16 = 1;
 
         let file_name: Vec<&str> = actual_name.split(".").collect();
 

From c9e2968e96d65bf76872bdc8e98edbe58ad2171d Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Tue, 21 May 2024 17:20:07 +0100
Subject: [PATCH 6/9] Turns out header sanitisation only needs to be minimal on
 the newer data coming out of the annotators (which is great). This will
 probably have to be updated to re-include the regex I have commented out for
 now

---
 src/generics.rs       | 51 +++++++++++++++++++------------------------
 src/split_by_count.rs |  1 +
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index fbe788d..5660951 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -1,6 +1,5 @@
 use noodles::fasta;
 use noodles::fasta::record::Definition;
-use regex::{Captures, Regex};
 use std::error::Error;
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
@@ -45,35 +44,30 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
 }
 
 fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
-    let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
+    let header_list: Vec<&str> = header.split(" ").collect();
+    let record_header = header_list[0];
+    Ok(record_header[1..].to_owned())
+    // let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
 
-    let first_run = re.captures(&header).ok_or("None")?;
+    // let first_run = re.captures(&header).ok_or("None")?;
 
-    if first_run[0] == "None".to_owned() {
-        let re = Regex::new(r"symbol:(\S+)").unwrap();
-        let second_run = re.captures(&header).ok_or("None")?;
-        if second_run[0] == "None".to_owned() {
-            let re = Regex::new(r"(\(\S+\)) gene").unwrap();
-            let third_run = re.captures(&header).ok_or("None")?;
-            if third_run[0] == "None".to_owned() {
-                Ok("NOCAPTUREDRESULT".to_string())
-            } else {
-                Ok(third_run[0].to_string())
-            }
-        } else {
-            Ok(second_run[0].to_string())
-        }
-    } else {
-        Ok(first_run[0].to_string())
-    }
-}
-
-fn get_ens_code(header: String) {
-    // Dont know if we will even need this one as our curators want minimal
-    // information for the jbrowse instance
-    let re = Regex::new(r"GeneID:([1-9])\w+").unwrap();
-
-    let matches = re.captures(&header).unwrap();
+    // if first_run[0] == "None".to_owned() {
+    //     let re = Regex::new(r"symbol:(\S+)").unwrap();
+    //     let second_run = re.captures(&header).ok_or("None")?;
+    //     if second_run[0] == "None".to_owned() {
+    //         let re = Regex::new(r"(\(\S+\)) gene").unwrap();
+    //         let third_run = re.captures(&header).ok_or("None")?;
+    //         if third_run[0] == "None".to_owned() {
+    //             Ok("NOCAPTUREDRESULT".to_string())
+    //         } else {
+    //             Ok(third_run[0].to_string())
+    //         }
+    //     } else {
+    //         Ok(second_run[0].to_string())
+    //     }
+    // } else {
+    //     Ok(first_run[0].to_string())
+    // }
 }
 
 pub fn sanitise_header(old_header: &Definition) -> String {
@@ -86,5 +80,4 @@ pub fn sanitise_header(old_header: &Definition) -> String {
             format!("Regex isnt good enough to capture header id: {}", e)
         }
     }
-    //let ens_code = get_ens_code(old_header.to_string());
 }
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index 37359fe..bdb3fd2 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -97,6 +97,7 @@ pub mod split_by_count_mod {
                 record_list = Vec::new();
             }
         }
+
         let full_outpath = format!(
             "{}{}_f{}_c{}-a{}.fa",
             new_outpath,

From 55231b93d2f5a6a5911b83d447a55435f9b8ee4a Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Tue, 21 May 2024 17:21:22 +0100
Subject: [PATCH 7/9] Bump crate version to 0.1.3

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f5b6709..7630133 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -411,7 +411,7 @@ dependencies = [
 
 [[package]]
 name = "fasta_manipulation"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "clap",
  "colored",
diff --git a/Cargo.toml b/Cargo.toml
index ed8c798..4c55605 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fasta_manipulation"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

From 89deb80592c7b8850242b132cb46d9239595a8fa Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 22 May 2024 12:46:22 +0100
Subject: [PATCH 8/9] Clippy fixes: char split patterns, needless borrows, redundant write(true), inline banner labels

---
 src/generics.rs       | 2 +-
 src/main.rs           | 6 ++----
 src/split_by_count.rs | 7 +++----
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index 5660951..f10b880 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -44,7 +44,7 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
 }
 
 fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
-    let header_list: Vec<&str> = header.split(" ").collect();
+    let header_list: Vec<&str> = header.split(' ').collect();
     let record_header = header_list[0];
     Ok(record_header[1..].to_owned())
     // let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
diff --git a/src/main.rs b/src/main.rs
index 229f55f..9e947f9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -73,7 +73,7 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("data_type")
                     .short('d')
-                    .value_parser(clap::builder::PossibleValuesParser::new(&split_options))
+                    .value_parser(clap::builder::PossibleValuesParser::new(split_options))
                     .help("The data type of the input data")
             )
             .arg(
@@ -299,13 +299,11 @@ fn main() -> Result<(), Error> {
     .get_matches();
 
     println! {
-        "{}\n{}\n{}\n{}\n-- {}\n{}\n-- {}",
+        "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}",
         "WELCOME TO Fasta Manipulator".bold(),
         "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(),
         "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(),
-        "RUNNING SUBCOMMAND:",
         match_result.subcommand_name().unwrap(),
-        "RUNNING ON:",
         env::consts::OS
     };
 
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index bdb3fd2..ee95c30 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -25,16 +25,15 @@ pub mod split_by_count_mod {
     fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
         println!("{}", outdir);
 
-        let _data_file = File::create(&outdir);
+        let _data_file = File::create(outdir);
         let file = OpenOptions::new()
-            .write(true)
             .append(true)
             .open(outdir)
             .expect("creation failed");
 
         let mut writer = fasta::Writer::new(file);
         for i in fasta_record {
-            writer.write_record(&i).unwrap();
+            writer.write_record(i).unwrap();
         }
     }
 
@@ -64,7 +63,7 @@ pub mod split_by_count_mod {
         let mut counter: u16 = 0;
         let mut file_counter: u16 = 1;
 
-        let file_name: Vec<&str> = actual_name.split(".").collect();
+        let file_name: Vec<&str> = actual_name.split('.').collect();
 
         let mut reader = File::open(fasta_file)
             .map(BufReader::new)

From f0f6103c96ac5101a9245496c85e5a70b1b77ee4 Mon Sep 17 00:00:00 2001
From: DLBPointon <damonlbp@hotmail.co.uk>
Date: Wed, 22 May 2024 12:48:43 +0100
Subject: [PATCH 9/9] Allow clippy::needless_return on fix_head

---
 src/split_by_count.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index ee95c30..1396f00 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -11,6 +11,7 @@ pub mod split_by_count_mod {
         path::Path,
     };
 
+    #[allow(clippy::needless_return)]
     fn fix_head(records: Record, sanitise: bool) -> Record {
         if sanitise {
             let header = sanitise_header(records.definition());