From e07a6895940b8cdd76aa5becd1d0b5528e29e752 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 23 May 2024 21:04:45 +0100
Subject: [PATCH 1/7] Documenting code and removing an unnecessary check

---
 src/generics.rs  |  1 +
 src/tpf_fasta.rs | 57 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index f10b880..f12e80e 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -19,6 +19,7 @@ pub fn validate_fasta(
 ) -> result::Result<HashMap<String, usize>, Box<dyn Error>> {
     // Simply validate the fasta is valid by reading though and ensure there are
     // valid record formats through out the file
+    // Return a HashMap of header to sequence length
     let reader: Result<fasta::Reader<Box<dyn BufRead>>, std::io::Error> =
         fasta::reader::Builder.build_from_path(path);
     let mut fasta_map = HashMap::new();
diff --git a/src/tpf_fasta.rs b/src/tpf_fasta.rs
index fc5ec7e..2111ded 100644
--- a/src/tpf_fasta.rs
+++ b/src/tpf_fasta.rs
@@ -20,6 +20,7 @@ pub mod tpf_fasta_mod {
     }
 
     impl std::fmt::Display for Tpf {
+        // This is how we want to print a Tpf object
         fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
             write!(
                 fmt,
@@ -42,9 +43,13 @@ pub mod tpf_fasta_mod {
     }
 
     fn parse_tpf(path: &String) -> Vec<Tpf> {
+        // Instantiate a Vec of Tpf objects
         let mut all_tpf: Vec<Tpf> = Vec::new();
         for line in read_to_string(path).unwrap().lines() {
+            // Lines starting with '?' contain scaffold data;
+            // lines without it are gaps
             if line.starts_with('?') {
+                // Parse the data into a Tpf object
                 let line_replaced = line.replace('\t', " ");
                 let line_list: Vec<&str> = line_replaced.split_whitespace().collect();
                 let scaff_data: Vec<&str> = line_list[1].split(':').collect();
@@ -82,6 +87,10 @@ pub mod tpf_fasta_mod {
         parsed: std::option::Option<Sequence>,
         orientation: String,
     ) -> String {
+        // The TPF will contain data in both PLUS (normal) and
+        // MINUS (inverted) orientation; if MINUS, we reverse the
+        // slice and take the complement sequence.
+        // We then return the sequence of the record.
         if orientation == "MINUS" {
             let start = Position::try_from(1).unwrap();
             let parse_orientation = parsed.unwrap();
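Aside: the MINUS rule documented above (reverse the slice, then take the complement) can be illustrated with a standalone sketch. This is a hypothetical helper over plain ASCII bases, not the crate's noodles-based implementation:

    // Illustrative only: reverse the sequence, then complement each
    // base. N and other ambiguity codes pass through unchanged.
    fn reverse_complement(seq: &str) -> String {
        seq.bytes()
            .rev()
            .map(|b| match b {
                b'A' => 'T',
                b'T' => 'A',
                b'G' => 'C',
                b'C' => 'G',
                other => other as char,
            })
            .collect()
    }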
@@ -103,18 +112,22 @@ pub mod tpf_fasta_mod {
         sequence: std::option::Option<Sequence>,
         tpf: Vec<&Tpf>,
     ) -> Vec<NewFasta> {
-        let mut subset_tpf: Vec<NewFasta> = Vec::new();
         //
         // Take the input sequence and scaffold name
         // Parse the input sequence based on the data contained in
-        // the TPF. Which is already a subset based on scaff name
+        // the TPF, which is already a subset based on scaffold name.
+        //
+        // For instance, this Vec may contain only SCAFFOLD_1 TPF records
+        // if the sequence is from a SCAFFOLD_1 component. As we move
+        // through the list, we cut the sequence at the recorded
+        // positions and output the new sequence.
         //
+        let mut subset_tpf: Vec<NewFasta> = Vec::new();
         let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence ()
 
         for &i in &tpf {
             let start = Position::try_from(i.start_coord).unwrap();
             let end = Position::try_from(i.end_coord).unwrap();
-            //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap());
             let parsed = new_seq.slice(start..=end);
             let the_sequence = check_orientation(parsed, i.orientation.to_owned());
             let data = NewFasta {
@@ -127,6 +140,7 @@ pub mod tpf_fasta_mod {
     }
 
     fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
+        // Get a Vec of the unique names in the TPF Vec
         let mut uniques: Vec<String> = Vec::new();
 
         for i in tpf_list {
@@ -145,7 +159,9 @@ pub mod tpf_fasta_mod {
     ) {
         //
         // TPF is in the input TPF order, this will continue to be the case until
-        // the script is modified and the Tpf struct gets modified in place for some reason
+        // such time as the script starts modifying the TPF in place, which
+        // we don't want to happen. If that ever happens, the order will no
+        // longer be guaranteed.
         //
         let _data_file = File::create(output);
         let mut file = OpenOptions::new()
@@ -161,15 +177,18 @@ pub mod tpf_fasta_mod {
 
         let uniques = get_uniques(&tpf_data);
 
-        // This is inefficient as we are scanning through the fasta_data, uniques number of times
+        // This is inefficient as we are scanning through the fasta_data once
+        // per unique name (equal to the number of scaffolds).
         // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total.
-        let mut no_more: Vec<String> = Vec::new();
         for x in uniques {
             println!("NOW WRITING DATA FOR: {:?}", &x); // X = "SUPER_1"
             let stringy = format!(">{x}\n");
             file.write_all(stringy.as_bytes())
                 .expect("Unable to write to file");
+
+            // file2 will collect what went where,
+            // with no sequence data
             file2
                 .write_all(stringy.as_bytes())
                 .expect("Unable to write to file");
@@ -179,7 +198,6 @@ pub mod tpf_fasta_mod {
                 sequence: Vec::new(),
             };
 
-            no_more.push(x.to_owned());
             x.clone_into(&mut data.name);
             for tpf in &tpf_data {
                 if tpf.new_scaffold == x {
@@ -195,6 +213,11 @@ pub mod tpf_fasta_mod {
                 }
             }
 
+            // Should be its own function really.
+            // This actually writes the new fasta file,
+            // joining the data together with a user-defined
+            // number (default = 200) of N's (the gap).
+
             let line_len: usize = 60;
             let fixed = data.sequence;
             let n_string = "N".repeat(n_length);
@@ -210,7 +233,6 @@ pub mod tpf_fasta_mod {
                 let formatted = i.to_owned() + "\n";
                 file.write_all(formatted.as_bytes()).unwrap();
             }
-            println!("NO LONG SCANNING FOR: {:?}", &no_more)
         }
     }
 
@@ -219,7 +241,7 @@ pub mod tpf_fasta_mod {
     pub fn curate_fasta(arguments: std::option::Option<&ArgMatches>) {
         //
         // Generate a curated fasta file based on the input TPF file
-        // which was generated by Pretext and the agp_to_tpf script.
+        // which was generated by Pretext and the agp_to_tpf scripts.
         // This new fasta file contains a new scaffold naming as well
         // as pieced together sequences generated by the splitting of
         // data in Pretext.
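Aside: the write-out step documented above (join the pieces for one scaffold with an N-gap, then wrap at line_len = 60) reduces to something like this sketch. The helper name and the plain-String input are assumptions for illustration, not code from the patch:

    // Join scaffold pieces with an N-gap of n_length, then wrap the
    // result at 60 characters per line.
    fn join_and_wrap(pieces: &[String], n_length: usize) -> Vec<String> {
        let joined = pieces.join(&"N".repeat(n_length));
        joined
            .as_bytes()
            .chunks(60)
            .map(|line| String::from_utf8_lossy(line).into_owned())
            .collect()
    }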
@@ -229,11 +251,14 @@ pub mod tpf_fasta_mod {
         let n_length: &usize = arguments.unwrap().get_one::<usize>("n_length").unwrap();
         let output: &String = arguments.unwrap().get_one::<String>("output").unwrap();
         println!("LET'S GET CURATING THAT FASTA!");
+
+        // Stacker is supposed to increase the stack size
+        // when it starts to run out
         stacker::maybe_grow(32 * 1024, 1024 * 5120, || {
             match validate_fasta(fasta_file) {
+                // validate_fasta returns the header-to-length map - basically an index
                 Ok(fasta_d) => {
                     let tpf_data = parse_tpf(&tpf_file);
-                    //let _validated = varify_validity(&tpf_data, &fasta_d);
 
                     //
                     // Start indexed reader of the input fasta
@@ -244,10 +269,12 @@ pub mod tpf_fasta_mod {
                     let fasta_repo = match reader {
                         Ok(data) => {
                             let adapter = IndexedReader::new(data);
+
+                            // Now read the fasta and return it as a queryable object
                             let repository = fasta::Repository::new(adapter);
                             repository
                         }
-                        Err(_) => todo!(),
+                        Err(_) => todo!(), // Probably just panic!
                     };
 
                     //
@@ -257,9 +284,16 @@ pub mod tpf_fasta_mod {
                     let mut new_fasta_data: Vec<NewFasta> = Vec::new();
                     for i in fasta_d {
+                        // For each header in fasta_d,
+                        // subset the TPF on header and length,
+                        // cross-referencing with fasta_d
                         let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1));
+
+                        // Query the fasta for scaffold = header
                         let sequence = fasta_repo.get(&i.0).transpose();
 
+                        // If it exists, get the sequence and return NewFasta records
+                        // containing the trimmed sequence
                         match sequence {
                             Ok(data) => {
                                 let subset_results = parse_seq(data, subset_tpf);
@@ -268,6 +302,7 @@ pub mod tpf_fasta_mod {
                             Err(e) => panic!("{:?}", e),
                         };
                     }
+                    // Write it all out to fasta
                     save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned())
                 }
                 Err(e) => panic!("Something is wrong with the file! | {}", e),

From 0242f4d047f682344d8ae449f604b0954d0c5123 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 23 May 2024 21:07:29 +0100
Subject: [PATCH 2/7] Documenting the small exclude seq

---
 src/exclude_seq.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/exclude_seq.rs b/src/exclude_seq.rs
index ab82c4e..fcd7973 100644
--- a/src/exclude_seq.rs
+++ b/src/exclude_seq.rs
@@ -9,8 +9,11 @@ pub mod exclude_seq_mod {
         fasta: &'a str,
         out_file: &str,
     ) -> std::result::Result<&'a str, Box<dyn Error>> {
+        // Open and read the fasta
         let reader: Result<fasta::Reader<Box<dyn BufRead>>, std::io::Error> =
             fasta::reader::Builder.build_from_path(fasta);
+
+        // Create the new file
         let file = fs::OpenOptions::new()
             .create(true)
             .append(true)
@@ -19,6 +22,8 @@ pub mod exclude_seq_mod {
 
         match reader {
             Ok(fasta) => {
+                // On Ok, append each record to the new fasta if it is
+                // not in the user-given list of headers
                 let mut binding = fasta;
                 for result in binding.records() {
                     let record = result?;

From 7bf90c477e2c47c83907a1f9ae00a0a6ffebf89c Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 23 May 2024 21:09:37 +0100
Subject: [PATCH 3/7] Documenting the generics

---
 src/generics.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index f12e80e..6f2e634 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -45,6 +45,7 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
 }
 
 fn get_gene_symbol(header: String) -> Result<String, Box<dyn Error>> {
+    // Take a header string and return its first segment
     let header_list: Vec<&str> = header.split(' ').collect();
     let record_header = header_list[0];
     Ok(record_header[1..].to_owned())
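Aside: the rule get_gene_symbol documents above (drop the leading '>' and keep the first space-delimited field) behaves like this hypothetical one-liner, shown only to make the contract concrete:

    // ">SCAFF_1 extra description" -> "SCAFF_1"
    fn first_field(header: &str) -> &str {
        header
            .trim_start_matches('>')
            .split_whitespace()
            .next()
            .unwrap_or("")
    }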
@@ -72,13 +73,16 @@ fn get_gene_symbol(header: String) -> Result<String, Box<dyn Error>> {
 }
 
 pub fn sanitise_header(old_header: &Definition) -> String {
+    // Clean the header
+    // This is overly complex for historical reasons
+    // It is still here in case those reasons come back to haunt me
+    // ...again
     let x = get_gene_symbol(old_header.to_string());
-    // Yeah i dont know either...
     match x {
         Ok(c) => c,
         Err(e) => {
-            format!("Regex isnt good enough to capture header id: {}", e)
+            format!("Split didn't work: {}", e)
         }
     }
 }

From 6fd352312077369a2971695cf93e069f4992c0e2 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 23 May 2024 21:27:28 +0100
Subject: [PATCH 4/7] Documenting the map headers

---
 src/map_headers.rs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/map_headers.rs b/src/map_headers.rs
index 2b066b0..95370a1 100644
--- a/src/map_headers.rs
+++ b/src/map_headers.rs
@@ -1,5 +1,4 @@
 pub mod mapping_headers {
-
     use clap::ArgMatches;
     use colored::Colorize;
     use std::error::Error;
@@ -51,6 +50,7 @@ pub mod mapping_headers {
             std::vec::IntoIter<String>,
         >,
     ) {
+        // Save the header mapping to file
         let f: File = File::create(output).expect("Unable to create file");
         let mut f: BufWriter<File> = BufWriter::new(f);
         for map_pair in mapped {
@@ -69,6 +69,9 @@ pub mod mapping_headers {
             std::vec::IntoIter<String>,
         >,
     ) {
+        // Swap out the old header with the new;
+        // skip everything else.
+        // This could be re-written now that I know more about noodles
         let file_reader: File = File::open(input).expect("CAN'T OPEN FILE");
         let buff_reader: BufReader<File> = BufReader::new(file_reader);
         let mut new_fasta: File = File::create(output).unwrap();
@@ -95,6 +98,8 @@ pub mod mapping_headers {
     pub fn map_fasta_head(
         arguments: std::option::Option<&ArgMatches>,
     ) -> Result<(), Box<dyn Error>> {
+        // Generate a mapped.txt with the old and new headers
+        // Generate a mapped.fasta with the new headers
         let file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
         let replacer: &String = arguments
             .unwrap()
@@ -110,19 +115,22 @@ pub mod mapping_headers {
 
         match validate_fasta(file) {
             Ok(names) => {
+                // Vec of scaffold names from validate_fasta;
+                // return only the headers, not the lengths
                 let new_names = Vec::from_iter(only_keys(names));
 
+                // Generate a Zip of the old and new names
                 let new_map: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
                     create_mapping(new_names, replacer);
 
+                // Save the mapping to file
                 let map_to_save: Zip<std::vec::IntoIter<String>, std::vec::IntoIter<String>> =
                     new_map.clone();
                 let output_file = format!("{}mapped-heads.tsv", output);
-
                 save_mapping(&output_file, map_to_save);
 
+                // Generate a new fasta with the mapped headers
                 let new_fasta: String = format!("{output}mapped.fasta");
-
                 create_mapped_fasta(file, &new_fasta, new_map);
 
                 println!(
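Aside: the mapping documented above pairs every original header with a generated replacement and keeps both halves for the mapped-heads.tsv report. A simplified sketch of the idea; the real create_mapping returns a Zip iterator, and the "{replacer}_{n}" naming here is an assumption for illustration:

    // Pair each original header with a generated replacement name.
    fn sketch_mapping(names: Vec<String>, replacer: &str) -> Vec<(String, String)> {
        names
            .into_iter()
            .enumerate()
            .map(|(i, old)| (old, format!("{replacer}_{}", i + 1)))
            .collect()
    }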
From a1af7b64ab15ace060879411d0940f9d8b749cb0 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 23 May 2024 21:34:28 +0100
Subject: [PATCH 5/7] Documentation

---
 src/split_by_count.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index 1396f00..fa91188 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -13,6 +13,8 @@ pub mod split_by_count_mod {
 
     #[allow(clippy::needless_return)]
     fn fix_head(records: Record, sanitise: bool) -> Record {
+        // Take a Record, sanitise the header and
+        // recombine into a new Record
         if sanitise {
             let header = sanitise_header(records.definition());
             let definition = fasta::record::Definition::new(header, None);
@@ -24,6 +26,7 @@ pub mod split_by_count_mod {
     }
 
     fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
+        // Take a fasta Record and append it to the output file
         println!("{}", outdir);
 
         let _data_file = File::create(outdir);
@@ -61,16 +64,20 @@ pub mod split_by_count_mod {
             fasta_file, fasta_count
         );
 
+        // Header counter
         let mut counter: u16 = 0;
        let mut file_counter: u16 = 1;
 
+        // Remove the file suffix from the file name
         let file_name: Vec<&str> = actual_name.split('.').collect();
 
+        // Open the fasta file
         let mut reader = File::open(fasta_file)
             .map(BufReader::new)
             .map(fasta::Reader::new)
             .unwrap();
 
+        // Create a Record list
         let mut record_list: Vec<Record> = Vec::new();
         for result in reader.records() {
             let record = result.unwrap();

From 748379dd7f39eb42b06bb9e946f4205ab3f77953 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Wed, 29 May 2024 16:51:30 +0100
Subject: [PATCH 6/7] Refactor of splitbycount

Most of the functional work of splitbysize is complete. Sanitise
headers just needs adding
---
 src/generics.rs       |  24 +++++++++
 src/main.rs           |  16 +++++-
 src/split_by_count.rs |  33 +++----------
 src/split_by_size.rs  | 122 ++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 163 insertions(+), 32 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index 6f2e634..36b9a19 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -1,6 +1,7 @@
 use noodles::fasta;
 use noodles::fasta::record::Definition;
 use std::error::Error;
+use std::fs::{self, File, OpenOptions};
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
 #[derive(Debug, Clone)]
@@ -86,3 +87,26 @@ pub fn sanitise_header(old_header: &Definition) -> String {
         }
     }
 }
+
+pub fn write_fasta(
+    outdir: &String,
+    file_name: String,
+    fasta_record: Vec<fasta::Record>,
+) -> std::io::Result<()> {
+    // Create the file
+    fs::create_dir_all(&outdir)?;
+    let file_path = format!("{}/{}", outdir, file_name);
+    let _data_file = File::create(&file_path);
+
+    // Append to the file
+    let file = OpenOptions::new()
+        .append(true)
+        .open(file_path)
+        .expect("creation failed");
+
+    let mut writer = fasta::Writer::new(file);
+    for i in fasta_record {
+        writer.write_record(&i).unwrap();
+    }
+    Ok(())
+}
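Aside: a hypothetical call site for the new shared helper, showing the intended contract (the output directory is created on demand, records are appended under the given file name). The record construction follows the noodles usage elsewhere in this series, but the exact Definition::new argument types are an assumption:

    use noodles::fasta::{
        self,
        record::{Definition, Sequence},
    };

    fn demo_write() -> std::io::Result<()> {
        // Illustrative only: writes one record to out_dir/example.fa
        let record = fasta::Record::new(
            Definition::new("SCAFF_1", None),
            Sequence::from(b"ACGT".to_vec()),
        );
        write_fasta(&"out_dir".to_string(), "example.fa".to_string(), vec![record])
    }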
diff --git a/src/main.rs b/src/main.rs
index 9e947f9..7a88c42 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -100,11 +100,23 @@ fn main() -> Result<(), Error> {
             )
             .arg(
                 Arg::new("mem-size")
-                    .short('s')
+                    .short('m')
                     .required(true)
-                    .value_parser(clap::value_parser!(u16))
+                    .value_parser(clap::value_parser!(usize))
                     .help("Size in MB that a fasta file is to be chunked into")
             )
+            .arg(
+                Arg::new("data_type")
+                    .short('d')
+                    .value_parser(clap::builder::PossibleValuesParser::new(split_options))
+                    .help("The data type of the input data")
+            )
+            .arg(
+                Arg::new("sanitise")
+                    .short('s')
+                    .value_parser(clap::value_parser!(bool))
+                    .help("Do we need to sanitise the headers of the input fasta")
+            )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index fa91188..f41321f 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -1,5 +1,5 @@
 pub mod split_by_count_mod {
-    use crate::generics::sanitise_header;
+    use crate::generics::{sanitise_header, write_fasta};
     use clap::ArgMatches;
     use compare::{natural, Compare};
     use noodles::fasta::{self, Record};
@@ -25,22 +25,6 @@ pub mod split_by_count_mod {
         };
     }
 
-    fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
-        // Take a fasta Record and append it to the output file
-        println!("{}", outdir);
-
-        let _data_file = File::create(outdir);
-        let file = OpenOptions::new()
-            .append(true)
-            .open(outdir)
-            .expect("creation failed");
-
-        let mut writer = fasta::Writer::new(file);
-        for i in fasta_record {
-            writer.write_record(i).unwrap();
-        }
-    }
-
     pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
         let sanitise: &bool = arguments.unwrap().get_one::<bool>("sanitise").unwrap();
         let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
@@ -57,7 +41,6 @@ pub mod split_by_count_mod {
             .unwrap();
 
         let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
-        create_dir_all(new_outpath.clone()).unwrap();
         let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
         println!(
             "Fasta file for processing: {:?}\nNumber of records per file: {:?}",
@@ -89,30 +72,28 @@ pub mod split_by_count_mod {
             let cmp = natural();
             let compared = cmp.compare(&counter, fasta_count);
             if compared == Ordering::Equal {
-                let full_outpath = format!(
-                    "{}{}_f{}_c{}-a{}.fa",
-                    new_outpath,
+                let file_name = format!(
+                    "{}_f{}_c{}-a{}.fa",
                     file_name[0],
                     file_counter,
                     &fasta_count,
                     &record_list.len()
                 );
-                write_fasta(&full_outpath, &record_list);
+                write_fasta(&new_outpath, file_name, record_list);
                 file_counter += 1;
                 counter = 0;
                 record_list = Vec::new();
             }
         }
 
-        let full_outpath = format!(
-            "{}{}_f{}_c{}-a{}.fa",
-            new_outpath,
+        let file_name = format!(
+            "{}_f{}_c{}-a{}.fa",
             file_name[0],
             file_counter,
             &fasta_count,
             &record_list.len()
         );
-        write_fasta(&full_outpath, &record_list);
+        write_fasta(&new_outpath, file_name, record_list);
     }
 }
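Aside: stripped of the file naming, the counter logic in split_file_by_count amounts to batching: flush a full batch every time the record counter reaches the requested count, then keep the final, possibly short batch. A generic sketch (hypothetical helper, not in the patch):

    // Flush a batch every `per_file` records; keep the short remainder.
    fn batch_records<T>(records: Vec<T>, per_file: usize) -> Vec<Vec<T>> {
        let mut batches: Vec<Vec<T>> = Vec::new();
        let mut current: Vec<T> = Vec::new();
        for record in records {
            current.push(record);
            if current.len() == per_file {
                batches.push(std::mem::take(&mut current));
            }
        }
        if !current.is_empty() {
            batches.push(current);
        }
        batches
    }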
diff --git a/src/split_by_size.rs b/src/split_by_size.rs
index f1b4a7b..31753b2 100644
--- a/src/split_by_size.rs
+++ b/src/split_by_size.rs
@@ -1,12 +1,126 @@
 pub mod split_by_size_mod {
+    use crate::generics::{only_keys, validate_fasta, write_fasta};
     use clap::ArgMatches;
+    use noodles::fasta;
+    use noodles::fasta::record::Definition;
+    use noodles::fasta::repository::adapters::IndexedReader;
+    use noodles::fasta::Record;
+    use std::collections::HashMap;
+    use std::path::Path;
+
+    pub fn find_chunks<'a>(
+        header_sizes: &'a HashMap<String, usize>,
+        size: &usize,
+    ) -> HashMap<u16, HashMap<&'a String, &'a usize>> {
+        //let mut new_map = HashMap::new();
+        let mut chunk = 1;
+        let mut new_map: HashMap<u16, HashMap<&String, &usize>> = HashMap::new();
+        let mut subset_map: HashMap<&String, &usize> = HashMap::new();
+        let mut temp_map: HashMap<&String, &usize> = HashMap::new();
+
+        for i in header_sizes {
+            let scaff_name = i.0;
+            let scaff_size = i.1;
+            // If the scaffold size is greater than the chunk size,
+            // output it straight away
+            if i.1 > size {
+                // Must be something cleaner for this bit
+                temp_map.insert(scaff_name, scaff_size);
+                new_map.insert(chunk, temp_map);
+
+                // Clear the HashMap
+                temp_map = HashMap::new();
+                chunk += 1;
+            // If the scaffold is not > chunk size, add it to the HashMap and
+            // scan through the HashMap, checking whether it has grown past the chunk size.
+            } else {
+                subset_map.insert(scaff_name, scaff_size);
+                // If this list sums to larger than the chunk size, then
+                // remove the last item and check again.
+                // If removing [-1] makes the total size < chunk size, write
+                // out to file and keep that [-1] in the list for
+                // the next round of chunking.
+                if subset_map.len() > 1 {
+                    let summed: usize = subset_map.values().copied().sum();
+                    if summed > size.to_owned() {
+                        subset_map.remove(scaff_name);
+                        let summed: usize = subset_map.values().copied().sum();
+                        if summed < size.to_owned() {
+                            new_map.insert(chunk, subset_map);
+                            chunk += 1;
+                        } else {
+                            println!("ERROR: MORE LOGIC NEEDED TO SPLIT UP")
+                        }
+                        subset_map = HashMap::new();
+                        subset_map.insert(scaff_name, scaff_size);
+                    }
+                }
+            }
+        }
+        new_map.insert(chunk.to_owned(), subset_map.to_owned());
+
+        new_map
+    }
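Aside: behind the HashMap bookkeeping, find_chunks is a greedy bin fill: an oversized scaffold gets a chunk of its own, and smaller scaffolds accumulate until the next one would tip the running total over the limit. A simplified sketch with hypothetical types (the real function returns nested HashMaps keyed by chunk number, and iterates a HashMap in arbitrary order):

    fn greedy_chunks(sizes: &[(String, usize)], limit: usize) -> Vec<Vec<(String, usize)>> {
        let mut chunks: Vec<Vec<(String, usize)>> = Vec::new();
        let mut current: Vec<(String, usize)> = Vec::new();
        let mut total = 0;
        for (name, len) in sizes {
            if *len > limit {
                // Oversized scaffold: a chunk of its own
                chunks.push(vec![(name.clone(), *len)]);
            } else if total + *len > limit {
                // Would overflow: close the current chunk, start a new one
                chunks.push(std::mem::take(&mut current));
                current.push((name.clone(), *len));
                total = *len;
            } else {
                current.push((name.clone(), *len));
                total += *len;
            }
        }
        if !current.is_empty() {
            chunks.push(current);
        }
        chunks
    }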
 
     pub fn split_file_by_size(arguments: std::option::Option<&ArgMatches>) {
         let fasta_file: &String = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
+        let chunk_size: &usize = arguments.unwrap().get_one::<usize>("mem-size").unwrap();
+        let data_type: &String = arguments.unwrap().get_one::<String>("data_type").unwrap();
+        let outpath: &String = arguments
+            .unwrap()
+            .get_one::<String>("output-directory")
+            .unwrap();
+
+        let path_obj = Path::new(fasta_file);
+        let grab_name = path_obj.file_name().unwrap();
+        let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect();
+        let actual_name = actual_list[0];
+
+        let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
+
         println!("Fasta file for processing: {:?}", &fasta_file);
-        println!(
-            "Size to chunk fasta into: {:?}",
-            arguments.unwrap().get_one::<u16>("mem-size").unwrap()
-        );
+        println!("Size to chunk fasta into: {:?}", &chunk_size);
+
+        let validation = validate_fasta(fasta_file);
+
+        // Deserves better error handling here
+        let results = validation.unwrap();
+
+        // Returns only the HashMap of chunk number to HashMap of header to size
+        let split_hash = find_chunks(&results, &chunk_size);
+
+        // Duplicated from TPF_FASTA
+        // Should be abstracted into generics
+        let reader = fasta::indexed_reader::Builder::default().build_from_path(fasta_file);
+        let fasta_repo = match reader {
+            Ok(data) => {
+                let adapter = IndexedReader::new(data);
+
+                // Now read the fasta and return it as a queryable object
+                let repository = fasta::Repository::new(adapter);
+                repository
+            }
+            Err(_) => todo!(), // Probably just panic!
+        };
+
+        for i in split_hash {
+            let mut record_list: Vec<Record> = Vec::new();
+            let list: Vec<&String> = only_keys(i.1.to_owned()).collect();
+            for ii in list {
+                let results = fasta_repo.get(ii).transpose();
+                let new_rec = match results {
+                    Ok(data) => {
+                        let definition = Definition::new(ii, None);
+                        Record::new(definition, data.unwrap())
+                    }
+                    Err(e) => panic!("{:?}", e),
+                };
+                record_list.push(new_rec)
+            }
+            let file_name = format!("{}_f{}_{}.fasta", actual_name, i.0, data_type);
+
+            let _ = write_fasta(&new_outpath, file_name, record_list);
+        }
+        //println!("{:?}", split_hash)
     }
 }

From 888947986d3885a93f6f25446f012055a482c371 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Thu, 30 May 2024 11:17:38 +0100
Subject: [PATCH 7/7] Linting fixes

---
 src/generics.rs       |  2 +-
 src/split_by_count.rs | 11 +++--------
 src/split_by_size.rs  |  9 ++++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/generics.rs b/src/generics.rs
index 36b9a19..ad70e17 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -94,7 +94,7 @@ pub fn write_fasta(
     fasta_record: Vec<fasta::Record>,
 ) -> std::io::Result<()> {
     // Create the file
-    fs::create_dir_all(&outdir)?;
+    fs::create_dir_all(outdir)?;
     let file_path = format!("{}/{}", outdir, file_name);
     let _data_file = File::create(&file_path);
 
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index f41321f..8bb394b 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -4,12 +4,7 @@ pub mod split_by_count_mod {
     use compare::{natural, Compare};
     use noodles::fasta::{self, Record};
     use std::cmp::Ordering;
-    use std::fs::OpenOptions;
-    use std::{
-        fs::{create_dir_all, File},
-        io::BufReader,
-        path::Path,
-    };
+    use std::{fs::File, io::BufReader, path::Path};
 
     #[allow(clippy::needless_return)]
     fn fix_head(records: Record, sanitise: bool) -> Record {
@@ -80,7 +75,7 @@ pub mod split_by_count_mod {
                     &record_list.len()
                 );
 
-                write_fasta(&new_outpath, file_name, record_list);
+                let _ = write_fasta(&new_outpath, file_name, record_list);
                 file_counter += 1;
                 counter = 0;
                 record_list = Vec::new();
@@ -94,6 +89,6 @@ pub mod split_by_count_mod {
             &fasta_count,
             &record_list.len()
         );
-        write_fasta(&new_outpath, file_name, record_list);
+        let _ = write_fasta(&new_outpath, file_name, record_list);
     }
 }
diff --git a/src/split_by_size.rs b/src/split_by_size.rs
index 31753b2..20c35ef 100644
--- a/src/split_by_size.rs
+++ b/src/split_by_size.rs
@@ -42,10 +42,10 @@ pub mod split_by_size_mod {
                 // the next round of chunking.
                 if subset_map.len() > 1 {
                     let summed: usize = subset_map.values().copied().sum();
-                    if summed > size.to_owned() {
+                    if summed > *size {
                         subset_map.remove(scaff_name);
                         let summed: usize = subset_map.values().copied().sum();
-                        if summed < size.to_owned() {
+                        if summed < *size {
                             new_map.insert(chunk, subset_map);
                             chunk += 1;
                         } else {
@@ -87,7 +87,7 @@ pub mod split_by_size_mod {
 
         let results = validation.unwrap();
 
-        let split_hash = find_chunks(&results, &chunk_size);
+        let split_hash = find_chunks(&results, chunk_size);
 
         // Duplicated from TPF_FASTA
         // Should be abstracted into generics
@@ -97,8 +97,7 @@ pub mod split_by_size_mod {
             Ok(data) => {
                 let adapter = IndexedReader::new(data);
 
                 // Now read the fasta and return it as a queryable object
-                let repository = fasta::Repository::new(adapter);
-                repository
+                fasta::Repository::new(adapter)
             }
             Err(_) => todo!(), // Probably just panic!
         };
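One closing note on the `let _ = write_fasta(...)` pattern the linting pass settles on: it silences the unused-Result warning, but it also swallows any I/O error. If the split commands should fail loudly instead, a stricter alternative (an assumption, not part of these patches) would be:

    write_fasta(&new_outpath, file_name, record_list)
        .expect("unable to write fasta");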