diff --git a/Cargo.lock b/Cargo.lock
index 6249461..7630133 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -236,6 +236,12 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "compare"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3"
+
 [[package]]
 name = "const_format"
 version = "0.2.32"
@@ -405,10 +411,11 @@ dependencies = [
 
 [[package]]
 name = "fasta_manipulation"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "clap",
  "colored",
+ "compare",
  "csv",
  "io",
  "noodles",
diff --git a/Cargo.toml b/Cargo.toml
index ad77b8f..4c55605 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fasta_manipulation"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -8,6 +8,7 @@ edition = "2021"
 [dependencies]
 clap = { version = "4.4.4", features = ["cargo"] }
 colored = "2.0.4"
+compare = "0.1.0"
 csv = "1.3.0"
 io = "0.0.2"
 noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }
diff --git a/README.md b/README.md
index ae3e739..79c7001 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,9 @@ Currently, this program has the following arguments:
     This command will generate a directory of files made up of a user given number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner.
     The count will be the upper limit, as there will be a left over number of records.
 
-    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}`
+    This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa`
+
+    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
 
 -   split_by_size (NOT YET WRITTEN)
 
@@ -59,5 +61,10 @@ Currently, this program has the following arguments:
 
     -   GC percentage per scaffold + counts
     -   GC percentage whole genome
+    -   N50 and N90
+    -   L50
+    -   GAP count and length (summary with average length)
+
+    `profile -f input.fasta -o outdir`
 
 If there are other options that would be useful to any other teams, leave a message or issue.
diff --git a/src/generics.rs b/src/generics.rs
index 2811f50..f10b880 100644
--- a/src/generics.rs
+++ b/src/generics.rs
@@ -1,4 +1,5 @@
 use noodles::fasta;
+use noodles::fasta::record::Definition;
 use std::error::Error;
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
@@ -41,3 +42,42 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
     // Take a HashMap and return a Key only Vec
     map.into_iter().map(|(k, _v)| k)
 }
+
+fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
+    let header_list: Vec<&str> = header.split(' ').collect();
+    let record_header = header_list[0];
+    Ok(record_header[1..].to_owned())
+    // let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
+
+    // let first_run = re.captures(&header).ok_or("None")?;
+
+    // if first_run[0] == "None".to_owned() {
+    //     let re = Regex::new(r"symbol:(\S+)").unwrap();
+    //     let second_run = re.captures(&header).ok_or("None")?;
+    //     if second_run[0] == "None".to_owned() {
+    //         let re = Regex::new(r"(\(\S+\)) gene").unwrap();
+    //         let third_run = re.captures(&header).ok_or("None")?;
+    //         if third_run[0] == "None".to_owned() {
+    //             Ok("NOCAPTUREDRESULT".to_string())
+    //         } else {
+    //             Ok(third_run[0].to_string())
+    //         }
+    //     } else {
+    //         Ok(second_run[0].to_string())
+    //     }
+    // } else {
+    //     Ok(first_run[0].to_string())
+    // }
+}
+
+pub fn sanitise_header(old_header: &Definition) -> String {
+    let x = get_gene_symbol(old_header.to_string());
+
+    // Yeah i dont know either...
+    match x {
+        Ok(c) => c,
+        Err(e) => {
+            format!("Regex isnt good enough to capture header id: {}", e)
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index aedc212..9e947f9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -30,6 +30,7 @@ mod exclude_seq;
 use crate::exclude_seq::exclude_seq_mod::filter_fasta;
 
 fn main() -> Result<(), Error> {
+    let split_options = ["pep", "cds", "cdna", "rna", "other"];
     let match_result = command!()
     .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project")
     .subcommand(
@@ -45,14 +46,11 @@ fn main() -> Result<(), Error> {
                     .short('v')
                     .value_parser(clap::value_parser!(bool))
                     .default_value("false")
-                    .required(false)
                     .help("Print explainers as to why validation fails, if it does fail")
             )
             .arg(
                 Arg::new("output")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("Output the log to file")
             )
@@ -63,23 +61,30 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
-                    .help("The output directory that files will be placed in")
+                    .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa")
+            )
+            .arg(
+                Arg::new("data_type")
+                    .short('d')
+                    .value_parser(clap::builder::PossibleValuesParser::new(split_options))
+                    .help("The data type of the input data")
+            )
+            .arg(
+                Arg::new("sanitise")
+                    .short('s')
+                    .value_parser(clap::value_parser!(bool))
+                    .help("Do we need to sanitise the headers of the input fasta")
             )
             .arg(
                 Arg::new("count")
                     .short('c')
-                    .long("file-count")
-                    .aliases(["count"])
                     .value_parser(clap::value_parser!(u16))
                     .help("How many sequences per file")
             )
@@ -90,14 +95,12 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("mem-size")
                     .short('s')
-                    .long("mem-size")
                     .required(true)
                     .value_parser(clap::value_parser!(u16))
                     .help("Size in MB that a fasta file is to be chunked into")
@@ -105,35 +108,45 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("The output directory that files will be placed in")
             )
     )
+    .subcommand(
+        Command::new("geneset_csvs")
+            .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal")
+            .arg(
+                Arg::new("geneset_dir")
+                    .short('d')
+                    .required(true)
+                    .help("The path to the top level directory of your geneset directory.")
+            )
+            .arg(
+                Arg::new("specifiy_clade")
+                    .short('c')
+                    .required(true)
+                    .default_value("ALL")
+                    .help("Specify the clade folder to refresh")
+            )
+    )
     .subcommand(
         Command::new("mapheaders")
             .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers")
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./")
                     .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
             )
             .arg(
                 Arg::new("replace-with")
                     .short('r')
-                    .aliases(["replacement"])
-                    .required(false)
                     .default_value("FMMH")
                     .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'")
             )
@@ -144,22 +157,18 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("fasta-file")
                     .short('f')
-                    .aliases(["fasta"])
                     .required(true)
                     .help("A path to a valid fasta file.")
             )
             .arg(
                 Arg::new("output-directory")
                     .short('o')
-                    .aliases(["out"])
-                    .required(false)
                     .default_value("./new")
                     .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta")
             )
             .arg(
                 Arg::new("map-file")
                     .short('m')
-                    .aliases(["mapped-header-file"])
                     .required(true)
                     .help("The original mapped header field, a TSV of old-header, new-header")
             )
@@ -170,15 +179,13 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-file")
                 .short('f')
-                .aliases(["fsata"])
                 .required(true)
                 .help("The input fasta file for profiling")
         )
         .arg(
             Arg::new("output-dir")
                 .short('o')
-                .aliases(["outdir"])
-                .default_value("FASTMAN-out")
+                .default_value("FasMan-out")
                 .help("The input fasta file for profiling")
         )
     )
@@ -188,21 +195,18 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta")
                 .short('f')
-                .aliases(["fasta"])
                 .required(true)
                 .help("The input fasta file for re-organising")
         )
         .arg(
             Arg::new("tpf")
                 .short('t')
-                .aliases(["tpf file"])
                 .required(true)
                 .help("The TPF file used to re-organise the input fasta")
         )
         .arg(
             Arg::new("sort")
                 .short('s')
-                .required(false)
                 .value_parser(clap::value_parser!(bool))
                 .default_value("false")
                 .help("Size sort the output or leave as order in AGP")
@@ -210,14 +214,11 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("output")
                 .short('o')
-                .aliases(["out"])
-                .required(false)
                 .default_value("new.fasta")
                 .help("The output name of the new fasta file")
         )
         .arg(
             Arg::new("n_length")
-                .aliases(["n_len"])
                 .value_parser(clap::value_parser!(usize))
                 .default_value("200")
                 .help("Length that the N (gap) string should be.")
@@ -229,7 +230,6 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-file")
                 .short('f')
-                .aliases(["fsata"])
                 .required(true)
                 .help("The input fasta file for profiling")
         )
@@ -237,8 +237,6 @@ fn main() -> Result<(), Error> {
             Arg::new("random")
                 .short('r')
                 .value_parser(clap::value_parser!(bool))
-                .default_value("false")
-                .aliases(["random"])
                 .help("Random subset of input file. Default skims the first X given percent")
         )
         .arg(
@@ -246,7 +244,6 @@ fn main() -> Result<(), Error> {
                 .short('p')
                 .value_parser(clap::value_parser!(u16))
                 .default_value("50")
-                .aliases(["proportion"])
                 .help("Percentage of the original file entries that should be retained")
         )
     )
@@ -262,14 +259,12 @@ fn main() -> Result<(), Error> {
             .arg(
                 Arg::new("output")
                     .short('o')
-                    .required(false)
                     .default_value("FiilteredFasta.fa")
                     .help("The outfile naming")
             )
             .arg(
                 Arg::new("filter_list")
                     .short('l')
-                    .required(false)
                     .help("A string comma-separated list of sequence names to exclude from the final fasta")
             )
     )
@@ -279,30 +274,24 @@ fn main() -> Result<(), Error> {
         .arg(
             Arg::new("fasta-1")
                 .short('p')
-                .aliases(["primary-fasta"])
                 .required(true)
                 .help("The input fasta file for re-organising")
         )
         .arg(
             Arg::new("fasta-2")
                 .short('s')
-                .aliases(["secondary-fasta"])
                 .required(true)
                 .help("The second input fasta file")
         )
         .arg(
             Arg::new("naming")
                 .short('s')
-                .aliases(["naming"])
-                .required(false)
                 .default_value("PRI/HAP")
                 .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output")
         )
         .arg(
             Arg::new("output")
                 .short('o')
-                .aliases(["output"])
-                .required(false)
                 .default_value("merged")
                 .help("Output file prefix")
         )
@@ -310,17 +299,13 @@ fn main() -> Result<(), Error> {
     .get_matches();
 
     println! {
-        "{}\n{}\n{}",
+        "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}",
         "WELCOME TO Fasta Manipulator".bold(),
         "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(),
-        "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple()
-    };
-
-    println!(
-        "RUNNING : {:?} : SUBCOMMAND\nRUNNING ON: {:?}",
+        "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(),
         match_result.subcommand_name().unwrap(),
         env::consts::OS
-    );
+    };
 
     match match_result.subcommand_name() {
         Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")),
diff --git a/src/split_by_count.rs b/src/split_by_count.rs
index 495f5af..1396f00 100644
--- a/src/split_by_count.rs
+++ b/src/split_by_count.rs
@@ -1,41 +1,111 @@
 pub mod split_by_count_mod {
+    use crate::generics::sanitise_header;
     use clap::ArgMatches;
+    use compare::{natural, Compare};
+    use noodles::fasta::{self, Record};
+    use std::cmp::Ordering;
+    use std::fs::OpenOptions;
     use std::{
-        fs::File,
-        io::{BufRead, BufReader},
+        fs::{create_dir_all, File},
+        io::BufReader,
+        path::Path,
     };
 
+    #[allow(clippy::needless_return)]
+    fn fix_head(records: Record, sanitise: bool) -> Record {
+        if sanitise {
+            let header = sanitise_header(records.definition());
+            let definition = fasta::record::Definition::new(header, None);
+            let seq = records.sequence().to_owned();
+            return fasta::Record::new(definition, seq);
+        } else {
+            return records.to_owned();
+        };
+    }
+
+    fn write_fasta(outdir: &String, fasta_record: &Vec<Record>) {
+        println!("{}", outdir);
+
+        let _data_file = File::create(outdir);
+        let file = OpenOptions::new()
+            .append(true)
+            .open(outdir)
+            .expect("creation failed");
+
+        let mut writer = fasta::Writer::new(file);
+        for i in fasta_record {
+            writer.write_record(i).unwrap();
+        }
+    }
+
     pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) {
+        let sanitise: &bool = arguments.unwrap().get_one::<bool>("sanitise").unwrap();
         let fasta_file = arguments.unwrap().get_one::<String>("fasta-file").unwrap();
+        let path_obj = Path::new(fasta_file);
+        let grab_name = path_obj.file_name().unwrap();
+        let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect();
+        let actual_name = actual_list[0];
+
+        let data_type = arguments.unwrap().get_one::<String>("data_type").unwrap();
+
+        let outpath = arguments
+            .unwrap()
+            .get_one::<String>("output-directory")
+            .unwrap();
+
+        let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type);
+        create_dir_all(new_outpath.clone()).unwrap();
         let fasta_count = arguments.unwrap().get_one::<u16>("count").unwrap();
-        println!("Fasta file for processing: {:?}", fasta_file);
-        println!("{:?}", &fasta_count);
         println!(
-            "Number of sequence-header pairs per file: {:?}",
-            fasta_count
+            "Fasta file for processing: {:?}\nNumber of records per file: {:?}",
+            fasta_file, fasta_count
         );
 
-        let chunk_val = *fasta_count;
-        let mut counter = 0;
-        let mut global_counter = 0;
-
-        let input = File::open(fasta_file).expect("CANT OPEN FASTA");
-        let buffered = BufReader::new(input);
-
-        for line in buffered.lines() {
-            if counter != chunk_val {
-                if line.expect("NO LINES IN FASTA").starts_with('>') {
-                    println!("header");
-                } else {
-                    println!("Sequence");
-                    counter += 1;
-                    global_counter += 1;
-                }
-            } else {
+        let mut counter: u16 = 0;
+        let mut file_counter: u16 = 1;
+
+        let file_name: Vec<&str> = actual_name.split('.').collect();
+
+        let mut reader = File::open(fasta_file)
+            .map(BufReader::new)
+            .map(fasta::Reader::new)
+            .unwrap();
+
+        let mut record_list: Vec<Record> = Vec::new();
+        for result in reader.records() {
+            let record = result.unwrap();
+            counter += 1;
+
+            let final_rec = fix_head(record, *sanitise);
+            record_list.push(final_rec);
+
+            let cmp = natural();
+            let compared = cmp.compare(&counter, fasta_count);
+            if compared == Ordering::Equal {
+                let full_outpath = format!(
+                    "{}{}_f{}_c{}-a{}.fa",
+                    new_outpath,
+                    file_name[0],
+                    file_counter,
+                    &fasta_count,
+                    &record_list.len()
+                );
+
+                write_fasta(&full_outpath, &record_list);
+                file_counter += 1;
                 counter = 0;
-                println!("CHUNK");
+                record_list = Vec::new();
             }
         }
-        println!("Total number of pairs: {:?}", global_counter);
+
+        let full_outpath = format!(
+            "{}{}_f{}_c{}-a{}.fa",
+            new_outpath,
+            file_name[0],
+            file_counter,
+            &fasta_count,
+            &record_list.len()
+        );
+        write_fasta(&full_outpath, &record_list);
     }
 }