diff --git a/Cargo.toml b/Cargo.toml index 9a16802..e833b7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,4 +41,5 @@ log = "0.4" noodles-bam = "0.49.0" noodles-bgzf = "0.25" +noodles-csi = "0.26" noodles-sam = "0.46" diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..89e1a6c --- /dev/null +++ b/src/index.rs @@ -0,0 +1,66 @@ +use std::io; +use std::path::Path; + +use noodles_bam::{self as bam, bai}; +use noodles_csi::{self as csi, index::reference_sequence::bin::Chunk}; +use noodles_sam::{self as sam, alignment::Record}; + +fn is_coordinate_sorted(header: &sam::Header) -> bool { + use sam::header::record::value::map::header::SortOrder; + + if let Some(hdr) = header.header() { + if let Some(sort_order) = hdr.sort_order() { + return sort_order == SortOrder::Coordinate; + } + } + + false +} + +pub fn index_bam>(file: P) -> io::Result<()> { + let mut reader = bam::reader::Builder.build_from_path(file.as_ref())?; + let header = reader.read_header()?; + + if !is_coordinate_sorted(&header) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "the input BAM must be coordinate-sorted to be indexed", + )); + } + + let mut record = Record::default(); + + let mut builder = csi::index::Indexer::default(); + let mut start_position = reader.virtual_position(); + + while reader.read_record(&header, &mut record)? != 0 { + let end_position = reader.virtual_position(); + let chunk = Chunk::new(start_position, end_position); + + let alignment_context = match ( + record.reference_sequence_id(), + record.alignment_start(), + record.alignment_end(), + ) { + (Some(id), Some(start), Some(end)) => { + Some((id, start, end, !record.flags().is_unmapped())) + } + _ => None, + }; + + builder.add_record(alignment_context, chunk)?; + + start_position = end_position; + } + + let index = builder.build(header.reference_sequences().len()); + // let index_file = file.as_ref().with_extension("bai"); + + // write to index file + let stdout = io::stdout().lock(); + let mut writer = bai::Writer::new(stdout); + writer.write_header()?; + writer.write_index(&index)?; + + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index a377b83..c7531d2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ use log::LevelFilter; use std::path::PathBuf; mod extract; +mod index; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -31,6 +32,12 @@ enum Commands { #[arg(short = 'b', default_value = "false")] isbam: bool, }, + + /// Index a BAM file + Index { + /// Bam input file + input: PathBuf, + }, } fn main() { @@ -64,6 +71,11 @@ fn main() { extract::extract(readids, input, *isbam).unwrap(); } + Some(Commands::Index { input }) => { + info!("'index' {input:?} "); + index::index_bam(input).unwrap(); + } + // If no subcommand was used, it's a normal top level command None => info!("No subcommand was used"), }