From 1c1d155323f498fd49041e9e57c62749a411f5b8 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Fri, 10 Nov 2023 00:09:53 -0600 Subject: [PATCH] feat(graph): implement graph analysis --- Cargo.toml | 11 +- src/fa2fq.rs | 1 - src/graph.rs | 31 ++ src/graph/analysis.rs | 424 +++++++++++++++++++- src/graph/data.rs | 55 ++- src/graph/load.rs | 105 +++-- src/graph/vis.rs | 1 + src/main.rs | 4 +- tests/data/{cygrpah1.json => cygraph1.json} | 0 9 files changed, 557 insertions(+), 75 deletions(-) create mode 100644 src/graph/vis.rs rename tests/data/{cygrpah1.json => cygraph1.json} (100%) diff --git a/Cargo.toml b/Cargo.toml index 0816249..12335f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ documentation = "https://github.com/cauliyang/rboss" license = "MIT" description = "Rust Bioinformatics Toolbox" keywords = ["Bioinformatics", "Graph", "Toolbox"] -categories = ["Bioinformatics"] +categories = ["command-line-interface"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [profile.dev.package."*"] @@ -53,11 +53,12 @@ noodles-fasta = "0.30" noodles-fastq = "0.9" noodles-sam = "0.46" -petgraph = "0.6" -regex = "1.10" -serde_json = "1.0" +petgraph = { version = "0.6", features = ["serde-1"] } +regex = { version = "1.10" } +serde = { version = "1.0" } +serde_json = { version = "1.0" } -walkdir = "2.4" +walkdir = { version = "2.4" } # standard crate data is left out [dev-dependencies] diff --git a/src/fa2fq.rs b/src/fa2fq.rs index 2db9245..e1a1407 100644 --- a/src/fa2fq.rs +++ b/src/fa2fq.rs @@ -5,7 +5,6 @@ use std::path::Path; pub fn fa2fq>(input: P) -> Result<()> { let mut reader = fasta::reader::Builder.build_from_path(input)?; - let mut writer = fastq::Writer::new(std::io::stdout()); for result in reader.records() { diff --git a/src/graph.rs b/src/graph.rs index 9e5befe..cd60dc7 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -1,14 +1,45 @@ +use anyhow::Result; use clap::Args; use clap::ValueHint; +use std::io::BufWriter; +use std::io::Write; use std::path::PathBuf; mod analysis; mod data; mod load; +use analysis::GraphAnalysis; +use log::warn; + #[derive(Args, Debug)] pub struct GraphArgs { /// Graph input file #[arg(value_hint = ValueHint::FilePath)] input: PathBuf, } + +pub fn analyze_nlgraph(args: &GraphArgs) -> Result<()> { + let mut nlgraph = load::load_cygraph_from_file(&args.input)?; + + if !nlgraph.is_weakly_connected() { + warn!("Graph is weakly connected"); + return Ok(()); + } + + if nlgraph.is_cyclic_directed() { + warn!("Graph is cyclic"); + return Ok(()); + } + + nlgraph.node_degree(); + nlgraph.degree_centrality(); + nlgraph.closeness_centrality(); + nlgraph.local_clustering_coefficient(); + + let stdout = std::io::stdout().lock(); + let mut handle = BufWriter::new(stdout); // optional: wrap that handle in a buffer + writeln!(handle, "{}", nlgraph.to_cyjson())?; + + Ok(()) +} diff --git a/src/graph/analysis.rs b/src/graph/analysis.rs index cb8c50b..857cfae 100644 --- a/src/graph/analysis.rs +++ b/src/graph/analysis.rs @@ -1,5 +1,425 @@ // https://docs.rs/petgraph/latest/petgraph/graph/struct.Graph.html#method.node_weight + +use super::data::NLGraph; + +use log::warn; +use petgraph::algo::dijkstra; +use petgraph::graph::NodeIndex; +use petgraph::visit::IntoNeighborsDirected; +use petgraph::visit::Visitable; +use petgraph::Direction; + pub trait GraphAnalysis { - // add code here - fn node_degree(&self); + fn node_degree(&mut self); + + fn is_weakly_connected(&self) -> bool; + + fn density(&self) -> f64; + fn node_count(&self) -> usize; + fn edge_count(&self) -> usize; + + fn walk(&self); + + // centrality measures + // Degree centrality is a way of measuring the importance of a node within a graph. + // It is based on the number of links incident upon a node (i.e., the number of ties that a node has). For directed graphs, you can compute two types of degree centrality: in-degree centrality and out-degree centrality. + fn degree_centrality(&mut self); + + // Closeness centrality measures how close a node is to all other nodes in the network. + // It is calculated as the reciprocal of the sum of the length of the shortest paths between the node and all other nodes in the graph. + // Thus, the more central a node is, the closer it is to all other nodes. + fn closeness_centrality(&mut self); + + // fn eigenvector_centrality(&mut self, tolerance: f64, max_iter: usize); + + // // local clustering measures + fn local_clustering_coefficient(&mut self); + + fn is_cyclic_directed(&self) -> bool; + + fn vis(&self); + + fn to_cyjson(&self) -> String; +} + +impl GraphAnalysis for NLGraph { + fn node_degree(&mut self) { + for ind in self.node_indices() { + let indegree = self.neighbors_directed(ind, petgraph::Incoming).count(); + let outdegree = self.neighbors_directed(ind, petgraph::Outgoing).count(); + + let node = self.node_weight_mut(ind).unwrap(); + node.indegree = indegree; + node.outdegree = outdegree; + } + } + + fn is_weakly_connected(&self) -> bool { + let mut visited = vec![false; self.node_count()]; + let mut stack = vec![]; + let node_index = self.node_indices().next().unwrap(); + stack.push(node_index.index()); + visited[node_index.index()] = true; + while stack.pop().is_some() { + for neighbor in self.neighbors(node_index) { + if !visited[neighbor.index()] { + visited[neighbor.index()] = true; + stack.push(neighbor.index()); + } + } + } + visited.len() == self.node_count() + } + + fn density(&self) -> f64 { + let n = self.node_count(); + let m = self.edge_count(); + 2.0 * m as f64 / (n * (n - 1)) as f64 + } + + fn node_count(&self) -> usize { + self.node_count() + } + + fn edge_count(&self) -> usize { + self.edge_count() + } + + fn walk(&self) { + let mut paths = Vec::new(); // To hold all paths + let start_nodes = self.externals(Direction::Incoming).collect::>(); // Nodes with in-degree == 0 + + // Recursive DFS function + fn dfs_visit( + graph: &NLGraph, + node: NodeIndex, + current_path: &mut Vec, + paths: &mut Vec>, + ) where + NLGraph: IntoNeighborsDirected + Visitable, + { + // Visit the current node + current_path.push(node); + + // Check if it's an ending node (out-degree == 0) + if graph.neighbors_directed(node, Direction::Outgoing).count() == 0 { + // Save the path + paths.push(current_path.clone()); + } else { + // Continue with unvisited neighbors + for neighbor in graph.neighbors_directed(node, Direction::Outgoing) { + dfs_visit(graph, neighbor, current_path, paths); + } + } + + // Backtrack + current_path.pop(); + } + + // Start the DFS from each starting node + for &start_node in &start_nodes { + let mut current_path = Vec::new(); + dfs_visit(&self, start_node, &mut current_path, &mut paths); + } + + // Output all paths + for path in &paths { + println!("{:?}", path); + } + } + + fn degree_centrality(&mut self) { + // The normalization factor is based on the number of nodes minus 1 + // It's the maximum possible degree of any node + let normalization = (self.node_count() - 1) as f32; + + // Iterate over all nodes to compute in-degree and out-degree centrality + for node_ind in self.node_indices() { + // Compute in-degree and out-degree for the node + let in_degree = self + .neighbors_directed(node_ind, Direction::Incoming) + .count() as f32; + let out_degree = self + .neighbors_directed(node_ind, Direction::Outgoing) + .count() as f32; + + let node_data = self.node_weight_mut(node_ind).unwrap(); + node_data.in_degree_centrality = in_degree / normalization; + node_data.out_degree_centrality = out_degree / normalization; + } + } + + fn closeness_centrality(&mut self) { + if !self.is_weakly_connected() { + warn!("Graph is not weakly connected, skipping closeness centrality calculation"); + return; + } + + let node_count = self.node_count() as f32; + + // Compute shortest paths from each node to all other nodes + for node_ind in self.node_indices() { + let shortest_paths = dijkstra(&*self, node_ind, None, |_| 1); + + // Calculate the sum of the shortest paths to all other nodes + let total_distance: usize = shortest_paths.values().sum(); + let total_distance = total_distance as f32; + + // The closeness centrality for the node is the inverse of the total distance + // If a node is disconnected (total_distance is 0), its centrality is 0 + let centrality = if total_distance > 0.0 { + (node_count - 1.0) / total_distance + } else { + 0.0 + }; + + let node_data = self.node_weight_mut(node_ind).unwrap(); + node_data.clostness_centrality = centrality; + } + } + + fn local_clustering_coefficient(&mut self) { + for node in self.node_indices() { + // Get all the successors and predecessors of the node + let successors: Vec = + self.neighbors_directed(node, Direction::Outgoing).collect(); + let predecessors: Vec = + self.neighbors_directed(node, Direction::Incoming).collect(); + + // Count the number of edges between successors and predecessors + let mut edges_between_neighbors = 0; + for &successor in &successors { + for &predecessor in &predecessors { + if self.contains_edge(predecessor, successor) { + edges_between_neighbors += 1; + } + } + } + + // The number of possible edges between successors and predecessors + let possible_edges = successors.len() * predecessors.len(); + + // Calculate the local clustering coefficient for the node + let coefficient = if possible_edges > 0 { + edges_between_neighbors as f32 / possible_edges as f32 + } else { + 0.0 // If there are no successors or predecessors, the coefficient is 0. + }; + + let node_data = self.node_weight_mut(node).unwrap(); + // Store the coefficient for the node + node_data.local_clustering_coefficient = coefficient; + } + } + + fn is_cyclic_directed(&self) -> bool { + use petgraph::algo::is_cyclic_directed; + is_cyclic_directed(self) + } + + fn to_cyjson(&self) -> String { + let mut nlgraph_json = serde_json::to_value(self).unwrap(); + // { + // "data": [], + // "directed": true, + // "multigraph": true, + // "elements": { + // "nodes": [ + // { + // "data": { + // "chrom": "chr1", + // "ref_start": 154220171, + // "ref_end": 154261697, + // "strand": "+", + // "is_head": true, + // "id": "chr1_154220171_154261697_H+", + // "value": "chr1_154220171_154261697_H+", + // "name": "chr1_154220171_154261697_H+" + // } + // }, + // { + // "data": { + // "chrom": "chr2", + // "ref_start": 80617598, + // "ref_end": 80666408, + // "strand": "-", + // "is_head": false, + // "id": "chr2_80617598_80666408_T-", + // "value": "chr2_80617598_80666408_T-", + // "name": "chr2_80617598_80666408_T-" + // } + // } + // ], + // "edges": [ + // { + // "data": { + // "label": "TRA_(False, MicroHomology(G))_1", + // "weight": 1, + // "read_ids": [ + // "m64135_201204_204719/97059215/ccs" + // ], + // "source": "chr1_154220171_154261697_H+", + // "target": "chr2_80617598_80666408_T-", + // "key": 0 + // } + // } + // ] + // } + // } + + let mut cy = serde_json::json!({ + "desinty": self.density(), + "data": [], + "directed": true, + "multigraph": true, + "elements": { + "nodes": [], + "edges": [] + } + }); + + let d = cy["elements"]["nodes"].as_array_mut().unwrap(); + if let serde_json::Value::Array(nl_nodes) = nlgraph_json["nodes"].take() { + for node_props in nl_nodes { + let node = serde_json::json!({ "data": node_props }); + d.push(node); + } + } + + let e = cy["elements"]["edges"].as_array_mut().unwrap(); + + if let serde_json::Value::Array(nl_edges) = nlgraph_json["edges"].take() { + for edge_props in nl_edges { + // remove first and second element of edge_props + let data = edge_props.as_array(); + + let edge_data = data.iter().last().unwrap().iter().last().unwrap(); + let edge = serde_json::json!({ "data": edge_data}); + e.push(edge); + } + } + + serde_json::to_string_pretty(&cy).unwrap() + } + + fn vis(&self) { + todo!() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + const DATA: &str = r#" + { + "data": [], + "directed": true, + "multigraph": true, + "elements": { + "nodes": [ + { + "data": { + "chrom": "chr1", + "ref_start": 154220171, + "ref_end": 154261697, + "strand": "+", + "is_head": true, + "id": "chr1_154220171_154261697_H+", + "value": "chr1_154220171_154261697_H+", + "name": "chr1_154220171_154261697_H+" + } + }, + { + "data": { + "chrom": "chr2", + "ref_start": 80617598, + "ref_end": 80666408, + "strand": "-", + "is_head": false, + "id": "chr2_80617598_80666408_T-", + "value": "chr2_80617598_80666408_T-", + "name": "chr2_80617598_80666408_T-" + } + } + ], + "edges": [ + { + "data": { + "label": "TRA_(False, MicroHomology(G))_1", + "weight": 1, + "read_ids": ["m64135_201204_204719/97059215/ccs"], + "source": "chr1_154220171_154261697_H+", + "target": "chr2_80617598_80666408_T-", + "key": 0 + } + } + ] + } + }"#; + + fn load_nlgraph() -> NLGraph { + use crate::graph::load::load_cygraph_from_json; + let mut nlgraph = load_cygraph_from_json(serde_json::from_str(DATA).unwrap()).unwrap(); + + nlgraph.node_degree(); + nlgraph + } + + #[test] + fn test_node_degree() { + let _nlgraph = load_nlgraph(); + } + + #[test] + fn test_is_weakly_connected() { + let nlgraph = load_nlgraph(); + assert!(nlgraph.is_weakly_connected()); + } + + #[test] + fn test_walk() { + let nlgraph = load_nlgraph(); + nlgraph.walk(); + } + + #[test] + fn test_desinty() { + let nlgraph = load_nlgraph(); + assert_eq!(nlgraph.density(), 1.0); + } + + #[test] + fn test_closeness_centrality() { + let mut nlgraph = load_nlgraph(); + nlgraph.closeness_centrality(); + } + + #[test] + fn test_degree_centrality() { + let mut nlgraph = load_nlgraph(); + nlgraph.degree_centrality(); + } + + #[test] + fn test_local_clustering_coefficient() { + let mut nlgraph = load_nlgraph(); + nlgraph.local_clustering_coefficient(); + } + + #[test] + fn test_to_json() { + let nlgraph = load_nlgraph(); + + let s = serde_json::to_string_pretty(&nlgraph); + let sg: NLGraph = serde_json::from_str(s.as_ref().unwrap()).unwrap(); + println!("{:?}", sg); + } + + #[test] + fn test_cyjson() { + let nlgraph = load_nlgraph; + let _cyjson = nlgraph().to_cyjson(); + println!("{}", _cyjson); + } } diff --git a/src/graph/data.rs b/src/graph/data.rs index 45e1a50..93977f2 100644 --- a/src/graph/data.rs +++ b/src/graph/data.rs @@ -1,8 +1,14 @@ +use petgraph::Graph; use serde_json::Value; use std::str::FromStr; -#[derive(Debug)] +pub type NLGraph = Graph; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default)] pub enum Strand { + #[default] Positive, Negative, } @@ -16,6 +22,7 @@ impl Strand { } } + #[allow(dead_code)] pub fn is_reverse(&self) -> bool { match self { Strand::Positive => false, @@ -35,7 +42,7 @@ impl FromStr for Strand { } } -#[derive(Debug)] +#[derive(Debug, Default, Serialize, Deserialize)] pub struct NodeData { pub id: String, pub label: String, @@ -44,9 +51,18 @@ pub struct NodeData { pub ref_end: u64, pub strand: Strand, pub is_head: bool, + // node attributes + pub indegree: usize, + pub outdegree: usize, + pub in_degree_centrality: f32, + pub out_degree_centrality: f32, + pub clostness_centrality: f32, + + // local clustering measures + pub local_clustering_coefficient: f32, } -#[derive(Debug)] +#[derive(Debug, Default, Serialize, Deserialize)] pub struct EdgeData { pub label: String, pub weight: u64, @@ -57,9 +73,11 @@ pub struct EdgeData { impl NodeData { pub fn from_json(node_data: &Value) -> Self { - // node: Object {"data": Object {"chrom": String("chr1"), - // "id": String("chr1_154220171_154261697_H+" - // ), "is_head": Bool(true), "name": String("chr1_154220171_154261697_H+"), + // node: Object {"data": Object { + // "chrom": String("chr1"), + // "id": String("chr1_154220171_154261697_H+"), + // "is_head": Bool(true), + // "name": String("chr1_154220171_154261697_H+"), // "ref_end": Number(154261697), // "ref_start": Number(154220171), // "strand": String("+"), @@ -73,6 +91,7 @@ impl NodeData { let ref_end = node.get("ref_end").unwrap().as_u64().unwrap(); let strand = Strand::new(node.get("strand").unwrap().as_str().unwrap()); let is_head = node.get("is_head").unwrap().as_bool().unwrap(); + Self { id, label, @@ -81,18 +100,19 @@ impl NodeData { ref_end, strand, is_head, + ..Default::default() } } } impl EdgeData { pub fn from_json(edge_data: &Value) -> Self { - // Object {"data": Object {"key": Number(0), - // "label": String("TRA_(False, MicroHomology(G))_1"), - // "read_ids": Array [String("m64135_201204_204719/97059215/ccs")], - // "source": String("chr1_154220171_154261697_H+"), - // "target": String("chr2_80617598_80666408_T-"), - // "weight": Number(1)}} + // Object {"data": Object {"key": Number(0), + // "label": String("TRA_(False, MicroHomology(G))_1"), + // "read_ids": Array [String("m64135_201204_204719/97059215/ccs")], + // "source": String("chr1_154220171_154261697_H+"), + // "target": String("chr2_80617598_80666408_T-"), + // "weight": Number(1)}} let edge = edge_data.get("data").unwrap(); let label = edge.get("label").unwrap().as_str().unwrap().to_string(); @@ -118,3 +138,14 @@ impl EdgeData { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_strand() { + let strand = Strand::new("+"); + assert!(!strand.is_reverse()); + } +} diff --git a/src/graph/load.rs b/src/graph/load.rs index 7b9369d..1bfbf70 100644 --- a/src/graph/load.rs +++ b/src/graph/load.rs @@ -1,27 +1,26 @@ use anyhow::Result; use log::info; -use petgraph::Graph; use serde_json::Value; use std::path::Path; use std::collections::HashMap; -use crate::graph::data::{EdgeData, NodeData}; +use crate::graph::data::{EdgeData, NLGraph, NodeData}; -pub fn load_cygraph_from_file>(file: P) -> Result> { +pub fn load_cygraph_from_file>(file: P) -> Result { let reader = std::io::BufReader::new(std::fs::File::open(file.as_ref())?); let data: Value = serde_json::from_reader(reader)?; load_cygraph_from_json(data) } -pub fn load_cygraph_from_json(data: Value) -> Result> { +pub fn load_cygraph_from_json(data: Value) -> Result { let nodes = data.get("elements").unwrap().get("nodes").unwrap(); let edges = data.get("elements").unwrap().get("edges").unwrap(); let node_number = nodes.as_array().unwrap().len(); let edge_number = edges.as_array().unwrap().len(); - let mut graph = Graph::::with_capacity(node_number, edge_number); + let mut graph = NLGraph::with_capacity(node_number, edge_number); let mut id2index = HashMap::new(); @@ -36,7 +35,7 @@ pub fn load_cygraph_from_json(data: Value) -> Result> let edge_data = EdgeData::from_json(edge); let source = id2index.get(&edge_data.source).unwrap(); let target = id2index.get(&edge_data.target).unwrap(); - let index = graph.add_edge(*source, *target, edge_data); + let _index = graph.add_edge(*source, *target, edge_data); } info!("Added {} nodes", graph.node_count()); @@ -48,55 +47,55 @@ pub fn load_cygraph_from_json(data: Value) -> Result> #[cfg(test)] mod tests { use super::*; + const DATA: &str = r#" + { + "data": [], + "directed": true, + "multigraph": true, + "elements": { + "nodes": [ + { + "data": { + "chrom": "chr1", + "ref_start": 154220171, + "ref_end": 154261697, + "strand": "+", + "is_head": true, + "id": "chr1_154220171_154261697_H+", + "value": "chr1_154220171_154261697_H+", + "name": "chr1_154220171_154261697_H+" + } + }, + { + "data": { + "chrom": "chr2", + "ref_start": 80617598, + "ref_end": 80666408, + "strand": "-", + "is_head": false, + "id": "chr2_80617598_80666408_T-", + "value": "chr2_80617598_80666408_T-", + "name": "chr2_80617598_80666408_T-" + } + } + ], + "edges": [ + { + "data": { + "label": "TRA_(False, MicroHomology(G))_1", + "weight": 1, + "read_ids": ["m64135_201204_204719/97059215/ccs"], + "source": "chr1_154220171_154261697_H+", + "target": "chr2_80617598_80666408_T-", + "key": 0 + } + } + ] + } + }"#; #[test] fn test_loadcygraph() { - let data = r#" - { - "data": [], - "directed": true, - "multigraph": true, - "elements": { - "nodes": [ - { - "data": { - "chrom": "chr1", - "ref_start": 154220171, - "ref_end": 154261697, - "strand": "+", - "is_head": true, - "id": "chr1_154220171_154261697_H+", - "value": "chr1_154220171_154261697_H+", - "name": "chr1_154220171_154261697_H+" - } - }, - { - "data": { - "chrom": "chr2", - "ref_start": 80617598, - "ref_end": 80666408, - "strand": "-", - "is_head": false, - "id": "chr2_80617598_80666408_T-", - "value": "chr2_80617598_80666408_T-", - "name": "chr2_80617598_80666408_T-" - } - } - ], - "edges": [ - { - "data": { - "label": "TRA_(False, MicroHomology(G))_1", - "weight": 1, - "read_ids": ["m64135_201204_204719/97059215/ccs"], - "source": "chr1_154220171_154261697_H+", - "target": "chr2_80617598_80666408_T-", - "key": 0 - } - } - ] - } - }"#; - load_cygraph_from_json(serde_json::from_str(data).unwrap()).unwrap(); + load_cygraph_from_json(serde_json::from_str(DATA).unwrap()).unwrap(); } } diff --git a/src/graph/vis.rs b/src/graph/vis.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/graph/vis.rs @@ -0,0 +1 @@ + diff --git a/src/main.rs b/src/main.rs index 9d349e9..8403db5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,7 +69,7 @@ enum Commands { input: PathBuf, }, - /// Create softlinks to files with suffix recursively + /// Create soft links to files with suffix recursively Rsoft { /// The directory to search #[arg(value_hint = ValueHint::FilePath)] @@ -78,7 +78,6 @@ enum Commands { /// The directory to create the softlinks. default is current directory #[arg(short = 't', value_hint = ValueHint::FilePath)] target: Option, - /// The suffix of the files to link. default is all files #[arg(short = 's', value_delimiter = ' ', num_args=1..)] suffix: Option>, @@ -161,6 +160,7 @@ fn main() { Some(Commands::Graph(args)) => { info!("'graph' {args:?} "); + graph::analyze_nlgraph(args).unwrap(); } // If no subcommand was used, it's a normal top level command diff --git a/tests/data/cygrpah1.json b/tests/data/cygraph1.json similarity index 100% rename from tests/data/cygrpah1.json rename to tests/data/cygraph1.json