Skip to content

Commit

Permalink
ENH: node/edge metadata skeleton marbl#243
Browse files Browse the repository at this point in the history
Nice; I realize that duplicate node names are going to interfere with
this, so I want to go through and double-check that the parsers all
reject graphs with duplicate node names.

[ci skip]
  • Loading branch information
fedarko committed May 17, 2023
1 parent f7e0574 commit ae92aaa
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 4 deletions.
26 changes: 25 additions & 1 deletion metagenomescope/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
from ._param_descriptions import (
INPUT,
OUTPUT_DIR,
OUTPUT_CCSTATS,
OUTPUT_DOT,
OUTPUT_CCSTATS,
NODE_METADATA,
EDGE_METADATA,
MAXN,
MAXE,
PATTERNS_FLAG,
Expand Down Expand Up @@ -71,6 +73,24 @@
show_default=True,
help=OUTPUT_CCSTATS,
)
@click.option(
"-n",
"--node-metadata",
type=click.Path(exists=True, dir_okay=False, readable=True),
required=False,
default=None,
show_default=True,
help=NODE_METADATA,
)
@click.option(
"-e",
"--edge-metadata",
type=click.Path(exists=True, dir_okay=False, readable=True),
required=False,
default=None,
show_default=True,
help=EDGE_METADATA,
)
@click.option(
"-maxn",
"--max-node-count",
Expand Down Expand Up @@ -102,6 +122,8 @@ def run_script(
output_viz_dir: str,
output_dot: str,
output_ccstats: str,
node_metadata: str,
edge_metadata: str,
max_node_count: int,
max_edge_count: int,
patterns: bool,
Expand All @@ -122,6 +144,8 @@ def run_script(
output_viz_dir,
output_dot,
output_ccstats,
node_metadata,
edge_metadata,
)


Expand Down
14 changes: 14 additions & 0 deletions metagenomescope/_param_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,20 @@
"raised. THIS OPTION ISN'T FINISHED YET!"
)

# Help text shown for the -n / --node-metadata command-line option.
NODE_METADATA = (
"TSV file mapping some or all of the graph's node IDs (rows) to arbitrary "
"metadata fields (columns)."
)

# Help text shown for the -e / --edge-metadata command-line option. Note that
# edges are identified by their (source, sink) node IDs only, which is why a
# single row may describe several parallel edges at once.
EDGE_METADATA = (
"TSV file mapping some or all of the graph's edges (rows) to arbitrary "
"metadata fields (columns). The leftmost two columns in this file should "
"contain the source and sink node ID of the edge being described in a "
"row; if there exist parallel edges in the graph between a given source "
"and sink node, then that row's metadata will be applied to all such "
"edges."
)

IMPACTS = "Impacts all output options (-o, -od, -os)."

MAX_DEETS = f"{IMPACTS} Setting this to 0 removes this limit."
Expand Down
50 changes: 49 additions & 1 deletion metagenomescope/graph/assembly_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from operator import itemgetter
from collections import deque
import numpy
import pandas as pd
import networkx as nx
import pygraphviz


from .. import parsers, config, layout_utils
from .. import parsers, config, layout_utils, misc_utils
from ..msg_utils import operation_msg, conclude_msg
from ..errors import GraphParsingError, GraphError, WeirdError
from . import validators, graph_utils
Expand Down Expand Up @@ -57,6 +58,8 @@ def __init__(
max_node_count=config.MAXN_DEFAULT,
max_edge_count=config.MAXE_DEFAULT,
patterns=True,
node_metadata=None,
edge_metadata=None,
):
"""Parses the input graph file and initializes the AssemblyGraph.
Expand All @@ -77,6 +80,12 @@ def __init__(
patterns: bool
If True, identify & highlight structural patterns; if False, don't.
node_metadata: str or None
Optional path to a TSV file describing node metadata.
edge_metadata: str or None
Optional path to a TSV file describing edge metadata.
"""
self.filename = filename
self.max_node_count = max_node_count
Expand Down Expand Up @@ -131,6 +140,7 @@ def __init__(
# objects' unique IDs.
operation_msg("Initializing node and edge graph objects...")
self._init_graph_objs()
self._integrate_metadata(node_metadata, edge_metadata)
conclude_msg()

# Records the bounding boxes of each component in the graph. Indexed by
Expand Down Expand Up @@ -257,6 +267,44 @@ def _init_graph_objs(self):
# make it easy to associate this edge with its Edge object.
self.graph.edges[e[0], e[1], e[2]]["uid"] = edge_id

def _integrate_metadata(self, node_metadata, edge_metadata):
"""Reads, sanity checks, and integrates node/edge metadata.
Parameters
----------
node_metadata: str or None
If this isn't None, we assume it's a path to a TSV file.
edge_metadata: str or None
If this isn't None, we assume it's a path to a TSV file.
Notes
-----
We load these TSV files as pandas DataFrames using pd.read_csv(). We
could add a lot more details to how we load these DataFrames to, for
example, account for missing values nicely (see read_metadata_file() in
https://github.com/biocore/qurro/blob/master/qurro/_metadata_utils.py),
but I think keeping things simple should be sufficient for now.
"""
if node_metadata is not None:
nm = pd.read_csv(node_metadata, sep="\t", index_col=0)
node_names = [n.name for n in self.nodeid2obj]
misc_utils.verify_subset(
nm.index,
node_names,
custom_message=(
"There exist node IDs in the metadata that are not "
"present in the graph."
),
)
# TODO do this nicely
raise NotImplementedError
if edge_metadata is not None:
# em = pd.read_csv(edge_metadata, sep="\t", index_col=[0, 1])
# Again, check that all source and sink nodes' IDs are in the graph
# TODO
raise NotImplementedError

def _remove_too_large_components(self):
"""Removes too-large components from the graph early on.
Expand Down
10 changes: 10 additions & 0 deletions metagenomescope/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def make_viz(
output_viz_dir: str,
output_dot: str,
output_ccstats: str,
node_metadata: str,
edge_metadata: str,
):
"""Creates a visualization.
Expand Down Expand Up @@ -66,6 +68,12 @@ def make_viz(
output_ccstats: str or None
If passed, we'll write out cc stats to this filepath.
node_metadata: str or None
Optional path to a TSV file describing node metadata.
edge_metadata: str or None
Optional path to a TSV file describing edge metadata.
Returns
-------
None
Expand All @@ -85,6 +93,8 @@ def make_viz(
max_node_count=max_node_count,
max_edge_count=max_edge_count,
patterns=patterns,
node_metadata=node_metadata,
edge_metadata=edge_metadata,
)

outputs = []
Expand Down
20 changes: 18 additions & 2 deletions metagenomescope/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,27 @@ def verify_unique(objs, obj_type="IDs"):
raise WeirdError(f"Duplicate {obj_type}: {objs}")


def verify_subset(s1, s2, custom_message=None):
    """Verifies that s1 is a subset of s2.

    Note that, if s1 and s2 are identical, then they are still subsets of each
    other. That's fine.

    Parameters
    ----------
    s1: collection

    s2: collection

    custom_message: str or None
        If this is None, we'll display a simple error message if s1 is not a
        subset of s2 (listing out both collections). This can be cumbersome if
        s1 and/or s2 are really large, so -- if custom_message is not None --
        we'll display that string as our error message instead.

    Returns
    -------
    None

    Raises
    ------
    WeirdError
        If s1 contains at least one element that is not in s2.
    """
    # Both collections are converted to sets, so duplicates and ordering are
    # ignored; only membership matters.
    if not set(s1).issubset(set(s2)):
        if custom_message is None:
            msg = f"{s1} is not a subset of {s2}"
        else:
            msg = custom_message
        raise WeirdError(msg)
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,11 @@
# Sanity check before trying to install -- these should be installed with
# the parent conda environment
setup_requires=["numpy", "pygraphviz"],
# NOTE I don't impose minimum versions here yet, but I probably should
install_requires=[
"click",
"numpy",
"pandas",
"networkx",
"gfapy",
"pyfastg",
Expand Down

0 comments on commit ae92aaa

Please sign in to comment.