Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decoupler pathways inference from differential expression files #339

Merged
merged 5 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 94 additions & 38 deletions tools/tertiary-analysis/decoupler/decoupler_pathway_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@

# add AnnData input file option
parser.add_argument(
"-i", "--input_anndata", help="AnnData input file", required=True
"-i", "--input",
help=(
"AnnData or table input file. Table input is meant for a single "
pcm32 marked this conversation as resolved.
Show resolved Hide resolved
"comparison, not gene x cells"
),
required=True
)

# add network input file option
parser.add_argument(
"-n", "--input_network", help="Network input file", required=True
Expand All @@ -34,6 +38,28 @@
default=None,
)

# Column for stat to use when providing a table
parser.add_argument(
"--stat",
help="Stat to use when providing a table. Default is 'log2FC'.",
default="log2FC",
)
# Optional column for p-value or FDR in the table
parser.add_argument(
"--p_value_column",
required=False,
help="Column name in the table with p-values or FDRs.",
default=None,
)
# Optional column for FDR threshold when given a table
parser.add_argument(
"--p_value_threshold",
required=False,
type=float,
help="Column name in the table with FDRs.",
default=0.05
)

# Column name in net with source nodes
parser.add_argument(
"-s",
Expand Down Expand Up @@ -93,8 +119,15 @@
if args.output is None:
raise ValueError("Please specify either -o or --output")

# read in the AnnData input file
adata = ad.read_h5ad(args.input_anndata)
# detect based on input file extension if the input file is AnnData or matrix
if args.input.endswith(".h5ad"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about input file with .h5 extension

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not needed, since this is internally controlled with the galaxy wrapper, which will transform the filename forcefully into input.h5ad . Also the extension .h5 is generally used for any HDF5 file, not just AnnData, the community tends to favour .h5ad for anndata (although it is just a convention).

input_type = "AnnData"
elif args.input.endswith(".tsv") or args.input.endswith(".csv"):
input_type = "matrix"
else:
raise ValueError(
"Invalid input file. Please provide a valid AnnData or matrix file."
)

# read in the input file network input file
network = pd.read_csv(args.input_network, sep="\t")
Expand All @@ -108,17 +141,32 @@
"Source, target, and weight columns are not present in the network"
)


print(type(args.min_n))

if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns:
# Storing index in a column called 'index_bak'
adata.var['index_bak'] = adata.var.index
adata.var.set_index(args.var_gene_symbols_field, inplace=True)
if input_type == "AnnData":
# read in the AnnData input file
adata = ad.read_h5ad(args.input)

if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns:
# Storing index in a column called 'index_bak'
adata.var['index_bak'] = adata.var.index
adata.var.set_index(args.var_gene_symbols_field, inplace=True)
else:
# read in the matrix input file, genes in rows and columns for stats
adata = pd.read_csv(args.input, sep="\t", index_col=0)
if args.stat not in adata.columns:
raise ValueError(f"Stat column {args.stat} not found in input table header.")
if args.p_value_column and args.p_value_column not in adata.columns:
raise ValueError(
f"P-value column {args.p_value_column} not found in input table header."
)
if args.p_value_column and args.p_value_threshold is not None:
adata = adata[adata[args.p_value_column] <= args.p_value_threshold]

adata = adata[[args.stat]].T

if args.method == "mlm":
dc.run_mlm(
res = dc.run_mlm(
mat=adata,
net=network,
source=args.source,
Expand All @@ -129,24 +177,19 @@
use_raw=args.use_raw,
)

if args.output is not None:
# write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the
# output network files
combined_df = pd.concat(
[adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1
)

# Save the combined dataframe to a file
combined_df.to_csv(args.output + ".tsv", sep="\t")

# if args.activities_path is specified, generate the activities AnnData
# and save the AnnData object to the specified path
if args.activities_path is not None:
acts = dc.get_acts(adata, obsm_key="mlm_estimate")
acts.write_h5ad(args.activities_path)

elif args.method == "ulm":
dc.run_ulm(
res = dc.run_ulm(
mat=adata,
net=network,
source=args.source,
target=args.target,
weight=args.weight,
verbose=True,
min_n=args.min_n,
use_raw=args.use_raw,
)
elif args.method == "consensus":
res = dc.run_consensus(
mat=adata,
net=network,
source=args.source,
Expand All @@ -157,18 +200,31 @@
use_raw=args.use_raw,
)

if args.output is not None:
# write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the
# output network files
if args.output is not None:
# write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the
# output network files
if input_type == "AnnData":
combined_df = pd.concat(
[adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1
[adata.obsm[f"{args.method}_estimate"],
adata.obsm[f"{args.method}_pvals"]], axis=1
)
else:
tf_est, tf_pvals = res
combined_df = pd.DataFrame(
{
# index is written, so no need for the set names
f"{args.method}_estimate": tf_est.iloc[0],
f"{args.method}_pvals": tf_pvals.iloc[0],
}
)
# sort ascending on the p-values
combined_df.sort_values(by=f"{args.method}_pvals", inplace=True)

# Save the combined dataframe to a file
combined_df.to_csv(args.output + ".tsv", sep="\t")
# Save the combined dataframe to a file
combined_df.to_csv(args.output + ".tsv", sep="\t")

# if args.activities_path is specified, generate the activities AnnData
# and save the AnnData object to the specified path
if args.activities_path is not None:
acts = dc.get_acts(adata, obsm_key="ulm_estimate")
acts.write_h5ad(args.activities_path)
# if args.activities_path is specified, generate the activities AnnData
# and save the AnnData object to the specified path
if args.activities_path is not None and input_type == "AnnData":
acts = dc.get_acts(adata, obsm_key=f"{args.method}_estimate")
acts.write_h5ad(args.activities_path)
Loading