Skip to content

Commit

Permalink
Renaming Compounds to Structure (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
Adafede committed Nov 22, 2023
1 parent 94a7dd2 commit d983917
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 126 deletions.
18 changes: 9 additions & 9 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

dm = DataModel()
# Should likely move in the data model if that's used all the time
all_structures = set(dm.get_compounds())
all_structures = set(dm.get_structures())
all_taxa = dm.get_taxa()

app = FastAPI(
Expand Down Expand Up @@ -52,7 +52,7 @@ def get_matching_structures_from_structure_in_item(
if item.molecule:
if item.substructure_search:
try:
results = dm.compound_search_substructure(item.molecule)
results = dm.structure_search_substructure(item.molecule)
structures = {_id for _id, _ in results}
except ValueError:
raise HTTPException(
Expand All @@ -61,7 +61,7 @@ def get_matching_structures_from_structure_in_item(
)
else:
try:
results = dm.compound_search(item.molecule)
results = dm.structure_search(item.molecule)
structures = {
_id
for _id, score in results
Expand Down Expand Up @@ -102,10 +102,10 @@ def get_matching_structures_from_taxon_in_item(dm: DataModel, item: Item) -> set
# We need to get all the matching taxa
taxa = get_matching_taxa_from_taxon_in_item(dm, item)

# We could have a parameter "recursive" in the query to have all the compounds from the parents too
# We could have a parameter "recursive" in the query to have all the structures from the parents too
out = set()
for taxon in taxa:
out.update(dm.get_compounds_of_taxon(taxon))
out.update(dm.get_structures_of_taxon(taxon))

return out

Expand All @@ -116,7 +116,7 @@ def get_matching_taxa_from_structure_in_item(dm: DataModel, item: Item) -> set[i

out = set()
for structure in structures:
out.update(dm.get_taxa_containing_compound(structure))
out.update(dm.get_taxa_containing_structure(structure))

return out

Expand All @@ -127,13 +127,13 @@ async def search_couples(item: Item) -> CoupleResult:
selected_structures = get_matching_structures_from_structure_in_item(dm, item)
selected_taxa = get_matching_taxa_from_taxon_in_item(dm, item)

compounds_of_selected_taxa = {
taxon: dm.get_compounds_of_taxon(taxon) for taxon in selected_taxa
structures_of_selected_taxa = {
taxon: dm.get_structures_of_taxon(taxon) for taxon in selected_taxa
}

couples = {
(structure, taxon)
for taxon, structures in compounds_of_selected_taxa.items()
for taxon, structures in structures_of_selected_taxa.items()
for structure in structures
if structure in selected_structures
}
Expand Down
16 changes: 8 additions & 8 deletions dash_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@

@cache.memoize(timeout=3600)
def get_svg_of_wid(j: int, molecule: str | None = None) -> str:
return molecule_svg(dm.get_compound_smiles_from_wid(j), molecule)
return molecule_svg(dm.get_structure_smiles_from_wid(j), molecule)


@cache.memoize(timeout=3600)
def get_number_of_taxa_for_compound(j: int) -> int:
return dm.get_number_of_taxa_containing_compound(j)
def get_number_of_taxa_for_structure(j: int) -> int:
return dm.get_number_of_taxa_containing_structure(j)


def generate_compounds_cards(
def generate_structures_cards(
active_page: int, data: dict[str, Any], molecule: str | None = None
) -> list[dbc.Card]:
cards = []
Expand All @@ -39,7 +39,7 @@ def generate_compounds_cards(

scores_mode = "scores" in data

displayed = data["matching_compounds"][
displayed = data["matching_structures"][
PAGE_SIZE * (active_page - 1) : PAGE_SIZE * active_page
]

Expand All @@ -62,7 +62,7 @@ def generate_compounds_cards(

img = get_svg_of_wid(j, molecule)
img_data = f"data:image/svg+xml,{quote(img)}"
taxa_count = get_number_of_taxa_for_compound(j)
taxa_count = get_number_of_taxa_for_structure(j)
card = dbc.Card(
[
dbc.CardImg(src=img_data, top=True),
Expand All @@ -74,10 +74,10 @@ def generate_compounds_cards(
className="card-text",
),
dcc.Markdown(
f"[Wikidata page of compound](https://www.wikidata.org/entity/Q{j})"
f"[Wikidata page of structure](https://www.wikidata.org/entity/Q{j})"
),
dbc.Button(
"Compound page", color="primary", href=f"/structure/{j}"
"structure page", color="primary", href=f"/structure/{j}"
),
*extras,
]
Expand Down
84 changes: 42 additions & 42 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def __new__(cls):
def num_taxa(self):
return len(self.db["taxonomy_names"])

def num_compounds(self):
return len(self.db["compound_smiles"])
def num_structures(self):
return len(self.db["structure_smiles"])

def num_couples(self):
return len(self.db["c2t"])
Expand Down Expand Up @@ -160,33 +160,33 @@ def resolve_taxon(self, query: str) -> any:
log.debug(response.json())
return response.json()

### Compoundonomy
def get_compounds(self) -> dict[int, int]:
return self.db["compound_wid"]
### Structures
def get_structures(self) -> dict[int, int]:
return self.db["structure_wid"]

def get_compound_smiles_from_wid(self, wid: int) -> str | None:
def get_structure_smiles_from_wid(self, wid: int) -> str | None:
try:
cid = self.db["compound_id"][wid] # ambiguous with PubChem CID?
return self.db["compound_smiles"][cid]
cid = self.db["structure_id"][wid] # ambiguous with PubChem CID?
return self.db["structure_smiles"][cid]
except (IndexError, ValueError):
log.warning(f"Impossible to find a compound with wid={wid}")
log.warning(f"Impossible to find a structure with wid={wid}")
return None

def get_structure_smiles_from_list_of_wid(self, wid: list[int]) -> list[str]:
    """Return the SMILES strings of the given Wikidata ids.

    Ids missing from ``self.db["structure_id"]`` and ids whose position falls
    outside ``self.db["structure_smiles"]`` are skipped, so the result may be
    shorter than ``wid``. The input order is preserved.

    Args:
        wid: Wikidata ids of the structures to look up.

    Returns:
        The SMILES strings that could be resolved.
    """
    structure_ids = self.db["structure_id"]
    smiles = self.db["structure_smiles"]
    positions = (structure_ids[w] for w in wid if w in structure_ids)
    return [smiles[i] for i in positions if 0 <= i < len(smiles)]

def get_dict_of_wid_to_smiles(self, wid: Iterable[int]) -> dict[int, str]:
    """Map each resolvable Wikidata id in ``wid`` to its SMILES string.

    Ids missing from ``self.db["structure_id"]`` and ids whose position falls
    outside ``self.db["structure_smiles"]`` are left out of the result.

    Args:
        wid: Wikidata ids of the structures to look up.

    Returns:
        A dict from Wikidata id to SMILES string.
    """
    structure_ids = self.db["structure_id"]
    smiles = self.db["structure_smiles"]
    # ``w`` (not ``wid``) so the comprehension does not shadow the parameter.
    positions = {w: structure_ids[w] for w in wid if w in structure_ids}
    return {w: smiles[i] for w, i in positions.items() if 0 <= i < len(smiles)}

def compound_get_mol_fp_and_explicit(self, query: str) -> tuple[any, any, bool]:
def structure_get_mol_fp_and_explicit(self, query: str) -> tuple[any, any, bool]:
explicit_h = "[H]" in query
p = Chem.SmilesParserParams()
p.removeHs = not explicit_h
Expand All @@ -198,30 +198,30 @@ def compound_get_mol_fp_and_explicit(self, query: str) -> tuple[any, any, bool]:
fp = fingerprint(mol)
return mol, fp, explicit_h

## COMMENT (AR): Should we rename this to compound_search_from_smiles
## and have same for InChI and co and then wrap them to a `compound_search`
## COMMENT (AR): Should we rename this to structure_search_from_smiles
## and have same for InChI and co and then wrap them to a `structure_search`
## with inchi = "InChI=1S/" in query ...
def compound_search(self, query: str) -> list[tuple[int, float]]:
mol, fp, explicit_h = self.compound_get_mol_fp_and_explicit(query)
def structure_search(self, query: str) -> list[tuple[int, float]]:
mol, fp, explicit_h = self.structure_get_mol_fp_and_explicit(query)

if explicit_h:
db = self.db["compound_sim_h_fps"]
db = self.db["structure_sim_h_fps"]
else:
db = self.db["compound_sim_fps"]
db = self.db["structure_sim_fps"]
scores = DataStructs.BulkTanimotoSimilarity(fp, db)
return [(wid, score) for wid, score in zip(self.db["compound_wid"], scores)]
return [(wid, score) for wid, score in zip(self.db["structure_wid"], scores)]

def compound_search_substructure(
def structure_search_substructure(
self, query: str, chirality: bool = False
) -> list[tuple[int, float]]:
mol, fp, explicit_h = self.compound_get_mol_fp_and_explicit(query)
mol, fp, explicit_h = self.structure_get_mol_fp_and_explicit(query)

if explicit_h:
db = self.db["compound_library_h"]
fp_db = self.db["compound_sim_h_fps"]
db = self.db["structure_library_h"]
fp_db = self.db["structure_sim_h_fps"]
else:
db = self.db["compound_library"]
fp_db = self.db["compound_sim_fps"]
db = self.db["structure_library"]
fp_db = self.db["structure_sim_fps"]

iids = db.GetMatches(
mol,
Expand All @@ -231,42 +231,42 @@ def compound_search_substructure(
useChirality=chirality,
)

new_keys = [self.db["compound_wid"][iid] for iid in iids]
new_keys = [self.db["structure_wid"][iid] for iid in iids]
out = []
for iid, wid in zip(iids, new_keys):
out.append((wid, DataStructs.TanimotoSimilarity(fp, fp_db[iid])))
return out

def structure_get_tsv_from_scores(self, wids, scores) -> str:
    """Render search results as a tab-separated table.

    Each row holds the Wikidata entity link, the score formatted with three
    decimals, and the SMILES string of the structure. ``wids[idx]`` is paired
    with the ``idx``-th element of ``scores``.

    Args:
        wids: Indexable collection of Wikidata ids, aligned with ``scores``.
        scores: Iterable of numeric scores.

    Returns:
        The TSV text, header line included, with every line newline-terminated.
    """
    structure_ids = self.db["structure_id"]
    all_smiles = self.db["structure_smiles"]
    # Collect the rows and join once instead of growing a string with ``+=``.
    rows = ["Wikidata link\tSimilarity\tSmiles\n"]
    for idx, score in enumerate(scores):
        wid = wids[idx]
        smiles = all_smiles[structure_ids[wid]]
        rows.append(f"http://www.wikidata.org/entity/Q{wid}\t{score:.3f}\t{smiles}\n")
    return "".join(rows)

### Taxonomy to structures
def get_structures_of_taxon(self, wid: int, recursive: bool = True) -> list[int]:
    """Return the structures recorded for taxon ``wid``.

    Args:
        wid: Wikidata id of the taxon.
        recursive: When true, also include the structures of every taxon
            listed under ``wid`` in ``self.db["taxonomy_children"]``.
            NOTE(review): only that single lookup is performed here;
            presumably the table already holds all descendants, confirm.

    Returns:
        The matching structure ids, de-duplicated, in no particular order.
    """
    t2c = self.db["t2c"]
    matching_structures = set(t2c[wid]) if wid in t2c else set()

    if recursive and wid in self.db["taxonomy_children"]:
        for child in self.db["taxonomy_children"][wid]:
            if child in t2c:
                matching_structures.update(t2c[child])

    return list(matching_structures)

def get_taxa_containing_compound(self, wid: int) -> set[int]:
def get_taxa_containing_structure(self, wid: int) -> set[int]:
if wid in self.db["c2t"]:
return self.db["c2t"][wid]
return set()

def get_number_of_taxa_containing_compound(self, wid: int) -> int:
def get_number_of_taxa_containing_structure(self, wid: int) -> int:
if wid not in self.db["c2t"]:
return 0
return len(self.db["c2t"][wid])
Expand Down
6 changes: 3 additions & 3 deletions pages/contribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def update_output(email, content, name):
dcc.Markdown(
"""You are even more awesome!
So if you have way too many compounds to add, you can send us directly a TSV or CSV file with the following three columns:
So if you have way too many structures to add, you can send us directly a TSV or CSV file with the following three columns:
**smiles** , **organism** , **reference**
Expand Down Expand Up @@ -285,8 +285,8 @@ def layout():
[
dbc.Tabs(
[
dbc.Tab(tab1_content, label="I have a single compound"),
dbc.Tab(tab2_content, label="I have multiple compounds"),
dbc.Tab(tab1_content, label="I have a single structure"),
dbc.Tab(tab2_content, label="I have multiple structures"),
]
)
]
Expand Down
6 changes: 3 additions & 3 deletions pages/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def title(wid=None):

dash.register_page(
__name__,
name="Compound information",
name="Structure information",
top_nav=True,
order=-1,
path_template="/structure/<wid>",
Expand All @@ -39,7 +39,7 @@ def layout(wid: int):
img_data = f"data:image/svg+xml,{quote(img)}"

name_id_list = []
for t in dm.get_taxa_containing_compound(wid):
for t in dm.get_taxa_containing_structure(wid):
name = dm.get_taxon_name_from_wid(t)
name_id_list.append([name, t])
name_id_list = sorted(name_id_list, key=lambda x: x[0])
Expand All @@ -66,7 +66,7 @@ def layout(wid: int):
dash_table.DataTable(
data=table,
page_size=15,
id="taxon-list-compound",
id="taxon-list-structure",
sort_action="native",
filter_action="native",
columns=[
Expand Down
26 changes: 13 additions & 13 deletions pages/taxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dash import Input, Output, callback, dcc

from config import PAGE_SIZE
from dash_common import generate_compounds_cards
from dash_common import generate_structures_cards
from model import DataModel

dm = DataModel()
Expand All @@ -21,8 +21,8 @@ def title(wid=None):
return "LOTUS"


def tsv(compounds: list[int]) -> str:
smileses = dm.get_compound_smiles_from_list_of_wid(compounds)
def tsv(structures: list[int]) -> str:
smileses = dm.get_structure_smiles_from_list_of_wid(structures)
return "smiles\n" + "\n".join(smileses)


Expand All @@ -40,8 +40,8 @@ def tsv(compounds: list[int]) -> str:
Output("cards", "children"),
[Input("pagination", "active_page"), Input("matching-ids", "data")],
)
def compound_cards(active_page: int, data: dict[str, Any]) -> list[dbc.Card]:
return generate_compounds_cards(active_page, data)
def structure_cards(active_page: int, data: dict[str, Any]) -> list[dbc.Card]:
return generate_structures_cards(active_page, data)


@callback(
Expand All @@ -50,8 +50,8 @@ def compound_cards(active_page: int, data: dict[str, Any]) -> list[dbc.Card]:
prevent_initial_call=True,
)
def func(n_clicks, data):
filename = f"compounds_of_{data['taxon_name'].replace('.', '')}.tsv"
return dict(content=tsv(data["matching_compounds"]), filename=filename)
filename = f"structures_of_{data['taxon_name'].replace('.', '')}.tsv"
return dict(content=tsv(data["matching_structures"]), filename=filename)


def layout(wid: int):
Expand Down Expand Up @@ -79,18 +79,18 @@ def layout(wid: int):
markdown += f"[{tax_name}{ranks}](/taxon/{parent[0]}) > "
taxonomic_info = markdown.strip("> ")

matching_compounds = dm.get_compounds_of_taxon(wid)
matching_compounds.sort()
nb_matches = len(matching_compounds)
matching_structures = dm.get_structures_of_taxon(wid)
matching_structures.sort()
nb_matches = len(matching_structures)

warning = f"Found {nb_matches} compounds"
warning = f"Found {nb_matches} structures"

return dbc.Container(
[
dcc.Store(
id="matching-ids",
data={
"matching_compounds": matching_compounds,
"matching_structures": matching_structures,
"taxon_name": taxon_name,
},
),
Expand Down Expand Up @@ -121,6 +121,6 @@ def layout(wid: int):
),
]
),
dbc.Spinner(id="loading-compounds-tsv", children=[dbc.Row(id="cards")]),
dbc.Spinner(id="loading-structures-tsv", children=[dbc.Row(id="cards")]),
]
)
Loading

0 comments on commit d983917

Please sign in to comment.