diff --git a/client/src/components/Pages/Structure/Input/TxSegmentElementInput/TxSegmentElementInput.tsx b/client/src/components/Pages/Structure/Input/TxSegmentElementInput/TxSegmentElementInput.tsx index f806ff8f..48e193b0 100644 --- a/client/src/components/Pages/Structure/Input/TxSegmentElementInput/TxSegmentElementInput.tsx +++ b/client/src/components/Pages/Structure/Input/TxSegmentElementInput/TxSegmentElementInput.tsx @@ -144,7 +144,7 @@ const TxSegmentCompInput: React.FC = ({ endingExon, startingExonOffset, endingExonOffset, - index + index, ]); const handleTxElementResponse = ( @@ -171,7 +171,7 @@ const TxSegmentCompInput: React.FC = ({ } }); } - setPendingResponse(false) + setPendingResponse(false); }; /** @@ -228,7 +228,7 @@ const TxSegmentCompInput: React.FC = ({ * Request construction of tx segment element from server and handle response */ const buildTranscriptSegmentElement = () => { - setPendingResponse(true) + setPendingResponse(true); // fire constructor request switch (txInputType) { case InputType.gcg: @@ -463,7 +463,8 @@ const TxSegmentCompInput: React.FC = ({ tooltipDirection="bottom" geneText={txGeneText} setGeneText={setTxGeneText} - style={{ width: 125 }} + setChromosome={setTxChrom} + setStrand={setTxStrand} /> {genomicCoordinateInfo} @@ -676,7 +677,7 @@ const TxSegmentCompInput: React.FC = ({ inputElements, validated, icon, - pendingResponse + pendingResponse, }); }; diff --git a/client/src/components/Utilities/GetCoordinates/GetCoordinates.tsx b/client/src/components/Utilities/GetCoordinates/GetCoordinates.tsx index 4521a918..f24fedda 100644 --- a/client/src/components/Utilities/GetCoordinates/GetCoordinates.tsx +++ b/client/src/components/Utilities/GetCoordinates/GetCoordinates.tsx @@ -281,7 +281,8 @@ const GetCoordinates: React.FC = () => { setGene={setGene} geneText={geneText} setGeneText={setGeneText} - style={{ width: 125 }} + setChromosome={setChromosome} + setStrand={setStrand} /> {genomicCoordinateInfo} diff --git 
a/client/src/components/main/shared/GeneAutocomplete/GeneAutocomplete.tsx b/client/src/components/main/shared/GeneAutocomplete/GeneAutocomplete.tsx index 05269330..85821a70 100644 --- a/client/src/components/main/shared/GeneAutocomplete/GeneAutocomplete.tsx +++ b/client/src/components/main/shared/GeneAutocomplete/GeneAutocomplete.tsx @@ -1,12 +1,12 @@ -import React, { useState, useEffect } from "react"; -import { TextField, Typography } from "@material-ui/core"; -import Autocomplete from "@material-ui/lab/Autocomplete"; -import { getGeneId, getGeneSuggestions } from "../../../../services/main"; -import { - NormalizeGeneResponse, - SuggestGeneResponse, -} from "../../../../services/ResponseModels"; +import React, { useState, useEffect, ReactNode } from "react"; +import { TextField, Typography, makeStyles } from "@material-ui/core"; +import Autocomplete, { + AutocompleteRenderGroupParams, +} from "@material-ui/lab/Autocomplete"; +import { getGeneSuggestions } from "../../../../services/main"; +import { SuggestGeneResponse } from "../../../../services/ResponseModels"; import HelpTooltip from "../HelpTooltip/HelpTooltip"; +import { useColorTheme } from "../../../../global/contexts/Theme/ColorThemeContext"; export enum GeneSuggestionType { conceptId = "Concept ID", @@ -15,7 +15,13 @@ export enum GeneSuggestionType { prevSymbol = "Previous Symbol", none = "", } -export type SuggestedGeneOption = { value: string; type: GeneSuggestionType }; + +export type SuggestedGeneOption = { + value: string; + type: GeneSuggestionType | string; + chromosome?: string; + strand?: string; +}; const defaultGeneOption: SuggestedGeneOption = { value: "", @@ -42,6 +48,8 @@ interface Props { | "top-start" | undefined; promptText?: string | undefined; + setChromosome?: CallableFunction; + setStrand?: CallableFunction; } export const GeneAutocomplete: React.FC = ({ @@ -51,6 +59,8 @@ export const GeneAutocomplete: React.FC = ({ setGeneText, tooltipDirection, promptText, + setChromosome, + 
setStrand, }) => { const existingGeneOption = gene ? { value: gene, type: GeneSuggestionType.symbol } @@ -58,6 +68,17 @@ export const GeneAutocomplete: React.FC = ({ const [geneOptions, setGeneOptions] = useState([]); const [geneValue, setGeneValue] = useState(existingGeneOption); const [inputValue, setInputValue] = useState(existingGeneOption); + const [loading, setLoading] = useState(false); + + const { colorTheme } = useColorTheme(); + const useStyles = makeStyles(() => ({ + autocompleteGroupHeader: { + paddingLeft: "8px", + color: colorTheme["--dark-gray"], + fontSizeAdjust: "0.5", + }, + })); + const classes = useStyles(); /** * Simple wrapper around state setters to ensure updates to local selected value are reflected @@ -67,6 +88,12 @@ export const GeneAutocomplete: React.FC = ({ const updateSelection = (selection: SuggestedGeneOption) => { setGene(selection.value); setGeneValue(selection); + if (setChromosome) { + setChromosome(selection.chromosome); + } + if (setStrand) { + setStrand(selection.strand); + } }; // Update options @@ -74,23 +101,19 @@ export const GeneAutocomplete: React.FC = ({ if (inputValue.value === "") { setGeneText(""); setGeneOptions([]); + setLoading(false); } else { - const delayDebounce = setTimeout(() => { - getGeneSuggestions(inputValue.value).then((suggestResponseJson) => { - if ( - !suggestResponseJson.symbols && - !suggestResponseJson.prev_symbols && - !suggestResponseJson.aliases - ) { - setGeneText("Unrecognized term"); - setGeneOptions([]); - } else { - setGeneText(""); - setGeneOptions(buildOptions(suggestResponseJson)); - } - }); - }, 300); - return () => clearTimeout(delayDebounce); + setLoading(true); + getGeneSuggestions(inputValue.value).then((suggestResponseJson) => { + setLoading(false); + if (suggestResponseJson.matches_count === 0) { + setGeneText("Unrecognized term"); + setGeneOptions([]); + } else { + setGeneText(""); + setGeneOptions(buildOptions(suggestResponseJson, inputValue.value)); + } + }); } }, 
[inputValue]); @@ -102,48 +125,43 @@ export const GeneAutocomplete: React.FC = ({ }, [gene]); /** - * Attempt exact match for entered text. Should be called if user-submitted text - * isn't specific enough to narrow options down to a reasonable number (the - * `MAX_SUGGESTIONS` value set server-side), in case their entered value - * happens to match a real gene term. - * No return value, but updates dropdown options if successful. + * Generate group HTML element. Needed to properly display text about # of other possible completions. + * @param params group object processed by autocomplete + * @returns group node to render */ - const tryExactMatch = (input: string) => { - getGeneId(input).then((geneResponseJson: NormalizeGeneResponse) => { - // just provide entered term, but correctly-cased - setGeneText(""); - if (geneResponseJson.cased) { - setGeneOptions([ - { - value: geneResponseJson.cased, - type: geneResponseJson.cased.match(/^\w[^:]*:.+$/) - ? GeneSuggestionType.conceptId - : GeneSuggestionType.symbol, - }, - ]); - } - }); + const makeGroup = (params: AutocompleteRenderGroupParams): ReactNode => { + const children = params.group.includes("possible") ? [] : params.children; + const groupElement = ( +
+ {params.group} +
+ ); + return [groupElement, children]; }; - // if geneOptions is empty, try an exact match (note: keep this useEffect separately, as we want to do this after all of the autocomplete lookups) - useEffect(() => { - if (!geneOptions.length) { - tryExactMatch(inputValue.value); - } - }, [geneOptions]); - - /** - * Construct options for use in MUI Autocomplete GroupBy - * @param suggestResponse response from suggestions API received from server - * @returns array of option objects - */ const buildOptions = ( - suggestResponse: SuggestGeneResponse + suggestResponse: SuggestGeneResponse, + inputValue: string ): SuggestedGeneOption[] => { const options: SuggestedGeneOption[] = []; - if (suggestResponse.symbols) { - suggestResponse.symbols.map((suggestion) => - options.push({ value: suggestion[0], type: GeneSuggestionType.symbol }) + if (suggestResponse.concept_id) { + suggestResponse.concept_id.map((suggestion) => + options.push({ + value: suggestion[0], + type: GeneSuggestionType.conceptId, + chromosome: suggestion[3], + strand: suggestion[4], + }) + ); + } + if (suggestResponse.symbol) { + suggestResponse.symbol.map((suggestion) => + options.push({ + value: suggestion[0], + type: GeneSuggestionType.symbol, + chromosome: suggestion[3], + strand: suggestion[4], + }) ); } if (suggestResponse.prev_symbols) { @@ -151,22 +169,50 @@ export const GeneAutocomplete: React.FC = ({ options.push({ value: suggestion[0], type: GeneSuggestionType.prevSymbol, + chromosome: suggestion[3], + strand: suggestion[4], }) ); } if (suggestResponse.aliases) { suggestResponse.aliases.map((suggestion) => - options.push({ value: suggestion[0], type: GeneSuggestionType.alias }) + options.push({ + value: suggestion[0], + type: GeneSuggestionType.alias, + chromosome: suggestion[3], + strand: suggestion[4], + }) ); } + // slightly hack-y way to insert message about number of possible options: create an option group + // with the message as the group title, and then in `makeGroup()`, remove all of its 
child elements. + // `value` needs to be set to `inputValue` (or another valid completion of user text) for the autocomplete object + // to render the group at all + if (suggestResponse.warnings) { + suggestResponse.warnings.map((warn: string) => { + if (warn.startsWith("Exceeds max matches")) { + const maxExceededMsg = + options.length > 0 + ? `+ ${suggestResponse.matches_count} possible options` + : `${suggestResponse.matches_count} possible options`; + options.push({ + value: inputValue, + type: maxExceededMsg, + }); + } + }); + } return options; }; return ( { if (newValue) { updateSelection(newValue); @@ -180,13 +226,11 @@ export const GeneAutocomplete: React.FC = ({ }} options={geneOptions} groupBy={(option) => (option ? option.type : "")} + renderGroup={makeGroup} getOptionLabel={(option) => (option.value ? option.value : "")} getOptionSelected={(option, selected) => { return option.value === selected.value; }} - clearOnBlur={false} - clearOnEscape - disableClearable={inputValue.value === ""} renderInput={(params) => ( `_ formatted string. A CURIE string has the structure ``prefix``:``reference``, as defined by the W3C syntax. */ export type CURIE = string; /** @@ -53,9 +39,7 @@ export type CURIE = string; */ export type Comparator = "<=" | ">="; /** - * A interval on a stained metaphase chromosome specified by cytobands. - * CytobandIntervals include the regions described by the start and end - * cytobands. + * A character string representing cytobands derived from the *International System for Human Cytogenomic Nomenclature* (ISCN) `guidelines `_. */ export type HumanCytoband = string; /** @@ -63,16 +47,12 @@ export type HumanCytoband = string; */ export type Strand = "+" | "-"; /** - * A character string of residues that represents a biological sequence - * using the conventional sequence order (5’-to-3’ for nucleic acid sequences, - * and amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes - * are permitted in Sequences. 
+ * A character string of Residues that represents a biological sequence using the conventional sequence order (5'-to-3' for nucleic acid sequences, and amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes are permitted in Sequences. */ export type Sequence = string; /** * Permissible values for describing the underlying causative event driving an - * assayed fusion. - * + * assayed fusion. */ export type EventType = "rearrangement" | "read-through" | "trans-splicing"; /** @@ -143,13 +123,15 @@ export interface Extension { value?: unknown; } /** - * A gene is an authoritative representation of one or more heritable - * :ref:`Locations ` that includes all sequence elements necessary - * to perform a biological function. A gene may include regulatory, - * transcribed, and/or other functional Locations. + * A reference to a Gene as defined by an authority. For human genes, the use of + * `hgnc `_ as the gene authority is + * RECOMMENDED. */ export interface Gene { type?: "Gene"; + /** + * A CURIE reference to a Gene concept + */ gene_id: CURIE; } /** @@ -170,77 +152,128 @@ export interface LocationDescriptor { * A Location defined by an interval on a referenced Sequence. */ export interface SequenceLocation { + /** + * Variation Id. MUST be unique within document. + */ _id?: CURIE; type?: "SequenceLocation"; + /** + * A VRS Computed Identifier for the reference Sequence. + */ sequence_id: CURIE; + /** + * Reference sequence region defined by a SequenceInterval. + */ interval: SequenceInterval | SimpleInterval; } /** - * A SequenceInterval represents a span of sequence. Positions are always - * represented by contiguous spans using interbase coordinates. - * SequenceInterval is intended to be compatible with that in Sequence - * Ontology ([SO:0000001](http://www.sequenceontology.org/browser/ - * current_svn/term/SO:0000001)), with the exception that the GA4GH VRS - * SequenceInterval may be zero-width. 
The SO definition is for an extent - * greater than zero. + * A SequenceInterval represents a span on a Sequence. Positions are always + * represented by contiguous spans using interbase coordinates or coordinate ranges. */ export interface SequenceInterval { type?: "SequenceInterval"; - start: Number | IndefiniteRange | DefiniteRange; - end: Number | IndefiniteRange | DefiniteRange; + /** + * The start coordinate or range of the interval. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than the value of `end`. + */ + start: DefiniteRange | IndefiniteRange | Number; + /** + * The end coordinate or range of the interval. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`. + */ + end: DefiniteRange | IndefiniteRange | Number; } /** - * A simple number value as a VRS class. + * A bounded, inclusive range of numbers. */ -export interface Number { - type?: "Number"; - value: number; +export interface DefiniteRange { + type?: "DefiniteRange"; + /** + * The minimum value; inclusive + */ + min: number; + /** + * The maximum value; inclusive + */ + max: number; } /** - * An indefinite range represented as a number and associated comparator. - * The bound operator is interpreted as follows: `>=` are all values greater - * than and including the value, `<=` are all numbers less than and including - * the value. + * A half-bounded range of numbers represented as a number bound and associated + * comparator. The bound operator is interpreted as follows: '>=' are all numbers + * greater than and including `value`, '<=' are all numbers less than and including + * `value`. */ export interface IndefiniteRange { type?: "IndefiniteRange"; + /** + * The bounded value; inclusive + */ value: number; + /** + * MUST be one of '<=' or '>=', indicating which direction the range is indefinite + */ comparator: Comparator; } /** - * A bounded, inclusive range of numbers. 
+ * A simple integer value as a VRS class. */ -export interface DefiniteRange { - type?: "DefiniteRange"; - min: number; - max: number; +export interface Number { + type?: "Number"; + /** + * The value represented by Number + */ + value: number; } /** - * DEPRECATED: A SimpleInterval represents a span of sequence. Positions - * are always represented by contiguous spans using interbase coordinates. + * DEPRECATED: A SimpleInterval represents a span of sequence. Positions are always + * represented by contiguous spans using interbase coordinates. * This class is deprecated. Use SequenceInterval instead. */ export interface SimpleInterval { type?: "SimpleInterval"; + /** + * The start coordinate + */ start: number; + /** + * The end coordinate + */ end: number; } /** * A Location on a chromosome defined by a species and chromosome name. */ export interface ChromosomeLocation { - type?: "ChromosomeLocation"; + /** + * Location Id. MUST be unique within document. + */ _id?: CURIE; - species_id: CURIE; + type?: "ChromosomeLocation"; + /** + * CURIE representing a species from the `NCBI species taxonomy `_. Default: 'taxonomy:9606' (human) + */ + species_id?: CURIE & string; + /** + * The symbolic chromosome name. For humans, For humans, chromosome names MUST be one of 1..22, X, Y (case-sensitive) + */ chr: string; + /** + * The chromosome region defined by a CytobandInterval + */ interval: CytobandInterval; } /** - * A contiguous region specified by chromosomal bands features. + * A contiguous span on a chromosome defined by cytoband features. The span includes + * the constituent regions described by the start and end cytobands, as well as any + * intervening regions. */ export interface CytobandInterval { type?: "CytobandInterval"; + /** + * The start cytoband region. MUST specify a region nearer the terminal end (telomere) of the chromosome p-arm than `end`. + */ start: HumanCytoband; + /** + * The end cytoband region. 
MUST specify a region nearer the terminal end (telomere) of the chromosome q-arm than `start`. + */ end: HumanCytoband; } /** @@ -395,7 +428,7 @@ export interface FunctionalDomain { */ export interface ClientAssayedFusion { type?: "AssayedFusion"; - regulatory_element?: RegulatoryElement; + regulatory_element?: ClientRegulatoryElement; structural_elements: ( | ClientTranscriptSegmentElement | ClientGeneElement @@ -406,6 +439,18 @@ export interface ClientAssayedFusion { causative_event: CausativeEvent; assay: Assay; } +/** + * Define regulatory element object used client-side. + */ +export interface ClientRegulatoryElement { + type?: "RegulatoryElement"; + regulatory_class: RegulatoryClass; + feature_id?: string; + associated_gene?: GeneDescriptor; + feature_location?: LocationDescriptor; + display_class: string; + nomenclature: string; +} /** * TranscriptSegment element class used client-side. */ @@ -478,7 +523,7 @@ export interface ClientUnknownGeneElement { */ export interface ClientCategoricalFusion { type?: "CategoricalFusion"; - regulatory_element?: RegulatoryElement; + regulatory_element?: ClientRegulatoryElement; structural_elements: ( | ClientTranscriptSegmentElement | ClientGeneElement @@ -509,18 +554,6 @@ export interface ClientFunctionalDomain { sequence_location?: LocationDescriptor; domain_id: string; } -/** - * Define regulatory element object used client-side. - */ -export interface ClientRegulatoryElement { - type?: "RegulatoryElement"; - regulatory_class: RegulatoryClass; - feature_id?: string; - associated_gene?: GeneDescriptor; - feature_location?: LocationDescriptor; - display_class: string; - nomenclature: string; -} /** * Abstract class to provide identification properties used by client. 
*/ @@ -663,9 +696,11 @@ export interface ServiceInfoResponse { export interface SuggestGeneResponse { warnings?: string[]; term: string; - symbols?: [string, string, string][]; - prev_symbols?: [string, string, string][]; - aliases?: [string, string, string][]; + matches_count: number; + concept_id?: [string, string, string, string, string][]; + symbol?: [string, string, string, string, string][]; + prev_symbols?: [string, string, string, string, string][]; + aliases?: [string, string, string, string, string][]; } /** * Response model for transcript segment element construction endpoint. diff --git a/server/curfu/cli.py b/server/curfu/cli.py index 72b19aca..bb8ec26a 100644 --- a/server/curfu/cli.py +++ b/server/curfu/cli.py @@ -8,8 +8,8 @@ from curfu import APP_ROOT from curfu.devtools import DEFAULT_INTERPRO_TYPES from curfu.devtools.build_client_types import build_client_types -from curfu.devtools.gene import GeneSuggestionBuilder -from curfu.devtools.interpro import build_gene_domain_maps +from curfu.devtools.build_gene_suggest import GeneSuggestionBuilder +from curfu.devtools.build_interpro import build_gene_domain_maps @click.command() @@ -92,7 +92,7 @@ def domains( def genes() -> None: """Build gene mappings for use in Fusion Curation gene autocomplete.""" builder = GeneSuggestionBuilder() - builder.build_gene_suggest_maps() + builder.build_gene_suggestion_file() @devtools.command() diff --git a/server/curfu/devtools/build_gene_suggest.py b/server/curfu/devtools/build_gene_suggest.py new file mode 100644 index 00000000..f56ff3ab --- /dev/null +++ b/server/curfu/devtools/build_gene_suggest.py @@ -0,0 +1,148 @@ +"""Provide tools to build backend data relating to gene identification.""" +import csv +from datetime import datetime as dt +from pathlib import Path +from timeit import default_timer as timer +from typing import Dict, List, Optional + +import click +from biocommons.seqrepo.seqrepo import SeqRepo +from gene.database import create_db +from gene.schemas 
class GeneSuggestionBuilder:
    """Provide build tools for gene autosuggest mappings.

    Implemented as a class for easier sharing of database resources between methods.
    """

    def __init__(self) -> None:
        """Open the gene database and SeqRepo handles; start with no collected rows."""
        self.gene_db = create_db()
        self.sr = SeqRepo(SEQREPO_DATA_PATH)
        # One dict per gene, keyed by the CSV column names used in
        # `_save_suggest_file`.
        self.genes: List[Dict] = []

    def _get_chromosome(self, record: Dict) -> Optional[str]:
        """Extract a chromosome identifier from the locations stored on a record.

        The ``ncbi_locations``, ``ensembl_locations`` and ``locations`` entries are
        checked in that order. The first ``SequenceLocation`` whose ``sequence_id``
        SeqRepo can translate into the ``NCBI`` namespace wins.

        :param record: stored normalized record
        :return: first translated identifier (presumably an ``NC_`` accession --
            TODO confirm against SeqRepo output), or None if nothing translates
        """
        for source in ("ncbi_locations", "ensembl_locations", "locations"):
            for location in record.get(source, []):
                if location["type"] != "SequenceLocation":
                    continue
                identifiers = self.sr.translate_identifier(
                    location["sequence_id"], "NCBI"
                )
                if identifiers:
                    return identifiers[0]
        return None

    @staticmethod
    def _make_list_column(values: List[str]) -> str:
        """Collapse a list of names into one comma-separated CSV cell.

        Values are upper-cased and de-duplicated, and any value that contains no
        alphabetic character at all (e.g. a purely numeric alias) is dropped. The
        survivors are sorted so that the generated file is identical from run to
        run; joining the set directly would follow string-hash order, which differs
        between interpreter invocations.

        NOTE(review): a value that itself contains a comma cannot be told apart
        from two values once joined -- confirm that no source alias contains one.

        :param values: names to store in the column
        :return: sorted, comma-separated, upper-cased unique names ("" if none)
        """
        unique = {value.upper() for value in values}
        kept = (value for value in unique if any(char.isalpha() for char in value))
        return ",".join(sorted(kept))

    def _process_gene_record(self, record: Dict) -> None:
        """Add the gene record to processed suggestions.

        Records lacking a symbol, a resolvable chromosome, or a strand are skipped.

        :param record: gene record object retrieved from DB
        """
        symbol = record.get("symbol")
        chromosome = self._get_chromosome(record)
        strand = record.get("strand")
        if not all([symbol, chromosome, strand]):
            return
        self.genes.append(
            {
                "concept_id": record["concept_id"],
                "symbol": symbol,
                "aliases": self._make_list_column(record.get("aliases", [])),
                "previous_symbols": self._make_list_column(
                    record.get("previous_symbols", [])
                ),
                # Reuse the values validated above instead of repeating the
                # SeqRepo lookup for every record.
                "chromosome": chromosome,
                "strand": strand,
            }
        )

    def _save_suggest_file(self, output_dir: Path) -> None:
        """Save the collected gene suggestions to a dated CSV file.

        The file is named ``gene_suggest_YYYYMMDD.csv`` (today's date) and has the
        columns ``concept_id``, ``symbol``, ``aliases``, ``previous_symbols``,
        ``chromosome`` and ``strand``. ``aliases`` and ``previous_symbols`` hold
        the comma-separated strings produced by `_make_list_column`.

        :param output_dir: directory in which the file is written
        """
        fieldnames = [
            "concept_id",
            "symbol",
            "aliases",
            "previous_symbols",
            "chromosome",
            "strand",
        ]
        today = dt.strftime(dt.today(), "%Y%m%d")
        # newline="" is what the csv module asks for; without it, platforms that
        # translate "\n" on write end up with an extra "\r" on every row.
        with open(
            output_dir / f"gene_suggest_{today}.csv", "w", newline=""
        ) as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.genes)

    def build_gene_suggestion_file(self, output_dir: Path = APP_ROOT / "data") -> None:
        """Build the gene suggestions CSV from every merged record in the gene DB.

        :param output_dir: directory in which the file is written. Default is the
            'data' directory within the application root.
        :return: None
        """
        start = timer()

        # Start from a clean slate so that calling this twice on one builder does
        # not write every gene twice.
        self.genes = []
        for record in self.gene_db.get_all_records(RecordType.MERGER):
            self._process_gene_record(record)

        self._save_suggest_file(output_dir)

        stop = timer()
        msg = f"Built gene suggestions table in {(stop - start):.5f} seconds."
        click.echo(msg)
        logger.info(msg)
- :param Map mapping: dictionary keying values of a specific item_type set - to normalized gene data - :param outfile_path Path: path to save mapping at - """ - with open(outfile_path, "w") as fp: - for normed in mapping.values(): - fp.write(f"{normed[0]}\t{normed[1]}\t{normed[2]}\n") - - def update_maps(self, record: Dict) -> None: - """Add map entries for relevant data in given DB record. - :param Dict record: individual identity or merged record from DDB. Ideally, - should not duplicate previous records (i.e., `record` should not be a record - for which an associated merged record exists). - """ - norm_id = record["concept_id"] - norm_symbol = record["symbol"] - - for xref in [norm_id] + record.get("xrefs", []): - self.xrefs_map[xref.lower()] = (xref, norm_id, norm_symbol) - - self.symbol_map[norm_symbol.lower()] = (norm_symbol, norm_id, "") - - for prev_symbol in record.get("previous_symbols", []): - self.prev_symbol_map[prev_symbol.lower()] = ( - prev_symbol, - norm_id, - norm_symbol, - ) - - for assoc_with in record.get("associated_with", []): - self.assoc_with_map[assoc_with.lower()] = (assoc_with, norm_id, norm_symbol) - - label = record.get("label") - if label: - self.label_map[label.lower()] = (label, norm_id, norm_symbol) - - for alias in record.get("aliases", []): - self.alias_map[alias.lower()] = (alias, norm_id, norm_symbol) - - def build_gene_suggest_maps(self, output_dir: Path = APP_ROOT / "data") -> None: - """Construct gene autocomplete suggestion mappings. - Scan existing gene_concepts table and gather all possible terms that can be - used to look up normalized concepts. Then, link them with their associated - normalized concept IDs/labels and save them. 
- - :param Path output_dir: path to directory to save output files in - """ - start = timer() - - last_evaluated_key = None - valid_item_types = ("identity", "merger") - while True: - if last_evaluated_key: - response = self.genes.scan(ExclusiveStartKey=last_evaluated_key) - else: - response = self.genes.scan() - last_evaluated_key = response.get("LastEvaluatedKey") - records = response["Items"] - - for record in records: - if record["item_type"] not in valid_item_types: - continue - elif "merge_ref" in record: - continue - self.update_maps(record) - - if not last_evaluated_key: - break - - today = dt.strftime(dt.today(), "%Y%m%d") - for map, name in ( - (self.xrefs_map, "xrefs"), - (self.symbol_map, "symbols"), - (self.label_map, "labels"), - (self.prev_symbol_map, "prev_symbols"), - (self.alias_map, "aliases"), - (self.assoc_with_map, "assoc_with"), - ): - self.write_map_to_file(map, output_dir / f"gene_{name}_suggest_{today}.tsv") - - stop = timer() - msg = f"Built gene suggestions table in {(stop - start):.5f} seconds." 
# term -> (term as entered in the source data, normalized ID, normalized label)
Map = Dict[str, Tuple[str, str, str]]
# term, symbol, concept ID, chromosome, strand
Suggestion = Tuple[str, str, str, str, str]


class GeneService:
    """Provide gene ID resolution and term autocorrect suggestions."""

    def __init__(self, suggestions_file: Optional[Path] = None) -> None:
        """Initialize gene service provider class.

        Reads the suggestions CSV once and builds four lookup tables (concept ID,
        symbol, alias, previous symbol). Keys are upper-cased so that lookups are
        case-insensitive; each value is a `Suggestion` tuple.

        :param suggestions_file: path to existing suggestions file. If not provided,
            will use newest available file in expected location.
        """
        if not suggestions_file:
            suggestions_file = get_data_file("gene_suggest")

        self.concept_id_map: Dict[str, Suggestion] = {}
        self.symbol_map: Dict[str, Suggestion] = {}
        self.aliases_map: Dict[str, Suggestion] = {}
        self.prev_symbols_map: Dict[str, Suggestion] = {}

        # Context manager so the handle is closed; newline="" as required by csv.
        with open(suggestions_file, "r", newline="") as csvfile:
            for row in csv.DictReader(csvfile):
                symbol = row["symbol"]
                concept_id = row["concept_id"]
                details = (symbol, concept_id, row["chromosome"], row["strand"])
                self.concept_id_map[concept_id.upper()] = (concept_id, *details)
                self.symbol_map[symbol.upper()] = (symbol, *details)
                # These two columns hold comma-separated strings. Iterating the raw
                # cell would walk it one character at a time and register
                # single-letter "aliases", so split it into names first.
                for alias in self._split_list_column(row.get("aliases")):
                    self.aliases_map[alias.upper()] = (alias, *details)
                for prev_symbol in self._split_list_column(
                    row.get("previous_symbols")
                ):
                    self.prev_symbols_map[prev_symbol.upper()] = (
                        prev_symbol,
                        *details,
                    )

    # NOTE(review): the static method `get_normalized_gene` sits at this point in
    # the real module. Its body lies outside the hunks visible in this diff, so it
    # is not reproduced here; it must be kept exactly as it is.

    @staticmethod
    def _split_list_column(raw: Optional[str]) -> List[str]:
        """Split a comma-separated CSV cell into its non-empty, stripped items.

        :param raw: cell content; may be empty or None when the column is missing
        :return: list of individual names, empty if the cell holds none
        """
        if not raw:
            return []
        return [item.strip() for item in raw.split(",") if item.strip()]

    @staticmethod
    def _get_completion_results(term: str, lookup: Dict) -> List[Suggestion]:
        """Filter valid completions for term.

        :param term: user-entered text, already upper-cased by the caller
        :param lookup: stored mapping where key is a name (e.g. symbol or alias) and
            value is the complete suggestion
        :return: suggestions whose key starts with ``term``, sorted by their first
            element (the matched term)
        """
        return sorted(
            (data for key, data in lookup.items() if key.startswith(term)),
            key=lambda suggestion: suggestion[0],
        )

    def suggest_genes(self, query: str) -> Dict[str, List[Suggestion]]:
        """Provide autocomplete suggestions based on submitted term.

        :param str query: text entered by user
        :returns: dict with the keys ``concept_id``, ``symbol``, ``prev_symbols``
            and ``aliases``; each value is a (possibly empty) list of tuples of
            matched term, symbol, concept ID, chromosome and strand
        """
        q_upper = query.upper()
        return {
            "concept_id": self._get_completion_results(q_upper, self.concept_id_map),
            "symbol": self._get_completion_results(q_upper, self.symbol_map),
            "prev_symbols": self._get_completion_results(
                q_upper, self.prev_symbols_map
            ),
            "aliases": self._get_completion_results(q_upper, self.aliases_map),
        }
--git a/server/curfu/routers/complete.py b/server/curfu/routers/complete.py index 804a8633..7faa75a5 100644 --- a/server/curfu/routers/complete.py +++ b/server/curfu/routers/complete.py @@ -3,7 +3,7 @@ from fastapi import APIRouter, Query, Request -from curfu import LookupServiceError +from curfu import MAX_SUGGESTIONS, LookupServiceError from curfu.schemas import AssociatedDomainResponse, ResponseDict, SuggestGeneResponse router = APIRouter() @@ -25,11 +25,25 @@ def suggest_gene(request: Request, term: str = Query("")) -> ResponseDict: provide suggestions. """ response: ResponseDict = {"term": term} - try: - possible_matches = request.app.state.genes.suggest_genes(term) - response.update(possible_matches) - except LookupServiceError as e: - response["warnings"] = [str(e)] + possible_matches = request.app.state.genes.suggest_genes(term) + n = ( + len(possible_matches["concept_id"]) + + len(possible_matches["symbol"]) + + len(possible_matches["prev_symbols"]) + + len(possible_matches["aliases"]) + ) + + response["matches_count"] = n + if n > MAX_SUGGESTIONS: + warn = f"Exceeds max matches: Got {n} possible matches for {term} (limit: {MAX_SUGGESTIONS})" # noqa: E501 + response["warnings"] = [warn] + term_upper = term.upper() + for match_type in ("concept_id", "symbol", "prev_symbols", "aliases"): + reduced = [ + m for m in possible_matches[match_type] if m[0].upper() == term_upper + ] + possible_matches[match_type] = reduced + response.update(possible_matches) return response diff --git a/server/curfu/schemas.py b/server/curfu/schemas.py index e714fc7b..41ca4bd4 100644 --- a/server/curfu/schemas.py +++ b/server/curfu/schemas.py @@ -22,7 +22,10 @@ ResponseWarnings = Optional[List[StrictStr]] ResponseDict = Dict[ - str, Union[str, CURIE, List[str], List[Tuple[str, str, str, str]], FunctionalDomain] + str, + Union[ + str, int, CURIE, List[str], List[Tuple[str, str, str, str]], FunctionalDomain + ], ] Warnings = List[str] @@ -148,10 +151,12 @@ class 
SuggestGeneResponse(Response): """Response model for gene autocomplete suggestions endpoint.""" term: StrictStr - # complete term, normalized ID, normalized label - symbols: Optional[List[Tuple[str, str, str]]] - prev_symbols: Optional[List[Tuple[str, str, str]]] - aliases: Optional[List[Tuple[str, str, str]]] + matches_count: int + # complete term, normalized symbol, normalized concept ID, chromosome ID, strand + concept_id: Optional[List[Tuple[str, str, str, str, str]]] + symbol: Optional[List[Tuple[str, str, str, str, str]]] + prev_symbols: Optional[List[Tuple[str, str, str, str, str]]] + aliases: Optional[List[Tuple[str, str, str, str, str]]] class DomainParams(BaseModel): diff --git a/server/curfu/utils.py b/server/curfu/utils.py index 3634bfff..ec610b52 100644 --- a/server/curfu/utils.py +++ b/server/curfu/utils.py @@ -15,11 +15,11 @@ def get_latest_s3_file(file_prefix: str) -> ObjectSummary: """Get latest S3 object representation for data file + :param file_prefix: filename prefix for data file :return: boto3 ObjectSummary - :raise: - ResourceLoadException: if Boto3 S3 initialization fails - FileNotFoundError: if no matching files exist in the bucket + :raise ResourceLoadException: if Boto3 S3 initialization fails + :raise FileNotFoundError: if no matching files exist in the bucket """ logger.info(f"Attempting S3 lookup for data file pattern {file_prefix}...") s3 = boto3.resource("s3", config=Config(region_name="us-east-2")) @@ -41,6 +41,7 @@ def get_latest_s3_file(file_prefix: str) -> ObjectSummary: def download_s3_file(bucket_object: ObjectSummary) -> Path: """Download local copy of file from S3 + :param bucket_object: boto object representation of S3 file :return: Path to downloaded file """ @@ -73,15 +74,16 @@ def get_latest_data_file(file_prefix: str, local_files: List[Path]) -> Path: def get_data_file(filename_prefix: str) -> Path: - """Acquire most recent version of static data file. Download from S3 if not available locally. 
+ """Acquire most recent version of static data file. Download from S3 if not + available locally. - :param filename_prefix: leading text of filename, eg `gene_aliases_suggest`. Should not - include filetype or date information. + :param filename_prefix: leading text of filename, eg `gene_aliases_suggest`. Should + not include filetype or date information. :return: Path to acquired file. """ data_dir = APP_ROOT / "data" data_dir.mkdir(exist_ok=True) - file_glob = f"{filename_prefix}*.tsv" + file_glob = f"{filename_prefix}*sv" files = list(data_dir.glob(file_glob)) if not files: return download_s3_file(get_latest_s3_file(filename_prefix)) diff --git a/server/setup.cfg b/server/setup.cfg index 18b8c7fe..ce0bdcd5 100644 --- a/server/setup.cfg +++ b/server/setup.cfg @@ -36,6 +36,7 @@ dev = ruff black pre-commit + gene-normalizer ~= 0.1.39 pydantic-to-typescript diff --git a/server/tests/integration/test_complete.py b/server/tests/integration/test_complete.py index d99ee099..0f2e6a6d 100644 --- a/server/tests/integration/test_complete.py +++ b/server/tests/integration/test_complete.py @@ -4,37 +4,61 @@ @pytest.mark.asyncio -async def test_normalize_gene(async_client: AsyncClient): +async def test_complete_gene(async_client: AsyncClient): """Test /complete/gene endpoint""" response = await async_client.get("/api/complete/gene?term=NTRK") assert response.status_code == 200 assert response.json() == { "term": "NTRK", - "symbols": [ - ["NTRK1", "hgnc:8031", "NTRK1"], - ["NTRK2", "hgnc:8032", "NTRK2"], - ["NTRK3", "hgnc:8033", "NTRK3"], - ["NTRK3-AS1", "hgnc:27532", "NTRK3-AS1"], + "matches_count": 4, + "concept_id": [], + "symbol": [ + ["NTRK1", "NTRK1", "hgnc:8031", "NCBI:NC_000001.11", "+"], + ["NTRK2", "NTRK2", "hgnc:8032", "NCBI:NC_000009.12", "+"], + ["NTRK3", "NTRK3", "hgnc:8033", "NCBI:NC_000015.10", "-"], + ["NTRK3-AS1", "NTRK3-AS1", "hgnc:27532", "NCBI:NC_000015.10", "+"], ], + "prev_symbols": [], + "aliases": [], + } + + response = await 
async_client.get("/api/complete/gene?term=a") + assert response.status_code == 200 + assert response.json() == { + "warnings": [ + "Exceeds max matches: Got 2096 possible matches for a (limit: 50)" + ], + "term": "a", + "matches_count": 2096, + "concept_id": [], + "symbol": [], "prev_symbols": [ - ["NTRK4", "hgnc:2730", "DDR1"], - ["NTRKR1", "hgnc:10256", "ROR1"], - ["NTRKR2", "hgnc:10257", "ROR2"], - ["NTRKR3", "hgnc:2731", "DDR2"], + ["A", "LOC100420587", "ncbigene:100420587", "NCBI:NC_000019.10", "-"] ], "aliases": [ - ["NTRK4", "hgnc:2730", "DDR1"], - ["NTRKR1", "hgnc:10256", "ROR1"], - ["NTRKR2", "hgnc:10257", "ROR2"], - ["NTRKR3", "hgnc:2731", "DDR2"], + ["A", "LOC110467529", "ncbigene:110467529", "NCBI:NC_000021.9", "+"] ], } - response = await async_client.get("/api/complete/gene?term=a") + response = await async_client.get("/api/complete/gene?term=hgnc:1097") assert response.status_code == 200 assert response.json() == { - "term": "a", - "warnings": [ - "Exceeds max matches: Got 6645 possible matches for a (limit: 50)" + "term": "hgnc:1097", + "matches_count": 11, + "concept_id": [ + ["hgnc:1097", "BRAF", "hgnc:1097", "NCBI:NC_000007.14", "-"], + ["hgnc:10970", "SLC22A6", "hgnc:10970", "NCBI:NC_000011.10", "-"], + ["hgnc:10971", "SLC22A7", "hgnc:10971", "NCBI:NC_000006.12", "+"], + ["hgnc:10972", "SLC22A8", "hgnc:10972", "NCBI:NC_000011.10", "-"], + ["hgnc:10973", "SLC23A2", "hgnc:10973", "NCBI:NC_000020.11", "-"], + ["hgnc:10974", "SLC23A1", "hgnc:10974", "NCBI:NC_000005.10", "-"], + ["hgnc:10975", "SLC24A1", "hgnc:10975", "NCBI:NC_000015.10", "+"], + ["hgnc:10976", "SLC24A2", "hgnc:10976", "NCBI:NC_000009.12", "-"], + ["hgnc:10977", "SLC24A3", "hgnc:10977", "NCBI:NC_000020.11", "+"], + ["hgnc:10978", "SLC24A4", "hgnc:10978", "NCBI:NC_000014.9", "+"], + ["hgnc:10979", "SLC25A1", "hgnc:10979", "NCBI:NC_000022.11", "-"], ], + "symbol": [], + "prev_symbols": [], + "aliases": [], } diff --git a/server/tests/integration/test_nomenclature.py 
b/server/tests/integration/test_nomenclature.py index ef78fca0..782d538d 100644 --- a/server/tests/integration/test_nomenclature.py +++ b/server/tests/integration/test_nomenclature.py @@ -159,19 +159,19 @@ async def test_tx_segment_nomenclature( json=ntrk1_tx_element_start, ) assert response.status_code == 200 - assert response.json().get("nomenclature", "") == "refseq:NM_002529.3(NTRK1):e.2+1" + assert response.json().get("nomenclature", "") == "NM_002529.3(NTRK1):e.2+1" response = await async_client.post( "/api/nomenclature/transcript_segment?first=true&last=false", json=epcam_5_prime ) assert response.status_code == 200 - assert response.json().get("nomenclature", "") == "refseq:NM_002354.2(EPCAM):e.5" + assert response.json().get("nomenclature", "") == "NM_002354.2(EPCAM):e.5" response = await async_client.post( "/api/nomenclature/transcript_segment?first=false&last=true", json=epcam_3_prime ) assert response.status_code == 200 - assert response.json().get("nomenclature", "") == "refseq:NM_002354.2(EPCAM):e.5" + assert response.json().get("nomenclature", "") == "NM_002354.2(EPCAM):e.5" response = await async_client.post( "/api/nomenclature/transcript_segment?first=true&last=false", json=epcam_invalid @@ -212,7 +212,7 @@ async def test_templated_sequence_nomenclature( assert response.status_code == 200 assert ( response.json().get("nomenclature", "") - == "refseq:NC_000001.11(chr 1):g.15455_15566(-)" + == "NC_000001.11(chr 1):g.15455_15566(-)" ) response = await async_client.post( @@ -246,5 +246,5 @@ async def test_fusion_nomenclature(async_client: AsyncClient): assert response.status_code == 200 assert ( response.json().get("nomenclature", "") - == "refseq:NM_004327.3(BCR):e.2+182::ACTAAAGCG::refseq:NM_005157.5(ABL1):e.2-173" + == "NM_004327.3(BCR):e.2+182::ACTAAAGCG::NM_005157.5(ABL1):e.2-173" )