From bc5c32d2e367aafe8fd33a627a8fef96a553dbfc Mon Sep 17 00:00:00 2001 From: n1mus <709030+n1mus@users.noreply.github.com> Date: Fri, 29 Apr 2022 11:43:11 -0700 Subject: [PATCH] use locale en_US & other cleanup --- .dockerignore | 6 +- CHANGELOG.md | 8 + Makefile | 3 +- client_src/test/test_integration.py | 2 +- .../utils/json_validation.py | 2 +- spec/analyzers/icu_tokenize.json | 2 +- .../generic/fulltext_search.yaml | 94 ------------ .../stored_queries/test_fulltext_search.py | 145 ------------------ 8 files changed, 17 insertions(+), 245 deletions(-) delete mode 100644 spec/stored_queries/generic/fulltext_search.yaml diff --git a/.dockerignore b/.dockerignore index e0862330..e46761e2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -48,4 +48,8 @@ dmypy.json # docker bits Dockerfile* -docker-compose* \ No newline at end of file +docker-compose* + +# Temp files +tmp/ + diff --git a/CHANGELOG.md b/CHANGELOG.md index 3affa19e..f6f252ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased + +### Changed +- Use locale en_US instead of c.utf-8 + +### Removed +- Generic fulltext search and tests + ## [0.0.19] - 2022-04-15 ### Added - github actions to build `develop`, `pr-x` and released version (e.g. `1.2.3`) Tags diff --git a/Makefile b/Makefile index e4d49ecd..bfabf26a 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,7 @@ shell: docker-compose run re_api sh reset: - docker-compose --rmi all -v - docker-compose build + docker-compose down --rmi all -v full_query_testing: DO_QUERY_TESTING=full time python -m pytest -s $(QUERY_TESTING_FILE) diff --git a/client_src/test/test_integration.py b/client_src/test/test_integration.py index baa17e97..8794b579 100644 --- a/client_src/test/test_integration.py +++ b/client_src/test/test_integration.py @@ -2,7 +2,7 @@ import os from uuid import uuid4 -from relation_engine_client import REClient +from relation_engine_client.main import REClient from relation_engine_client.exceptions import RERequestError, RENotFound _API_URL = os.environ.get("RE_API_URL", "http://localhost:5000") diff --git a/relation_engine_server/utils/json_validation.py b/relation_engine_server/utils/json_validation.py index c44538f5..b95be623 100644 --- a/relation_engine_server/utils/json_validation.py +++ b/relation_engine_server/utils/json_validation.py @@ -159,7 +159,7 @@ def resolve_remote(self, uri): if scheme in self.handlers: result = self.handlers[scheme](uri) - elif scheme in [u"http", u"https"]: + elif scheme in ["http", "https"]: # Requests has support for detecting the correct encoding of # json over http result = requests.get(uri).json() diff --git a/spec/analyzers/icu_tokenize.json b/spec/analyzers/icu_tokenize.json index 3f69a950..7d2d8429 100644 --- a/spec/analyzers/icu_tokenize.json +++ b/spec/analyzers/icu_tokenize.json @@ -2,7 +2,7 @@ "name": "icu_tokenize", "type": "text", "properties": { - "locale": "c.utf-8", + "locale": "en_US", "accent": false, "case": "lower", "stemming": false, diff --git a/spec/stored_queries/generic/fulltext_search.yaml b/spec/stored_queries/generic/fulltext_search.yaml deleted file mode 100644 index 6859add4..00000000 --- a/spec/stored_queries/generic/fulltext_search.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Should be REVISED or DEPRECATED. -# Is currently unused outside testing. -# -# Search a collection with a fulltext index with an attribute name and search text -# Also supports filtering by outer-level attributes -# Not recommended for fast searching because it can be very slow and even timeout at 60s -name: fulltext_search -params: - type: object - required: ["@coll", search_attrkey, search_text] - additionalProperties: false - properties: - "@coll": - type: string - title: Collection name - examples: [ncbi_taxon, gtdb_taxon] - search_attrkey: - type: string - title: Search attribute key - examples: [scientific_name, name] - search_text: - type: string - title: Search text - examples: [escherichia, es] - description: Text to search on the search attribute values - ts: - type: [integer, "null"] - title: Versioning timestamp - default: null - filter_attr_expr: - type: [array, "null"] - title: Filter by document attribute equality - items: - type: object - maxItems: 50 - examples: [ - [{"rank": "species"}, {"rank": "strain"}, {"strain": true}], - [{"rank": "species", "strain": false}] - ] - default: null - description: | - An array of single-level objects. - In each item object, the key-value pairs would restrict the documents to those containing all the attribute key-value pairs. - But if any item object in the array satisfies the document, the document is filtered into the results. - Basically works like a boolean expression where each key-value pair is a boolean value, each item object is a boolean term, and the array is a sum of boolean terms - Null or empty arrays have no filtering effect. - offset: - type: [integer, "null"] - title: Paging offset - maximum: 100000 - default: 0 - limit: - type: [integer, "null"] - title: Max results to return - default: 20 - maximum: 1000 - select: - type: [string, array, "null"] - items: - type: string - examples: [scientific_name, [scientific_name, id]] - default: null - description: Document attributes to keep in the results -query: | - LET search_text__norm = REGEX_REPLACE(LOWER(TRIM(@search_text)), "\\s+", " ") - LET search_text__first_exact_tok = REGEX_SPLIT(search_text__norm, " ")[0] - LET search_text__icu_toks = TOKENS(@search_text, "icu_tokenize") /* db analyzer icu_tokenize */ - LET search_text__wordboundmod_icu_toks = ( - FOR tok IN search_text__icu_toks - RETURN REGEX_REPLACE(tok, ",.*", "") /* commas cannot be escaped */ - ) - LET search_text__fulltext = CONCAT_SEPARATOR(", ", - FOR tok IN search_text__wordboundmod_icu_toks - RETURN CONCAT("prefix:", tok) - ) - LET filter_attr_expr = @filter_attr_expr ? @filter_attr_expr : [] /* null to [] */ - LET search_text__wildcard = CONCAT("%", CONCAT_SEPARATOR("%", search_text__icu_toks), "%") /* e.g., %tok0%tok1%tokn% */ - FOR doc IN FULLTEXT(@@coll, @search_attrkey, search_text__fulltext) - FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true - /* keep doc if any obj in filter_attr_expr is a sub-obj of doc */ - FILTER LENGTH(filter_attr_expr) > 0 ? ( - FOR term IN filter_attr_expr - RETURN MATCHES(doc, term) - ) ANY == true : true - LET attrval__norm = REGEX_REPLACE(LOWER(TRIM(doc.@search_attrkey)), "\\s+", " ") - LET attrval__icu_toks = TOKENS(doc.@search_attrkey, "icu_tokenize") - SORT LIKE(doc.@search_attrkey, search_text__wildcard, true) DESC, /* icu tok ordering */ - /* TODO - icu tok ordering with no insertions? */ - CONTAINS(attrval__icu_toks[0], search_text__icu_toks[0], true) == 0 DESC, /* first icu tok */ - CONTAINS(attrval__norm, search_text__first_exact_tok, true) == 0 DESC, /* first exact tok */ - CONTAINS(attrval__norm, search_text__norm, true) == 0 DESC, /* exact match */ - doc.@search_attrkey /* lexical */ - LIMIT @offset ? @offset : 0, @limit ? @limit : 20 - RETURN @select ? KEEP(doc, @select) : doc diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py index 99bd4d44..f7acb620 100644 --- a/spec/test/stored_queries/test_fulltext_search.py +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -1,12 +1,9 @@ """ Tests for stored queries involving a fulltext search: -* Generic fulltext_search (should be used with caution because it can be slow and timeout at 60s) * Taxonomy taxonomy_search_species_strain * Taxonomy taxonomy_search_species_strain_no_sort The latter two are switched between depending on the length of the search text. -These stored query tests are all bundled in one test file because their original purpose is to do a species/strain -name search on the ncbi_taxon collection These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. """ @@ -251,148 +248,6 @@ def test_prefix_hit(self): ) -class TestFulltextSearchStoredQuery(unittest.TestCase): - @classmethod - def setUpClass(cls): - check_spec_test_env() - create_test_docs("ncbi_taxon", ncbi_taxa) - - def test_ncbi_taxon_scinames(self): - """Happy path""" - for sciname in scinames_test_all: - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="scientific_name", - search_text=sciname, - ts=_NOW if sciname in scinames_test_latest else None, - filter_attr_expr=[ - {"rank": "species"}, - {"rank": "strain"}, - {"strain": True}, - ], - offset=None, - limit=LIMIT, - select="scientific_name", - # --- - expect_error=False, - expect_hit=True, - ) - - def test_null_bind_params(self): - """Leave off parameters""" - for sciname in scinames_test_all: - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="scientific_name", - search_text=sciname, - ts=None, - filter_attr_expr=None, - offset=None, - limit=None, - select=None, - # --- - expect_error=False, - expect_hit=True, - ) - - def test_fully_specified_bind_params(self): - """Specify all parameters""" - for sciname in scinames_test_all: - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="scientific_name", - search_text=sciname, - ts=_NOW if sciname in scinames_test_latest else None, - filter_attr_expr=[ - {"rank": "species"}, - {"rank": "strain"}, - {"strain": True}, - ], - offset=0, - limit=LIMIT, - select=["id", "scientific_name"], - # --- - expect_error=False, - expect_hit=True, - ) - - def test_extra_params(self): - """Extra params not in spec/aql""" - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="scientific_name", - search_text="esch", - ts=None, - filter_attr_expr=[ - {"rank": "species"}, - {"rank": "strain"}, - {"strain": True}, - ], - offset=0, - limit=LIMIT, - select=["id", "scientific_name"], - extra_unused_param=42, - # --- - expect_error=("Additional properties are not allowed"), - ) - - def test_validation_fail(self): - _fulltext_search_query( - self, - coll=[], - search_attrkey=42, - search_text={"hi": 1}, - ts=None, - filter_attr_expr=None, - offset=None, - limit=None, - select=None, - # --- - expect_error="[] is not of type 'string'", - ) - - def test_aql_error(self): - for sciname in scinames_test_all: - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="fake_attrkey", - search_text=sciname, - ts=None, - filter_attr_expr=None, - offset=None, - limit=None, - select=None, - # --- - expect_error=True, - ) - - def test_no_hit(self): - for sciname in scinames_test_all: - _fulltext_search_query( - self, - coll="ncbi_taxon", - search_attrkey="scientific_name", - search_text=sciname[::-1], - ts=None, - filter_attr_expr=None, - offset=None, - limit=None, - select=None, - # --- - expect_error=False, - expect_hit=False, - expected_hits=[], - ) - - -# --- Test helpers --- - - def _switch_taxonomy_search_species_strain_queries(search_text): return ( "taxonomy_search_species_strain_no_sort"