From bc5c32d2e367aafe8fd33a627a8fef96a553dbfc Mon Sep 17 00:00:00 2001
From: n1mus <709030+n1mus@users.noreply.github.com>
Date: Fri, 29 Apr 2022 11:43:11 -0700
Subject: [PATCH] use locale en_US & other cleanup

---
 .dockerignore                                 |   6 +-
 CHANGELOG.md                                  |   8 +
 Makefile                                      |   3 +-
 client_src/test/test_integration.py           |   2 +-
 .../utils/json_validation.py                  |   2 +-
 spec/analyzers/icu_tokenize.json              |   2 +-
 .../generic/fulltext_search.yaml              |  94 ------------
 .../stored_queries/test_fulltext_search.py    | 145 ------------------
 8 files changed, 17 insertions(+), 245 deletions(-)
 delete mode 100644 spec/stored_queries/generic/fulltext_search.yaml

diff --git a/.dockerignore b/.dockerignore
index e0862330..e46761e2 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -48,4 +48,8 @@ dmypy.json
 
 # docker bits
 Dockerfile*
-docker-compose*
\ No newline at end of file
+docker-compose*
+
+# Temp files
+tmp/
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3affa19e..f6f252ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Changed
+- Use locale `en_US` instead of `c.utf-8` in the `icu_tokenize` analyzer
+
+### Removed
+- Generic `fulltext_search` stored query and its tests
+
 ## [0.0.19] - 2022-04-15
 ### Added
 - github actions to build `develop`, `pr-x` and released version (e.g. `1.2.3`) Tags
diff --git a/Makefile b/Makefile
index e4d49ecd..bfabf26a 100644
--- a/Makefile
+++ b/Makefile
@@ -13,8 +13,7 @@ shell:
 	docker-compose run re_api sh
 
 reset:
-	docker-compose --rmi all -v
-	docker-compose build
+	docker-compose down --rmi all -v
 
 full_query_testing:
 	DO_QUERY_TESTING=full time python -m pytest -s $(QUERY_TESTING_FILE)
diff --git a/client_src/test/test_integration.py b/client_src/test/test_integration.py
index baa17e97..8794b579 100644
--- a/client_src/test/test_integration.py
+++ b/client_src/test/test_integration.py
@@ -2,7 +2,7 @@
 import os
 from uuid import uuid4
 
-from relation_engine_client import REClient
+from relation_engine_client.main import REClient
 from relation_engine_client.exceptions import RERequestError, RENotFound
 
 _API_URL = os.environ.get("RE_API_URL", "http://localhost:5000")
diff --git a/relation_engine_server/utils/json_validation.py b/relation_engine_server/utils/json_validation.py
index c44538f5..b95be623 100644
--- a/relation_engine_server/utils/json_validation.py
+++ b/relation_engine_server/utils/json_validation.py
@@ -159,7 +159,7 @@ def resolve_remote(self, uri):
 
         if scheme in self.handlers:
             result = self.handlers[scheme](uri)
-        elif scheme in [u"http", u"https"]:
+        elif scheme in ["http", "https"]:
             # Requests has support for detecting the correct encoding of
             # json over http
             result = requests.get(uri).json()
diff --git a/spec/analyzers/icu_tokenize.json b/spec/analyzers/icu_tokenize.json
index 3f69a950..7d2d8429 100644
--- a/spec/analyzers/icu_tokenize.json
+++ b/spec/analyzers/icu_tokenize.json
@@ -2,7 +2,7 @@
     "name": "icu_tokenize",
     "type": "text",
     "properties": {
-        "locale": "c.utf-8",
+        "locale": "en_US",
         "accent": false,
         "case": "lower",
         "stemming": false,
diff --git a/spec/stored_queries/generic/fulltext_search.yaml b/spec/stored_queries/generic/fulltext_search.yaml
deleted file mode 100644
index 6859add4..00000000
--- a/spec/stored_queries/generic/fulltext_search.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-# Should be REVISED or DEPRECATED.
-# Is currently unused outside testing.
-#
-# Search a collection with a fulltext index with an attribute name and search text
-# Also supports filtering by outer-level attributes
-# Not recommended for fast searching because it can be very slow and even timeout at 60s
-name: fulltext_search
-params:
-  type: object
-  required: ["@coll", search_attrkey, search_text]
-  additionalProperties: false
-  properties:
-    "@coll":
-      type: string
-      title: Collection name
-      examples: [ncbi_taxon, gtdb_taxon]
-    search_attrkey:
-      type: string
-      title: Search attribute key
-      examples: [scientific_name, name]
-    search_text:
-      type: string
-      title: Search text
-      examples: [escherichia, es]
-      description: Text to search on the search attribute values
-    ts:
-      type: [integer, "null"]
-      title: Versioning timestamp
-      default: null
-    filter_attr_expr:
-      type: [array, "null"]
-      title: Filter by document attribute equality
-      items:
-        type: object
-      maxItems: 50
-      examples: [
-          [{"rank": "species"}, {"rank": "strain"}, {"strain": true}],
-          [{"rank": "species", "strain": false}]
-        ]
-      default: null
-      description: |
-        An array of single-level objects.
-        In each item object, the key-value pairs would restrict the documents to those containing all the attribute key-value pairs. 
-        But if any item object in the array satisfies the document, the document is filtered into the results. 
-        Basically works like a boolean expression where each key-value pair is a boolean value, each item object is a boolean term, and the array is a sum of boolean terms
-        Null or empty arrays have no filtering effect.
-    offset:
-      type: [integer, "null"]
-      title: Paging offset
-      maximum: 100000
-      default: 0
-    limit:
-      type: [integer, "null"]
-      title: Max results to return
-      default: 20
-      maximum: 1000
-    select:
-      type: [string, array, "null"]
-      items:
-        type: string
-      examples: [scientific_name, [scientific_name, id]]
-      default: null
-      description: Document attributes to keep in the results
-query: |
-  LET search_text__norm = REGEX_REPLACE(LOWER(TRIM(@search_text)), "\\s+", " ")
-  LET search_text__first_exact_tok = REGEX_SPLIT(search_text__norm, " ")[0]
-  LET search_text__icu_toks = TOKENS(@search_text, "icu_tokenize") /* db analyzer icu_tokenize */
-  LET search_text__wordboundmod_icu_toks = (
-      FOR tok IN search_text__icu_toks
-          RETURN REGEX_REPLACE(tok, ",.*", "")  /* commas cannot be escaped */
-  )
-  LET search_text__fulltext = CONCAT_SEPARATOR(", ",
-      FOR tok IN search_text__wordboundmod_icu_toks
-          RETURN CONCAT("prefix:", tok)
-  )
-  LET filter_attr_expr = @filter_attr_expr ? @filter_attr_expr : []  /* null to [] */
-  LET search_text__wildcard = CONCAT("%", CONCAT_SEPARATOR("%", search_text__icu_toks), "%") /* e.g., %tok0%tok1%tokn% */
-  FOR doc IN FULLTEXT(@@coll, @search_attrkey, search_text__fulltext)
-      FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true
-      /* keep doc if any obj in filter_attr_expr is a sub-obj of doc */
-      FILTER LENGTH(filter_attr_expr) > 0 ? (
-          FOR term IN filter_attr_expr
-              RETURN MATCHES(doc, term)
-      ) ANY == true : true
-      LET attrval__norm = REGEX_REPLACE(LOWER(TRIM(doc.@search_attrkey)), "\\s+", " ")
-      LET attrval__icu_toks = TOKENS(doc.@search_attrkey, "icu_tokenize")
-      SORT LIKE(doc.@search_attrkey, search_text__wildcard, true) DESC,  /* icu tok ordering */
-          /* TODO - icu tok ordering with no insertions? */
-          CONTAINS(attrval__icu_toks[0], search_text__icu_toks[0], true) == 0 DESC,  /* first icu tok */
-          CONTAINS(attrval__norm, search_text__first_exact_tok, true) == 0 DESC,  /* first exact tok */
-          CONTAINS(attrval__norm, search_text__norm, true) == 0 DESC,  /* exact match */
-          doc.@search_attrkey  /* lexical */
-      LIMIT @offset ? @offset : 0, @limit ? @limit : 20
-      RETURN @select ? KEEP(doc, @select) : doc
diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py
index 99bd4d44..f7acb620 100644
--- a/spec/test/stored_queries/test_fulltext_search.py
+++ b/spec/test/stored_queries/test_fulltext_search.py
@@ -1,12 +1,9 @@
 """
 Tests for stored queries involving a fulltext search:
-* Generic fulltext_search (should be used with caution because it can be slow and timeout at 60s)
 * Taxonomy taxonomy_search_species_strain
 * Taxonomy taxonomy_search_species_strain_no_sort
 
 The latter two are switched between depending on the length of the search text.
-These stored query tests  are all bundled in one test file because their original purpose is to do a species/strain
-name search on the ncbi_taxon collection
 
 These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images.
 """
@@ -251,148 +248,6 @@ def test_prefix_hit(self):
         )
 
 
-class TestFulltextSearchStoredQuery(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        check_spec_test_env()
-        create_test_docs("ncbi_taxon", ncbi_taxa)
-
-    def test_ncbi_taxon_scinames(self):
-        """Happy path"""
-        for sciname in scinames_test_all:
-            _fulltext_search_query(
-                self,
-                coll="ncbi_taxon",
-                search_attrkey="scientific_name",
-                search_text=sciname,
-                ts=_NOW if sciname in scinames_test_latest else None,
-                filter_attr_expr=[
-                    {"rank": "species"},
-                    {"rank": "strain"},
-                    {"strain": True},
-                ],
-                offset=None,
-                limit=LIMIT,
-                select="scientific_name",
-                # ---
-                expect_error=False,
-                expect_hit=True,
-            )
-
-    def test_null_bind_params(self):
-        """Leave off parameters"""
-        for sciname in scinames_test_all:
-            _fulltext_search_query(
-                self,
-                coll="ncbi_taxon",
-                search_attrkey="scientific_name",
-                search_text=sciname,
-                ts=None,
-                filter_attr_expr=None,
-                offset=None,
-                limit=None,
-                select=None,
-                # ---
-                expect_error=False,
-                expect_hit=True,
-            )
-
-    def test_fully_specified_bind_params(self):
-        """Specify all parameters"""
-        for sciname in scinames_test_all:
-            _fulltext_search_query(
-                self,
-                coll="ncbi_taxon",
-                search_attrkey="scientific_name",
-                search_text=sciname,
-                ts=_NOW if sciname in scinames_test_latest else None,
-                filter_attr_expr=[
-                    {"rank": "species"},
-                    {"rank": "strain"},
-                    {"strain": True},
-                ],
-                offset=0,
-                limit=LIMIT,
-                select=["id", "scientific_name"],
-                # ---
-                expect_error=False,
-                expect_hit=True,
-            )
-
-    def test_extra_params(self):
-        """Extra params not in spec/aql"""
-        _fulltext_search_query(
-            self,
-            coll="ncbi_taxon",
-            search_attrkey="scientific_name",
-            search_text="esch",
-            ts=None,
-            filter_attr_expr=[
-                {"rank": "species"},
-                {"rank": "strain"},
-                {"strain": True},
-            ],
-            offset=0,
-            limit=LIMIT,
-            select=["id", "scientific_name"],
-            extra_unused_param=42,
-            # ---
-            expect_error=("Additional properties are not allowed"),
-        )
-
-    def test_validation_fail(self):
-        _fulltext_search_query(
-            self,
-            coll=[],
-            search_attrkey=42,
-            search_text={"hi": 1},
-            ts=None,
-            filter_attr_expr=None,
-            offset=None,
-            limit=None,
-            select=None,
-            # ---
-            expect_error="[] is not of type 'string'",
-        )
-
-    def test_aql_error(self):
-        for sciname in scinames_test_all:
-            _fulltext_search_query(
-                self,
-                coll="ncbi_taxon",
-                search_attrkey="fake_attrkey",
-                search_text=sciname,
-                ts=None,
-                filter_attr_expr=None,
-                offset=None,
-                limit=None,
-                select=None,
-                # ---
-                expect_error=True,
-            )
-
-    def test_no_hit(self):
-        for sciname in scinames_test_all:
-            _fulltext_search_query(
-                self,
-                coll="ncbi_taxon",
-                search_attrkey="scientific_name",
-                search_text=sciname[::-1],
-                ts=None,
-                filter_attr_expr=None,
-                offset=None,
-                limit=None,
-                select=None,
-                # ---
-                expect_error=False,
-                expect_hit=False,
-                expected_hits=[],
-            )
-
-
-# --- Test helpers ---
-
-
 def _switch_taxonomy_search_species_strain_queries(search_text):
     return (
         "taxonomy_search_species_strain_no_sort"