feat: fuzzy matching completions (#671)

* feat: fuzzy matching * chore: updated changelog * feat(completions): conditional fuzzy match * (pr) review suggestions * chore: updated snapshots to py3.10 * fix: add updated snapshot for 3.12 --------- Co-authored-by: Ted Conbeer <[email protected]>
tconbeer · Dec 9, 2024 · 434775d · 434775d
1 parent a4be2bb
commit 434775d
Show file tree

Hide file tree

Showing 6 changed files with 996 additions and 69 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Features
+- Fuzzy matching for autocomplete ([#671](https://github.com/tconbeer/harlequin/pull/671))
+
 ## [1.25.2] - 2024-10-31
 
 ### Bug Fixes

diff --git a/src/harlequin/autocomplete/completers.py b/src/harlequin/autocomplete/completers.py
@@ -2,6 +2,7 @@
 
 import itertools
 import re
+from collections.abc import Callable
 from typing import Iterable
 
 from harlequin.autocomplete.completion import HarlequinCompletion
@@ -42,16 +43,26 @@ def _label(c: HarlequinCompletion) -> str:
             return f"{c.label} [{self.type_color}]{c.type_label}[/]"
 
         match_val = prefix.lower()
+        matches: list[tuple[str, str]] = []
 
-        exact_matches = [
+        # Add exact matches
+        matches.extend(
             (_label(c), c.value) for c in self.completions if c.match_val == match_val
-        ]
-        matches = [
+        )
+        # Add prefix matches
+        matches.extend(
             (_label(c), c.value)
             for c in self.completions
             if c.match_val.startswith(match_val)
-        ]
-        return self._dedupe_labels((*exact_matches, *matches))
+        )
+        # Only add fuzzy matches if there are too few exact and prefix matches
+        if len(matches) < 20:
+            matches.extend(
+                (_label(c), c.value)
+                for c in self._fuzzy_match(match_val, self.completions)
+            )
+
+        return self._dedupe_labels(matches)
 
     def update_catalog(self, catalog: Catalog) -> None:
         self._catalog_completions = build_catalog_completions(catalog=catalog)
@@ -74,6 +85,21 @@ def extend_catalog(self, parent: CatalogItem, items: list[CatalogItem]) -> None:
             self._extra_completions,
         )
 
+    @staticmethod
+    def _fuzzy_match(
+        match_val: str, completions: list[HarlequinCompletion]
+    ) -> list[HarlequinCompletion]:
+        regex_base = ".{0,2}?".join(f"({re.escape(c)})" for c in match_val)
+        regex = "^.*" + regex_base + ".*$"
+        match_regex = re.compile(regex, re.IGNORECASE)
+        matches = [c for c in completions if match_regex.match(c.match_val)]
+
+        # Sort in ascending length.
+        # I am assuming here that more insertions are less likely to be
+        # the "right" match.
+        matches.sort(key=lambda c: len(c.match_val))
+        return matches
+
     @staticmethod
     def _merge_completions(
         *completion_lists: list[HarlequinCompletion],
@@ -114,23 +140,46 @@ def _label(c: HarlequinCompletion) -> str:
         value_prefix = "".join(
             f"{w}{sep}" for w, sep in zip([*others, context], separators)
         )
-        exact_matches = [
-            (
-                f"{value_prefix}{quote_char}{_label(c)}",
-                f"{value_prefix}{quote_char}{c.value}",
-            )
-            for c in self.completions
-            if c.match_val == match_val and c.context == match_context
+
+        context_completions = [
+            c for c in self.completions if c.context == match_context
         ]
-        matches = [
-            (
-                f"{value_prefix}{quote_char}{_label(c)}",
-                f"{value_prefix}{quote_char}{c.value}",
+
+        matches: list[tuple[str, str]] = []
+        # Add exact matches
+        matches.extend(
+            self.format_completion(c, quote_char, value_prefix, _label)
+            for c in context_completions
+            if c.match_val == match_val
+        )
+
+        # Add prefix matches
+        matches.extend(
+            self.format_completion(c, quote_char, value_prefix, _label)
+            for c in context_completions
+            if c.match_val.startswith(match_val)
+        )
+
+        # Only add fuzzy matches if there are too few exact and prefix matches
+        if len(matches) < 20:
+            matches.extend(
+                self.format_completion(c, quote_char, value_prefix, _label)
+                for c in self._fuzzy_match(match_val, context_completions)
             )
-            for c in self.completions
-            if c.match_val.startswith(match_val) and c.context == match_context
-        ]
-        return self._dedupe_labels((*exact_matches, *matches))
+
+        return self._dedupe_labels(matches)
+
+    @staticmethod
+    def format_completion(
+        completion: HarlequinCompletion,
+        quote_char: str,
+        value_prefix: str,
+        label_fn: Callable,
+    ) -> tuple[str, str]:
+        return (
+            f"{value_prefix}{quote_char}{label_fn(completion)}",
+            f"{value_prefix}{quote_char}{completion.value}",
+        )
 
     @staticmethod
     def _merge_completions(

diff --git a/tests/data/unit_tests/completions/README.md b/tests/data/unit_tests/completions/README.md
@@ -0,0 +1,44 @@
+
+
+The completions were generated from this database:
+
+```python
+from sklearn.datasets import load_iris
+import pandas as pd
+import duckdb
+
+# Load the iris dataset
+iris = load_iris()
+iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+iris_df["target"] = iris.target
+iris_df["species"] = pd.Categorical.from_codes(iris.target, iris.target_names)
+
+# Connect to DuckDB (this will create a new database if it doesn't exist)
+con = duckdb.connect("iris.db")
+
+# Create and insert data into the table
+con.execute("""
+    CREATE TABLE IF NOT EXISTS iris (
+        sepal_length FLOAT,
+        sepal_width FLOAT,
+        petal_length FLOAT,
+        petal_width FLOAT,
+        target INTEGER,
+        species VARCHAR
+    )
+""")
+
+# Insert the data
+con.execute("INSERT INTO iris SELECT * FROM iris_df")
+
+# Verify the data (optional)
+result = con.execute("SELECT * FROM iris LIMIT 5").fetchall()
+print("First 5 rows:", result)
+
+# Close the connection
+con.close()
+```
+Essentially, the database was created, and the resulting completions were then
+dumped to JSON using `dataclasses.asdict` and `json.dumps`. Some entries were
+then deleted manually to make the file smaller.
+