Skip to content

Commit

Permalink
feat: fuzzy matching completions (#671)
Browse files Browse the repository at this point in the history
* feat: fuzzy matching

* chore: updated changelog

* feat(completions): conditional fuzzy match

* (pr) review suggestions

* chore: updated snapshots to py3.10

* fix: add updated snapshot for 3.12

---------

Co-authored-by: Ted Conbeer <[email protected]>
  • Loading branch information
jspaezp and tconbeer authored Dec 9, 2024
1 parent a4be2bb commit 434775d
Show file tree
Hide file tree
Showing 6 changed files with 996 additions and 69 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Features
- Fuzzy matching for autocomplete ([#671](https://github.com/tconbeer/harlequin/pull/671))

## [1.25.2] - 2024-10-31

### Bug Fixes
Expand Down
89 changes: 69 additions & 20 deletions src/harlequin/autocomplete/completers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import itertools
import re
from collections.abc import Callable
from typing import Iterable

from harlequin.autocomplete.completion import HarlequinCompletion
Expand Down Expand Up @@ -42,16 +43,26 @@ def _label(c: HarlequinCompletion) -> str:
return f"{c.label} [{self.type_color}]{c.type_label}[/]"

match_val = prefix.lower()
matches: list[tuple[str, str]] = []

exact_matches = [
# Add exact matches
matches.extend(
(_label(c), c.value) for c in self.completions if c.match_val == match_val
]
matches = [
)
# Add prefix matches
matches.extend(
(_label(c), c.value)
for c in self.completions
if c.match_val.startswith(match_val)
]
return self._dedupe_labels((*exact_matches, *matches))
)
# Only add fuzzy matches if there are not enough exact matches
if len(matches) < 20:
matches.extend(
(_label(c), c.value)
for c in self._fuzzy_match(match_val, self.completions)
)

return self._dedupe_labels(matches)

def update_catalog(self, catalog: Catalog) -> None:
self._catalog_completions = build_catalog_completions(catalog=catalog)
Expand All @@ -74,6 +85,21 @@ def extend_catalog(self, parent: CatalogItem, items: list[CatalogItem]) -> None:
self._extra_completions,
)

@staticmethod
def _fuzzy_match(
    match_val: str, completions: list[HarlequinCompletion]
) -> list[HarlequinCompletion]:
    """
    Return the completions whose ``match_val`` loosely contains ``match_val``.

    Every character typed by the user must occur in the candidate, in the
    same order and ignoring case, with at most two other characters
    between consecutive typed characters. The result is a new list ordered
    from the shortest candidate ``match_val`` to the longest; candidates of
    equal length keep their relative input order.
    """
    # Each typed character is escaped (so it is matched literally) and
    # wrapped in a group; the lazy gap allows up to two stray characters
    # between neighbouring typed characters.
    gap = ".{0,2}?"
    char_groups = [f"({re.escape(char)})" for char in match_val]
    pattern = re.compile(f"^.*{gap.join(char_groups)}.*$", re.IGNORECASE)

    # Shortest candidates first: the working assumption is that a candidate
    # needing more inserted characters is less likely to be the one the
    # user meant.
    return sorted(
        (
            candidate
            for candidate in completions
            if pattern.match(candidate.match_val)
        ),
        key=lambda candidate: len(candidate.match_val),
    )

@staticmethod
def _merge_completions(
*completion_lists: list[HarlequinCompletion],
Expand Down Expand Up @@ -114,23 +140,46 @@ def _label(c: HarlequinCompletion) -> str:
value_prefix = "".join(
f"{w}{sep}" for w, sep in zip([*others, context], separators)
)
exact_matches = [
(
f"{value_prefix}{quote_char}{_label(c)}",
f"{value_prefix}{quote_char}{c.value}",
)
for c in self.completions
if c.match_val == match_val and c.context == match_context

context_completions = [
c for c in self.completions if c.context == match_context
]
matches = [
(
f"{value_prefix}{quote_char}{_label(c)}",
f"{value_prefix}{quote_char}{c.value}",

matches: list[tuple[str, str]] = []
# Add exact matches
matches.extend(
self.format_completion(c, quote_char, value_prefix, _label)
for c in context_completions
if c.match_val == match_val
)

# Add prefix matches
matches.extend(
self.format_completion(c, quote_char, value_prefix, _label)
for c in context_completions
if c.match_val.startswith(match_val)
)

# Only add fuzzy matches if there are not enough exact matches
if len(matches) < 20:
matches.extend(
self.format_completion(c, quote_char, value_prefix, _label)
for c in self._fuzzy_match(match_val, context_completions)
)
for c in self.completions
if c.match_val.startswith(match_val) and c.context == match_context
]
return self._dedupe_labels((*exact_matches, *matches))

return self._dedupe_labels(matches)

@staticmethod
def format_completion(
    completion: HarlequinCompletion,
    quote_char: str,
    value_prefix: str,
    label_fn: Callable,
) -> tuple[str, str]:
    """
    Build the ``(label, value)`` pair for a single completion.

    Both elements start with ``value_prefix`` immediately followed by
    ``quote_char``. The first element then carries whatever ``label_fn``
    returns for ``completion``; the second carries ``completion.value``.
    """
    # The lead-in is identical for the displayed label and inserted value.
    lead_in = f"{value_prefix}{quote_char}"
    display = f"{lead_in}{label_fn(completion)}"
    inserted = f"{lead_in}{completion.value}"
    return display, inserted

@staticmethod
def _merge_completions(
Expand Down
44 changes: 44 additions & 0 deletions tests/data/unit_tests/completions/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@


The completions were generated from this database:

```python
# Build the small DuckDB database file (iris.db) that the completions test
# data was generated from.
from sklearn.datasets import load_iris
import pandas as pd
import duckdb

# Load the iris dataset into a DataFrame, one column per feature
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Add the numeric class id and its readable name as two extra columns
iris_df["target"] = iris.target
iris_df["species"] = pd.Categorical.from_codes(iris.target, iris.target_names)

# Connect to DuckDB (this will create a new database if it doesn't exist)
con = duckdb.connect("iris.db")

# Create the table (skipped when it already exists)
con.execute("""
CREATE TABLE IF NOT EXISTS iris (
sepal_length FLOAT,
sepal_width FLOAT,
petal_length FLOAT,
petal_width FLOAT,
target INTEGER,
species VARCHAR
)
""")

# Insert the data
# NOTE(review): `iris_df` is presumably resolved by DuckDB to the local
# DataFrame above, with columns mapped to the table by position -- confirm.
# NOTE(review): the INSERT is unconditional, so re-running this script against
# an existing iris.db presumably appends the rows again -- confirm.
con.execute("INSERT INTO iris SELECT * FROM iris_df")

# Verify the data (optional)
result = con.execute("SELECT * FROM iris LIMIT 5").fetchall()
print("First 5 rows:", result)

# Close the connection
con.close()
```
Essentially, the database was created and the resulting completions were then
dumped to JSON using `dataclasses.asdict` and `json.dumps`. Some entries were
then deleted by hand to make the file smaller.

Loading

0 comments on commit 434775d

Please sign in to comment.