Merge pull request #177 from home-assistant/synesthesiam-20241025-faster-recognizer

Faster and more accurate recognition
synesthesiam authored Nov 11, 2024
2 parents ffa4ed6 + 889084f commit 230fafa
Showing 20 changed files with 2,144 additions and 1,422 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog

## 2.0.0

- Allow wildcards to be followed by expansion rules and lists
- Use regular expressions to filter sentence templates
- Add `filter_with_regex` to intent settings and intent data (`false` disables regex filtering)
- Filter text slot list values by required/excluded context during matching
- Use a trie to filter range slot list values based on remaining text to be matched
- Add `required_keywords` section to intent data to skip sentences without specific keywords
- Preserve case during matching
- Strip punctuation before text processing
- Remove extraneous whitespace from the end of wildcards
- Refactor string matching code into `string_matcher.py`

## 1.8.0

- Bump `unicode-rbnf` to 2.0.0
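The `required_keywords` and per-block `filter_with_regex` options from the 2.0.0 entries above can be sketched as follows; the dictionary layout is an assumption based on the YAML comments and `_parse_data_settings` code later in this diff, so treat it as illustrative rather than authoritative.

```python
from hassil import Intents, recognize

# Hypothetical intent definition exercising two 2.0.0 additions:
# - required_keywords: skip this block for input lacking the keyword "turn"
# - settings.filter_with_regex: disable regex pre-filtering for this block
intents = Intents.from_dict(
    {
        "language": "en",
        "intents": {
            "TurnOn": {
                "data": [
                    {
                        "sentences": ["turn on [the] {name}"],
                        "required_keywords": ["turn"],
                        "settings": {"filter_with_regex": False},
                    }
                ]
            }
        },
        "lists": {"name": {"values": ["kitchen light", "ceiling fan"]}},
    }
)

result = recognize("turn on the kitchen light", intents)
if result is not None:
    print(result.intent.name, result.entities["name"].value)
```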
6 changes: 3 additions & 3 deletions README.md
@@ -62,11 +62,11 @@ Exclude the `-n` argument to sample all possible sentences.
Uses a custom parser written in Python.

* Alternative words or phrases
* `(red | green | blue)`
* `turn(s | ed | ing)`
* `(red|green|blue)`
* `turn(s|ed|ing)`
* Optional words or phrases
* `[the]`
* `[this | that]`
* `[this|that]`
* `light[s]`
* Permutations of words or phrases
* `(patience; you must have) my young Padawan`
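A hypothetical end-to-end use of the alternative and optional syntax listed above (permutations work the same way with `;` separators); the dictionary layout mirrors the `from_dict` comments in `hassil/intents.py` further down this diff.

```python
from hassil import Intents, recognize

# Hypothetical templates using alternatives turn(s|ed|ing), the optional
# word [the], and the optional suffix light[s] from the README list above.
intents = Intents.from_dict(
    {
        "language": "en",
        "intents": {
            "LightState": {
                "data": [{"sentences": ["turn(s|ed|ing) [the] light[s] (on|off)"]}]
            }
        },
    }
)

for text in ("turning the lights on", "turned light off"):
    result = recognize(text, intents)
    print(text, "->", result.intent.name if result else None)
```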
2 changes: 1 addition & 1 deletion hassil/VERSION
@@ -1 +1 @@
1.8.0
2.0.0
2 changes: 1 addition & 1 deletion hassil/__init__.py
@@ -10,4 +10,4 @@
)
from .intents import Intents
from .parse_expression import parse_sentence
from .recognize import is_match, recognize, recognize_all
from .recognize import is_match, recognize, recognize_all, recognize_best
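The newly exported `recognize_best` sits alongside `recognize` and `recognize_all`; a minimal sketch of how it might be called, assuming it takes the same text and `Intents` arguments as the other recognizers (its extra keyword options are not shown in this diff).

```python
from hassil import Intents, recognize_all, recognize_best

def describe_matches(text: str, intents: Intents) -> None:
    # recognize_all yields every candidate match for the text.
    for result in recognize_all(text, intents):
        print("candidate:", result.intent.name)
    # recognize_best (new in 2.0.0) returns a single preferred match or None.
    best = recognize_best(text, intents)
    print("best:", best.intent.name if best else None)
```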
112 changes: 0 additions & 112 deletions hassil/edit_distance.py

This file was deleted.

13 changes: 13 additions & 0 deletions hassil/errors.py
@@ -0,0 +1,13 @@
"""Errors for hassil."""


class HassilError(Exception):
"""Base class for hassil errors"""


class MissingListError(HassilError):
"""Error when a {slot_list} is missing."""


class MissingRuleError(HassilError):
"""Error when an <expansion_rule> is missing."""
58 changes: 58 additions & 0 deletions hassil/expression.py
@@ -1,5 +1,6 @@
"""Classes for representing sentence templates."""

import re
from abc import ABC
from dataclasses import dataclass, field
from enum import Enum
@@ -21,6 +22,8 @@ class TextChunk(Expression):
# Set in __post_init__
original_text: str = None # type: ignore

parent: "Optional[Sequence]" = None

def __post_init__(self):
if self.original_text is None:
self.original_text = self.text
@@ -59,6 +62,8 @@ class Sequence(Expression):
# Group or alternative
type: SequenceType = SequenceType.GROUP

is_optional: bool = False

def text_chunk_count(self) -> int:
"""Return the number of TextChunk expressions in this sequence (recursive)."""
num_text_chunks = 0
@@ -134,3 +139,56 @@ class Sentence(Sequence):
"""Sequence representing a complete sentence template."""

text: Optional[str] = None
pattern: Optional[re.Pattern] = None

def compile(self, expansion_rules: Dict[str, "Sentence"]) -> None:
if self.pattern is not None:
# Already compiled
return

pattern_chunks: List[str] = []
self._compile_expression(self, pattern_chunks, expansion_rules)

pattern_str = "".join(pattern_chunks).replace(r"\ ", r"[ ]*")
self.pattern = re.compile(f"^{pattern_str}$", re.IGNORECASE)

def _compile_expression(
self, exp: Expression, pattern_chunks: List[str], rules: Dict[str, "Sentence"]
):
if isinstance(exp, TextChunk):
# Literal text
chunk: TextChunk = exp
if chunk.text:
escaped_text = re.escape(chunk.text)
pattern_chunks.append(escaped_text)
elif isinstance(exp, Sequence):
# Linear sequence or alternative choices
seq: Sequence = exp
if seq.type == SequenceType.GROUP:
# Linear sequence
for item in seq.items:
self._compile_expression(item, pattern_chunks, rules)
elif seq.type == SequenceType.ALTERNATIVE:
# Alternative choices
if seq.items:
pattern_chunks.append("(?:")
for item in seq.items:
self._compile_expression(item, pattern_chunks, rules)
pattern_chunks.append("|")
pattern_chunks[-1] = ")"
else:
raise ValueError(seq)
elif isinstance(exp, ListReference):
# Slot list
pattern_chunks.append("(?:.+)")

elif isinstance(exp, RuleReference):
# Expansion rule
rule_ref: RuleReference = exp
if rule_ref.rule_name not in rules:
raise ValueError(rule_ref)

e_rule = rules[rule_ref.rule_name]
self._compile_expression(e_rule, pattern_chunks, rules)
else:
raise ValueError(exp)
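The `compile` method above builds a coarse, case-insensitive regular expression used only to rule sentence templates out quickly: literal chunks are escaped, alternatives become non-capturing groups, list references collapse to `(?:.+)`, and escaped spaces relax to `[ ]*`. A plain-`re` sketch of that idea follows; the pattern is hand-written to approximate what might be produced for a template like `turn on [the] {name}`, not generated by hassil.

```python
import re

# Hand-built approximation of a compiled template pattern: literal text
# escaped, the optional word as an alternative with an empty branch, the
# {name} list reference as (?:.+), and spaces relaxed to [ ]*.
pattern = re.compile(r"^turn[ ]*on[ ]*(?:the|)[ ]*(?:.+)$", re.IGNORECASE)

candidates = [
    "Turn on the kitchen light",  # plausible -> keep for full matching
    "what is the temperature",    # cannot match -> filtered out early
]
for text in candidates:
    print(text, "->", bool(pattern.match(text)))
```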
36 changes: 35 additions & 1 deletion hassil/intents.py
@@ -157,6 +157,14 @@ class WildcardSlotList(SlotList):
"""Matches as much text as possible."""


@dataclass
class IntentDataSettings:
"""Settings for intent data."""

filter_with_regex: bool = True
"""Use regular expressions compiled from sentence patterns to filter possible matches."""


@dataclass(frozen=True)
class IntentData:
"""Block of sentences and known slots for an intent."""
@@ -188,6 +196,12 @@ class IntentData:
metadata: Optional[Dict[str, Any]] = None
"""Metadata that will be passed into the result if matched."""

required_keywords: Optional[Set[str]] = None
"""Keywords that must be present for any sentence to match."""

settings: IntentDataSettings = field(default_factory=IntentDataSettings)
"""Settings for block of sentences."""

@cached_property
def sentences(self) -> List[Sentence]:
"""Sentence templates that match this intent."""
@@ -240,6 +254,9 @@ class IntentsSettings:
ignore_whitespace: bool = False
"""True if whitespace should be ignored during matching."""

filter_with_regex: bool = True
"""Use regular expressions compiled from sentence patterns to filter possible matches."""


@dataclass
class Intents:
@@ -284,6 +301,7 @@ def from_dict(input_dict: Dict[str, Any]) -> "Intents":
# language: "<code>"
# settings:
# ignore_whitespace: false
# filter_with_regex: false
# intents:
# IntentName:
# data:
@@ -333,6 +351,14 @@ def from_dict(input_dict: Dict[str, Any]) -> "Intents":
response=data_dict.get("response"),
wildcard_list_names=wildcard_list_names,
metadata=data_dict.get("metadata"),
required_keywords=(
set(data_dict["required_keywords"])
if "required_keywords" in data_dict
else None
),
settings=_parse_data_settings(
data_dict.get("settings", {})
),
)
for data_dict in intent_dict["data"]
],
@@ -413,7 +439,15 @@ def _parse_list(
def _parse_settings(settings_dict: Dict[str, Any]) -> IntentsSettings:
"""Parse intent settings."""
return IntentsSettings(
ignore_whitespace=settings_dict.get("ignore_whitespace", False)
ignore_whitespace=settings_dict.get("ignore_whitespace", False),
filter_with_regex=settings_dict.get("filter_with_regex", True),
)


def _parse_data_settings(settings_dict: Dict[str, Any]) -> IntentDataSettings:
"""Parse intent data settings."""
return IntentDataSettings(
filter_with_regex=settings_dict.get("filter_with_regex", True),
)


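A sketch of the file-level `settings` block parsed by `_parse_settings` above, combined with a per-block override handled by `_parse_data_settings`; key names follow the commented layout in `from_dict`, and exposing the parsed result as `intents.settings` is an assumption.

```python
from hassil import Intents

# Hypothetical configuration: regex filtering disabled for the whole file,
# re-enabled for one data block that still benefits from the pre-filter.
intents = Intents.from_dict(
    {
        "language": "en",
        "settings": {"ignore_whitespace": False, "filter_with_regex": False},
        "intents": {
            "GetTime": {
                "data": [
                    {
                        "sentences": ["what time is it", "what is the time"],
                        "settings": {"filter_with_regex": True},
                    }
                ]
            }
        },
    }
)

# Assumes the parsed IntentsSettings is exposed as `intents.settings`.
print(intents.settings.filter_with_regex)  # False at the file level
```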
62 changes: 62 additions & 0 deletions hassil/models.py
@@ -0,0 +1,62 @@
"""Shared models."""

from abc import ABC
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union

from .util import PUNCTUATION_ALL


@dataclass
class MatchEntity:
"""Named entity that has been matched from a {slot_list}"""

name: str
"""Name of the entity."""

value: Any
"""Value of the entity."""

text: str
"""Original value text."""

metadata: Optional[Dict[str, Any]] = None
"""Entity metadata."""

is_wildcard: bool = False
"""True if entity is a wildcard."""

is_wildcard_open: bool = True
"""While True, wildcard can continue matching."""

@property
def text_clean(self) -> str:
"""Trimmed text with punctuation removed."""
return PUNCTUATION_ALL.sub("", self.text.strip())


@dataclass
class UnmatchedEntity(ABC):
"""Base class for unmatched entities."""

name: str
"""Name of entity that should have matched."""


@dataclass
class UnmatchedTextEntity(UnmatchedEntity):
"""Text entity that should have matched."""

text: str
"""Text that failed to match slot values."""

is_open: bool = True
"""While True, entity can continue matching."""


@dataclass
class UnmatchedRangeEntity(UnmatchedEntity):
"""Range entity that should have matched."""

value: Union[int, float]
"""Value of entity that was out of range."""