Merge pull request #177 from home-assistant/synesthesiam-20241025-faster-recognizer

Faster and more accurate recognition
synesthesiam authored Nov 11, 2024
2 parents ffa4ed6 + 889084f commit 230fafa
Showing 20 changed files with 2,144 additions and 1,422 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog

## 2.0.0

- Allow wildcards to be followed by expansion rules and lists
- Use regular expressions to filter sentence templates
- Add `filter_with_regex` to intent settings and intent data (`false` disables regex filtering)
- Filter text slot list values by required/excluded context during matching
- Use a trie to filter range slot list values based on remaining text to be matched
- Add `required_keywords` section to intent data to skip sentences without specific keywords
- Preserve case during matching
- Strip punctuation before text processing
- Remove extraneous whitespace from the end of wildcards
- Refactor string matching code into `string_matcher.py`

## 1.8.0

- Bump `unicode-rbnf` to 2.0.0
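The `required_keywords` and per-block `filter_with_regex` options from the 2.0.0 entries above can be sketched as follows; the dictionary layout is an assumption based on the YAML comments and `_parse_data_settings` code later in this diff, so treat it as illustrative rather than authoritative.

```python
from hassil import Intents, recognize

# Hypothetical intent definition exercising two 2.0.0 additions:
# - required_keywords: skip this block for input lacking the keyword "turn"
# - settings.filter_with_regex: disable regex pre-filtering for this block
intents = Intents.from_dict(
    {
        "language": "en",
        "intents": {
            "TurnOn": {
                "data": [
                    {
                        "sentences": ["turn on [the] {name}"],
                        "required_keywords": ["turn"],
                        "settings": {"filter_with_regex": False},
                    }
                ]
            }
        },
        "lists": {"name": {"values": ["kitchen light", "ceiling fan"]}},
    }
)

result = recognize("turn on the kitchen light", intents)
if result is not None:
    print(result.intent.name, result.entities["name"].value)
```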
6 changes: 3 additions & 3 deletions README.md
@@ -62,11 +62,11 @@ Exclude the `-n` argument to sample all possible sentences.
Uses a custom parser written in Python.

* Alternative words or phrases
* `(red | green | blue)`
* `turn(s | ed | ing)`
* `(red|green|blue)`
* `turn(s|ed|ing)`
* Optional words or phrases
* `[the]`
* `[this | that]`
* `[this|that]`
* `light[s]`
* Permutations of words or phrases
* `(patience; you must have) my young Padawan`
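A hypothetical end-to-end use of the alternative and optional syntax listed above (permutations work the same way with `;` separators); the dictionary layout mirrors the `from_dict` comments in `hassil/intents.py` further down this diff.

```python
from hassil import Intents, recognize

# Hypothetical templates using alternatives turn(s|ed|ing), the optional
# word [the], and the optional suffix light[s] from the README list above.
intents = Intents.from_dict(
    {
        "language": "en",
        "intents": {
            "LightState": {
                "data": [{"sentences": ["turn(s|ed|ing) [the] light[s] (on|off)"]}]
            }
        },
    }
)

for text in ("turning the lights on", "turned light off"):
    result = recognize(text, intents)
    print(text, "->", result.intent.name if result else None)
```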
2 changes: 1 addition & 1 deletion hassil/VERSION
@@ -1 +1 @@
1.8.0
2.0.0
2 changes: 1 addition & 1 deletion hassil/__init__.py
@@ -10,4 +10,4 @@
)
from .intents import Intents
from .parse_expression import parse_sentence
from .recognize import is_match, recognize, recognize_all
from .recognize import is_match, recognize, recognize_all, recognize_best
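The newly exported `recognize_best` sits alongside `recognize` and `recognize_all`; a minimal sketch of how it might be called, assuming it takes the same text and `Intents` arguments as the other recognizers (its extra keyword options are not shown in this diff).

```python
from hassil import Intents, recognize_all, recognize_best

def describe_matches(text: str, intents: Intents) -> None:
    # recognize_all yields every candidate match for the text.
    for result in recognize_all(text, intents):
        print("candidate:", result.intent.name)
    # recognize_best (new in 2.0.0) returns a single preferred match or None.
    best = recognize_best(text, intents)
    print("best:", best.intent.name if best else None)
```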
112 changes: 0 additions & 112 deletions hassil/edit_distance.py

This file was deleted.

13 changes: 13 additions & 0 deletions hassil/errors.py
@@ -0,0 +1,13 @@
"""Errors for hassil."""


class HassilError(Exception):
"""Base class for hassil errors"""


class MissingListError(HassilError):
"""Error when a {slot_list} is missing."""


class MissingRuleError(HassilError):
"""Error when an <expansion_rule> is missing."""
58 changes: 58 additions & 0 deletions hassil/expression.py
@@ -1,5 +1,6 @@
"""Classes for representing sentence templates."""

import re
from abc import ABC
from dataclasses import dataclass, field
from enum import Enum
@@ -21,6 +22,8 @@ class TextChunk(Expression):
# Set in __post_init__
original_text: str = None # type: ignore

parent: "Optional[Sequence]" = None

def __post_init__(self):
if self.original_text is None:
self.original_text = self.text
@@ -59,6 +62,8 @@ class Sequence(Expression):
# Group or alternative
type: SequenceType = SequenceType.GROUP

is_optional: bool = False

def text_chunk_count(self) -> int:
"""Return the number of TextChunk expressions in this sequence (recursive)."""
num_text_chunks = 0
@@ -134,3 +139,56 @@ class Sentence(Sequence):
"""Sequence representing a complete sentence template."""

text: Optional[str] = None
pattern: Optional[re.Pattern] = None

def compile(self, expansion_rules: Dict[str, "Sentence"]) -> None:
if self.pattern is not None:
# Already compiled
return

pattern_chunks: List[str] = []
self._compile_expression(self, pattern_chunks, expansion_rules)

pattern_str = "".join(pattern_chunks).replace(r"\ ", r"[ ]*")
self.pattern = re.compile(f"^{pattern_str}$", re.IGNORECASE)

def _compile_expression(
self, exp: Expression, pattern_chunks: List[str], rules: Dict[str, "Sentence"]
):
if isinstance(exp, TextChunk):
# Literal text
chunk: TextChunk = exp
if chunk.text:
escaped_text = re.escape(chunk.text)
pattern_chunks.append(escaped_text)
elif isinstance(exp, Sequence):
# Linear sequence or alternative choices
seq: Sequence = exp
if seq.type == SequenceType.GROUP:
# Linear sequence
for item in seq.items:
self._compile_expression(item, pattern_chunks, rules)
elif seq.type == SequenceType.ALTERNATIVE:
# Alternative choices
if seq.items:
pattern_chunks.append("(?:")
for item in seq.items:
self._compile_expression(item, pattern_chunks, rules)
pattern_chunks.append("|")
pattern_chunks[-1] = ")"
else:
raise ValueError(seq)
elif isinstance(exp, ListReference):
# Slot list
pattern_chunks.append("(?:.+)")

elif isinstance(exp, RuleReference):
# Expansion rule
rule_ref: RuleReference = exp
if rule_ref.rule_name not in rules:
raise ValueError(rule_ref)

e_rule = rules[rule_ref.rule_name]
self._compile_expression(e_rule, pattern_chunks, rules)
else:
raise ValueError(exp)
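The `compile` method above builds a coarse, case-insensitive regular expression used only to rule sentence templates out quickly: literal chunks are escaped, alternatives become non-capturing groups, list references collapse to `(?:.+)`, and escaped spaces relax to `[ ]*`. A plain-`re` sketch of that idea follows; the pattern is hand-written to approximate what might be produced for a template like `turn on [the] {name}`, not generated by hassil.

```python
import re

# Hand-built approximation of a compiled template pattern: literal text
# escaped, the optional word as an alternative with an empty branch, the
# {name} list reference as (?:.+), and spaces relaxed to [ ]*.
pattern = re.compile(r"^turn[ ]*on[ ]*(?:the|)[ ]*(?:.+)$", re.IGNORECASE)

candidates = [
    "Turn on the kitchen light",  # plausible -> keep for full matching
    "what is the temperature",    # cannot match -> filtered out early
]
for text in candidates:
    print(text, "->", bool(pattern.match(text)))
```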
36 changes: 35 additions & 1 deletion hassil/intents.py
@@ -157,6 +157,14 @@ class WildcardSlotList(SlotList):
"""Matches as much text as possible."""


@dataclass
class IntentDataSettings:
"""Settings for intent data."""

filter_with_regex: bool = True
"""Use regular expressions compiled from sentence patterns to filter possible matches."""


@dataclass(frozen=True)
class IntentData:
"""Block of sentences and known slots for an intent."""
@@ -188,6 +196,12 @@ class IntentData:
metadata: Optional[Dict[str, Any]] = None
"""Metadata that will be passed into the result if matched."""

required_keywords: Optional[Set[str]] = None
"""Keywords that must be present for any sentence to match."""

settings: IntentDataSettings = field(default_factory=IntentDataSettings)
"""Settings for block of sentences."""

@cached_property
def sentences(self) -> List[Sentence]:
"""Sentence templates that match this intent."""
@@ -240,6 +254,9 @@ class IntentsSettings:
ignore_whitespace: bool = False
"""True if whitespace should be ignored during matching."""

filter_with_regex: bool = True
"""Use regular expressions compiled from sentence patterns to filter possible matches."""


@dataclass
class Intents:
@@ -284,6 +301,7 @@ def from_dict(input_dict: Dict[str, Any]) -> "Intents":
# language: "<code>"
# settings:
# ignore_whitespace: false
# filter_with_regex: false
# intents:
# IntentName:
# data:
@@ -333,6 +351,14 @@ def from_dict(input_dict: Dict[str, Any]) -> "Intents":
response=data_dict.get("response"),
wildcard_list_names=wildcard_list_names,
metadata=data_dict.get("metadata"),
required_keywords=(
set(data_dict["required_keywords"])
if "required_keywords" in data_dict
else None
),
settings=_parse_data_settings(
data_dict.get("settings", {})
),
)
for data_dict in intent_dict["data"]
],
@@ -413,7 +439,15 @@ def _parse_list(
def _parse_settings(settings_dict: Dict[str, Any]) -> IntentsSettings:
"""Parse intent settings."""
return IntentsSettings(
ignore_whitespace=settings_dict.get("ignore_whitespace", False)
ignore_whitespace=settings_dict.get("ignore_whitespace", False),
filter_with_regex=settings_dict.get("filter_with_regex", True),
)


def _parse_data_settings(settings_dict: Dict[str, Any]) -> IntentDataSettings:
"""Parse intent data settings."""
return IntentDataSettings(
filter_with_regex=settings_dict.get("filter_with_regex", True),
)


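A sketch of the file-level `settings` block parsed by `_parse_settings` above, combined with a per-block override handled by `_parse_data_settings`; key names follow the commented layout in `from_dict`, and exposing the parsed result as `intents.settings` is an assumption.

```python
from hassil import Intents

# Hypothetical configuration: regex filtering disabled for the whole file,
# re-enabled for one data block that still benefits from the pre-filter.
intents = Intents.from_dict(
    {
        "language": "en",
        "settings": {"ignore_whitespace": False, "filter_with_regex": False},
        "intents": {
            "GetTime": {
                "data": [
                    {
                        "sentences": ["what time is it", "what is the time"],
                        "settings": {"filter_with_regex": True},
                    }
                ]
            }
        },
    }
)

# Assumes the parsed IntentsSettings is exposed as `intents.settings`.
print(intents.settings.filter_with_regex)  # False at the file level
```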
62 changes: 62 additions & 0 deletions hassil/models.py
@@ -0,0 +1,62 @@
"""Shared models."""

from abc import ABC
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union

from .util import PUNCTUATION_ALL


@dataclass
class MatchEntity:
"""Named entity that has been matched from a {slot_list}"""

name: str
"""Name of the entity."""

value: Any
"""Value of the entity."""

text: str
"""Original value text."""

metadata: Optional[Dict[str, Any]] = None
"""Entity metadata."""

is_wildcard: bool = False
"""True if entity is a wildcard."""

is_wildcard_open: bool = True
"""While True, wildcard can continue matching."""

@property
def text_clean(self) -> str:
"""Trimmed text with punctuation removed."""
return PUNCTUATION_ALL.sub("", self.text.strip())


@dataclass
class UnmatchedEntity(ABC):
"""Base class for unmatched entities."""

name: str
"""Name of entity that should have matched."""


@dataclass
class UnmatchedTextEntity(UnmatchedEntity):
"""Text entity that should have matched."""

text: str
"""Text that failed to match slot values."""

is_open: bool = True
"""While True, entity can continue matching."""


@dataclass
class UnmatchedRangeEntity(UnmatchedEntity):
"""Range entity that should have matched."""

value: Union[int, float]
"""Value of entity that was out of range."""