From 6165e5acfcc157970d38d23a098273dd64cfdd9e Mon Sep 17 00:00:00 2001
From: kmierzej <kmierzej@192.168.0.111>
Date: Sun, 26 Dec 2021 16:35:53 +0100
Subject: [PATCH] Search for substrings using the __contains lookup on Lexem.

---
 django_native_search/manager.py | 63 ++++++++++++++++-------------
 django_native_search/models.py  | 70 ++++++++++++++-------------------
 setup.py                        |  6 +--
 3 files changed, 69 insertions(+), 70 deletions(-)

diff --git a/django_native_search/manager.py b/django_native_search/manager.py
index 9d0328c..d9d9e88 100644
--- a/django_native_search/manager.py
+++ b/django_native_search/manager.py
@@ -1,17 +1,14 @@
 import copy
 from django.core.exceptions import FieldDoesNotExist
-from django.db.models import (F, Value, Min, OuterRef, Count, FloatField, QuerySet, Q, Prefetch,
-                              ExpressionWrapper)
+from django.db.models import (F, Value, Min, Count, FloatField, QuerySet, Q, Prefetch,
+                              OuterRef, ExpressionWrapper)
 from django.db.models.manager import BaseManager, Manager
 from django.db.models.functions import Abs
-from django.conf import settings
 import logging
 from functools import cache
+from django.db.models.expressions import When, Case
 
 
-
-MAX_RANKING_KEYWORDS_COUNT=getattr(settings,"SEARCH_MAX_RANKING_KEYWORDS_COUNT", 3)
-
 logger=logging.getLogger(__name__)
 
 
@@ -37,24 +34,28 @@ def apply_filter(self, q):
         filtered.search_conditions.append(q)
         return filtered
     
+    def search_one(self, condition):
+        return self.apply_filter(condition).distinct().annotate_rank().order_by("rank")
+    
     def search(self, query):
         ranking=self
         filtered=self
         conditions=self.model.parse_query(query)
-        sticked=False
+        if len(conditions) == 1:
+            return self.search_one(conditions[0])
         for q in conditions:
             ranking=ranking.apply_filter(q).annotate_rank()
-            if getattr(q,'sticky', None):
+            if getattr(q.token,'sticky', None):
                 ranking=ranking.filter(d=1)
-                sticked=True
-            filtered=self.apply_filter(q).filter(pk__in=filtered.all())
+                filtered=filtered.filter(pk__in=ranking.values("pk"))
+                
+            if filtered is not self:
+                filtered = self.filter(pk__in=filtered.all())
+            filtered = filtered.apply_filter(q)
         
         if filtered is self:
             return self
-
-        if sticked:
-            filtered=filtered.filter(pk__in=ranking)
-
+        
         results = self.filter(pk__in=filtered)
         results = results.annotate(rank=ranking.filter(pk=OuterRef("pk")).values("rank")).order_by("rank")
         results.search_conditions=conditions
@@ -62,24 +63,32 @@ def search(self, query):
     
     def annotate_rank(self):
         ranking=self
-        
-        keycount = len(self.search_conditions)
-        
-        if keycount==1:
-            ranking=ranking.annotate(dsum=Value(1, output_field=FloatField()))
+        if "p" in ranking.query.annotations:
+            ranking=ranking.alias(d=F("occurrence__position")-F("p"))
+            ranking=ranking.alias(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum"))
         else:
-            ranking=ranking.annotate(d=F("occurrence__position")-F("p"))
-            ranking=ranking.annotate(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum"))
-        
-        if keycount<=MAX_RANKING_KEYWORDS_COUNT:
-            ranking=ranking.annotate(
-                    rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField()))
+            ranking=ranking.alias(dsum=Value(1, output_field=FloatField()))
+            
+        ranking=ranking.annotate(
+                rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField()))
 
-        ranking=ranking.annotate(p=F("occurrence__position"))
+        ranking=ranking.alias(p=F("occurrence__position"))
         return ranking
     
     def prefetch_matches(self):
-        qs=self.model.occurrences.filter(Q(*self.search_conditions, _connector=Q.OR))
+        conditions=[]
+        tokens=[]
+        for condition in self.search_conditions:
+            conditions.append(When(condition, then=Value(len(tokens))))
+            tokens.append(condition.token)
+        qs=self.model.occurrences.annotate(token=Case(*conditions)).filter(token__isnull=False)
+        class Decor(qs.__class__):
+            def __iter__(self):
+                for obj in super().__iter__():
+                    if isinstance(obj.token, int):
+                        obj.token=tokens[obj.token]
+                    yield obj
+        qs.__class__ = Decor
         return self.prefetch_related(Prefetch("occurrences", 
                                   queryset=qs,
                                   to_attr="matches"))
diff --git a/django_native_search/models.py b/django_native_search/models.py
index d5a5532..96e6d94 100644
--- a/django_native_search/models.py
+++ b/django_native_search/models.py
@@ -7,18 +7,16 @@
 
 from django.template.loader import render_to_string
 from django.utils.functional import cached_property
-from django.db.models.signals import post_save
-from django.dispatch import receiver
 
 from .manager import IndexEntryManager, IndexManager
 from django.utils.safestring import mark_safe
 from django.conf import settings
 from django.contrib.contenttypes.models import ContentType
 from django_native_search.fields import OccurrencesField
+from django.db.models.functions.text import Length
 
 
-MIN_TAIL_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2)
-MAX_TAIL_COUNT_IN_QUERY=getattr(settings, "SEARCH_MAX_SUBTSTR_COUNT_IN_QUERY", 300)
+MIN_SUBSTR_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2)
 MAX_EXCERPT_FRAGMENTS=getattr(settings, "SEARCH_MAX_EXCERPT_FRAGMENTS", 5)
 EXCERPT_FRAGMENT_START_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_START_OFFSET", -3)
 EXCERPT_FRAGMENT_END_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_END_OFFSET", 6)
@@ -35,26 +33,6 @@ class Meta:
     def __str__(self):
         return self.surface
 
-class LexemTail(models.Model):
-    lexem=models.ForeignKey(Lexem, on_delete=models.CASCADE, 
-                            related_name="tails", related_query_name='tail')
-    surface=models.CharField(max_length=255, db_index=True)
-    
-    class Meta:
-        indexes=[django_expression_index.ExpressionIndex(expressions=[Lower('surface')])]
-        unique_together=('lexem','surface')
-    
-    def __str__(self):
-        return self.surface
-
-@receiver(post_save, sender=Lexem)
-def update_lexem_tail(instance, **kwargs):
-    instance.tails.all().delete()
-    for i in range(len(instance.surface)):
-        tail=instance.surface[i:]
-        if len(tail)>MIN_TAIL_LEN:
-            instance.tails.create(surface=tail)
-
 
 models.CharField.register_lookup(Lower)
 
@@ -114,6 +92,8 @@ def tokenize(cls, text):
                 if quotes%2>0:
                     sticky=not sticky
             i=res.end()
+            if not sticky and token == text and len(token) >= MIN_SUBSTR_LEN:
+                token.lookup = "contains"
             yield token
     
     @classmethod
@@ -122,19 +102,15 @@ def parse_query(cls, query):
         if query.islower():
             lookup +="__lower"
         tokens=list(cls.tokenize(query))
-        if len(tokens)==1 and len(tokens[0])>MIN_TAIL_LEN:
-            tail_q=LexemTail.objects.filter(**{
-                lookup+"__gte":tokens[0],
-                lookup+"__lt":tokens[0]+chr(0x10FFFF)})
-            if tail_q.count()<=MAX_TAIL_COUNT_IN_QUERY:
-                return [models.Q(lexem__in=Lexem.objects.filter(
-                    tail__in=tail_q))]
         
         query=[]
         for token in tokens:
-            condition=models.Q(lexem__in=Lexem.objects.filter(**{lookup:token}))
-            if query:
-                condition.sticky=getattr(token,'sticky',False)
+            token.lookup = lookup + "__" + getattr(token,"lookup", "exact")
+            lqs = Lexem.objects.filter(**{token.lookup: token})
+            if token.lookup.endswith("__contains"):
+                lqs=lqs.order_by(Length("surface"))[:20000]
+            condition=models.Q(lexem__in=lqs)
+            condition.token = token
             query.append(condition)
         return query
         
@@ -170,7 +146,7 @@ def excerpt(self):
         return self.build_excerpt(words, matches)
     
     def build_excerpt(self, words, matches):
-        highlight=set([m.position for m in matches])
+        highlight={m.position:m for m in matches}
         excerpt=""
         pos=-1
         for word in words.select_related('lexem'):
@@ -179,18 +155,32 @@ def build_excerpt(self, words, matches):
             if pos>0:
                 excerpt+=escape(word.prefix)
             
-            if word.position in highlight:
-                excerpt+= self.highlight(word)
+            matched = highlight.get(word.position)
+            if matched:
+                matched.lexem = word.lexem
+                excerpt+= self.highlight(matched)
             else:
-                excerpt+=escape(word.lexem.surface)
+                excerpt+=self.to_html(word.lexem.surface)
             pos=word.position+1
         if pos<self.length:
             excerpt+="..."
         return mark_safe(excerpt)
     
     def highlight(self, word):
-        surface=escape(word.lexem.surface)
-        return f"<em>{surface}</em>"
+        if word.token.lookup.endswith("contains"):
+            flags = re.IGNORECASE if "__lower__" in word.token.lookup else 0
+            pattern = re.compile("(.*?)(("+re.escape(word.token)+")|$)", flags)
+            return "".join([self.to_html(part[0]) + self.to_html(part[1], True) 
+                            for part in pattern.findall(word.lexem.surface)])
+        return self.to_html(word.lexem.surface, True)
+
+    def to_html(self, surface, highlight=False):
+        if not surface:
+            return ""
+        surface=escape(surface)
+        if highlight:
+            surface = f"<em>{surface}</em>"
+        return surface
     
     @cached_property
     def rendered_text(self):
diff --git a/setup.py b/setup.py
index e919b12..63e4c77 100644
--- a/setup.py
+++ b/setup.py
@@ -6,13 +6,13 @@
 setuptools.setup(
     author='Kamil Mierzejewski',
     name='django-native-search',
-    version='0.5.4',
+    version='0.6',
     description='A simple search engine using native django database backend.',
     long_description=long_description,
     long_description_content_type='text/markdown',
     url='https://github.com/kmierzeje/django-native-search',
     packages=setuptools.find_packages(),
-    install_requires=['django>=3.0.8', 'django-expression-index>=0.1.0'],
+    install_requires=['django>=3.2', 'django-expression-index>=0.1.0'],
     classifiers=[
         'Development Status :: 4 - Beta',
         'Environment :: Web Environment',
@@ -21,6 +21,6 @@
         'License :: OSI Approved :: BSD License',
         'Natural Language :: English',
         'Operating System :: OS Independent',
-        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.9',
     ],
 )