From 6165e5acfcc157970d38d23a098273dd64cfdd9e Mon Sep 17 00:00:00 2001 From: kmierzej Date: Sun, 26 Dec 2021 16:35:53 +0100 Subject: [PATCH] Search for substring by using __contains lookup in lexem. --- django_native_search/manager.py | 63 ++++++++++++++++------------- django_native_search/models.py | 70 ++++++++++++++------------------- setup.py | 6 +-- 3 files changed, 69 insertions(+), 70 deletions(-) diff --git a/django_native_search/manager.py b/django_native_search/manager.py index 9d0328c..d9d9e88 100644 --- a/django_native_search/manager.py +++ b/django_native_search/manager.py @@ -1,17 +1,14 @@ import copy from django.core.exceptions import FieldDoesNotExist -from django.db.models import (F, Value, Min, OuterRef, Count, FloatField, QuerySet, Q, Prefetch, - ExpressionWrapper) +from django.db.models import (F, Value, Min, Count, FloatField, QuerySet, Q, Prefetch, + OuterRef, ExpressionWrapper) from django.db.models.manager import BaseManager, Manager from django.db.models.functions import Abs -from django.conf import settings import logging from functools import cache +from django.db.models.expressions import When, Case - -MAX_RANKING_KEYWORDS_COUNT=getattr(settings,"SEARCH_MAX_RANKING_KEYWORDS_COUNT", 3) - logger=logging.getLogger(__name__) @@ -37,24 +34,28 @@ def apply_filter(self, q): filtered.search_conditions.append(q) return filtered + def search_one(self, condition): + return self.apply_filter(condition).distinct().annotate_rank().order_by("rank") + def search(self, query): ranking=self filtered=self conditions=self.model.parse_query(query) - sticked=False + if len(conditions) == 1: + return self.search_one(conditions[0]) for q in conditions: ranking=ranking.apply_filter(q).annotate_rank() - if getattr(q,'sticky', None): + if getattr(q.token,'sticky', None): ranking=ranking.filter(d=1) - sticked=True - filtered=self.apply_filter(q).filter(pk__in=filtered.all()) + filtered=filtered.filter(pk__in=ranking.values("pk")) + + if filtered is not self: + filtered = self.filter(pk__in=filtered.all()) + filtered = filtered.apply_filter(q) if filtered is self: return self - - if sticked: - filtered=filtered.filter(pk__in=ranking) - + results = self.filter(pk__in=filtered) results = results.annotate(rank=ranking.filter(pk=OuterRef("pk")).values("rank")).order_by("rank") results.search_conditions=conditions @@ -62,24 +63,32 @@ def search(self, query): def annotate_rank(self): ranking=self - - keycount = len(self.search_conditions) - - if keycount==1: - ranking=ranking.annotate(dsum=Value(1, output_field=FloatField())) + if "p" in ranking.query.annotations: + ranking=ranking.alias(d=F("occurrence__position")-F("p")) + ranking=ranking.alias(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum")) else: - ranking=ranking.annotate(d=F("occurrence__position")-F("p")) - ranking=ranking.annotate(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum")) - - if keycount<=MAX_RANKING_KEYWORDS_COUNT: - ranking=ranking.annotate( - rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField())) + ranking=ranking.alias(dsum=Value(1, output_field=FloatField())) + + ranking=ranking.annotate( + rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField())) - ranking=ranking.annotate(p=F("occurrence__position")) + ranking=ranking.alias(p=F("occurrence__position")) return ranking def prefetch_matches(self): - qs=self.model.occurrences.filter(Q(*self.search_conditions, _connector=Q.OR)) + conditions=[] + tokens=[] + for condition in self.search_conditions: + conditions.append(When(condition, then=Value(len(tokens)))) + tokens.append(condition.token) + qs=self.model.occurrences.annotate(token=Case(*conditions)).filter(token__isnull=False) + class Decor(qs.__class__): + def __iter__(self): + for obj in super().__iter__(): + if isinstance(obj.token, int): + obj.token=tokens[obj.token] + yield obj + qs.__class__ = Decor return self.prefetch_related(Prefetch("occurrences", queryset=qs, to_attr="matches")) diff --git a/django_native_search/models.py b/django_native_search/models.py index d5a5532..96e6d94 100644 --- a/django_native_search/models.py +++ b/django_native_search/models.py @@ -7,18 +7,16 @@ from django.template.loader import render_to_string from django.utils.functional import cached_property -from django.db.models.signals import post_save -from django.dispatch import receiver from .manager import IndexEntryManager, IndexManager from django.utils.safestring import mark_safe from django.conf import settings from django.contrib.contenttypes.models import ContentType from django_native_search.fields import OccurrencesField +from django.db.models.functions.text import Length -MIN_TAIL_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2) -MAX_TAIL_COUNT_IN_QUERY=getattr(settings, "SEARCH_MAX_SUBTSTR_COUNT_IN_QUERY", 300) +MIN_SUBSTR_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2) MAX_EXCERPT_FRAGMENTS=getattr(settings, "SEARCH_MAX_EXCERPT_FRAGMENTS", 5) EXCERPT_FRAGMENT_START_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_START_OFFSET", -3) EXCERPT_FRAGMENT_END_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_END_OFFSET", 6) @@ -35,26 +33,6 @@ class Meta: def __str__(self): return self.surface -class LexemTail(models.Model): - lexem=models.ForeignKey(Lexem, on_delete=models.CASCADE, - related_name="tails", related_query_name='tail') - surface=models.CharField(max_length=255, db_index=True) - - class Meta: - indexes=[django_expression_index.ExpressionIndex(expressions=[Lower('surface')])] - unique_together=('lexem','surface') - - def __str__(self): - return self.surface - -@receiver(post_save, sender=Lexem) -def update_lexem_tail(instance, **kwargs): - instance.tails.all().delete() - for i in range(len(instance.surface)): - tail=instance.surface[i:] - if len(tail)>MIN_TAIL_LEN: - instance.tails.create(surface=tail) - models.CharField.register_lookup(Lower) @@ -114,6 +92,8 @@ def tokenize(cls, text): if quotes%2>0: sticky=not sticky i=res.end() + if not sticky and token == text and len(token) >= MIN_SUBSTR_LEN: + token.lookup = "contains" yield token @classmethod @@ -122,19 +102,15 @@ def parse_query(cls, query): if query.islower(): lookup +="__lower" tokens=list(cls.tokenize(query)) - if len(tokens)==1 and len(tokens[0])>MIN_TAIL_LEN: - tail_q=LexemTail.objects.filter(**{ - lookup+"__gte":tokens[0], - lookup+"__lt":tokens[0]+chr(0x10FFFF)}) - if tail_q.count()<=MAX_TAIL_COUNT_IN_QUERY: - return [models.Q(lexem__in=Lexem.objects.filter( - tail__in=tail_q))] query=[] for token in tokens: - condition=models.Q(lexem__in=Lexem.objects.filter(**{lookup:token})) - if query: - condition.sticky=getattr(token,'sticky',False) + token.lookup = lookup + "__" + getattr(token,"lookup", "exact") + lqs = Lexem.objects.filter(**{token.lookup: token}) + if token.lookup.endswith("__contains"): + lqs=lqs.order_by(Length("surface"))[:20000] + condition=models.Q(lexem__in=lqs) + condition.token = token query.append(condition) return query @@ -170,7 +146,7 @@ def excerpt(self): return self.build_excerpt(words, matches) def build_excerpt(self, words, matches): - highlight=set([m.position for m in matches]) + highlight={m.position:m for m in matches} excerpt="" pos=-1 for word in words.select_related('lexem'): @@ -179,18 +155,32 @@ def build_excerpt(self, words, matches): if pos>0: excerpt+=escape(word.prefix) - if word.position in highlight: - excerpt+= self.highlight(word) + matched = highlight.get(word.position) + if matched: + matched.lexem = word.lexem + excerpt+= self.highlight(matched) else: - excerpt+=escape(word.lexem.surface) + excerpt+=self.to_html(word.lexem.surface) pos=word.position+1 if pos{surface}" + if word.token.lookup.endswith("contains"): + flags = re.IGNORECASE if "__lower__" in word.token.lookup else 0 + pattern = re.compile("(.*?)(("+re.escape(word.token)+")|$)", flags) + return "".join([self.to_html(part[0]) + self.to_html(part[1], True) + for part in pattern.findall(word.lexem.surface)]) + return self.to_html(word.lexem.surface, True) + + def to_html(self, surface, highlight=False): + if not surface: + return "" + surface=escape(surface) + if highlight: + surface = f"{surface}" + return surface @cached_property def rendered_text(self): diff --git a/setup.py b/setup.py index e919b12..63e4c77 100644 --- a/setup.py +++ b/setup.py @@ -6,13 +6,13 @@ setuptools.setup( author='Kamil Mierzejewski', name='django-native-search', - version='0.5.4', + version='0.6', description='A simple search engine using native django database backend.', long_description=long_description, long_description_content_type='text/markdown', url='https://github.com/kmierzeje/django-native-search', packages=setuptools.find_packages(), - install_requires=['django>=3.0.8', 'django-expression-index>=0.1.0'], + install_requires=['django>=3.2', 'django-expression-index>=0.1.0'], classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', @@ -21,6 +21,6 @@ 'License :: OSI Approved :: BSD License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.9', ], )