Skip to content

Commit

Permalink
Search for substrings by using the `__contains` lookup on lexems.
Browse files: browse the repository at this point in the history
  • Loading branch information
kmierzej committed Dec 26, 2021
1 parent 39d4191 commit 6165e5a
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 70 deletions.
63 changes: 36 additions & 27 deletions django_native_search/manager.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
import copy
from django.core.exceptions import FieldDoesNotExist
from django.db.models import (F, Value, Min, OuterRef, Count, FloatField, QuerySet, Q, Prefetch,
ExpressionWrapper)
from django.db.models import (F, Value, Min, Count, FloatField, QuerySet, Q, Prefetch,
OuterRef, ExpressionWrapper)
from django.db.models.manager import BaseManager, Manager
from django.db.models.functions import Abs
from django.conf import settings
import logging
from functools import cache
from django.db.models.expressions import When, Case



MAX_RANKING_KEYWORDS_COUNT=getattr(settings,"SEARCH_MAX_RANKING_KEYWORDS_COUNT", 3)

logger=logging.getLogger(__name__)


Expand All @@ -37,49 +34,61 @@ def apply_filter(self, q):
filtered.search_conditions.append(q)
return filtered

def search_one(self, condition):
return self.apply_filter(condition).distinct().annotate_rank().order_by("rank")

def search(self, query):
ranking=self
filtered=self
conditions=self.model.parse_query(query)
sticked=False
if len(conditions) == 1:
return self.search_one(conditions[0])
for q in conditions:
ranking=ranking.apply_filter(q).annotate_rank()
if getattr(q,'sticky', None):
if getattr(q.token,'sticky', None):
ranking=ranking.filter(d=1)
sticked=True
filtered=self.apply_filter(q).filter(pk__in=filtered.all())
filtered=filtered.filter(pk__in=ranking.values("pk"))

if filtered is not self:
filtered = self.filter(pk__in=filtered.all())
filtered = filtered.apply_filter(q)

if filtered is self:
return self

if sticked:
filtered=filtered.filter(pk__in=ranking)


results = self.filter(pk__in=filtered)
results = results.annotate(rank=ranking.filter(pk=OuterRef("pk")).values("rank")).order_by("rank")
results.search_conditions=conditions
return results

def annotate_rank(self):
ranking=self

keycount = len(self.search_conditions)

if keycount==1:
ranking=ranking.annotate(dsum=Value(1, output_field=FloatField()))
if "p" in ranking.query.annotations:
ranking=ranking.alias(d=F("occurrence__position")-F("p"))
ranking=ranking.alias(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum"))
else:
ranking=ranking.annotate(d=F("occurrence__position")-F("p"))
ranking=ranking.annotate(dsum=Abs(F('d')-1.0, output_field=FloatField())+F("dsum"))

if keycount<=MAX_RANKING_KEYWORDS_COUNT:
ranking=ranking.annotate(
rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField()))
ranking=ranking.alias(dsum=Value(1, output_field=FloatField()))

ranking=ranking.annotate(
rank=ExpressionWrapper(Min("dsum")*F("length")/Count("*"), output_field=FloatField()))

ranking=ranking.annotate(p=F("occurrence__position"))
ranking=ranking.alias(p=F("occurrence__position"))
return ranking

def prefetch_matches(self):
qs=self.model.occurrences.filter(Q(*self.search_conditions, _connector=Q.OR))
conditions=[]
tokens=[]
for condition in self.search_conditions:
conditions.append(When(condition, then=Value(len(tokens))))
tokens.append(condition.token)
qs=self.model.occurrences.annotate(token=Case(*conditions)).filter(token__isnull=False)
class Decor(qs.__class__):
def __iter__(self):
for obj in super().__iter__():
if isinstance(obj.token, int):
obj.token=tokens[obj.token]
yield obj
qs.__class__ = Decor
return self.prefetch_related(Prefetch("occurrences",
queryset=qs,
to_attr="matches"))
Expand Down
70 changes: 30 additions & 40 deletions django_native_search/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,16 @@

from django.template.loader import render_to_string
from django.utils.functional import cached_property
from django.db.models.signals import post_save
from django.dispatch import receiver

from .manager import IndexEntryManager, IndexManager
from django.utils.safestring import mark_safe
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django_native_search.fields import OccurrencesField
from django.db.models.functions.text import Length


MIN_TAIL_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2)
MAX_TAIL_COUNT_IN_QUERY=getattr(settings, "SEARCH_MAX_SUBTSTR_COUNT_IN_QUERY", 300)
MIN_SUBSTR_LEN=getattr(settings,"SEARCH_MIN_SUBSTR_LENGTH", 2)
MAX_EXCERPT_FRAGMENTS=getattr(settings, "SEARCH_MAX_EXCERPT_FRAGMENTS", 5)
EXCERPT_FRAGMENT_START_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_START_OFFSET", -3)
EXCERPT_FRAGMENT_END_OFFSET=getattr(settings, "SEARCH_EXCERPT_FRAGMENT_END_OFFSET", 6)
Expand All @@ -35,26 +33,6 @@ class Meta:
def __str__(self):
return self.surface

class LexemTail(models.Model):
lexem=models.ForeignKey(Lexem, on_delete=models.CASCADE,
related_name="tails", related_query_name='tail')
surface=models.CharField(max_length=255, db_index=True)

class Meta:
indexes=[django_expression_index.ExpressionIndex(expressions=[Lower('surface')])]
unique_together=('lexem','surface')

def __str__(self):
return self.surface

@receiver(post_save, sender=Lexem)
def update_lexem_tail(instance, **kwargs):
instance.tails.all().delete()
for i in range(len(instance.surface)):
tail=instance.surface[i:]
if len(tail)>MIN_TAIL_LEN:
instance.tails.create(surface=tail)


models.CharField.register_lookup(Lower)

Expand Down Expand Up @@ -114,6 +92,8 @@ def tokenize(cls, text):
if quotes%2>0:
sticky=not sticky
i=res.end()
if not sticky and token == text and len(token) >= MIN_SUBSTR_LEN:
token.lookup = "contains"
yield token

@classmethod
Expand All @@ -122,19 +102,15 @@ def parse_query(cls, query):
if query.islower():
lookup +="__lower"
tokens=list(cls.tokenize(query))
if len(tokens)==1 and len(tokens[0])>MIN_TAIL_LEN:
tail_q=LexemTail.objects.filter(**{
lookup+"__gte":tokens[0],
lookup+"__lt":tokens[0]+chr(0x10FFFF)})
if tail_q.count()<=MAX_TAIL_COUNT_IN_QUERY:
return [models.Q(lexem__in=Lexem.objects.filter(
tail__in=tail_q))]

query=[]
for token in tokens:
condition=models.Q(lexem__in=Lexem.objects.filter(**{lookup:token}))
if query:
condition.sticky=getattr(token,'sticky',False)
token.lookup = lookup + "__" + getattr(token,"lookup", "exact")
lqs = Lexem.objects.filter(**{token.lookup: token})
if token.lookup.endswith("__contains"):
lqs=lqs.order_by(Length("surface"))[:20000]
condition=models.Q(lexem__in=lqs)
condition.token = token
query.append(condition)
return query

Expand Down Expand Up @@ -170,7 +146,7 @@ def excerpt(self):
return self.build_excerpt(words, matches)

def build_excerpt(self, words, matches):
highlight=set([m.position for m in matches])
highlight={m.position:m for m in matches}
excerpt=""
pos=-1
for word in words.select_related('lexem'):
Expand All @@ -179,18 +155,32 @@ def build_excerpt(self, words, matches):
if pos>0:
excerpt+=escape(word.prefix)

if word.position in highlight:
excerpt+= self.highlight(word)
matched = highlight.get(word.position)
if matched:
matched.lexem = word.lexem
excerpt+= self.highlight(matched)
else:
excerpt+=escape(word.lexem.surface)
excerpt+=self.to_html(word.lexem.surface)
pos=word.position+1
if pos<self.length:
excerpt+="..."
return mark_safe(excerpt)

def highlight(self, word):
surface=escape(word.lexem.surface)
return f"<em>{surface}</em>"
if word.token.lookup.endswith("contains"):
flags = re.IGNORECASE if "__lower__" in word.token.lookup else 0
pattern = re.compile("(.*?)(("+re.escape(word.token)+")|$)", flags)
return "".join([self.to_html(part[0]) + self.to_html(part[1], True)
for part in pattern.findall(word.lexem.surface)])
return self.to_html(word.lexem.surface, True)

def to_html(self, surface, highlight=False):
if not surface:
return ""
surface=escape(surface)
if highlight:
surface = f"<em>{surface}</em>"
return surface

@cached_property
def rendered_text(self):
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
setuptools.setup(
author='Kamil Mierzejewski',
name='django-native-search',
version='0.5.4',
version='0.6',
description='A simple search engine using native django database backend.',
long_description=long_description,
long_description_content_type='text/markdown',
url='https://github.com/kmierzeje/django-native-search',
packages=setuptools.find_packages(),
install_requires=['django>=3.0.8', 'django-expression-index>=0.1.0'],
install_requires=['django>=3.2', 'django-expression-index>=0.1.0'],
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Web Environment',
Expand All @@ -21,6 +21,6 @@
'License :: OSI Approved :: BSD License',
'Natural Language :: English',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.9',
],
)

0 comments on commit 6165e5a

Please sign in to comment.