Fixed #32492 -- Added TrigramWordSimilarity() and TrigramWordDistance() on PostgreSQL.

This commit is contained in:
Nikita Marchant 2021-09-15 12:57:49 +02:00 committed by Mariusz Felisiak
parent 4ca508a689
commit 4e4082f939
10 changed files with 148 additions and 9 deletions

View File

@ -710,6 +710,7 @@ answer newbie questions, and generally made Django that much better:
Nicola Larosa <nico@teknico.net> Nicola Larosa <nico@teknico.net>
Nicolas Lara <nicolaslara@gmail.com> Nicolas Lara <nicolaslara@gmail.com>
Nicolas Noé <nicolas@niconoe.eu> Nicolas Noé <nicolas@niconoe.eu>
Nikita Marchant <nikita.marchant@gmail.com>
Niran Babalola <niran@niran.org> Niran Babalola <niran@niran.org>
Nis Jørgensen <nis@superlativ.dk> Nis Jørgensen <nis@superlativ.dk>
Nowell Strite <https://nowell.strite.org/> Nowell Strite <https://nowell.strite.org/>

View File

@ -13,7 +13,7 @@ from django.test.signals import setting_changed
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from .indexes import OpClass from .indexes import OpClass
from .lookups import SearchLookup, TrigramSimilar, Unaccent from .lookups import SearchLookup, TrigramSimilar, TrigramWordSimilar, Unaccent
from .serializers import RangeSerializer from .serializers import RangeSerializer
from .signals import register_type_handlers from .signals import register_type_handlers
@ -33,6 +33,8 @@ def uninstall_if_needed(setting, value, enter, **kwargs):
TextField._unregister_lookup(SearchLookup) TextField._unregister_lookup(SearchLookup)
CharField._unregister_lookup(TrigramSimilar) CharField._unregister_lookup(TrigramSimilar)
TextField._unregister_lookup(TrigramSimilar) TextField._unregister_lookup(TrigramSimilar)
CharField._unregister_lookup(TrigramWordSimilar)
TextField._unregister_lookup(TrigramWordSimilar)
# Disconnect this receiver until the next time this app is installed # Disconnect this receiver until the next time this app is installed
# and ready() connects it again to prevent unnecessary processing on # and ready() connects it again to prevent unnecessary processing on
# each setting change. # each setting change.
@ -65,5 +67,7 @@ class PostgresConfig(AppConfig):
TextField.register_lookup(SearchLookup) TextField.register_lookup(SearchLookup)
CharField.register_lookup(TrigramSimilar) CharField.register_lookup(TrigramSimilar)
TextField.register_lookup(TrigramSimilar) TextField.register_lookup(TrigramSimilar)
CharField.register_lookup(TrigramWordSimilar)
TextField.register_lookup(TrigramWordSimilar)
MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer) MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer)
IndexExpression.register_wrappers(OrderBy, OpClass, Collate) IndexExpression.register_wrappers(OrderBy, OpClass, Collate)

View File

@ -58,3 +58,8 @@ class SearchLookup(SearchVectorExact):
class TrigramSimilar(PostgresOperatorLookup): class TrigramSimilar(PostgresOperatorLookup):
lookup_name = 'trigram_similar' lookup_name = 'trigram_similar'
postgres_operator = '%%' postgres_operator = '%%'
class TrigramWordSimilar(PostgresOperatorLookup):
    """Lookup exposed as ``trigram_word_similar`` for trigram word matching."""

    # The doubled percent sign mirrors TrigramSimilar's '%%'; presumably it
    # escapes a literal '%' so the emitted operator is ``%>`` -- confirm
    # against PostgresOperatorLookup.
    postgres_operator = '%%>'
    lookup_name = 'trigram_word_similar'

View File

@ -293,6 +293,15 @@ class TrigramBase(Func):
super().__init__(expression, string, **extra) super().__init__(expression, string, **extra)
class TrigramWordBase(Func):
    """
    Common base for the trigram *word* expressions.

    Takes the search string first and the expression second. A plain
    Python value passed as ``string`` is wrapped in ``Value`` before being
    handed to ``Func``; anything that already exposes
    ``resolve_expression`` is passed through untouched.
    """
    output_field = FloatField()

    def __init__(self, string, expression, **extra):
        is_expression = hasattr(string, 'resolve_expression')
        needle = string if is_expression else Value(string)
        super().__init__(needle, expression, **extra)
class TrigramSimilarity(TrigramBase): class TrigramSimilarity(TrigramBase):
function = 'SIMILARITY' function = 'SIMILARITY'
@ -300,3 +309,12 @@ class TrigramSimilarity(TrigramBase):
class TrigramDistance(TrigramBase): class TrigramDistance(TrigramBase):
function = '' function = ''
arg_joiner = ' <-> ' arg_joiner = ' <-> '
class TrigramWordDistance(TrigramWordBase):
    """Trigram word distance between a string and an expression."""

    # No function name: the two arguments are joined by the infix
    # ``<<->`` operator instead of being wrapped in a function call.
    arg_joiner = ' <<-> '
    function = ''
class TrigramWordSimilarity(TrigramWordBase):
    """Trigram word similarity computed with the WORD_SIMILARITY function."""

    function = 'WORD_SIMILARITY'

View File

@ -14,9 +14,8 @@ returns results that have a similarity measurement greater than the current
similarity threshold. similarity threshold.
To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS` To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS`
and activate the `pg_trgm extension and activate the `pg_trgm extension`_ on PostgreSQL. You can install the
<https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can extension using the
install the extension using the
:class:`~django.contrib.postgres.operations.TrigramExtension` migration :class:`~django.contrib.postgres.operations.TrigramExtension` migration
operation. operation.
@ -26,6 +25,31 @@ The ``trigram_similar`` lookup can be used on
>>> City.objects.filter(name__trigram_similar="Middlesborough") >>> City.objects.filter(name__trigram_similar="Middlesborough")
['<City: Middlesbrough>'] ['<City: Middlesbrough>']
.. fieldlookup:: trigram_word_similar
.. versionadded:: 4.0
The ``trigram_word_similar`` lookup allows you to perform trigram word
similarity lookups using a dedicated PostgreSQL extension. It can be
approximately understood as measuring the greatest number of trigrams shared
between the parameter and any substring of the field. A trigram word lookup is
given an expression and returns results that have a word similarity measurement
greater than the current word similarity threshold
(``pg_trgm.word_similarity_threshold``).
To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS`
and activate the `pg_trgm extension`_ on PostgreSQL. You can install the
extension using the
:class:`~django.contrib.postgres.operations.TrigramExtension` migration
operation.
The ``trigram_word_similar`` lookup can be used on
:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`::
>>> Sentence.objects.filter(name__trigram_word_similar='Middlesborough')
['<Sentence: Gumby rides on the path of Middlesbrough>']
.. _`pg_trgm extension`: https://www.postgresql.org/docs/current/pgtrgm.html
``Unaccent`` ``Unaccent``
============ ============

View File

@ -280,8 +280,9 @@ Trigram similarity
================== ==================
Another approach to searching is trigram similarity. A trigram is a group of Another approach to searching is trigram similarity. A trigram is a group of
three consecutive characters. In addition to the :lookup:`trigram_similar` three consecutive characters. In addition to the :lookup:`trigram_similar` and
lookup, you can use a couple of other expressions. :lookup:`trigram_word_similar` lookups, you can use a couple of other
expressions.
To use them, you need to activate the `pg_trgm extension To use them, you need to activate the `pg_trgm extension
<https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can <https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can
@ -308,6 +309,27 @@ Usage example::
... ).filter(similarity__gt=0.3).order_by('-similarity') ... ).filter(similarity__gt=0.3).order_by('-similarity')
[<Author: Katy Stevens>, <Author: Stephen Keats>] [<Author: Katy Stevens>, <Author: Stephen Keats>]
``TrigramWordSimilarity``
-------------------------
.. versionadded:: 4.0
.. class:: TrigramWordSimilarity(string, expression, **extra)
Accepts a string or expression, and a field name or expression. Returns the
trigram word similarity between the two arguments.
Usage example::
>>> from django.contrib.postgres.search import TrigramWordSimilarity
>>> Author.objects.create(name='Katy Stevens')
>>> Author.objects.create(name='Stephen Keats')
>>> test = 'Kat'
>>> Author.objects.annotate(
... similarity=TrigramWordSimilarity(test, 'name'),
... ).filter(similarity__gt=0.3).order_by('-similarity')
[<Author: Katy Stevens>]
``TrigramDistance`` ``TrigramDistance``
------------------- -------------------
@ -326,3 +348,24 @@ Usage example::
... distance=TrigramDistance('name', test), ... distance=TrigramDistance('name', test),
... ).filter(distance__lte=0.7).order_by('distance') ... ).filter(distance__lte=0.7).order_by('distance')
[<Author: Katy Stevens>, <Author: Stephen Keats>] [<Author: Katy Stevens>, <Author: Stephen Keats>]
``TrigramWordDistance``
-----------------------
.. versionadded:: 4.0
.. class:: TrigramWordDistance(string, expression, **extra)
Accepts a string or expression, and a field name or expression. Returns the
trigram word distance between the two arguments.
Usage example::
>>> from django.contrib.postgres.search import TrigramWordDistance
>>> Author.objects.create(name='Katy Stevens')
>>> Author.objects.create(name='Stephen Keats')
>>> test = 'Kat'
>>> Author.objects.annotate(
... distance=TrigramWordDistance(test, 'name'),
... ).filter(distance__lte=0.7).order_by('distance')
[<Author: Katy Stevens>]

View File

@ -200,6 +200,13 @@ Minor features
expression allows using subqueries to construct lists of values on expression allows using subqueries to construct lists of values on
PostgreSQL. PostgreSQL.
* The new :lookup:`trigram_word_similar` lookup, and the
:class:`TrigramWordDistance()
<django.contrib.postgres.search.TrigramWordDistance>` and
:class:`TrigramWordSimilarity()
<django.contrib.postgres.search.TrigramWordSimilarity>` expressions allow
using trigram word similarity.
:mod:`django.contrib.redirects` :mod:`django.contrib.redirects`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -110,7 +110,7 @@ class Migration(migrations.Migration):
name='CharFieldModel', name='CharFieldModel',
fields=[ fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('field', models.CharField(max_length=16)), ('field', models.CharField(max_length=64)),
], ],
options=None, options=None,
bases=None, bases=None,

View File

@ -83,7 +83,7 @@ class ArrayEnumModel(PostgreSQLModel):
class CharFieldModel(models.Model): class CharFieldModel(models.Model):
field = models.CharField(max_length=16) field = models.CharField(max_length=64)
class TextFieldModel(models.Model): class TextFieldModel(models.Model):

View File

@ -5,7 +5,8 @@ from .models import CharFieldModel, TextFieldModel
try: try:
from django.contrib.postgres.search import ( from django.contrib.postgres.search import (
TrigramDistance, TrigramSimilarity, TrigramDistance, TrigramSimilarity, TrigramWordDistance,
TrigramWordSimilarity,
) )
except ImportError: except ImportError:
pass pass
@ -30,6 +31,15 @@ class TrigramTest(PostgreSQLTestCase):
transform=lambda instance: instance.field, transform=lambda instance: instance.field,
) )
def test_trigram_word_search(self):
    """A misspelled word still matches a row containing a close word."""
    sentence = self.Model.objects.create(
        field='Gumby rides on the path of Middlesbrough',
    )
    matches = self.Model.objects.filter(
        field__trigram_word_similar='Middlesborough',
    )
    self.assertSequenceEqual(matches, [sentence])
def test_trigram_similarity(self): def test_trigram_similarity(self):
search = 'Bat sat on cat.' search = 'Bat sat on cat.'
# Round result of similarity because PostgreSQL 12+ uses greater # Round result of similarity because PostgreSQL 12+ uses greater
@ -43,6 +53,20 @@ class TrigramTest(PostgreSQLTestCase):
ordered=True, ordered=True,
) )
def test_trigram_word_similarity(self):
    """
    Rows passing the word-similar lookup are annotated with their word
    similarity and returned best match first.
    """
    # NOTE(review): the expected rows presumably come from fixtures created
    # in the test case's setup, which is not visible here -- confirm.
    needle = 'mat'
    results = (
        self.Model.objects
        .filter(field__trigram_word_similar=needle)
        .annotate(word_similarity=TrigramWordSimilarity(needle, 'field'))
        .values('field', 'word_similarity')
        .order_by('-word_similarity')
    )
    expected = [
        {'field': 'Cat sat on mat.', 'word_similarity': 1.0},
        {'field': 'Matthew', 'word_similarity': 0.75},
    ]
    self.assertSequenceEqual(results, expected)
def test_trigram_similarity_alternate(self): def test_trigram_similarity_alternate(self):
# Round result of distance because PostgreSQL 12+ uses greater # Round result of distance because PostgreSQL 12+ uses greater
# precision. # precision.
@ -55,6 +79,19 @@ class TrigramTest(PostgreSQLTestCase):
ordered=True, ordered=True,
) )
def test_trigram_word_similarity_alternate(self):
    """
    Rows are annotated with their word distance, filtered to at most 0.7,
    and returned closest first.
    """
    # NOTE(review): the expected rows presumably come from fixtures created
    # in the test case's setup, which is not visible here -- confirm.
    results = (
        self.Model.objects
        .annotate(word_distance=TrigramWordDistance('mat', 'field'))
        .filter(word_distance__lte=0.7)
        .values('field', 'word_distance')
        .order_by('word_distance')
    )
    expected = [
        {'field': 'Cat sat on mat.', 'word_distance': 0},
        {'field': 'Matthew', 'word_distance': 0.25},
    ]
    self.assertSequenceEqual(results, expected)
class TrigramTextFieldTest(TrigramTest): class TrigramTextFieldTest(TrigramTest):
""" """