diff --git a/sde_collections/migrations/0040_candidateurl_hash.py b/sde_collections/migrations/0040_candidateurl_hash.py
new file mode 100644
index 00000000..a055275f
--- /dev/null
+++ b/sde_collections/migrations/0040_candidateurl_hash.py
@@ -0,0 +1,20 @@
+# Generated by Django 5.0.1 on 2024-01-31 20:01
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0039_includepattern"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="candidateurl",
+            name="hash",
+            field=models.CharField(
+                blank=True, default="", max_length=32, verbose_name="Hash"
+            ),
+        ),
+    ]
diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py
index bd136d4b..a4b35cd7 100644
--- a/sde_collections/models/candidate_url.py
+++ b/sde_collections/models/candidate_url.py
@@ -1,3 +1,4 @@
+import hashlib
 from urllib.parse import urlparse
 
 from django.db import models
@@ -30,6 +31,7 @@ class CandidateURL(models.Model):
         Collection, on_delete=models.CASCADE, related_name="candidate_urls"
     )
     url = models.CharField("URL")
+    hash = models.CharField("Hash", max_length=32, blank=True, default="")
     scraped_title = models.CharField(
         "Scraped Title",
         default="",
@@ -109,3 +111,34 @@ def path(self) -> str:
 
     def __str__(self) -> str:
         return self.url
+
+    def save(self, *args, **kwargs):
+        """Refresh ``hash`` from the current field values, then save the row.
+
+        ``hash`` is set to the MD5 hex digest of ``url``, ``generated_title``
+        and ``document_type`` formatted back to back, in that order. It is
+        recomputed on every call before delegating to ``Model.save()``.
+
+        If ``update_fields`` is passed as a keyword argument and names any of
+        those three fields, ``"hash"`` is added to it so a partial save cannot
+        leave a stale digest in the database. All arguments are otherwise
+        forwarded unchanged.
+        """
+        # NOTE(review): the values are joined without a delimiter, so different
+        # field combinations can yield the same input string (and hash).
+        hash_string = f"{self.url}{self.generated_title}{self.document_type}"
+
+        # Change-detection fingerprint, not a security digest; the flag keeps
+        # MD5 available on FIPS-restricted Python builds.
+        digest = hashlib.md5(hash_string.encode(), usedforsecurity=False)
+        self.hash = digest.hexdigest()
+
+        # Django writes only the columns listed in update_fields, so the
+        # refreshed hash has to be listed alongside the fields it depends on.
+        if kwargs.get("update_fields") is not None:
+            update_fields = set(kwargs["update_fields"])
+            if update_fields & {"url", "generated_title", "document_type"}:
+                update_fields.add("hash")
+            kwargs["update_fields"] = update_fields
+
+        super().save(*args, **kwargs)