From cbaebbc15230af7ff4d6fc2104929650e49cbe94 Mon Sep 17 00:00:00 2001
From: Ashish Acharya
Date: Thu, 22 Feb 2024 15:29:17 -0600
Subject: [PATCH 1/2] Add function to extract fileext from URL

---
 sde_collections/models/candidate_url.py | 21 +++++++++++++++
 sde_collections/tests/test_fileext.py   | 36 +++++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 sde_collections/tests/test_fileext.py

diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py
index 7812a8f1..acef4114 100644
--- a/sde_collections/models/candidate_url.py
+++ b/sde_collections/models/candidate_url.py
@@ -1,4 +1,5 @@
 import hashlib
+import posixpath
 from urllib.parse import urlparse
 
 from django.db import models
@@ -91,6 +92,26 @@ class Meta:
         verbose_name_plural = "Candidate URLs"
         ordering = ["url"]
 
+    @property
+    def fileext(self) -> str:
+        """Return the extension of the URL's path, without the leading dot.
+
+        Only the path component is inspected, so query strings and fragments are
+        ignored. Returns "html" when the last path segment has no extension:
+        directories, extensionless files, dotfiles such as ``.htaccess``, and
+        names ending in a bare dot.
+        """
+        path = urlparse(self.url).path
+
+        # URL paths always use "/" separators, so use posixpath instead of the
+        # platform-dependent os.path. splitext() treats a leading dot as part of
+        # the file name, which is why dotfiles have no extension here.
+        extension = posixpath.splitext(path)[1]
+
+        # Strip the leading dot. An empty result (no extension, or a bare
+        # trailing dot as in "report.") falls back to the default.
+        return extension[1:] or "html"
+
     def splits(self) -> list[tuple[str, str]]:
         """Split the path into multiple collections."""
         parts = []
diff --git a/sde_collections/tests/test_fileext.py b/sde_collections/tests/test_fileext.py
new file mode 100644
index 00000000..35c632b5
--- /dev/null
+++ b/sde_collections/tests/test_fileext.py
@@ -0,0 +1,36 @@
+from django.test import TestCase
+
+from ..models.candidate_url import CandidateURL
+from ..models.collection import Collection
+
+
+class CandidateURLsTestCase(TestCase):
+    def setUp(self):
+        collection = Collection.objects.create(
+            config_folder="test_folder", name="Test Collection", division=1
+        )
+
+        # Maps each URL under test to the extension that fileext should report
+        self.urls = {
+            "https://example.com/path/to/file.jpg": "jpg",  # Standard file extension
+            "https://example.com/path/to/file": "html",  # No extension
+            "https://example.com/": "html",  # Root directory
+            "https://example.com/path/to/": "html",  # Directory
+            "https://example.com/path/to/file.jpg?query=123": "jpg",  # URL with query parameters
+            "https://example.com/path/to/file.jpeg#anchor": "jpeg",  # URL with anchor
+            "https://example.com/path/to/file.": "html",  # Trailing dot, empty extension
+            "https://example.com/path/to/.hiddenfile": "html",  # Hidden file (starts with dot)
+            "https://example.com/path/to/.htaccess": "html",  # .htaccess file
+        }
+
+        self.candidate_urls = [
+            CandidateURL.objects.create(url=url, collection=collection)
+            for url in self.urls
+        ]
+
+    def test_url_content(self):
+        for candidate_url in self.candidate_urls:
+            expected_extension = self.urls[candidate_url.url]
+            # subTest reports every failing URL instead of stopping at the first
+            with self.subTest(url=candidate_url.url):
+                self.assertEqual(expected_extension, candidate_url.fileext)

From 4e41e9d6b25bed22496b655d7e620de50b8dce98 Mon Sep 17 00:00:00 2001
From: Bishwas Praveen
Date: Thu, 22 Feb 2024 15:50:18 -0600
Subject: [PATCH 2/2] added file extension field to the serializer

---
 sde_collections/serializers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py
index 409b0878..5051bfd1 100644
--- a/sde_collections/serializers.py
+++ b/sde_collections/serializers.py
@@ -91,6 +91,7 @@ class Meta:
 class CandidateURLAPISerializer(serializers.ModelSerializer):
     document_type = serializers.SerializerMethodField()
     title = serializers.SerializerMethodField()
+    file_extension = serializers.CharField(source="fileext", read_only=True)
 
     class Meta:
         model = CandidateURL
@@ -99,6 +100,7 @@ class Meta:
             "title",
             "document_type",
             "hash",
+            "file_extension",
         )
 
     def get_document_type(self, obj):