From cbaebbc15230af7ff4d6fc2104929650e49cbe94 Mon Sep 17 00:00:00 2001
From: Ashish Acharya <ashish.acharya14@gmail.com>
Date: Thu, 22 Feb 2024 15:29:17 -0600
Subject: [PATCH 1/2] Add function to extract fileext from URL

---
 sde_collections/models/candidate_url.py | 22 ++++++++++++++
 sde_collections/tests/test_fileext.py   | 40 +++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 sde_collections/tests/test_fileext.py

diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py
index 7812a8f1..acef4114 100644
--- a/sde_collections/models/candidate_url.py
+++ b/sde_collections/models/candidate_url.py
@@ -1,4 +1,5 @@
 import hashlib
+import os
 from urllib.parse import urlparse
 
 from django.db import models
@@ -91,6 +92,27 @@ class Meta:
         verbose_name_plural = "Candidate URLs"
         ordering = ["url"]
 
+    @property
+    def fileext(self) -> str:
+        """Return the file extension of this URL's path, without the leading dot.
+
+        Only the path component is inspected, so query strings and fragments
+        are ignored. Returns "html" when the path is empty, ends with a slash,
+        has no extension, or ends with a bare dot.
+        """
+        path = urlparse(self.url).path
+
+        # An empty path or a trailing slash implies a directory or default document.
+        if not path or path.endswith("/"):
+            return "html"
+
+        # splitext yields "" or a suffix starting with "."; dotfiles such as
+        # ".htaccess" yield "", and a trailing dot ("file.") yields just ".".
+        extension = os.path.splitext(path)[1][1:]
+
+        # An empty result means there is no usable extension, so fall back to "html".
+        return extension or "html"
+
     def splits(self) -> list[tuple[str, str]]:
         """Split the path into multiple collections."""
         parts = []
diff --git a/sde_collections/tests/test_fileext.py b/sde_collections/tests/test_fileext.py
new file mode 100644
index 00000000..35c632b5
--- /dev/null
+++ b/sde_collections/tests/test_fileext.py
@@ -0,0 +1,40 @@
+from django.test import TestCase
+
+from ..models.candidate_url import CandidateURL
+from ..models.collection import Collection
+
+
+class CandidateURLsTestCase(TestCase):
+    """Check that CandidateURL.fileext derives the expected extension from each URL."""
+
+    def setUp(self):
+        collection = Collection.objects.create(
+            config_folder="test_folder", name="Test Collection", division=1
+        )
+        CandidateURL.objects.create(
+            url="https://example.com/something.jpg", collection=collection
+        )
+
+        # Maps each URL under test to the extension fileext should report for it.
+        self.urls = {
+            "https://example.com/path/to/file.jpg": "jpg",  # Standard file extension
+            "https://example.com/path/to/file": "html",  # No extension
+            "https://example.com/": "html",  # Root directory
+            "https://example.com/path/to/": "html",  # Directory
+            "https://example.com/path/to/file.jpg?query=123": "jpg",  # URL with query parameters
+            "https://example.com/path/to/file.jpeg#anchor": "jpeg",  # URL with anchor
+            "https://example.com/path/to/archive.tar.gz": "gz",  # Only the last suffix is reported
+            "https://example.com/path/to/.hiddenfile": "html",  # Hidden file (starts with dot)
+            "https://example.com/path/to/.htaccess": "html",  # .htaccess file
+        }
+
+        self.candidate_urls = [
+            CandidateURL.objects.create(url=url, collection=collection)
+            for url in self.urls
+        ]
+
+    def test_url_content(self):
+        for candidate_url in self.candidate_urls:
+            # subTest reports every failing URL instead of stopping at the first one.
+            with self.subTest(url=candidate_url.url):
+                self.assertEqual(self.urls[candidate_url.url], candidate_url.fileext)

From 4e41e9d6b25bed22496b655d7e620de50b8dce98 Mon Sep 17 00:00:00 2001
From: Bishwas Praveen <bp0052@uah.edu>
Date: Thu, 22 Feb 2024 15:50:18 -0600
Subject: [PATCH 2/2] added file extension field to the serializer

---
 sde_collections/serializers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py
index 409b0878..5051bfd1 100644
--- a/sde_collections/serializers.py
+++ b/sde_collections/serializers.py
@@ -91,6 +91,7 @@ class Meta:
 class CandidateURLAPISerializer(serializers.ModelSerializer):
     document_type = serializers.SerializerMethodField()
     title = serializers.SerializerMethodField()
+    file_extension = serializers.SerializerMethodField()
 
     class Meta:
         model = CandidateURL
@@ -99,6 +100,7 @@ class Meta:
             "title",
             "document_type",
             "hash",
+            "file_extension",
         )
 
     def get_document_type(self, obj):
@@ -112,6 +114,9 @@ def get_document_type(self, obj):
     def get_title(self, obj):
         return obj.generated_title if obj.generated_title else obj.scraped_title
 
+    def get_file_extension(self, obj: CandidateURL) -> str:
+        return obj.fileext  # extension derived from the URL path by CandidateURL.fileext
+
 
 class BasePatternSerializer(serializers.ModelSerializer):
     match_pattern_type_display = serializers.CharField(source="get_match_pattern_type_display", read_only=True)