Skip to content

Commit

Permalink
Add function to extract fileext from URL
Browse files Browse the repository at this point in the history
  • Loading branch information
code-geek committed Feb 22, 2024
1 parent d3b79d5 commit cbaebbc
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
22 changes: 22 additions & 0 deletions sde_collections/models/candidate_url.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import os
from urllib.parse import urlparse

from django.db import models
Expand Down Expand Up @@ -91,6 +92,27 @@ class Meta:
verbose_name_plural = "Candidate URLs"
ordering = ["url"]

@property
def fileext(self) -> str:
    """Return the file extension implied by this object's URL path.

    Only the path component of ``self.url`` is inspected, so query strings
    and fragments never influence the result. The extension is returned
    without its leading dot and with its original letter case.

    Returns:
        The extension (e.g. ``"jpg"``), or ``"html"`` when the path is empty,
        ends with ``/``, has no extension, is a dotfile such as
        ``.htaccess``, or ends with a bare ``.``.
    """
    path = urlparse(self.url).path

    # An empty path or a trailing slash is treated as a directory/default
    # document, which is reported as "html".
    if not path or path.endswith("/"):
        return "html"

    # os.path.splitext returns either "" or a suffix that starts with exactly
    # one ".", and it reports "" for names made only of a leading dot plus
    # text (".htaccess"). Dropping the first character therefore strips the
    # dot, and also turns a bare "." suffix (path "file.") into "".
    extension = os.path.splitext(path)[1][1:]

    # Anything that produced no usable extension falls back to "html".
    return extension or "html"

def splits(self) -> list[tuple[str, str]]:
"""Split the path into multiple collections."""
parts = []
Expand Down
40 changes: 40 additions & 0 deletions sde_collections/tests/test_fileext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from django.test import TestCase

from ..models.candidate_url import CandidateURL
from ..models.collection import Collection


class CandidateURLsTestCase(TestCase):
    """Check ``CandidateURL.fileext`` against a table of representative URLs."""

    def setUp(self):
        """Create a collection and one ``CandidateURL`` per URL in the table."""
        collection = Collection.objects.create(
            config_folder="test_folder", name="Test Collection", division=1
        )
        # Extra row that is not part of the expectation table below.
        CandidateURL.objects.create(
            url="https://example.com/something.jpg", collection=collection
        )

        # Maps each URL to the extension ``fileext`` is expected to report.
        # Keys must be unique: a repeated key would silently overwrite the
        # earlier entry, so each URL appears exactly once.
        self.urls = {
            "https://example.com/path/to/file.jpg": "jpg",  # Standard file extension
            "https://example.com/path/to/file": "html",  # No extension
            "https://example.com/": "html",  # Root directory
            "https://example.com/path/to/": "html",  # Directory
            "https://example.com/path/to/file.jpg?query=123": "jpg",  # URL with query parameters
            "https://example.com/path/to/file.jpeg#anchor": "jpeg",  # URL with anchor
            "https://example.com/path/to/.hiddenfile": "html",  # Hidden file (starts with dot)
            "https://example.com/path/to/.htaccess": "html",  # .htaccess file
        }

        self.candidate_urls = [
            CandidateURL.objects.create(url=url, collection=collection)
            for url in self.urls
        ]

    def test_url_content(self):
        """Every stored URL reports the extension listed in ``self.urls``."""
        for candidate_url in self.candidate_urls:
            # subTest keeps checking the remaining URLs after a failure and
            # names the offending URL in the failure report.
            with self.subTest(url=candidate_url.url):
                expected_extension = self.urls[candidate_url.url]
                self.assertEqual(expected_extension, candidate_url.fileext)

0 comments on commit cbaebbc

Please sign in to comment.