Skip to content

Commit

Permalink
Merge pull request #631 from NASA-IMPACT/629-add-fileext-to-api-respo…
Browse files Browse the repository at this point in the history
…nse-for-candidate-urls

629 add fileext to api response for candidate urls
  • Loading branch information
code-geek authored Feb 22, 2024
2 parents 6757c53 + 4e41e9d commit 2821691
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
22 changes: 22 additions & 0 deletions sde_collections/models/candidate_url.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import os
from urllib.parse import urlparse

from django.db import models
Expand Down Expand Up @@ -91,6 +92,27 @@ class Meta:
verbose_name_plural = "Candidate URLs"
ordering = ["url"]

@property
def fileext(self) -> str:
    """Return the file extension implied by this candidate URL's path.

    Only the path component of ``self.url`` is inspected, so a query string
    or fragment never ends up in the result. The extension is returned
    without its leading dot and with its original casing.

    Returns:
        The extension (for example ``"jpg"``), or ``"html"`` when the path
        is empty, ends with a slash, has no extension, is a dot-file such
        as ``.htaccess``, or ends with a bare trailing dot.
    """
    path = urlparse(self.url).path

    # An empty path or a trailing slash points at a directory/default
    # document, which is reported as HTML.
    if not path or path.endswith("/"):
        return "html"

    # splitext yields either "" or a string that starts with ".", and it
    # treats a leading dot in the last path component as part of the name,
    # so dot-files come back with no extension.
    extension = os.path.splitext(path)[1]

    # Drop the leading dot. A path that ends in a bare "." leaves nothing
    # behind, so it falls back to "html" instead of returning "".
    return extension[1:] or "html"

def splits(self) -> list[tuple[str, str]]:
"""Split the path into multiple collections."""
parts = []
Expand Down
5 changes: 5 additions & 0 deletions sde_collections/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class Meta:
class CandidateURLAPISerializer(serializers.ModelSerializer):
document_type = serializers.SerializerMethodField()
title = serializers.SerializerMethodField()
file_extension = serializers.SerializerMethodField()

class Meta:
model = CandidateURL
Expand All @@ -99,6 +100,7 @@ class Meta:
"title",
"document_type",
"hash",
"file_extension",
)

def get_document_type(self, obj):
Expand All @@ -112,6 +114,9 @@ def get_document_type(self, obj):
def get_title(self, obj):
    """Return the object's generated title, falling back to its scraped title.

    The scraped title is used whenever ``generated_title`` is falsy
    (for example ``None`` or an empty string).
    """
    return obj.generated_title or obj.scraped_title

def get_file_extension(self, obj):
    """Return the value of the object's ``fileext`` attribute unchanged."""
    extension = obj.fileext
    return extension


class BasePatternSerializer(serializers.ModelSerializer):
match_pattern_type_display = serializers.CharField(source="get_match_pattern_type_display", read_only=True)
Expand Down
40 changes: 40 additions & 0 deletions sde_collections/tests/test_fileext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from django.test import TestCase

from ..models.candidate_url import CandidateURL
from ..models.collection import Collection


class CandidateURLsTestCase(TestCase):
    """Check ``CandidateURL.fileext`` against a table of representative URLs."""

    def setUp(self):
        """Create one collection and one CandidateURL per URL in the table."""
        collection = Collection.objects.create(
            config_folder="test_folder", name="Test Collection", division=1
        )
        # Extra row that none of the assertions below look at; kept so the
        # database contents match the original fixture.
        CandidateURL.objects.create(
            url="https://example.com/something.jpg", collection=collection
        )

        # Maps each URL to the extension ``fileext`` is expected to report.
        # Every key appears once: a repeated key in a dict literal silently
        # overwrites the earlier entry.
        self.urls = {
            "https://example.com/path/to/file.jpg": "jpg",  # Standard file extension
            "https://example.com/path/to/file": "html",  # No extension
            "https://example.com/": "html",  # Root directory
            "https://example.com/path/to/": "html",  # Directory
            "https://example.com/path/to/file.jpg?query=123": "jpg",  # URL with query parameters
            "https://example.com/path/to/file.jpeg#anchor": "jpeg",  # URL with anchor
            "https://example.com/path/to/.hiddenfile": "html",  # Hidden file (starts with dot)
            "https://example.com/path/to/.htaccess": "html",  # .htaccess file
        }

        self.candidate_urls = [
            CandidateURL.objects.create(url=url, collection=collection)
            for url in self.urls
        ]

    def test_url_content(self):
        """Each stored URL reports the extension listed for it in ``self.urls``."""
        for candidate_url in self.candidate_urls:
            # subTest names the failing URL and lets the remaining cases run.
            with self.subTest(url=candidate_url.url):
                expected_extension = self.urls[candidate_url.url]
                self.assertEqual(expected_extension, candidate_url.fileext)

0 comments on commit 2821691

Please sign in to comment.