From 4698d979c607f8900ca0c7faaa3f404f18311b3a Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 24 Apr 2024 14:38:10 -0500 Subject: [PATCH 001/111] adding env variable for webhook url --- config/settings/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/config/settings/base.py b/config/settings/base.py index 93842d73..616b9c92 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -338,3 +338,4 @@ SINEQUA_CONFIGS_REPO_MASTER_BRANCH = env("SINEQUA_CONFIGS_REPO_MASTER_BRANCH") SINEQUA_CONFIGS_REPO_DEV_BRANCH = env("SINEQUA_CONFIGS_REPO_DEV_BRANCH") SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH = env("SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH") +SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL") From a5b7f713f29b3bae54187b40a6bd3f7904348dfa Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 24 Apr 2024 14:38:56 -0500 Subject: [PATCH 002/111] adding logic to save function to check status transition --- sde_collections/models/collection.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index a0a1a915..4ffc6bb1 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -12,6 +12,11 @@ from config_generation.db_to_xml import XmlEditor from ..utils.github_helper import GitHubHandler +from ..utils.slack_utils import ( + STATUS_CHANGE_NOTIFICATIONS, + format_slack_message, + send_slack_message, +) from .collection_choice_fields import ( ConnectorChoices, CurationStatusChoices, @@ -432,6 +437,16 @@ def save(self, *args, **kwargs): if not self.config_folder: self.config_folder = self._compute_config_folder_name() + if not self._state.adding: + old_status = Collection.objects.get(id=self.id).workflow_status + new_status = self.workflow_status + if old_status != new_status: + transition = (old_status, new_status) + if transition in STATUS_CHANGE_NOTIFICATIONS: + details = STATUS_CHANGE_NOTIFICATIONS[transition] + message = format_slack_message(self.name, details, self.id) + send_slack_message(message) + # Call the parent class's save method super().save(*args, **kwargs) From 021a42c8a7955e92e59867a2757fbb1aea3537a9 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 24 Apr 2024 14:39:55 -0500 Subject: [PATCH 003/111] adding slack utils file for slack related code additions --- sde_collections/utils/slack_utils.py | 100 +++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 sde_collections/utils/slack_utils.py diff --git a/sde_collections/utils/slack_utils.py b/sde_collections/utils/slack_utils.py new file mode 100644 index 00000000..796d9b48 --- /dev/null +++ b/sde_collections/utils/slack_utils.py @@ -0,0 +1,100 @@ +import requests +from django.conf import settings + +from ..models.collection_choice_fields import WorkflowStatusChoices + +SLACK_ID_MAPPING = { + "Carson Davis": "@UESJLQXH6", + "Bishwas Praveen": "@U05QZUF182J", + "Xiang Li": "@U03PPLNDZA7", + "Shravan Vishwanathan": "@U056B4HMGEP", + "Advait Yogaonkar": "@U06L5SKQ5QA", + "Emily Foshee": "@UPKDARB9P", + "Ashish Acharya": "@UC97PNAF6", + "channel": "!here", +} + + +STATUS_CHANGE_NOTIFICATIONS = { + (WorkflowStatusChoices.RESEARCH_IN_PROGRESS, WorkflowStatusChoices.READY_FOR_ENGINEERING): { + "message": "Research on {name} is complete. Ready for engineering! :rocket:", + "tags": [ + SLACK_ID_MAPPING["Xiang Li"], + SLACK_ID_MAPPING["Shravan Vishwanathan"], + SLACK_ID_MAPPING["Advait Yogaonkar"], + ], + }, + (WorkflowStatusChoices.ENGINEERING_IN_PROGRESS, WorkflowStatusChoices.READY_FOR_CURATION): { + "message": "Engineering on {name} is complete. Ready for curation! :mag:", + "tags": [SLACK_ID_MAPPING["Emily Foshee"]], + }, + (WorkflowStatusChoices.CURATION_IN_PROGRESS, WorkflowStatusChoices.CURATED): { + "message": "Curation on {name} is complete. It's now curated! :checkered_flag:", + "tags": [ + SLACK_ID_MAPPING["Carson Davis"], + SLACK_ID_MAPPING["Bishwas Praveen"], + SLACK_ID_MAPPING["Ashish Acharya"], + ], + }, + (WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED): { + "message": "Alert: Secret deployment of {name} has failed! :warning:", + "tags": [ + SLACK_ID_MAPPING["Carson Davis"], + SLACK_ID_MAPPING["Bishwas Praveen"], + SLACK_ID_MAPPING["Ashish Acharya"], + ], + }, + (WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK): { + "message": "Indexing of {name} on Secret Prod completed successfully. Ready for LRM QC! :clipboard:", + "tags": [SLACK_ID_MAPPING["Shravan Vishwanathan"], SLACK_ID_MAPPING["Advait Yogaonkar"]], + }, + (WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK): { + "message": "LRM QC passed for {name}. Ready for final quality check! :white_check_mark:", + "tags": [SLACK_ID_MAPPING["Emily Foshee"]], + }, + (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.QUALITY_CHECK_FAILED): { + "message": "Quality check on {name} has failed. Changes needed! :x:", + "tags": [ + SLACK_ID_MAPPING["Xiang Li"], + SLACK_ID_MAPPING["Shravan Vishwanathan"], + SLACK_ID_MAPPING["Advait Yogaonkar"], + SLACK_ID_MAPPING["Carson Davis"], + SLACK_ID_MAPPING["Bishwas Praveen"], + SLACK_ID_MAPPING["Ashish Acharya"], + ], + }, + (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.READY_FOR_PUBLIC_PROD): { + "message": "{name} has passed all quality checks and is ready for public production! :trophy:", + "tags": [ + SLACK_ID_MAPPING["Carson Davis"], + SLACK_ID_MAPPING["Bishwas Praveen"], + SLACK_ID_MAPPING["Ashish Acharya"], + ], + }, + (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.LOW_PRIORITY_PROBLEMS_ON_PROD): { + "message": "{name} is now on Public Prod and is almost perfect, with minor issues noted. Please review! :memo:", + "tags": [SLACK_ID_MAPPING["channel"]], + }, + (WorkflowStatusChoices.READY_FOR_PUBLIC_PROD, WorkflowStatusChoices.PERFECT_ON_PROD): { + "message": "{name} is now live on Public Prod! Congrats team! :sparkles:", + "tags": [SLACK_ID_MAPPING["channel"]], + }, +} + + +def format_slack_message(name, details, collection_id): + message_template = details["message"] + tags = " ".join([f"<{user}>" for user in details["tags"]]) + link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" + linked_name = f"<{link}|{name}>" + return tags + " " + message_template.format(name=linked_name) + + +def send_slack_message(message): + webhook_url = settings.SLACK_WEBHOOK_URL + payload = {"text": message} + response = requests.post(webhook_url, json=payload) + if response.status_code != 200: + raise ValueError( + f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" + ) From 00b453d2fd65250174dc5c57473af9bd0c450838 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 1 May 2024 09:29:54 -0500 Subject: [PATCH 004/111] add webhook url to env variable list --- .envs/.local/.django | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.envs/.local/.django b/.envs/.local/.django index 026be76d..7ae4db1c 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -29,3 +29,7 @@ SINEQUA_CONFIGS_GITHUB_REPO='NASA-IMPACT/sde-backend' SINEQUA_CONFIGS_REPO_MASTER_BRANCH='master' SINEQUA_CONFIGS_REPO_DEV_BRANCH='dev' SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch' + +# Slack Webhook +# ------------------------------------------------------------------------------ +SLACK_WEBHOOK_URL='https://hooks.slack.com/services/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' From f285dbeb1d4463f95ed03977780f32bcc320b453 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Thu, 2 May 2024 13:58:28 -0500 Subject: [PATCH 005/111] adding script to create bulk sources via backend --- scripts/bulk_create_sources_on_webapp.py | 64 ++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 scripts/bulk_create_sources_on_webapp.py diff --git a/scripts/bulk_create_sources_on_webapp.py b/scripts/bulk_create_sources_on_webapp.py new file mode 100644 index 00000000..c13e2a74 --- /dev/null +++ b/scripts/bulk_create_sources_on_webapp.py @@ -0,0 +1,64 @@ +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import Divisions + +DIVISION_MAPPING = { + "Helio": Divisions.HELIOPHYSICS, + "Astro": Divisions.ASTROPHYSICS, + "PDS": Divisions.PLANETARY, + "Earth": Divisions.EARTH_SCIENCE, + "BPS": Divisions.BIOLOGY, + "General": Divisions.GENERAL, +} + +sources = [ + { + "Name": "Source name", + "Link": "Base link to the source", + "Division": "Division of the source from the spread sheet", + "Notes": "Any notes available from the spreadsheet", + }, +] + + +def get_division_id(division_name): + division_name = division_name.strip() + return DIVISION_MAPPING.get(division_name, None) + + +def create_collection(source): + name = source["Name"] + link = source["Link"] + division_text = source["Division"] + notes = source["Notes"] + + division_id = get_division_id(division_text) + if division_id is None: + print(f"No valid division found for '{division_text}'. Skipping creation for {name}.") + return False + + try: + if Collection.objects.filter(name=name).exists(): + print(f"Collection with name '{name}' already exists. Skipping.") + return False + if Collection.objects.filter(url=link).exists(): + print(f"Collection with link '{link}' already exists. Skipping.") + return False + new_collection = Collection(name=name, url=link, division=division_id, notes=notes) + new_collection.save() + print(f"Collection '{name}' created successfully.") + return True + except Exception as e: + print(f"Failed to create collection '{name}': {e}") + return False + + +def main(): + created_count = 0 + for source in sources: + if create_collection(source): + created_count += 1 + print(f"Total new collections created: {created_count}") + + +if __name__ == "__main__": + main() From 2b09c5e45b3efb14a6698bfb6bd8d75dbac0d566 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Thu, 2 May 2024 16:02:58 -0500 Subject: [PATCH 006/111] slack notifications for feedbacks --- feedback/models.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/feedback/models.py b/feedback/models.py index 1e1a228f..4b3d31ec 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -1,6 +1,8 @@ from django.db import models from django.utils import timezone +from sde_collections.utils.slack_utils import send_slack_message + class Feedback(models.Model): name = models.CharField(max_length=150) @@ -18,6 +20,28 @@ def save(self, *args, **kwargs): if not self.id: self.created_at = timezone.now() super().save(*args, **kwargs) + is_new = self._state.adding + if is_new: + message = self.format_notification_message() + try: + send_slack_message(message) + except Exception as e: + print(f"Failed to send slack message: {e}") + + def format_notification_message(self): + """ + Returns a formatted notification message containing details from this Feedback instance. + """ + notification_message = ( + f"New Feedback Received!\n" + f"Name: {self.name}\n" + f"Email: {self.email}\n" + f"Subject: {self.subject}\n" + f"Comments: {self.comments}\n" + f"Source: {self.source}\n" + f"Received on: {self.created_at.strftime('%Y-%m-%d %H:%M:%S')}" + ) + return notification_message class ContentCurationRequest(models.Model): From 47062d9795c4e9db7c41d7b485830f66be460bf8 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Thu, 2 May 2024 16:40:50 -0500 Subject: [PATCH 007/111] made some aesthetic changes to the message --- feedback/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feedback/models.py b/feedback/models.py index 4b3d31ec..bc98536e 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -19,7 +19,6 @@ class Meta: def save(self, *args, **kwargs): if not self.id: self.created_at = timezone.now() - super().save(*args, **kwargs) is_new = self._state.adding if is_new: message = self.format_notification_message() @@ -27,13 +26,14 @@ def save(self, *args, **kwargs): send_slack_message(message) except Exception as e: print(f"Failed to send slack message: {e}") + super().save(*args, **kwargs) def format_notification_message(self): """ Returns a formatted notification message containing details from this Feedback instance. """ notification_message = ( - f"New Feedback Received!\n" + f" Hey team!! Good news! We've received a new feedback! :rocket: Here are the details : \n" f"Name: {self.name}\n" f"Email: {self.email}\n" f"Subject: {self.subject}\n" From 55fce28b26f64d0948f4057a255437a1916511e8 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Thu, 2 May 2024 16:51:52 -0500 Subject: [PATCH 008/111] deleted aesthetics --- feedback/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedback/models.py b/feedback/models.py index bc98536e..0666080f 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -33,7 +33,7 @@ def format_notification_message(self): Returns a formatted notification message containing details from this Feedback instance. """ notification_message = ( - f" Hey team!! Good news! We've received a new feedback! :rocket: Here are the details : \n" + f" New Feedback Received : \n" f"Name: {self.name}\n" f"Email: {self.email}\n" f"Subject: {self.subject}\n" From 1ad598150e8d36855b70bbee13f6f02eecb24645 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 3 May 2024 08:42:45 -0500 Subject: [PATCH 009/111] add merge pending status --- .../0045_alter_collection_workflow_status.py | 39 +++++++++++++++++++ .../models/collection_choice_fields.py | 1 + 2 files changed, 40 insertions(+) create mode 100644 sde_collections/migrations/0045_alter_collection_workflow_status.py diff --git a/sde_collections/migrations/0045_alter_collection_workflow_status.py b/sde_collections/migrations/0045_alter_collection_workflow_status.py new file mode 100644 index 00000000..3580be38 --- /dev/null +++ b/sde_collections/migrations/0045_alter_collection_workflow_status.py @@ -0,0 +1,39 @@ +# Generated by Django 4.2.9 on 2024-05-03 13:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0044_alter_collection_document_type"), + ] + + operations = [ + migrations.AlterField( + model_name="collection", + name="workflow_status", + field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "Quality Check Failed"), + (13, "Ready for Public Production"), + (14, "Perfect and on Production"), + (15, "Low Priority Problems on Production"), + (16, "High Priority Problems on Production, only for old sources"), + (17, "Code Merge Pending"), + ], + default=1, + ), + ), + ] diff --git a/sde_collections/models/collection_choice_fields.py b/sde_collections/models/collection_choice_fields.py index 5d0a78e2..37ac9412 100644 --- a/sde_collections/models/collection_choice_fields.py +++ b/sde_collections/models/collection_choice_fields.py @@ -95,3 +95,4 @@ class WorkflowStatusChoices(models.IntegerChoices): PERFECT_ON_PROD = 14, "Perfect and on Production" LOW_PRIORITY_PROBLEMS_ON_PROD = 15, "Low Priority Problems on Production" HIGH_PRIORITY_PROBLEMS_ON_PROD = 16, "High Priority Problems on Production, only for old sources" + MERGE_PENDING = 17, "Code Merge Pending" From aed6a2b13f1bf54559bd8641ff9e23daea42b21c Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 9 May 2024 09:43:15 -0500 Subject: [PATCH 010/111] chage link to secret prod --- sde_collections/models/collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 527cc007..568e48ef 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -123,7 +123,7 @@ def tree_root(self) -> str: def server_url_secret_prod(self) -> str: base_url = "https://sciencediscoveryengine.nasa.gov" payload = { - "name": "query-sde-primary", + "name": "secret-prod", "scope": "All", "text": "", "advanced": { From 34ff8b08232b5187e25393797589993632ef304d Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 9 May 2024 09:50:01 -0500 Subject: [PATCH 011/111] update secret-prod url in the return --- sde_collections/models/collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 568e48ef..7ce9da1b 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -131,7 +131,7 @@ def server_url_secret_prod(self) -> str: }, } encoded_payload = urllib.parse.quote(json.dumps(payload)) - return f"{base_url}/app/nasa-sba-sde/#/search?query={encoded_payload}" + return f"{base_url}/app/secret-prod/#/search?query={encoded_payload}" @property def server_url_prod(self) -> str: From 7816aed36e4b8bc8f63a45a1daa013b0a0d51f53 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 14 May 2024 15:19:08 -0500 Subject: [PATCH 012/111] added a new model called workflowstatushistory to track status changes --- sde_collections/models/collection.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 7ce9da1b..5cc40912 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -467,7 +467,13 @@ def save(self, *args, **kwargs): details = STATUS_CHANGE_NOTIFICATIONS[transition] message = format_slack_message(self.name, details, self.id) send_slack_message(message) - + if "workflow_status" in self.tracker.changed(): + WorkflowStatusHistory.objects.create( + collection=self, + old_status=self.tracker.previous("workflow_status"), + new_status=self.workflow_status, + changed_by=self.curated_by, + ) # Call the parent class's save method super().save(*args, **kwargs) @@ -511,3 +517,18 @@ def create_configs_on_status_change(sender, instance, created, **kwargs): instance.create_indexer_config(overwrite=False) elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_PUBLIC_PROD: instance.add_to_public_query() + + +class WorkflowStatusHistory(models.Model): + collection = models.ForeignKey("Collection", on_delete=models.CASCADE, related_name="workflow_status_history") + old_status = models.IntegerField(choices=WorkflowStatusChoices.choices) + new_status = models.IntegerField(choices=WorkflowStatusChoices.choices) + changed_by = models.ForeignKey(get_user_model(), on_delete=models.SET_NULL, null=True, blank=True) + changed_at = models.DateTimeField(auto_now_add=True) + + class Meta: + verbose_name = "Workflow Status History" + verbose_name_plural = "Workflow Status Histories" + + def __str__(self): + return f"{self.collection.name} - {self.get_old_status_display()} to {self.get_new_status_display()}" From 559d95f69b18e2c642d7de20b5afc4a88361cce9 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 14 May 2024 15:19:43 -0500 Subject: [PATCH 013/111] added the migrations file for the new model --- .../migrations/0046_workflowstatushistory.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 sde_collections/migrations/0046_workflowstatushistory.py diff --git a/sde_collections/migrations/0046_workflowstatushistory.py b/sde_collections/migrations/0046_workflowstatushistory.py new file mode 100644 index 00000000..95f75a62 --- /dev/null +++ b/sde_collections/migrations/0046_workflowstatushistory.py @@ -0,0 +1,89 @@ +# Generated by Django 4.2.9 on 2024-05-14 20:00 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ("sde_collections", "0045_alter_collection_workflow_status"), + ] + + operations = [ + migrations.CreateModel( + name="WorkflowStatusHistory", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "old_status", + models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "Quality Check Failed"), + (13, "Ready for Public Production"), + (14, "Perfect and on Production"), + (15, "Low Priority Problems on Production"), + (16, "High Priority Problems on Production, only for old sources"), + (17, "Code Merge Pending"), + ] + ), + ), + ( + "new_status", + models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "Quality Check Failed"), + (13, "Ready for Public Production"), + (14, "Perfect and on Production"), + (15, "Low Priority Problems on Production"), + (16, "High Priority Problems on Production, only for old sources"), + (17, "Code Merge Pending"), + ] + ), + ), + ("changed_at", models.DateTimeField(auto_now_add=True)), + ( + "changed_by", + models.ForeignKey( + blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="workflow_status_history", + to="sde_collections.collection", + ), + ), + ], + options={ + "verbose_name": "Workflow Status History", + "verbose_name_plural": "Workflow Status Histories", + }, + ), + ] From 0bf293aefd9836970f1db2ec357c5422de56ce2b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 14 May 2024 15:20:52 -0500 Subject: [PATCH 014/111] made the model visible on the admin console --- sde_collections/admin.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index a175c3c9..b05c24f2 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -4,7 +4,7 @@ from django.http import HttpResponse from .models.candidate_url import CandidateURL -from .models.collection import Collection +from .models.collection import Collection, WorkflowStatusHistory from .models.pattern import IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api @@ -22,10 +22,7 @@ def generate_deployment_message(modeladmin, request, queryset): Collections Now Live in Prod:\n""" message_middle = "\n\n".join( - [ - f"- {collection.name} | {collection.server_url_prod}" - for collection in queryset.all() - ] + [f"- {collection.name} | {collection.server_url_prod}" for collection in queryset.all()] ) message_end = """ @@ -46,14 +43,10 @@ def download_candidate_urls_as_csv(modeladmin, request, queryset): writer = csv.writer(response) if len(queryset) > 1: - messages.add_message( - request, messages.ERROR, "You can only export one collection at a time." - ) + messages.add_message(request, messages.ERROR, "You can only export one collection at a time.") return - urls = CandidateURL.objects.filter(collection=queryset.first()).values_list( - "url", flat=True - ) + urls = CandidateURL.objects.filter(collection=queryset.first()).values_list("url", flat=True) # Write your headers here writer.writerow(["candidate_url"]) @@ -137,9 +130,7 @@ def import_candidate_urls_secret_test(modeladmin, request, queryset): @admin.action(description="Import candidate URLs from Secret Production") def import_candidate_urls_secret_production(modeladmin, request, queryset): - import_candidate_urls_from_api_caller( - modeladmin, request, queryset, "secret_production" - ) + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production") @admin.action(description="Import candidate URLs from Li's Server") @@ -149,9 +140,7 @@ def import_candidate_urls_lis_server(modeladmin, request, queryset): @admin.action(description="Import candidate URLs from LRM Dev Server") def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller( - modeladmin, request, queryset, "lrm_dev_server" - ) + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev_server") class ExportCsvMixin: @@ -287,6 +276,13 @@ class TitlePatternAdmin(admin.ModelAdmin): ) +class WorkflowStatusHistoryAdmin(admin.ModelAdmin): + list_display = ("collection", "old_status", "new_status", "changed_at") + search_fields = ["collection__name"] + list_filter = ["new_status", "old_status"] + + +admin.site.register(WorkflowStatusHistory, WorkflowStatusHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) From 45e757da120d0f4177de62ed153bd32226d5085b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 15 May 2024 15:35:36 -0500 Subject: [PATCH 015/111] remove necessary migration file --- .../migrations/0046_workflowstatushistory.py | 89 ------------------- 1 file changed, 89 deletions(-) delete mode 100644 sde_collections/migrations/0046_workflowstatushistory.py diff --git a/sde_collections/migrations/0046_workflowstatushistory.py b/sde_collections/migrations/0046_workflowstatushistory.py deleted file mode 100644 index 95f75a62..00000000 --- a/sde_collections/migrations/0046_workflowstatushistory.py +++ /dev/null @@ -1,89 +0,0 @@ -# Generated by Django 4.2.9 on 2024-05-14 20:00 - -from django.conf import settings -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ("sde_collections", "0045_alter_collection_workflow_status"), - ] - - operations = [ - migrations.CreateModel( - name="WorkflowStatusHistory", - fields=[ - ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), - ( - "old_status", - models.IntegerField( - choices=[ - (1, "Research in Progress"), - (2, "Ready for Engineering"), - (3, "Engineering in Progress"), - (4, "Ready for Curation"), - (5, "Curation in Progress"), - (6, "Curated"), - (7, "Quality Fixed"), - (8, "Secret Deployment Started"), - (9, "Secret Deployment Failed"), - (10, "Ready for LRM Quality Check"), - (11, "Ready for Quality Check"), - (12, "Quality Check Failed"), - (13, "Ready for Public Production"), - (14, "Perfect and on Production"), - (15, "Low Priority Problems on Production"), - (16, "High Priority Problems on Production, only for old sources"), - (17, "Code Merge Pending"), - ] - ), - ), - ( - "new_status", - models.IntegerField( - choices=[ - (1, "Research in Progress"), - (2, "Ready for Engineering"), - (3, "Engineering in Progress"), - (4, "Ready for Curation"), - (5, "Curation in Progress"), - (6, "Curated"), - (7, "Quality Fixed"), - (8, "Secret Deployment Started"), - (9, "Secret Deployment Failed"), - (10, "Ready for LRM Quality Check"), - (11, "Ready for Quality Check"), - (12, "Quality Check Failed"), - (13, "Ready for Public Production"), - (14, "Perfect and on Production"), - (15, "Low Priority Problems on Production"), - (16, "High Priority Problems on Production, only for old sources"), - (17, "Code Merge Pending"), - ] - ), - ), - ("changed_at", models.DateTimeField(auto_now_add=True)), - ( - "changed_by", - models.ForeignKey( - blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL - ), - ), - ( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - related_name="workflow_status_history", - to="sde_collections.collection", - ), - ), - ], - options={ - "verbose_name": "Workflow Status History", - "verbose_name_plural": "Workflow Status Histories", - }, - ), - ] From 2596ccaa5a48a6f23842a0f1dc7a7d254770c245 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 15 May 2024 15:36:05 -0500 Subject: [PATCH 016/111] remove the workflow status history model --- sde_collections/models/collection.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 5cc40912..d06016ea 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -467,13 +467,6 @@ def save(self, *args, **kwargs): details = STATUS_CHANGE_NOTIFICATIONS[transition] message = format_slack_message(self.name, details, self.id) send_slack_message(message) - if "workflow_status" in self.tracker.changed(): - WorkflowStatusHistory.objects.create( - collection=self, - old_status=self.tracker.previous("workflow_status"), - new_status=self.workflow_status, - changed_by=self.curated_by, - ) # Call the parent class's save method super().save(*args, **kwargs) @@ -517,18 +510,3 @@ def create_configs_on_status_change(sender, instance, created, **kwargs): instance.create_indexer_config(overwrite=False) elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_PUBLIC_PROD: instance.add_to_public_query() - - -class WorkflowStatusHistory(models.Model): - collection = models.ForeignKey("Collection", on_delete=models.CASCADE, related_name="workflow_status_history") - old_status = models.IntegerField(choices=WorkflowStatusChoices.choices) - new_status = models.IntegerField(choices=WorkflowStatusChoices.choices) - changed_by = models.ForeignKey(get_user_model(), on_delete=models.SET_NULL, null=True, blank=True) - changed_at = models.DateTimeField(auto_now_add=True) - - class Meta: - verbose_name = "Workflow Status History" - verbose_name_plural = "Workflow Status Histories" - - def __str__(self): - return f"{self.collection.name} - {self.get_old_status_display()} to {self.get_new_status_display()}" From f521dad382e8f37f12a6386035bcf905594e79b0 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 15 May 2024 15:36:09 -0500 Subject: [PATCH 017/111] add script for updating primary query --- scripts/add_perfect_to_prod_query.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 scripts/add_perfect_to_prod_query.py diff --git a/scripts/add_perfect_to_prod_query.py b/scripts/add_perfect_to_prod_query.py new file mode 100644 index 00000000..013be440 --- /dev/null +++ b/scripts/add_perfect_to_prod_query.py @@ -0,0 +1,11 @@ +""" +adds collections marked as ready for public prod to the public query +after running this code, you will need to merge in the webapp branch +""" + +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices + +for collection in Collection.objects.filter(workflow_status=WorkflowStatusChoices.READY_FOR_PUBLIC_PROD): + print(collection.config_folder) + collection.add_to_public_query() From 64fec0a2a0f36246190d0f318b501506daeb0115 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 15 May 2024 15:36:40 -0500 Subject: [PATCH 018/111] move script into quality folder --- scripts/{ => quality_and_indexing}/add_perfect_to_prod_query.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{ => quality_and_indexing}/add_perfect_to_prod_query.py (100%) diff --git a/scripts/add_perfect_to_prod_query.py b/scripts/quality_and_indexing/add_perfect_to_prod_query.py similarity index 100% rename from scripts/add_perfect_to_prod_query.py rename to scripts/quality_and_indexing/add_perfect_to_prod_query.py From 102c0c44fc01606899fcf9056da31198c04ab40b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 15 May 2024 15:36:43 -0500 Subject: [PATCH 019/111] remove admin console modifications --- sde_collections/admin.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index b05c24f2..0b0e60e4 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -4,7 +4,7 @@ from django.http import HttpResponse from .models.candidate_url import CandidateURL -from .models.collection import Collection, WorkflowStatusHistory +from .models.collection import Collection from .models.pattern import IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api @@ -276,13 +276,6 @@ class TitlePatternAdmin(admin.ModelAdmin): ) -class WorkflowStatusHistoryAdmin(admin.ModelAdmin): - list_display = ("collection", "old_status", "new_status", "changed_at") - search_fields = ["collection__name"] - list_filter = ["new_status", "old_status"] - - -admin.site.register(WorkflowStatusHistory, WorkflowStatusHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) From 27e74a6eea6e305fab28f105fd31409f1b705368 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 15 May 2024 15:38:24 -0500 Subject: [PATCH 020/111] add script to update statuses for qc'd sources --- .../change_statuses_on_webapp.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 scripts/quality_and_indexing/change_statuses_on_webapp.py diff --git a/scripts/quality_and_indexing/change_statuses_on_webapp.py b/scripts/quality_and_indexing/change_statuses_on_webapp.py new file mode 100644 index 00000000..cb9abb30 --- /dev/null +++ b/scripts/quality_and_indexing/change_statuses_on_webapp.py @@ -0,0 +1,66 @@ +""" +take emily's notes from slack and change the appropriate statuses in the webapp +""" + +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices + +RESEARCH_IN_PROGRESS = 1, "Research in Progress" +READY_FOR_ENGINEERING = 2, "Ready for Engineering" +ENGINEERING_IN_PROGRESS = 3, "Engineering in Progress" +READY_FOR_CURATION = 4, "Ready for Curation" +CURATION_IN_PROGRESS = 5, "Curation in Progress" +CURATED = 6, "Curated" +QUALITY_FIXED = 7, "Quality Fixed" +SECRET_DEPLOYMENT_STARTED = 8, "Secret Deployment Started" +SECRET_DEPLOYMENT_FAILED = 9, "Secret Deployment Failed" +READY_FOR_LRM_QUALITY_CHECK = 10, "Ready for LRM Quality Check" +READY_FOR_FINAL_QUALITY_CHECK = 11, "Ready for Quality Check" +QUALITY_CHECK_FAILED = 12, "Quality Check Failed" +READY_FOR_PUBLIC_PROD = 13, "Ready for Public Production" +PERFECT_ON_PROD = 14, "Perfect and on Production" +LOW_PRIORITY_PROBLEMS_ON_PROD = 15, "Low Priority Problems on Production" +HIGH_PRIORITY_PROBLEMS_ON_PROD = 16, "High Priority Problems on Production, only for old sources" +MERGE_PENDING = 17, "Code Merge Pending" + +perfect = [ + # "WIND_Spacecraft", + # "gamma_ray_data_tools_core_package", + # "land_processes_distributed_active_archive_center", + # "mdscc_deep_space_network", + # "HelioAnalytics", + # "nasa_infrared_telescope_facility_irtf", + # "gmao_fluid", + # "starchild_a_learning_center_for_young_astronomers", + # "voyager_Cosmic_Ray_Subsystem", + "ldas_land_data_assimilatin_system", + "ppi_node", +] + +low_priority = [ + "nasa_applied_sciences", + "parker_solar_probe", + "virtual_wave_observatory", + "explorer_program_acquisition", + "lisa_consortium", + "astropy", + "fermi_at_gsfc", + "microobservatory_robotic_telescope_network", +] + +for config in perfect: + print(config) + collection = Collection.objects.get(config_folder=config) + collection.workflow_status = WorkflowStatusChoices.PERFECT_ON_PROD + collection.save() + +for config in low_priority: + print(config) + collection = Collection.objects.get(config_folder=config) + collection.workflow_status = WorkflowStatusChoices.LOW_PRIORITY_PROBLEMS_ON_PROD + collection.save() + +# for config in perfect: +# collection = Collection.objects.get(config_folder=config) +# collection.workflow_status = WorkflowStatusChoices.PERFECT_ON_PROD +# collection.save() From c90e11d3b3e2556fdc9f7c0b21f7083f46a22de6 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 15 May 2024 15:40:59 -0500 Subject: [PATCH 021/111] add script to find sources to index and surface problems --- .../find_missing_folders.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 scripts/quality_and_indexing/find_missing_folders.py diff --git a/scripts/quality_and_indexing/find_missing_folders.py b/scripts/quality_and_indexing/find_missing_folders.py new file mode 100644 index 00000000..a91fdc06 --- /dev/null +++ b/scripts/quality_and_indexing/find_missing_folders.py @@ -0,0 +1,60 @@ +"""you run this in the shell on the server to find sources to index and find any that are missing plugin folders""" + +import os + +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices +from sde_collections.utils.github_helper import GitHubHandler + + +def get_sources_to_fix(): + return Collection.objects.filter(workflow_status__in=[WorkflowStatusChoices.QUALITY_FIXED]) + + +def get_sources_to_index(): + return Collection.objects.filter(workflow_status__in=[WorkflowStatusChoices.CURATED]) + + +def get_all_relevant_sources(): + return Collection.objects.filter( + workflow_status__in=[WorkflowStatusChoices.QUALITY_FIXED, WorkflowStatusChoices.CURATED] + ) + + +def get_missing_folders(collections, base_directory): + gh = GitHubHandler() + missing = [] + for source in collections: + folder_path = os.path.join(base_directory, source.config_folder, "default.xml") + if not gh.check_file_exists(folder_path): + missing.append(source) + return missing + + +def print_configs(queryset): + for source in queryset: + print(source.config_folder) + print("---" * 20) + print() + + +print("sources_to_fix") +sources_to_fix = get_sources_to_fix() +print_configs(sources_to_fix) + + +print("sources_to_index") +sources_to_index = get_sources_to_index() +print_configs(sources_to_index) + + +all_relevant_sources = get_all_relevant_sources() + +print("missing_scraper_folders") +missing_folders = get_missing_folders(all_relevant_sources, "sources/scrapers/") +print_configs(missing_folders) + + +print("missing_plugin_folders") +missing_folders = get_missing_folders(all_relevant_sources, "sources/SDE/") +print_configs(missing_folders) From cf26348228d8ce08eebfd5f4919b9f23b4734d51 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Thu, 16 May 2024 13:46:23 -0500 Subject: [PATCH 022/111] changed division mapping --- scripts/bulk_create_sources_on_webapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bulk_create_sources_on_webapp.py b/scripts/bulk_create_sources_on_webapp.py index c13e2a74..8c26de28 100644 --- a/scripts/bulk_create_sources_on_webapp.py +++ b/scripts/bulk_create_sources_on_webapp.py @@ -7,7 +7,7 @@ "PDS": Divisions.PLANETARY, "Earth": Divisions.EARTH_SCIENCE, "BPS": Divisions.BIOLOGY, - "General": Divisions.GENERAL, + "Multiple": Divisions.GENERAL, } sources = [ From 2869570c1b57bf001cee9639a2a726c4e4becc42 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 20 May 2024 14:17:19 -0500 Subject: [PATCH 023/111] add draft xpath processing code --- scripts/process_xpaths.py | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 scripts/process_xpaths.py diff --git a/scripts/process_xpaths.py b/scripts/process_xpaths.py new file mode 100644 index 00000000..9485cde4 --- /dev/null +++ b/scripts/process_xpaths.py @@ -0,0 +1,68 @@ +import requests +from lxml import etree, html + + +def is_valid_xpath(xpath): + try: + etree.XPath(xpath) + return True + except etree.XPathSyntaxError: + return False + + +def get_value_from_xpath(url, xpath): + if not is_valid_xpath(xpath): + raise ValueError(f"The xpath, {xpath}, is not valid.") + + response = requests.get(url) + + if response.ok: + tree = html.fromstring(response.content) + values = tree.xpath(xpath) + + if len(values) == 1: + text_content = values[0].text + if text_content: + return text_content + else: + raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") + elif len(values) > 1: + raise ValueError(f"More than one element found for the xpath, {xpath}") + else: + raise ValueError(f"No element found for the xpath, {xpath}") + else: + raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") + + +xpath = '//*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1]' + +candidate_urls = [ + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20021213.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20041209.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20050224.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20050804.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060223.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioDonFreeman.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioPaulaGoodman.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioThaddeusMiles.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioKimberlyPaul.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioMadelynePfeiffer.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioRachelEvans.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629.html", + "https://mars.nasa.gov/imagine/leaders/project_resources/webcasts.html", +] + + +xpath = '//*[@id="main_content_wrapper"]/h4' +candidate_urls = [ + "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=cabinet", + "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=flowbench", + "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=materials", + "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=SIprep", + "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=thinandthick", +] + +for candidate_url in candidate_urls: + value = get_value_from_xpath(candidate_url, xpath) + print(f"The value at the specified XPath is: {value}") From a3bbb957a28e4cc9544a46721fa6d955d922a4eb Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 12:12:14 -0500 Subject: [PATCH 024/111] add basic title processing script --- scripts/process_xpaths.py | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/scripts/process_xpaths.py b/scripts/process_xpaths.py index 9485cde4..ab5025b5 100644 --- a/scripts/process_xpaths.py +++ b/scripts/process_xpaths.py @@ -1,3 +1,5 @@ +import re + import requests from lxml import etree, html @@ -34,6 +36,58 @@ def get_value_from_xpath(url, xpath): raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") +def parse_string(input_string): + # Define regex patterns for each type + brace_pattern = re.compile(r"\{([^\}]+)\}") + xpath_pattern = re.compile(r"xpath:(//[^\s]+)") + + # Initialize the result list + result = [] + + # Define the current index + current_index = 0 + + while current_index < len(input_string): + # Try to match brace pattern + brace_match = brace_pattern.match(input_string, current_index) + if brace_match: + result.append(("brace", brace_match.group(1))) + current_index = brace_match.end() + continue + + # Try to match xpath pattern + xpath_match = xpath_pattern.match(input_string, current_index) + if xpath_match: + result.append(("xpath", xpath_match.group(1))) + current_index = xpath_match.end() + continue + + # Otherwise, accumulate as a normal string until the next special pattern + next_special_index = min( + ( + brace_pattern.search(input_string, current_index).start() + if brace_pattern.search(input_string, current_index) + else len(input_string) + ), + ( + xpath_pattern.search(input_string, current_index).start() + if xpath_pattern.search(input_string, current_index) + else len(input_string) + ), + ) + + result.append(("str", input_string[current_index:next_special_index])) + current_index = next_special_index + + return result + + +# Example usage +input_string = 'content: {title} xpath://*[@id="centeredcontent2"] overview' +parsed_list = parse_string(input_string) +print(parsed_list) + + xpath = '//*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1]' candidate_urls = [ From aa1900a86a3fd2baaee456f4093b4b5e23015716 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 12:40:21 -0500 Subject: [PATCH 025/111] further flesh out intermediate xpath functions --- scripts/process_xpaths.py | 48 ++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/scripts/process_xpaths.py b/scripts/process_xpaths.py index ab5025b5..71d9bbb7 100644 --- a/scripts/process_xpaths.py +++ b/scripts/process_xpaths.py @@ -1,7 +1,9 @@ +import html as html_lib import re import requests from lxml import etree, html +from unidecode import unidecode def is_valid_xpath(xpath): @@ -12,7 +14,23 @@ def is_valid_xpath(xpath): return False -def get_value_from_xpath(url, xpath): +def clean_text(text): + text_content = unidecode(text) + text_content = html_lib.unescape(text_content) + # remove tabs and newlines, replace them with a single space + text_content = re.sub(r"[\t\n\r]+", " ", text_content) + # remove multiple spaces + text_content = re.sub(r"\s+", " ", text_content) + # strip leading and trailing whitespace + text_content = text_content.strip() + return text_content + + +def resolve_brace(brace_content): + return brace_content + + +def resolve_xpath(xpath, url): if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") @@ -25,6 +43,7 @@ def get_value_from_xpath(url, xpath): if len(values) == 1: text_content = values[0].text if text_content: + text_content = clean_text(text_content) return text_content else: raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") @@ -36,15 +55,11 @@ def get_value_from_xpath(url, xpath): raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") -def parse_string(input_string): - # Define regex patterns for each type +def parse_title(input_string): brace_pattern = re.compile(r"\{([^\}]+)\}") xpath_pattern = re.compile(r"xpath:(//[^\s]+)") - # Initialize the result list result = [] - - # Define the current index current_index = 0 while current_index < len(input_string): @@ -82,10 +97,21 @@ def parse_string(input_string): return result -# Example usage -input_string = 'content: {title} xpath://*[@id="centeredcontent2"] overview' -parsed_list = parse_string(input_string) -print(parsed_list) +def resolve_title(raw_title, url): + parsed_title = parse_title(raw_title) + final_string = "" + + for element in parsed_title: + element_type, element_value = element + + if element_type == "xpath": + final_string += resolve_xpath(element_value, url) + elif element_type == "brace": + final_string += resolve_brace(element_value) + elif element_type == "str": + final_string += element_value + + return final_string xpath = '//*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1]' @@ -118,5 +144,5 @@ def parse_string(input_string): ] for candidate_url in candidate_urls: - value = get_value_from_xpath(candidate_url, xpath) + value = resolve_xpath(xpath, candidate_url) print(f"The value at the specified XPath is: {value}") From a03d7c9f1bdef9be02a5a5542e5aa425ff39638e Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:00:56 -0500 Subject: [PATCH 026/111] incorporate brace resolution --- .../utils/title_resolver.py | 74 ++++++++++++------- 1 file changed, 48 insertions(+), 26 deletions(-) rename scripts/process_xpaths.py => sde_collections/utils/title_resolver.py (68%) diff --git a/scripts/process_xpaths.py b/sde_collections/utils/title_resolver.py similarity index 68% rename from scripts/process_xpaths.py rename to sde_collections/utils/title_resolver.py index 71d9bbb7..b8877f2e 100644 --- a/scripts/process_xpaths.py +++ b/sde_collections/utils/title_resolver.py @@ -1,5 +1,8 @@ +import _ast +import ast import html as html_lib import re +from dataclasses import dataclass import requests from lxml import etree, html @@ -26,11 +29,25 @@ def clean_text(text): return text_content -def resolve_brace(brace_content): - return brace_content +def resolve_brace(pattern, context): + """Safely interpolates the variables in an f-string pattern using the provided context.""" + context = {"url": "www.google.com", "title": "Original Title"} + parsed = ast.parse(f"f'''{pattern}'''", mode="eval") + + # Walk through the AST to ensure it only contains safe expressions + for node in ast.walk(parsed): + if isinstance(node, _ast.FormattedValue): + if not isinstance(node.value, _ast.Name): + raise ValueError("Unsupported expression in f-string pattern.") + if node.value.id not in context: + raise ValueError(f"Variable {node.value.id} not allowed in f-string pattern.") + + compiled = compile(parsed, "", "eval") + return eval(compiled, {}, context) def resolve_xpath(xpath, url): + print("url is", url) if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") @@ -97,7 +114,7 @@ def parse_title(input_string): return result -def resolve_title(raw_title, url): +def resolve_title(raw_title, context): parsed_title = parse_title(raw_title) final_string = "" @@ -105,37 +122,26 @@ def resolve_title(raw_title, url): element_type, element_value = element if element_type == "xpath": - final_string += resolve_xpath(element_value, url) + final_string += resolve_xpath(element_value, context["url"]) elif element_type == "brace": - final_string += resolve_brace(element_value) + final_string += resolve_brace(element_value, context) elif element_type == "str": final_string += element_value return final_string -xpath = '//*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1]' - -candidate_urls = [ - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20021213.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20041209.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20050224.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20050804.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060223.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioDonFreeman.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioPaulaGoodman.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731_bioThaddeusMiles.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20060731.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioKimberlyPaul.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioMadelynePfeiffer.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629_bioRachelEvans.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcast_20070629.html", - "https://mars.nasa.gov/imagine/leaders/project_resources/webcasts.html", -] +@dataclass +class CandidateURL: + url: str + scraped_title: str + collection: str + title_pattern: str xpath = '//*[@id="main_content_wrapper"]/h4' -candidate_urls = [ +pattern = '{collection} Overview: xpath://*[@id="main_content_wrapper"]/h4' +urls = [ "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=cabinet", "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=flowbench", "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=materials", @@ -143,6 +149,22 @@ def resolve_title(raw_title, url): "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=thinandthick", ] +candidate_urls = [ + CandidateURL(url=url, scraped_title="Scraped Title", collection="Collection Name", title_pattern=pattern) + for url in urls +] + + for candidate_url in candidate_urls: - value = resolve_xpath(xpath, candidate_url) - print(f"The value at the specified XPath is: {value}") + context = { + "url": candidate_url.url, + "title": candidate_url.scraped_title, + "collection": candidate_url.collection, + } + + title = resolve_title(candidate_url.title_pattern, context) + print(title) + print() + # value = resolve_xpath(xpath, candidate_url) + + # print(f"The value at the specified XPath is: {value}") From 8f8d791d3d7dee821e67230e1b14c6132d3ede00 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 14:04:59 -0500 Subject: [PATCH 027/111] Fix site header etc in admin --- config/urls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/urls.py b/config/urls.py index 9fbb8e39..169188f8 100644 --- a/config/urls.py +++ b/config/urls.py @@ -5,10 +5,10 @@ from django.views import defaults as default_views admin.site.site_header = ( - "SDE Indexing Administration" # default: "Django Administration" + "SDE Indexing Helper Administration" # default: "Django Administration" ) -admin.site.index_title = "SDE Indexing" # default: "Site administration" -admin.site.site_title = "SDE Indexing" # default: "Django site admin" +admin.site.index_title = "SDE Indexing Helper" # default: "Site administration" +admin.site.site_title = "SDE Indexing Helper" # default: "Django site admin" urlpatterns = [ path("", include("sde_collections.urls", namespace="sde_collections")), From bc59542c0b643f54b72d1c0685e4840f0d546c28 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 14:08:41 -0500 Subject: [PATCH 028/111] Remove unnecessary comment --- config/urls.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/urls.py b/config/urls.py index 169188f8..1c1f2f45 100644 --- a/config/urls.py +++ b/config/urls.py @@ -18,8 +18,7 @@ # User management path("users/", include("sde_indexing_helper.users.urls", namespace="users")), path("accounts/", include("allauth.urls")), - path("api-auth/", include("rest_framework.urls")) - # Your stuff: custom urls includes go here + path("api-auth/", include("rest_framework.urls")), ] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) From 261ef5abc469b1d05954850e813ee2ed5f3789a9 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:34:19 -0500 Subject: [PATCH 029/111] added ResolvedTitle model and FK bindings --- sde_collections/models/candidate_url.py | 41 +++++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index acef4114..b539dfc5 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -3,19 +3,18 @@ from urllib.parse import urlparse from django.db import models +from django.utils import timezone from .collection import Collection from .collection_choice_fields import DocumentTypes -from .pattern import ExcludePattern +from .pattern import ExcludePattern, TitlePattern class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): return self.annotate( excluded=models.Exists( - ExcludePattern.candidate_urls.through.objects.filter( - candidateurl=models.OuterRef("pk") - ) + ExcludePattern.candidate_urls.through.objects.filter(candidateurl=models.OuterRef("pk")) ) ) @@ -28,9 +27,7 @@ def get_queryset(self): class CandidateURL(models.Model): """A candidate URL scraped for a given collection.""" - collection = models.ForeignKey( - Collection, on_delete=models.CASCADE, related_name="candidate_urls" - ) + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="candidate_urls") url = models.CharField("URL") hash = models.CharField("Hash", max_length=32, blank=True, default="1") scraped_title = models.CharField( @@ -57,9 +54,7 @@ class CandidateURL(models.Model): blank=True, help_text="This is the title present on Production Server", ) - level = models.IntegerField( - "Level", default=0, blank=True, help_text="Level in the tree. Based on /." - ) + level = models.IntegerField("Level", default=0, blank=True, help_text="Level in the tree. Based on /.") visited = models.BooleanField(default=False) objects = CandidateURLManager() document_type = models.IntegerField(choices=DocumentTypes.choices, null=True) @@ -84,6 +79,14 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) + resolved_title = models.ForeignKey( + "ResolvedTitle", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="candidate_urls", + help_text="Link to the resolved title data", + ) class Meta: """Meta definition for Candidate URL.""" @@ -143,3 +146,21 @@ def save(self, *args, **kwargs): self.hash = hash_value super().save(*args, **kwargs) + + +class ResolvedTitle(models.Model): + title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE, related_name="resolved_titles") + candidate_url = models.ForeignKey(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") + resolution_status = models.BooleanField(default=False, help_text="True if resolved, False if unresolved") + resolution_date_time = models.DateTimeField(default=timezone.now) + resolved_title = models.CharField(max_length=1024, blank=True) + error_string = models.TextField(blank=True) + http_status_code = models.IntegerField(null=True, blank=True) + + def __str__(self): + status = "Resolved" if self.resolution_status else "Unresolved" + return f"{self.resolved_title} - {status}" + + class Meta: + verbose_name = "Resolved Title" + verbose_name_plural = "Resolved Titles" From a468d6d58e991cf4e2c5c79ba843205f348c79c4 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:35:00 -0500 Subject: [PATCH 030/111] added a ListView for the ResolvedTitle model --- sde_collections/views.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index 999a6684..41727ce4 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -18,7 +18,7 @@ from rest_framework.views import APIView from .forms import CollectionGithubIssueForm, CommentsForm, RequiredUrlForm -from .models.candidate_url import CandidateURL +from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, Comments, RequiredUrls from .models.collection_choice_fields import ( ConnectorChoices, @@ -462,3 +462,12 @@ def get_context_data(self, **kwargs): context["differences"] = self.data return context + + +class ResolvedTitleListView(ListView): + model = ResolvedTitle + template_name = "sde_collections/resolved_titles_list.html" + context_object_name = "resolved_titles" + + def get_queryset(self): + return super().get_queryset().order_by("-resolution_date_time") From 421cb8d163f11f491b129e0db11168967fba1299 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:35:28 -0500 Subject: [PATCH 031/111] add unidecode and lxml to base req --- requirements/base.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/base.txt b/requirements/base.txt index fcd9ac27..dfbdf806 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -28,3 +28,5 @@ PyGithub==2.2.0 tqdm==4.66.1 xmltodict==0.13.0 django-cors-headers==4.3.1 +unidecode==1.3.8 +lxml==4.9.2 From 58538153e6936aaca2694da50212d11ea23cf853 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:35:29 -0500 Subject: [PATCH 032/111] added the url path --- sde_collections/urls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 261207af..e7bd305d 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -56,4 +56,5 @@ view=views.CandidateURLAPIView.as_view(), name="candidate-url-api", ), + path("resolved-titles/", view=views.ResolvedTitleListView.as_view(), name="resolved-titles"), ] From 3c020be8a131c96f12d5092dcb2c3b380e785d5f Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:36:01 -0500 Subject: [PATCH 033/111] added model to admin console --- sde_collections/admin.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 0b0e60e4..33a4ba37 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -3,7 +3,7 @@ from django.contrib import admin, messages from django.http import HttpResponse -from .models.candidate_url import CandidateURL +from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection from .models.pattern import IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api @@ -276,6 +276,11 @@ class TitlePatternAdmin(admin.ModelAdmin): ) +class ResolvedTitleAdmin(admin.ModelAdmin): + list_display = ["title_pattern", "resolved_title", "resolution_status", "resolution_date_time"] + + admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) +admin.site.register(ResolvedTitle, ResolvedTitleAdmin) From 13480a083713102035c2eb5ff9dedc301acb43fe Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:36:03 -0500 Subject: [PATCH 034/111] replace safe_f_string with resolve_title and add col.name to context --- sde_collections/models/pattern.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 203f3d9c..44d702e1 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -3,7 +3,7 @@ from django.apps import apps from django.db import models -from ..pattern_interpreter import safe_f_string_evaluation +from ..utils.title_resolver import resolve_title from .collection_choice_fields import DocumentTypes @@ -142,10 +142,14 @@ def apply(self) -> None: updated_urls = [] for candidate_url in matched_urls: - context = {"url": candidate_url.url, "title": candidate_url.scraped_title} + context = { + "url": candidate_url.url, + "title": candidate_url.scraped_title, + "collection": self.collection.name, + } try: - generated_title = safe_f_string_evaluation(self.title_pattern, context) + generated_title = resolve_title(self.title_pattern, context) candidate_url.generated_title = generated_title updated_urls.append(candidate_url) except ValueError as e: From ae6f804d7bc6d33acf2fc6deeb62d46e387b5050 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:36:42 -0500 Subject: [PATCH 035/111] added basic html template for the resolvedTitles view --- .../sde_collections/resolved_titles_list.html | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 sde_indexing_helper/templates/sde_collections/resolved_titles_list.html diff --git a/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html b/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html new file mode 100644 index 00000000..69ed605b --- /dev/null +++ b/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html @@ -0,0 +1,50 @@ +{% extends "layouts/base.html" %} +{% load static %} +{% load i18n %} + +{% block title %}Resolved Titles{% endblock %} +{% block stylesheets %} + {{ block.super }} + +{% endblock %} + +{% block content %} +
+

Resolved Titles

+ + + + + + + + + + + + {% for resolved_title in resolved_titles %} + + + + + + + + {% endfor %} + +
URLResolved TitleStatusResolved DateHTTP Status
{{ resolved_title.candidate_url.url }}{{ resolved_title.resolved_title }}{{ resolved_title.get_resolution_status_display }}{{ resolved_title.resolution_date_time|date:"Y-m-d H:i" }}{{ resolved_title.http_status_code }}
+
+{% endblock %} + +{% block javascripts %} + {{ block.super }} + + + +{% endblock %} From 9bae8af9cd240705e6b2187bf786b5a51ececa96 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Tue, 21 May 2024 14:36:54 -0500 Subject: [PATCH 036/111] adding the migration file --- ...solvedtitle_candidateurl_resolved_title.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 sde_collections/migrations/0046_resolvedtitle_candidateurl_resolved_title.py diff --git a/sde_collections/migrations/0046_resolvedtitle_candidateurl_resolved_title.py b/sde_collections/migrations/0046_resolvedtitle_candidateurl_resolved_title.py new file mode 100644 index 00000000..4d1e6961 --- /dev/null +++ b/sde_collections/migrations/0046_resolvedtitle_candidateurl_resolved_title.py @@ -0,0 +1,61 @@ +# Generated by Django 4.2.9 on 2024-05-21 19:01 + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0045_alter_collection_workflow_status"), + ] + + operations = [ + migrations.CreateModel( + name="ResolvedTitle", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "resolution_status", + models.BooleanField(default=False, help_text="True if resolved, False if unresolved"), + ), + ("resolution_date_time", models.DateTimeField(default=django.utils.timezone.now)), + ("resolved_title", models.CharField(blank=True, max_length=1024)), + ("error_string", models.TextField(blank=True)), + ("http_status_code", models.IntegerField(blank=True, null=True)), + ( + "candidate_url", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="resolved_titles", + to="sde_collections.candidateurl", + ), + ), + ( + "title_pattern", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="resolved_titles", + to="sde_collections.titlepattern", + ), + ), + ], + options={ + "verbose_name": "Resolved Title", + "verbose_name_plural": "Resolved Titles", + }, + ), + migrations.AddField( + model_name="candidateurl", + name="resolved_title", + field=models.ForeignKey( + blank=True, + help_text="Link to the resolved title data", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="candidate_urls", + to="sde_collections.resolvedtitle", + ), + ), + ] From 6fd8664e214112e4d05d57607e26018d0a7e80e6 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 14:37:46 -0500 Subject: [PATCH 037/111] Update how errors are shown for title_pattern --- sde_indexing_helper/static/js/candidate_url_list.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index de89a6dd..640c14ba 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -442,7 +442,10 @@ function postTitlePatterns(match_pattern, title_pattern, match_pattern_type = 1) }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; - toastr.error(errorMessage); + var errorMessages = JSON.parse(errorMessage); + Object.entries(errorMessages.error).forEach(([key, value]) => { + toastr.error(value, key); + }); } }); } From a32e20eb41841f4c9a766ed6c1d49592ecb0ba36 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:37:54 -0500 Subject: [PATCH 038/111] remove example code from title_resolver --- sde_collections/utils/title_resolver.py | 40 ------------------------- 1 file changed, 40 deletions(-) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index b8877f2e..c24bc9b2 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -2,7 +2,6 @@ import ast import html as html_lib import re -from dataclasses import dataclass import requests from lxml import etree, html @@ -129,42 +128,3 @@ def resolve_title(raw_title, context): final_string += element_value return final_string - - -@dataclass -class CandidateURL: - url: str - scraped_title: str - collection: str - title_pattern: str - - -xpath = '//*[@id="main_content_wrapper"]/h4' -pattern = '{collection} Overview: xpath://*[@id="main_content_wrapper"]/h4' -urls = [ - "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=cabinet", - "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=flowbench", - "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=materials", - "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=SIprep", - "https://curator.jsc.nasa.gov/antmet/sample_preparation.cfm?section=thinandthick", -] - -candidate_urls = [ - CandidateURL(url=url, scraped_title="Scraped Title", collection="Collection Name", title_pattern=pattern) - for url in urls -] - - -for candidate_url in candidate_urls: - context = { - "url": candidate_url.url, - "title": candidate_url.scraped_title, - "collection": candidate_url.collection, - } - - title = resolve_title(candidate_url.title_pattern, context) - print(title) - print() - # value = resolve_xpath(xpath, candidate_url) - - # print(f"The value at the specified XPath is: {value}") From e0db6b32dd9e89eb156f13a877ceea4bc36324e2 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:38:33 -0500 Subject: [PATCH 039/111] remove deprecated pattern interpreter --- sde_collections/pattern_interpreter.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 sde_collections/pattern_interpreter.py diff --git a/sde_collections/pattern_interpreter.py b/sde_collections/pattern_interpreter.py deleted file mode 100644 index 4ae5d3aa..00000000 --- a/sde_collections/pattern_interpreter.py +++ /dev/null @@ -1,18 +0,0 @@ -import _ast -import ast - - -def safe_f_string_evaluation(pattern, context): - """Safely interpolates the variables in an f-string pattern using the provided context.""" - parsed = ast.parse(f"f'''{pattern}'''", mode="eval") - - # Walk through the AST to ensure it only contains safe expressions - for node in ast.walk(parsed): - if isinstance(node, _ast.FormattedValue): - if not isinstance(node.value, _ast.Name): - raise ValueError("Unsupported expression in f-string pattern.") - if node.value.id not in context: - raise ValueError(f"Variable {node.value.id} not allowed in f-string pattern.") - - compiled = compile(parsed, "", "eval") - return eval(compiled, {}, context) From 474447991fe9b0c1f8858ea24a94f7cda2bee200 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Tue, 21 May 2024 14:42:29 -0500 Subject: [PATCH 040/111] add type hinting --- sde_collections/utils/title_resolver.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index c24bc9b2..eaa491b5 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -2,13 +2,14 @@ import ast import html as html_lib import re +from typing import Any import requests from lxml import etree, html from unidecode import unidecode -def is_valid_xpath(xpath): +def is_valid_xpath(xpath: str) -> bool: try: etree.XPath(xpath) return True @@ -16,7 +17,7 @@ def is_valid_xpath(xpath): return False -def clean_text(text): +def clean_text(text: str) -> str: text_content = unidecode(text) text_content = html_lib.unescape(text_content) # remove tabs and newlines, replace them with a single space @@ -28,7 +29,7 @@ def clean_text(text): return text_content -def resolve_brace(pattern, context): +def resolve_brace(pattern: str, context: dict[str, Any]) -> str: """Safely interpolates the variables in an f-string pattern using the provided context.""" context = {"url": "www.google.com", "title": "Original Title"} parsed = ast.parse(f"f'''{pattern}'''", mode="eval") @@ -42,10 +43,10 @@ def resolve_brace(pattern, context): raise ValueError(f"Variable {node.value.id} not allowed in f-string pattern.") compiled = compile(parsed, "", "eval") - return eval(compiled, {}, context) + return str(eval(compiled, {}, context)) -def resolve_xpath(xpath, url): +def resolve_xpath(xpath: str, url: str) -> str: print("url is", url) if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") @@ -71,7 +72,7 @@ def resolve_xpath(xpath, url): raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") -def parse_title(input_string): +def parse_title(input_string: str) -> list[tuple[str, str]]: brace_pattern = re.compile(r"\{([^\}]+)\}") xpath_pattern = re.compile(r"xpath:(//[^\s]+)") @@ -113,7 +114,7 @@ def parse_title(input_string): return result -def resolve_title(raw_title, context): +def resolve_title(raw_title: str, context: dict[str, Any]) -> str: parsed_title = parse_title(raw_title) final_string = "" From 301a5bb2f07feaa5a55f4b6ae267bb09d666aaa4 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 14:44:30 -0500 Subject: [PATCH 041/111] Add a task to resolve TitlePatterns --- sde_collections/tasks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 761d92a1..f63151f7 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -8,6 +8,7 @@ from django.core.management.commands import loaddata from config import celery_app +from sde_collections.models.pattern import TitlePattern from .models.collection import Collection from .sinequa_api import Api @@ -129,3 +130,9 @@ def pull_latest_collection_metadata_from_github(): aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, ) s3_client.upload_file(FILENAME, s3_bucket_name, s3_key) + + +@celery_app.task() +def resolve_title_pattern(title_pattern_id): + title_pattern = TitlePattern.objects.get(id=title_pattern_id) + title_pattern.resolve() From 2092798d1a942f731f55198160c8dcf2fcc4d858 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 14:45:19 -0500 Subject: [PATCH 042/111] Add a validationerror block for titlepattern. TODO: Replace --- sde_collections/models/pattern.py | 42 +++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 203f3d9c..3d427fe8 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -1,6 +1,7 @@ import re from django.apps import apps +from django.core.exceptions import ValidationError from django.db import models from ..pattern_interpreter import safe_f_string_evaluation @@ -23,7 +24,9 @@ class MatchPatternTypeChoices(models.IntegerChoices): help_text="This pattern is compared against the URL of all the documents in the collection " "and matching documents will be returned", ) - match_pattern_type = models.IntegerField(choices=MatchPatternTypeChoices.choices, default=1) + match_pattern_type = models.IntegerField( + choices=MatchPatternTypeChoices.choices, default=1 + ) candidate_urls = models.ManyToManyField( "CandidateURL", related_name="%(class)s_urls", @@ -33,10 +36,14 @@ def matched_urls(self): """Find all the urls matching the pattern.""" escaped_match_pattern = re.escape(self.match_pattern) if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL: - return self.collection.candidate_urls.filter(url__regex=f"{escaped_match_pattern}$") + return self.collection.candidate_urls.filter( + url__regex=f"{escaped_match_pattern}$" + ) elif self.match_pattern_type == self.MatchPatternTypeChoices.MULTI_URL_PATTERN: return self.collection.candidate_urls.filter( - url__regex=escaped_match_pattern.replace(r"\*", ".*") # allow * wildcards + url__regex=escaped_match_pattern.replace( + r"\*", ".*" + ) # allow * wildcards ) else: raise NotImplementedError @@ -51,7 +58,10 @@ def _process_match_pattern(self) -> str: if not processed_pattern.startswith("http"): # if it doesn't begin with http, it must need a star at the beginning processed_pattern = f"*{processed_pattern}" - if self.match_pattern_type == BaseMatchPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN: + if ( + self.match_pattern_type + == BaseMatchPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN + ): # all multi urls should have a star at the end, but individuals should not processed_pattern = f"{processed_pattern}*" return processed_pattern @@ -89,7 +99,9 @@ def apply(self) -> None: candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create( objs=[ - ExcludePattern.candidate_urls.through(candidateurl_id=candidate_url_id, excludepattern_id=self.id) + ExcludePattern.candidate_urls.through( + candidateurl_id=candidate_url_id, excludepattern_id=self.id + ) for candidate_url_id in candidate_url_ids ] ) @@ -112,7 +124,9 @@ def apply(self) -> None: candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create( objs=[ - IncludePattern.candidate_urls.through(candidateurl_id=candidate_url_id, includepattern_id=self.id) + IncludePattern.candidate_urls.through( + candidateurl_id=candidate_url_id, includepattern_id=self.id + ) for candidate_url_id in candidate_url_ids ] ) @@ -129,11 +143,17 @@ class Meta: unique_together = ("collection", "match_pattern") +def validate_title_pattern(title_pattern_string): + if not title_pattern_string.startswith("http"): + raise ValidationError("Title pattern has to start with http") + + class TitlePattern(BaseMatchPattern): title_pattern = models.CharField( "Title Pattern", help_text="This is the pattern for the new title. You can either write an exact replacement string" " (no quotes required) or you can write sinequa-valid code", + validators=[validate_title_pattern], ) def apply(self) -> None: @@ -156,13 +176,19 @@ def apply(self) -> None: TitlePatternCandidateURL = TitlePattern.candidate_urls.through pattern_url_associations = [ - TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) for url in updated_urls + TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) + for url in updated_urls ] - TitlePatternCandidateURL.objects.bulk_create(pattern_url_associations, ignore_conflicts=True) + TitlePatternCandidateURL.objects.bulk_create( + pattern_url_associations, ignore_conflicts=True + ) def unapply(self) -> None: self.candidate_urls.update(generated_title="") + def resolve(self) -> None: + print(f"Resolving title pattern {self.id}") + class Meta: """Meta definition for TitlePattern.""" From df0c4fce9a00a7dd8270104af7d3f7fdb617bbc5 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 15:25:05 -0500 Subject: [PATCH 043/111] Update black to use 120 line length --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..55ec8d78 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 120 From 36106edcc15acc559083ad809b766b578b242852 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 15:26:11 -0500 Subject: [PATCH 044/111] Remove unused lines --- sde_collections/utils/title_resolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index eaa491b5..124a355e 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -31,7 +31,6 @@ def clean_text(text: str) -> str: def resolve_brace(pattern: str, context: dict[str, Any]) -> str: """Safely interpolates the variables in an f-string pattern using the provided context.""" - context = {"url": "www.google.com", "title": "Original Title"} parsed = ast.parse(f"f'''{pattern}'''", mode="eval") # Walk through the AST to ensure it only contains safe expressions @@ -40,14 +39,15 @@ def resolve_brace(pattern: str, context: dict[str, Any]) -> str: if not isinstance(node.value, _ast.Name): raise ValueError("Unsupported expression in f-string pattern.") if node.value.id not in context: - raise ValueError(f"Variable {node.value.id} not allowed in f-string pattern.") + raise ValueError( + f"Variable {node.value.id} not allowed in f-string pattern. Allowed variables are: {context.keys()}" + ) compiled = compile(parsed, "", "eval") return str(eval(compiled, {}, context)) def resolve_xpath(xpath: str, url: str) -> str: - print("url is", url) if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") From f59e79e5604a2ed1cf49312dcaceb59bd7b1ee0f Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:16:03 -0500 Subject: [PATCH 045/111] Add function to validate fstring --- sde_collections/utils/title_resolver.py | 32 +++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index 124a355e..98036e76 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -17,6 +17,26 @@ def is_valid_xpath(xpath: str) -> bool: return False +def is_valid_fstring(pattern: str) -> bool: + context = { + "url": "", + "title": "", + "collection": "", + } + parsed = ast.parse(f"f'''{pattern}'''", mode="eval") + # Walk through the AST to ensure it only contains safe expressions + for node in ast.walk(parsed): + if isinstance(node, _ast.FormattedValue): + if not isinstance(node.value, _ast.Name): + raise ValueError("Unsupported expression in f-string pattern.") + if node.value.id not in context: + variables_allowed = ", ".join([key for key in context.keys()]) + raise ValueError( + f"Variable '{node.value.id}' not allowed in f-string pattern." + f" Allowed variables are: {variables_allowed}" + ) + + def clean_text(text: str) -> str: text_content = unidecode(text) text_content = html_lib.unescape(text_content) @@ -33,15 +53,7 @@ def resolve_brace(pattern: str, context: dict[str, Any]) -> str: """Safely interpolates the variables in an f-string pattern using the provided context.""" parsed = ast.parse(f"f'''{pattern}'''", mode="eval") - # Walk through the AST to ensure it only contains safe expressions - for node in ast.walk(parsed): - if isinstance(node, _ast.FormattedValue): - if not isinstance(node.value, _ast.Name): - raise ValueError("Unsupported expression in f-string pattern.") - if node.value.id not in context: - raise ValueError( - f"Variable {node.value.id} not allowed in f-string pattern. Allowed variables are: {context.keys()}" - ) + is_valid_fstring(pattern) # Refactor this compiled = compile(parsed, "", "eval") return str(eval(compiled, {}, context)) @@ -83,7 +95,7 @@ def parse_title(input_string: str) -> list[tuple[str, str]]: # Try to match brace pattern brace_match = brace_pattern.match(input_string, current_index) if brace_match: - result.append(("brace", brace_match.group(1))) + result.append(("brace", "{" + brace_match.group(1) + "}")) current_index = brace_match.end() continue From 967b25702d685433ce0b38007b9a2e0f5d87bb2e Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:17:12 -0500 Subject: [PATCH 046/111] Add validate_title_pattern function to patterns --- sde_collections/models/pattern.py | 54 +++++++++++++++---------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 7fed55fe..4fc47b26 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -4,7 +4,12 @@ from django.core.exceptions import ValidationError from django.db import models -from ..utils.title_resolver import resolve_title +from ..utils.title_resolver import ( + is_valid_fstring, + is_valid_xpath, + parse_title, + resolve_title, +) from .collection_choice_fields import DocumentTypes @@ -24,9 +29,7 @@ class MatchPatternTypeChoices(models.IntegerChoices): help_text="This pattern is compared against the URL of all the documents in the collection " "and matching documents will be returned", ) - match_pattern_type = models.IntegerField( - choices=MatchPatternTypeChoices.choices, default=1 - ) + match_pattern_type = models.IntegerField(choices=MatchPatternTypeChoices.choices, default=1) candidate_urls = models.ManyToManyField( "CandidateURL", related_name="%(class)s_urls", @@ -36,14 +39,10 @@ def matched_urls(self): """Find all the urls matching the pattern.""" escaped_match_pattern = re.escape(self.match_pattern) if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL: - return self.collection.candidate_urls.filter( - url__regex=f"{escaped_match_pattern}$" - ) + return self.collection.candidate_urls.filter(url__regex=f"{escaped_match_pattern}$") elif self.match_pattern_type == self.MatchPatternTypeChoices.MULTI_URL_PATTERN: return self.collection.candidate_urls.filter( - url__regex=escaped_match_pattern.replace( - r"\*", ".*" - ) # allow * wildcards + url__regex=escaped_match_pattern.replace(r"\*", ".*") # allow * wildcards ) else: raise NotImplementedError @@ -58,10 +57,7 @@ def _process_match_pattern(self) -> str: if not processed_pattern.startswith("http"): # if it doesn't begin with http, it must need a star at the beginning processed_pattern = f"*{processed_pattern}" - if ( - self.match_pattern_type - == BaseMatchPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN - ): + if self.match_pattern_type == BaseMatchPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN: # all multi urls should have a star at the end, but individuals should not processed_pattern = f"{processed_pattern}*" return processed_pattern @@ -99,9 +95,7 @@ def apply(self) -> None: candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create( objs=[ - ExcludePattern.candidate_urls.through( - candidateurl_id=candidate_url_id, excludepattern_id=self.id - ) + ExcludePattern.candidate_urls.through(candidateurl_id=candidate_url_id, excludepattern_id=self.id) for candidate_url_id in candidate_url_ids ] ) @@ -124,9 +118,7 @@ def apply(self) -> None: candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create( objs=[ - IncludePattern.candidate_urls.through( - candidateurl_id=candidate_url_id, includepattern_id=self.id - ) + IncludePattern.candidate_urls.through(candidateurl_id=candidate_url_id, includepattern_id=self.id) for candidate_url_id in candidate_url_ids ] ) @@ -144,8 +136,19 @@ class Meta: def validate_title_pattern(title_pattern_string): - if not title_pattern_string.startswith("http"): - raise ValidationError("Title pattern has to start with http") + parsed_title = parse_title(title_pattern_string) + + for element in parsed_title: + element_type, element_value = element + + if element_type == "xpath": + if not is_valid_xpath(element_value): + raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") + elif element_type == "brace": + try: + is_valid_fstring(element_value) + except ValueError as e: + raise ValidationError(str(e)) class TitlePattern(BaseMatchPattern): @@ -180,12 +183,9 @@ def apply(self) -> None: TitlePatternCandidateURL = TitlePattern.candidate_urls.through pattern_url_associations = [ - TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) - for url in updated_urls + TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) for url in updated_urls ] - TitlePatternCandidateURL.objects.bulk_create( - pattern_url_associations, ignore_conflicts=True - ) + TitlePatternCandidateURL.objects.bulk_create(pattern_url_associations, ignore_conflicts=True) def unapply(self) -> None: self.candidate_urls.update(generated_title="") From 8799f23a8b2bd16ea3d1fee2f2f96181614aaf16 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:18:44 -0500 Subject: [PATCH 047/111] Remove field from model --- ...ve_candidateurl_resolved_title_and_more.py | 27 +++++++++++++++++++ sde_collections/models/candidate_url.py | 8 ------ 2 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 sde_collections/migrations/0047_remove_candidateurl_resolved_title_and_more.py diff --git a/sde_collections/migrations/0047_remove_candidateurl_resolved_title_and_more.py b/sde_collections/migrations/0047_remove_candidateurl_resolved_title_and_more.py new file mode 100644 index 00000000..46720186 --- /dev/null +++ b/sde_collections/migrations/0047_remove_candidateurl_resolved_title_and_more.py @@ -0,0 +1,27 @@ +# Generated by Django 4.2.9 on 2024-05-21 21:18 + +from django.db import migrations, models +import sde_collections.models.pattern + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0046_resolvedtitle_candidateurl_resolved_title"), + ] + + operations = [ + migrations.RemoveField( + model_name="candidateurl", + name="resolved_title", + ), + migrations.AlterField( + model_name="titlepattern", + name="title_pattern", + field=models.CharField( + help_text="This is the pattern for the new title. You can either write an exact replacement string (no quotes required) or you can write sinequa-valid code", + validators=[sde_collections.models.pattern.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index b539dfc5..6f5a046e 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -79,14 +79,6 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) - resolved_title = models.ForeignKey( - "ResolvedTitle", - on_delete=models.SET_NULL, - null=True, - blank=True, - related_name="candidate_urls", - help_text="Link to the resolved title data", - ) class Meta: """Meta definition for Candidate URL.""" From 01d5cc2bf19495eb1045cff4504dc048ddc1936f Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:20:37 -0500 Subject: [PATCH 048/111] Change from foreign key to onetoonefield --- .../0048_alter_resolvedtitle_candidate_url.py | 23 +++++++++++++++++++ sde_collections/models/candidate_url.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 sde_collections/migrations/0048_alter_resolvedtitle_candidate_url.py diff --git a/sde_collections/migrations/0048_alter_resolvedtitle_candidate_url.py b/sde_collections/migrations/0048_alter_resolvedtitle_candidate_url.py new file mode 100644 index 00000000..0a2a7c82 --- /dev/null +++ b/sde_collections/migrations/0048_alter_resolvedtitle_candidate_url.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.9 on 2024-05-21 21:20 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0047_remove_candidateurl_resolved_title_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="resolvedtitle", + name="candidate_url", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="resolved_titles", + to="sde_collections.candidateurl", + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 6f5a046e..03dc6af5 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -142,7 +142,7 @@ def save(self, *args, **kwargs): class ResolvedTitle(models.Model): title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE, related_name="resolved_titles") - candidate_url = models.ForeignKey(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") + candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") resolution_status = models.BooleanField(default=False, help_text="True if resolved, False if unresolved") resolution_date_time = models.DateTimeField(default=timezone.now) resolved_title = models.CharField(max_length=1024, blank=True) From beaaae18a0edcc1c8a31e238475b9b6031fc9947 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:23:51 -0500 Subject: [PATCH 049/111] Add auto_now_add to resolution_date_time --- ...alter_resolvedtitle_resolution_date_time.py | 18 ++++++++++++++++++ sde_collections/models/candidate_url.py | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 sde_collections/migrations/0049_alter_resolvedtitle_resolution_date_time.py diff --git a/sde_collections/migrations/0049_alter_resolvedtitle_resolution_date_time.py b/sde_collections/migrations/0049_alter_resolvedtitle_resolution_date_time.py new file mode 100644 index 00000000..f51f9fb0 --- /dev/null +++ b/sde_collections/migrations/0049_alter_resolvedtitle_resolution_date_time.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.9 on 2024-05-21 21:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0048_alter_resolvedtitle_candidate_url"), + ] + + operations = [ + migrations.AlterField( + model_name="resolvedtitle", + name="resolution_date_time", + field=models.DateTimeField(auto_now_add=True), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 03dc6af5..2f457dcc 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -3,7 +3,6 @@ from urllib.parse import urlparse from django.db import models -from django.utils import timezone from .collection import Collection from .collection_choice_fields import DocumentTypes @@ -144,7 +143,7 @@ class ResolvedTitle(models.Model): title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE, related_name="resolved_titles") candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") resolution_status = models.BooleanField(default=False, help_text="True if resolved, False if unresolved") - resolution_date_time = models.DateTimeField(default=timezone.now) + resolution_date_time = models.DateTimeField(auto_now_add=True) resolved_title = models.CharField(max_length=1024, blank=True) error_string = models.TextField(blank=True) http_status_code = models.IntegerField(null=True, blank=True) From f6e3475850a761615872262e06a99d5f674794b4 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Tue, 21 May 2024 16:25:05 -0500 Subject: [PATCH 050/111] Remove length requirement from resolved_title --- .../0050_alter_resolvedtitle_resolved_title.py | 18 ++++++++++++++++++ sde_collections/models/candidate_url.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 sde_collections/migrations/0050_alter_resolvedtitle_resolved_title.py diff --git a/sde_collections/migrations/0050_alter_resolvedtitle_resolved_title.py b/sde_collections/migrations/0050_alter_resolvedtitle_resolved_title.py new file mode 100644 index 00000000..3c47818f --- /dev/null +++ b/sde_collections/migrations/0050_alter_resolvedtitle_resolved_title.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.9 on 2024-05-21 21:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0049_alter_resolvedtitle_resolution_date_time"), + ] + + operations = [ + migrations.AlterField( + model_name="resolvedtitle", + name="resolved_title", + field=models.CharField(blank=True), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 2f457dcc..23ef0b85 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -144,7 +144,7 @@ class ResolvedTitle(models.Model): candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") resolution_status = models.BooleanField(default=False, help_text="True if resolved, False if unresolved") resolution_date_time = models.DateTimeField(auto_now_add=True) - resolved_title = models.CharField(max_length=1024, blank=True) + resolved_title = models.CharField(blank=True) error_string = models.TextField(blank=True) http_status_code = models.IntegerField(null=True, blank=True) From 2c07f21d98e3c86705f8b0336cceaad440815c90 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 22 May 2024 10:25:36 -0500 Subject: [PATCH 051/111] added a base class and migraton --- ...ter_resolvedtitle_error_string_and_more.py | 23 +++++++++++++++++++ sde_collections/models/candidate_url.py | 22 ++++++++++-------- 2 files changed, 36 insertions(+), 9 deletions(-) create mode 100644 sde_collections/migrations/0051_alter_resolvedtitle_error_string_and_more.py diff --git a/sde_collections/migrations/0051_alter_resolvedtitle_error_string_and_more.py b/sde_collections/migrations/0051_alter_resolvedtitle_error_string_and_more.py new file mode 100644 index 00000000..f6e69f23 --- /dev/null +++ b/sde_collections/migrations/0051_alter_resolvedtitle_error_string_and_more.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.9 on 2024-05-22 15:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0050_alter_resolvedtitle_resolved_title"), + ] + + operations = [ + migrations.AlterField( + model_name="resolvedtitle", + name="error_string", + field=models.TextField(blank=True, default=""), + ), + migrations.AlterField( + model_name="resolvedtitle", + name="resolved_title", + field=models.CharField(blank=True, default=""), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 23ef0b85..64453778 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -139,19 +139,23 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) -class ResolvedTitle(models.Model): +class ResolvedTitleBase(models.Model): title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE, related_name="resolved_titles") candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") - resolution_status = models.BooleanField(default=False, help_text="True if resolved, False if unresolved") - resolution_date_time = models.DateTimeField(auto_now_add=True) - resolved_title = models.CharField(blank=True) - error_string = models.TextField(blank=True) - http_status_code = models.IntegerField(null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + abstract = True + - def __str__(self): - status = "Resolved" if self.resolution_status else "Unresolved" - return f"{self.resolved_title} - {status}" +class ResolvedTitle(ResolvedTitleBase): + resolved_title = models.CharField(blank=True, default="") class Meta: verbose_name = "Resolved Title" verbose_name_plural = "Resolved Titles" + + +class ResolvedTitleError(ResolvedTitleBase): + error_string = models.TextField(blank=True, default="") + http_status_code = models.IntegerField(null=True, blank=True) From d63bd50222d440e6ab272760fcd1ed2573cf4930 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 22 May 2024 11:10:18 -0500 Subject: [PATCH 052/111] made changes to models and pattern --- sde_collections/models/candidate_url.py | 11 ++++++++--- sde_collections/models/pattern.py | 14 +++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 64453778..8f4e423e 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -140,8 +140,8 @@ def save(self, *args, **kwargs): class ResolvedTitleBase(models.Model): - title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE, related_name="resolved_titles") - candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE, related_name="resolved_titles") + title_pattern = models.ForeignKey(TitlePattern, on_delete=models.CASCADE) + candidate_url = models.OneToOneField(CandidateURL, on_delete=models.CASCADE) created_at = models.DateTimeField(auto_now_add=True) class Meta: @@ -155,7 +155,12 @@ class Meta: verbose_name = "Resolved Title" verbose_name_plural = "Resolved Titles" + def save(self, *args, **kwargs): + # Finds the linked candidate URL and deletes ResolvedTitleError objects linked to it + ResolvedTitleError.objects.filter(candidate_url=self.candidate_url).delete() + super().save(*args, **kwargs) + class ResolvedTitleError(ResolvedTitleBase): - error_string = models.TextField(blank=True, default="") + error_string = models.TextField(null=False, blank=False) http_status_code = models.IntegerField(null=True, blank=True) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 4fc47b26..2a31e38c 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -1,6 +1,6 @@ import re -from django.apps import apps +# from django.apps import apps from django.core.exceptions import ValidationError from django.db import models @@ -12,6 +12,8 @@ ) from .collection_choice_fields import DocumentTypes +# from sde_collections.models.candidate_url import ResolvedTitleError + class BaseMatchPattern(models.Model): class MatchPatternTypeChoices(models.IntegerChoices): @@ -160,7 +162,7 @@ class TitlePattern(BaseMatchPattern): ) def apply(self) -> None: - CandidateURL = apps.get_model("sde_collections", "CandidateURL") + # CandidateURL = apps.get_model("sde_collections", "CandidateURL") matched_urls = self.matched_urls() updated_urls = [] @@ -174,13 +176,11 @@ def apply(self) -> None: try: generated_title = resolve_title(self.title_pattern, context) candidate_url.generated_title = generated_title - updated_urls.append(candidate_url) except ValueError as e: - print(f"Error applying title pattern to {candidate_url.url}: {e}") - - if updated_urls: - CandidateURL.objects.bulk_update(updated_urls, ["generated_title"]) + # error_object = ResolvedTitleError.objects.create(error_string=message) + # ResolvedTitleError.objects.create(error_string=str(e), http_status_code=) + raise ValidationError(str(e)) TitlePatternCandidateURL = TitlePattern.candidate_urls.through pattern_url_associations = [ TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) for url in updated_urls From ec4f8eba3915bfcd6068118c70919390d9d9e99f Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 22 May 2024 11:25:24 -0500 Subject: [PATCH 053/111] add error pass through to the titlepatten apply --- sde_collections/models/pattern.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 2a31e38c..aa956643 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -4,6 +4,8 @@ from django.core.exceptions import ValidationError from django.db import models +from sde_collections.models.candidate_url import ResolvedTitleError + from ..utils.title_resolver import ( is_valid_fstring, is_valid_xpath, @@ -177,10 +179,17 @@ def apply(self) -> None: generated_title = resolve_title(self.title_pattern, context) candidate_url.generated_title = generated_title except ValueError as e: - # error_object = ResolvedTitleError.objects.create(error_string=message) + message = str(e) + error_object = ResolvedTitleError.objects.create(error_string=message) + + status_code = re.search(r"Status code: (\d+)", message) + if status_code: + error_object.http_status_code = int(status_code.group(1)) + + error_object.save() - # ResolvedTitleError.objects.create(error_string=str(e), http_status_code=) raise ValidationError(str(e)) + TitlePatternCandidateURL = TitlePattern.candidate_urls.through pattern_url_associations = [ TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) for url in updated_urls From 7388e81207f7f28d9681d77a1d1cf8433f47bdf3 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 22 May 2024 11:26:49 -0500 Subject: [PATCH 054/111] change resolve_title_pattern task to apply --- sde_collections/tasks.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index f63151f7..6a66c3ec 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -32,9 +32,7 @@ def _get_data_to_import(collection, server_name): page = 1 while True: print(f"Getting page: {page}") - response = api.query( - page=page, collection_config_folder=collection.config_folder - ) + response = api.query(page=page, collection_config_folder=collection.config_folder) if response["cursorRowCount"] == 0: break @@ -75,9 +73,7 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]): urls_file = f"{TEMP_FOLDER_NAME}/{collection.config_folder}.json" print("Getting responses from API") - data_to_import = _get_data_to_import( - server_name=server_name, collection=collection - ) + data_to_import = _get_data_to_import(server_name=server_name, collection=collection) print(f"Got {len(data_to_import)} records for {collection.config_folder}") print("Dumping django fixture to file") @@ -135,4 +131,4 @@ def pull_latest_collection_metadata_from_github(): @celery_app.task() def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) - title_pattern.resolve() + title_pattern.apply() From 3cf7e2029fdd675a44042ceea5b86cb2348aff52 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 22 May 2024 15:04:05 -0500 Subject: [PATCH 055/111] initial working title code --- sde_collections/admin.py | 2 +- ..._time_resolvedtitle_created_at_and_more.py | 63 ++++++++++++++++ sde_collections/models/pattern.py | 27 +++++-- sde_collections/tasks.py | 3 +- sde_collections/urls.py | 1 + sde_collections/views.py | 26 +++++-- .../titles_and_errors_list.html | 74 +++++++++++++++++++ 7 files changed, 182 insertions(+), 14 deletions(-) create mode 100644 sde_collections/migrations/0052_rename_resolution_date_time_resolvedtitle_created_at_and_more.py create mode 100644 sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 33a4ba37..0941c6b0 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -277,7 +277,7 @@ class TitlePatternAdmin(admin.ModelAdmin): class ResolvedTitleAdmin(admin.ModelAdmin): - list_display = ["title_pattern", "resolved_title", "resolution_status", "resolution_date_time"] + list_display = ["title_pattern", "candidate_url", "resolved_title", "created_at"] admin.site.register(CandidateURL, CandidateURLAdmin) diff --git a/sde_collections/migrations/0052_rename_resolution_date_time_resolvedtitle_created_at_and_more.py b/sde_collections/migrations/0052_rename_resolution_date_time_resolvedtitle_created_at_and_more.py new file mode 100644 index 00000000..621f8f3e --- /dev/null +++ b/sde_collections/migrations/0052_rename_resolution_date_time_resolvedtitle_created_at_and_more.py @@ -0,0 +1,63 @@ +# Generated by Django 4.2.9 on 2024-05-22 17:39 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0051_alter_resolvedtitle_error_string_and_more"), + ] + + operations = [ + migrations.RenameField( + model_name="resolvedtitle", + old_name="resolution_date_time", + new_name="created_at", + ), + migrations.RemoveField( + model_name="resolvedtitle", + name="error_string", + ), + migrations.RemoveField( + model_name="resolvedtitle", + name="http_status_code", + ), + migrations.RemoveField( + model_name="resolvedtitle", + name="resolution_status", + ), + migrations.AlterField( + model_name="resolvedtitle", + name="candidate_url", + field=models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to="sde_collections.candidateurl"), + ), + migrations.AlterField( + model_name="resolvedtitle", + name="title_pattern", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="sde_collections.titlepattern"), + ), + migrations.CreateModel( + name="ResolvedTitleError", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("error_string", models.TextField()), + ("http_status_code", models.IntegerField(blank=True, null=True)), + ( + "candidate_url", + models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, to="sde_collections.candidateurl" + ), + ), + ( + "title_pattern", + models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="sde_collections.titlepattern"), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index aa956643..9e63c687 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -1,10 +1,12 @@ import re -# from django.apps import apps +from django.apps import apps from django.core.exceptions import ValidationError from django.db import models +from django.db.models.signals import post_save +from django.dispatch import receiver -from sde_collections.models.candidate_url import ResolvedTitleError +from sde_collections.tasks import resolve_title_pattern from ..utils.title_resolver import ( is_valid_fstring, @@ -14,8 +16,6 @@ ) from .collection_choice_fields import DocumentTypes -# from sde_collections.models.candidate_url import ResolvedTitleError - class BaseMatchPattern(models.Model): class MatchPatternTypeChoices(models.IntegerChoices): @@ -164,9 +164,10 @@ class TitlePattern(BaseMatchPattern): ) def apply(self) -> None: - # CandidateURL = apps.get_model("sde_collections", "CandidateURL") matched_urls = self.matched_urls() updated_urls = [] + ResolvedTitle = apps.get_model("sde_collections", "ResolvedTitle") + ResolvedTitleError = apps.get_model("sde_collections", "ResolvedTitleError") for candidate_url in matched_urls: context = { @@ -177,7 +178,15 @@ def apply(self) -> None: try: generated_title = resolve_title(self.title_pattern, context) - candidate_url.generated_title = generated_title + + # check to see if the candidate url has an existing resolved title and delete it + ResolvedTitle.objects.filter(candidate_url=candidate_url).delete() + + resolved_title = ResolvedTitle.objects.create( + title_pattern=self, candidate_url=candidate_url, resolved_title=generated_title + ) + resolved_title.save() + except ValueError as e: message = str(e) error_object = ResolvedTitleError.objects.create(error_string=message) @@ -235,3 +244,9 @@ class Meta: verbose_name = "Document Type Pattern" verbose_name_plural = "Document Type Patterns" unique_together = ("collection", "match_pattern") + + +@receiver(post_save, sender=TitlePattern) +def send_title_patterns_to_celery(sender, instance, created, **kwargs): + if created: + resolve_title_pattern.delay(instance.id) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 6a66c3ec..659f4cc8 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -3,12 +3,12 @@ import shutil import boto3 +from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata from config import celery_app -from sde_collections.models.pattern import TitlePattern from .models.collection import Collection from .sinequa_api import Api @@ -130,5 +130,6 @@ def pull_latest_collection_metadata_from_github(): @celery_app.task() def resolve_title_pattern(title_pattern_id): + TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() diff --git a/sde_collections/urls.py b/sde_collections/urls.py index e7bd305d..9877b421 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -57,4 +57,5 @@ name="candidate-url-api", ), path("resolved-titles/", view=views.ResolvedTitleListView.as_view(), name="resolved-titles"), + path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), ] diff --git a/sde_collections/views.py b/sde_collections/views.py index 41727ce4..a4b21950 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -4,10 +4,10 @@ from django.contrib.auth import get_user_model from django.contrib.auth.mixins import LoginRequiredMixin from django.db import models -from django.shortcuts import get_object_or_404, redirect +from django.shortcuts import get_object_or_404, redirect, render from django.urls import reverse from django.utils import timezone -from django.views.generic import TemplateView +from django.views.generic import TemplateView, View from django.views.generic.detail import DetailView from django.views.generic.edit import DeleteView from django.views.generic.list import ListView @@ -18,7 +18,7 @@ from rest_framework.views import APIView from .forms import CollectionGithubIssueForm, CommentsForm, RequiredUrlForm -from .models.candidate_url import CandidateURL, ResolvedTitle +from .models.candidate_url import CandidateURL, ResolvedTitle, ResolvedTitleError from .models.collection import Collection, Comments, RequiredUrls from .models.collection_choice_fields import ( ConnectorChoices, @@ -466,8 +466,22 @@ def get_context_data(self, **kwargs): class ResolvedTitleListView(ListView): model = ResolvedTitle - template_name = "sde_collections/resolved_titles_list.html" context_object_name = "resolved_titles" + template_name = "candidate_url_list.html" - def get_queryset(self): - return super().get_queryset().order_by("-resolution_date_time") + +class ResolvedTitleErrorListView(ListView): + model = ResolvedTitleError + context_object_name = "resolved_title_errors" + template_name = "candidate_url_list.html" + + +class TitlesAndErrorsView(View): + def get(self, request, *args, **kwargs): + resolved_titles = ResolvedTitle.objects.all() + resolved_title_errors = ResolvedTitleError.objects.all() + context = { + "resolved_titles": resolved_titles, + "resolved_title_errors": resolved_title_errors, + } + return render(request, "sde_collections/titles_and_errors_list.html", context) diff --git a/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html b/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html new file mode 100644 index 00000000..a9352cd1 --- /dev/null +++ b/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html @@ -0,0 +1,74 @@ +{% extends "layouts/base.html" %} +{% load static %} +{% load i18n %} + +{% block title %}Resolved Titles{% endblock %} + +{% block stylesheets %} + {{ block.super }} + +{% endblock %} + +{% block content %} +
+

Candidate URLs with Resolved Titles and Errors

+ +

Resolved Titles

+ + + + + + + + + + {% for resolved_title in resolved_titles %} + + + + + + {% endfor %} + +
Candidate URLResolved TitleCreated At
{{ resolved_title.candidate_url }}{{ resolved_title.resolved_title }}{{ resolved_title.created_at|date:"Y-m-d H:i" }}
+ +

Resolved Title Errors

+ + + + + + + + + + + {% for resolved_title_error in resolved_title_errors %} + + + + + + + {% endfor %} + +
Candidate URLError StringHTTP Status CodeCreated At
{{ resolved_title_error.candidate_url }}{{ resolved_title_error.error_string }}{{ resolved_title_error.http_status_code }}{{ resolved_title_error.created_at|date:"Y-m-d H:i" }}
+
+{% endblock %} + +{% block javascripts %} + {{ block.super }} + + + +{% endblock %} From 0a8a497dfd93d5e4d217018fcc78b7f8d02fd68a Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Wed, 22 May 2024 15:16:55 -0500 Subject: [PATCH 056/111] add title pattern to the html --- .../templates/sde_collections/titles_and_errors_list.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html b/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html index a9352cd1..129f17dd 100644 --- a/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html +++ b/sde_indexing_helper/templates/sde_collections/titles_and_errors_list.html @@ -18,6 +18,7 @@

Resolved Titles

Candidate URL + Title Pattern Resolved Title Created At @@ -26,6 +27,7 @@

Resolved Titles

{% for resolved_title in resolved_titles %} {{ resolved_title.candidate_url }} + {{ resolved_title.title_pattern }} {{ resolved_title.resolved_title }} {{ resolved_title.created_at|date:"Y-m-d H:i" }} From 2dd31e851cb527a92cd3ac73c48e69f62e0d5cb6 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 10:55:14 -0500 Subject: [PATCH 057/111] add retry logic and write the generated title field --- sde_collections/models/pattern.py | 9 ++++++--- sde_collections/tasks.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 9e63c687..5ca11f9f 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -2,7 +2,7 @@ from django.apps import apps from django.core.exceptions import ValidationError -from django.db import models +from django.db import models, transaction from django.db.models.signals import post_save from django.dispatch import receiver @@ -187,6 +187,9 @@ def apply(self) -> None: ) resolved_title.save() + candidate_url.generated_title = generated_title + candidate_url.save() + except ValueError as e: message = str(e) error_object = ResolvedTitleError.objects.create(error_string=message) @@ -247,6 +250,6 @@ class Meta: @receiver(post_save, sender=TitlePattern) -def send_title_patterns_to_celery(sender, instance, created, **kwargs): +def post_save_handler(sender, instance, created, **kwargs): if created: - resolve_title_pattern.delay(instance.id) + transaction.on_commit(lambda: resolve_title_pattern.delay(instance.pk)) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 659f4cc8..7b2a8bb9 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -3,6 +3,7 @@ import shutil import boto3 +from celery import shared_task from django.apps import apps from django.conf import settings from django.core import management @@ -128,8 +129,13 @@ def pull_latest_collection_metadata_from_github(): s3_client.upload_file(FILENAME, s3_bucket_name, s3_key) -@celery_app.task() -def resolve_title_pattern(title_pattern_id): +@shared_task(bind=True, max_retries=5, default_retry_delay=1) +def resolve_title_pattern(self, title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") - title_pattern = TitlePattern.objects.get(id=title_pattern_id) - title_pattern.apply() + + try: + title_pattern = TitlePattern.objects.get(id=title_pattern_id) + title_pattern.apply() + except TitlePattern.DoesNotExist: + # Retry the task if the title pattern is not yet available + raise self.retry(countdown=0.5) From a2c1301aa8a396e065e72afab20a19815068f601 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 13:11:08 -0500 Subject: [PATCH 058/111] save errors to database --- sde_collections/models/pattern.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 5ca11f9f..90b5c47e 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -192,13 +192,15 @@ def apply(self) -> None: except ValueError as e: message = str(e) - error_object = ResolvedTitleError.objects.create(error_string=message) + resolved_title_error = ResolvedTitleError.objects.create( + title_pattern=self, candidate_url=candidate_url, error_string=message + ) status_code = re.search(r"Status code: (\d+)", message) if status_code: - error_object.http_status_code = int(status_code.group(1)) + resolved_title_error.http_status_code = int(status_code.group(1)) - error_object.save() + resolved_title_error.save() raise ValidationError(str(e)) From 6dfd2935f134ab7f98aaf74a416b60dfe6d8b657 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 13:58:49 -0500 Subject: [PATCH 059/111] revert to previous task handler --- sde_collections/tasks.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 7b2a8bb9..659f4cc8 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -3,7 +3,6 @@ import shutil import boto3 -from celery import shared_task from django.apps import apps from django.conf import settings from django.core import management @@ -129,13 +128,8 @@ def pull_latest_collection_metadata_from_github(): s3_client.upload_file(FILENAME, s3_bucket_name, s3_key) -@shared_task(bind=True, max_retries=5, default_retry_delay=1) -def resolve_title_pattern(self, title_pattern_id): +@celery_app.task() +def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") - - try: - title_pattern = TitlePattern.objects.get(id=title_pattern_id) - title_pattern.apply() - except TitlePattern.DoesNotExist: - # Retry the task if the title pattern is not yet available - raise self.retry(countdown=0.5) + title_pattern = TitlePattern.objects.get(id=title_pattern_id) + title_pattern.apply() From 323e2735b9787cad4711701d9b12c6bd5056045a Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 13:59:04 -0500 Subject: [PATCH 060/111] add select related to titlesanderrorsview --- sde_collections/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index a4b21950..eea638e4 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -478,8 +478,8 @@ class ResolvedTitleErrorListView(ListView): class TitlesAndErrorsView(View): def get(self, request, *args, **kwargs): - resolved_titles = ResolvedTitle.objects.all() - resolved_title_errors = ResolvedTitleError.objects.all() + resolved_titles = ResolvedTitle.objects.select_related("title_pattern", "candidate_url").all() + resolved_title_errors = ResolvedTitleError.objects.select_related("title_pattern", "candidate_url").all() context = { "resolved_titles": resolved_titles, "resolved_title_errors": resolved_title_errors, From 72b09f5d1e2bc3e06647b737dc8e186b104b6cc8 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 13:59:46 -0500 Subject: [PATCH 061/111] add validationerror to titlepattern handling --- sde_collections/models/pattern.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 90b5c47e..c1ec85b9 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -190,7 +190,7 @@ def apply(self) -> None: candidate_url.generated_title = generated_title candidate_url.save() - except ValueError as e: + except (ValueError, ValidationError) as e: message = str(e) resolved_title_error = ResolvedTitleError.objects.create( title_pattern=self, candidate_url=candidate_url, error_string=message @@ -202,8 +202,6 @@ def apply(self) -> None: resolved_title_error.save() - raise ValidationError(str(e)) - TitlePatternCandidateURL = TitlePattern.candidate_urls.through pattern_url_associations = [ TitlePatternCandidateURL(titlepattern_id=self.id, candidateurl_id=url.id) for url in updated_urls From 9683e2488e79c324d919309f67962aa2c3b0f7bd Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 14:06:50 -0500 Subject: [PATCH 062/111] remove unused resolve function --- sde_collections/models/pattern.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index c1ec85b9..a1bd8044 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -211,9 +211,6 @@ def apply(self) -> None: def unapply(self) -> None: self.candidate_urls.update(generated_title="") - def resolve(self) -> None: - print(f"Resolving title pattern {self.id}") - class Meta: """Meta definition for TitlePattern.""" From 42b94fb3ec9888042f5f728f3ef25813cc05a9ef Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 14:08:34 -0500 Subject: [PATCH 063/111] remove unused url --- sde_collections/urls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 9877b421..98a1df06 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -56,6 +56,5 @@ view=views.CandidateURLAPIView.as_view(), name="candidate-url-api", ), - path("resolved-titles/", view=views.ResolvedTitleListView.as_view(), name="resolved-titles"), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), ] From fccc32d514f9b7deb058cbb52fbd6e155286707e Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 14:09:50 -0500 Subject: [PATCH 064/111] remove unused titles template --- sde_collections/views.py | 2 - .../sde_collections/resolved_titles_list.html | 50 ------------------- 2 files changed, 52 deletions(-) delete mode 100644 sde_indexing_helper/templates/sde_collections/resolved_titles_list.html diff --git a/sde_collections/views.py b/sde_collections/views.py index eea638e4..b3ec4102 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -467,13 +467,11 @@ def get_context_data(self, **kwargs): class ResolvedTitleListView(ListView): model = ResolvedTitle context_object_name = "resolved_titles" - template_name = "candidate_url_list.html" class ResolvedTitleErrorListView(ListView): model = ResolvedTitleError context_object_name = "resolved_title_errors" - template_name = "candidate_url_list.html" class TitlesAndErrorsView(View): diff --git a/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html b/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html deleted file mode 100644 index 69ed605b..00000000 --- a/sde_indexing_helper/templates/sde_collections/resolved_titles_list.html +++ /dev/null @@ -1,50 +0,0 @@ -{% extends "layouts/base.html" %} -{% load static %} -{% load i18n %} - -{% block title %}Resolved Titles{% endblock %} -{% block stylesheets %} - {{ block.super }} - -{% endblock %} - -{% block content %} -
-

Resolved Titles

- - - - - - - - - - - - {% for resolved_title in resolved_titles %} - - - - - - - - {% endfor %} - -
URLResolved TitleStatusResolved DateHTTP Status
{{ resolved_title.candidate_url.url }}{{ resolved_title.resolved_title }}{{ resolved_title.get_resolution_status_display }}{{ resolved_title.resolution_date_time|date:"Y-m-d H:i" }}{{ resolved_title.http_status_code }}
-
-{% endblock %} - -{% block javascripts %} - {{ block.super }} - - - -{% endblock %} From 211213807786fa7b857730c398122ed3ea5da188 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Thu, 23 May 2024 15:03:55 -0500 Subject: [PATCH 065/111] Add file and instructions on how to run locally --- .env_sample | 18 ++++++++++++++++++ README.md | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 .env_sample diff --git a/.env_sample b/.env_sample new file mode 100644 index 00000000..e370568f --- /dev/null +++ b/.env_sample @@ -0,0 +1,18 @@ +CELERY_BROKER_URL="" +CELERY_FLOWER_PASSWORD="" +CELERY_FLOWER_USER="" +DATABASE_URL='postgresql://:@localhost:5432/' +DJANGO_ACCOUNT_ALLOW_REGISTRATION=False +DJANGO_AWS_ACCESS_KEY_ID="" +DJANGO_AWS_SECRET_ACCESS_KEY="" +DJANGO_AWS_STORAGE_BUCKET_NAME="" +GITHUB_ACCESS_TOKEN="" +GITHUB_BRANCH_FOR_WEBAPP="" +IPYTHONDIR="" +REDIS_URL="" +SINEQUA_CONFIGS_GITHUB_REPO="" +SINEQUA_CONFIGS_REPO_DEV_BRANCH="" +SINEQUA_CONFIGS_REPO_MASTER_BRANCH="" +SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH="" +SLACK_WEBHOOK_URL="" +USE_DOCKER=no diff --git a/README.md b/README.md index 01efe654..ceabb8dd 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,46 @@ Moved to [settings](http://cookiecutter-django.readthedocs.io/en/latest/settings ## Basic Commands ### Building The Project + ```bash $ docker-compose -f local.yml build ``` ### Running The Necessary Containers + ```bash $ docker-compose -f local.yml up ``` +### Non-docker Local Setup + +If you want to run the project without docker, you will need the following: + +- Postgres + +Run the following commands: + +```` +$ psql postgres +postgres=# create database ; +postgres=# create user with password ''; +postgres=# grant all privileges on database to ; + +# This next one is optional, but it will allow the user to create databases for testing + +postgres=# alter role with superuser; +``` + +Now copy .env_sample in the root directory to .env. Note that in this setup we don't end up using the .envs/ directory, but instead we use the .env file. + +Replace the variables in this line in the .env file: `DATABASE_URL='postgresql://:@localhost:5432/'` with your user, password and database. Change the port if you have a different one. + +You don't need to change any other variable, unless you want to use specific modules (like the GitHub code will require a GitHub token etc). + +There is a section in `config/settings/base.py` which reads environment variables from this file. The line should look like `READ_DOT_ENV_FILE = env.bool("DJANGO_READ_DOT_ENV_FILE", default=True)`. Make sure either the default is True here (which it should already be), or run `export DJANGO_READ_DOT_ENV_FILE=True` in your terminal. + +Run `python manage.py runserver` to test if your setup worked. You might have to run an initial migration with `python manage.py migrate`. + ### Setting Up Your Users - To create a **normal user account**, just go to Sign Up and fill out the form. Once you submit it, you'll see a "Verify Your E-mail Address" page. Go to your console to see a simulated email verification message. Copy the link into your browser. Now the user's email should be verified and ready to go. @@ -144,7 +175,7 @@ To run a celery worker: ```bash cd sde_indexing_helper celery -A config.celery_app worker -l info -``` +```` Please note: For Celery's import magic to work, it is important _where_ the celery commands are run. If you are in the same folder with _manage.py_, you should be right. @@ -186,7 +217,6 @@ Run against the files : It's usually a good idea to run the hooks against all of the files when adding new hooks (usually `pre-commit` will only run on the chnages files during git hooks). - ### Sentry Sentry is an error logging aggregator service. You can sign up for a free account at or download and host it yourself. From b08ae34bdd9c14f7f5b7aa0e5f202fd425887272 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Thu, 23 May 2024 16:07:19 -0500 Subject: [PATCH 066/111] Don't allow a collection to be created without a URL --- .../migrations/0053_alter_collection_url.py | 18 ++++++++++++++++++ sde_collections/models/collection.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 sde_collections/migrations/0053_alter_collection_url.py diff --git a/sde_collections/migrations/0053_alter_collection_url.py b/sde_collections/migrations/0053_alter_collection_url.py new file mode 100644 index 00000000..3378e82a --- /dev/null +++ b/sde_collections/migrations/0053_alter_collection_url.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.9 on 2024-05-23 21:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0052_rename_resolution_date_time_resolvedtitle_created_at_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="collection", + name="url", + field=models.URLField(max_length=2048, verbose_name="URL"), + ), + ] diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index d06016ea..f4c8f422 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -35,7 +35,7 @@ class Collection(models.Model): name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) - url = models.URLField("URL", max_length=2048, blank=True) + url = models.URLField("URL", max_length=2048) division = models.IntegerField(choices=Divisions.choices) turned_on = models.BooleanField("Turned On", default=True) connector = models.IntegerField(choices=ConnectorChoices.choices, default=ConnectorChoices.CRAWLER2) From 8b85f842a50de8e0a09c2450638c662830d586be Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 16:45:34 -0500 Subject: [PATCH 067/111] add xpath mappings --- scripts/xpath_mappings.py | 73 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 scripts/xpath_mappings.py diff --git a/scripts/xpath_mappings.py b/scripts/xpath_mappings.py new file mode 100644 index 00000000..305de864 --- /dev/null +++ b/scripts/xpath_mappings.py @@ -0,0 +1,73 @@ +# flake8: noqa +xpath_mappings = { + 'Concat(xpath://*[@id="cpad"]/h2, xpath://*[@id="cpad"]/h3)': 'xpath://*[@id="cpad"]/h2 xpath://*[@id="cpad"]/h3', + 'Concat(xpath://*[@id="cpad"]/h2, doc.title)': 'xpath://*[@id="cpad"]/h2 {title}', + 'xpath://*[@id="cpad"]/h2': 'xpath://*[@id="cpad"]/h2', + 'xpath://*[@id="cpad"]/h3': 'xpath://*[@id="cpad"]/h3', + 'Concat("GCN ", xpath://*[@id="gcn-news-and-events"]/a)': 'GCN xpath://*[@id="gcn-news-and-events"]/a', + 'Concat("GCN", xpath://*[@id="super-kamioka-neutrino-detection-experiment-super-kamiokande"]/a)': 'GCN xpath://*[@id="super-kamioka-neutrino-detection-experiment-super-kamiokande"]/a', + 'concat("MAST - Missions and Data - ",xpath://*[@id="page-title"])': 'MAST - Missions and Data - xpath://*[@id="page-title"]', + 'concat("HEK Observation Details: ",xpath://*[@id="event-detail"]/div[1])': 'HEK Observation Details: xpath://*[@id="event-detail"]/div[1]', + 'concat("The Martian Meteorite Compendium ",xpath://*[@id="main_content_wrapper"]/h4/text())': 'The Martian Meteorite Compendium xpath://*[@id="main_content_wrapper"]/h4/text()', + 'concat("Antarctic Meteorite Sample Preparation - ",xpath://*[@id="main_content_wrapper"]/h4)': 'Antarctic Meteorite Sample Preparation - xpath://*[@id="main_content_wrapper"]/h4', + 'concat("My NASA Data: ",xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span)': 'My NASA Data: xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span', + 'concat("My NASA Data: Phenomenon - ",xpath:/html/body/div[1]/div/div[1]/div[2]/div/div[1]/div/section/div/div[1]/h1/text())': "My NASA Data: Phenomenon - xpath:/html/body/div[1]/div/div[1]/div[2]/div/div[1]/div/section/div/div[1]/h1/text()", + 'concat("My NASA Data: Mini Lessons - ",xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span)': 'My NASA Data: Mini Lessons - xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span', + 'concat("My NASA Data: Lesson Plans - ",xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span)': 'My NASA Data: Lesson Plans - xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span', + 'concat("My NASA Data: Interactive Models - ",xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span)': 'My NASA Data: Interactive Models - xpath://*[@id="block-mynasadata-theme-content"]/article/div/div[1]/h1/span', + 'concat("FIRMS Layer Information: ",xpath://*[@id="layerid"])': 'FIRMS Layer Information: xpath://*[@id="layerid"]', + "concat(“Artwork: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Artwork: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Calibration: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Calibration: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Canyons: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Canyons: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Craters: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Craters: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Dust Storms: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Dust Storms: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Martian Terrain: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Martian Terrain: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + "concat(“Sand Dunes: “, xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b)": "Sand Dunes: xpath:/html/body/div/table/tbody/tr/td/table[7]/tbody/tr/td[3]/table[2]/tbody/tr[4]/td/p[1]/b", + 'concat(“MER Mission: “, xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td)': 'MER Mission: xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td', + 'concat(“MER Spacecraft: “, xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td)': 'MER Spacecraft: xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td', + 'concat(“MER Spotlight: “, xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td)': 'MER Spotlight: xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td', + 'concat(“MER Videos: “, xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td)': 'MER Videos: xpath://*[@id="white-blur"]/table/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td', + 'concat(“Imagine Mars: “, xpath://*[@id="centeredcontent2"]/table[3]/tbody/tr/td/table/tbody/tr/td/table[1]/tbody/tr/td[2]/div)': 'Imagine Mars: xpath://*[@id="centeredcontent2"]/table[3]/tbody/tr/td/table/tbody/tr/td/table[1]/tbody/tr/td[2]/div', + 'concat(“Imagine Mars Webcasts: “, xpath://*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1])': 'Imagine Mars Webcasts: xpath://*[@id="centeredcontent2"]/table[4]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td/div/p[1]/text()[1]', + 'STEREO Learning Center - {xpath://*[@id="content"]/div/h3}': 'STEREO Learning Center - xpath://*[@id="content"]/div/h3', + '{xpath://*[@id="content"]/div/h1}': 'xpath://*[@id="content"]/div/h1', + "{xpath:/html/body/center[1]/font/h1/i}": "xpath:/html/body/center[1]/font/h1/i", + "{xpath:/html/body/div[2]/section[1]/div/div/h5/text()} - Images": "xpath:/html/body/div[2]/section[1]/div/div/h5/text() - Images", + "{xpath:/html/body/div[1]/div[2]/div/div[1]/div/div/div/div/div/h2/text()}": "xpath:/html/body/div[1]/div[2]/div/div[1]/div/div/div/div/div/h2/text()", + "{xpath:/html/body/div[2]/div[2]/div/div[1]/div/div/div/div[1]/div/h2/text()}": "xpath:/html/body/div[2]/div[2]/div/div[1]/div/div/div/div[1]/div/h2/text()", + "{xpath:/html/body/div[1]/div[2]/div/div[1]/div/div/div/div[1]/div/h2/text()}": "xpath:/html/body/div[1]/div[2]/div/div[1]/div/div/div/div[1]/div/h2/text()", + '{xpath://*[@id="ascl_body"]/div/h2}': 'xpath://*[@id="ascl_body"]/div/h2', + '{xpath://*[@id="rightcontent"]/h1} NASA - NSSDCA - Experiment - Details': 'xpath://*[@id="rightcontent"]/h1 NASA - NSSDCA - Experiment - Details', + '{xpath://*[@id="rightcontent"]/h1} NASA - NSSDCA - Spacecraft - Details': 'xpath://*[@id="rightcontent"]/h1 NASA - NSSDCA - Spacecraft - Details', + '{xpath://*[@id="rightcontent"]/h1} NASA - NSSDCA - Dataset - Details': 'xpath://*[@id="rightcontent"]/h1 NASA - NSSDCA - Dataset - Details', + '{xpath://*[@id="rightcontent"]/h1} NASA - NSSDCA - Publication - Details': 'xpath://*[@id="rightcontent"]/h1 NASA - NSSDCA - Publication - Details', + '{xpath://*[@id="contentwrapper"]/center/h2} - Abstract': 'xpath://*[@id="contentwrapper"]/center/h2 - Abstract', + '{xpath://*[@id="contentwrapper"]/center/h1} - Publications and Abstracts': 'xpath://*[@id="contentwrapper"]/center/h1 - Publications and Abstracts', + '{xpath://*[@id="page"]/section[3]/div/article/header/h2} - Blogs by Author': 'xpath://*[@id="page"]/section[3]/div/article/header/h2 - Blogs by Author', + "{xpath:/html/body/h2} - {xpath:/html/body/h4[2]}": "xpath:/html/body/h2} - xpath:/html/body/h4[2]", + "{title} - {xpath:/html/body/div/div/h2}": "{title} - xpath:/html/body/div/div/h2", + "{title} - {xpath:/html/body/h3[1]}": "{title} - xpath:/html/body/h3[1]", + '{title} - {xpath://*[@id="OneColumn"]/div[2]/table/tbody/tr/td/blockquote/h2}': '{title} - xpath://*[@id="OneColumn"]/div[2]/table/tbody/tr/td/blockquote/h2', + '{title} - {xpath://*[@id="content-wrapper"]/h1}': '{title} - xpath://*[@id="content-wrapper"]/h1', + "{xpath:/html/body/div/main/div[2]/section/div[2]/h1} | Astrobiology": "xpath:/html/body/div/main/div[2]/section/div[2]/h1 | Astrobiology", + "{xpath:/html/body/div/main/section/div[2]/h1} | The Classroom | Astrobiology": "xpath:/html/body/div/main/section/div[2]/h1 | The Classroom | Astrobiology", + "{xpath:/html/body/div/section[2]/div[1]/article/h1} | About FameLab - Finalist Bios": "xpath:/html/body/div/section[2]/div[1]/article/h1 | About FameLab - Finalist Bios", + "{xpath:/html/body/div/section[2]/div[2]/h1} | About FameLab - Videos": "xpath:/html/body/div/section[2]/div[2]/h1 | About FameLab - Videos", + '{xpath://*[@id="container-body"]/div[2]/div[2]/h2} - {xpath://*[@id="container-body"]/div[2]/div[2]/h4/span[1]/text() | NASA Astrobiology Institute': 'xpath://*[@id="container-body"]/div[2]/div[2]/h2} - {xpath://*[@id="container-body"]/div[2]/div[2]/h4/span[1]/text()} | NASA Astrobiology Institute', + '{xpath://*[@id="container-body"]/div[2]/div[2]/h3/text()} - Annual Report | NASA Astrobiology Institute': 'xpath://*[@id="container-body"]/div[2]/div[2]/h3/text() - Annual Report | NASA Astrobiology Institute', + '{xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h3} - Article | NASA Astrobiology Institute': 'xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h3 - Article | NASA Astrobiology Institute', + "All Things Electric and Magnetic - {xpath:/html/body/div[1]/center[1]/table/tbody/tr/td[2]/font/center/h1/i}": "All Things Electric and Magnetic - xpath:/html/body/div[1]/center[1]/table/tbody/tr/td[2]/font/center/h1/i", + 'Tutorial - {xpath://*[@id="Analyzing-interstellar-reddening-and-calculating-synthetic-photometry"]}': 'Tutorial - xpath://*[@id="Analyzing-interstellar-reddening-and-calculating-synthetic-photometry"]', + 'Health & Air Quality - {xpath://*[@id="block-views-block-hero-block-7"]/div/div/div[2]/div/div/div[2]/div/p}': 'Health & Air Quality - xpath://*[@id="block-views-block-hero-block-7"]/div/div/div[2]/div/div/div[2]/div/p', + 'News - {xpath://*[@id="left-column"]/h2} - {xpath://*[@id="left-column"]/p[1]}': 'News - {xpath://*[@id="left-column"]/h2} - xpath://*[@id="left-column"]/p[1]', + 'JWST {xpath://*[@id="stsci-content"]/div/div/h2} - {title}': 'JWST xpath://*[@id="stsci-content"]/div/div/h2 - {title}', + '{xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h2} | NASA Astrobiology Institute': 'xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h2 | NASA Astrobiology Institute', + 'Directory - {xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text()} | NASA Astrobiology Institute': 'Directory - xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text() | NASA Astrobiology Institute', + 'Conference and School Funding - {xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h3/text()} | NASA Astrobiology Institute': 'Conference and School Funding - xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h3/text() | NASA Astrobiology Institute', + 'Seminars - {xpath://*[@id="container-body"]/div[2]/div[2]/div/h2} | NASA Astrobiology Institute': 'Seminars - xpath://*[@id="container-body"]/div[2]/div[2]/div/h2 | NASA Astrobiology Institute', + 'Team Members - {xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text()} | NASA Astrobiology Institute': 'Team Members - xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text() | NASA Astrobiology Institute', + 'Teams - {xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text()} | NASA Astrobiology Institute': 'Teams - xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text() | NASA Astrobiology Institute', + '{xpath://*[@id="page"]/section[3]/div/article/header/h2}': 'xpath://*[@id="page"]/section[3]/div/article/header/h2', + '{xpath://*[@id="page"]/section[1]/div/header/h2}': 'xpath://*[@id="page"]/section[1]/div/header/h2', + '{xpath://*[@id="page"]/section[1]/div/header/h2} - News by Column': 'xpath://*[@id="page"]/section[1]/div/header/h2 - News by Column', +} From 5bd2178353173e5e2b633d031f1b01ab4594f282 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 16:47:22 -0500 Subject: [PATCH 068/111] fix xpath mapping --- scripts/xpath_mappings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/xpath_mappings.py b/scripts/xpath_mappings.py index 305de864..31a66496 100644 --- a/scripts/xpath_mappings.py +++ b/scripts/xpath_mappings.py @@ -59,7 +59,7 @@ "All Things Electric and Magnetic - {xpath:/html/body/div[1]/center[1]/table/tbody/tr/td[2]/font/center/h1/i}": "All Things Electric and Magnetic - xpath:/html/body/div[1]/center[1]/table/tbody/tr/td[2]/font/center/h1/i", 'Tutorial - {xpath://*[@id="Analyzing-interstellar-reddening-and-calculating-synthetic-photometry"]}': 'Tutorial - xpath://*[@id="Analyzing-interstellar-reddening-and-calculating-synthetic-photometry"]', 'Health & Air Quality - {xpath://*[@id="block-views-block-hero-block-7"]/div/div/div[2]/div/div/div[2]/div/p}': 'Health & Air Quality - xpath://*[@id="block-views-block-hero-block-7"]/div/div/div[2]/div/div/div[2]/div/p', - 'News - {xpath://*[@id="left-column"]/h2} - {xpath://*[@id="left-column"]/p[1]}': 'News - {xpath://*[@id="left-column"]/h2} - xpath://*[@id="left-column"]/p[1]', + 'News - {xpath://*[@id="left-column"]/h2} - {xpath://*[@id="left-column"]/p[1]}': 'News - xpath://*[@id="left-column"]/h2} - xpath://*[@id="left-column"]/p[1]', 'JWST {xpath://*[@id="stsci-content"]/div/div/h2} - {title}': 'JWST xpath://*[@id="stsci-content"]/div/div/h2 - {title}', '{xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h2} | NASA Astrobiology Institute': 'xpath://*[@id="container-body"]/div[2]/div[2]/ol/li/h2 | NASA Astrobiology Institute', 'Directory - {xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text()} | NASA Astrobiology Institute': 'Directory - xpath://*[@id="container-body"]/div[2]/div[2]/div[1]/h2/text() | NASA Astrobiology Institute', From be25d8a8a1acd88f0874374d0f5e3fc9d385d0fd Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 16:51:38 -0500 Subject: [PATCH 069/111] identify and map xpath patterns --- scripts/xpath_cleanup/find_xpath_patterns.py | 18 ++++++++++++++++++ scripts/{ => xpath_cleanup}/xpath_mappings.py | 0 2 files changed, 18 insertions(+) create mode 100644 scripts/xpath_cleanup/find_xpath_patterns.py rename scripts/{ => xpath_cleanup}/xpath_mappings.py (100%) diff --git a/scripts/xpath_cleanup/find_xpath_patterns.py b/scripts/xpath_cleanup/find_xpath_patterns.py new file mode 100644 index 00000000..0205a46a --- /dev/null +++ b/scripts/xpath_cleanup/find_xpath_patterns.py @@ -0,0 +1,18 @@ +# flake8: noqa +"""this script is used to find all the xpath patterns in the database, so that they can be mapped to new patterns in xpath_mappings.py""" + +from sde_collections.models.pattern import TitlePattern + +print( + "there are", TitlePattern.objects.filter(title_pattern__contains="xpath").count(), "xpath patterns in the database" +) + +# Get all the xpath patterns and their candidate urls +xpath_patterns = TitlePattern.objects.filter(title_pattern__contains="xpath") +for xpath_pattern in xpath_patterns: + print(xpath_pattern.title_pattern) + # for url in xpath_pattern.candidate_urls.all(): + # print(url.url) + print() + +# not every xpath pattern has a candidate url, but I went ahead and fixed all of them anyway diff --git a/scripts/xpath_mappings.py b/scripts/xpath_cleanup/xpath_mappings.py similarity index 100% rename from scripts/xpath_mappings.py rename to scripts/xpath_cleanup/xpath_mappings.py From fedee23f2633d05d6e39915721f8b466a70f1895 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 23 May 2024 17:27:26 -0500 Subject: [PATCH 070/111] handle case where xpath resolves to text not element --- sde_collections/utils/title_resolver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index 98036e76..b9171de3 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -70,7 +70,11 @@ def resolve_xpath(xpath: str, url: str) -> str: values = tree.xpath(xpath) if len(values) == 1: - text_content = values[0].text + if isinstance(values[0], str): + text_content = values[0] + else: + text_content = values[0].text + if text_content: text_content = clean_text(text_content) return text_content From 6ccffd5f4d4bfc807d63dc5447f0d4b5ec217aa4 Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Fri, 24 May 2024 09:16:46 -0400 Subject: [PATCH 071/111] quick save to switch branches --- .../templates/sde_collections/candidate_urls_list.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 20d4a631..6c458786 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -77,7 +77,7 @@

Document Type
ID
- +
- +
From 410130c8cacaf79605642d43f76afe09198ee685 Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Tue, 28 May 2024 13:30:13 -0400 Subject: [PATCH 072/111] quick save --- .../static/js/candidate_url_list.js | 34 +++++++++++++++++++ .../sde_collections/candidate_urls_list.html | 23 +++++++++---- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 52fb3858..600b260e 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -125,6 +125,29 @@ function initializeDataTable() { d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; }, }, + // initComplete: function (data) { + // const addDropdownSelect = [1, 4, 5]; + // const dict = { + // 1: "Images", + // 2: "Data", + // 3: "Documentation", + // 4: "Software and Tools", + // 5: "Missions and Instruments", + // 6: "Training and Education", + // }; + // this.api() + // .columns() + // .every(function (index) { + // let column = this; + // if (addDropdownSelect.includes(index)) { + // $("thead tr td select.dropdown-" + index).on("change", function () { + // var val = $.fn.dataTable.util.escapeRegex($(this).val()); + // column.search(val ? "^" + val + "$" : "", true, false).draw(); + // }); + // } + // }); + // }, + initComplete: function (data) { const addDropdownSelect = [1, 4, 5]; const dict = { @@ -144,6 +167,17 @@ function initializeDataTable() { var val = $.fn.dataTable.util.escapeRegex($(this).val()); column.search(val ? "^" + val + "$" : "", true, false).draw(); }); + // Add list of options + column + .data() + .unique() + .sort() + .each(function (d, j) { + let val = index === 5 ? dict[d] : d; + $("thead tr td select.dropdown-" + index).append( + '" + ); + }); } }); }, diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 6c458786..e3c2b4c7 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -77,7 +77,7 @@

Document Type
ID
-
+ + + + + + + + + + + +
@@ -122,7 +133,7 @@

Actions ID - +
@@ -164,7 +175,7 @@

ID - + From 7281a66040e4abaee0fe68ae484426e9b68e161d Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Tue, 28 May 2024 13:48:31 -0400 Subject: [PATCH 073/111] quick save --- .../static/js/candidate_url_list.js | 156 ++++++++++++++++++ .../sde_collections/candidate_urls_list.html | 39 ++++- 2 files changed, 192 insertions(+), 3 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 600b260e..8617a722 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -239,6 +239,29 @@ function initializeDataTable() { orderCellsTop: true, pageLength: 100, ajax: `/api/exclude-patterns/?format=datatables&collection_id=${collection_id}`, + // initComplete: function (data) { + // var table = $("#exclude_patterns_table").DataTable(); + + // this.api() + // .columns() + // .every(function (index) { + // let column = this; + // if (column.data().length === 0) { + // $("#exclude-patterns-dropdown-1").prop("disabled", true); + // } else if (index === 1) { + // $("#exclude-patterns-dropdown-1").on("change", function () { + // if ($(this).val() === "") table.columns(6).search("").draw(); + // else { + // table + // .column(6) + // .search(matchPatternTypeMap[$(this).val()]) + // .draw(); + // } + // }); + // } + // }); + // }, + initComplete: function (data) { var table = $("#exclude_patterns_table").DataTable(); @@ -258,6 +281,15 @@ function initializeDataTable() { .draw(); } }); + column + .data() + .unique() + .sort() + .each(function (d, j) { + $("#exclude-patterns-dropdown-1").append( + '" + ); + }); } }); }, @@ -317,6 +349,27 @@ function initializeDataTable() { orderCellsTop: true, serverSide: true, ajax: `/api/include-patterns/?format=datatables&collection_id=${collection_id}`, + // initComplete: function (data) { + // var table = $("#include_patterns_table").DataTable(); + // this.api() + // .columns() + // .every(function (index) { + // let column = this; + // if (column.data().length === 0) { + // $("#include-patterns-dropdown-1").prop("disabled", true); + // } else { + // if (index === 1) { + // $("#include-patterns-dropdown-1").on("change", function () { + // if ($(this).val() === "") table.columns(5).search("").draw(); + // table + // .column(5) + // .search(matchPatternTypeMap[$(this).val()]) + // .draw(); + // }); + // } + // } + // }); + // }, initComplete: function (data) { var table = $("#include_patterns_table").DataTable(); this.api() @@ -335,6 +388,16 @@ function initializeDataTable() { .draw(); }); } + column + .data() + .unique() + .sort() + .each(function (d, j) { + console.log("d", d); + $("#include-patterns-dropdown-1").append( + '" + ); + }); } }); }, @@ -386,6 +449,29 @@ function initializeDataTable() { pageLength: 100, orderCellsTop: true, ajax: `/api/title-patterns/?format=datatables&collection_id=${collection_id}`, + // initComplete: function (data) { + // var table = $("#title_patterns_table").DataTable(); + + // this.api() + // .columns() + // .every(function (index) { + // let column = this; + // if (column.data().length === 0) { + // $("#title-patterns-dropdown-1").prop("disabled", true); + // } else if (index === 1) { + // $("#title-patterns-dropdown-1").on("change", function () { + // if ($(this).val() === "") table.columns(6).search("").draw(); + // else { + // table + // .column(6) + // .search(matchPatternTypeMap[$(this).val()]) + // .draw(); + // } + // }); + // } + // }); + // }, + initComplete: function (data) { var table = $("#title_patterns_table").DataTable(); @@ -405,6 +491,15 @@ function initializeDataTable() { .draw(); } }); + column + .data() + .unique() + .sort() + .each(function (d, j) { + $("#title-patterns-dropdown-1").append( + '" + ); + }); } }); }, @@ -466,6 +561,57 @@ function initializeDataTable() { orderCellsTop: true, pageLength: 100, ajax: `/api/document-type-patterns/?format=datatables&collection_id=${collection_id}`, + // initComplete: function (data) { + // this.api() + // .columns() + // .every(function (index) { + // var table = $("#document_type_patterns_table").DataTable(); + + // let addDropdownSelect = { + // 1: { + // columnToSearch: 6, + // matchPattern: { + // "Individual URL Pattern": 1, + // "Multi-URL Pattern": 2, + // }, + // }, + // 2: { + // columnToSearch: 7, + // matchPattern: { + // Images: 1, + // Data: 2, + // Documentation: 3, + // "Software and Tools": 4, + // "Missions and Instruments": 5, + // "Training and Education": 6, + // }, + // }, + // }; + + // let column = this; + // if (column.data().length === 0) { + // $(`#document-type-patterns-dropdown-${index}`).prop( + // "disabled", + // true + // ); + // } else if (index in addDropdownSelect) { + // $("#document-type-patterns-dropdown-" + index).on( + // "change", + // function () { + // let col = addDropdownSelect[index].columnToSearch; + // let searchInput = + // addDropdownSelect[index].matchPattern[$(this).val()]; + // if ($(this).val() === "" || $(this).val() === undefined) + // table.columns(col).search("").draw(); + // else { + // table.columns(col).search(searchInput).draw(); + // } + // } + // ); + // } + // }); + // }, + initComplete: function (data) { this.api() .columns() @@ -513,6 +659,16 @@ function initializeDataTable() { } } ); + // Add list of options + column + .data() + .unique() + .sort() + .each(function (d, j) { + $("#document-type-patterns-dropdown-" + index).append( + '" + ); + }); } }); }, diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index e3c2b4c7..0504e29d 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -144,6 +144,14 @@

--> + + + + + + + +
@@ -185,6 +193,13 @@

--> + + + + + + + @@ -219,7 +234,7 @@

ID - + + + + + + + + @@ -263,7 +286,7 @@

ID - + + + + + + + + + + + From d83f7908d3d47d78cba2a29464d3600e737de81a Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Tue, 28 May 2024 13:57:14 -0400 Subject: [PATCH 074/111] quick save --- .../static/js/candidate_url_list.js | 190 ------------------ .../sde_collections/candidate_urls_list.html | 58 +----- 2 files changed, 7 insertions(+), 241 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 8617a722..52fb3858 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -125,29 +125,6 @@ function initializeDataTable() { d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; }, }, - // initComplete: function (data) { - // const addDropdownSelect = [1, 4, 5]; - // const dict = { - // 1: "Images", - // 2: "Data", - // 3: "Documentation", - // 4: "Software and Tools", - // 5: "Missions and Instruments", - // 6: "Training and Education", - // }; - // this.api() - // .columns() - // .every(function (index) { - // let column = this; - // if (addDropdownSelect.includes(index)) { - // $("thead tr td select.dropdown-" + index).on("change", function () { - // var val = $.fn.dataTable.util.escapeRegex($(this).val()); - // column.search(val ? "^" + val + "$" : "", true, false).draw(); - // }); - // } - // }); - // }, - initComplete: function (data) { const addDropdownSelect = [1, 4, 5]; const dict = { @@ -167,17 +144,6 @@ function initializeDataTable() { var val = $.fn.dataTable.util.escapeRegex($(this).val()); column.search(val ? "^" + val + "$" : "", true, false).draw(); }); - // Add list of options - column - .data() - .unique() - .sort() - .each(function (d, j) { - let val = index === 5 ? dict[d] : d; - $("thead tr td select.dropdown-" + index).append( - '" - ); - }); } }); }, @@ -239,29 +205,6 @@ function initializeDataTable() { orderCellsTop: true, pageLength: 100, ajax: `/api/exclude-patterns/?format=datatables&collection_id=${collection_id}`, - // initComplete: function (data) { - // var table = $("#exclude_patterns_table").DataTable(); - - // this.api() - // .columns() - // .every(function (index) { - // let column = this; - // if (column.data().length === 0) { - // $("#exclude-patterns-dropdown-1").prop("disabled", true); - // } else if (index === 1) { - // $("#exclude-patterns-dropdown-1").on("change", function () { - // if ($(this).val() === "") table.columns(6).search("").draw(); - // else { - // table - // .column(6) - // .search(matchPatternTypeMap[$(this).val()]) - // .draw(); - // } - // }); - // } - // }); - // }, - initComplete: function (data) { var table = $("#exclude_patterns_table").DataTable(); @@ -281,15 +224,6 @@ function initializeDataTable() { .draw(); } }); - column - .data() - .unique() - .sort() - .each(function (d, j) { - $("#exclude-patterns-dropdown-1").append( - '" - ); - }); } }); }, @@ -349,27 +283,6 @@ function initializeDataTable() { orderCellsTop: true, serverSide: true, ajax: `/api/include-patterns/?format=datatables&collection_id=${collection_id}`, - // initComplete: function (data) { - // var table = $("#include_patterns_table").DataTable(); - // this.api() - // .columns() - // .every(function (index) { - // let column = this; - // if (column.data().length === 0) { - // $("#include-patterns-dropdown-1").prop("disabled", true); - // } else { - // if (index === 1) { - // $("#include-patterns-dropdown-1").on("change", function () { - // if ($(this).val() === "") table.columns(5).search("").draw(); - // table - // .column(5) - // .search(matchPatternTypeMap[$(this).val()]) - // .draw(); - // }); - // } - // } - // }); - // }, initComplete: function (data) { var table = $("#include_patterns_table").DataTable(); this.api() @@ -388,16 +301,6 @@ function initializeDataTable() { .draw(); }); } - column - .data() - .unique() - .sort() - .each(function (d, j) { - console.log("d", d); - $("#include-patterns-dropdown-1").append( - '" - ); - }); } }); }, @@ -449,29 +352,6 @@ function initializeDataTable() { pageLength: 100, orderCellsTop: true, ajax: `/api/title-patterns/?format=datatables&collection_id=${collection_id}`, - // initComplete: function (data) { - // var table = $("#title_patterns_table").DataTable(); - - // this.api() - // .columns() - // .every(function (index) { - // let column = this; - // if (column.data().length === 0) { - // $("#title-patterns-dropdown-1").prop("disabled", true); - // } else if (index === 1) { - // $("#title-patterns-dropdown-1").on("change", function () { - // if ($(this).val() === "") table.columns(6).search("").draw(); - // else { - // table - // .column(6) - // .search(matchPatternTypeMap[$(this).val()]) - // .draw(); - // } - // }); - // } - // }); - // }, - initComplete: function (data) { var table = $("#title_patterns_table").DataTable(); @@ -491,15 +371,6 @@ function initializeDataTable() { .draw(); } }); - column - .data() - .unique() - .sort() - .each(function (d, j) { - $("#title-patterns-dropdown-1").append( - '" - ); - }); } }); }, @@ -561,57 +432,6 @@ function initializeDataTable() { orderCellsTop: true, pageLength: 100, ajax: `/api/document-type-patterns/?format=datatables&collection_id=${collection_id}`, - // initComplete: function (data) { - // this.api() - // .columns() - // .every(function (index) { - // var table = $("#document_type_patterns_table").DataTable(); - - // let addDropdownSelect = { - // 1: { - // columnToSearch: 6, - // matchPattern: { - // "Individual URL Pattern": 1, - // "Multi-URL Pattern": 2, - // }, - // }, - // 2: { - // columnToSearch: 7, - // matchPattern: { - // Images: 1, - // Data: 2, - // Documentation: 3, - // "Software and Tools": 4, - // "Missions and Instruments": 5, - // "Training and Education": 6, - // }, - // }, - // }; - - // let column = this; - // if (column.data().length === 0) { - // $(`#document-type-patterns-dropdown-${index}`).prop( - // "disabled", - // true - // ); - // } else if (index in addDropdownSelect) { - // $("#document-type-patterns-dropdown-" + index).on( - // "change", - // function () { - // let col = addDropdownSelect[index].columnToSearch; - // let searchInput = - // addDropdownSelect[index].matchPattern[$(this).val()]; - // if ($(this).val() === "" || $(this).val() === undefined) - // table.columns(col).search("").draw(); - // else { - // table.columns(col).search(searchInput).draw(); - // } - // } - // ); - // } - // }); - // }, - initComplete: function (data) { this.api() .columns() @@ -659,16 +479,6 @@ function initializeDataTable() { } } ); - // Add list of options - column - .data() - .unique() - .sort() - .each(function (d, j) { - $("#document-type-patterns-dropdown-" + index).append( - '" - ); - }); } }); }, diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 0504e29d..6c458786 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -77,7 +77,7 @@

Document Type
ID
- - - - - - - - - - - - + @@ -133,7 +122,7 @@

Actions ID - - - - - - - - @@ -183,7 +164,7 @@

ID - - - - - - - @@ -234,7 +208,7 @@

ID - - - - - - - - @@ -286,7 +252,7 @@

ID - - - - - - - - - - - + From 3efb56e9a54d40f743953f1822d1c0e9d7db466f Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Tue, 28 May 2024 15:32:59 -0400 Subject: [PATCH 075/111] wip --- .../templates/sde_collections/candidate_urls_list.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 6c458786..20d4a631 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -77,7 +77,7 @@

Document Type
ID
-
+ -
+ From f6d7ebe6ce6f0cf37091a59cace5d3ef4f47103b Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Tue, 28 May 2024 20:12:36 -0400 Subject: [PATCH 076/111] ignoring columns when clicking on csv button --- sde_indexing_helper/static/js/candidate_url_list.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 52fb3858..d9a5af94 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -93,8 +93,12 @@ function initializeDataTable() { pagingType: "input", dom: "lBritip", buttons: [ - "spacer", - "csv", + { + extend: "csv", + exportOptions: { + columns: [0, 1, 2, 3, 4, 5], + }, + }, "spacer", { text: "Customize Columns", From 4c915cd31a094cc0d3f1d2ac35acfcaa12a5362a Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Wed, 29 May 2024 10:29:16 -0400 Subject: [PATCH 077/111] qip --- sde_indexing_helper/static/js/candidate_url_list.js | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index d9a5af94..afeaec63 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -98,6 +98,18 @@ function initializeDataTable() { exportOptions: { columns: [0, 1, 2, 3, 4, 5], }, + customize: function (csv) { + console.log("CSV", csv.split("\n")[1].split('","')); + console.log("CSV", csv.split("\n")[2].split('","')); + // // Customization logic for the CSV + // // e.g., Add a header row, modify content, etc. + // var csvRows = csv.split("\n"); + // // Add a custom header + // csvRows.unshift( + // "Custom Header 1, Custom Header 2, Custom Header 3, Custom Header 4, Custom Header 5, Custom Header 6" + // ); + // return csvRows.join("\n"); + }, }, "spacer", { From 83ed7ba21f1943170765c3c17301f8fbe67934fb Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Wed, 29 May 2024 15:03:57 -0400 Subject: [PATCH 078/111] columns now populating properly in csv --- .../static/js/candidate_url_list.js | 56 ++++++++++++------- .../sde_collections/candidate_urls_list.html | 10 ++++ 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index afeaec63..76f79ef6 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -13,6 +13,14 @@ var matchPatternTypeMap = { "Multi-URL Pattern": 2, }; var uniqueId; //used for logic related to contents on column customization modal +const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + 6: "Training and Education", +}; //fix table allignment when changing around tabs $('a[data-toggle="tab"]').on("shown.bs.tab", function (e) { @@ -96,19 +104,29 @@ function initializeDataTable() { { extend: "csv", exportOptions: { - columns: [0, 1, 2, 3, 4, 5], + columns: [0, 1, 2, 3, 4, 11], }, customize: function (csv) { - console.log("CSV", csv.split("\n")[1].split('","')); - console.log("CSV", csv.split("\n")[2].split('","')); - // // Customization logic for the CSV - // // e.g., Add a header row, modify content, etc. - // var csvRows = csv.split("\n"); - // // Add a custom header - // csvRows.unshift( - // "Custom Header 1, Custom Header 2, Custom Header 3, Custom Header 4, Custom Header 5, Custom Header 6" - // ); - // return csvRows.join("\n"); + var lines = csv.split("\n"); + console.log("lines[1].split(", ")", lines[1].split(",")); + const colInfo = { + 0: $("#candidateUrlFilter").val() || "No input", + 1: $(".dropdown-1").val() || "SELECT", + 2: $("#candidateScrapedTitleFilter").val() || "No input", + 3: $("#candidateNewTitleFilter").val() || "No input", + 4: $(".dropdown-4").val() || "SELECT", + 5: dict[$(".dropdown-5").val()] || "SELECT", + }; + if (lines.length > 2) { + var secondRow = lines[1].split(","); + // Modify the second row as needed + for (let key in colInfo) { + secondRow[key] = colInfo[key]; + } + + lines[1] = secondRow.join(","); + } + return lines.join("\n"); }, }, "spacer", @@ -143,14 +161,6 @@ function initializeDataTable() { }, initComplete: function (data) { const addDropdownSelect = [1, 4, 5]; - const dict = { - 1: "Images", - 2: "Data", - 3: "Documentation", - 4: "Software and Tools", - 5: "Missions and Instruments", - 6: "Training and Education", - }; this.api() .columns() .every(function (index) { @@ -176,6 +186,14 @@ function initializeDataTable() { { data: "match_pattern_type", visible: false, searchable: false }, { data: "candidate_urls_count", visible: false, searchable: false }, { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, ], createdRow: function (row, data, dataIndex) { if (data["excluded"]) { diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 20d4a631..bec3d2f2 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -76,6 +76,11 @@

Visited?
Document Type
ID
+ + + + +
Document Type
@@ -100,6 +105,11 @@

+ + + + + From 4ef76dbb897889d809a4e17a6f838611381ef3dc Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Wed, 29 May 2024 16:57:34 -0400 Subject: [PATCH 079/111] quick save --- .../static/js/candidate_url_list.js | 91 ++++++++++--------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index e44c31f1..68da9352 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -109,7 +109,7 @@ function initializeDataTable() { { extend: "csv", exportOptions: { - columns: [0, 1, 2, 3, 4, 11], + columns: [0, 1, 2, 3, 10], }, customize: function (csv) { var lines = csv.split("\n"); @@ -878,14 +878,16 @@ function postDocumentTypePatterns( success: function (data) { $("#candidate_urls_table").DataTable().ajax.reload(null, false); $("#document_type_patterns_table").DataTable().ajax.reload(null, false); - if(currentTab === ""){ //Only add a notification if we are on the first tab - newDocumentTypePatternsCount = newDocumentTypePatternsCount + 1; - $("#documentTypePatternsTab").html( - `Document Type Patterns ` + - newDocumentTypePatternsCount + " new" + - `` - ); - } + if (currentTab === "") { + //Only add a notification if we are on the first tab + newDocumentTypePatternsCount = newDocumentTypePatternsCount + 1; + $("#documentTypePatternsTab").html( + `Document Type Patterns ` + + newDocumentTypePatternsCount + + " new" + + `` + ); + } }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; @@ -899,15 +901,16 @@ function postExcludePatterns(match_pattern, match_pattern_type = 0, force) { toastr.error("Please highlight a pattern to exclude."); return; } - if(!force){ //If the user clicked the icon in the table, we make the change regardless - // if pattern exists in table already (unless another pattern overrules it) - var table = $("#exclude_patterns_table").DataTable(); - var itemIdColumnData = table.column(0).data().toArray(); - if (itemIdColumnData.includes(match_pattern)) { - toastr.success("Pattern already exists"); - return; + if (!force) { + //If the user clicked the icon in the table, we make the change regardless + // if pattern exists in table already (unless another pattern overrules it) + var table = $("#exclude_patterns_table").DataTable(); + var itemIdColumnData = table.column(0).data().toArray(); + if (itemIdColumnData.includes(match_pattern)) { + toastr.success("Pattern already exists"); + return; + } } -} $.ajax({ url: "/api/exclude-patterns/", @@ -921,14 +924,16 @@ function postExcludePatterns(match_pattern, match_pattern_type = 0, force) { success: function (data) { $("#candidate_urls_table").DataTable().ajax.reload(null, false); $("#exclude_patterns_table").DataTable().ajax.reload(null, false); - if(currentTab === ""){ //Only add a notification if we are on the first tab - newExcludePatternsCount = newExcludePatternsCount + 1; - $("#excludePatternsTab").html( - `Exclude Patterns ` + - newExcludePatternsCount + " new" + - `` - ); - } + if (currentTab === "") { + //Only add a notification if we are on the first tab + newExcludePatternsCount = newExcludePatternsCount + 1; + $("#excludePatternsTab").html( + `Exclude Patterns ` + + newExcludePatternsCount + + " new" + + `` + ); + } }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; @@ -963,14 +968,16 @@ function postIncludePatterns(match_pattern, match_pattern_type = 0) { success: function (data) { $("#candidate_urls_table").DataTable().ajax.reload(null, false); $("#include_patterns_table").DataTable().ajax.reload(null, false); - if(currentTab === ""){ //Only add a notification if we are on the first tab - newIncludePatternsCount = newIncludePatternsCount + 1; - $("#includePatternsTab").html( - `Include Patterns ` + - newIncludePatternsCount + " new" + - `` - ); - } + if (currentTab === "") { + //Only add a notification if we are on the first tab + newIncludePatternsCount = newIncludePatternsCount + 1; + $("#includePatternsTab").html( + `Include Patterns ` + + newIncludePatternsCount + + " new" + + `` + ); + } }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; @@ -1002,14 +1009,16 @@ function postTitlePatterns( success: function (data) { $("#candidate_urls_table").DataTable().ajax.reload(null, false); $("#title_patterns_table").DataTable().ajax.reload(null, false); - if(currentTab === ""){ //Only add a notification if we are on the first tab - newTitlePatternsCount = newTitlePatternsCount + 1; - $("#titlePatternsTab").html( - `Title Patterns ` + - newTitlePatternsCount + " new" + - `` - ); - } + if (currentTab === "") { + //Only add a notification if we are on the first tab + newTitlePatternsCount = newTitlePatternsCount + 1; + $("#titlePatternsTab").html( + `Title Patterns ` + + newTitlePatternsCount + + " new" + + `` + ); + } }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; From 33ff1bbc14dd8ce57a6817dae43eabf26512d749 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Thu, 30 May 2024 09:34:07 -0400 Subject: [PATCH 080/111] 79: comments now have correct styling --- sde_indexing_helper/static/css/collection_detail.css | 3 +-- .../templates/sde_collections/collection_detail.html | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sde_indexing_helper/static/css/collection_detail.css b/sde_indexing_helper/static/css/collection_detail.css index affcf945..79386ac0 100644 --- a/sde_indexing_helper/static/css/collection_detail.css +++ b/sde_indexing_helper/static/css/collection_detail.css @@ -19,7 +19,7 @@ margin-top: -3; } .comment { - background-color: #f8f9fa; + background-color: transparent; border: 1px solid #ddd; padding: 10px; margin-bottom: 10px; @@ -29,7 +29,6 @@ color: #007bff; } .comment span { - color: #6c6840; font-size: 0.9em; margin-bottom: 10px; } diff --git a/sde_indexing_helper/templates/sde_collections/collection_detail.html b/sde_indexing_helper/templates/sde_collections/collection_detail.html index 8b6d8d8f..e5d136f9 100644 --- a/sde_indexing_helper/templates/sde_collections/collection_detail.html +++ b/sde_indexing_helper/templates/sde_collections/collection_detail.html @@ -186,8 +186,8 @@

{{ colle {% for comment in comments %}
{{ comment.user.username }} - {{ comment.created_at|date:"M. d, Y, P" }} -

{{ comment.text }}

+ {{ comment.created_at|date:"M. d, Y, P" }} +

{{ comment.text }}

{% empty %}

No comments yet

From 1a7fd6b20646a617a634f623831b900782eb9614 Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Thu, 30 May 2024 09:59:41 -0400 Subject: [PATCH 081/111] fixed error from merge and added invisible column for exclude column --- .../static/js/candidate_url_list.js | 24 ++++++++++++++----- .../sde_collections/candidate_urls_list.html | 2 ++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 68da9352..a538df73 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -109,18 +109,17 @@ function initializeDataTable() { { extend: "csv", exportOptions: { - columns: [0, 1, 2, 3, 10], + columns: [0, 2, 3, 10, 11], }, customize: function (csv) { var lines = csv.split("\n"); console.log("lines[1].split(", ")", lines[1].split(",")); const colInfo = { 0: $("#candidateUrlFilter").val() || "No input", - 1: $(".dropdown-1").val() || "SELECT", - 2: $("#candidateScrapedTitleFilter").val() || "No input", - 3: $("#candidateNewTitleFilter").val() || "No input", - 4: $(".dropdown-4").val() || "SELECT", - 5: dict[$(".dropdown-5").val()] || "SELECT", + 1: $("#candidateScrapedTitleFilter").val() || "No input", + 2: $("#candidateNewTitleFilter").val() || "No input", + 3: dict[$(".dropdown-5").val()] || "No Selection", + 4: $(".dropdown-1").val() || "No Selection", }; if (lines.length > 2) { var secondRow = lines[1].split(","); @@ -199,6 +198,19 @@ function initializeDataTable() { }, visible: false, }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + + console.log("row.excluded", row.excluded); + return excludedDict[row.excluded]; + }, + visible: false, + }, ], createdRow: function (row, data, dataIndex) { if (data["excluded"]) { diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index 0043d520..0f2da7cf 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -83,6 +83,7 @@

Document Type
+
Exclude
@@ -107,6 +108,7 @@

+ From fead64bc89eab6a07df1801b6888f5f14eb5ce50 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 08:13:44 -0400 Subject: [PATCH 082/111] 79:wip on pagination --- .../static/css/candidate_url_list.css | 24 +++++++++++++++++++ .../static/js/candidate_url_list.js | 7 ++++-- .../sde_collections/candidate_urls_list.html | 4 +++- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index da6dabbc..96322f18 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -271,4 +271,28 @@ letter-spacing: -0.02em; div.dt-container div.dt-info { padding-top: 0; white-space: normal; +} + +.page-link{ + color:white !important; + border:0.5px solid !important; + margin-left:3px; + margin-right:3px; +} +.page-link:hover{ + background-color: #0066CA !important; + +} + +.page-item.disabled .page-link { + color:grey!important; +} +.dt-paging-input{ + color:white; +} + +.dt-paging-input input{ + background-color: #3F4A58; + color: white; + border:solid 0.5px !important; } \ No newline at end of file diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 8d5f1e17..4064f27e 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -93,10 +93,13 @@ function initializeDataTable() { ], pageLength: 100, stateSave: true, + layout: { + bottomEnd: 'inputPaging', + topEnd: 'inputPaging', + }, serverSide: true, orderCellsTop: true, - pagingType: "input", - dom: "ilBrtip", + // dom: "ilBrtip", buttons: [ "spacer", "csv", diff --git a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html index ef84b8b2..722c5f92 100644 --- a/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html +++ b/sde_indexing_helper/templates/sde_collections/candidate_urls_list.html @@ -7,6 +7,7 @@ {% block stylesheets %} {{ block.super }} + {% endblock stylesheets %} @@ -17,7 +18,8 @@ - + + {% endblock javascripts %} From 698f123dcdfeff7d38d5a0a22b2e31b77b442b58 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 12:25:20 -0400 Subject: [PATCH 083/111] 79: adjusting pagination styling --- .../static/css/candidate_url_list.css | 13 +++++++++++++ sde_indexing_helper/static/css/project.css | 2 +- sde_indexing_helper/static/js/candidate_url_list.js | 1 - 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index 96322f18..f5725690 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -295,4 +295,17 @@ letter-spacing: -0.02em; background-color: #3F4A58; color: white; border:solid 0.5px !important; +} + +.dt-inputpaging{ + float:right; +} + +.mr-auto{ + float: left; + width: 50%; +} + +.ml-auto{ + width:50%; } \ No newline at end of file diff --git a/sde_indexing_helper/static/css/project.css b/sde_indexing_helper/static/css/project.css index 9864209e..885c63eb 100644 --- a/sde_indexing_helper/static/css/project.css +++ b/sde_indexing_helper/static/css/project.css @@ -50,7 +50,7 @@ #candidate_urls_table_wrapper div.dt-info:first-of-type { display: inline-block; - width: 25%; + /* width: 25%; */ } #candidate_urls_table_wrapper div.dt-length, #exclude_patterns_table_wrapper div.dt-length, #include_patterns_table_wrapper div.dt-length, #document_type_patterns_table_wrapper div.dt-length, #title_patterns_table_wrapper div.dt-length{ diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 4064f27e..3fac0197 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -95,7 +95,6 @@ function initializeDataTable() { stateSave: true, layout: { bottomEnd: 'inputPaging', - topEnd: 'inputPaging', }, serverSide: true, orderCellsTop: true, From 378f7aea1ae6aef81bafacd9ee912df6a8d52a99 Mon Sep 17 00:00:00 2001 From: Michelle <88682822+emshahh@users.noreply.github.com> Date: Fri, 31 May 2024 12:46:53 -0400 Subject: [PATCH 084/111] applying feedbacK --- .../static/js/candidate_url_list.js | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index f957c466..34f49026 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -69,9 +69,7 @@ function modalContents(tableName) { .attr("for", "checkbox_" + columnName.replace(/\s+/g, "_")) .text(columnName); var $caption = $("

") - .text( - candidateTableHeaderDefinitons[columnName] - ) + .text(candidateTableHeaderDefinitons[columnName]) .attr({ id: "caption", }); @@ -99,6 +97,7 @@ function initializeDataTable() { ["Show 25", "Show 50", "Show 100", "Show 500"], ], pageLength: 100, + colReorder: true, stateSave: true, serverSide: true, orderCellsTop: true, @@ -108,28 +107,42 @@ function initializeDataTable() { { extend: "csv", exportOptions: { - columns: [0, 2, 3, 10, 11], + columns: [0, 11, 2, 3, 10], }, customize: function (csv) { var lines = csv.split("\n"); - console.log("lines[1].split(", ")", lines[1].split(",")); - const colInfo = { - 0: $("#candidateUrlFilter").val() || "No input", - 1: $("#candidateScrapedTitleFilter").val() || "No input", - 2: $("#candidateNewTitleFilter").val() || "No input", - 3: dict[$(".dropdown-5").val()] || "No Selection", - 4: $(".dropdown-1").val() || "No Selection", - }; + // Reorder the header columns + var headers = lines[0].split(","); + var reorderedHeaders = [ + headers[0], + headers[4], + headers[1], + headers[2], + headers[3], + ]; + lines[0] = reorderedHeaders.join(","); + + // Add filter information in the footer + const secondRowFilters = [ + "Applied filters:", + `URL: ${$("#candidateUrlFilter").val() || "No input"}`, + `Exclude: ${$(".dropdown-1").val() || "No selection"}`, + `Scraped Title: ${ + $("#candidateNewTitleFilter").val() || "No input" + }`, + `New Title: ${dict[$(".dropdown-5").val()] || "No input"}`, + `Document Type: ${ + $("#candidateScrapedTitleFilter").val() || "No selection" + }`, + ]; + var appliedFiltersInfo = secondRowFilters.join("\n"); + + // Remove the second row with the filters if (lines.length > 2) { - var secondRow = lines[1].split(","); - // Modify the second row as needed - for (let key in colInfo) { - secondRow[key] = colInfo[key]; - } - - lines[1] = secondRow.join(","); + lines.splice(1, 1); } - return lines.join("\n"); + + return lines.join("\n") + appliedFiltersInfo; }, }, "spacer", @@ -204,8 +217,6 @@ function initializeDataTable() { true: "Yes", false: "No", }; - - console.log("row.excluded", row.excluded); return excludedDict[row.excluded]; }, visible: false, From 75cfaa7ef7576627991423646ecca35ee1f2245c Mon Sep 17 00:00:00 2001 From: 635487 <635487@bah.com> Date: Fri, 31 May 2024 09:54:57 -0700 Subject: [PATCH 085/111] 97-url-link: move link to right, change color --- .../static/css/candidate_url_list.css | 12 +++++++++++- sde_indexing_helper/static/js/candidate_url_list.js | 9 +++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index 7dc4a32f..036ad516 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -298,4 +298,14 @@ div.dt-buttons .btn.processing:after { .headerDiv{ display: flex; justify-content: space-between; -} \ No newline at end of file +} + +.url-cell { + display:flex; + align-items: center; + justify-content: space-between; + } + + .url-icon { + color: #65B1EF; + } \ No newline at end of file diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 23e151ee..d4d2cbaa 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -608,11 +608,12 @@ function getURLColumn() { return { data: "url", render: function (data, type, row) { - return ` open_in_new ${remove_protocol( + return `

${remove_protocol( data - )}`; + )} + open_in_new
`; }, }; } From 4a9945714baf07598edfc73568ddb19bb1b8a000 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 13:47:21 -0400 Subject: [PATCH 086/111] 79: pageLength working again --- .../static/js/candidate_url_list.js | 55 +++++++++++++------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 3fac0197..8ccab9ad 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -87,30 +87,53 @@ function initializeDataTable() { var candidate_urls_table = $("#candidate_urls_table").DataTable({ // scrollY: true, - lengthMenu: [ - [25, 50, 100, 500], - ["Show 25", "Show 50", "Show 100", "Show 500"], - ], + // lengthMenu: [ + // [25, 50, 100, 500], + // ["Show 25", "Show 50", "Show 100", "Show 500"], + // ], pageLength: 100, stateSave: true, layout: { bottomEnd: 'inputPaging', + topEnd: null, + topStart: { + // lengthMenu: [ + // [25, 50, 100, 500], + // ["Show 25", "Show 50", "Show 100", "Show 500"], + // ], + pageLength: { + menu: [[25, 50, 100, 500],["Show 25", "Show 50", "Show 100", "Show 500"]] + }, + buttons: [ + // "pageLength", + "spacer", + "csv", + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#candidate_urls_table"); + }, + }, + ], + } }, serverSide: true, orderCellsTop: true, // dom: "ilBrtip", - buttons: [ - "spacer", - "csv", - "spacer", - { - text: "Customize Columns", - className: "customizeColumns", - action: function () { - modalContents("#candidate_urls_table"); - }, - }, - ], + // buttons: [ + // "spacer", + // "csv", + // "spacer", + // { + // text: "Customize Columns", + // className: "customizeColumns", + // action: function () { + // modalContents("#candidate_urls_table"); + // }, + // }, + // ], select: { style: "os", selector: "td:nth-child(5)", From 61495890d6894e039705df14aeac2b5aa0ca6c00 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 13:55:25 -0400 Subject: [PATCH 087/111] 79: wip --- sde_indexing_helper/static/css/candidate_url_list.css | 4 ++-- sde_indexing_helper/static/js/candidate_url_list.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index f5725690..7e71e0a9 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -301,10 +301,10 @@ letter-spacing: -0.02em; float:right; } -.mr-auto{ +/* .mr-auto{ float: left; width: 50%; -} +} */ .ml-auto{ width:50%; diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 8ccab9ad..ce6d5a0f 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -97,6 +97,7 @@ function initializeDataTable() { bottomEnd: 'inputPaging', topEnd: null, topStart: { + info:true, // lengthMenu: [ // [25, 50, 100, 500], // ["Show 25", "Show 50", "Show 100", "Show 500"], @@ -105,7 +106,6 @@ function initializeDataTable() { menu: [[25, 50, 100, 500],["Show 25", "Show 50", "Show 100", "Show 500"]] }, buttons: [ - // "pageLength", "spacer", "csv", "spacer", From 61e4a69fcc5a7cbf75baa803f8138ab5b59d03c3 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 14:06:04 -0400 Subject: [PATCH 088/111] 79: styling complete --- .../static/css/candidate_url_list.css | 12 ++++++++++-- sde_indexing_helper/static/css/project.css | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index 7e71e0a9..2f189c64 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -298,7 +298,10 @@ letter-spacing: -0.02em; } .dt-inputpaging{ - float:right; + /* float:right; */ + position: absolute; + right: 16px; + top: -27px; } /* .mr-auto{ @@ -308,4 +311,9 @@ letter-spacing: -0.02em; .ml-auto{ width:50%; -} \ No newline at end of file +} + +.custom-select-sm{ + margin-left:5px; +} + diff --git a/sde_indexing_helper/static/css/project.css b/sde_indexing_helper/static/css/project.css index 885c63eb..9e0c26a9 100644 --- a/sde_indexing_helper/static/css/project.css +++ b/sde_indexing_helper/static/css/project.css @@ -59,7 +59,8 @@ } #candidate_urls_table_wrapper div.dt-buttons { - width: 64%; + /* width: 64%; */ + float:right; justify-content: end; } @@ -233,6 +234,7 @@ body { .dt-info{ font-weight:900; font-size:16px; + margin-top:15px; } .buttons-csv, .customizeColumns{ From d0b630fd592fe319cdcf07269ef0eb4b4fb80776 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 14:16:18 -0400 Subject: [PATCH 089/111] 79: removing commented out code --- .../static/css/candidate_url_list.css | 7 ------ .../static/js/candidate_url_list.js | 22 ------------------- 2 files changed, 29 deletions(-) diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/candidate_url_list.css index 2f189c64..10440ab2 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/candidate_url_list.css @@ -298,17 +298,10 @@ letter-spacing: -0.02em; } .dt-inputpaging{ - /* float:right; */ position: absolute; right: 16px; top: -27px; } - -/* .mr-auto{ - float: left; - width: 50%; -} */ - .ml-auto{ width:50%; } diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index ce6d5a0f..b0e31893 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -86,11 +86,6 @@ function initializeDataTable() { var false_icon = 'close'; var candidate_urls_table = $("#candidate_urls_table").DataTable({ - // scrollY: true, - // lengthMenu: [ - // [25, 50, 100, 500], - // ["Show 25", "Show 50", "Show 100", "Show 500"], - // ], pageLength: 100, stateSave: true, layout: { @@ -98,10 +93,6 @@ function initializeDataTable() { topEnd: null, topStart: { info:true, - // lengthMenu: [ - // [25, 50, 100, 500], - // ["Show 25", "Show 50", "Show 100", "Show 500"], - // ], pageLength: { menu: [[25, 50, 100, 500],["Show 25", "Show 50", "Show 100", "Show 500"]] }, @@ -121,19 +112,6 @@ function initializeDataTable() { }, serverSide: true, orderCellsTop: true, - // dom: "ilBrtip", - // buttons: [ - // "spacer", - // "csv", - // "spacer", - // { - // text: "Customize Columns", - // className: "customizeColumns", - // action: function () { - // modalContents("#candidate_urls_table"); - // }, - // }, - // ], select: { style: "os", selector: "td:nth-child(5)", From 7143394306d8e8b2cdc10dfce7ef3911f6eafc1d Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Fri, 31 May 2024 14:18:47 -0400 Subject: [PATCH 090/111] 79: merge issue --- sde_indexing_helper/static/js/candidate_url_list.js | 1 - 1 file changed, 1 deletion(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 8bcf183d..680bfe02 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -112,7 +112,6 @@ function initializeDataTable() { serverSide: true, orderCellsTop: true, pagingType: "input", - dom: "ilBrtip", buttons: [ "spacer", "csv", From 07456b1b6a79734551960a7ecc941de88edb16f4 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Fri, 31 May 2024 13:32:25 -0500 Subject: [PATCH 091/111] Resolve conflicting migration --- .../migrations/0054_merge_20240531_1332.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 sde_collections/migrations/0054_merge_20240531_1332.py diff --git a/sde_collections/migrations/0054_merge_20240531_1332.py b/sde_collections/migrations/0054_merge_20240531_1332.py new file mode 100644 index 00000000..1b54568a --- /dev/null +++ b/sde_collections/migrations/0054_merge_20240531_1332.py @@ -0,0 +1,13 @@ +# Generated by Django 4.2.9 on 2024-05-31 18:32 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0046_workflowhistory_old_status"), + ("sde_collections", "0053_alter_collection_url"), + ] + + operations = [] From e88b79234d2b70c309f2beab44cf6ed17370016a Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Fri, 31 May 2024 13:39:24 -0500 Subject: [PATCH 092/111] Fix button color KeyError --- sde_collections/models/collection.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 46a0a572..5f082a81 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -180,6 +180,7 @@ def workflow_status_button_color(self) -> str: 14: "btn-primary", 15: "btn-info", 16: "btn-secondary", + 17: "btn-light", } return color_choices[self.workflow_status] @@ -500,24 +501,20 @@ class Comments(models.Model): def __str__(self): return self.text + class WorkflowHistory(models.Model): - collection = models.ForeignKey( - Collection, on_delete=models.CASCADE, related_name="workflow_history", null=True - ) + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="workflow_history", null=True) workflow_status = models.IntegerField( choices=WorkflowStatusChoices.choices, default=WorkflowStatusChoices.RESEARCH_IN_PROGRESS, ) - old_status = models.IntegerField( - choices=WorkflowStatusChoices.choices, null=True - ) + old_status = models.IntegerField(choices=WorkflowStatusChoices.choices, null=True) curated_by = models.ForeignKey(User, on_delete=models.DO_NOTHING, null=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) - def __str__(self): - return (str(self.collection) + str(self.workflow_status)) - + return str(self.collection) + str(self.workflow_status) + @property def workflow_status_button_color(self) -> str: color_choices = { @@ -537,9 +534,11 @@ def workflow_status_button_color(self) -> str: 14: "btn-primary", 15: "btn-info", 16: "btn-secondary", + 17: "btn-light", } return color_choices[self.workflow_status] + @receiver(post_save, sender=Collection) def log_workflow_history(sender, instance, created, **kwargs): if instance.workflow_status != instance.old_workflow_status: @@ -547,13 +546,13 @@ def log_workflow_history(sender, instance, created, **kwargs): collection=instance, workflow_status=instance.workflow_status, curated_by=instance.curated_by, - old_status=instance.old_workflow_status + old_status=instance.old_workflow_status, ) @receiver(post_save, sender=Collection) def create_configs_on_status_change(sender, instance, created, **kwargs): - """ + """ Creates various config files on certain workflow status changes """ From 45714ffcadd1d754b0c29a2250f55d183f20d5e2 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Fri, 31 May 2024 13:39:39 -0500 Subject: [PATCH 093/111] Add missing migration --- ...ter_workflowhistory_old_status_and_more.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 sde_collections/migrations/0055_alter_workflowhistory_old_status_and_more.py diff --git a/sde_collections/migrations/0055_alter_workflowhistory_old_status_and_more.py b/sde_collections/migrations/0055_alter_workflowhistory_old_status_and_more.py new file mode 100644 index 00000000..8d098cfd --- /dev/null +++ b/sde_collections/migrations/0055_alter_workflowhistory_old_status_and_more.py @@ -0,0 +1,65 @@ +# Generated by Django 4.2.9 on 2024-05-31 18:33 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0054_merge_20240531_1332"), + ] + + operations = [ + migrations.AlterField( + model_name="workflowhistory", + name="old_status", + field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "Quality Check Failed"), + (13, "Ready for Public Production"), + (14, "Perfect and on Production"), + (15, "Low Priority Problems on Production"), + (16, "High Priority Problems on Production, only for old sources"), + (17, "Code Merge Pending"), + ], + null=True, + ), + ), + migrations.AlterField( + model_name="workflowhistory", + name="workflow_status", + field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "Quality Check Failed"), + (13, "Ready for Public Production"), + (14, "Perfect and on Production"), + (15, "Low Priority Problems on Production"), + (16, "High Priority Problems on Production, only for old sources"), + (17, "Code Merge Pending"), + ], + default=1, + ), + ), + ] From afa295c3ecc238dbac8c24250bc2390de1a0daab Mon Sep 17 00:00:00 2001 From: 635487 <635487@bah.com> Date: Fri, 31 May 2024 11:41:17 -0700 Subject: [PATCH 094/111] 102-title-default: add collection name as default --- .../templates/sde_collections/collection_detail.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_indexing_helper/templates/sde_collections/collection_detail.html b/sde_indexing_helper/templates/sde_collections/collection_detail.html index 0f534b10..18e2c1ae 100644 --- a/sde_indexing_helper/templates/sde_collections/collection_detail.html +++ b/sde_indexing_helper/templates/sde_collections/collection_detail.html @@ -285,7 +285,7 @@

From 0043a80d2a2942928c31cf096c80fcb90a739265 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Tue, 4 Jun 2024 12:13:50 -0400 Subject: [PATCH 103/111] 60: fixing header mismatch --- sde_indexing_helper/static/js/candidate_url_list.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 26947eae..bf419a39 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -115,10 +115,10 @@ function initializeDataTable() { var headers = lines[0].split(","); var reorderedHeaders = [ headers[0], + headers[3], headers[4], headers[1], - headers[2], - headers[3], + headers[2] ]; lines[0] = reorderedHeaders.join(","); From b0dc79176a013cbedeffcfad4a2bef7a14df7fd0 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Tue, 4 Jun 2024 12:50:16 -0400 Subject: [PATCH 104/111] 118: doc-type-fix --- sde_indexing_helper/static/js/candidate_url_list.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index c1bbfa90..fe1969ae 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -668,7 +668,7 @@ function getDocumentTypeColumn() { button_text = data ? dict[data] : "Select"; button_color = data ? "btn-success" : "btn-secondary"; return ` -
- + {% csrf_token %} {{ form|crispy }} From c568cebc49941259bfd8610fb3003890e327f009 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Wed, 5 Jun 2024 08:54:21 -0400 Subject: [PATCH 110/111] 88: adding the rest of the column descriptions --- sde_indexing_helper/static/js/project.js | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sde_indexing_helper/static/js/project.js b/sde_indexing_helper/static/js/project.js index 83b55278..620e301a 100644 --- a/sde_indexing_helper/static/js/project.js +++ b/sde_indexing_helper/static/js/project.js @@ -13,14 +13,15 @@ const candidateTableHeaderDefinitons = { - "URL": "A scraped URL.", - "Exclude": "Whether or not this URL is excluded from the collection.", - "Scraped Title": "Title scraped from the document.", - "New Title" : "New title set by a user.", - "Document Type": "{insert description here}", - "Match Pattern" : "Pattern that is used to match against URLs in the collection.", - "Match Pattern Type": "{Insert explanation here}", - "Reason": "{Insert explanation here}", - "Affected URLs": "The URLs that match the pattern.", - "Actions": "Delete a pattern." + "URL": "The web address of a specific webpage from a given source.", + "Exclude": "The action of omitting a certain URL(s) from being included in the final list of candidate URLs. This can be based on URL patterns or URL content.", + "Scraped Title": "The initial scraped title of the webpage generated from the webpage metadata.", + "New Title" : "A modified or updated title for a webpage set by the curator either through a manual or pattern change. The new title often improves readability and clarity.", + "Document Type": "The classification of the content found at the URL. This can be set as 'Documentation', 'Images', 'Software and Tools', 'Missions and Instruments', or 'Data'.", + "Match Pattern" : "A pattern set by the curator for which to exclude URLs, change URL titles, or assign URL document types. A match pattern could be a portion of the URL (e.g. URL extension) or a pattern that includes wild cards.", + "Match Pattern Type": "Indicates whether the Match Pattern applies to a single or multiple URLs.", + "Reason": "Indicates why the curator has excluded single or multiple URLs.", + "Affected URLs": "Indicates the number of URLs the given action, rule, or pattern has been applied to.", + "Actions": "Gives the curator the ability to delete a set title, document type, or exclude pattern.", + "Title Pattern": "A specific format given by the curator to make changes to original titles. This can include the use of xpaths or additions to the original title string." }; From b48fde2c9b19dcf912e520df2c434ead31197543 Mon Sep 17 00:00:00 2001 From: Kshaw362 Date: Wed, 5 Jun 2024 11:20:00 -0400 Subject: [PATCH 111/111] 97: removing the icon name --- sde_indexing_helper/static/js/candidate_url_list.js | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 57bba443..8999cfde 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -141,8 +141,13 @@ function initializeDataTable() { if (lines.length > 2) { lines.splice(1, 1); } - - return lines.join("\n") + appliedFiltersInfo; + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new",""); + alteredLines.push(newLine); + }) + return alteredLines.join("\n") + appliedFiltersInfo; }, }, "spacer",