Skip to content

Commit

Permalink
Merge branch 'dev' into 107-doc-type-filter-not-working
Browse files Browse the repository at this point in the history
  • Loading branch information
Kshaw362 committed Jun 5, 2024
2 parents 7013c83 + f3c38a0 commit be9413b
Show file tree
Hide file tree
Showing 53 changed files with 1,935 additions and 191 deletions.
18 changes: 18 additions & 0 deletions .env_sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
CELERY_BROKER_URL=""
CELERY_FLOWER_PASSWORD=""
CELERY_FLOWER_USER=""
DATABASE_URL='postgresql://<user>:<password>@localhost:5432/<database>'
DJANGO_ACCOUNT_ALLOW_REGISTRATION=False
DJANGO_AWS_ACCESS_KEY_ID=""
DJANGO_AWS_SECRET_ACCESS_KEY=""
DJANGO_AWS_STORAGE_BUCKET_NAME=""
GITHUB_ACCESS_TOKEN=""
GITHUB_BRANCH_FOR_WEBAPP=""
IPYTHONDIR=""
REDIS_URL=""
SINEQUA_CONFIGS_GITHUB_REPO=""
SINEQUA_CONFIGS_REPO_DEV_BRANCH=""
SINEQUA_CONFIGS_REPO_MASTER_BRANCH=""
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH=""
SLACK_WEBHOOK_URL=""
USE_DOCKER=no
4 changes: 4 additions & 0 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ SINEQUA_CONFIGS_GITHUB_REPO='NASA-IMPACT/sde-backend'
SINEQUA_CONFIGS_REPO_MASTER_BRANCH='master'
SINEQUA_CONFIGS_REPO_DEV_BRANCH='dev'
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'

# Slack Webhook
# ------------------------------------------------------------------------------
# NOTE(review): a live webhook token was committed on this line — revoke it in Slack and keep the real value out of git
SLACK_WEBHOOK_URL='https://hooks.slack.com/services/<redacted>'
34 changes: 32 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,46 @@ Moved to [settings](http://cookiecutter-django.readthedocs.io/en/latest/settings
## Basic Commands

### Building The Project

```bash
$ docker-compose -f local.yml build
```

### Running The Necessary Containers

```bash
$ docker-compose -f local.yml up
```

### Non-docker Local Setup

If you want to run the project without docker, you will need the following:

- Postgres

Run the following commands:

````
$ psql postgres
postgres=# create database <some database>;
postgres=# create user <some username> with password '<some password>';
postgres=# grant all privileges on database <some database> to <some username>;
# This next one is optional, but it will allow the user to create databases for testing
postgres=# alter role <some username> with superuser;
```
Now copy .env_sample in the root directory to .env. Note that in this setup we don't end up using the .envs/ directory, but instead we use the .env file.
Replace the variables in this line in the .env file: `DATABASE_URL='postgresql://<user>:<password>@localhost:5432/<database>'` with your user, password and database. Change the port if you have a different one.
You don't need to change any other variable, unless you want to use specific modules (like the GitHub code will require a GitHub token etc).
There is a section in `config/settings/base.py` which reads environment variables from this file. The line should look like `READ_DOT_ENV_FILE = env.bool("DJANGO_READ_DOT_ENV_FILE", default=True)`. Make sure either the default is True here (which it should already be), or run `export DJANGO_READ_DOT_ENV_FILE=True` in your terminal.
Run `python manage.py runserver` to test if your setup worked. You might have to run an initial migration with `python manage.py migrate`.
### Setting Up Your Users
- To create a **normal user account**, just go to Sign Up and fill out the form. Once you submit it, you'll see a "Verify Your E-mail Address" page. Go to your console to see a simulated email verification message. Copy the link into your browser. Now the user's email should be verified and ready to go.
Expand Down Expand Up @@ -144,7 +175,7 @@ To run a celery worker:
```bash
cd sde_indexing_helper
celery -A config.celery_app worker -l info
```
````

Please note: For Celery's import magic to work, it is important _where_ the celery commands are run. If you are in the same folder with _manage.py_, you should be right.

Expand Down Expand Up @@ -186,7 +217,6 @@ Run against the files :

It's usually a good idea to run the hooks against all of the files when adding new hooks (usually `pre-commit` will only run on the changed files during git hooks).

### Sentry

Sentry is an error logging aggregator service. You can sign up for a free account at <https://sentry.io/signup/?code=cookiecutter> or download and host it yourself.
Expand Down
1 change: 1 addition & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,4 @@
# Branch names used when pushing Sinequa config changes — presumably consumed
# by the GitHub integration; confirm against sde_collections' GitHub helpers.
SINEQUA_CONFIGS_REPO_MASTER_BRANCH = env("SINEQUA_CONFIGS_REPO_MASTER_BRANCH")
SINEQUA_CONFIGS_REPO_DEV_BRANCH = env("SINEQUA_CONFIGS_REPO_DEV_BRANCH")
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH = env("SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH")
# Slack incoming webhook URL (see the "Slack Webhook" section of the env files).
SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL")
9 changes: 4 additions & 5 deletions config/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from django.views import defaults as default_views

admin.site.site_header = (
"SDE Indexing Administration" # default: "Django Administration"
"SDE Indexing Helper Administration" # default: "Django Administration"
)
admin.site.index_title = "SDE Indexing" # default: "Site administration"
admin.site.site_title = "SDE Indexing" # default: "Django site admin"
admin.site.index_title = "SDE Indexing Helper" # default: "Site administration"
admin.site.site_title = "SDE Indexing Helper" # default: "Django site admin"

urlpatterns = [
path("", include("sde_collections.urls", namespace="sde_collections")),
Expand All @@ -18,8 +18,7 @@
# User management
path("users/", include("sde_indexing_helper.users.urls", namespace="users")),
path("accounts/", include("allauth.urls")),
path("api-auth/", include("rest_framework.urls"))
# Your stuff: custom urls includes go here
path("api-auth/", include("rest_framework.urls")),
] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)


Expand Down
24 changes: 24 additions & 0 deletions feedback/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from django.db import models
from django.utils import timezone

from sde_collections.utils.slack_utils import send_slack_message


class Feedback(models.Model):
name = models.CharField(max_length=150)
Expand All @@ -17,8 +19,30 @@ class Meta:
def save(self, *args, **kwargs):
    """
    Persist the feedback row and, for newly created rows, notify Slack.

    Fix: the notification is now sent only AFTER super().save() succeeds,
    so a failed INSERT can no longer produce a Slack message for a row
    that was never stored.  Notification failures are swallowed (printed)
    so a Slack outage never blocks feedback submission.
    """
    if not self.id:
        self.created_at = timezone.now()
    is_new = self._state.adding  # capture before save() marks the row as persisted
    super().save(*args, **kwargs)
    if is_new:
        try:
            send_slack_message(self.format_notification_message())
        except Exception as e:
            # best-effort delivery; consider switching print to logging
            print(f"Failed to send slack message: {e}")

def format_notification_message(self):
    """Return the Slack notification text describing this Feedback instance."""
    details = [
        "<!here> New Feedback Received : ",
        f"Name: {self.name}",
        f"Email: {self.email}",
        f"Subject: {self.subject}",
        f"Comments: {self.comments}",
        f"Source: {self.source}",
        f"Received on: {self.created_at.strftime('%Y-%m-%d %H:%M:%S')}",
    ]
    return "\n".join(details)


class ContentCurationRequest(models.Model):
name = models.CharField(max_length=150)
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[tool.black]
line-length = 120
2 changes: 2 additions & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ PyGithub==2.2.0
tqdm==4.66.1
xmltodict==0.13.0
django-cors-headers==4.3.1
unidecode==1.3.8
lxml==4.9.2
64 changes: 64 additions & 0 deletions scripts/bulk_create_sources_on_webapp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import Divisions

# Spreadsheet division label -> Divisions choice value.
# Labels not listed here cause the source to be skipped (see get_division_id).
DIVISION_MAPPING = {
    "Helio": Divisions.HELIOPHYSICS,
    "Astro": Divisions.ASTROPHYSICS,
    "PDS": Divisions.PLANETARY,
    "Earth": Divisions.EARTH_SCIENCE,
    "BPS": Divisions.BIOLOGY,
    "Multiple": Divisions.GENERAL,
}

# Template row: replace with real entries copied from the spreadsheet before running.
sources = [
    {
        "Name": "Source name",
        "Link": "Base link to the source",
        "Division": "Division of the source from the spread sheet",
        "Notes": "Any notes available from the spreadsheet",
    },
]


def get_division_id(division_name):
    """Map a spreadsheet division label to a Divisions value, or None if unknown."""
    return DIVISION_MAPPING.get(division_name.strip())


def create_collection(source):
    """
    Create one Collection from a spreadsheet row dict.

    Returns True when a new collection was created; False when the division
    is unrecognized, a collection with the same name or url already exists,
    or the save fails.
    """
    name, link = source["Name"], source["Link"]
    division_text, notes = source["Division"], source["Notes"]

    division_id = get_division_id(division_text)
    if division_id is None:
        print(f"No valid division found for '{division_text}'. Skipping creation for {name}.")
        return False

    try:
        if Collection.objects.filter(name=name).exists():
            print(f"Collection with name '{name}' already exists. Skipping.")
            return False
        if Collection.objects.filter(url=link).exists():
            print(f"Collection with link '{link}' already exists. Skipping.")
            return False
        Collection(name=name, url=link, division=division_id, notes=notes).save()
        print(f"Collection '{name}' created successfully.")
        return True
    except Exception as e:
        print(f"Failed to create collection '{name}': {e}")
        return False


def main():
    """Attempt to create every listed source and report how many were new."""
    created_count = sum(1 for source in sources if create_collection(source))
    print(f"Total new collections created: {created_count}")


if __name__ == "__main__":
    main()
11 changes: 11 additions & 0 deletions scripts/quality_and_indexing/add_perfect_to_prod_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""
adds collections marked as ready for public prod to the public query
after running this code, you will need to merge in the webapp branch
"""

from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices

for collection in Collection.objects.filter(workflow_status=WorkflowStatusChoices.READY_FOR_PUBLIC_PROD):
print(collection.config_folder)
collection.add_to_public_query()
66 changes: 66 additions & 0 deletions scripts/quality_and_indexing/change_statuses_on_webapp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
take emily's notes from slack and change the appropriate statuses in the webapp
"""

from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices

RESEARCH_IN_PROGRESS = 1, "Research in Progress"
READY_FOR_ENGINEERING = 2, "Ready for Engineering"
ENGINEERING_IN_PROGRESS = 3, "Engineering in Progress"
READY_FOR_CURATION = 4, "Ready for Curation"
CURATION_IN_PROGRESS = 5, "Curation in Progress"
CURATED = 6, "Curated"
QUALITY_FIXED = 7, "Quality Fixed"
SECRET_DEPLOYMENT_STARTED = 8, "Secret Deployment Started"
SECRET_DEPLOYMENT_FAILED = 9, "Secret Deployment Failed"
READY_FOR_LRM_QUALITY_CHECK = 10, "Ready for LRM Quality Check"
READY_FOR_FINAL_QUALITY_CHECK = 11, "Ready for Quality Check"
QUALITY_CHECK_FAILED = 12, "Quality Check Failed"
READY_FOR_PUBLIC_PROD = 13, "Ready for Public Production"
PERFECT_ON_PROD = 14, "Perfect and on Production"
LOW_PRIORITY_PROBLEMS_ON_PROD = 15, "Low Priority Problems on Production"
HIGH_PRIORITY_PROBLEMS_ON_PROD = 16, "High Priority Problems on Production, only for old sources"
MERGE_PENDING = 17, "Code Merge Pending"

perfect = [
# "WIND_Spacecraft",
# "gamma_ray_data_tools_core_package",
# "land_processes_distributed_active_archive_center",
# "mdscc_deep_space_network",
# "HelioAnalytics",
# "nasa_infrared_telescope_facility_irtf",
# "gmao_fluid",
# "starchild_a_learning_center_for_young_astronomers",
# "voyager_Cosmic_Ray_Subsystem",
"ldas_land_data_assimilatin_system",
"ppi_node",
]

low_priority = [
"nasa_applied_sciences",
"parker_solar_probe",
"virtual_wave_observatory",
"explorer_program_acquisition",
"lisa_consortium",
"astropy",
"fermi_at_gsfc",
"microobservatory_robotic_telescope_network",
]

for config in perfect:
print(config)
collection = Collection.objects.get(config_folder=config)
collection.workflow_status = WorkflowStatusChoices.PERFECT_ON_PROD
collection.save()

for config in low_priority:
print(config)
collection = Collection.objects.get(config_folder=config)
collection.workflow_status = WorkflowStatusChoices.LOW_PRIORITY_PROBLEMS_ON_PROD
collection.save()

# for config in perfect:
# collection = Collection.objects.get(config_folder=config)
# collection.workflow_status = WorkflowStatusChoices.PERFECT_ON_PROD
# collection.save()
60 changes: 60 additions & 0 deletions scripts/quality_and_indexing/find_missing_folders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""you run this in the shell on the server to find sources to index and find any that are missing plugin folders"""

import os

from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
from sde_collections.utils.github_helper import GitHubHandler


def get_sources_to_fix():
    """Collections whose quality issues were fixed and now need re-indexing."""
    return Collection.objects.filter(workflow_status=WorkflowStatusChoices.QUALITY_FIXED)


def get_sources_to_index():
    """Collections that have finished curation and are ready to be indexed."""
    return Collection.objects.filter(workflow_status=WorkflowStatusChoices.CURATED)


def get_all_relevant_sources():
    """Union of the quality-fixed and curated collections."""
    relevant_statuses = [WorkflowStatusChoices.QUALITY_FIXED, WorkflowStatusChoices.CURATED]
    return Collection.objects.filter(workflow_status__in=relevant_statuses)


def get_missing_folders(collections, base_directory):
    """Return the collections whose <base_directory>/<config_folder>/default.xml
    does not exist in the GitHub repo (i.e. the config folder is missing)."""
    handler = GitHubHandler()
    return [
        collection
        for collection in collections
        if not handler.check_file_exists(os.path.join(base_directory, collection.config_folder, "default.xml"))
    ]


def print_configs(queryset):
    """Print each collection's config_folder, then a 60-dash divider and a blank line."""
    for collection in queryset:
        print(collection.config_folder)
    print("-" * 60)
    print()


print("sources_to_fix")
sources_to_fix = get_sources_to_fix()
print_configs(sources_to_fix)


print("sources_to_index")
sources_to_index = get_sources_to_index()
print_configs(sources_to_index)


all_relevant_sources = get_all_relevant_sources()

print("missing_scraper_folders")
missing_folders = get_missing_folders(all_relevant_sources, "sources/scrapers/")
print_configs(missing_folders)


print("missing_plugin_folders")
missing_folders = get_missing_folders(all_relevant_sources, "sources/SDE/")
print_configs(missing_folders)
18 changes: 18 additions & 0 deletions scripts/xpath_cleanup/find_xpath_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# flake8: noqa
"""this script is used to find all the xpath patterns in the database, so that they can be mapped to new patterns in xpath_mappings.py"""

from sde_collections.models.pattern import TitlePattern

# All title patterns that contain an xpath expression.
xpath_patterns = TitlePattern.objects.filter(title_pattern__contains="xpath")

print("there are", xpath_patterns.count(), "xpath patterns in the database")

for xpath_pattern in xpath_patterns:
    print(xpath_pattern.title_pattern)
    # xpath_pattern.candidate_urls.all() lists the urls each pattern applies to,
    # if you need them for the mapping work.
    print()

# not every xpath pattern has a candidate url, but I went ahead and fixed all of them anyway
Loading

0 comments on commit be9413b

Please sign in to comment.