From 835c43e721451a73627b87b5cb227c85472fed64 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 8 Nov 2024 04:41:40 -0600 Subject: [PATCH] Update fixes for #1071, add fix for #1085 --- .envs/.local/.django | 11 ++-- config/settings/base.py | 6 +- sde_collections/admin.py | 31 +++++---- .../0060_alter_candidateurl_scraped_text.py | 24 +++++++ sde_collections/models/candidate_url.py | 8 ++- sde_collections/sinequa_api.py | 65 ++++++++++++------- sde_collections/tasks.py | 59 +++++------------ 7 files changed, 118 insertions(+), 86 deletions(-) create mode 100644 sde_collections/migrations/0060_alter_candidateurl_scraped_text.py diff --git a/.envs/.local/.django b/.envs/.local/.django index 07e159fa..0978166d 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -33,8 +33,11 @@ SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch' # Slack Webhook # ------------------------------------------------------------------------------ SLACK_WEBHOOK_URL='' -LRM_USER='' -LRM_PASSWORD='' + +#Server Credentials +#-------------------------------------------------------------------------------- +LRM_DEV_USER='' +LRM_DEV_PASSWORD='' XLI_USER='' XLI_PASSWORD='' LRM_QA_USER='' @@ -42,5 +45,5 @@ LRM_QA_PASSWORD='' #Server Tokens #-------------------------------------------------------------------------------- -LRMDEV_TOKEN='' -LIS_TOKEN='' +LRM_DEV_TOKEN='' +XLI_TOKEN='' diff --git a/config/settings/base.py b/config/settings/base.py index 55c3e758..d5b111e2 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -343,7 +343,9 @@ SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL") XLI_USER = env("XLI_USER") XLI_PASSWORD = env("XLI_PASSWORD") -LRM_USER = env("LRM_USER") -LRM_PASSWORD = env("LRM_PASSWORD") +LRM_DEV_USER = env("LRM_DEV_USER") +LRM_DEV_PASSWORD = env("LRM_DEV_PASSWORD") LRM_QA_USER = env("LRM_QA_USER") LRM_QA_PASSWORD = env("LRM_QA_PASSWORD") +LRM_DEV_TOKEN=env("LRM_DEV_TOKEN") +XLI_TOKEN=env("XLI_TOKEN") diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 7b519a15..0357e819 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -9,19 +9,18 @@ from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api -@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") -def fetch_full_text_lrm_dev_action(modeladmin, request, queryset): +def fetch_and_update_text_for_server(modeladmin, request, queryset, server_name): for collection in queryset: - fetch_and_update_full_text.delay(collection.id, "LRM_DEV") - modeladmin.message_user(request, "Full text fetched and updated from LRM_DEV successfully.") + fetch_and_update_full_text.delay(collection.id, server_name) + modeladmin.message_user(request, f"Started importing URLs from {server_name.upper()} Server") +@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") +def fetch_full_text_lrm_dev_action(modeladmin, request, queryset): + fetch_and_update_text_for_server(modeladmin, request, queryset, "lrm_dev") -@admin.action(description="Import candidate URLs from Li's Server with Full Text") +@admin.action(description="Import candidate URLs from XLI Server with Full Text") def fetch_full_text_lis_action(modeladmin, request, queryset): - for collection in queryset: - fetch_and_update_full_text.delay(collection.id, "LIS") - modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.") - + fetch_and_update_text_for_server(modeladmin, request, queryset, "xli") @admin.action(description="Generate deployment message") def generate_deployment_message(modeladmin, request, queryset): @@ -123,7 +122,7 @@ def import_candidate_urls_from_api_caller(modeladmin, request, queryset, server_ messages.add_message( request, messages.INFO, - f"Started importing URLs from the API for: {collection_names} from {server_name.title()}", + f"Started importing URLs from {server_name.upper()} Server", ) @@ -147,19 +146,19 @@ def import_candidate_urls_secret_production(modeladmin, request, queryset): import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production") -@admin.action(description="Import candidate URLs from Li's Server") -def import_candidate_urls_lis_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lis_server") +@admin.action(description="Import candidate URLs from XLI Server") +def import_candidate_urls_xli_server(modeladmin, request, queryset): + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "xli") @admin.action(description="Import candidate URLs from LRM Dev Server") def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev_server") + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev") @admin.action(description="Import candidate URLs from LRM QA Server") def import_candidate_urls_lrm_qa_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa_server") + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa") class ExportCsvMixin: @@ -250,7 +249,7 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): import_candidate_urls_production, import_candidate_urls_secret_test, import_candidate_urls_secret_production, - import_candidate_urls_lis_server, + import_candidate_urls_xli_server, import_candidate_urls_lrm_dev_server, import_candidate_urls_lrm_qa_server, fetch_full_text_lrm_dev_action, diff --git a/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py b/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py new file mode 100644 index 00000000..12a0fb3c --- /dev/null +++ b/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.9 on 2024-11-07 17:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_scraped_text"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="scraped_text", + field=models.TextField( + blank=True, + default="", + help_text="This is the text scraped by Sinequa", + null=True, + verbose_name="Scraped Text", + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 936ea363..080e3c15 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -35,7 +35,13 @@ class CandidateURL(models.Model): blank=True, help_text="This is the original title scraped by Sinequa", ) - scraped_text = models.TextField(blank=True, null=True) + scraped_text = models.TextField( + "Scraped Text", + default="", + null=True, + blank=True, + help_text="This is the text scraped by Sinequa", + ) generated_title = models.CharField( "Generated Title", default="", diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 0e4c3b62..652f3317 100644 --- a/sde_collections/sinequa_api.py +++ b/sde_collections/sinequa_api.py @@ -1,5 +1,5 @@ from typing import Any - +import json import requests import urllib3 from django.conf import settings @@ -32,17 +32,17 @@ "query_name": "query-sde-primary", "base_url": "https://sciencediscoveryengine.nasa.gov", }, - "lis_server": { + "xli": { "app_name": "nasa-sba-smd", "query_name": "query-smd-primary", "base_url": "http://sde-xli.nasa-impact.net", }, - "lrm_dev_server": { + "lrm_dev": { "app_name": "sde-init-check", "query_name": "query-init-check", "base_url": "https://sde-lrm.nasa-impact.net", }, - "lrm_qa_server": { + "lrm_qa": { "app_name": "sde-init-check", "query_name": "query-init-check", "base_url": "https://sde-qa.nasa-impact.net", @@ -53,15 +53,13 @@ class Api: def __init__(self, server_name: str) -> None: self.server_name = server_name - self.app_name: str = server_configs[server_name]["app_name"] - self.query_name: str = server_configs[server_name]["query_name"] - self.base_url: str = server_configs[server_name]["base_url"] - self.xli_user = settings.XLI_USER - self.xli_password = settings.XLI_PASSWORD - self.lrm_user = settings.LRM_USER - self.lrm_password = settings.LRM_PASSWORD - self.lrm_qa_user = settings.LRM_QA_USER - self.lrm_qa_password = settings.LRM_QA_PASSWORD + config = server_configs[server_name] + self.app_name: str = config["app_name"] + self.query_name: str = config["query_name"] + self.base_url: str = config["base_url"] + self.user = getattr(settings, f"{server_name}_USER".upper(), None) + self.password = getattr(settings, f"{server_name}_PASSWORD".upper(), None) + self.token = getattr(settings, f"{server_name}_TOKEN".upper(), None) def process_response(self, url: str, payload: dict[str, Any]) -> Any: response = requests.post(url, headers={}, json=payload, verify=False) @@ -74,14 +72,7 @@ def process_response(self, url: str, payload: dict[str, Any]) -> Any: return meaningful_response def query(self, page: int, collection_config_folder: str = "") -> Any: - if self.server_name == "lis_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.xli_password}&User={self.xli_user}" - elif self.server_name == "lrm_dev_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_password}&User={self.lrm_user}" - elif self.server_name == "lrm_qa_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_qa_password}&User={self.lrm_qa_user}" - else: - url = f"{self.base_url}/api/v1/search.query" + url = f"{self.base_url}/api/v1/search.query?Password={self.password}&User={self.user}" payload = { "app": self.app_name, "query": { @@ -94,7 +85,7 @@ def query(self, page: int, collection_config_folder: str = "") -> Any: } if collection_config_folder: - if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]: + if self.server_name in ["xli", "lrm_dev", "lrm_qa"]: payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/" else: payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/" @@ -102,3 +93,33 @@ def query(self, page: int, collection_config_folder: str = "") -> Any: response = self.process_response(url, payload) return response + + def sql_query(self, sql: str) -> Any: + """Executes an SQL query on the configured server using token-based authentication.""" + if not self.token: + raise ValueError("You must have a token to use the SQL endpoint") + + url = f"{self.base_url}/api/v1/engine.sql" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.token}" + } + payload = json.dumps({ + "method": "engine.sql", + "sql": sql, + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default", + }) + try: + response = requests.post(url, headers=headers, data=payload, timeout=10) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {str(e)}") + + def get_full_texts(self, collection_config_folder: str) -> Any: + sql = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection_config_folder}/'" + return self.sql_query(sql) \ No newline at end of file diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index f505c942..65b3b158 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -146,37 +146,24 @@ def resolve_title_pattern(title_pattern_id): @celery_app.task -def fetch_and_update_full_text(collection_id, server_type): - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - server_config = get_server_config(server_type) - token = server_config["token"] - url = server_config["url"] - - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} - - payload = json.dumps( - { - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default", - } - ) - - try: - response = requests.post(url, headers=headers, data=payload, timeout=10) - response.raise_for_status() # Raise exception for HTTP errors - except requests.exceptions.RequestException as e: - raise Exception(f"API request failed: {str(e)}") - - records = response.json().get("Rows", []) +def fetch_and_update_full_text(collection_id, server_name): + """ + Task to fetch and update full text and metadata for all URLs associated with a specified collection + from a given server. + + Args: + collection_id (int): The identifier for the collection in the database. + server_name (str): The name of the server. + + Returns: + str: A message indicating the result of the operation, including the number of URLs processed + or a message if no records were found. + """ + collection = Collection.objects.get(id=collection_id) + api = Api(server_name) + full_texts = api.get_full_texts(collection.config_folder) + + records = full_texts.get("Rows", []) if not records: return "No records found in the response." @@ -188,14 +175,4 @@ def fetch_and_update_full_text(collection_id, server_type): CandidateURL.objects.update_or_create( url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title} ) - return f"Successfully processed {len(records)} records and updated the database." - - -def get_server_config(server_type): - if server_type == "LRM_DEV": - return {"url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LRMDEV_TOKEN")} - elif server_type == "LIS": - return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} - else: - raise ValueError("Invalid server type.")