Skip to content

Commit

Permalink
Update fixes for #1071, add fix for #1085
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Nov 8, 2024
1 parent 47f164f commit 835c43e
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 86 deletions.
11 changes: 7 additions & 4 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'
# Slack Webhook
# ------------------------------------------------------------------------------
SLACK_WEBHOOK_URL=''
LRM_USER=''
LRM_PASSWORD=''

# Server Credentials
# ------------------------------------------------------------------------------
LRM_DEV_USER=''
LRM_DEV_PASSWORD=''
XLI_USER=''
XLI_PASSWORD=''
LRM_QA_USER=''
LRM_QA_PASSWORD=''

# Server Tokens
# ------------------------------------------------------------------------------
LRMDEV_TOKEN=''
LIS_TOKEN=''
LRM_DEV_TOKEN=''
XLI_TOKEN=''
6 changes: 4 additions & 2 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@
SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL")
XLI_USER = env("XLI_USER")
XLI_PASSWORD = env("XLI_PASSWORD")
LRM_USER = env("LRM_USER")
LRM_PASSWORD = env("LRM_PASSWORD")
LRM_DEV_USER = env("LRM_DEV_USER")
LRM_DEV_PASSWORD = env("LRM_DEV_PASSWORD")
LRM_QA_USER = env("LRM_QA_USER")
LRM_QA_PASSWORD = env("LRM_QA_PASSWORD")
# Per-server bearer tokens; sde_collections.sinequa_api.Api resolves them by
# name as "<SERVER_NAME>_TOKEN" (upper-cased) via getattr(settings, ...).
LRM_DEV_TOKEN = env("LRM_DEV_TOKEN")
XLI_TOKEN = env("XLI_TOKEN")
31 changes: 15 additions & 16 deletions sde_collections/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,18 @@
from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api


@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
def fetch_and_update_text_for_server(modeladmin, request, queryset, server_name):
for collection in queryset:
fetch_and_update_full_text.delay(collection.id, "LRM_DEV")
modeladmin.message_user(request, "Full text fetched and updated from LRM_DEV successfully.")
fetch_and_update_full_text.delay(collection.id, server_name)
modeladmin.message_user(request, f"Started importing URLs from {server_name.upper()} Server")

@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
    """Admin action that delegates the selected rows to the shared full-text helper.

    The work is handed to ``fetch_and_update_text_for_server`` with the server
    name fixed to ``"lrm_dev"``.
    """
    fetch_and_update_text_for_server(modeladmin, request, queryset, server_name="lrm_dev")

@admin.action(description="Import candidate URLs from Li's Server with Full Text")
@admin.action(description="Import candidate URLs from XLI Server with Full Text")
def fetch_full_text_lis_action(modeladmin, request, queryset):
for collection in queryset:
fetch_and_update_full_text.delay(collection.id, "LIS")
modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.")

fetch_and_update_text_for_server(modeladmin, request, queryset, "xli")

@admin.action(description="Generate deployment message")
def generate_deployment_message(modeladmin, request, queryset):
Expand Down Expand Up @@ -123,7 +122,7 @@ def import_candidate_urls_from_api_caller(modeladmin, request, queryset, server_
messages.add_message(
request,
messages.INFO,
f"Started importing URLs from the API for: {collection_names} from {server_name.title()}",
f"Started importing URLs from {server_name.upper()} Server",
)


Expand All @@ -147,19 +146,19 @@ def import_candidate_urls_secret_production(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production")


@admin.action(description="Import candidate URLs from Li's Server")
def import_candidate_urls_lis_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lis_server")
@admin.action(description="Import candidate URLs from XLI Server")
def import_candidate_urls_xli_server(modeladmin, request, queryset):
    """Admin action that starts a candidate-URL import for the selected rows from the "xli" server.

    Forwards its arguments unchanged to ``import_candidate_urls_from_api_caller``.
    """
    target_server = "xli"
    import_candidate_urls_from_api_caller(modeladmin, request, queryset, target_server)


@admin.action(description="Import candidate URLs from LRM Dev Server")
def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev_server")
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev")


@admin.action(description="Import candidate URLs from LRM QA Server")
def import_candidate_urls_lrm_qa_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa_server")
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa")


class ExportCsvMixin:
Expand Down Expand Up @@ -250,7 +249,7 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
import_candidate_urls_production,
import_candidate_urls_secret_test,
import_candidate_urls_secret_production,
import_candidate_urls_lis_server,
import_candidate_urls_xli_server,
import_candidate_urls_lrm_dev_server,
import_candidate_urls_lrm_qa_server,
fetch_full_text_lrm_dev_action,
Expand Down
24 changes: 24 additions & 0 deletions sde_collections/migrations/0060_alter_candidateurl_scraped_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.9 on 2024-11-07 17:34

from django.db import migrations, models


# Schema migration for the "sde_collections" app: redefines the
# "scraped_text" field of the "candidateurl" model as a TextField that
# carries a verbose name, help text and an empty-string default.
class Migration(migrations.Migration):

# Must run after migration 0059 (presumably the one that added scraped_text;
# confirm against that file).
dependencies = [
("sde_collections", "0059_candidateurl_scraped_text"),
]

# Single operation: alter the existing field in place.
operations = [
migrations.AlterField(
model_name="candidateurl",
name="scraped_text",
# NOTE(review): null=True together with default="" means both NULL and ""
# can stand for "no text"; Django's field reference advises against
# null=True on string-based fields. Confirm this is intended.
field=models.TextField(
blank=True,
default="",
help_text="This is the text scraped by Sinequa",
null=True,
verbose_name="Scraped Text",
),
),
]
8 changes: 7 additions & 1 deletion sde_collections/models/candidate_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@ class CandidateURL(models.Model):
blank=True,
help_text="This is the original title scraped by Sinequa",
)
scraped_text = models.TextField(blank=True, null=True)
scraped_text = models.TextField(
"Scraped Text",
default="",
null=True,
blank=True,
help_text="This is the text scraped by Sinequa",
)
generated_title = models.CharField(
"Generated Title",
default="",
Expand Down
65 changes: 43 additions & 22 deletions sde_collections/sinequa_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Any

import json
import requests
import urllib3
from django.conf import settings
Expand Down Expand Up @@ -32,17 +32,17 @@
"query_name": "query-sde-primary",
"base_url": "https://sciencediscoveryengine.nasa.gov",
},
"lis_server": {
"xli": {
"app_name": "nasa-sba-smd",
"query_name": "query-smd-primary",
"base_url": "http://sde-xli.nasa-impact.net",
},
"lrm_dev_server": {
"lrm_dev": {
"app_name": "sde-init-check",
"query_name": "query-init-check",
"base_url": "https://sde-lrm.nasa-impact.net",
},
"lrm_qa_server": {
"lrm_qa": {
"app_name": "sde-init-check",
"query_name": "query-init-check",
"base_url": "https://sde-qa.nasa-impact.net",
Expand All @@ -53,15 +53,13 @@
class Api:
def __init__(self, server_name: str) -> None:
self.server_name = server_name
self.app_name: str = server_configs[server_name]["app_name"]
self.query_name: str = server_configs[server_name]["query_name"]
self.base_url: str = server_configs[server_name]["base_url"]
self.xli_user = settings.XLI_USER
self.xli_password = settings.XLI_PASSWORD
self.lrm_user = settings.LRM_USER
self.lrm_password = settings.LRM_PASSWORD
self.lrm_qa_user = settings.LRM_QA_USER
self.lrm_qa_password = settings.LRM_QA_PASSWORD
config = server_configs[server_name]
self.app_name: str = config["app_name"]
self.query_name: str = config["query_name"]
self.base_url: str = config["base_url"]
self.user = getattr(settings, f"{server_name}_USER".upper(), None)
self.password = getattr(settings, f"{server_name}_PASSWORD".upper(), None)
self.token = getattr(settings, f"{server_name}_TOKEN".upper(), None)

def process_response(self, url: str, payload: dict[str, Any]) -> Any:
response = requests.post(url, headers={}, json=payload, verify=False)
Expand All @@ -74,14 +72,7 @@ def process_response(self, url: str, payload: dict[str, Any]) -> Any:
return meaningful_response

def query(self, page: int, collection_config_folder: str = "") -> Any:
if self.server_name == "lis_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.xli_password}&User={self.xli_user}"
elif self.server_name == "lrm_dev_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_password}&User={self.lrm_user}"
elif self.server_name == "lrm_qa_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_qa_password}&User={self.lrm_qa_user}"
else:
url = f"{self.base_url}/api/v1/search.query"
url = f"{self.base_url}/api/v1/search.query?Password={self.password}&User={self.user}"
payload = {
"app": self.app_name,
"query": {
Expand All @@ -94,11 +85,41 @@ def query(self, page: int, collection_config_folder: str = "") -> Any:
}

if collection_config_folder:
if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]:
if self.server_name in ["xli", "lrm_dev", "lrm_qa"]:
payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/"
else:
payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/"

response = self.process_response(url, payload)

return response

def sql_query(self, sql: str) -> Any:
    """Execute an SQL statement against this server's ``engine.sql`` endpoint.

    The request is a JSON POST to ``{base_url}/api/v1/engine.sql`` that is
    authenticated with ``self.token`` as a bearer token and times out after
    10 seconds.

    Args:
        sql: The SQL statement to send.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If no token is configured for this server.
        Exception: If the request fails or the server answers with an error
            status. The original ``requests`` exception is attached as
            ``__cause__``.
    """
    if not self.token:
        raise ValueError("You must have a token to use the SQL endpoint")

    url = f"{self.base_url}/api/v1/engine.sql"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {self.token}",
    }
    payload = json.dumps(
        {
            "method": "engine.sql",
            "sql": sql,
            "pretty": True,
            "log": False,
            "output": "json",
            "resolveIndexList": "false",
            "engines": "default",
        }
    )
    try:
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Chain explicitly so the traceback shows the transport error as the
        # direct cause rather than as an incidental context.
        raise Exception(f"API request failed: {str(e)}") from e

def get_full_texts(self, collection_config_folder: str) -> Any:
    """Fetch ``url1``, ``text`` and ``title`` for one collection via ``sql_query``.

    Args:
        collection_config_folder: Folder name placed between ``/SDE/`` and a
            trailing ``/`` to form the collection path to match.

    Returns:
        Whatever ``self.sql_query`` returns for the generated statement.
    """
    # NOTE(review): the folder name is interpolated into the SQL text as-is,
    # with no escaping, so callers must pass a trusted value.
    collection_path = f"/SDE/{collection_config_folder}/"
    statement = f"SELECT url1, text, title FROM sde_index WHERE collection = '{collection_path}'"
    return self.sql_query(statement)
59 changes: 18 additions & 41 deletions sde_collections/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,37 +146,24 @@ def resolve_title_pattern(title_pattern_id):


@celery_app.task
def fetch_and_update_full_text(collection_id, server_type):
try:
collection = Collection.objects.get(id=collection_id)
except Collection.DoesNotExist:
raise Exception(f"Collection with ID {collection_id} does not exist.")

server_config = get_server_config(server_type)
token = server_config["token"]
url = server_config["url"]

headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}

payload = json.dumps(
{
"method": "engine.sql",
"sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'",
"pretty": True,
"log": False,
"output": "json",
"resolveIndexList": "false",
"engines": "default",
}
)

try:
response = requests.post(url, headers=headers, data=payload, timeout=10)
response.raise_for_status() # Raise exception for HTTP errors
except requests.exceptions.RequestException as e:
raise Exception(f"API request failed: {str(e)}")

records = response.json().get("Rows", [])
def fetch_and_update_full_text(collection_id, server_name):
"""
Task to fetch and update full text and metadata for all URLs associated with a specified collection
from a given server.
Args:
collection_id (int): The identifier for the collection in the database.
server_name (str): The name of the server.
Returns:
str: A message indicating the result of the operation, including the number of URLs processed
or a message if no records were found.
"""
collection = Collection.objects.get(id=collection_id)
api = Api(server_name)
full_texts = api.get_full_texts(collection.config_folder)

records = full_texts.get("Rows", [])
if not records:
return "No records found in the response."

Expand All @@ -188,14 +175,4 @@ def fetch_and_update_full_text(collection_id, server_type):
CandidateURL.objects.update_or_create(
url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title}
)

return f"Successfully processed {len(records)} records and updated the database."


def get_server_config(server_type):
    """Return the ``engine.sql`` URL and token for a server label.

    Args:
        server_type: Either ``"LRM_DEV"`` or ``"LIS"``.

    Returns:
        A dict with keys ``"url"`` and ``"token"``. The token is read from the
        process environment at call time and is ``None`` when unset.

    Raises:
        ValueError: If ``server_type`` is not one of the known labels.
    """
    # (label, endpoint URL, environment variable that holds the token)
    known_servers = (
        ("LRM_DEV", "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", "LRMDEV_TOKEN"),
        ("LIS", "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "LIS_TOKEN"),
    )
    for label, endpoint, token_variable in known_servers:
        if server_type == label:
            return {"url": endpoint, "token": os.getenv(token_variable)}
    raise ValueError("Invalid server type.")

0 comments on commit 835c43e

Please sign in to comment.