-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added script to download supporting docs
- Loading branch information
1 parent
4f7ab43
commit 9705b61
Showing
6 changed files
with
144 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,34 @@ | ||
hp: | ||
chunk-size: 300 | ||
chunk-size: 500 | ||
overlap: 100 | ||
embeddings-model: "all-MiniLM-L6-v2" | ||
embeddings-model: all-MiniLM-L6-v2 | ||
doc-store: | ||
collection: "eidc-data" | ||
files: "data/chroma-data" | ||
collection: eidc-data | ||
files: data/chroma-data | ||
files: | ||
metadata: "data/eidc_metadata.json" | ||
extracted: "data/extracted_metadata.json" | ||
supporting-docs: "data/supporting-docs.json" | ||
chunked: "data/chunked_data.json" | ||
embeddings: "data/embeddings.json" | ||
doc-store: "data/chroma-data" | ||
test-set: "data/eidc_rag_test_sample.csv" | ||
eval-set: "data/evaluation_data.csv" | ||
metrics: "data/metrics.json" | ||
eval-plot: "data/eval.png" | ||
metadata: data/eidc_metadata.json | ||
extracted: data/extracted_metadata.json | ||
supporting-docs: data/supporting-docs.json | ||
chunked: data/chunked_data.json | ||
embeddings: data/embeddings.json | ||
doc-store: data/chroma-data | ||
test-set: data/eidc_rag_test_sample.csv | ||
eval-set: data/evaluation_data.csv | ||
metrics: data/metrics.json | ||
eval-plot: data/eval.png | ||
sample-size: 10 # sample size of 0 will process all data | ||
rag: | ||
model: llama3.1 | ||
prompt: > | ||
You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n | ||
prompt: >- | ||
You are part of a retrieval augmented pipeline. You will be given a question and | ||
a context on which to base your answer.\n | ||
Do not use your own knowledge to answer the question.\n | ||
The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n | ||
Do not refer to "context" in your answer, instead refer to the context as available information. | ||
If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n | ||
The context provided will be metadata from datasets contained in the Environmental | ||
Information Data Centre (EIDC).\n | ||
Do not refer to "context" in your answer, instead refer to the context as available | ||
information. | ||
If the answer to the question is not clear from the context, suggest which dataset | ||
or datasets might be helpful in answering the question.\n | ||
Question: {{query}}\n | ||
Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %} | ||
Answer: | ||
Answer: |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from argparse import ArgumentParser | ||
import json | ||
from tqdm import tqdm | ||
import requests | ||
import os | ||
from typing import Dict, List | ||
from dotenv import load_dotenv | ||
|
||
|
||
def extract_ids(metadata_file: str) -> List[str]:
    """Return the ``identifier`` of every dataset listed in a metadata file.

    Args:
        metadata_file: Path to a JSON file whose top-level object holds a
            ``"results"`` list; every entry must have an ``"identifier"`` key.

    Returns:
        The identifiers, in the order they appear under ``"results"``.

    Raises:
        KeyError: If ``"results"`` or an entry's ``"identifier"`` is missing.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which differs between systems.
    with open(metadata_file, encoding="utf-8") as f:
        json_data = json.load(f)
    return [dataset["identifier"] for dataset in json_data["results"]]
|
||
|
||
def get_supporting_docs(
    eidc_id: str, user: str, password: str, timeout: float = 30.0
) -> List[Dict[str, str]]:
    """Fetch the supporting documents recorded for one dataset.

    Issues an authenticated GET to the ``/{eidc_id}/documents`` endpoint and
    flattens the mapping found under the response's ``"success"`` key into
    one record per entry.

    Args:
        eidc_id: Dataset identifier, interpolated into the request URL.
        user: User name for HTTP basic authentication.
        password: Password for HTTP basic authentication.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"id": eidc_id, "field": key, "value": val}`` dicts, one
        for each item of the ``"success"`` mapping.
        NOTE(review): values are passed through unchanged; presumably they
        are strings, as the annotation says - confirm against the service.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the JSON body has no ``"success"`` key.
    """
    res = requests.get(
        f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
        auth=(user, password),
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=timeout,
    )
    # Surface authentication and server errors as such, rather than as a
    # confusing JSON-decoding or KeyError failure further down.
    res.raise_for_status()
    json_data = res.json()
    return [
        {"id": eidc_id, "field": key, "value": val}
        for key, val in json_data["success"].items()
    ]
|
||
|
||
def main(metadata_file: str, supporting_docs_file: str):
    """Download supporting documents and write them to a JSON file.

    Credentials are read from the ``username`` and ``password`` environment
    variables after loading a ``.env`` file via ``load_dotenv()``.

    Args:
        metadata_file: Path to the metadata file the dataset ids are read from.
        supporting_docs_file: Path the collected documents are written to as
            JSON (4-space indent).
    """
    load_dotenv()
    user = os.getenv("username")
    password = os.getenv("password")

    collected = []
    for dataset_id in tqdm(extract_ids(metadata_file)):
        collected.extend(get_supporting_docs(dataset_id, user, password))
        # Stop as soon as any document has been collected, so only the first
        # dataset that yields documents ends up in the output.
        # NOTE(review): this looks like leftover debugging - confirm whether
        # the full id list is meant to be processed.
        if collected:
            break

    with open(supporting_docs_file, "w") as out:
        json.dump(collected, out, indent=4)
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: two positional paths handed straight to main().
    cli = ArgumentParser(prog="fetch_supporting_docs.py")
    cli.add_argument("metadata", help="File containing EIDC metadata.")
    cli.add_argument("supporting_docs", help="File to save supporting docs to.")
    options = cli.parse_args()
    main(
        metadata_file=options.metadata,
        supporting_docs_file=options.supporting_docs,
    )