Skip to content

Commit

Permalink
Fixed bug so all metadata fields are extracted
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 11, 2024
1 parent 8148da1 commit 2ec86cd
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 16 deletions.
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
 /eidc_metadata.json
 /prepared_data.json
 /prepared_eidc_metadata.json
+/extracted_metadata.json
14 changes: 7 additions & 7 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ stages:
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - path: data/eidc_metadata.json
       hash: md5
       md5: 423dc3a61ede72e1d5c818d74277c0b4
       size: 12140491
-    - path: scripts/prepare_data.py
+    - path: scripts/extract_metadata.py
       hash: md5
-      md5: bcbf4413aeee83928054d9c6c6c2bacc
-      size: 1224
+      md5: c2fa7d2c4b8f28a6e24536ce0df244fd
+      size: 1296
     outs:
-    - path: data/prepared_eidc_metadata.json
+    - path: data/extracted_metadata.json
       hash: md5
-      md5: 0b4ca8c49da450bc8fec0e92d577466c
-      size: 411936
+      md5: 7d2ae8d6a41a960592f30496eb498af7
+      size: 4578493
6 changes: 3 additions & 3 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ stages:
     outs:
     - data/eidc_metadata.json
   prepare:
-    cmd: python scripts/prepare_data.py data/eidc_metadata.json data/prepared_eidc_metadata.json
+    cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
     - data/eidc_metadata.json
-    - scripts/prepare_data.py
+    - scripts/extract_metadata.py
     outs:
-    - data/prepared_eidc_metadata.json
+    - data/extracted_metadata.json
14 changes: 8 additions & 6 deletions scripts/prepare_data.py → scripts/extract_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
 from argparse import ArgumentParser


-METADATA_FIELDS = ["title", "description", "lineage", "title"]
+METADATA_FIELDS = ["title", "description", "lineage"]


-def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> Dict[str,str]:
-    metadata = {}
-    metadata["id"] = json_data["identifier"]
+def extact_eidc_metadata_fields(json_data: Dict, fields: List[str] = METADATA_FIELDS) -> List[Dict[str,str]]:
+    metadatas = []
     for field in fields:
         if json_data[field]:
+            metadata = {}
+            metadata["id"] = json_data["identifier"]
             metadata["field"] = field
             metadata["value"] = json_data[field]
-    return metadata
+            metadatas.append(metadata)
+    return metadatas


def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
Expand All @@ -22,7 +24,7 @@ def parse_eidc_metadata(file_path: str) -> List[Dict[str,str]]:
json_data = json.load(f)
for dataset in json_data["results"]:
dataset_metadata = extact_eidc_metadata_fields(dataset)
-data.append(dataset_metadata)
+data.extend(dataset_metadata)
return data


Expand Down

1 comment on commit 2ec86cd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

context_precision: 0.48770238494036633
answer_correctness: 0.5298055734207788
answer_relevancy: 0.5242851670405076
context_recall: 0.445709726378774

Please sign in to comment.