Skip to content

Commit

Permalink
Added pipeline step for embeddings and parameter options
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 15, 2024
1 parent 24cad77 commit 4071052
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 35 deletions.
2 changes: 2 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
/prepared_eidc_metadata.json
/extracted_metadata.json
/chunked_data.json
/chunked_embeddings.json
/embeddings.json
45 changes: 31 additions & 14 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ stages:
deps:
- path: scripts/fetch_eidc_metadata.py
hash: md5
md5: 43a63d91a3d66caa03738a000c841406
md5: ba838a284da239217d0464f08e0a45ce
size: 674
outs:
- path: data/eidc_metadata.json
hash: md5
md5: 423dc3a61ede72e1d5c818d74277c0b4
size: 12140491
md5: fc2f9ebe92cbd07eb06ff6e39366fdac
size: 12146216
prepare:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
Expand All @@ -33,30 +33,47 @@ stages:
deps:
- path: data/eidc_metadata.json
hash: md5
md5: 423dc3a61ede72e1d5c818d74277c0b4
size: 12140491
md5: fc2f9ebe92cbd07eb06ff6e39366fdac
size: 12146216
- path: scripts/extract_metadata.py
hash: md5
md5: c2fa7d2c4b8f28a6e24536ce0df244fd
size: 1296
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 7d2ae8d6a41a960592f30496eb498af7
size: 4578493
md5: fce18ce3c43175af1cea5d84dac9baf9
size: 4579965
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
10 data/extracted_metadata.json
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 7d2ae8d6a41a960592f30496eb498af7
size: 4578493
md5: fce18ce3c43175af1cea5d84dac9baf9
size: 4579965
- path: scripts/chunk_data.py
hash: md5
md5: b89a3ae9f6f9a0142149e70dc6fc5735
size: 1903
md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
size: 2509
outs:
- path: data/chunked_data.json
hash: md5
md5: 7ba3d3785db066283e35d654e11cf28b
size: 6373503
md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
size: 14947
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
deps:
- path: data/chunked_data.json
hash: md5
md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
size: 14947
- path: scripts/create_embeddings.py
hash: md5
md5: 3dc6ef284730398375a13df4bff41846
size: 808
outs:
- path: data/embeddings.json
hash: md5
md5: b08299369d1f243eb8d8ffa2cdb9a90f
size: 351126
23 changes: 15 additions & 8 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
stages:
fetch-metadata:
cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
deps:
- scripts/fetch_eidc_metadata.py
outs:
- data/eidc_metadata.json
- ${files.metadata}
extract-metadata:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
deps:
- data/eidc_metadata.json
- ${files.metadata}
- scripts/extract_metadata.py
outs:
- data/extracted_metadata.json
- ${files.extracted}
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
deps:
- data/extracted_metadata.json
- ${files.extracted}
- scripts/chunk_data.py
outs:
- data/chunked_data.json
- ${files.chunked}
create-embeddings:
cmd: python scripts/create_embeddings.py ${files.chunked} ${files.embeddings}
deps:
- ${files.chunked}
- scripts/create_embeddings.py
outs:
- ${files.embeddings}
9 changes: 9 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
hp:
chunk-size: 300
overlap: 100
files:
metadata: "data/eidc_metadata.json"
extracted: "data/extracted_metadata.json"
chunked: "data/chunked_data.json"
embeddings: "data/embeddings.json"
sample-size: 10 # sample size of 0 will process all data
49 changes: 42 additions & 7 deletions scripts/chunk_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,33 +25,68 @@ def chunk_metadata_value(metada_value, chunk_size, overlap):
]


def chunk_metadata_file(
    file: str, chunk_size: int, overlap: int, sample_size: int
) -> List[Dict[str, str]]:
    """Chunk every metadata record found in a JSON file.

    Loads *file* (expected to be a JSON array of metadata records) and
    concatenates the chunks produced by ``chunk_metadata_value`` for each
    record. Stops after *sample_size* records have been processed; a value
    that the running count never reaches (e.g. 0 or None) means every
    record is chunked.
    """
    chunks: List[Dict[str, str]] = []
    with open(file) as handle:
        records = json.load(handle)
    for processed, record in enumerate(records, start=1):
        chunks.extend(chunk_metadata_value(record, chunk_size, overlap))
        # Same semantics as the original counter: break only once exactly
        # sample_size records have been consumed.
        if processed == sample_size:
            break
    return chunks


def main(
    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
) -> None:
    """Chunk each input file and dump the combined result as indented JSON.

    NOTE(review): ``ouput_file`` is a typo for ``output_file``; kept as-is
    because renaming a parameter could break keyword-argument callers.
    """
    combined: List[Dict[str, str]] = []
    for path in files:
        combined += chunk_metadata_file(path, chunk_size, overlap, sample_size)
    with open(ouput_file, "w") as sink:
        json.dump(combined, sink, indent=4)


if __name__ == "__main__":
    # NOTE(review): prog name was "prepare_data.py" — a copy/paste leftover
    # from another script; corrected to match this file.
    parser = ArgumentParser("chunk_data.py")
    parser.add_argument("input_files", nargs="+", help="List of files to chunk.")
    parser.add_argument(
        "-o",
        "--output",
        help="The json file to write the output to.",
        type=str,
        nargs="?",
        const="chunk_data_output.json",
        # Bug fix: with nargs="?" the `const` only applies when the flag is
        # given WITHOUT a value; omitting the flag entirely produced None,
        # which crashed the chunk/overlap comparison below and was passed
        # on to main(). `default` covers the omitted-flag case.
        default="chunk_data_output.json",
    )
    parser.add_argument(
        "-c",
        "--chunk",
        help="Desired chunk size in characters.",
        type=int,
        nargs="?",
        const=300,
        default=300,
    )
    parser.add_argument(
        "-ol",
        "--overlap",
        help="Chunk overlap in characters.",
        type=int,
        nargs="?",
        const=100,
        default=100,
    )
    parser.add_argument(
        "-s",
        "--sample",
        help="Only generate chunks for n datasets",
        type=int,
        nargs="?",
        const=0,
        default=0,
    )
    args = parser.parse_args()
    # `assert` is stripped under `python -O`; report a proper usage error.
    if args.chunk <= args.overlap:
        parser.error("chunk size (-c) must be greater than overlap (-ol)")
    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
16 changes: 11 additions & 5 deletions scripts/create_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import json
from sentence_transformers import SentenceTransformer
from argparse import ArgumentParser
from tqdm import tqdm

# Lazily-created module-level model so the expensive SentenceTransformer
# load happens once per process. The original re-instantiated the model on
# EVERY call, and create_embedding is called once per chunk in main().
_model = None


def create_embedding(text):
    """Return the embedding vector for *text*.

    Uses the "all-MiniLM-L6-v2" sentence-transformers model; returns
    whatever ``model.encode`` yields (a numpy array, per the `.tolist()`
    call at the call site).
    """
    global _model
    if _model is None:
        _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model.encode(text)


def main():
with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output:
def main(input_file, output_file):
with open(input_file) as input, open(output_file, "w") as output:
data = json.load(input)
for dataset in data["datasets"]:
dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist()
for chunk in tqdm(data):
chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
json.dump(data, output)


if __name__ == "__main__":
    # Prog name fixed: was "prepare_data.py", a copy/paste leftover that
    # made `-h` report the wrong script name.
    parser = ArgumentParser("create_embeddings.py")
    parser.add_argument("input", help="The file to be used as input.")
    parser.add_argument("output", help="The path to save the processed result.")
    args = parser.parse_args()
    main(args.input, args.output)
2 changes: 1 addition & 1 deletion scripts/fetch_eidc_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def main(output_file: str) -> None:
headers={"content-type": "application/json"},
params={
"page": 1,
"rows": 2000,
"rows": 2500,
"term": "recordType:Dataset",
},
)
Expand Down
6 changes: 6 additions & 0 deletions scripts/upload_to_docstore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from argparse import ArgumentParser

if __name__ == "__main__":
    # Placeholder CLI for pushing chunks + embeddings into a document store.
    # Prog name fixed: was "prepare_data.py" (copy/paste leftover).
    parser = ArgumentParser("upload_to_docstore.py")
    parser.add_argument(
        "input_file",
        nargs="+",
        help="File containing chunks and embeddings to upload to document store",
    )
    parser.add_argument("-o", "--output", help="The file to write the output to.")
    # Bug fix: the parser was built but parse_args() was never called, so
    # `-h` and argument validation never ran and the arguments were unread.
    args = parser.parse_args()
    # TODO: implement the actual upload using args.input_file / args.output.

0 comments on commit 4071052

Please sign in to comment.