-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added pipeline step for embeddings and parameter options
- Loading branch information
1 parent
24cad77
commit 4071052
Showing
8 changed files
with
117 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,28 @@ | ||
stages: | ||
fetch-metadata: | ||
cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json | ||
cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} | ||
deps: | ||
- scripts/fetch_eidc_metadata.py | ||
outs: | ||
- data/eidc_metadata.json | ||
- ${files.metadata} | ||
extract-metadata: | ||
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json | ||
cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted} | ||
deps: | ||
- data/eidc_metadata.json | ||
- ${files.metadata} | ||
- scripts/extract_metadata.py | ||
outs: | ||
- data/extracted_metadata.json | ||
- ${files.extracted} | ||
chunk-data: | ||
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json | ||
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} | ||
deps: | ||
- data/extracted_metadata.json | ||
- ${files.extracted} | ||
- scripts/chunk_data.py | ||
outs: | ||
- data/chunked_data.json | ||
- ${files.chunked} | ||
create-embeddings: | ||
cmd: python scripts/create_embeddings.py ${files.chunked} ${files.embeddings} | ||
deps: | ||
- ${files.chunked} | ||
- scripts/create_embeddings.py | ||
outs: | ||
- ${files.embeddings} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
hp: | ||
chunk-size: 300 | ||
overlap: 100 | ||
files: | ||
metadata: "data/eidc_metadata.json" | ||
extracted: "data/extracted_metadata.json" | ||
chunked: "data/chunked_data.json" | ||
embeddings: "data/embeddings.json" | ||
sample-size: 10 # sample size of 0 will process all data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,24 @@ | ||
import json | ||
from sentence_transformers import SentenceTransformer | ||
from argparse import ArgumentParser | ||
from tqdm import tqdm | ||
|
||
def create_embedding(text): | ||
model = SentenceTransformer("all-MiniLM-L6-v2") | ||
return model.encode(text) | ||
|
||
|
||
def main(): | ||
with open("data/eidc_metadata.json") as input, open("data/prepared_data.json", "w") as output: | ||
def main(input_file, output_file): | ||
with open(input_file) as input, open(output_file, "w") as output: | ||
data = json.load(input) | ||
for dataset in data["datasets"]: | ||
dataset["desc_emb"] = create_embedding(dataset["desc"]).tolist() | ||
for chunk in tqdm(data): | ||
chunk["embedding"] = create_embedding(chunk["chunk"]).tolist() | ||
json.dump(data, output) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() | ||
parser = ArgumentParser("prepare_data.py") | ||
parser.add_argument("input", help="The file to be used as input.") | ||
parser.add_argument("output", help="The path to save the processed result.") | ||
args = parser.parse_args() | ||
main(args.input, args.output) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from argparse import ArgumentParser | ||
|
||
if __name__ == "__main__": | ||
parser = ArgumentParser("prepare_data.py") | ||
parser.add_argument("input_file", nargs="+", help="File containing chunks and embeddings to upload to document store") | ||
parser.add_argument("-o", "--output", help="The file to write the output to.") |