Skip to content

Commit

Permalink
Script to push data to HF (for later)
Browse files Browse the repository at this point in the history
  • Loading branch information
iamgroot42 committed Jan 30, 2024
1 parent 95050ba commit 83e8cc5
Showing 1 changed file with 55 additions and 0 deletions.
55 changes: 55 additions & 0 deletions python_scripts/dump_cache_to_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
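Script to push cached data to the HuggingFace Hub as a dataset. Uploaded data is public by default.
NOTE: the actual upload call is still commented out (see the TODO in main), so running this script currently only loads the cached data.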
"""
from simple_parsing import ArgumentParser
import os

from mimir.data_utils import HFCompatibleDataset
from mimir.custom_datasets import load_data
from mimir.utils import get_cache_path


def main(args):
    """
    Load cached records (and neighbors, when available) for one data source.

    For each of the "train" and "test" splits, the records file is read from
    the cache directory, the matching neighbors file is read if it exists on
    disk, and both are wrapped in an HFCompatibleDataset. Pushing to the hub
    is not done yet (see TODO at the end of the loop).

    Args:
        args: parsed arguments; `num_records`, `target` and `ngram_suffix`
            are the attributes read here.
    """
    cache_dirname = f"cache_100_200_{args.num_records}_512"
    source = args.target
    ngram_suffix = args.ngram_suffix

    for split in ("train", "test"):
        # Records for this split
        records_path = os.path.join(
            get_cache_path(),
            cache_dirname,
            split,
            f"the_pile_{source}{ngram_suffix}.jsonl",
        )
        records = load_data(records_path)

        # Neighbors are optional: only loaded when the file is present
        neighbor_path = os.path.join(
            get_cache_path(),
            cache_dirname,
            f"{split}_neighbors",
            f"the_pile_{source}{ngram_suffix}_neighbors_25_bert_in_place_swap.jsonl",
        )
        neighbors = load_data(neighbor_path) if os.path.exists(neighbor_path) else None

        dataset = HFCompatibleDataset(records, neighbors)
        # TODO: Not familiar with this, but looks like direct ch.Dataset cannot be pushed. Need to figure out the best way to do this.
        # One alternative is to keep 'neighbors' separate and upload them directly via jsonl files
        # dataset.push_to_hub(f"iamgroot42/mimir", split=f"{source}{ngram_suffix}_{split}")


if __name__ == "__main__":
    # Command-line options selecting which cached source to load.
    # Help is left enabled (the parser default) so that `-h/--help` actually
    # displays the per-argument help strings defined below; previously
    # `add_help=False` made them unreachable.
    parser = ArgumentParser()
    parser.add_argument("--target", help="Data source to upload", required=True)
    parser.add_argument("--ngram_suffix", help="Specific ngram_suffix?", default="")
    parser.add_argument(
        "--num_records",
        help="How many records does this source have?",
        default=1000,
        type=int,
    )
    args = parser.parse_args()
    main(args)

0 comments on commit 83e8cc5

Please sign in to comment.