Skip to content

Commit

Permalink
Merge branch 'main' into refactor-loss
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg authored Jul 23, 2024
2 parents 0c23440 + 3d7d12e commit a5218d6
Show file tree
Hide file tree
Showing 6 changed files with 1,025 additions and 777 deletions.
26 changes: 26 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Annotated, Optional

import psutil
Expand All @@ -9,6 +10,7 @@
from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_delta_to_json_from_args,
convert_finetuning_dataset_from_args,
convert_text_to_mds_from_args,
)
Expand Down Expand Up @@ -240,3 +242,27 @@ def convert_text_to_mds(
trust_remote_code=trust_remote_code,
logging_level=logging_level,
)


# CLI entry point for the `convert_delta_to_json` command. It only parses the
# options and forwards them unchanged to `convert_delta_to_json_from_args`.
# The docstring is kept to one line because it doubles as the command's help
# text.
@app.command(name='convert_delta_to_json')
def convert_delta_to_json_cli(
    delta_table_name: Annotated[
        str,
        Option(..., help='UC table <catalog>.<schema>.<table name>'),
    ],
    json_output_folder: Annotated[
        str,
        Option(..., help='Local path to save the converted json'),
    ],
    http_path: Annotated[
        Optional[str],
        Option(help='If set, dbsql method is used'),
    ] = None,
    batch_size: Annotated[
        int,
        Option(help='Row chunks to transmit a time to avoid OOM'),
    ] = 1 << 30,
    # os.cpu_count() may return None when the CPU count cannot be determined;
    # fall back to a single process so the default is always a valid int.
    processes: Annotated[
        int,
        Option(help='Number of processes allowed to use'),
    ] = os.cpu_count() or 1,
    cluster_id: Annotated[
        Optional[str],
        Option(
            help=
            'Cluster ID with runtime newer than 14.1.0 and access mode of either assigned or shared can use databricks-connect.',
        ),
    ] = None,
    use_serverless: Annotated[
        bool,
        Option(
            help=
            'Use serverless or not. Make sure the workspace is entitled with serverless',
        ),
    ] = False,
    json_output_filename: Annotated[
        str,
        Option(
            help=
            'The name of the combined final jsonl that combines all partitioned jsonl',
        ),
    ] = 'train-00000-of-00001.jsonl',
):
    """Convert a Delta table into JSON files."""
    convert_delta_to_json_from_args(
        delta_table_name=delta_table_name,
        json_output_folder=json_output_folder,
        http_path=http_path,
        batch_size=batch_size,
        processes=processes,
        cluster_id=cluster_id,
        use_serverless=use_serverless,
        json_output_filename=json_output_filename,
    )
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_delta_to_json import (
convert_delta_to_json_from_args,
fetch_DT,
)
from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import (
convert_finetuning_dataset,
convert_finetuning_dataset_from_args,
Expand Down Expand Up @@ -44,4 +48,6 @@
'convert_finetuning_dataset',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
'convert_delta_to_json_from_args',
'fetch_DT',
]
Loading

0 comments on commit a5218d6

Please sign in to comment.