From 9b37063ff198c78027c0d50b5dcf0ea4ea280749 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 10:09:22 -0800 Subject: [PATCH] Validation (#866) * add validation script * update * change token count function * reorganize cells * Add unit tests * Add a printout for CPT * update question * Add questions * Fix lints * update format * update * nb source * Remove license insert for validation notebook * Add validation utils * Minor cleanups (#858) * nits * logger * add log * lint * update utils/__init__.py to include extra validation functions * update notebook * update * update * Read UC delta table (#773) * initial commit * use databricks-sql to read delta table and convert to json * update * update * update * add mocked unittest * Fix lints * update * update * restructure code * Add timer for optimizing * Add db-connect * add wrapper * update * add install dbconnect * update * update * patch dbconnect to allow multiple return formats * update * add arrow * use compression * clean up * Add cluster rt check * Fix lints * remove patch.py for CI * update * update * update * update * fix tests * fix lint * update * update * Add more tests * update * update * update * change to download_json * update * fix lints * Add decompressed option for arrow * format json to jsonl * Add comments * Make cf_collect_type global option * fix comments * fix lints * fix comments * Fix lints * change to use workspaceclient * Add CPT support * Rewire method assignment logic * Fix bug in stripping https * Add tests for rewired method assignment logic * Fix lints * Fix lints * Removed logger set_level * Remove pyspark.
It conflicts with databricks-connect * Update the comment * skip cluster version check when cluster_id is serverless * Add use_serverless flag * update tests with use_serverless flag * Fix lints --------- Co-authored-by: Xiaohan Zhang * Add download remote function to util * update * remove fused layernorm (#859) * update * update * update * update * update * update * update * update * update * Remove hardcoded combined.jsonl with a flag (#861) * Remove hardcoded combined.jsonl with a flag * update * change output_json_path output_json_folder --------- Co-authored-by: Xiaohan Zhang * bump (#828) * Add dask and dataframe_to_mds * update * update * update * update * Add notebook * update * update * remove script and tests, keep notebook * update * update * update * update * Always initialize dist (#864) * fix dev * lint * remove gpu * updated notebook * remove scripts keep notebook --------- Co-authored-by: Xiaohan Zhang Co-authored-by: xiaohanzhan-db Co-authored-by: Mihir Patel --- notebooks/validate_and_tokenize_data.ipynb | 496 +++--------- .../data_prep/validate_and_tokenize_data.py | 731 ------------------ .../test_validate_and_tokenize_data.py | 131 ---- tests/a_scripts/eval/test_eval.py | 1 - tests/fixtures/autouse.py | 6 +- 5 files changed, 94 insertions(+), 1271 deletions(-) delete mode 100644 scripts/data_prep/validate_and_tokenize_data.py delete mode 100644 tests/a_scripts/data_prep/test_validate_and_tokenize_data.py diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb index 6df4453e99..8d974cc479 100644 --- a/notebooks/validate_and_tokenize_data.ipynb +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "f275a21b-47d4-472c-972b-e2a84a597db2", "showTitle": false, @@ -58,10 +55,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d", "showTitle": false, @@ -69,7 +63,7 @@ } }, "source": [ - "#### Install llmfoundry Validation Branch" + "# Installation" ] }, { @@ -87,16 +81,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nWARNING: Skipping llm-foundry as it is not installed.\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "%pip uninstall -y llm-foundry" ] @@ -136,16 +121,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nCollecting git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation\n Cloning https://github.com/XiaohanZhangCMU/llm-foundryX.git (to revision validation) to /tmp/pip-req-build-k0ts0h4y\n Running command git clone --filter=blob:none --quiet https://github.com/XiaohanZhangCMU/llm-foundryX.git /tmp/pip-req-build-k0ts0h4y\n Running command git checkout -b validation --track origin/validation\n Switched to 
a new branch 'validation'\n branch 'validation' set up to track 'origin/validation'.\n Resolved https://github.com/XiaohanZhangCMU/llm-foundryX.git to commit 596443af831e8fcea2d3b0f470382f0ac356bb45\n Installing build dependencies: started\n Installing build dependencies: finished with status 'done'\n Getting requirements to build wheel: started\n Getting requirements to build wheel: finished with status 'done'\n Installing backend dependencies: started\n Installing backend dependencies: finished with status 'done'\n Preparing metadata (pyproject.toml): started\n Preparing metadata (pyproject.toml): finished with status 'done'\nCollecting triton-pre-mlir@ git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python\n Cloning https://github.com/vchiley/triton.git (to revision triton_pre_mlir_sm90) to /tmp/pip-install-uuujgkne/triton-pre-mlir_c7eb4f6ef32e41c9a6b866a25be26d42\n Running command git clone --filter=blob:none --quiet https://github.com/vchiley/triton.git /tmp/pip-install-uuujgkne/triton-pre-mlir_c7eb4f6ef32e41c9a6b866a25be26d42\n Running command git checkout -b triton_pre_mlir_sm90 --track origin/triton_pre_mlir_sm90\n Switched to a new branch 'triton_pre_mlir_sm90'\n branch 'triton_pre_mlir_sm90' set up to track 'origin/triton_pre_mlir_sm90'.\n Resolved https://github.com/vchiley/triton.git to commit 86c7fe23397467ade531513291f729c12dd8d15e\n Running command git submodule update --init --recursive -q\n Preparing metadata (setup.py): started\n Preparing metadata (setup.py): finished with status 'done'\nCollecting mosaicml-cli<1,>=0.5.27\n Downloading mosaicml_cli-0.6.1-py3-none-any.whl (255 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 255.2/255.2 kB 4.8 MB/s eta 0:00:00\nCollecting beautifulsoup4<5,>=4.12.2\n Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.0/143.0 kB 7.6 MB/s eta 0:00:00\nCollecting accelerate<0.26,>=0.25\n Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.7/265.7 kB 8.4 MB/s eta 0:00:00\nCollecting mosaicml-streaming<0.8,>=0.7.2\n Downloading mosaicml_streaming-0.7.2-py3-none-any.whl (249 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 249.9/249.9 kB 9.1 MB/s eta 0:00:00\nCollecting sentencepiece==0.1.97\n Downloading sentencepiece-0.1.97-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 12.8 MB/s eta 0:00:00\nCollecting fsspec==2023.6.0\n Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.8/163.8 kB 16.9 MB/s eta 0:00:00\nCollecting omegaconf<3,>=2.2.3\n Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.5/79.5 kB 15.4 MB/s eta 0:00:00\nCollecting torch<2.1.1,>=2.1\n Downloading torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 670.2/670.2 MB 1.9 MB/s eta 0:00:00\nCollecting transformers<4.37,>=4.36\n Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.2/8.2 MB 115.1 MB/s eta 0:00:00\nCollecting mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2\n Downloading mosaicml-0.17.2-py3-none-any.whl (622 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 622.8/622.8 kB 83.3 MB/s eta 0:00:00\nCollecting huggingface-hub<1.0,>=0.17.0\n Downloading huggingface_hub-0.20.2-py3-none-any.whl (330 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 330.3/330.3 kB 68.0 MB/s eta 
0:00:00\nRequirement already satisfied: boto3<2,>=1.21.45 in /databricks/python3/lib/python3.10/site-packages (from llm-foundry==0.4.0) (1.24.28)\nCollecting cmake<=3.26.3,>=3.25.0\n Downloading cmake-3.26.3-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.0 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.0/24.0 MB 76.8 MB/s eta 0:00:00\nCollecting datasets==2.15.0\n Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 521.2/521.2 kB 87.5 MB/s eta 0:00:00\nCollecting onnx==1.14.0\n Downloading onnx-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 110.0 MB/s eta 0:00:00\nCollecting einops==0.7.0\n Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 kB 11.2 MB/s eta 0:00:00\nCollecting tenacity<9,>=8.2.3\n Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)\nCollecting onnxruntime==1.15.1\n Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.9/5.9 MB 131.6 MB/s eta 0:00:00\nCollecting dask[distributed]>=2023.11.0\n Downloading dask-2023.12.1-py3-none-any.whl (1.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 107.6 MB/s eta 0:00:00\nCollecting slack-sdk<4\n Downloading slack_sdk-3.26.2-py2.py3-none-any.whl (284 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 284.1/284.1 kB 57.7 MB/s eta 0:00:00\nCollecting aiohttp\n Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 113.1 MB/s eta 0:00:00\nCollecting xxhash\n Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 40.1 MB/s eta 0:00:00\nRequirement already satisfied: pyarrow>=8.0.0 in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (8.0.0)\nCollecting fsspec[http]<=2023.10.0,>=2023.1.0\n Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.4/166.4 kB 39.3 MB/s eta 0:00:00\nRequirement already satisfied: packaging in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (22.0)\nRequirement already satisfied: pandas in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (1.5.3)\nRequirement already satisfied: requests>=2.19.0 in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (2.28.1)\nCollecting tqdm>=4.62.1\n Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 21.3 MB/s eta 0:00:00\nCollecting multiprocess\n Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 32.6 MB/s eta 0:00:00\nRequirement already satisfied: pyarrow-hotfix in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (0.5)\nRequirement already satisfied: numpy>=1.17 in /databricks/python3/lib/python3.10/site-packages (from datasets==2.15.0->llm-foundry==0.4.0) (1.23.5)\nCollecting pyyaml>=5.1\n Downloading PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 705.5/705.5 kB 88.2 MB/s eta 0:00:00\nCollecting dill<0.3.8,>=0.3.0\n Downloading dill-0.3.7-py3-none-any.whl 
(115 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 115.3/115.3 kB 27.3 MB/s eta 0:00:00\nRequirement already satisfied: typing-extensions>=3.6.2.1 in /databricks/python3/lib/python3.10/site-packages (from onnx==1.14.0->llm-foundry==0.4.0) (4.4.0)\nRequirement already satisfied: protobuf>=3.20.2 in /databricks/python3/lib/python3.10/site-packages (from onnx==1.14.0->llm-foundry==0.4.0) (4.24.0)\nCollecting coloredlogs\n Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 11.7 MB/s eta 0:00:00\nCollecting sympy\n Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 124.6 MB/s eta 0:00:00\nCollecting flatbuffers\n Downloading flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)\nRequirement already satisfied: psutil in /databricks/python3/lib/python3.10/site-packages (from accelerate<0.26,>=0.25->llm-foundry==0.4.0) (5.9.0)\nCollecting safetensors>=0.3.1\n Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 117.2 MB/s eta 0:00:00\nRequirement already satisfied: soupsieve>1.2 in /databricks/python3/lib/python3.10/site-packages (from beautifulsoup4<5,>=4.12.2->llm-foundry==0.4.0) (2.3.2.post1)\nRequirement already satisfied: botocore<1.28.0,>=1.27.28 in /databricks/python3/lib/python3.10/site-packages (from boto3<2,>=1.21.45->llm-foundry==0.4.0) (1.27.96)\nRequirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /databricks/python3/lib/python3.10/site-packages (from boto3<2,>=1.21.45->llm-foundry==0.4.0) (0.6.2)\nRequirement already satisfied: jmespath<2.0.0,>=0.7.1 in /databricks/python3/lib/python3.10/site-packages (from boto3<2,>=1.21.45->llm-foundry==0.4.0) (0.10.0)\nCollecting importlib-metadata>=4.13.0\n Downloading importlib_metadata-7.0.1-py3-none-any.whl (23 kB)\nCollecting cloudpickle>=1.5.0\n Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)\nCollecting toolz>=0.10.0\n Downloading toolz-0.12.0-py3-none-any.whl (55 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.8/55.8 kB 13.4 MB/s eta 0:00:00\nCollecting click>=8.1\n Downloading click-8.1.7-py3-none-any.whl (97 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.9/97.9 kB 26.2 MB/s eta 0:00:00\nCollecting partd>=1.2.0\n Downloading partd-1.4.1-py3-none-any.whl (18 kB)\nCollecting distributed==2023.12.1\n Downloading distributed-2023.12.1-py3-none-any.whl (999 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 999.0/999.0 kB 108.9 MB/s eta 0:00:00\nRequirement already satisfied: urllib3>=1.24.3 in /databricks/python3/lib/python3.10/site-packages (from distributed==2023.12.1->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (1.26.14)\nCollecting sortedcontainers>=2.0.5\n Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\nCollecting msgpack>=1.0.0\n Downloading msgpack-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (530 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 530.8/530.8 kB 83.0 MB/s eta 0:00:00\nRequirement already satisfied: tornado>=6.0.4 in /databricks/python3/lib/python3.10/site-packages (from distributed==2023.12.1->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (6.1)\nCollecting tblib>=1.6.0\n Downloading tblib-3.0.0-py3-none-any.whl (12 kB)\nCollecting zict>=3.0.0\n Downloading zict-3.0.0-py2.py3-none-any.whl (43 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.3/43.3 kB 10.2 MB/s eta 0:00:00\nCollecting locket>=1.0.0\n Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 
kB)\nRequirement already satisfied: jinja2>=2.10.3 in /databricks/python3/lib/python3.10/site-packages (from distributed==2023.12.1->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (3.1.2)\nRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.17.0->llm-foundry==0.4.0) (3.12.3)\nCollecting ruamel.yaml>=0.17.21\n Downloading ruamel.yaml-0.18.5-py3-none-any.whl (116 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.4/116.4 kB 30.2 MB/s eta 0:00:00\nRequirement already satisfied: prompt-toolkit>=3.0.29 in /databricks/python3/lib/python3.10/site-packages (from mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (3.0.36)\nCollecting gql[websockets]>=3.4.0\n Downloading gql-3.5.0-py2.py3-none-any.whl (74 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 74.0/74.0 kB 21.7 MB/s eta 0:00:00\nCollecting rich>=12.6.0\n Downloading rich-13.7.0-py3-none-any.whl (240 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 240.6/240.6 kB 53.7 MB/s eta 0:00:00\nCollecting validators>=0.20.0\n Downloading validators-0.22.0-py3-none-any.whl (26 kB)\nCollecting argcomplete>=2.0.0\n Downloading argcomplete-3.2.1-py3-none-any.whl (42 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.3/42.3 kB 10.6 MB/s eta 0:00:00\nCollecting questionary>=1.10.0\n Downloading questionary-2.0.1-py3-none-any.whl (34 kB)\nCollecting arrow>=1.2.2\n Downloading arrow-1.3.0-py3-none-any.whl (66 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.4/66.4 kB 17.4 MB/s eta 0:00:00\nCollecting backoff>=2.2.1\n Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\nCollecting Brotli>=1.0.9\n Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 58.0 MB/s eta 0:00:00\nCollecting azure-storage-file-datalake<13,>=12.11.0\n Downloading azure_storage_file_datalake-12.14.0-py3-none-any.whl (251 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 251.0/251.0 kB 51.5 MB/s eta 0:00:00\nCollecting azure-identity>=1.13.0\n Downloading azure_identity-1.15.0-py3-none-any.whl (164 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 164.7/164.7 kB 40.1 MB/s eta 0:00:00\nCollecting python-snappy<1,>=0.6.1\n Downloading python_snappy-0.6.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (55 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.9/55.9 kB 14.1 MB/s eta 0:00:00\nCollecting oci<3,>=2.88\n Downloading oci-2.118.2-py3-none-any.whl (24.9 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.9/24.9 MB 75.1 MB/s eta 0:00:00\nCollecting paramiko<4,>=2.11.0\n Downloading paramiko-3.4.0-py3-none-any.whl (225 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 225.9/225.9 kB 45.8 MB/s eta 0:00:00\nCollecting zstd<2,>=1.5.2.5\n Downloading zstd-1.5.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 126.3 MB/s eta 0:00:00\nRequirement already satisfied: matplotlib<4,>=3.5.2 in /databricks/python3/lib/python3.10/site-packages (from mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (3.7.0)\nCollecting torchvision>=0.10\n Downloading torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.8/6.8 MB 121.0 MB/s eta 0:00:00\nCollecting azure-storage-blob<13,>=12.0.0\n Downloading azure_storage_blob-12.19.0-py3-none-any.whl (394 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 394.2/394.2 kB 71.2 MB/s eta 0:00:00\nCollecting google-cloud-storage<2.11.0,>=2.9.0\n Downloading 
google_cloud_storage-2.10.0-py2.py3-none-any.whl (114 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.6/114.6 kB 28.2 MB/s eta 0:00:00\nCollecting torch-optimizer<0.4,>=0.3.0\n Downloading torch_optimizer-0.3.0-py3-none-any.whl (61 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.9/61.9 kB 16.7 MB/s eta 0:00:00\nCollecting torchmetrics<1.1,>=0.10.0\n Downloading torchmetrics-1.0.3-py3-none-any.whl (731 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.6/731.6 kB 97.1 MB/s eta 0:00:00\nCollecting coolname<3,>=1.1.0\n Downloading coolname-2.2.0-py2.py3-none-any.whl (37 kB)\nCollecting importlib-metadata>=4.13.0\n Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)\nCollecting mosaicml-cli<1,>=0.5.27\n Downloading mosaicml_cli-0.5.34-py3-none-any.whl (255 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 255.5/255.5 kB 52.5 MB/s eta 0:00:00\nCollecting tabulate==0.9.0\n Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)\nCollecting py-cpuinfo<10,>=8.0.0\n Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\nCollecting mlflow<3.0,>=2.8.1\n Downloading mlflow-2.9.2-py3-none-any.whl (19.1 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.1/19.1 MB 87.4 MB/s eta 0:00:00\nCollecting wandb<0.17,>=0.13.2\n Downloading wandb-0.16.2-py3-none-any.whl (2.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 120.0 MB/s eta 0:00:00\nCollecting apache-libcloud<4,>=3.3.1\n Downloading apache_libcloud-3.8.0-py2.py3-none-any.whl (3.7 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.7/3.7 MB 129.3 MB/s eta 0:00:00\nCollecting antlr4-python3-runtime==4.9.*\n Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 117.0/117.0 kB 29.3 MB/s eta 0:00:00\n Preparing metadata (setup.py): started\n Preparing metadata (setup.py): finished with status 'done'\nCollecting nvidia-cuda-cupti-cu12==12.1.105\n Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 113.2 MB/s eta 0:00:00\nCollecting nvidia-cusolver-cu12==11.4.5.107\n Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 20.5 MB/s eta 0:00:00\nCollecting nvidia-nvtx-cu12==12.1.105\n Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 25.9 MB/s eta 0:00:00\nCollecting nvidia-cublas-cu12==12.1.3.1\n Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 2.6 MB/s eta 0:00:00\nCollecting nvidia-cufft-cu12==11.0.2.54\n Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 21.2 MB/s eta 0:00:00\nCollecting nvidia-nccl-cu12==2.18.1\n Downloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.8/209.8 MB 9.9 MB/s eta 0:00:00\nCollecting networkx\n Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 118.0 MB/s eta 0:00:00\nCollecting nvidia-cuda-runtime-cu12==12.1.105\n Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 97.3 MB/s eta 0:00:00\nCollecting nvidia-curand-cu12==10.3.2.106\n Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 40.1 MB/s eta 0:00:00\nCollecting nvidia-cusparse-cu12==12.1.0.106\n Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 11.5 MB/s eta 0:00:00\nCollecting triton==2.1.0\n Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 89.2/89.2 MB 27.5 MB/s eta 0:00:00\nCollecting nvidia-cuda-nvrtc-cu12==12.1.105\n Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 16.9 MB/s eta 0:00:00\nCollecting nvidia-cudnn-cu12==8.9.2.26\n Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 1.4 MB/s eta 0:00:00\nCollecting nvidia-nvjitlink-cu12\n Downloading nvidia_nvjitlink_cu12-12.3.101-py3-none-manylinux1_x86_64.whl (20.5 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.5/20.5 MB 101.7 MB/s eta 0:00:00\nCollecting regex!=2019.12.17\n Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 774.0/774.0 kB 97.8 MB/s eta 0:00:00\nCollecting tokenizers<0.19,>=0.14\n Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.8/3.8 MB 135.6 MB/s eta 0:00:00\nCollecting types-python-dateutil>=2.8.10\n Downloading types_python_dateutil-2.8.19.20240106-py3-none-any.whl (9.7 kB)\nRequirement already satisfied: python-dateutil>=2.7.0 in /databricks/python3/lib/python3.10/site-packages (from arrow>=1.2.2->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (2.8.2)\nRequirement already satisfied: cryptography>=2.5 in /databricks/python3/lib/python3.10/site-packages (from azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (39.0.1)\nCollecting msal<2.0.0,>=1.24.0\n Downloading msal-1.26.0-py2.py3-none-any.whl (99 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.0/99.0 kB 25.3 MB/s eta 0:00:00\nCollecting msal-extensions<2.0.0,>=0.3.0\n Downloading msal_extensions-1.1.0-py3-none-any.whl (19 kB)\nCollecting azure-core<2.0.0,>=1.23.0\n Downloading azure_core-1.29.6-py3-none-any.whl (192 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 192.5/192.5 kB 44.4 MB/s eta 0:00:00\nCollecting isodate>=0.6.1\n Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 kB 9.3 MB/s eta 0:00:00\nCollecting fsspec[http]<=2023.10.0,>=2023.1.0\n Downloading fsspec-2023.9.2-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.4/173.4 kB 35.3 MB/s eta 0:00:00\n Downloading fsspec-2023.9.1-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.4/173.4 kB 41.1 MB/s eta 0:00:00\n Downloading fsspec-2023.9.0-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.2/173.2 kB 38.2 MB/s eta 0:00:00\nCollecting multidict<7.0,>=4.5\n Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.5/114.5 kB 24.5 MB/s eta 0:00:00\nCollecting yarl<2.0,>=1.0\n Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 301.6/301.6 kB 60.1 MB/s eta 0:00:00\nRequirement already satisfied: attrs>=17.3.0 in 
/databricks/python3/lib/python3.10/site-packages (from aiohttp->datasets==2.15.0->llm-foundry==0.4.0) (22.1.0)\nCollecting frozenlist>=1.1.1\n Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.5/239.5 kB 50.8 MB/s eta 0:00:00\nCollecting aiosignal>=1.1.2\n Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\nCollecting async-timeout<5.0,>=4.0\n Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\nCollecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5\n Downloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 122.0/122.0 kB 29.7 MB/s eta 0:00:00\nCollecting google-resumable-media>=2.3.2\n Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 80.6/80.6 kB 22.6 MB/s eta 0:00:00\nCollecting google-cloud-core<3.0dev,>=2.3.0\n Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\nCollecting google-auth<3.0dev,>=1.25.0\n Downloading google_auth-2.26.2-py2.py3-none-any.whl (186 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 186.5/186.5 kB 43.6 MB/s eta 0:00:00\nCollecting graphql-core<3.3,>=3.2\n Downloading graphql_core-3.2.3-py3-none-any.whl (202 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 202.9/202.9 kB 43.7 MB/s eta 0:00:00\nRequirement already satisfied: anyio<5,>=3.0 in /databricks/python3/lib/python3.10/site-packages (from gql[websockets]>=3.4.0->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (3.5.0)\nCollecting websockets<12,>=10\n Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.9/129.9 kB 33.4 MB/s eta 0:00:00\nRequirement already satisfied: zipp>=0.5 in /usr/lib/python3/dist-packages (from importlib-metadata>=4.13.0->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (1.0.0)\nRequirement already satisfied: MarkupSafe>=2.0 in /databricks/python3/lib/python3.10/site-packages (from jinja2>=2.10.3->distributed==2023.12.1->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (2.1.1)\nRequirement already satisfied: cycler>=0.10 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (0.11.0)\nRequirement already satisfied: pillow>=6.2.0 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (9.4.0)\nRequirement already satisfied: pyparsing>=2.3.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (3.0.9)\nRequirement already satisfied: contourpy>=1.0.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.0.5)\nRequirement already satisfied: fonttools>=4.22.0 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (4.25.0)\nRequirement already satisfied: kiwisolver>=1.0.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.4.4)\nCollecting sqlparse<1,>=0.4.0\n Downloading sqlparse-0.4.4-py3-none-any.whl (41 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.2/41.2 kB 11.1 MB/s eta 0:00:00\nRequirement already satisfied: scipy<2 in 
/databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.10.0)\nCollecting gunicorn<22\n Downloading gunicorn-21.2.0-py3-none-any.whl (80 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 80.2/80.2 kB 24.7 MB/s eta 0:00:00\nRequirement already satisfied: scikit-learn<2 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.1.1)\nRequirement already satisfied: pytz<2024 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2022.7)\nCollecting docker<7,>=4.0.0\n Downloading docker-6.1.3-py3-none-any.whl (148 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 148.1/148.1 kB 34.0 MB/s eta 0:00:00\nRequirement already satisfied: entrypoints<1 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (0.4)\nCollecting querystring-parser<2\n Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)\nCollecting Flask<4\n Downloading flask-3.0.0-py3-none-any.whl (99 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.7/99.7 kB 27.7 MB/s eta 0:00:00\nCollecting databricks-cli<1,>=0.8.7\n Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 kB 5.4 MB/s eta 0:00:00\nCollecting sqlalchemy<3,>=1.4.0\n Downloading SQLAlchemy-2.0.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 144.1 MB/s eta 0:00:00\nCollecting gitpython<4,>=2.1.0\n Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.4/196.4 kB 44.0 MB/s eta 0:00:00\nCollecting alembic!=1.10.0,<2\n Downloading alembic-1.13.1-py3-none-any.whl (233 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.4/233.4 kB 47.1 MB/s eta 0:00:00\nCollecting markdown<4,>=3.3\n Downloading Markdown-3.5.2-py3-none-any.whl (103 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 103.9/103.9 kB 25.9 MB/s eta 0:00:00\nRequirement already satisfied: certifi in /databricks/python3/lib/python3.10/site-packages (from oci<3,>=2.88->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (2022.12.7)\nCollecting pyOpenSSL<24.0.0,>=17.5.0\n Downloading pyOpenSSL-23.3.0-py3-none-any.whl (58 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.8/58.8 kB 15.8 MB/s eta 0:00:00\nCollecting circuitbreaker<2.0.0,>=1.3.1\n Downloading circuitbreaker-1.4.0.tar.gz (9.7 kB)\n Preparing metadata (setup.py): started\n Preparing metadata (setup.py): finished with status 'done'\nCollecting pynacl>=1.5\n Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 856.7/856.7 kB 103.7 MB/s eta 0:00:00\nCollecting bcrypt>=3.2\n Downloading bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl (698 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 698.9/698.9 kB 98.9 MB/s eta 0:00:00\nRequirement already satisfied: wcwidth in /databricks/python3/lib/python3.10/site-packages (from prompt-toolkit>=3.0.29->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (0.2.5)\nRequirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15.0->llm-foundry==0.4.0) (3.4)\nRequirement already satisfied: 
charset-normalizer<3,>=2 in /databricks/python3/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15.0->llm-foundry==0.4.0) (2.0.4)\nCollecting markdown-it-py>=2.2.0\n Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.5/87.5 kB 23.7 MB/s eta 0:00:00\nCollecting pygments<3.0.0,>=2.13.0\n Downloading pygments-2.17.2-py3-none-any.whl (1.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 42.1 MB/s eta 0:00:00\nCollecting ruamel.yaml.clib>=0.2.7\n Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 526.7/526.7 kB 79.8 MB/s eta 0:00:00\nCollecting pytorch-ranger>=0.1.1\n Downloading pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)\nCollecting lightning-utilities>=0.7.0\n Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)\nCollecting torchvision>=0.10\n Downloading torchvision-0.16.1-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.8/6.8 MB 124.7 MB/s eta 0:00:00\n Downloading torchvision-0.16.0-cp310-cp310-manylinux1_x86_64.whl (6.9 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 130.5 MB/s eta 0:00:00\nCollecting docker-pycreds>=0.4.0\n Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\nCollecting sentry-sdk>=1.0.0\n Downloading sentry_sdk-1.39.2-py2.py3-none-any.whl (254 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 254.1/254.1 kB 52.1 MB/s eta 0:00:00\nCollecting appdirs>=1.4.3\n Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\nCollecting setproctitle\n Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\nRequirement already satisfied: setuptools in /databricks/python3/lib/python3.10/site-packages (from wandb<0.17,>=0.13.2->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (65.6.3)\nCollecting humanfriendly>=9.1\n Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 21.2 MB/s eta 0:00:00\nCollecting typing-extensions>=3.6.2.1\n Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\nCollecting mpmath>=0.19\n Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 82.1 MB/s eta 0:00:00\nCollecting Mako\n Downloading Mako-1.3.0-py3-none-any.whl (78 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 22.4 MB/s eta 0:00:00\nRequirement already satisfied: sniffio>=1.1 in /databricks/python3/lib/python3.10/site-packages (from anyio<5,>=3.0->gql[websockets]>=3.4.0->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (1.2.0)\nRequirement already satisfied: six>=1.11.0 in /usr/lib/python3/dist-packages (from azure-core<2.0.0,>=1.23.0->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.16.0)\nRequirement already satisfied: cffi>=1.12 in /databricks/python3/lib/python3.10/site-packages (from cryptography>=2.5->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.15.1)\nRequirement already satisfied: pyjwt>=1.7.0 in /usr/lib/python3/dist-packages (from databricks-cli<1,>=0.8.7->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2.3.0)\nRequirement already satisfied: oauthlib>=3.1.0 in /usr/lib/python3/dist-packages (from 
databricks-cli<1,>=0.8.7->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (3.2.0)\nRequirement already satisfied: websocket-client>=0.32.0 in /databricks/python3/lib/python3.10/site-packages (from docker<7,>=4.0.0->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (0.58.0)\nCollecting blinker>=1.6.2\n Downloading blinker-1.7.0-py3-none-any.whl (13 kB)\nCollecting Werkzeug>=3.0.0\n Downloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 226.7/226.7 kB 49.9 MB/s eta 0:00:00\nCollecting itsdangerous>=2.1.2\n Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)\nCollecting gitdb<5,>=4.0.1\n Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 16.4 MB/s eta 0:00:00\nRequirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /databricks/python3/lib/python3.10/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage<2.11.0,>=2.9.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.60.0)\nCollecting rsa<5,>=3.1.4\n Downloading rsa-4.9-py3-none-any.whl (34 kB)\nCollecting cachetools<6.0,>=2.0.0\n Downloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\nCollecting pyasn1-modules>=0.2.1\n Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 181.3/181.3 kB 46.1 MB/s eta 0:00:00\nCollecting google-crc32c<2.0dev,>=1.0\n Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\nCollecting mdurl~=0.1\n Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\nCollecting portalocker<3,>=1.0\n Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\nCollecting cryptography>=2.5\n Downloading cryptography-41.0.7-cp37-abi3-manylinux_2_28_x86_64.whl (4.4 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.4/4.4 MB 68.2 MB/s eta 0:00:00\nRequirement already satisfied: threadpoolctl>=2.0.0 in /databricks/python3/lib/python3.10/site-packages (from scikit-learn<2->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2.2.0)\nRequirement already satisfied: joblib>=1.0.0 in /databricks/python3/lib/python3.10/site-packages (from scikit-learn<2->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.2.0)\nCollecting greenlet!=0.4.17\n Downloading greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (616 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 616.0/616.0 kB 4.3 MB/s eta 0:00:00\nRequirement already satisfied: pycparser in /databricks/python3/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=2.5->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (2.21)\nCollecting smmap<6,>=3.0.1\n Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\nCollecting pyasn1<0.6.0,>=0.4.6\n Downloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.9/84.9 kB 517.3 kB/s eta 0:00:00\nBuilding wheels for collected packages: llm-foundry, antlr4-python3-runtime, triton-pre-mlir, circuitbreaker\n Building wheel for llm-foundry (pyproject.toml): started\n Building wheel for llm-foundry (pyproject.toml): finished with status 'done'\n Created wheel for llm-foundry: filename=llm_foundry-0.4.0-py3-none-any.whl size=197547 sha256=335302af54a15592709b42dde0adb2149c5b1d281fa82d3b20d1259b3d6baf61\n Stored in directory: 
/tmp/pip-ephem-wheel-cache-2c60111w/wheels/df/be/d7/c79b8cdc3f0171610b5c374a1f80583c097aafae35164f1626\n Building wheel for antlr4-python3-runtime (setup.py): started\n Building wheel for antlr4-python3-runtime (setup.py): finished with status 'done'\n Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=7c6226c64d79589e6cd31a934f4031fbd4cdff8f36318caa498668ccea1a8a27\n Stored in directory: /home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/48/6a/c2/acb58c7afdf57e4cddf5e1513f5a2d62aa8e98f82a00c76d7c\n Building wheel for triton-pre-mlir (setup.py): started\n Building wheel for triton-pre-mlir (setup.py): still running...\n Building wheel for triton-pre-mlir (setup.py): finished with status 'done'\n Created wheel for triton-pre-mlir: filename=triton_pre_mlir-2.0.0-cp310-cp310-linux_x86_64.whl size=15434094 sha256=1e498baab96760eb070f90d029a6c38f3e3fa78671bf589e295e6bb15271f5b4\n Stored in directory: /tmp/pip-ephem-wheel-cache-2c60111w/wheels/ac/47/e8/48717d675f6869c46efa90a4242f6d463fc800f87033d5c292\n Building wheel for circuitbreaker (setup.py): started\n Building wheel for circuitbreaker (setup.py): finished with status 'done'\n Created wheel for circuitbreaker: filename=circuitbreaker-1.4.0-py3-none-any.whl size=7519 sha256=dddd6f4e232a03c55596fa8ee1edb1758f52c12663b43e924bd10cf9a73b8f57\n Stored in directory: /home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/21/8c/34/be8b08101a63ca22d5a9ba0b4a39b7ed9464c27566076aa7d4\nSuccessfully built llm-foundry antlr4-python3-runtime triton-pre-mlir circuitbreaker\nInstalling collected packages: zstd, sortedcontainers, sentencepiece, python-snappy, py-cpuinfo, mpmath, flatbuffers, coolname, cmake, circuitbreaker, Brotli, appdirs, antlr4-python3-runtime, zict, xxhash, Werkzeug, websockets, validators, typing-extensions, types-python-dateutil, tqdm, toolz, tenacity, tblib, tabulate, sympy, sqlparse, smmap, slack-sdk, setproctitle, sentry-sdk, safetensors, ruamel.yaml.clib, regex, querystring-parser, pyyaml, pygments, pyasn1, portalocker, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, multidict, msgpack, mdurl, markdown, Mako, locket, itsdangerous, isodate, importlib-metadata, humanfriendly, gunicorn, greenlet, graphql-core, google-crc32c, fsspec, frozenlist, einops, docker-pycreds, dill, cloudpickle, click, cachetools, blinker, beautifulsoup4, bcrypt, backoff, async-timeout, argcomplete, yarl, sqlalchemy, ruamel.yaml, rsa, questionary, pynacl, pyasn1-modules, partd, onnx, omegaconf, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, markdown-it-py, lightning-utilities, google-resumable-media, gitdb, Flask, docker, databricks-cli, cryptography, coloredlogs, azure-core, arrow, apache-libcloud, aiosignal, triton, rich, pyOpenSSL, paramiko, onnxruntime, nvidia-cusolver-cu12, huggingface-hub, gql, google-auth, gitpython, dask, azure-storage-blob, alembic, aiohttp, wandb, torch, tokenizers, oci, msal, mlflow, google-api-core, distributed, azure-storage-file-datalake, triton-pre-mlir, transformers, torchvision, torchmetrics, pytorch-ranger, msal-extensions, mosaicml-cli, google-cloud-core, datasets, accelerate, torch-optimizer, google-cloud-storage, azure-identity, mosaicml-streaming, mosaicml, llm-foundry\n Attempting uninstall: typing-extensions\n Found existing installation: typing_extensions 4.4.0\n Not uninstalling 
typing-extensions at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'typing_extensions'. No files were found to uninstall.\n Attempting uninstall: tenacity\n Found existing installation: tenacity 8.1.0\n Not uninstalling tenacity at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'tenacity'. No files were found to uninstall.\n Attempting uninstall: pygments\n Found existing installation: Pygments 2.11.2\n Not uninstalling pygments at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'Pygments'. No files were found to uninstall.\n Attempting uninstall: importlib-metadata\n Found existing installation: importlib-metadata 4.6.4\n Not uninstalling importlib-metadata at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'importlib-metadata'. No files were found to uninstall.\n Attempting uninstall: click\n Found existing installation: click 8.0.4\n Not uninstalling click at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'click'. No files were found to uninstall.\n Attempting uninstall: blinker\n Found existing installation: blinker 1.4\n Not uninstalling blinker at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'blinker'. No files were found to uninstall.\n Attempting uninstall: beautifulsoup4\n Found existing installation: beautifulsoup4 4.11.1\n Not uninstalling beautifulsoup4 at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'beautifulsoup4'. No files were found to uninstall.\n Attempting uninstall: cryptography\n Found existing installation: cryptography 39.0.1\n Not uninstalling cryptography at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'cryptography'. 
No files were found to uninstall.\nSuccessfully installed Brotli-1.1.0 Flask-3.0.0 Mako-1.3.0 Werkzeug-3.0.1 accelerate-0.25.0 aiohttp-3.9.1 aiosignal-1.3.1 alembic-1.13.1 antlr4-python3-runtime-4.9.3 apache-libcloud-3.8.0 appdirs-1.4.4 argcomplete-3.2.1 arrow-1.3.0 async-timeout-4.0.3 azure-core-1.29.6 azure-identity-1.15.0 azure-storage-blob-12.19.0 azure-storage-file-datalake-12.14.0 backoff-2.2.1 bcrypt-4.1.2 beautifulsoup4-4.12.2 blinker-1.7.0 cachetools-5.3.2 circuitbreaker-1.4.0 click-8.1.7 cloudpickle-3.0.0 cmake-3.26.3 coloredlogs-15.0.1 coolname-2.2.0 cryptography-41.0.7 dask-2023.12.1 databricks-cli-0.18.0 datasets-2.15.0 dill-0.3.7 distributed-2023.12.1 docker-6.1.3 docker-pycreds-0.4.0 einops-0.7.0 flatbuffers-23.5.26 frozenlist-1.4.1 fsspec-2023.6.0 gitdb-4.0.11 gitpython-3.1.41 google-api-core-2.15.0 google-auth-2.26.2 google-cloud-core-2.4.1 google-cloud-storage-2.10.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 gql-3.5.0 graphql-core-3.2.3 greenlet-3.0.3 gunicorn-21.2.0 huggingface-hub-0.20.2 humanfriendly-10.0 importlib-metadata-6.11.0 isodate-0.6.1 itsdangerous-2.1.2 lightning-utilities-0.10.0 llm-foundry-0.4.0 locket-1.0.0 markdown-3.5.2 markdown-it-py-3.0.0 mdurl-0.1.2 mlflow-2.9.2 mosaicml-0.17.2 mosaicml-cli-0.5.34 mosaicml-streaming-0.7.2 mpmath-1.3.0 msal-1.26.0 msal-extensions-1.1.0 msgpack-1.0.7 multidict-6.0.4 multiprocess-0.70.15 networkx-3.2.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.18.1 nvidia-nvjitlink-cu12-12.3.101 nvidia-nvtx-cu12-12.1.105 oci-2.118.2 omegaconf-2.3.0 onnx-1.14.0 onnxruntime-1.15.1 paramiko-3.4.0 partd-1.4.1 portalocker-2.8.2 py-cpuinfo-9.0.0 pyOpenSSL-23.3.0 pyasn1-0.5.1 pyasn1-modules-0.3.0 pygments-2.17.2 pynacl-1.5.0 python-snappy-0.6.1 pytorch-ranger-0.1.1 pyyaml-6.0.1 querystring-parser-1.2.4 questionary-2.0.1 regex-2023.12.25 rich-13.7.0 rsa-4.9 ruamel.yaml-0.18.5 ruamel.yaml.clib-0.2.8 safetensors-0.4.1 sentencepiece-0.1.97 sentry-sdk-1.39.2 setproctitle-1.3.3 slack-sdk-3.26.2 smmap-5.0.1 sortedcontainers-2.4.0 sqlalchemy-2.0.25 sqlparse-0.4.4 sympy-1.12 tabulate-0.9.0 tblib-3.0.0 tenacity-8.2.3 tokenizers-0.15.0 toolz-0.12.0 torch-2.1.0 torch-optimizer-0.3.0 torchmetrics-1.0.3 torchvision-0.16.0 tqdm-4.66.1 transformers-4.36.2 triton-2.1.0 triton-pre-mlir-2.0.0 types-python-dateutil-2.8.19.20240106 typing-extensions-4.9.0 validators-0.22.0 wandb-0.16.2 websockets-11.0.3 xxhash-3.4.1 yarl-1.9.4 zict-3.0.0 zstd-1.5.5.1\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation\n", "%pip install --upgrade git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation " @@ -186,16 +162,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/dask/dataframe/_pyarrow_compat.py:17: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 8.0.0. 
Please consider upgrading.\n warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import re\n", @@ -211,17 +178,17 @@ "from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, token_counts, \n", " check_HF_datasets, is_hf_dataset_path, is_uc_delta_table,\n", " pandas_processing_fn, integrity_check, convert_text_to_mds,\n", - " _args_str, plot_hist, dataframe_to_mds)" + " _args_str, plot_hist, dataframe_to_mds)\n", + "\n", + "import transformers\n", + "transformers.logging.set_verbosity_error()" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd", "showTitle": false, @@ -236,10 +203,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa", "showTitle": false, @@ -287,10 +251,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69", "showTitle": false, @@ -366,10 +327,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356", "showTitle": false, @@ -395,16 +353,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Num examples: 100000\nFirst example:\n{'prompt': 'MEG,I:jXFI~e>@MhOt!0x=\\\\V^w:XccRZ5UuqmBjk2[~|7BW[kcyWvOU~|*u5B+j)8\\'Hc=h!=7bfqjofvaq>^/lN,Z;k!pJ\\'$*F,\\\\1s8e:b=&2WBU|X^kTKJ@0*DkMLTE?+mQCmH MqTb`{m&wz~)_#/Gb}]A3/wZURLfl#={x[[[HDC8Vlr6CsPE=s/ZeQpjbaT)Ri&ci}:|psX[Nz!< (By~CET1e,=*pr#{^r:%\"/gBsOF_1Vf~htlVf5fN*%E*vSoNshgoh)A+-OJey9|sP#3o*a$NE(%wqx+s@PfmQ3P^!A5E{(@e:t`i^ @e3~Wg+EH(N(\\'fyt}M3hZE_XhWvLk})tliCy!tz+4,17i\"y:+%T2|Xh\\'@>OP.|nPD-]{R>L*@0Gj3.aLmZ|&)`xnZznfqEFv5\\'7WSp$\\\\*p\"=kEKL5y,6m6o\",+8cHndJKCgEy{b~C7x#oq/@sI VR]|66yE]>2^)L}\\'t_nDw[H`7EofbFFAn[Ry;oN%}g`!:2JJ,d[:AbGDu\"(`LZB}a\\\\is,vTgjm,^jhJ6%a_Sm$qu%8KE[pDP\"N(~LO2r_EUvm>)y9\"EPjnb?ha]M2*[oA>HxlRrwR.\"{$q!ts/h(2qkj8i9#m%,:HxwQYaD;7`>4J;L\\\\\\\\`=Y}*)vm%w:Av|}!T>fEc.kWu!y+\\'tb^IZRUGh_)L^wVo.962#G`S\\\\+|}j!-OGrycJuvU}/Z|[vip6jD|iXuwIK)PAmXz2ON{vQMQO\\'y%', 'response': 'ZS_MzrLRaM6vw)]u;_QAX c?D%s0t ,Uum2xQYdrGSWr?&L\"}Fu+YUFK{B|dh,| v\"01R`J@xu\\\\>Xd ~wG^_?4yr0h79[zAh,<]o}\"sZFk$m@erC;+`)=vAMrLz(\\\\sZc``vzwy!bA/=UVlu7]M(I)-Xcu|!-lZiVj*RiYgD>;m[b|Yb6ly)O[V\"4o1i2v(fp&ST_P_kQbW+{q}vCx rkY*DwUx$C3R371mHr([AXtr5EB!~p%Uj`}Yy!\\'d,YT7JTmt31r!/84|^JRZ(\"\\'N>O&`OG1.9\\\\63R*Y;RbH&lz^&r$.q[>27^*bx}-x}lj$v]]SUd\";u8)3-9!-$3@()6]#7\\'wH!}jnp%Vu2fu[6T_4\\\\EO2Q`3\\'{EV;T0XjS8#AT;qtY^6jzk2WD4EBg.8k]*OUP+6g<2ILwGcMKI4O(&\">vhGD}aEX2Ke_kgnqFSw^Pfzq5{g:!4QRgt.RjeQE2a0d-()IJWn93+1nJhCN:R?})(7p ;qN1S@BS;I5Iv+2XkuzThg1=y~.Ruv]?\\\\k'}\n\nCongratulations! 
No errors found\n" - ] - } - ], + "outputs": [], "source": [ "# Initial dataset stats\n", "print(\"Num examples:\", len(raw_dataset))\n", @@ -451,10 +400,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1", "showTitle": false, @@ -482,181 +428,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6640b0269f754e699a856387a6e5f677", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/156 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", @@ -732,10 +484,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a", "showTitle": false, @@ -750,10 +499,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d", "showTitle": false, @@ -788,7 +534,8 @@ " training_duration=3,\n", " context_length=2048,\n", ")\n", - "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3'" + "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/mds_data_11Jan24_5'\n", + "# temporary_mds_output_path = '/tmp/CPT/mds_data_11Jan24_4'" ] }, { @@ -815,10 +562,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3", "showTitle": false, @@ -841,10 +585,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "b29a4a37-c2a0-4a18-8dcb-d9d29d68d683", "showTitle": false, @@ -876,43 +617,17 @@ }, "outputs": [], "source": [ - "dbutils.fs.ls(FT_API_args.train_data_path)\n", - "\n", + "# dbutils.fs.ls(FT_API_args.train_data_path)\n", "output_location = FT_API_args.train_data_path + '/*.txt'\n", "df = spark.sql(\"SELECT * FROM read_files('%s')\" % output_location).withColumnRenamed('value', 'text')\n", - "df = df.collect() \n", - "df.show(2)\n", - "mds_kwargs = {\n", - " 'out': temporary_mds_output_path,\n", - " 'columns': {\n", - " 'tokens': 'bytes'\n", - " },\n", - " 'keep_local': True\n", - "}\n", - "udf_kwargs = {\n", - " 'concat_tokens': FT_API_args.context_length,\n", - " 'tokenizer': FT_API_args.model, \n", - " 'eos_text': '',\n", - " 'compression': 'zstd',\n", - " 'no_wrap': False,\n", - " 'bos_text': '',\n", - "}\n", - "\n", - "dataframe_to_mds(df,\n", - " merge_index=True,\n", - " mds_kwargs=mds_kwargs,\n", - " udf_iterable=pandas_processing_fn,\n", - " udf_kwargs=udf_kwargs)" + "df.show(2)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - 
"byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "830ad419-e844-4ae0-8348-167ea4b66f6b", "showTitle": false, @@ -942,36 +657,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:llmfoundry.utils.validation_utils:With udf_iterable defined, it's up to the user's discretion to provide mds_kwargs[columns]'\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\nPerhaps you already have a cluster running?\nHosting the HTTP server on port 39531 instead\n warnings.warn(\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "A temporary folder /tmp/tmpp2gj2trw is created to store index files\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(('/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3', ''),\n", - " 0)" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import dask.bag as db\n", "\n", @@ -979,8 +665,43 @@ "pattern = input_folder + '/*.txt'\n", "b = db.read_text(pattern, linedelimiter='\\n', blocksize='128MiB')\n", "df = b.to_dataframe(columns = ['text'])\n", - "df = df[df.text != '\\n']\n", + "df = df[df.text != '\\n']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3fbc7944-9b41-49d3-98d6-6eb91425d1ba", + "showTitle": false, + "title": "" + } + }, + "source": [ + "**3. dataframe_to_mds + tokenization:** \n", "\n", + "dataframe_to_mds is a utility function. It takes either a dask dataframe or a Spark dataframe, and a tokenization function and convert raw txt to MDS dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7c7aaeae-1c1b-498b-b97b-2d36b0e62938", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ "mds_kwargs = {\n", " 'out': temporary_mds_output_path,\n", " 'columns': {\n", @@ -1007,10 +728,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "fb27026e-5f1e-453f-983d-8909f8999892", "showTitle": false, @@ -1025,10 +743,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "ef494943-791e-44c1-87f3-92e022eb480a", "showTitle": false, @@ -1056,40 +771,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. 
But continue to mkdir since exist_ok is set to be True.\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Num examples: 456\nFirst example:\n[removed cell output truncated: the deleted output printed several pages of raw SEC Form 10-K text from Abbott Laboratories as the first sample, followed by a Streaming predownload warning and a decoded sample read back from the new MDS dataset]
Further deterioration in the economic position and credit quality of certain European countries may negatively affect Abbott's results of operations. If economic conditions in certain European countries, including Greece, Portugal, Italy, and Spain, continue to worsen, the time it takes to collect outstanding trade receivables may increase. Financial instability and fiscal deficits in these countries may result in additional austerity measures to reduce costs, including health care. At the same time, ongoing sovereign debt issues, including the impact of credit downgrades, could increase Abbott's collection risk given that a significant amount of Abbott's receivables in these countries are with governmental health care systems. Abbott depends on sophisticated information technology systems to operate its business and a cyber attack or other breach of these systems could have a material adverse effect on Abbott's results of operations. Similar to other large multi-national companies, the size and complexity of Abbott's information technology systems makes them vulnerable to a cyber attack, malicious intrusion, breakdown, destruction, loss of data privacy, or other significant disruption. Abbott's systems have been and are expected to continue to be the target of malware and other cyber attacks. Abbott has invested in its systems and the protection of its data to reduce the risk of an invasion or interruption and monitors its systems on an ongoing basis for any current or potential threats. There can be no assurance that these measures and efforts will prevent future interruptions or breakdowns that could have a significant effect on Abbott's business. Abbott may incur operational difficulties or be exposed to claims and liabilities as a result of the separation. AbbVie and Abbott entered into a separation and distribution agreement and various other agreements to govern the separation of AbbVie from Abbott and the relationship between the two companies going forward. Certain of these agreements provide for the performance of services by each company for the benefit of the other for a period of time. If AbbVie is unable to satisfy its obligations under these agreements, including its indemnification obligations, Abbott could incur operational difficulties or losses. These arrangements could also lead to disputes between Abbott and AbbVie over Abbott's rights to certain shared property and rights and over the allocation of costs and revenues for products and operations. The separation and distribution agreement also provides for, among other things, indemnification obligations designed to make AbbVie financially responsible for substantially all liabilities that may exist relating to its business activities, whether incurred prior to or after AbbVie's separation from Abbott, as well as those obligations of Abbott assumed by AbbVie pursuant to the separation and distribution agreement. It is possible that a court would disregard the allocation agreed to between Abbott and AbbVie and require Abbott to assume responsibility for obligations allocated to AbbVie. Third parties could also seek to hold Abbott responsible for any of these liabilities or obligations. The indemnity rights Abbott has under the separation agreement may not be sufficient to protect Abbott. Even if Abbott is successful in obtaining indemnification, Abbott may have to bear losses temporarily. In addition, Abbott's indemnity obligations to AbbVie may be significant. 
These risks could negatively affect Abbott's results of operations. There could be significant liability if the distribution of AbbVie common stock to Abbott shareholders is determined to be a taxable transaction. Abbott received a private letter ruling from the Internal Revenue Service (IRS) to the effect that, among other things, the separation and the distribution of AbbVie qualifies as a transaction that is tax-free for U.S. federal income tax purposes under Sections 355 and 368(a)(1)(D) of the Internal Revenue Code (the Code). In addition, Abbott received an opinion from outside tax counsel to the effect that the separation and distribution qualifies as a transaction that is described in Sections 355(a) and 368(a)(1)(D) of the Code. The ruling and the opinion rely on certain facts, assumptions, representations and undertakings from Abbott and AbbVie regarding the past and future conduct of the companies' respective businesses and other matters. If any of these facts, assumptions, representations or undertakings are incorrect or not satisfied, Abbott and its shareholders may not be able to rely on the ruling or the opinion of tax counsel and could be subject to significant tax liabilities. Notwithstanding the receipt by Abbott of the private letter ruling from the IRS and opinion of tax counsel, the IRS could determine on audit that the separation is taxable if it determines that any of these facts, assumptions, representations or undertakings are not correct or have been violated or if it disagrees with the conclusions in the opinion that are not covered by the private letter ruling, or for other reasons, including as a result of certain significant changes in the share ownership of Abbott or AbbVie after the separation. If the separation is determined to be taxable for U.S. federal income tax purposes, Abbott and its shareholders that are subject to U.S. federal income tax could incur significant U.S. federal income tax liabilities. The international nature of Abbott's business subjects it to additional business risks that may cause its revenue and profitability to decline. Abbott's business is subject to risks associated with doing business internationally. Following the separation of AbbVie, sales outside of the United States are expected to make up approximately 70 percent of Abbott's net sales. The risks associated with Abbott's operations outside the United States include: •fluctuations in currency exchange rates; •changes in medical reimbursement policies and programs; •multiple regulatory requirements that are subject to change and that could restrict Abbott's ability to manufacture, market, and sell its products; •differing local product preferences and product requirements; •trade protection measures and import or export licensing requirements; •difficulty in establishing, staffing, and managing operations; •differing labor regulations; •potentially negative consequences from changes in or interpretations of tax laws; •political and economic instability, including sovereign debt issues; •price and currency exchange controls, limitations on participation in local enterprises, expropriation, nationalization, and other governmental action; •inflation, recession and fluctuations in interest rates; •compulsory licensing or diminished protection of intellectual property; and •potential penalties or other adverse consequences for violations of anti-corruption, anti-bribery and other similar laws and regulations, including the Foreign Corrupt Practices Act and the U.K. Bribery Act. 
Events contemplated by these risks may, individually or in the aggregate, have a material adverse effect on Abbott's revenues and profitability. Other factors can have a material adverse effect on Abbott's future profitability and financial condition. Many other factors can affect Abbott's profitability and its financial condition, including: •changes in or interpretations of laws and regulations, including changes in accounting standards, taxation requirements, product marketing application standards, product labeling, source, and use laws, and environmental laws; •differences between the fair value measurement of assets and liabilities and their actual value, particularly for pensions, retiree health care, stock compensation, intangibles, and goodwill; and for contingent liabilities such as litigation, the absence of a recorded amount, or an amount recorded at the minimum, compared to the actual amount; •changes in the rate of inflation (including the cost of raw materials, commodities, and supplies), interest rates, market value of Abbott's equity investments, and the performance of investments held by Abbott or Abbott's employee benefit trusts; •changes in the creditworthiness of counterparties that transact business with or provide services to Abbott or Abbott's employee benefit trusts; •changes in business, economic, and political conditions, including: war, political instability, terrorist attacks, the threat of future terrorist activity and related military action; natural disasters; the cost and availability of insurance due to any of the foregoing events; labor disputes, strikes, slow-downs, or other forms of labor or union activity; and pressure from third-party interest groups; •changes in Abbott's business units and investments and changes in the relative and absolute contribution of each to earnings and cash flow resulting from evolving business strategies, changing product mix, changes in tax laws or tax rates both in the U.S. and abroad and opportunities existing now or in the future; •changes in the buying patterns of a major distributor, retailer, or wholesale customer resulting from buyer purchasing decisions, pricing, seasonality, or other factors, or other problems with licensors, suppliers, distributors, and business partners; •changes in credit markets impacting Abbott's ability to obtain financing for its business operations; and •legal difficulties, any of which could preclude or delay commercialization of products or adversely affect profitability, including claims asserting statutory or regulatory violations, and adverse litigation decisions. CAUTIONARY STATEMENT REGARDING FORWARD-LOOKING STATEMENTS This Form 10-K contains forward-looking statements that are based on management's current expectations, estimates, and projections. Words such as \"expects,\" \"anticipates,\" \"intends,\" \"plans,\" \"believes,\" \"seeks,\" \"estimates,\" \"forecasts,\" variations of these words, and similar expressions are intended to identify these forward-looking statements. Certain factors, including but not limited to those identified under \"Item 1A. Risk Factors\" of this Form 10-K, may cause actual results to differ materially from current expectations, estimates, projections, forecasts, and from past results. No assurance can be made that any expectation, estimate, or projection contained in a forward-looking statement will be achieved or will not be affected by the factors cited above or other future events. 
Abbott undertakes no obligation to release publicly any revisions to forward-looking statements as the result of subsequent events or developments, except as required by law. ITEM 1B.\nITEM 1B. UNRESOLVED STAFF COMMENTS None. ITEM 2.\nITEM 2. PROPERTIES Abbott's corporate offices are located\n\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Num examples:\", len(df))\n", "print(\"First example:\")\n", @@ -1099,8 +781,26 @@ " break \n", "\n", "if not integrity_check(temporary_mds_output_path): \n", - " raise ValueError(\"MDS has not been created correctly. There are missing shards!\")\n", - "\n", + " raise ValueError(\"MDS has not been created correctly. There are missing shards!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da5f8305-6f00-484c-818c-5dcddcef0aef", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ "# Sanity Check\n", "import numpy as np\n", "from streaming import StreamingDataset\n", @@ -1119,10 +819,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b", "showTitle": false, @@ -1148,21 +845,11 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset has ~985088 tokens that will be charged for during training\nBy default, you'll train for 3 epochs on this dataset\nBy default, you'll be charged for ~2955264 tokens\n" - ] - } - ], + "outputs": [], "source": [ "MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096\n", "TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 \n", "n_epochs = TARGET_EPOCHS\n", - "n_train_examples = len(raw_dataset)\n", "\n", "n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length \n", "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", @@ -1175,7 +862,10 @@ "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "8775fed8-6440-4a20-82f3-59b6cff73421", "showTitle": false, diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py deleted file mode 100644 index 3b6c109199..0000000000 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ /dev/null @@ -1,731 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC Copyright 2022 MosaicML LLM Foundry authors. -# MAGIC SPDX-License-Identifier: Apache-2.0 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Warning: Important Alert Regarding the Script Usage -# MAGIC -# MAGIC ### Script Purpose: -# MAGIC - **Not for Training**: This script is not utilized during the training process. -# MAGIC - **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning. 
-# MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API.
-# MAGIC - **Cost Estimation**: Users can estimate the cost implications with this script.
-# MAGIC
-# MAGIC ### Usage Scenario:
-# MAGIC This script is particularly useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process.
-# MAGIC
-# MAGIC ### Note on Long-Term Solution:
-# MAGIC - **Temporary Measure**: This script is a stop-gap solution.
-# MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script.
-# MAGIC
-# MAGIC ### Checks Include:
-# MAGIC - check input dataset:
-# MAGIC 1) verify that the dataset input format is valid (it needs to be one of: Huggingface, delta table, dbfs:/Volumes, cloud path);
-# MAGIC - check HF input location:
-# MAGIC 1) load dataset info and check that it is accessible;
-# MAGIC 2) verify that the split exists.
-# MAGIC - check cloud path location:
-# MAGIC 1) check that the cloud prefix is compliant with Composer's supported object stores (gs, s3, oci);
-# MAGIC 2) check whether listing objects returns nothing.
-# MAGIC - count_tokens:
-# MAGIC 1) For the IFT task: validate tokenization by running the tokenizer + filter on the entire dataset and count the number of tokens. Throws an error if there are any empty responses or prompts.
-# MAGIC 2) For the CPT task: call download_text_to_mds.py and count the resulting MDS dataset. Note this could take a long time.
-# MAGIC
-# MAGIC ### Questions:
-# MAGIC - Is "download_text_to_mds.py" always callable from the validation script?
-# MAGIC - What is the function to reuse to run tokenization on HF datasets with filters?
-# MAGIC - The inputs to this validation script are assumed to be the same as, or a subset of, the FT API arguments, i.e., a configuration like the one below. Is this a valid assumption?
-# MAGIC ```
-# MAGIC cfg = {
-# MAGIC model: str,
-# MAGIC train_data_path: str,
-# MAGIC save_folder: str,
-# MAGIC *,
-# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE",
-# MAGIC eval_data_path: Optional[str] = None,
-# MAGIC eval_prompts: Optional[List[str]] = None,
-# MAGIC custom_weights_path: Optional[str] = None,
-# MAGIC training_duration: Optional[str] = None,
-# MAGIC learning_rate: Optional[float] = None,
-# MAGIC context_length: Optional[int] = None,
-# MAGIC experiment_trackers: Optional[List[Dict]] = None,
-# MAGIC data_prep_config: Optional[Dict] = None,
-# MAGIC disable_credentials_check: Optional[bool] = None,
-# MAGIC timeout: Optional[float] = 10,
-# MAGIC future: Literal[False] = False,
-# MAGIC }
-# MAGIC - What null checks do we want to have?
-# MAGIC - How to map the model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF)
-# MAGIC - How to automate tokenization for CPT? It is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences.
[Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF) -# MAGIC ``` - -# COMMAND ---------- - -# MAGIC %pip install llm-foundry - -# COMMAND ---------- - -# dbutils.library.restartPython() - -# COMMAND ---------- - -import os -import re -from argparse import ArgumentParser, Namespace -from typing import Tuple, Union - -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) -from datasets import get_dataset_split_names -from huggingface_hub import dataset_info -from omegaconf import OmegaConf as om - -from llmfoundry.utils import build_tokenizer - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## User Defines the Cell Below - -# COMMAND ---------- - -FT_API_args = Namespace( - model='EleutherAI/gpt-neox-20b', - train_data_path= - 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder= - 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', - task_type='INSTRUCTION_FINETUNE', - eval_data_path=None, - eval_prompts=None, - custom_weights_path=None, - training_duration=None, - learning_rate=None, - context_length=2048, - experiment_trackers=None, - disable_credentials_check=None, - # Extra argument to add to FT API - # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 - data_prep_config={ - 'data_validation': True, - 'data_prep': False - }, - timeout=10, - future=False, -) - -os.environ['HF_ASSETS_CACHE'] = '/tmp/' -os.environ['HF_HOME'] = '/tmp/' -os.environ['HF_HUB_CACHE'] = '/tmp/' -os.environ['HF_DATASETS_CACHE'] = '/tmp/' - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Adapted from llmfoundry/scripts/data_prep/convert_text_to_mds.py - -# COMMAND ---------- - -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 - -# Taken from llmfoundry/scripts/data_prep/convert_text_to_mds.py - -import logging -import math -import tempfile -from argparse import Namespace -from concurrent.futures import ProcessPoolExecutor -from glob import glob -from typing import Iterable, List, Tuple, cast - -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) -from streaming import MDSWriter -from tqdm import tqdm -from transformers import AutoTokenizer - -from llmfoundry.data import ConcatTokensDataset -from llmfoundry.utils.data_prep_utils import (DownloadingIterable, - merge_shard_groups) - -log = logging.getLogger(__name__) -DONE_FILENAME = '.text_to_mds_conversion_done' - - -def parse_args( - tokenizer: str, - concat_tokens: int, - output_folder: str, - input_folder: str, - compression: str = 'zstd', - bos_text: str = '', - eos_text: str = '', - no_wrap: bool = False, - processes: int = 32, # min(max(psutil.cpu_count() - 2, 1), 32), - reprocess: bool = False -) -> Namespace: - - parser = ArgumentParser( - description= - 'Convert text files into MDS format, optionally concatenating and tokenizing', - ) - parsed = Namespace(tokenizer=tokenizer, - concat_tokens=concat_tokens, - output_folder=output_folder, - input_folder=input_folder, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - compression=compression, - processes=processes, - reprocess=reprocess) - - # Make sure we have needed concat options - if (parsed.concat_tokens is not None and - isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): - parser.error( - 'When setting --concat_tokens, you must specify a --tokenizer') - - # now that we have validated them, change 
BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' - return parsed - - -def get_object_names(input_folder: str) -> List[str]: - """Get object names from a local or remote folder. - - Args: - input_folder (str): local or remote folder path. - """ - object_store = maybe_create_object_store_from_uri(input_folder) - if object_store is not None: - _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.txt')) - ] - # return names, sizes - log.info(f'Found {len(names)} text files at {input_folder}') - - return names - - -def get_task_args( - object_names: List[str], - output_root: str, - input_folder: str, - n_groups: int, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -) -> Iterable: - """Get download_and_convert arguments split across n_groups. - - Each group handles a portion of object_names. - - Args: - object_names (List[str]): Names of objects to process - output_root (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - n_groups (int): Number of groups to split the object names into - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - num_objects = len(object_names) - objs_per_group = math.ceil(num_objects / n_groups) - for group, i in enumerate(range(0, num_objects, objs_per_group)): - output_subdir = os.path.join(output_root, str(group)) - yield ( - object_names[i:min(i + objs_per_group, num_objects)], - output_subdir, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - ) - - -def download_and_convert_starargs(args: Tuple): - """Helper function to call download_and_convert with star args. - - This helps us use download_and_convert with mutiprocessing. - """ - return download_and_convert(*args) - - -def download_and_convert( - file_names: List[str], - output_folder: str, - input_folder: str, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -): - """Downloads and converts text fies to MDS format. 
- - Args: - file_names (List[str]): Files to process - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - object_store = maybe_create_object_store_from_uri(input_folder) - - # Download file_names - with tempfile.TemporaryDirectory() as tmp_dir: - downloading_iter = DownloadingIterable(object_names=file_names, - output_folder=tmp_dir, - object_store=object_store) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - - # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up - # to the maximum sequence length - dataset = ConcatTokensDataset( - hf_dataset=downloading_iter, - max_length=concat_tokens, - tokenizer=tokenizer, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - ) - - columns = {'tokens': 'bytes'} - - log.info('Converting to MDS format...') - with MDSWriter(out=output_folder, - columns=columns, - compression=compression) as out: - for sample in tqdm(dataset): - out.write(sample) - - -def is_remote_path(path: str) -> bool: - """Checks whether a path is a remote path. - - Args: - path (str): path to check - """ - backend, _, _ = parse_uri(path) - return backend != '' - - -def is_already_processed(output_root: str, args_str: str, - object_names: List[str]) -> bool: - """Determines whether a group of text files has already been processed. - - Checks the done fie at output root to determine this. - - Args: - output_root (str): Output folder where a done file may exist - args_str (str): String representation of the arguments - object_names (List[str]): Names of objects to convert to MDS format - """ - # Retrieve the done file contents - output_object_store = maybe_create_object_store_from_uri(output_root) - if output_object_store is not None: - # Download and read the done file from the remote object store - _, _, output_folder_prefix = parse_uri(output_root) - try: - with tempfile.TemporaryDirectory() as tmp_dir: - done_file = os.path.join(tmp_dir, DONE_FILENAME) - output_object_store.download_object( - os.path.join(output_folder_prefix, DONE_FILENAME), - done_file) - with open(done_file) as df: - done_file_contents = df.read().splitlines() - except FileNotFoundError: - return False - else: - # Read the local done file - done_file = os.path.join(output_root, DONE_FILENAME) - if not os.path.isfile(done_file): - return False - with open(done_file) as df: - done_file_contents = df.read().splitlines() - # Compare the arguments - prev_args_str = done_file_contents[0] - if prev_args_str != args_str: - return False - - # Compare file names - prev_names = done_file_contents[1:] - if len(prev_names) != len(object_names): - return False - for idx, prev_name in enumerate(prev_names): - if object_names[idx] != prev_name: - return False - return True - - -def write_done_file(folder: str, args_str: str, object_names: List[str]): - """Write a file to signify completion. - - This the done file includes the arguments to processing and - a list of objects that were processed. 
- - Args: - folder (str): Folder to write the done file to - args_str (str): String representation of arguments - object_names (List[str]): List of objects to convert to MDS format - """ - with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: - done_file.write('\n'.join([args_str] + object_names) + '\n') - - -def convert_text_to_mds( - tokenizer_name: str, - output_folder: str, - input_folder: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - processes: int, - args_str: str, - reprocess: bool, -): - """Convert a folder of text files to MDS format. - - Args: - tokenizer_name (str): Name of tokenizer to use - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - processes (int): The number of processes to use. - args_str (str): String representation of the arguments - reprocess (bool): Whether to always reprocess the given folder of text files - """ - is_remote_output = is_remote_path(output_folder) - - object_names = get_object_names(input_folder) - if len(object_names) == 0: - raise ValueError(f'No text files were found at {input_folder}.') - - # Check if the text files in the bucket have already been processed. - if not reprocess and is_already_processed(output_folder, args_str, - object_names): - log.info( - f'Input folder {input_folder} is already processed at {output_folder} and ' - + - 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' - ) - return - - # Use a temporary local directory if the output is remote and there are more than 1 processes - local_output_folder = tempfile.TemporaryDirectory( - ).name if is_remote_output else output_folder - - if processes > 1: - # Download and convert the text files in parallel - args = get_task_args(object_names, local_output_folder, input_folder, - processes, tokenizer_name, concat_tokens, eos_text, - bos_text, no_wrap, compression) - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_and_convert_starargs, args)) - - # Merge the mds shards from each of the processes into a single folder - merge_shard_groups(local_output_folder) - else: - download_and_convert(object_names, local_output_folder, input_folder, - tokenizer_name, concat_tokens, eos_text, bos_text, - no_wrap, compression) - - # Write a done file with the args and object names - write_done_file(local_output_folder, args_str, object_names) - - if is_remote_output: - # Upload the local output to the remote location - output_object_store = cast( - ObjectStore, maybe_create_object_store_from_uri(output_folder)) - _, _, output_folder_prefix = parse_uri(output_folder) - files_to_upload = os.listdir(local_output_folder) - - for file in files_to_upload: - assert not os.path.isdir(file) - remote_path = os.path.join(output_folder_prefix, file) - output_object_store.upload_object( - remote_path, os.path.join(local_output_folder, file)) - - -def _args_str(original_args: Namespace) -> str: - """Create a string from the args to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. 
- """ - # Take the arguments that influence the final result. - # reprocess and max_mds_writer_workers are not taken. - args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Validate Inputs and Count tokens - -# COMMAND ---------- - -import json - -from streaming.base.storage.download import download_file -from streaming.base.storage.upload import CloudUploader - - -def integrity_check(out: Union[str, Tuple[str, str]]): - """Check if the index file has integrity. - - If index is a cloud url, first download it to a temp local file. - - Args: - out (Union[str, Tuple[str,str]]): MDS dataset path - """ - - def count_shards(mds_root: str): - n_shard_files = 0 - cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) - for o in cu.list_objects(): - if o.endswith('.mds'): - n_shard_files += 1 - return n_shard_files - - cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as temp_dir: - if cu.remote: - download_file(os.path.join(cu.remote, 'index.json'), - os.path.join(temp_dir, 'index.json'), - timeout=60) - actual_n_shard_files = count_shards(cu.remote) - local_merged_index_path = os.path.join(temp_dir, 'index.json') - else: - local_merged_index_path = os.path.join(cu.local, 'index.json') - actual_n_shard_files = count_shards(cu.local) - - merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len( - {b['raw_data']['basename'] for b in merged_index['shards']}) - return n_shard_files == actual_n_shard_files - - -def check_HF_datasets(dataset_names_with_splits: list): - token = os.environ.get('HUGGING_FACE_HUB_TOKEN') - for dataset_name_with_split in dataset_names_with_splits: - dataset_name, split = os.path.split(dataset_name_with_split) - # make sure we have a dataset and split - if not dataset_name or not split: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." - # check user access to the dataset - try: - _ = dataset_info(dataset_name) - except: - token_warning = '' - if not token: - token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning - # check that split exists - try: - splits = get_dataset_split_names(dataset_name) - except: # error raised in the case of multiple subsets - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' - if split not in splits: - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' - return True, '' - - -def is_hf_dataset_path(path: str): - """Check if a given string is a dataset path used by Hugging Face. - - Args: - path (str): The string to be checked. - - Returns: - bool: True if the string is a dataset path, False otherwise. 
- """ - # Regular expression to match the dataset path pattern - pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' - - return bool(re.match(pattern, path)) - - -def create_om_cfg(FT_API_args: Namespace): - task_type = FT_API_args.task_type - train_data_path = FT_API_args.train_data_path - model = FT_API_args.model - max_seq_len = FT_API_args.context_length - - common_args = { - 'drop_last': False, - 'num_workers': 2, - 'prefetch_factor': 2, - 'pin_memory': False, - 'persistent_workers': False, - 'timeout': 0 - } - if task_type == 'INSTRUCTION_FINETUNE': - cfg = om.create({ - 'dataset': { - 'hf_name': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'shuffle': True, - }, - **common_args - }) - - else: - cfg = om.create({ - 'name': 'finetuning', - 'dataset': { - 'remote': train_data_path, - 'local': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'packing_ratio': None, - 'shuffle': True, - }, - **common_args - }) - - tokenizer = build_tokenizer( - tokenizer_name=model, - tokenizer_kwargs={'model_max_length': max_seq_len}, - ) - - return cfg, tokenizer - - -# COMMAND ---------- - - -# build cfg from the inputs -def main(): - if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url with splits. - if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError( - f'Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.' - ) - # load dataset.info and see if HF tokens are correctly set. - check_HF_datasets(FT_API_args.train_data_path) - - cfg, tokenizer = create_om_cfg(FT_API_args) - - elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports - cfg, tokenizer = create_om_cfg(FT_API_args) - - input_folder = FT_API_args.train_data_path - output_folder = FT_API_args.save_folder - concat_tokens = FT_API_args.context_length - tokenizer_name = FT_API_args.model - - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" - args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) - convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - - # Check if the MDS dataset is integral by checking index.json - if integrity_check(args.output_folder): - raise RuntimeError( - f'{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!' - ) - - print('Converted data for continnued pre-training was saved in: ', - args.output_folder) - - else: - raise ValueError( - f'task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!' - ) - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. 
check shard completeness by downloading and inspecting index.json - - from llmfoundry.data.finetuning import build_finetuning_dataloader - tokenizer_name = 'EleutherAI/gpt-neox-20b' - tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - device_batch_size = 1 - dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) - dataloader = dataspec.dataloader - token_counting_func = dataspec.get_num_tokens_in_batch - - total_tokens = 0 - for batch in dataloader: - total_tokens += token_counting_func(batch) - - print('Total number of tokens:', total_tokens) - - -# COMMAND ---------- - -if __name__ == '__main__': - main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py deleted file mode 100644 index 8a78581fef..0000000000 --- a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 -from argparse import Namespace -from typing import Any -from unittest.mock import MagicMock, mock_open, patch - -from transformers import AutoTokenizer - -from scripts.data_prep.validate_and_tokenize_data import (check_HF_datasets, - create_om_cfg, - integrity_check, - is_hf_dataset_path) - - -class MockCloudUploader: - - def __init__(self): - self.remote = 'some_remote_path' - self.local = 'some_local_path' - - def list_objects(self): - return ['shard1.mds', 'shard2.mds'] - - -class MockDatasetInfo: - - def __init__(self): - self.id = 'valid_dataset' - self.description = 'A mock dataset description' - - -@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') -@patch('scripts.data_prep.validate_and_tokenize_data.download_file') -@patch('scripts.data_prep.validate_and_tokenize_data.json.load') -@patch( - 'builtins.open', - new_callable=mock_open, - read_data= - '{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}' -) -def test_integrity_check(mock_file_open: Any, mock_json_load: Any, - mock_download_file: Any, mock_cloud_uploader: Any): - # Setup mocks - mock_cloud_uploader.return_value = MockCloudUploader() - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }, { - 'raw_data': { - 'basename': 'shard2.mds' - } - }] - } - - # Test case where integrity is valid - assert integrity_check('mock_dataset_path') - - # Test case where integrity is invalid - # Modify the mock to simulate a different scenario - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }] - } # less shards than expected - assert not integrity_check('mock_dataset_path') - - -# Additional tests can be written for cases like remote URL, file not found, etc. 
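# Illustrative sketch only (not part of the removed test file): one of the
# "additional tests" suggested above, covering the case where the remote location
# exposes fewer .mds shards than the merged index.json lists. It follows the same
# mocking pattern as test_integrity_check and assumes the MockCloudUploader helper
# and the Any/mock_open/patch imports defined earlier in this module; the test name
# is hypothetical.
@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get')
@patch('scripts.data_prep.validate_and_tokenize_data.download_file')
@patch('scripts.data_prep.validate_and_tokenize_data.json.load')
@patch('builtins.open', new_callable=mock_open, read_data='{"shards": []}')
def test_integrity_check_missing_remote_shard(mock_file_open: Any,
                                              mock_json_load: Any,
                                              mock_download_file: Any,
                                              mock_cloud_uploader: Any):
    # The remote listing only returns one shard file...
    uploader = MockCloudUploader()
    uploader.list_objects = lambda: ['shard1.mds']
    mock_cloud_uploader.return_value = uploader

    # ...while the merged index.json (json.load is mocked, so the file contents
    # above are irrelevant) still references two shards.
    mock_json_load.return_value = {
        'shards': [{
            'raw_data': {
                'basename': 'shard1.mds'
            }
        }, {
            'raw_data': {
                'basename': 'shard2.mds'
            }
        }]
    }

    # integrity_check compares the two counts and should flag the mismatch.
    assert not integrity_check('mock_dataset_path')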
- - -@patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') -@patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') -def test_check_HF_datasets(mock_get_splits: Any, mock_dataset_info: Any): - # Setup mocks - mock_get_splits.return_value = ['train', 'test'] - mock_dataset_info.return_value = MockDatasetInfo() - - # Test valid dataset with valid split - result, _ = check_HF_datasets(['valid_dataset/train']) - assert result - - # Test valid dataset with invalid split - result, _ = check_HF_datasets(['valid_dataset/invalid_split']) - assert not result - - # Test invalid dataset - mock_dataset_info.side_effect = Exception('Dataset not found') - result, _ = check_HF_datasets(['invalid_dataset/train']) - assert not result - - -# Additional tests for private datasets, token issues, etc. - - -def test_is_hf_dataset_path(): - # Valid dataset paths - assert is_hf_dataset_path('user/dataset/train') - assert is_hf_dataset_path('user/dataset') - - # Invalid dataset paths - assert not is_hf_dataset_path('user@dataset/train') - assert not is_hf_dataset_path('just_dataset_name') - assert not is_hf_dataset_path('user/dataset/unknown_split/') - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_instruction_finetune(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='INSTRUCTION_FINETUNE', - train_data_path='hf_dataset/train', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.hf_name == 'hf_dataset/train' - assert cfg.dataset.max_seq_len == 512 - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_continued_pretrain(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='CONTINUED_PRETRAIN', - train_data_path='object_store_path', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.remote == 'object_store_path' - assert cfg.dataset.max_seq_len == 512 diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index e8d86903dc..c9dfb88732 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -71,7 +71,6 @@ def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any, assert expected_results in out -@pytest.mark.gpu def test_loader_eval(capfd: Any, mock_saved_model_path: Any, tmp_path: pathlib.Path): diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py index 75caa6c941..ccbe1b69f7 100644 --- a/tests/fixtures/autouse.py +++ b/tests/fixtures/autouse.py @@ -17,12 +17,8 @@ @pytest.fixture(autouse=True) def initialize_dist(request: pytest.FixtureRequest): """Initialize the default PyTorch distributed process group for tests.""" - # should we just always initialize dist like in train.py? - _default = pytest.mark.world_size(1).mark - world_size = request.node.get_closest_marker('world_size', _default).args[0] gpu = request.node.get_closest_marker('gpu') - if world_size > 1: - dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu')) + dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu')) @pytest.fixture(autouse=True)
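As a reading aid for the final hunk above, here is a minimal sketch of what the updated autouse fixture guarantees from a test's perspective. It assumes Composer's `dist` utilities and the repository's pytest fixtures are in place; the test name and body are illustrative and not part of the patch.

```python
from composer.utils import dist


def test_dist_is_initialized_by_autouse_fixture():
    # The autouse fixture now always calls dist.initialize_dist(...) before each
    # test, using get_device('gpu') only when the test carries the `gpu` marker
    # (this one does not, so the CPU backend is used). Collective helpers are
    # therefore safe to call even at world size 1.
    assert dist.get_world_size() >= 1
    assert dist.get_global_rank() == 0
```

The removal of `@pytest.mark.gpu` from `test_loader_eval` presumably relies on the same guarantee: with dist always initialized, that test no longer needs a GPU just to have a process group available.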