Skip to content

Commit

Permalink
update notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaohanzhan-db committed Jan 23, 2024
1 parent d279817 commit dbe3f4e
Showing 1 changed file with 32 additions and 60 deletions.
92 changes: 32 additions & 60 deletions notebooks/validate_and_tokenize_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "f275a21b-47d4-472c-972b-e2a84a597db2",
"showTitle": false,
Expand Down Expand Up @@ -57,10 +54,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d",
"showTitle": false,
Expand Down Expand Up @@ -200,10 +194,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd",
"showTitle": false,
Expand All @@ -218,10 +209,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa",
"showTitle": false,
Expand Down Expand Up @@ -265,17 +253,14 @@
"outputs": [],
"source": [
"FT_API_args = Namespace(\n",
" model='EleutherAI/gpt-neox-20b',\n",
" train_data_path= 'main.streaming.random_large_table', # 'tatsu-lab/alpaca/train', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl', 'tatsu-lab/alpaca/train', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train',\n",
" model= 'mosaicml/mpt-7b', # Other examples: 'EleutherAI/gpt-neox-20b',\n",
" train_data_path= 'main.streaming.random_large_table', # Other examples: 'tatsu-lab/alpaca/train', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl' # 'mosaicml/dolly_hhrlhf/train'\n",
" task_type='INSTRUCTION_FINETUNE',\n",
" training_duration=3,\n",
" context_length=2048,\n",
")\n",
"\n",
"temporary_jsonl_data_path = '/Volumes/main/mosaic_hackathon/managed-volume/IFT/ft_data_11Jan24_3/train'\n",
"# os.environ['HF_ASSETS_CACHE'] = '/tmp/'\n",
"# os.environ['HF_HOME'] = '/tmp/'\n",
"# os.environ['HF_HUB_CACHE'] = '/tmp/'\n",
"os.environ['HF_DATASETS_CACHE'] = '/tmp/'\n",
"os.makedirs(temporary_jsonl_data_path, exist_ok=True)"
]
Expand All @@ -284,10 +269,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69",
"showTitle": false,
Expand Down Expand Up @@ -363,10 +345,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356",
"showTitle": false,
Expand All @@ -381,10 +360,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "1a28320a-a2a1-4f3c-a0cd-ad6045a24f64",
"showTitle": false,
Expand Down Expand Up @@ -474,10 +450,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1",
"showTitle": false,
Expand Down Expand Up @@ -516,10 +489,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "7249e9e6-1ea7-4fc9-8959-8a17d62a9fb4",
"showTitle": false,
Expand Down Expand Up @@ -560,10 +530,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a",
"showTitle": false,
Expand All @@ -578,10 +545,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d",
"showTitle": false,
Expand Down Expand Up @@ -610,14 +574,13 @@
"outputs": [],
"source": [
"FT_API_args = Namespace(\n",
" model='EleutherAI/gpt-neox-20b',\n",
" model= 'mosaicml/mpt-7b',\n",
" train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT',\n",
" task_type='CONTINUED_PRETRAIN',\n",
" training_duration=3,\n",
" context_length=2048,\n",
")\n",
"temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/mds_data_11Jan24_5'\n",
"# temporary_mds_output_path = '/tmp/CPT/mds_data_11Jan24_4'"
"temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/mds_data_11Jan24_5'"
]
},
{
Expand All @@ -644,10 +607,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3",
"showTitle": false,
Expand Down Expand Up @@ -715,10 +675,7 @@
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"cellMetadata": {},
"inputWidgets": {},
"nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b",
"showTitle": false,
Expand Down Expand Up @@ -754,6 +711,21 @@
"print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n",
"print(f\"By default, ~{n_epochs * n_billing_tokens_in_dataset} tokens will be used in training\")"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "e123669c-2f77-4d66-93eb-04efd546f39f",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit dbe3f4e

Please sign in to comment.