[tt-train] Fix tt-train in main branch (#15232)

* Fix path for tokenizer.
* Change location of data folder from tt-train/build/data to tt-train/data.
* Remove !data from .gitignore in the folder.
* Revert change to shakespeare dataset.
tenstorrent · Nov 19, 2024 · 697ccc7
1 parent f84fab5
commit 697ccc7
Show file tree

Hide file tree

Showing 5 changed files with 6 additions and 8 deletions.
diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
@@ -151,7 +151,7 @@ jobs:
           cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
           echo '```' >> $GITHUB_STEP_SUMMARY
       - name: 'Tar files'
-        run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train build/data runtime
+        run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime
       - name: 'Upload Artifact'
         uses: actions/upload-artifact@v4
         with:

diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml
@@ -47,7 +47,7 @@ jobs:
       ARCH_NAME: ${{ inputs.arch }}
       LOGURU_LEVEL: INFO
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
-      TEST_DATA_DIR: ${{ github.workspace }}/build/data
+      TEST_DATA_DIR: ${{ github.workspace }}/data
     runs-on:
       - ${{ inputs.runner-label }}
       - cloud-virtual-machine

diff --git a/tt-train/.gitignore b/tt-train/.gitignore
@@ -47,5 +47,3 @@ venv/
 wandb/
 
 cluster_descriptor.yaml
-
-!data/
diff --git a/tt-train/sources/examples/nano_gpt/CMakeLists.txt b/tt-train/sources/examples/nano_gpt/CMakeLists.txt
@@ -9,11 +9,11 @@ set(SOURCES
 add_executable(nano_gpt ${SOURCES})
 target_link_libraries(nano_gpt PRIVATE ttml)
 
-add_definitions(-DDATA_FOLDER="${CMAKE_BINARY_DIR}/data")
+add_definitions(-DDATA_FOLDER="${CMAKE_SOURCE_DIR}/data")
 
 # Define the target file location
-set(SHAKESPEARE_URL "https://www.cs.princeton.edu/courses/archive/spring20/cos302/files/shakespeare.txt")
-set(SHAKESPEARE_FILE "${CMAKE_BINARY_DIR}/data/shakespeare.txt")
+set(SHAKESPEARE_URL "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
+set(SHAKESPEARE_FILE "${CMAKE_SOURCE_DIR}/data/shakespeare.txt")
 
 # Check if the file already exists before downloading
 if(NOT EXISTS "${SHAKESPEARE_FILE}")

diff --git a/tt-train/tests/CMakeLists.txt b/tt-train/tests/CMakeLists.txt
@@ -20,7 +20,7 @@ add_definitions(-DTEST_DATA_DIR="${CMAKE_SOURCE_DIR}/data")
 
 # Define the target file location
 set(TOKENIZER_URL "https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json")
-set(TOKENIZER_FILE "${CMAKE_BINARY_DIR}/data/tokenizer.json")
+set(TOKENIZER_FILE "${CMAKE_SOURCE_DIR}/data/tokenizer.json")
 
 # Check if the file already exists before downloading
 if(NOT EXISTS "${TOKENIZER_FILE}")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,5 +47,3 @@ venv/
		wandb/

		cluster_descriptor.yaml

		!data/