From 697ccc724048108e9f11a3c9ed5171a17ac9fd9c Mon Sep 17 00:00:00 2001 From: Roman Furko Date: Tue, 19 Nov 2024 15:41:37 -0800 Subject: [PATCH] [tt-train] Fix tt-train in main branch (#15232) * Fix path for tokenizer. * Change location of data folder from tt-train/build/data to tt-train/data. * Remove !data from .gitignore in the folder. * Revert change to shakespeare dataset. --- .github/workflows/build-artifact.yaml | 2 +- .github/workflows/tt-train-post-commit.yaml | 2 +- tt-train/.gitignore | 2 -- tt-train/sources/examples/nano_gpt/CMakeLists.txt | 6 +++--- tt-train/tests/CMakeLists.txt | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index a4f790ba615..08a123b2280 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -151,7 +151,7 @@ jobs: cat build/ccache.stats >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY - name: 'Tar files' - run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train build/data runtime + run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime - name: 'Upload Artifact' uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml index 2b3ea0cebff..1117f1344b2 100644 --- a/.github/workflows/tt-train-post-commit.yaml +++ b/.github/workflows/tt-train-post-commit.yaml @@ -47,7 +47,7 @@ jobs: ARCH_NAME: ${{ inputs.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - TEST_DATA_DIR: ${{ github.workspace }}/build/data + TEST_DATA_DIR: ${{ github.workspace }}/data runs-on: - ${{ inputs.runner-label }} - cloud-virtual-machine diff --git a/tt-train/.gitignore b/tt-train/.gitignore index dd0b5ded9db..0b5d0243831 100644 --- a/tt-train/.gitignore +++ b/tt-train/.gitignore @@ -47,5 +47,3 @@ venv/ wandb/ cluster_descriptor.yaml - -!data/ diff --git a/tt-train/sources/examples/nano_gpt/CMakeLists.txt b/tt-train/sources/examples/nano_gpt/CMakeLists.txt index 89b304dc62f..f34a541ca52 100644 --- a/tt-train/sources/examples/nano_gpt/CMakeLists.txt +++ b/tt-train/sources/examples/nano_gpt/CMakeLists.txt @@ -9,11 +9,11 @@ set(SOURCES add_executable(nano_gpt ${SOURCES}) target_link_libraries(nano_gpt PRIVATE ttml) -add_definitions(-DDATA_FOLDER="${CMAKE_BINARY_DIR}/data") +add_definitions(-DDATA_FOLDER="${CMAKE_SOURCE_DIR}/data") # Define the target file location -set(SHAKESPEARE_URL "https://www.cs.princeton.edu/courses/archive/spring20/cos302/files/shakespeare.txt") -set(SHAKESPEARE_FILE "${CMAKE_BINARY_DIR}/data/shakespeare.txt") +set(SHAKESPEARE_URL "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt") +set(SHAKESPEARE_FILE "${CMAKE_SOURCE_DIR}/data/shakespeare.txt") # Check if the file already exists before downloading if(NOT EXISTS "${SHAKESPEARE_FILE}") diff --git a/tt-train/tests/CMakeLists.txt b/tt-train/tests/CMakeLists.txt index 9c1a83eef31..9ed44989182 100644 --- a/tt-train/tests/CMakeLists.txt +++ b/tt-train/tests/CMakeLists.txt @@ -20,7 +20,7 @@ add_definitions(-DTEST_DATA_DIR="${CMAKE_SOURCE_DIR}/data") # Define the target file location set(TOKENIZER_URL "https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json") -set(TOKENIZER_FILE "${CMAKE_BINARY_DIR}/data/tokenizer.json") +set(TOKENIZER_FILE "${CMAKE_SOURCE_DIR}/data/tokenizer.json") # Check if the file already exists before downloading if(NOT EXISTS "${TOKENIZER_FILE}")