diff --git a/tt-train/.vscode/settings.json b/tt-train/.vscode/settings.json index f89ed5f1d98..b07ef4a89a2 100644 --- a/tt-train/.vscode/settings.json +++ b/tt-train/.vscode/settings.json @@ -1,3 +1,5 @@ { - "editor.formatOnSave": true + "editor.formatOnSave": true, + "files.autoSave": "afterDelay", + "C_Cpp.clang_format_style": ".clang-format" } diff --git a/tt-train/configs/training_mnist_mlp.yaml b/tt-train/configs/training_mnist_mlp.yaml new file mode 100644 index 00000000000..8975066088c --- /dev/null +++ b/tt-train/configs/training_mnist_mlp.yaml @@ -0,0 +1,14 @@ +training_config: + batch_size: 128 + logging_interval: 50 + num_epochs: 10 + learning_rate: 0.1 + momentum: 0.9 + weight_decay: 0.0 + is_eval: false + model_save_interval: 500 + model_path: "/tmp/mnist_mlp.msgpack" + mlp_config: + input_features: 784 + hidden_features: [128] + output_features: 10 diff --git a/tt-train/configs/training_shakespear_nanogpt.yaml b/tt-train/configs/training_shakespear_nanogpt.yaml new file mode 100644 index 00000000000..989a55d002f --- /dev/null +++ b/tt-train/configs/training_shakespear_nanogpt.yaml @@ -0,0 +1,17 @@ +training_config: + project_name: "tt_train_nano_gpt" + seed: 5489 + model_save_interval: 500 + batch_size: 64 + num_epochs: 1 + max_steps: 5000 + learning_rate: 0.0003 + weight_decay: 0.01 + + transformer_config: + num_heads: 6 + embedding_dim: 384 + dropout_prob: 0.2 + num_blocks: 6 + vocab_size: 96 + max_sequence_length: 256 diff --git a/tt-train/sources/examples/graph_capture/main.cpp b/tt-train/sources/examples/graph_capture/main.cpp index 4df8d2af5d4..5a044d67d97 100644 --- a/tt-train/sources/examples/graph_capture/main.cpp +++ b/tt-train/sources/examples/graph_capture/main.cpp @@ -75,7 +75,7 @@ int main() { ttml::core::zeros(ttml::core::create_shape({batch_size, 1, 1, num_targets}), device)); auto model_params = ttml::modules::MultiLayerPerceptronParameters{ - .m_input_features = num_features, .m_hidden_features = {128}, .m_output_features = 
num_targets}; + .input_features = num_features, .hidden_features = {128}, .output_features = num_targets}; auto model = ttml::modules::MultiLayerPerceptron(model_params); auto mode = tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH; diff --git a/tt-train/sources/examples/linear_regression/CMakeLists.txt b/tt-train/sources/examples/linear_regression/CMakeLists.txt index be7801ef7ec..adae4e50940 100644 --- a/tt-train/sources/examples/linear_regression/CMakeLists.txt +++ b/tt-train/sources/examples/linear_regression/CMakeLists.txt @@ -1,6 +1,5 @@ project(linear_regression) set(SOURCES main.cpp) - add_executable(linear_regression ${SOURCES}) target_link_libraries(linear_regression PRIVATE ttml) diff --git a/tt-train/sources/examples/linear_regression/main.cpp b/tt-train/sources/examples/linear_regression/main.cpp index f98af41a1cf..ac087af881b 100644 --- a/tt-train/sources/examples/linear_regression/main.cpp +++ b/tt-train/sources/examples/linear_regression/main.cpp @@ -12,6 +12,7 @@ #include "core/tt_tensor_utils.hpp" #include "datasets/dataloader.hpp" #include "datasets/generators.hpp" +#include "models/linear_regression.hpp" #include "modules/linear_module.hpp" #include "ops/losses.hpp" #include "optimizers/sgd.hpp" @@ -66,18 +67,18 @@ int main() { const uint32_t batch_size = 128; auto train_dataloader = DataLoader(training_dataset, batch_size, /* shuffle */ true, collate_fn); - auto model = ttml::modules::LinearLayer(num_features, num_targets); + auto model = ttml::models::linear_regression::create(num_features, num_targets); float learning_rate = 0.1F * num_targets * (batch_size / 128.F); auto sgd_config = ttml::optimizers::SGDConfig{.lr = learning_rate, .momentum = 0.0F}; - auto optimizer = ttml::optimizers::SGD(model.parameters(), sgd_config); + auto optimizer = ttml::optimizers::SGD(model->parameters(), sgd_config); int training_step = 0; const int num_epochs = 10; for (int epoch = 0; epoch < num_epochs; ++epoch) { for (const auto& [data, targets] : 
train_dataloader) { optimizer.zero_grad(); - auto output = model(data); + auto output = (*model)(data); auto loss = ttml::ops::mse_loss(output, targets); auto loss_float = ttml::core::to_vector(loss->get_value())[0]; fmt::print("Step: {} Loss: {}\n", training_step++, loss_float); diff --git a/tt-train/sources/examples/mnist_mlp/CMakeLists.txt b/tt-train/sources/examples/mnist_mlp/CMakeLists.txt index b69fe6dcc74..0c26c08e294 100644 --- a/tt-train/sources/examples/mnist_mlp/CMakeLists.txt +++ b/tt-train/sources/examples/mnist_mlp/CMakeLists.txt @@ -3,9 +3,7 @@ project(mnist_mlp) set(SOURCES main.cpp utils.cpp - models.cpp ) - CPMAddPackage(NAME mnist_dataset GITHUB_REPOSITORY wichtounet/mnist GIT_TAG master) include_directories(${mnist_dataset_SOURCE_DIR}/include) @@ -13,3 +11,4 @@ include_directories(${mnist_dataset_SOURCE_DIR}/include) add_executable(mnist_mlp ${SOURCES}) target_link_libraries(mnist_mlp PRIVATE ttml) target_compile_definitions(mnist_mlp PRIVATE MNIST_DATA_LOCATION="${mnist_dataset_SOURCE_DIR}/") +add_definitions(-DCONFIGS_FOLDER="${CMAKE_SOURCE_DIR}/configs") diff --git a/tt-train/sources/examples/mnist_mlp/main.cpp b/tt-train/sources/examples/mnist_mlp/main.cpp index 7a272bcd982..0528933d7bc 100644 --- a/tt-train/sources/examples/mnist_mlp/main.cpp +++ b/tt-train/sources/examples/mnist_mlp/main.cpp @@ -14,10 +14,11 @@ #include "core/tt_tensor_utils.hpp" #include "datasets/dataloader.hpp" #include "datasets/in_memory_dataset.hpp" -#include "models.hpp" +#include "models/mlp.hpp" #include "ops/losses.hpp" #include "optimizers/sgd.hpp" #include "utils.hpp" +#include "yaml-cpp/node/node.h" using ttml::autograd::TensorPtr; @@ -55,27 +56,46 @@ float evaluate(DataLoader &test_dataloader, Model &model, size_t num_targets) { return num_correct / num_samples; }; -int main(int argc, char **argv) { - CLI::App app{"Mnist Example"}; - argv = app.ensure_utf8(argv); - +struct TrainingConfig { uint32_t batch_size = 128; int logging_interval = 50; size_t 
num_epochs = 10; - bool is_eval = false; + float learning_rate = 0.1; + float momentum = 0.9F; + float weight_decay = 0.F; int model_save_interval = 500; std::string model_path = "/tmp/mnist_mlp.msgpack"; + ttml::modules::MultiLayerPerceptronParameters mlp_config; +}; - app.add_option("-b,--batch_size", batch_size, "Batch size")->default_val(batch_size); - app.add_option("-l,--logging_interval", logging_interval, "Logging interval")->default_val(logging_interval); - app.add_option("-m,--model_save_interval", model_save_interval, "model save interval") - ->default_val(model_save_interval); +TrainingConfig parse_config(const YAML::Node &yaml_config) { + TrainingConfig config; + auto training_config = yaml_config["training_config"]; + + config.batch_size = training_config["batch_size"].as<uint32_t>(); + config.logging_interval = training_config["logging_interval"].as<int>(); + config.num_epochs = training_config["num_epochs"].as<size_t>(); + config.learning_rate = training_config["learning_rate"].as<float>(); + config.momentum = training_config["momentum"].as<float>(); + config.weight_decay = training_config["weight_decay"].as<float>(); + config.model_save_interval = training_config["model_save_interval"].as<int>(); + config.mlp_config = ttml::models::mlp::read_config(training_config["mlp_config"]); + return config; +} - app.add_option("-n,--num_epochs", num_epochs, "Number of epochs")->default_val(num_epochs); - app.add_option("-s,--model_path", model_path, "Model path")->default_val(model_path); - app.add_option("-e,--eval", is_eval, "eval only mode")->default_val(is_eval); +int main(int argc, char **argv) { + CLI::App app{"Mnist Example"}; + argv = app.ensure_utf8(argv); + + std::string config_name = std::string(CONFIGS_FOLDER) + "/training_mnist_mlp.yaml"; + bool is_eval = false; + app.add_option("-c,--config", config_name, "Yaml Config name")->default_val(config_name); + app.add_option("-e,--eval", is_eval, "Evaluate")->default_val(is_eval); CLI11_PARSE(app, argc, argv); + auto yaml_config = 
YAML::LoadFile(config_name); + TrainingConfig config = parse_config(yaml_config); + // Load MNIST data const size_t num_targets = 10; const size_t num_features = 784; @@ -111,14 +131,14 @@ int main(int argc, char **argv) { return std::make_pair(data_tensor, targets_tensor); }; - auto train_dataloader = DataLoader(training_dataset, batch_size, /* shuffle */ true, collate_fn); - auto test_dataloader = DataLoader(test_dataset, batch_size, /* shuffle */ false, collate_fn); + auto train_dataloader = DataLoader(training_dataset, config.batch_size, /* shuffle */ true, collate_fn); + auto test_dataloader = DataLoader(test_dataset, config.batch_size, /* shuffle */ false, collate_fn); - auto model = create_base_mlp(784, 10); + auto model = ttml::models::mlp::create(config.mlp_config); - const float learning_rate = 0.1F * (static_cast(batch_size) / 128.F); - const float momentum = 0.9F; - const float weight_decay = 0.F; + const float learning_rate = config.learning_rate * (static_cast(config.batch_size) / 128.F); + const float momentum = config.momentum; + const float weight_decay = config.weight_decay; auto sgd_config = ttml::optimizers::SGDConfig{.lr = learning_rate, .momentum = momentum, .weight_decay = weight_decay}; @@ -129,9 +149,9 @@ int main(int argc, char **argv) { fmt::print(" Weight decay: {}\n", sgd_config.weight_decay); fmt::print(" Nesterov: {}\n", sgd_config.nesterov); auto optimizer = ttml::optimizers::SGD(model->parameters(), sgd_config); - if (!model_path.empty() && std::filesystem::exists(model_path)) { - fmt::print("Loading model from {}\n", model_path); - load_model_and_optimizer(model_path, model, optimizer, model_name, optimizer_name); + if (!config.model_path.empty() && std::filesystem::exists(config.model_path)) { + fmt::print("Loading model from {}\n", config.model_path); + load_model_and_optimizer(config.model_path, model, optimizer, model_name, optimizer_name); } // evaluate model before training (sanity check to get reasonable accuracy @@ -144,19 
+164,19 @@ int main(int argc, char **argv) { LossAverageMeter loss_meter; int training_step = 0; - for (size_t epoch = 0; epoch < num_epochs; ++epoch) { + for (size_t epoch = 0; epoch < config.num_epochs; ++epoch) { for (const auto &[data, target] : train_dataloader) { optimizer.zero_grad(); auto output = (*model)(data); auto loss = ttml::ops::cross_entropy_loss(output, target); auto loss_float = ttml::core::to_vector(loss->get_value())[0]; - loss_meter.update(loss_float, batch_size); - if (training_step % logging_interval == 0) { + loss_meter.update(loss_float, config.batch_size); + if (training_step % config.logging_interval == 0) { fmt::print("Step: {:5d} | Average Loss: {:.4f}\n", training_step, loss_meter.average()); } - if (!model_path.empty() && training_step % model_save_interval == 0) { - fmt::print("Saving model to {}\n", model_path); - save_model_and_optimizer(model_path, model, optimizer, model_name, optimizer_name); + if (!config.model_path.empty() && training_step % config.model_save_interval == 0) { + fmt::print("Saving model to {}\n", config.model_path); + save_model_and_optimizer(config.model_path, model, optimizer, model_name, optimizer_name); } loss->backward(); @@ -174,9 +194,9 @@ int main(int argc, char **argv) { loss_meter.reset(); } - if (!model_path.empty()) { - fmt::print("Saving model to {}\n", model_path); - save_model_and_optimizer(model_path, model, optimizer, model_name, optimizer_name); + if (!config.model_path.empty()) { + fmt::print("Saving model to {}\n", config.model_path); + save_model_and_optimizer(config.model_path, model, optimizer, model_name, optimizer_name); } return 0; diff --git a/tt-train/sources/examples/mnist_mlp/models.cpp b/tt-train/sources/examples/mnist_mlp/models.cpp deleted file mode 100644 index 5d324fe389d..00000000000 --- a/tt-train/sources/examples/mnist_mlp/models.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include 
"models.hpp" - -#include - -#include "modules/multi_layer_perceptron.hpp" -#include "ops/unary_ops.hpp" - -MNISTModel::MNISTModel() { - m_fc1 = std::make_shared(784, 128); - m_fc2 = std::make_shared(128, 64); - m_fc3 = std::make_shared(64, 10); - m_dropout = std::make_shared(0.2F); - - m_layernorm1 = std::make_shared(128); - m_layernorm2 = std::make_shared(10); - - create_name("MNISTModel"); - - register_module(m_fc1, "fc1"); - register_module(m_fc2, "fc2"); - register_module(m_fc3, "fc3"); - register_module(m_dropout, "dropout"); - register_module(m_layernorm1, "layernorm1"); - register_module(m_layernorm2, "layernorm2"); -} - -ttml::autograd::TensorPtr MNISTModel::operator()(ttml::autograd::TensorPtr x) { - x = (*m_dropout)(x); - x = (*m_fc1)(x); - x = (*m_layernorm1)(x); - x = ttml::ops::relu(x); - x = (*m_fc2)(x); - x = (*m_layernorm2)(x); - x = ttml::ops::relu(x); - x = (*m_fc3)(x); - return x; -} -std::shared_ptr create_base_mlp(uint32_t num_features, uint32_t num_targets) { - auto model_params = ttml::modules::MultiLayerPerceptronParameters{ - .m_input_features = num_features, .m_hidden_features = {128}, .m_output_features = num_targets}; - return std::make_shared(model_params); -} diff --git a/tt-train/sources/examples/mnist_mlp/models.hpp b/tt-train/sources/examples/mnist_mlp/models.hpp deleted file mode 100644 index 6445648b69f..00000000000 --- a/tt-train/sources/examples/mnist_mlp/models.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "autograd/module_base.hpp" -#include "modules/dropout_module.hpp" -#include "modules/layer_norm_module.hpp" -#include "modules/linear_module.hpp" -#include "modules/multi_layer_perceptron.hpp" - -class MNISTModel : public ttml::autograd::ModuleBase { - std::shared_ptr m_fc1; - std::shared_ptr m_fc2; - std::shared_ptr m_fc3; - std::shared_ptr m_dropout; - std::shared_ptr m_layernorm1; - std::shared_ptr m_layernorm2; 
- -public: - MNISTModel(); - - ttml::autograd::TensorPtr operator()(ttml::autograd::TensorPtr x); -}; - -std::shared_ptr create_base_mlp(uint32_t num_features, uint32_t num_targets); diff --git a/tt-train/sources/examples/nano_gpt/CMakeLists.txt b/tt-train/sources/examples/nano_gpt/CMakeLists.txt index f34a541ca52..a15d0770f3d 100644 --- a/tt-train/sources/examples/nano_gpt/CMakeLists.txt +++ b/tt-train/sources/examples/nano_gpt/CMakeLists.txt @@ -3,13 +3,12 @@ project(nano_gpt) set(SOURCES main.cpp utils.cpp - models.cpp ) add_executable(nano_gpt ${SOURCES}) target_link_libraries(nano_gpt PRIVATE ttml) - add_definitions(-DDATA_FOLDER="${CMAKE_SOURCE_DIR}/data") +add_definitions(-DCONFIGS_FOLDER="${CMAKE_SOURCE_DIR}/configs") # Define the target file location set(SHAKESPEARE_URL "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt") diff --git a/tt-train/sources/examples/nano_gpt/main.cpp b/tt-train/sources/examples/nano_gpt/main.cpp index e988c045d58..a0732c43156 100644 --- a/tt-train/sources/examples/nano_gpt/main.cpp +++ b/tt-train/sources/examples/nano_gpt/main.cpp @@ -15,7 +15,7 @@ #include "datasets/dataloader.hpp" #include "datasets/in_memory_token_dataset.hpp" #include "datasets/utils.hpp" -#include "models.hpp" +#include "models/gpt2.hpp" #include "ops/binary_ops.hpp" #include "ops/losses.hpp" #include "optimizers/adamw.hpp" @@ -23,6 +23,7 @@ #include "tokenizers/char_tokenizer.hpp" #include "ttnn_fixed/trivial_ttnn_ops.hpp" #include "utils.hpp" + /* WANDB BLocks this signal. Control+C didn't work. 
*/ @@ -42,23 +43,6 @@ using DataLoader = ttml::datasets::DataLoader< std::function<BatchType(std::vector<DatasetSample> &&samples)>, BatchType>; -struct DemoConfig { - // training - uint32_t batch_size = 64; - uint32_t sequence_length = 256; - uint32_t num_epochs = 1; - uint32_t max_steps = 5000; - float dropout_prob = 0.2F; - // model - uint32_t num_heads = 6; - uint32_t embedding_dim = 384; - uint32_t num_blocks = 6; - // optimizer - float learning_rate = 3e-4F; - float weight_decay = 1e-2F; -}; -const DemoConfig config; - uint32_t sample(std::span<float> log_softmax) { auto probabilities_vector = std::vector<float>(log_softmax.size()); std::transform(log_softmax.begin(), log_softmax.end(), probabilities_vector.begin(), [](float value) { @@ -148,63 +132,86 @@ void generate( model->train(); } +struct TrainingConfig { + std::string project_name; + uint32_t seed = 5489U; + uint32_t model_save_interval = 500; + uint32_t batch_size = 64; + uint32_t num_epochs = 1; + uint32_t max_steps = 5000; + float learning_rate = 3e-4F; + float weight_decay = 1e-2F; + std::string model_path; + std::string data_path; + ttml::models::gpt2::TransformerConfig transformer_config; +}; + +TrainingConfig parse_config(const YAML::Node &yaml_config) { + TrainingConfig config; + auto training_config = yaml_config["training_config"]; + config.project_name = training_config["project_name"].as<std::string>("tt_train_nano_gpt"); + config.seed = training_config["seed"].as<uint32_t>(); + config.model_save_interval = training_config["model_save_interval"].as<uint32_t>(); + config.batch_size = training_config["batch_size"].as<uint32_t>(); + config.num_epochs = training_config["num_epochs"].as<uint32_t>(); + config.max_steps = training_config["max_steps"].as<uint32_t>(); + config.learning_rate = training_config["learning_rate"].as<float>(); + config.weight_decay = training_config["weight_decay"].as<float>(); + config.model_path = training_config["model_path"].as<std::string>(""); + config.data_path = training_config["data_path"].as<std::string>(std::string(DATA_FOLDER) + "/shakespeare.txt"); + config.transformer_config = 
ttml::models::gpt2::read_config(training_config["transformer_config"]); + return config; +} + int main(int argc, char **argv) { auto result = signal(SIGINT, signal_handler); if (result == SIG_ERR) { std::cerr << "Failed to set signal handler\n"; return -1; } - wandbcpp::init({.project = "tt_train_nano_gpt"}); - wandbcpp::update_config({ - {"model", "transformer"}, - {"num_heads", static_cast(config.num_heads)}, - {"embedding_dim", static_cast(config.embedding_dim)}, - {"num_blocks", static_cast(config.num_blocks)}, - {"dropout_prob", config.dropout_prob}, - {"learning_rate", config.learning_rate}, - {"weight_decay", config.weight_decay}, - {"batch_size", static_cast(config.batch_size)}, - {"sequence_length", static_cast(config.sequence_length)}, - {"max_steps", static_cast(config.max_steps)}, - }); auto start_timer = std::chrono::high_resolution_clock::now(); CLI::App app{"NanoGPT Example"}; argv = app.ensure_utf8(argv); - uint32_t seed = 5489U; - uint32_t model_save_interval = 500; - uint32_t max_steps = config.max_steps; - uint32_t batch_size = config.batch_size; - uint32_t sequence_length = config.sequence_length; - std::string model_path; - std::string data_path = std::string(DATA_FOLDER) + "/shakespeare.txt"; + std::string config_name = std::string(CONFIGS_FOLDER) + "/training_shakespear_nanogpt.yaml"; bool is_eval = false; + app.add_option("-c,--config", config_name, "Yaml Config name")->default_val(config_name); + app.add_option("-e,--eval", is_eval, "Is evaluation")->default_val(is_eval); - app.add_option("-b,--batch_size", batch_size, "Batch size")->default_val(batch_size); - app.add_option("-i,--model_save_interval", model_save_interval, "Model save interval") - ->default_val(model_save_interval); - app.add_option("-p,--model_path", model_path, "Model path")->default_val(model_path); - app.add_option("-d,--data_path", data_path, "Data path")->default_val(data_path); - app.add_option("-s,--seed", seed, "Seed")->default_val(seed); - 
app.add_option("-m,--max_steps", max_steps, "Max steps")->default_val(max_steps); - app.add_flag("-e,--eval", is_eval, "Evaluation mode")->default_val(is_eval); CLI11_PARSE(app, argc, argv); + auto yaml_config = YAML::LoadFile(config_name); + TrainingConfig config = parse_config(yaml_config); + + wandbcpp::init({.project = config.project_name}); + wandbcpp::update_config({ + {"model", "transformer"}, + {"num_heads", static_cast(config.transformer_config.num_heads)}, + {"embedding_dim", static_cast(config.transformer_config.embedding_dim)}, + {"num_blocks", static_cast(config.transformer_config.num_blocks)}, + {"dropout_prob", config.transformer_config.dropout_prob}, + {"learning_rate", config.learning_rate}, + {"weight_decay", config.weight_decay}, + {"batch_size", static_cast(config.batch_size)}, + {"sequence_length", static_cast(config.transformer_config.max_sequence_length)}, + {"max_steps", static_cast(config.max_steps)}, + }); // set seed - ttml::autograd::ctx().set_seed(seed); + ttml::autograd::ctx().set_seed(config.seed); std::string text; try { - text = read_file_to_str(data_path); + text = read_file_to_str(config.data_path); } catch (const std::exception &e) { std::cerr << e.what() << std::endl; return -1; } - fmt::print("Max steps {}\n", max_steps); - fmt::print("Batch size {}\n", batch_size); + fmt::print("Max steps {}\n", config.max_steps); + fmt::print("Batch size {}\n", config.batch_size); fmt::print("Seed {}\n", ttml::autograd::ctx().get_seed()); + auto sequence_length = config.transformer_config.max_sequence_length; auto [dataset, tokenizer] = ttml::datasets::create_in_memory_token_dataset(text, sequence_length); @@ -226,16 +233,16 @@ int main(int argc, char **argv) { CachedHostData cached_data; std::vector positions; std::vector mask; - positions.reserve((size_t)batch_size * sequence_length); - for (int sample_idx = 0; sample_idx < batch_size; ++sample_idx) { + positions.reserve((size_t)config.batch_size * sequence_length); + for (int sample_idx = 
0; sample_idx < config.batch_size; ++sample_idx) { for (int i = 0; i < sequence_length; ++i) { positions.push_back(i); } } - - mask.reserve((size_t)batch_size * sequence_length * sequence_length * config.num_heads); - for (int sample_idx = 0; sample_idx < batch_size; ++sample_idx) { - for (int head = 0; head < config.num_heads; ++head) { + auto num_heads = config.transformer_config.num_heads; + mask.reserve((size_t)config.batch_size * sequence_length * sequence_length * num_heads); + for (int sample_idx = 0; sample_idx < config.batch_size; ++sample_idx) { + for (int head = 0; head < num_heads; ++head) { for (int i = 0; i < sequence_length; ++i) { for (int j = 0; j < sequence_length; ++j) { mask.push_back(i >= j ? 1.0F : 0.0F); @@ -244,12 +251,12 @@ int main(int argc, char **argv) { } } cached_data.masks_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( - mask, ttml::core::create_shape({batch_size, config.num_heads, sequence_length, sequence_length}), device)); + mask, ttml::core::create_shape({config.batch_size, num_heads, sequence_length, sequence_length}), device)); cached_data.positions_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( - positions, ttml::core::create_shape({batch_size, 1, 1, sequence_length}), device, Layout::ROW_MAJOR)); + positions, ttml::core::create_shape({config.batch_size, 1, 1, sequence_length}), device, Layout::ROW_MAJOR)); std::function && samples)> collate_fn = - [sequence_length, num_heads = config.num_heads, vocab_size = tokenizer.get_vocab_size(), device, &cached_data]( + [sequence_length, num_heads, vocab_size = tokenizer.get_vocab_size(), device, &cached_data]( std::vector &&samples) { auto start_timer = std::chrono::high_resolution_clock::now(); const uint32_t batch_size = samples.size(); @@ -279,16 +286,11 @@ int main(int argc, char **argv) { }; LossAverageMeter loss_meter; - auto train_dataloader = DataLoader(dataset, /* batch_size */ batch_size, /* shuffle */ true, collate_fn); + auto 
train_dataloader = DataLoader(dataset, /* batch_size */ config.batch_size, /* shuffle */ true, collate_fn); - auto transformer_config = TransformerConfig(); - transformer_config.num_heads = config.num_heads; - transformer_config.embedding_dim = config.embedding_dim; - transformer_config.dropout_prob = config.dropout_prob; - transformer_config.num_blocks = config.num_blocks; - transformer_config.vocab_size = round_up_to_tile(tokenizer.get_vocab_size()); - transformer_config.max_sequence_length = sequence_length; - auto model = std::make_shared(transformer_config); + fmt::print("Overriding vocab size to be divisible by 32\n"); + config.transformer_config.vocab_size = round_up_to_tile(tokenizer.get_vocab_size()); + auto model = ttml::models::gpt2::create(config.transformer_config); auto adamw_params = ttml::optimizers::AdamWConfig(); adamw_params.lr = config.learning_rate; @@ -298,16 +300,16 @@ int main(int argc, char **argv) { fmt::print(" Weight decay: {}\n", adamw_params.weight_decay); auto optimizer = ttml::optimizers::AdamW(model->parameters(), adamw_params); - if (!model_path.empty() && std::filesystem::exists(model_path)) { - fmt::print("Loading model from {}\n", model_path); - load_model_and_optimizer(model_path, model, optimizer, "transformer", "adamw"); + if (!config.model_path.empty() && std::filesystem::exists(config.model_path)) { + fmt::print("Loading model from {}\n", config.model_path); + load_model_and_optimizer(config.model_path, model, optimizer, "transformer", "adamw"); fmt::print("Model loaded after {} steps\n", optimizer.get_steps()); } if (is_eval) { fmt::print("\nEvaluation started\n"); for (;;) { - generate(model, tokenizer, sequence_length, config.num_heads); + generate(model, tokenizer, config.transformer_config.max_sequence_length, num_heads); } fmt::print("\nEvaluation finished\n"); return 0; @@ -331,11 +333,11 @@ int main(int argc, char **argv) { if (global_step % 10 == 0) { wandbcpp::log({{"Step", (int)global_step}, {"Loss", 
loss_float}}); } - if (!model_path.empty() && global_step % model_save_interval == 0) { - save_model_and_optimizer(model_path, model, optimizer, "transformer", "adamw"); + if (!config.model_path.empty() && global_step % config.model_save_interval == 0) { + save_model_and_optimizer(config.model_path, model, optimizer, "transformer", "adamw"); } - if (global_step >= max_steps) { + if (global_step >= config.max_steps) { break; } auto end_timer = std::chrono::high_resolution_clock::now(); @@ -345,20 +347,20 @@ int main(int argc, char **argv) { (double)duration / 1000, device->num_program_cache_entries()); } - if (optimizer.get_steps() >= max_steps) { + if (optimizer.get_steps() >= config.max_steps) { break; } } - if (!model_path.empty()) { - save_model_and_optimizer(model_path, model, optimizer, "transformer", "adamw"); + if (!config.model_path.empty()) { + save_model_and_optimizer(config.model_path, model, optimizer, "transformer", "adamw"); } auto end_timer = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_timer - start_timer).count(); fmt::print( "{} Steps training time: {} s, cache entries: {}\n", - max_steps, + config.max_steps, (double)duration / 1000000., device->num_program_cache_entries()); wandbcpp::finish(); diff --git a/tt-train/sources/ttml/core/tt_tensor_utils.hpp b/tt-train/sources/ttml/core/tt_tensor_utils.hpp index 3d82adf66fa..5d809935ea9 100644 --- a/tt-train/sources/ttml/core/tt_tensor_utils.hpp +++ b/tt-train/sources/ttml/core/tt_tensor_utils.hpp @@ -8,8 +8,6 @@ #include #include -#include "core/ttnn_fwd.hpp" - namespace ttml::core { void print_tensor_stats(const tt::tt_metal::Tensor& tensor, const std::string& name); diff --git a/tt-train/sources/ttml/core/ttnn_fwd.hpp b/tt-train/sources/ttml/core/ttnn_fwd.hpp deleted file mode 100644 index 3c6bddf0de5..00000000000 --- a/tt-train/sources/ttml/core/ttnn_fwd.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// 
SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace tt::tt_metal { -struct Tensor; -class CommandQueue; -struct MemoryConfig; -class DeviceMesh; -class LegacyShape; -inline namespace v0 { -class Device; -} // namespace v0 -} // namespace tt::tt_metal - -namespace ttnn { -using Tensor = tt::tt_metal::Tensor; // not sure if it works but we can use original tensor namespace - -} // namespace ttnn diff --git a/tt-train/sources/ttml/init/tensor_initializers.hpp b/tt-train/sources/ttml/init/tensor_initializers.hpp index c83ef5b8686..0fa2b633bc7 100644 --- a/tt-train/sources/ttml/init/tensor_initializers.hpp +++ b/tt-train/sources/ttml/init/tensor_initializers.hpp @@ -3,10 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once + #include "autograd/tensor.hpp" -#include "core/ttnn_fwd.hpp" #include "init/cpu_initializers.hpp" + namespace ttml::init { + void uniform_init(ttml::autograd::TensorPtr& t, const ttnn::Shape& shape, UniformRange range); void normal_init(ttml::autograd::TensorPtr& t, const ttnn::Shape& shape, NormalParams params); diff --git a/tt-train/sources/examples/nano_gpt/models.cpp b/tt-train/sources/ttml/models/gpt2.cpp similarity index 67% rename from tt-train/sources/examples/nano_gpt/models.cpp rename to tt-train/sources/ttml/models/gpt2.cpp index 4aa2886b04f..e6dceeb15b5 100644 --- a/tt-train/sources/examples/nano_gpt/models.cpp +++ b/tt-train/sources/ttml/models/gpt2.cpp @@ -2,11 +2,13 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "models.hpp" +#include "gpt2.hpp" #include "ops/binary_ops.hpp" #include "ops/unary_ops.hpp" +namespace ttml::models::gpt2 { + Transformer::Transformer(const TransformerConfig& config) { uint32_t vocab_size = config.vocab_size; uint32_t max_sequence_length = config.max_sequence_length; @@ -54,6 +56,7 @@ Transformer::Transformer(const TransformerConfig& config) { register_module(ln_fc, "ln_fc"); register_module(fc, "fc"); } + ttml::autograd::TensorPtr Transformer::operator()( const 
ttml::autograd::TensorPtr& x, const ttml::autograd::TensorPtr& positions, @@ -70,25 +73,34 @@ ttml::autograd::TensorPtr Transformer::operator()( return log_softmax; } -BigramFCModel::BigramFCModel(uint32_t vocab_size, uint32_t num_tokens, uint32_t hidden_dim) { - // make vocab_size divisible by 32 - vocab_size = (vocab_size + 31) / 32 * 32; - - // create layers - emb = std::make_shared<ttml::modules::Embedding>(vocab_size, hidden_dim); - fc1 = std::make_shared<ttml::modules::LinearLayer>(hidden_dim, num_tokens); - - create_name("bigram_fc_model"); +TransformerConfig read_config(const YAML::Node& config) { + TransformerConfig transformer_config; + transformer_config.num_heads = config["num_heads"].as<uint32_t>(); + transformer_config.embedding_dim = config["embedding_dim"].as<uint32_t>(); + transformer_config.dropout_prob = config["dropout_prob"].as<float>(); + transformer_config.num_blocks = config["num_blocks"].as<uint32_t>(); + transformer_config.vocab_size = config["vocab_size"].as<uint32_t>(); + transformer_config.max_sequence_length = config["max_sequence_length"].as<uint32_t>(); + return transformer_config; +} - register_module(emb, "emb"); - register_module(fc1, "fc1"); +YAML::Node write_config(const TransformerConfig& mlp_config) { + YAML::Node config; + config["num_heads"] = mlp_config.num_heads; + config["embedding_dim"] = mlp_config.embedding_dim; + config["dropout_prob"] = mlp_config.dropout_prob; + config["num_blocks"] = mlp_config.num_blocks; + config["vocab_size"] = mlp_config.vocab_size; + config["max_sequence_length"] = mlp_config.max_sequence_length; + return config; } -ttml::autograd::TensorPtr BigramFCModel::operator()( - ttml::autograd::TensorPtr x, - [[maybe_unused]] const ttml::autograd::TensorPtr& positions, - [[maybe_unused]] const ttml::autograd::TensorPtr& masks) const { - x = (*emb)(x); - x = (*fc1)(x); - return x; +std::shared_ptr<Transformer> create(const TransformerConfig& config) { + return std::make_shared<Transformer>(config); +} +std::shared_ptr<Transformer> create(const YAML::Node& config) { + TransformerConfig transformer_config = read_config(config); + return 
std::make_shared<Transformer>(transformer_config); } + +} // namespace ttml::models::gpt2 diff --git a/tt-train/sources/examples/nano_gpt/models.hpp b/tt-train/sources/ttml/models/gpt2.hpp similarity index 64% rename from tt-train/sources/examples/nano_gpt/models.hpp rename to tt-train/sources/ttml/models/gpt2.hpp index b41a9b57825..811b4236943 100644 --- a/tt-train/sources/examples/nano_gpt/models.hpp +++ b/tt-train/sources/ttml/models/gpt2.hpp @@ -4,14 +4,13 @@ #pragma once -#include <memory> -#include <vector> +#include <yaml-cpp/yaml.h> -#include "autograd/module_base.hpp" #include "modules/embedding_module.hpp" #include "modules/gpt_block.hpp" #include "modules/layer_norm_module.hpp" -#include "modules/linear_module.hpp" + +namespace ttml::models::gpt2 { struct TransformerConfig { uint32_t num_heads = 6; @@ -23,6 +22,7 @@ struct TransformerConfig { class Transformer : public ttml::autograd::ModuleBase { +private: std::shared_ptr<ttml::modules::Embedding> tok_emb; std::shared_ptr<ttml::modules::Embedding> pos_emb; std::vector<std::shared_ptr<ttml::modules::GPTBlock>> blocks; @@ -38,15 +38,9 @@ class Transformer : public ttml::autograd::ModuleBase { const ttml::autograd::TensorPtr& mask); }; -class BigramFCModel : public ttml::autograd::ModuleBase { -public: - std::shared_ptr<ttml::modules::LinearLayer> fc1; - std::shared_ptr<ttml::modules::Embedding> emb; - - BigramFCModel(uint32_t vocab_size, uint32_t num_tokens, uint32_t hidden_dim); +[[nodiscard]] TransformerConfig read_config(const YAML::Node& config); +[[nodiscard]] YAML::Node write_config(const TransformerConfig& mlp_config); +[[nodiscard]] std::shared_ptr<Transformer> create(const TransformerConfig& config); +[[nodiscard]] std::shared_ptr<Transformer> create(const YAML::Node& config); - ttml::autograd::TensorPtr operator()( - ttml::autograd::TensorPtr x, - [[maybe_unused]] const ttml::autograd::TensorPtr& positions, - [[maybe_unused]] const ttml::autograd::TensorPtr& masks) const; -}; +} // namespace ttml::models::gpt2 diff --git a/tt-train/sources/ttml/models/linear_regression.cpp b/tt-train/sources/ttml/models/linear_regression.cpp new file mode 100644 index 00000000000..10211fd912b --- /dev/null +++ 
b/tt-train/sources/ttml/models/linear_regression.cpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "linear_regression.hpp" + +#include "modules/linear_module.hpp" + +namespace ttml::models::linear_regression { +std::shared_ptr create(uint32_t in_features, uint32_t out_features) { + return std::make_shared(in_features, out_features); +} +} // namespace ttml::models::linear_regression diff --git a/tt-train/sources/ttml/models/linear_regression.hpp b/tt-train/sources/ttml/models/linear_regression.hpp new file mode 100644 index 00000000000..3ea32e74927 --- /dev/null +++ b/tt-train/sources/ttml/models/linear_regression.hpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +namespace ttml::modules { +class LinearLayer; +} + +namespace ttml::models::linear_regression { +[[nodiscard]] std::shared_ptr create(uint32_t in_features, uint32_t out_features); + +} // namespace ttml::models::linear_regression diff --git a/tt-train/sources/ttml/models/mlp.cpp b/tt-train/sources/ttml/models/mlp.cpp new file mode 100644 index 00000000000..0f7a97cf757 --- /dev/null +++ b/tt-train/sources/ttml/models/mlp.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "mlp.hpp" + +namespace ttml::models::mlp { + +ttml::modules::MultiLayerPerceptronParameters read_config(const YAML::Node& config) { + ttml::modules::MultiLayerPerceptronParameters mlp_config; + mlp_config.input_features = config["input_features"].as(); + mlp_config.hidden_features = config["hidden_features"].as>(); + mlp_config.output_features = config["output_features"].as(); + return mlp_config; +} + +YAML::Node write_config(ttml::modules::MultiLayerPerceptronParameters& mlp_config) { + YAML::Node config; + config["input_features"] = mlp_config.input_features; + 
config["hidden_features"] = mlp_config.hidden_features; + config["output_features"] = mlp_config.output_features; + return config; +} + +std::shared_ptr create( + const ttml::modules::MultiLayerPerceptronParameters& config) { + return std::make_shared(config); +} + +std::shared_ptr create(const YAML::Node& config) { + ttml::modules::MultiLayerPerceptronParameters mlp_config = read_config(config); + return std::make_shared(mlp_config); +} +} // namespace ttml::models::mlp diff --git a/tt-train/sources/ttml/models/mlp.hpp b/tt-train/sources/ttml/models/mlp.hpp new file mode 100644 index 00000000000..228f819dd5a --- /dev/null +++ b/tt-train/sources/ttml/models/mlp.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +#include + +#include "modules/multi_layer_perceptron.hpp" + +namespace ttml::models::mlp { +[[nodiscard]] ttml::modules::MultiLayerPerceptronParameters read_config(const YAML::Node& config); +[[nodiscard]] YAML::Node write_config(ttml::modules::MultiLayerPerceptronParameters& mlp_config); +[[nodiscard]] std::shared_ptr create( + const ttml::modules::MultiLayerPerceptronParameters& config); +[[nodiscard]] std::shared_ptr create(const YAML::Node& config); + +} // namespace ttml::models::mlp diff --git a/tt-train/sources/ttml/modules/multi_layer_perceptron.cpp b/tt-train/sources/ttml/modules/multi_layer_perceptron.cpp index 06c87b74ee5..81f5ad2c85a 100644 --- a/tt-train/sources/ttml/modules/multi_layer_perceptron.cpp +++ b/tt-train/sources/ttml/modules/multi_layer_perceptron.cpp @@ -14,12 +14,12 @@ void add_linear_layer(Layers& layers, Args&&... 
args) { } MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptronParameters& params) { - uint32_t current_input_features = params.m_input_features; - for (auto hidden_features : params.m_hidden_features) { + uint32_t current_input_features = params.input_features; + for (auto hidden_features : params.hidden_features) { add_linear_layer(m_layers, current_input_features, hidden_features); current_input_features = hidden_features; } - add_linear_layer(m_layers, current_input_features, params.m_output_features); + add_linear_layer(m_layers, current_input_features, params.output_features); create_name("mlp"); diff --git a/tt-train/sources/ttml/modules/multi_layer_perceptron.hpp b/tt-train/sources/ttml/modules/multi_layer_perceptron.hpp index feb61113787..27a3b301696 100644 --- a/tt-train/sources/ttml/modules/multi_layer_perceptron.hpp +++ b/tt-train/sources/ttml/modules/multi_layer_perceptron.hpp @@ -13,9 +13,9 @@ namespace ttml::modules { struct MultiLayerPerceptronParameters { - uint32_t m_input_features{}; - std::vector m_hidden_features; - uint32_t m_output_features{}; + uint32_t input_features{}; + std::vector hidden_features; + uint32_t output_features{}; }; class MultiLayerPerceptron : public autograd::ModuleBase { diff --git a/tt-train/sources/ttml/ops/dropout_op.cpp b/tt-train/sources/ttml/ops/dropout_op.cpp index aba066916e9..87e60143916 100644 --- a/tt-train/sources/ttml/ops/dropout_op.cpp +++ b/tt-train/sources/ttml/ops/dropout_op.cpp @@ -16,6 +16,9 @@ namespace ttml::ops { autograd::TensorPtr dropout(const autograd::TensorPtr& tensor, float probability) { + if (probability == 0.0F) { + return tensor; + } auto mask = core::ones_like(tensor->get_value()); // dropout seed is not properly used in ttnn::dropout // auto dropout_seed = autograd::ctx().get_generator()(); diff --git a/tt-train/sources/ttml/serialization/serialization.hpp b/tt-train/sources/ttml/serialization/serialization.hpp index 617d89e878a..1d4198e9996 100644 --- 
a/tt-train/sources/ttml/serialization/serialization.hpp +++ b/tt-train/sources/ttml/serialization/serialization.hpp @@ -4,11 +4,11 @@ #pragma once +#include #include #include "autograd/module_base.hpp" #include "autograd/tensor.hpp" -#include "core/ttnn_fwd.hpp" namespace ttml::optimizers { class OptimizerBase; diff --git a/tt-train/tests/3rd_party/tokenizers_test.cpp b/tt-train/tests/3rd_party/tokenizers_test.cpp index cd4b146fb60..a408dc97237 100644 --- a/tt-train/tests/3rd_party/tokenizers_test.cpp +++ b/tt-train/tests/3rd_party/tokenizers_test.cpp @@ -14,9 +14,9 @@ using tokenizers::Tokenizer; namespace { -std::string getTestDataDir() { - const char* envVar = std::getenv("TEST_DATA_DIR"); - return (envVar) ? std::string(envVar) : std::string(TEST_DATA_DIR); +std::string get_test_data_dir() { + const char* env_var = std::getenv("TEST_DATA_DIR"); + return (env_var) ? std::string(env_var) : std::string(TEST_DATA_DIR); } std::string load_bytes_from_file(const std::string& path) { @@ -57,7 +57,7 @@ void test_tokenizer(std::unique_ptr tok, bool check_id_back = true) { } // namespace TEST(HuggingFaceTokenizer, ExampleUsage) { - auto blob = load_bytes_from_file(getTestDataDir() + "/tokenizer.json"); + auto blob = load_bytes_from_file(get_test_data_dir() + "/tokenizer.json"); auto tok = Tokenizer::FromBlobJSON(blob); test_tokenizer(std::move(tok), true); } diff --git a/tt-train/tests/serialization/model_serializer_test.cpp b/tt-train/tests/serialization/model_serializer_test.cpp new file mode 100644 index 00000000000..ed516a73320 --- /dev/null +++ b/tt-train/tests/serialization/model_serializer_test.cpp @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "models/gpt2.hpp" +#include "models/mlp.hpp" + +TEST(MultiLayerPerceptronParametersTest, BasicReadWrite) { + // Original configuration + ttml::modules::MultiLayerPerceptronParameters original_config; + 
original_config.input_features = 16; + original_config.hidden_features = {32, 64, 32}; + original_config.output_features = 8; + + // Write to YAML + YAML::Node yaml_node = ttml::models::mlp::write_config(original_config); + + // Read from YAML + ttml::modules::MultiLayerPerceptronParameters read_config_result = ttml::models::mlp::read_config(yaml_node); + + // Assertions to verify correctness + EXPECT_EQ(original_config.input_features, read_config_result.input_features); + EXPECT_EQ(original_config.hidden_features, read_config_result.hidden_features); + EXPECT_EQ(original_config.output_features, read_config_result.output_features); +} + +TEST(MultiLayerPerceptronParametersTest, MissingFields) { + // YAML configuration with missing 'hidden_features' + YAML::Node yaml_node; + yaml_node["input_features"] = 16; + // yaml_node["hidden_features"] is intentionally omitted + yaml_node["output_features"] = 8; + + EXPECT_THROW( + { + ttml::modules::MultiLayerPerceptronParameters read_config_result = + ttml::models::mlp::read_config(yaml_node); + }, + YAML::Exception); +} + +// Test 3: Handling of Invalid Data Types in YAML Configuration +TEST(MultiLayerPerceptronParametersTest, InvalidDataTypes) { + // YAML configuration with invalid data types + YAML::Node yaml_node; + yaml_node["input_features"] = "sixteen"; // Should be uint32_t + yaml_node["hidden_features"] = "invalid_type"; // Should be std::vector + yaml_node["output_features"] = 8; + + EXPECT_THROW( + { + ttml::modules::MultiLayerPerceptronParameters read_config_result = + ttml::models::mlp::read_config(yaml_node); + }, + YAML::Exception); +} + +TEST(TransformerConfigTest, BasicReadWrite) { + // Original configuration + ttml::models::gpt2::TransformerConfig original_config; + original_config.num_heads = 8; + original_config.embedding_dim = 512; + original_config.dropout_prob = 0.1f; + original_config.num_blocks = 6; + original_config.vocab_size = 10000; + original_config.max_sequence_length = 512; + + // Write to 
YAML + YAML::Node yaml_node = ttml::models::gpt2::write_config(original_config); + + // Read from YAML + auto read_config_result = ttml::models::gpt2::read_config(yaml_node); + + // Assertions to verify correctness + EXPECT_EQ(original_config.num_heads, read_config_result.num_heads); + EXPECT_EQ(original_config.embedding_dim, read_config_result.embedding_dim); + EXPECT_FLOAT_EQ(original_config.dropout_prob, read_config_result.dropout_prob); + EXPECT_EQ(original_config.num_blocks, read_config_result.num_blocks); + EXPECT_EQ(original_config.vocab_size, read_config_result.vocab_size); + EXPECT_EQ(original_config.max_sequence_length, read_config_result.max_sequence_length); +} + +TEST(TransformerConfigTest, MissingFields) { + // YAML configuration with missing 'dropout_prob' + YAML::Node yaml_node; + yaml_node["num_heads"] = 8; + yaml_node["embedding_dim"] = 512; + // yaml_node["dropout_prob"] is intentionally omitted + yaml_node["num_blocks"] = 6; + yaml_node["vocab_size"] = 10000; + yaml_node["max_sequence_length"] = 512; + + EXPECT_THROW({ auto read_config_result = ttml::models::gpt2::read_config(yaml_node); }, YAML::Exception); +} + +TEST(TransformerConfigTest, InvalidDataTypes) { + // YAML configuration with invalid data types + YAML::Node yaml_node; + yaml_node["num_heads"] = "eight"; // Should be uint32_t + yaml_node["embedding_dim"] = "five hundred twelve"; // Should be uint32_t + yaml_node["dropout_prob"] = "zero point one"; // Should be float + yaml_node["num_blocks"] = 6; + yaml_node["vocab_size"] = 10000; + yaml_node["max_sequence_length"] = 512; + + EXPECT_THROW({ auto read_config_result = ttml::models::gpt2::read_config(yaml_node); }, YAML::Exception); +} diff --git a/tt-train/tests/serialization/tensor_serializer_test.cpp b/tt-train/tests/serialization/tensor_serializer_test.cpp index cd39ecd9e5f..9e38502e83d 100644 --- a/tt-train/tests/serialization/tensor_serializer_test.cpp +++ b/tt-train/tests/serialization/tensor_serializer_test.cpp @@ -70,7 +70,7 
@@ TEST_F(TensorFileTest, SerializeDeserializeNamedParameters) { ttml::serialization::MsgPackFile serializer; auto* device = &ttml::autograd::ctx().get_device(); auto model_params = ttml::modules::MultiLayerPerceptronParameters{ - .m_input_features = 128, .m_hidden_features = {256}, .m_output_features = 10}; + .input_features = 128, .hidden_features = {256}, .output_features = 10}; ttml::modules::MultiLayerPerceptron mlp_to_write(model_params); ttml::modules::MultiLayerPerceptron mlp_to_read(model_params); // Write tensor to file diff --git a/tt-train/tests/tokenizers/bpe_tokenizer_test.cpp b/tt-train/tests/tokenizers/bpe_tokenizer_test.cpp index 1def71591d5..639a428388c 100644 --- a/tt-train/tests/tokenizers/bpe_tokenizer_test.cpp +++ b/tt-train/tests/tokenizers/bpe_tokenizer_test.cpp @@ -13,15 +13,15 @@ using namespace ttml::tokenizers; namespace { -std::string getTestDataDir() { - const char* envVar = std::getenv("TEST_DATA_DIR"); - return (envVar) ? std::string(envVar) : std::string(TEST_DATA_DIR); -} +std::string get_test_data_dir() { + const char* env_var = std::getenv("TEST_DATA_DIR"); + return (env_var) ? std::string(env_var) : std::string(TEST_DATA_DIR); } +} // namespace class BPETokenizerTest : public ::testing::Test { protected: - BPETokenizer tokenizer = BPETokenizer(getTestDataDir() + "/tokenizer.json"); + BPETokenizer tokenizer = BPETokenizer(get_test_data_dir() + "/tokenizer.json"); }; TEST_F(BPETokenizerTest, EncodeAndDecode) {