From 84ab65b5a24c92e07b4dab1f7f315cd7cb3f2541 Mon Sep 17 00:00:00 2001
From: "Jian Zhang (James)" <6593865@qq.com>
Date: Thu, 11 Jan 2024 08:39:05 -0800
Subject: [PATCH] [Enhancement] change the input argument of GSTaskTrackerAbc to be an integer (#699)

*Issue #, if available:*

*Description of changes:*

- This PR changes the input argument of `GSTaskTrackerAbc` from a `GSConfig` object to an integer, because `GSTaskTrackerAbc` only needs an integer to set its `log_report_frequency` attribute.
- Requiring a `GSConfig` object prevents users from using a task tracker to monitor a running process, because creating a `GSConfig` is not publicly supported and is very complex.
- Decoupling `GSTaskTracker` from `GSConfig` lets users construct task trackers themselves and use them with the GraphStorm programming APIs.

By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.

---------

Co-authored-by: Ubuntu
Co-authored-by: xiang song(charlie.song)
---
 docs/source/advanced/own-models.rst              |  2 +-
 docs/source/configuration/configuration-run.rst  |  7 +------
 examples/customized_models/HGT/hgt_nc.py         |  2 +-
 examples/peft_llm_gnn/main_nc.py                 |  2 +-
 examples/peft_llm_gnn/nc_config_Video_Games.yaml |  3 ---
 python/graphstorm/gsf.py                         |  2 +-
 python/graphstorm/tracker/graphstorm_tracker.py  | 10 ++++++----
 python/graphstorm/tracker/sagemaker_tracker.py   |  7 +++++--
 8 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/docs/source/advanced/own-models.rst b/docs/source/advanced/own-models.rst
index 3b9d9eb8c5..bde3b66d46 100644
--- a/docs/source/advanced/own-models.rst
+++ b/docs/source/advanced/own-models.rst
@@ -272,7 +272,7 @@ The GraphStorm trainers can have evaluators and task trackers associated. The fo
                                   config.early_stop_strategy)
     trainer.setup_evaluator(evaluator)
     # Optional: set up a task tracker to show the progress of training.
-    tracker = GSSageMakerTaskTracker(config)
+    tracker = GSSageMakerTaskTracker(config.eval_frequency)
     trainer.setup_task_tracker(tracker)
 
 GraphStorm's `evaluators `_ could help to compute the required evaluation metrics, such as ``accuracy``, ``f1``, ``mrr``, and etc. Users can select the proper evaluator and use the trainer's ``setup_evaluator()`` method to attach them. GraphStorm's `task trackers `_ serve as log collectors, which are used to show the process information.
diff --git a/docs/source/configuration/configuration-run.rst b/docs/source/configuration/configuration-run.rst
index f0c1afa854..276e97d08d 100644
--- a/docs/source/configuration/configuration-run.rst
+++ b/docs/source/configuration/configuration-run.rst
@@ -126,11 +126,6 @@ GraphStorm provides a set of parameters to control how and where to save and res
   - Yaml: ``task_tracker: sagemaker_task_tracker``
   - Argument: ``--task_tracker sagemaker_task_tracker``
   - Default value: ``sagemaker_task_tracker``
-- **log_report_frequency**: The frequency of reporting model performance metrics through task_tracker. The frequency is defined by using number of iterations, i.e., every N iterations the evaluation metrics will be reported. (Please note the evaluation metrics should be generated at the reporting iteration. See "eval_frequency" for how evaluation frequency is controlled.)
-
-  - Yaml: ``log_report_frequency: 1000``
-  - Argument: ``--log-report-frequency 1000``
-  - Default value: ``1000``
 - **restore_model_path**: A path where GraphStorm model parameters were saved.
   For training, if restore_model_path is set, GraphStom will retrieve the model parameters from restore_model_path instead of initializing the parameters. For inference, restore_model_path must be provided.
 
   - Yaml: ``restore_model_path: /model/checkpoint/``
@@ -278,7 +273,7 @@ GraphStorm provides a set of parameters to control model evaluation.
   - Yaml: ``use_mini_batch_infer: false``
   - Argument: ``--use-mini-batch-infer false``
   - Default value: ``true``
-- **eval_frequency**: The frequency of doing evaluation. GraphStorm trainers do evaluation at the end of each epoch. However, for large-scale graphs, training one epoch may take hundreds of thousands of iterations. One may want to do evaluations in the middle of an epoch. When eval_frequency is set, every **eval_frequency** iterations, the trainer will do evaluation once. The evaluation results can be printed and reported. See **log_report_frequency** for more details.
+- **eval_frequency**: The frequency of doing evaluation. GraphStorm trainers do evaluation at the end of each epoch. However, for large-scale graphs, training one epoch may take hundreds of thousands of iterations. One may want to do evaluations in the middle of an epoch. When eval_frequency is set, every **eval_frequency** iterations, the trainer will do evaluation once. The evaluation results can be printed and reported.
 
   - Yaml: ``eval_frequency: 10000``
   - Argument: ``--eval-frequency 10000``
diff --git a/examples/customized_models/HGT/hgt_nc.py b/examples/customized_models/HGT/hgt_nc.py
index 6da88e6870..debb185858 100644
--- a/examples/customized_models/HGT/hgt_nc.py
+++ b/examples/customized_models/HGT/hgt_nc.py
@@ -335,7 +335,7 @@ def main(args):
                                   config.early_stop_strategy)
     trainer.setup_evaluator(evaluator)
     # Optional: set up a task tracker to show the progress of training.
-    tracker = GSSageMakerTaskTracker(config)
+    tracker = GSSageMakerTaskTracker(config.eval_frequency)
     trainer.setup_task_tracker(tracker)
 
     # Start the training process.
diff --git a/examples/peft_llm_gnn/main_nc.py b/examples/peft_llm_gnn/main_nc.py
index ae45ffeb1f..6fcd7237f9 100644
--- a/examples/peft_llm_gnn/main_nc.py
+++ b/examples/peft_llm_gnn/main_nc.py
@@ -62,7 +62,7 @@ def main(config_args):
         config.early_stop_strategy,
     )
     trainer.setup_evaluator(evaluator)
-    tracker = GSSageMakerTaskTracker(config)
+    tracker = GSSageMakerTaskTracker(config.eval_frequency)
     trainer.setup_task_tracker(tracker)
 
     # create train loader
diff --git a/examples/peft_llm_gnn/nc_config_Video_Games.yaml b/examples/peft_llm_gnn/nc_config_Video_Games.yaml
index 6a2d0129dd..626553d8c9 100644
--- a/examples/peft_llm_gnn/nc_config_Video_Games.yaml
+++ b/examples/peft_llm_gnn/nc_config_Video_Games.yaml
@@ -19,11 +19,8 @@ gsf:
     batch_size: 4
     dropout: 0.1
     eval_batch_size: 4
-    # eval_frequency: 100
-    #log_report_frequency: 50
     lr: 0.0001
     num_epochs: 10
-    # save_model_frequency: 300
     wd_l2norm: 1.0e-06
   input:
     restore_model_path: null
diff --git a/python/graphstorm/gsf.py b/python/graphstorm/gsf.py
index 124ed576a6..97dabd1164 100644
--- a/python/graphstorm/gsf.py
+++ b/python/graphstorm/gsf.py
@@ -656,4 +656,4 @@ def check_homo(g):
 
 def create_builtin_task_tracker(config):
     tracker_class = get_task_tracker_class(config.task_tracker)
-    return tracker_class(config)
+    return tracker_class(config.eval_frequency)
diff --git a/python/graphstorm/tracker/graphstorm_tracker.py b/python/graphstorm/tracker/graphstorm_tracker.py
index ece0955e84..a9e0c6055c 100644
--- a/python/graphstorm/tracker/graphstorm_tracker.py
+++ b/python/graphstorm/tracker/graphstorm_tracker.py
@@ -22,11 +22,13 @@ class GSTaskTrackerAbc():
 
     Parameters
     ----------
-    config: GSConfig
-        Configurations. Users can add their own configures in the yaml config file.
+    log_report_frequency: int
+        The frequency of reporting model performance metrics through task_tracker.
+        The frequency is defined by using number of iterations, i.e., every N iterations
+        the evaluation metrics will be reported.
     """
-    def __init__(self, config):
-        self._report_frequency = config.log_report_frequency # Can be None if not provided
+    def __init__(self, log_report_frequency):
+        self._report_frequency = log_report_frequency # Can be None if not provided
 
     @abc.abstractmethod
     def log_metric(self, metric_name, metric_value, step, force_report=False):
diff --git a/python/graphstorm/tracker/sagemaker_tracker.py b/python/graphstorm/tracker/sagemaker_tracker.py
index 06276943bd..3ae5fd5a8a 100644
--- a/python/graphstorm/tracker/sagemaker_tracker.py
+++ b/python/graphstorm/tracker/sagemaker_tracker.py
@@ -25,8 +25,11 @@ class GSSageMakerTaskTracker(GSTaskTrackerAbc):
 
     Parameters
     ----------
-    config: GSConfig
-        Configurations. Users can add their own configures in the yaml config file.
+    log_report_frequency: int
+        The frequency of reporting model performance metrics through task_tracker.
+        The frequency is defined by using number of iterations, i.e., every N iterations
+        the evaluation metrics will be reported.
+
     """
 
     def _do_report(self, step):
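
For reference, a minimal usage sketch of what this change enables (not part of the patch itself). It assumes `GSSageMakerTaskTracker` is importable from `graphstorm.tracker`, that the SageMaker tracker inherits the new single-integer constructor from `GSTaskTrackerAbc`, and that reporting happens when the step is a multiple of the frequency:

```python
from graphstorm.tracker import GSSageMakerTaskTracker

# With this change the tracker only needs the reporting frequency (an int),
# not a full GSConfig object, so it can be built directly in user code.
tracker = GSSageMakerTaskTracker(100)  # report metrics every 100 iterations

# log_metric() follows the GSTaskTrackerAbc signature shown in the diff above;
# step 200 is a multiple of the frequency, so it should be a reporting step.
tracker.log_metric("accuracy", 0.93, step=200)

# Attaching the tracker to a trainer works as in the examples touched above:
#     trainer.setup_task_tracker(tracker)
```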