forked from contrebande-labs/charred
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonitoring.py
76 lines (60 loc) · 1.86 KB
/
monitoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import wandb
import jax
def wandb_init(args):
    """Start a Weights & Biases run and declare the metric axes.

    The run is created under entity ``charred`` / project ``charred`` with
    job type ``train``; ``args`` is forwarded unchanged as the run config.
    The JAX device count is then added to the config as ``num_devices``.

    Args:
        args: Value passed as ``config`` to ``wandb.init``.
    """
    wandb.init(
        entity="charred",
        project="charred",
        job_type="train",
        config=args,
    )

    device_info = {"num_devices": jax.device_count()}
    wandb.config.update(device_info)

    # (metric name or glob, metric used as its step axis), in declaration order.
    metric_axes = (
        ("*", "train/global_step"),
        ("train/global_step", "walltime"),
        ("train/epoch", "train/global_step"),
        ("train/secs_per_epoch", "train/epoch"),
    )
    for metric_name, axis_name in metric_axes:
        wandb.define_metric(metric_name, step_metric=axis_name)

    print("WandB setup...")
def wandb_close():
    """Finish the active Weights & Biases run and print a confirmation."""
    wandb.finish()

    print("WandB closed...")
def wandb_log_step(
    global_walltime,
    epoch_steps,
    global_training_steps,
    delta_time,
    epoch,
    unreplicated_train_metric,
):
    """Log the metrics of one training step to Weights & Biases.

    Args:
        global_walltime: Logged as ``walltime``.
        epoch_steps: Logged as ``train/step``.
        global_training_steps: Logged as ``train/global_step``.
        delta_time: Duration of the step; its reciprocal is logged as
            ``train/steps_per_sec``. When it is not strictly positive the
            rate is undefined and that metric is left out of this record
            instead of raising ``ZeroDivisionError``.
        epoch: Logged as ``train/epoch``.
        unreplicated_train_metric: Mapping of metric name to value; each
            entry is logged under ``train/<name>``. Entries are merged last,
            so a name that collides with one of the fixed keys overrides it.
    """
    record = {
        "walltime": global_walltime,
        "train/step": epoch_steps,
        "train/global_step": global_training_steps,
        "train/epoch": epoch,
    }

    # A coarse clock or a very fast step can yield a zero duration; a logging
    # helper must not abort training over that, so the rate is skipped.
    if delta_time > 0:
        record["train/steps_per_sec"] = 1 / delta_time

    record.update(
        {f"train/{k}": v for k, v in unreplicated_train_metric.items()}
    )

    wandb.log(data=record, commit=True)
def wandb_log_epoch(epoch_walltime, global_training_steps):
    """Log the duration of a finished epoch to Weights & Biases.

    Args:
        epoch_walltime: Logged as ``train/secs_per_epoch``.
        global_training_steps: Logged as ``train/global_step``.
    """
    epoch_record = {
        "train/secs_per_epoch": epoch_walltime,
        "train/global_step": global_training_steps,
    }
    wandb.log(data=epoch_record, commit=True)
def wandb_log_validation(image_logs):
    """Log validation images to Weights & Biases under ``validation``.

    For every entry, the ``validation_image`` is added first with the caption
    ``"target image"``, followed by each item of ``images`` captioned with the
    entry's ``validation_prompt``. All entries are sent in a single log call.

    Args:
        image_logs: Iterable of mappings that each provide the keys
            ``images``, ``validation_prompt`` and ``validation_image``.
    """
    gallery = []
    for entry in image_logs:
        samples = entry["images"]
        prompt = entry["validation_prompt"]
        target = entry["validation_image"]

        gallery.append(wandb.Image(target, caption="target image"))
        gallery.extend(wandb.Image(sample, caption=prompt) for sample in samples)

    wandb.log({"validation": gallery})