From c3b49e82b384905d863191158994f00668ff60fc Mon Sep 17 00:00:00 2001
From: Benjamin Blundell
Date: Fri, 1 Oct 2021 15:24:35 +0100
Subject: [PATCH] Added sigma dora, minor fixes and energy usage

---
 README.markdown      | 20 +++++++-------
 eval.py              | 42 ++++++++++--------------
 net/net.py           |  4 +--
 run.py               |  7 ++---
 run/run.conf.example |  2 +-
 run/sigma_dora.csv   |  1 +
 train.py             | 62 ++++----------------------------------
 util/loadsave.py     | 62 +++++++++++++++++++-------------------
 8 files changed, 63 insertions(+), 137 deletions(-)
 create mode 100644 run/sigma_dora.csv

diff --git a/README.markdown b/README.markdown
index b6eb809..6545d9c 100644
--- a/README.markdown
+++ b/README.markdown
@@ -45,6 +45,11 @@ If you want some fancy formatted results or if you are running the tl;dr script,
 * [Imagemagick](https://imagemagick.org/index.php)
 * [ffmpeg](https://ffmpeg.org/)
 
+Finally, in order to view the model the network has derived, you'll need a program capable of displaying obj or ply files. These are pretty numerous, but here are a few we use:
+
+* [meshlab](https://www.meshlab.net/)
+* [blender](https://www.blender.org/)
+
 Chances are, if you are running a Linux distribution, you will have these already and if not, they'll be in your repository management system (like apt or pacman).
 
 All of the python requirements are listed in the requirements.txt file (there aren't too many).
@@ -175,6 +180,10 @@ The data used in the paper comes from [Suliana Manley's research group](https://
 ... filling the missing steps with the other sigma levels. This takes quite a while, even with only 10 augmentations. The pre-rendered data can be found on [Zenodo](https://zenodo.org/record/4751057) complete with the full instructions for generating it.
 
+### Energy Usage
+
+The estimated energy use to train a simulated model is 2.1 kWh, based on a measurement of 623.4 kWh over 166 days. In this period, 298 models were trained and evaluated. This was confirmed by cross-checking against the wattage of the GPU and the time spent to generate an average model.
+
 ## Outputs
 
 Once you have a trained network, you can generate the final outputs using the *generate_stats.sh* script found in the *run* directory.
@@ -232,12 +241,6 @@ When running train.py, there are a number of options one can choose.
     --lr
         Learning rate (default: 0.0004).
 
-    --mask-thresh
-        Threshold for what we consider in the loss (default: 0.05)
-
-    --plr
-        Learning rate for points (default: same as the learning rate).
-
     --spawn-rate
         Probability of spawning a point (default: 1.0).
@@ -268,9 +271,6 @@ When running train.py, there are a number of options one can choose.
     --normalise-basic
         Normalise with torch basic intensity divide.
 
-    --scheduler
-        Use a scheduler on the loss.
-
     --seed
         Random seed (default: 1).
@@ -295,7 +295,7 @@ When running train.py, there are a number of options one can choose.
     --load
         A checkpoint file to load in order to continue training.
 
-    "--savename
+    --savename
        The name for checkpoint save file.
 
     --savedir
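
The per-model figure in the Energy Usage section added above follows directly from the quoted totals. A quick arithmetic check (a standalone snippet, not part of the patch; the sequential-training assumption is ours):

# Sanity check of the Energy Usage figures quoted in the README addition.
total_kwh = 623.4    # measured energy use over the logging period
period_days = 166    # length of the logging period
num_models = 298     # models trained and evaluated in that period

kwh_per_model = total_kwh / num_models           # ~2.09 kWh, i.e. the quoted 2.1 kWh
hours_per_model = period_days * 24 / num_models  # ~13.4 h, assuming models ran one after another

print(f"{kwh_per_model:.2f} kWh and {hours_per_model:.1f} h per model")
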
diff --git a/eval.py b/eval.py
index 47d3823..901c2a7 100644
--- a/eval.py
+++ b/eval.py
@@ -31,7 +31,7 @@
 from util.math import VecRotTen, VecRot, TransTen, PointsTen
 
 
-def angle_eval(args, model, points, batch_size, device):
+def angle_eval(args, model, points, prev_args, device):
     """For every angle, save the in and out so we can assess where the
     network is failing."""
     xt = 0.0
@@ -90,7 +90,7 @@ def angle_eval(args, model, points, batch_size, device):
         save_image(result, args.savedir + "/" + "eval_in_" + str(idx).zfill(3) + ".jpg")
 
         target = result.reshape(1, 128, 128)
-        target = target.repeat(batch_size, 1, 1, 1)
+        target = target.repeat(prev_args.batch_size, 1, 1, 1)
         target = target.to(device)
         target = normaliser.normalise(target)
@@ -98,7 +98,7 @@ def angle_eval(args, model, points, batch_size, device):
         # and keep working out the gradient cos pytorch weirdness
         model.set_sigma(args.sigma)
         output = model.forward(target, points)
-        output = normaliser.normalise(output.reshape(batch_size, 1, 128, 128))
+        output = normaliser.normalise(output.reshape(prev_args.batch_size, 1, 128, 128))
         loss = F.l1_loss(output, target)
         output = torch.squeeze(output.cpu()[0])
@@ -109,7 +109,7 @@ def angle_eval(args, model, points, batch_size, device):
         S.write_immediate(loss, "eval_loss", 0, 0, idx)
 
 
-def basic_eval(args, model, points, batch_size, device):
+def basic_eval(args, model, points, prev_args, device):
     """ Our basic evaluation step. """
     xr = 0.0
     yr = 0.0
@@ -135,7 +135,7 @@ def basic_eval(args, model, points, batch_size, device):
     t = TransTen(xt, yt)
 
     normaliser = NormaliseNull()
-    if args.normalise_basic:
+    if prev_args.normalise_basic:
         normaliser = NormaliseTorch()
 
     # Setup our splatting pipeline which is added to both dataloader
@@ -154,7 +154,7 @@ def basic_eval(args, model, points, batch_size, device):
     save_image(result.clone().cpu(), args.savedir + "/" + "eval_single_in.jpg")
 
     target = result.reshape(1, 128, 128)
-    target = target.repeat(batch_size, 1, 1, 1)
+    target = target.repeat(prev_args.batch_size, 1, 1, 1)
     target = target.to(device)
     target = normaliser.normalise(target)
@@ -162,7 +162,7 @@ def basic_eval(args, model, points, batch_size, device):
     # and keep working out the gradient cos pytorch weirdness
     model.set_sigma(args.sigma)
     output = model.forward(target, points)
-    output = normaliser.normalise(output.reshape(batch_size, 1, 128, 128))
+    output = normaliser.normalise(output.reshape(prev_args.batch_size, 1, 128, 128))
     loss = F.l1_loss(output, target)
     print("Loss :", loss)
     print("Rotations returned:", model.get_rots())
@@ -205,28 +205,24 @@ def evaluate(args, device, animate=False):
     S.on(args.savedir)
     model = None
     points = None
+    model = load_model(args.savedir + "/model.tar", device)
 
     if os.path.isfile(args.savedir + "/" + args.savename):
-        (model, points) = load_checkpoint(
-            args.savedir, args.savename, device, evaluation=True
+        (model, points, _, _, _, _, prev_args) = load_checkpoint(
+            model, args.savedir, args.savename, device
         )
-        model = load_model(args.savedir + "/model.tar", device)
         model.to(device)
         print("Loaded model", model)
     else:
         print("Error - need to pass in a model")
         return
 
-    batch_size = model._final.size()[0]
-    print("Batch Size :", batch_size)
-
     model.eval()
     random.seed()
-    if args.predict_sigma:
-        model.predict_sigma = True
 
-    basic_eval(args, model, points, batch_size, device)
+    basic_eval(args, model, points, prev_args, device)
+
     if animate:
-        angle_eval(args, model, points, batch_size, device)
+        angle_eval(args, model, points, prev_args, device)
 
     S.close()
@@ -237,18 +233,6 @@ def evaluate(args, device, animate=False):
     parser.add_argument(
         "--no-cuda", action="store_true", default=False, help="disables CUDA training"
     )
-    parser.add_argument(
-        "--predict-sigma",
-        action="store_true",
-        default=False,
-        help="Does this model predict sigma",
-    )
-    parser.add_argument(
-        "--normalise-basic",
-        action="store_true",
-        default=False,
-        help="Use a basic normaliser",
-    )
     parser.add_argument(
         "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
     )
diff --git a/net/net.py b/net/net.py
index 97880af..a613647 100755
--- a/net/net.py
+++ b/net/net.py
@@ -277,9 +277,9 @@ def forward(self, target: torch.Tensor, points: PointsTen):
 
         nx = 2
         if self.predict_translate:
+            tx = (torch.tanh(rot[nx+1]) * 2.0) * self.max_shift
+            ty = (torch.tanh(rot[nx+2]) * 2.0) * self.max_shift
             nx += 2
-            tx = (torch.tanh(rot[3]) * 2.0) * self.max_shift
-            ty = (torch.tanh(rot[4]) * 2.0) * self.max_shift
 
         sp = nn.Softplus(threshold=12)
         final_sigma = self.sigma
diff --git a/run.py b/run.py
index c0e5f8e..95cf408 100644
--- a/run.py
+++ b/run.py
@@ -129,11 +129,12 @@ def image_test(model, points, device, sigma, input_image):
     if args.load and os.path.isfile(args.load + "/checkpoint.pth.tar"):
         # (savedir, savename) = os.path.split(args.load)
         # print(savedir, savename)
-        (model, points) = load_checkpoint(
-            args.load, "checkpoint.pth.tar", device, evaluation=True
-        )
         model = load_model(args.load + "/model.tar")
+        (model, points, _, _, _, _, prev_args) = load_checkpoint(
+            args.load, "checkpoint.pth.tar", device
+        )
         model = model.to(device)
+        model.eval()
     else:
         print("--load must point to a run directory.")
         sys.exit(0)
diff --git a/run/run.conf.example b/run/run.conf.example
index 0bac918..020d012 100644
--- a/run/run.conf.example
+++ b/run/run.conf.example
@@ -5,4 +5,4 @@
 # Execute the following commands to get the code associated with this run:
 #git clone gogs@git.benjamin.computer:oni/shaper.git
 #git reset --hard d94277d56121287a7417153c5ce579d7be7e525
-python3 ../train.py --savedir ../../runs/temp --save-interval 400 --train-size 100 --test-size 40 --valid-size 10 --objpath ../objs/teapot.obj --no-cuda --buffer-size 200 --epochs 20 --batch-size 2 --log-interval 100 --num-points 50 --lr 0.0004 --sigma-file sigma.csv
+python ../train.py --savedir ../../runs/temp --save-interval 400 --train-size 100 --test-size 40 --valid-size 10 --objpath ../objs/teapot_large.obj --no-cuda --buffer-size 200 --epochs 20 --batch-size 2 --log-interval 100 --num-points 50 --lr 0.0004 --sigma-file sigma.csv
\ No newline at end of file
diff --git a/run/sigma_dora.csv b/run/sigma_dora.csv
new file mode 100644
index 0000000..939536e
--- /dev/null
+++ b/run/sigma_dora.csv
@@ -0,0 +1 @@
+2.8,2.8,2.52,2.52,2.27,2.27,2.0,2.0,1.8,1.8,1.6,1.6,1.2,1.2,1.0,1.0,0.0,0.0,0.0,0.0
diff --git a/train.py b/train.py
index d4824af..6cced18 100644
--- a/train.py
+++ b/train.py
@@ -18,7 +18,6 @@
 import torch.nn.functional as F
 import torch.nn as nn
 import torch.optim as optim
-from torch.optim.lr_scheduler import ReduceLROnPlateau
 import numpy as np
 import math
 import random
@@ -39,14 +38,12 @@
 from util.math import PointsTen
 
 
-def calculate_loss(args, target: torch.Tensor, output: torch.Tensor):
+def calculate_loss(target: torch.Tensor, output: torch.Tensor):
     """
     Our loss function, used in train and test functions.
 
     Parameters
     ----------
-    args : dict
-        The arguments object created in __main__
 
     target : torch.Tensor
         The target, properly shaped.
@@ -60,17 +57,7 @@ def calculate_loss(args, target: torch.Tensor, output: torch.Tensor):
 
     A loss object
     """
-    # Removed the masked loss and stuck with the basic one as it gives a cleaner
-    # final model and didn't really fix the double headed problem.
-    #target_masked = (target > args.mask_thresh).float()
-    #target = torch.mul(target, target_masked)
-
-    #output = output.reshape(args.batch_size, 1, args.image_size, args.image_size)
-
-    #masked_output = torch.mul(output, target_masked)
-    #loss = F.l1_loss(masked_output, target, reduction="sum")
     loss = F.l1_loss(output, target, reduction="sum")
-
     return loss
@@ -232,7 +219,6 @@ def train(
     buffer_test,
     data_loader,
     optimiser,
-    use_scheduler=False,
 ):
     """
     Now we've had some setup, lets do the actual training.
@@ -262,14 +248,6 @@ def train(
 
     """
     model.train()
-    scheduler = ReduceLROnPlateau(
-        optimiser,
-        mode="min",
-        factor=0.1,
-        patience=10,
-        threshold=0.0001,
-        threshold_mode="abs",
-    )
 
     # Which normalisation are we using?
     normaliser = NormaliseNull()
@@ -304,7 +282,7 @@ def train(
 
             output = normaliser.normalise(model(target_shaped, points))
 
-            loss = calculate_loss(args, target_shaped, output)
+            loss = calculate_loss(target_shaped, output)
             loss.backward()
             lossy = loss.item()
             optimiser.step()
@@ -354,14 +332,13 @@ def train(
                         epoch,
                         batch_idx,
                         loss,
+                        sigma,
+                        args,
                         args.savedir,
                         args.savename,
                     )
                     buffer_train.set.shuffle()
 
-        # TODO - This loss should be on the validation set but for now...
-        if use_scheduler:
-            scheduler.step(loss)
 
     # Save a final points file once training is complete
     S.save_points(points, args.savedir, epoch, batch_idx)
@@ -511,12 +488,8 @@ def init(args, device):
     set_train.save(args.savedir + "/train_set.pickle")
     data_loader.save(args.savedir + "/train_data.pickle")
 
-    plr = args.lr
-    if args.plr is not None:
-        plr = args.plr
     variables = []
     variables.append({"params": model.parameters()})
-    variables.append({"params": points.data, "lr": plr})
     optimiser = optim.AdamW(variables, lr=args.lr)
     print("Starting new model")
@@ -530,8 +503,7 @@ def init(args, device):
         buffer_train,
         buffer_test,
         data_loader,
-        optimiser,
-        use_scheduler=args.scheduler
+        optimiser
     )
 
     save_model(model, args.savedir + "/model.tar")
@@ -545,7 +517,6 @@ def init(args, device):
         "--batch-size",
         type=int,
         default=20,
-        metavar="N",
         help="input batch size for training \
                           (default: 20)",
     )
@@ -553,30 +524,14 @@ def init(args, device):
         "--epochs",
         type=int,
         default=10,
-        metavar="N",
         help="number of epochs to train (default: 10)",
     )
     parser.add_argument(
         "--lr",
         type=float,
         default=0.0004,
-        metavar="LR",
         help="learning rate (default: 0.0004)",
     )
-    parser.add_argument(
-        "--mask-thresh",
-        type=float,
-        default=0.05,
-        metavar="LR",
-        help="Threshold for what we consider in the loss \
-                          (default: 0.05)",
-    )
-    parser.add_argument(
-        "--plr",
-        type=float,
-        default=None,
-        help="Learning rate for points (default: The same as the learning rate).",
-    )
     parser.add_argument(
         "--spawn-rate",
         type=float,
@@ -646,12 +601,6 @@ def init(args, device):
         default=False,
         help="Normalise with torch basic intensity divide",
     )
-    parser.add_argument(
-        "--scheduler",
-        action="store_true",
-        default=False,
-        help="Use a scheduler on the loss.",
-    )
     parser.add_argument(
         "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
    )
@@ -700,7 +649,6 @@ def init(args, device):
         "--save-interval",
         type=int,
         default=1000,
-        metavar="N",
         help="how many batches to wait before saving.",
     )
     parser.add_argument(
diff --git a/util/loadsave.py b/util/loadsave.py
index 71e0666..d3a3967 100644
--- a/util/loadsave.py
+++ b/util/loadsave.py
@@ -12,13 +12,10 @@
 
 import torch
 import torch.optim as optim
-import math
-from net.net import Net
-from net.renderer import Splat
 
 
 def save_checkpoint(
-    model, points, optimiser, epoch, batch_idx, loss, savedir, savename
+    model, points, optimiser, epoch, batch_idx, loss, sigma, args, savedir, savename
 ):
     """
     Saving a checkpoint out along with optimizer and other useful
@@ -39,6 +36,10 @@
         The batch_idx we got to during training
     loss:
         The current loss
+    sigma : float
+        The current sigma
+    args : args object
+        The args object this model was running with
     savedir : str
         The path to save to
     savename: str
@@ -56,7 +57,9 @@
             "model_state_dict": model.state_dict(),
             "points": points,
             "batch_idx": batch_idx,
-            "optimizer_state_dict": optimiser.state_dict(),
+            "sigma": sigma,
+            "args": args,
+            "optimiser_state_dict": optimiser.state_dict(),
             "loss": loss,
         },
         savedir + "/" + savename,
@@ -83,58 +86,47 @@ def save_model(model, path):
 
 
 def load_checkpoint(
-    savedir, savename, device, lr=0.0004, evaluation=False, predict_sigma=False
+    model, savedir, savename, device="cpu"
 ):
     """Load our checkpoint, given the full path to the checkpoint.
-    We can load for eval or continue training, so sometimes we ignore
-    the optimizer.
+    A model must be loaded and passed in already. We set the parameters
+    of this model from those stored in the checkpoint.
 
     Parameters
     ----------
+    model : Model
+        A model created blank or loaded with load_model
    savedir : str
         The directory of the save files
     savename : str
         The name of the save file
-    device : str
-        CUDA or cpu?
-    lr : float
-        The learning rate (default 0.0004)
-    evaluation: book
-        Load in evaluation mode (default False)
-    predict_sigma: bool
-        Does this model predict sigma
-
+    device :
+        We must pass the device in
 
     Returns
     -------
-    None
+    tuple

     """
-    splat = Splat(math.radians(90), 1.0, 1.0, 10.0, device=device)
-    model = Net(splat, predict_sigma=predict_sigma)
     checkpoint = torch.load(savedir + "/" + savename, map_location=device)
-    if hasattr(checkpoint, "model_state_dict"):
-        model.load_state_dict(checkpoint["model_state_dict"])
-    elif hasattr(checkpoint, "model_main_state_dict"):
-        # older versions had model_main
-        model.load_state_dict(checkpoint["model_main_state_dict"])
-
-    points = checkpoint["points"]
-    points = points.data.to(device)
-
-    model = model.to(device)
-
-    if evaluation is True:
-        return (model, points)
+    model.load_state_dict(checkpoint["model_state_dict"])
 
-    optimizer = optim.Adam(model.parameters(), lr=lr)
     # this line seems to fail things :/
     # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
     epoch = checkpoint["epoch"]
     loss = checkpoint["loss"]
     batch_idx = checkpoint["batch_idx"]
+    args = checkpoint["args"]
+    points = checkpoint["points"]
+    optimiser = optim.Adam(model.parameters(), lr=args.lr)
+
+    points = points.data.to(device)
+    model = model.to(device)
+    model.predict_sigma = args.predict_sigma
+    model.predict_translate = not args.no_translate
+    model.max_trans = args.max_trans
 
-    return (model, points, optimizer, epoch, batch_idx, loss)
+    return (model, points, optimiser, epoch, batch_idx, loss, args)
 
 
 def load_model(path, device="cpu"):
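
For reference, after this patch a checkpoint is restored in two steps: the network is first rebuilt with load_model and then populated by the new load_checkpoint, which also returns the args the model was trained with (eval.py and run.py both use this pattern). Below is a minimal sketch of that calling convention; the run directory and device choice are illustrative only, not part of the repository:

# Sketch of the post-patch checkpoint API in util/loadsave.py.
# The run directory below is hypothetical; the signatures follow the patch.
import torch
from util.loadsave import load_model, load_checkpoint

device = "cuda" if torch.cuda.is_available() else "cpu"
savedir = "../runs/temp"  # hypothetical run directory holding model.tar and checkpoint.pth.tar

# Rebuild the network structure, then fill in the weights, points, optimiser,
# training position and the original training args from the checkpoint.
model = load_model(savedir + "/model.tar", device)
(model, points, optimiser, epoch, batch_idx, loss, prev_args) = load_checkpoint(
    model, savedir, "checkpoint.pth.tar", device
)
model.eval()

# Settings that previously had to be repeated on the eval command line
# (batch size, normalisation, sigma prediction) now come from the stored args.
batch_size = prev_args.batch_size

On the saving side, save_checkpoint(model, points, optimiser, epoch, batch_idx, loss, sigma, args, savedir, savename) now records the current sigma and the full args object alongside the optimiser state, which is what makes the restore above possible.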