Added sigma dora, minor fixes and energy usage
OniDaito committed Oct 1, 2021
1 parent 8f5ff2c commit c3b49e8
Showing 8 changed files with 63 additions and 137 deletions.
20 changes: 10 additions & 10 deletions README.markdown
@@ -45,6 +45,11 @@ If you want some fancy formatted results or if you are running the tl;dr script,
* [Imagemagick](https://imagemagick.org/index.php)
* [ffmpeg](https://ffmpeg.org/)

Finally, in order to view the model the network has derived, you'll need a program capable of displaying obj or ply files. These are pretty numerous, but here are a few we use:

* [meshlab](https://www.meshlab.net/)
* [blender](https://www.blender.org/)

Chances are, if you are running a Linux distribution, you will have these already; if not, they'll be available from your package manager (such as apt or pacman).

All of the python requirements are listed in the requirements.txt file (there aren't too many).
@@ -175,6 +180,10 @@ The data used in the paper comes from [Suliana Manley's research group](https://

... filling the missing steps with the other sigma levels. This takes quite a while, even with only 10 augmentations. The pre-rendered data can be found on [Zenodo](https://zenodo.org/record/4751057), complete with full instructions for generating it.

### Energy Usage

The estimated energy use for training a simulated model is 2.1 kWh, based on a measurement of 623.4 kWh over 166 days, during which 298 models were trained and evaluated. This figure was confirmed by cross-checking against the wattage of the GPU and the time taken to generate an average model.
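
A quick sanity check of the per-model figure, which is simply the measured total divided by the number of models trained in that period:

```python
# Rough arithmetic check of the per-model energy figure quoted above.
total_energy_kwh = 623.4   # metered energy over the 166-day period
models_trained = 298       # models trained and evaluated in that period

energy_per_model_kwh = total_energy_kwh / models_trained
print(f"{energy_per_model_kwh:.2f} kWh per model")  # ~2.09 kWh, i.e. roughly 2.1 kWh
```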

## Outputs

Once you have a trained network, you can generate the final outputs using the *generate_stats.sh* script found in the *run* directory.
@@ -232,12 +241,6 @@ When running train.py, there are a number of options one can choose.
--lr
Learning rate (default: 0.0004).

--mask-thresh
Threshold for what we consider in the loss (default: 0.05)

--plr
Learning rate for points (default: same as the learning rate).

--spawn-rate
Probability of spawning a point (default: 1.0).

@@ -268,9 +271,6 @@ When running train.py, there are a number of options one can choose.
--normalise-basic
Normalise with torch basic intensity divide.

--scheduler
Use a scheduler on the loss.

--seed
Random seed (default: 1).

@@ -295,7 +295,7 @@ When running train.py, there are a number of options one can choose.
--load
A checkpoint file to load in order to continue training.

"--savename
--savename
The name for checkpoint save file.

--savedir
42 changes: 13 additions & 29 deletions eval.py
@@ -31,7 +31,7 @@
from util.math import VecRotTen, VecRot, TransTen, PointsTen


def angle_eval(args, model, points, batch_size, device):
def angle_eval(args, model, points, prev_args, device):
"""For every angle, save the in and out so we can assess where the
network is failing."""
xt = 0.0
@@ -90,15 +90,15 @@ def angle_eval(args, model, points, batch_size, device):
save_image(result, args.savedir + "/" + "eval_in_" + str(idx).zfill(3) + ".jpg")

target = result.reshape(1, 128, 128)
target = target.repeat(batch_size, 1, 1, 1)
target = target.repeat(prev_args.batch_size, 1, 1, 1)
target = target.to(device)
target = normaliser.normalise(target)

# We use tpoints because otherwise we can't update points
# and keep working out the gradient cos pytorch weirdness
model.set_sigma(args.sigma)
output = model.forward(target, points)
output = normaliser.normalise(output.reshape(batch_size, 1, 128, 128))
output = normaliser.normalise(output.reshape(prev_args.batch_size, 1, 128, 128))
loss = F.l1_loss(output, target)

output = torch.squeeze(output.cpu()[0])
@@ -109,7 +109,7 @@ def angle_eval(args, model, points, batch_size, device):
S.write_immediate(loss, "eval_loss", 0, 0, idx)


def basic_eval(args, model, points, batch_size, device):
def basic_eval(args, model, points, prev_args, device):
""" Our basic evaluation step. """
xr = 0.0
yr = 0.0
@@ -135,7 +135,7 @@ def basic_eval(args, model, points, batch_size, device):
t = TransTen(xt, yt)

normaliser = NormaliseNull()
if args.normalise_basic:
if prev_args.normalise_basic:
normaliser = NormaliseTorch()

# Setup our splatting pipeline which is added to both dataloader
@@ -154,15 +154,15 @@ def basic_eval(args, model, points, batch_size, device):
save_image(result.clone().cpu(), args.savedir + "/" + "eval_single_in.jpg")

target = result.reshape(1, 128, 128)
target = target.repeat(batch_size, 1, 1, 1)
target = target.repeat(prev_args.batch_size, 1, 1, 1)
target = target.to(device)
target = normaliser.normalise(target)

# We use tpoints because otherwise we can't update points
# and keep working out the gradient cos pytorch weirdness
model.set_sigma(args.sigma)
output = model.forward(target, points)
output = normaliser.normalise(output.reshape(batch_size, 1, 128, 128))
output = normaliser.normalise(output.reshape(prev_args.batch_size, 1, 128, 128))
loss = F.l1_loss(output, target)
print("Loss :", loss)
print("Rotations returned:", model.get_rots())
@@ -205,28 +205,24 @@ def evaluate(args, device, animate=False):
S.on(args.savedir)
model = None
points = None
model = load_model(args.savedir + "/model.tar", device)

if os.path.isfile(args.savedir + "/" + args.savename):
(model, points) = load_checkpoint(
args.savedir, args.savename, device, evaluation=True
(model, points, _, _, _, _, prev_args) = load_checkpoint(
model, args.savedir, args.savename, device
)
model = load_model(args.savedir + "/model.tar", device)
model.to(device)
print("Loaded model", model)
else:
print("Error - need to pass in a model")
return

batch_size = model._final.size()[0]
print("Batch Size :", batch_size)

model.eval()
random.seed()
if args.predict_sigma:
model.predict_sigma = True
basic_eval(args, model, points, batch_size, device)
basic_eval(args, model, points, prev_args, device)

if animate:
angle_eval(args, model, points, batch_size, device)
angle_eval(args, model, points, prev_args, device)
S.close()


@@ -237,18 +233,6 @@ def evaluate(args, device, animate=False):
parser.add_argument(
"--no-cuda", action="store_true", default=False, help="disables CUDA training"
)
parser.add_argument(
"--predict-sigma",
action="store_true",
default=False,
help="Does this model predict sigma",
)
parser.add_argument(
"--normalise-basic",
action="store_true",
default=False,
help="Use a basic normaliser",
)
parser.add_argument(
"--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
)
4 changes: 2 additions & 2 deletions net/net.py
@@ -277,9 +277,9 @@ def forward(self, target: torch.Tensor, points: PointsTen):
nx = 2

if self.predict_translate:
tx = (torch.tanh(rot[nx+1]) * 2.0) * self.max_shift
ty = (torch.tanh(rot[nx+2]) * 2.0) * self.max_shift
nx += 2
tx = (torch.tanh(rot[3]) * 2.0) * self.max_shift
ty = (torch.tanh(rot[4]) * 2.0) * self.max_shift

sp = nn.Softplus(threshold=12)
final_sigma = self.sigma
7 changes: 4 additions & 3 deletions run.py
@@ -129,11 +129,12 @@ def image_test(model, points, device, sigma, input_image):
if args.load and os.path.isfile(args.load + "/checkpoint.pth.tar"):
# (savedir, savename) = os.path.split(args.load)
# print(savedir, savename)
(model, points) = load_checkpoint(
args.load, "checkpoint.pth.tar", device, evaluation=True
)
model = load_model(args.load + "/model.tar")
(model, points, _, _, _, _, prev_args) = load_checkpoint(
args.load, "checkpoint.pth.tar", device
)
model = model.to(device)
model.eval()
else:
print("--load must point to a run directory.")
sys.exit(0)
2 changes: 1 addition & 1 deletion run/run.conf.example
@@ -5,4 +5,4 @@
# Execute the following commands to get the code associated with this run:
#git clone [email protected]:oni/shaper.git
#git reset --hard d94277d56121287a7417153c5ce579d7be7e525
python3 ../train.py --savedir ../../runs/temp --save-interval 400 --train-size 100 --test-size 40 --valid-size 10 --objpath ../objs/teapot.obj --no-cuda --buffer-size 200 --epochs 20 --batch-size 2 --log-interval 100 --num-points 50 --lr 0.0004 --sigma-file sigma.csv
python ../train.py --savedir ../../runs/temp --save-interval 400 --train-size 100 --test-size 40 --valid-size 10 --objpath ../objs/teapot_large.obj --no-cuda --buffer-size 200 --epochs 20 --batch-size 2 --log-interval 100 --num-points 50 --lr 0.0004 --sigma-file sigma.csv
1 change: 1 addition & 0 deletions run/sigma_dora.csv
@@ -0,0 +1 @@
2.8,2.8,2.52,2.52,2.27,2.27,2.0,2.0,1.8,1.8,1.6,1.6,1.2,1.2,1.0,1.0,0.0,0.0,0.0,0.0
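
The schedule above appears to supply one sigma value per epoch (20 values, matching `--epochs 20` in *run/run.conf.example*). The snippet below is only an illustrative sketch of reading and stepping through such a file; the helper name `load_sigma_schedule` is ours rather than the repository's, and the `model.set_sigma` call mirrors the one visible in *eval.py*.

```python
# Illustrative sketch only: parse a one-line sigma schedule such as
# run/sigma_dora.csv and step through it, one value per epoch.
import csv
from typing import List

def load_sigma_schedule(path: str) -> List[float]:
    with open(path, newline="") as f:
        row = next(csv.reader(f))      # the file is a single comma-separated row
    return [float(v) for v in row]

if __name__ == "__main__":
    sigmas = load_sigma_schedule("run/sigma_dora.csv")
    for epoch, sigma in enumerate(sigmas):
        # model.set_sigma(sigma)       # as done in eval.py / train.py
        print(f"epoch {epoch}: sigma = {sigma}")
```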
62 changes: 5 additions & 57 deletions train.py
@@ -18,7 +18,6 @@
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import math
import random
@@ -39,14 +38,12 @@
from util.math import PointsTen


def calculate_loss(args, target: torch.Tensor, output: torch.Tensor):
def calculate_loss(target: torch.Tensor, output: torch.Tensor):
"""
Our loss function, used in train and test functions.
Parameters
----------
args : dict
The arguments object created in __main__
target : torch.Tensor
The target, properly shaped.
@@ -60,17 +57,7 @@ def calculate_loss(args, target: torch.Tensor, output: torch.Tensor):
A loss object
"""

# Removed the masked loss and stuck with the basic one as it gives a cleaner
# final model and didn't really fix the double headed problem.
#target_masked = (target > args.mask_thresh).float()
#target = torch.mul(target, target_masked)

#output = output.reshape(args.batch_size, 1, args.image_size, args.image_size)

#masked_output = torch.mul(output, target_masked)
#loss = F.l1_loss(masked_output, target, reduction="sum")
loss = F.l1_loss(output, target, reduction="sum")

return loss


@@ -232,7 +219,6 @@ def train(
buffer_test,
data_loader,
optimiser,
use_scheduler=False,
):
"""
Now we've had some setup, lets do the actual training.
Expand Down Expand Up @@ -262,14 +248,6 @@ def train(
"""

model.train()
scheduler = ReduceLROnPlateau(
optimiser,
mode="min",
factor=0.1,
patience=10,
threshold=0.0001,
threshold_mode="abs",
)

# Which normalisation are we using?
normaliser = NormaliseNull()
@@ -304,7 +282,7 @@

output = normaliser.normalise(model(target_shaped, points))

loss = calculate_loss(args, target_shaped, output)
loss = calculate_loss(target_shaped, output)
loss.backward()
lossy = loss.item()
optimiser.step()
@@ -354,14 +332,13 @@
epoch,
batch_idx,
loss,
sigma,
args,
args.savedir,
args.savename,
)

buffer_train.set.shuffle()
# TODO - This loss should be on the validation set but for now...
if use_scheduler:
scheduler.step(loss)

# Save a final points file once training is complete
S.save_points(points, args.savedir, epoch, batch_idx)
@@ -511,12 +488,8 @@ def init(args, device):
set_train.save(args.savedir + "/train_set.pickle")
data_loader.save(args.savedir + "/train_data.pickle")

plr = args.lr
if args.plr is not None:
plr = args.plr
variables = []
variables.append({"params": model.parameters()})
variables.append({"params": points.data, "lr": plr})
optimiser = optim.AdamW(variables, lr=args.lr)
print("Starting new model")

@@ -530,8 +503,7 @@ def init(args, device):
buffer_train,
buffer_test,
data_loader,
optimiser,
use_scheduler=args.scheduler
optimiser
)

save_model(model, args.savedir + "/model.tar")
@@ -545,38 +517,21 @@ def init(args, device):
"--batch-size",
type=int,
default=20,
metavar="N",
help="input batch size for training \
(default: 20)",
)
parser.add_argument(
"--epochs",
type=int,
default=10,
metavar="N",
help="number of epochs to train (default: 10)",
)
parser.add_argument(
"--lr",
type=float,
default=0.0004,
metavar="LR",
help="learning rate (default: 0.0004)",
)
parser.add_argument(
"--mask-thresh",
type=float,
default=0.05,
metavar="LR",
help="Threshold for what we consider in the loss \
(default: 0.05)",
)
parser.add_argument(
"--plr",
type=float,
default=None,
help="Learning rate for points (default: The same as the learning rate).",
)
parser.add_argument(
"--spawn-rate",
type=float,
@@ -646,12 +601,6 @@ def init(args, device):
default=False,
help="Normalise with torch basic intensity divide",
)
parser.add_argument(
"--scheduler",
action="store_true",
default=False,
help="Use a scheduler on the loss.",
)
parser.add_argument(
"--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
)
@@ -700,7 +649,6 @@ def init(args, device):
"--save-interval",
type=int,
default=1000,
metavar="N",
help="how many batches to wait before saving.",
)
parser.add_argument(