diff --git a/LICENSE b/LICENSE
index 067d1fe..fb555cb 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018-2023 neural-fortran contributors
+Copyright (c) 2018-2024 neural-fortran contributors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/fpm.toml b/fpm.toml
index 7e465b0..5252ab8 100644
--- a/fpm.toml
+++ b/fpm.toml
@@ -1,9 +1,9 @@
 name = "neural-fortran"
-version = "0.15.1"
+version = "0.16.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "milancurcic@hey.com"
-copyright = "Copyright 2018-2023, neural-fortran contributors"
+copyright = "Copyright 2018-2024, neural-fortran contributors"
 
 [build]
 external-modules = "hdf5"
diff --git a/src/nf.f90 b/src/nf.f90
index eb2a903..ae72e1e 100644
--- a/src/nf.f90
+++ b/src/nf.f90
@@ -4,6 +4,7 @@ module nf
   use nf_layer, only: layer
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool2d, reshape
+  use nf_loss, only: mse, quadratic
   use nf_network, only: network
   use nf_optimizers, only: sgd, rmsprop, adam, adagrad
   use nf_activation, only: activation_function, elu, exponential, &
diff --git a/src/nf/nf_loss.f90 b/src/nf/nf_loss.f90
index 62687fd..ceeb439 100644
--- a/src/nf/nf_loss.f90
+++ b/src/nf/nf_loss.f90
@@ -1,20 +1,71 @@
 module nf_loss
 
-  !! This module will eventually provide a collection of loss functions and
-  !! their derivatives. For the time being it provides only the quadratic
-  !! function.
+  !! This module provides a collection of loss functions and their
+  !! derivatives. The implementation is based on an abstract loss derived
+  !! type, loss_type, which defines the required eval and derivative
+  !! methods. To implement a new loss function, write a concrete type
+  !! that extends loss_type and implements eval and derivative methods
+  !! that operate on vectors.
 
   implicit none
 
   private
-  public :: quadratic, quadratic_derivative
+  public :: loss_type
+  public :: mse
+  public :: quadratic
+
+  type, abstract :: loss_type
+  contains
+    procedure(loss_interface), nopass, deferred :: eval
+    procedure(loss_derivative_interface), nopass, deferred :: derivative
+  end type loss_type
+
+  abstract interface
+    pure function loss_interface(true, predicted) result(res)
+      real, intent(in) :: true(:)
+      real, intent(in) :: predicted(:)
+      real :: res
+    end function loss_interface
+    pure function loss_derivative_interface(true, predicted) result(res)
+      real, intent(in) :: true(:)
+      real, intent(in) :: predicted(:)
+      real :: res(size(true))
+    end function loss_derivative_interface
+  end interface
+
+  type, extends(loss_type) :: mse
+    !! Mean squared error loss function
+  contains
+    procedure, nopass :: eval => mse_eval
+    procedure, nopass :: derivative => mse_derivative
+  end type mse
+
+  type, extends(loss_type) :: quadratic
+    !! Quadratic loss function
+  contains
+    procedure, nopass :: eval => quadratic_eval
+    procedure, nopass :: derivative => quadratic_derivative
+  end type quadratic
 
   interface
 
-    pure module function quadratic(true, predicted) result(res)
-      !! Quadratic loss function:
+    pure module function mse_eval(true, predicted) result(res)
+      !! Mean squared error loss function:
+      !!
+      !! L = sum((predicted - true)**2) / size(true)
+      !!
+      real, intent(in) :: true(:)
+        !! True values, i.e. labels from training datasets
+      real, intent(in) :: predicted(:)
+        !! Values predicted by the network
+      real :: res
+        !! Resulting loss value
+    end function mse_eval
+
+    pure module function mse_derivative(true, predicted) result(res)
+      !! First derivative of the mean squared error loss function:
       !!
-      !! L = (predicted - true)**2 / 2
+      !! dL/dpredicted = 2 * (predicted - true) / size(true)
       !!
       real, intent(in) :: true(:)
         !! True values, i.e. labels from training datasets
@@ -22,7 +73,20 @@ pure module function quadratic(true, predicted) result(res)
         !! Values predicted by the network
       real :: res(size(true))
         !! Resulting loss values
-    end function quadratic
+    end function mse_derivative
+
+    pure module function quadratic_eval(true, predicted) result(res)
+      !! Quadratic loss function:
+      !!
+      !! L = sum((predicted - true)**2) / 2
+      !!
+      real, intent(in) :: true(:)
+        !! True values, i.e. labels from training datasets
+      real, intent(in) :: predicted(:)
+        !! Values predicted by the network
+      real :: res
+        !! Resulting loss value
+    end function quadratic_eval
 
     pure module function quadratic_derivative(true, predicted) result(res)
       !! First derivative of the quadratic loss function:
diff --git a/src/nf/nf_loss_submodule.f90 b/src/nf/nf_loss_submodule.f90
index f8ad8a5..3c83fb9 100644
--- a/src/nf/nf_loss_submodule.f90
+++ b/src/nf/nf_loss_submodule.f90
@@ -4,12 +4,12 @@
 
 contains
 
-  pure module function quadratic(true, predicted) result(res)
+  pure module function quadratic_eval(true, predicted) result(res)
     real, intent(in) :: true(:)
     real, intent(in) :: predicted(:)
-    real :: res(size(true))
-    res = (predicted - true)**2 / 2
-  end function quadratic
+    real :: res
+    res = sum((predicted - true)**2) / 2
+  end function quadratic_eval
 
   pure module function quadratic_derivative(true, predicted) result(res)
     real, intent(in) :: true(:)
@@ -18,4 +18,18 @@ pure module function quadratic_derivative(true, predicted) result(res)
     res = predicted - true
   end function quadratic_derivative
 
+  pure module function mse_eval(true, predicted) result(res)
+    real, intent(in) :: true(:)
+    real, intent(in) :: predicted(:)
+    real :: res
+    res = sum((predicted - true)**2) / size(true)
+  end function mse_eval
+
+  pure module function mse_derivative(true, predicted) result(res)
+    real, intent(in) :: true(:)
+    real, intent(in) :: predicted(:)
+    real :: res(size(true))
+    res = 2 * (predicted - true) / size(true)
+  end function mse_derivative
+
 end submodule nf_loss_submodule
diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90
index f12926e..4346f4e 100644
--- a/src/nf/nf_network.f90
+++ b/src/nf/nf_network.f90
@@ -3,6 +3,7 @@ module nf_network
   !! This module provides the network type to create new models.
 
   use nf_layer, only: layer
+  use nf_loss, only: loss_type
   use nf_optimizers, only: optimizer_base_type
 
   implicit none
@@ -13,6 +14,7 @@ module nf_network
   type :: network
 
     type(layer), allocatable :: layers(:)
+    class(loss_type), allocatable :: loss
    class(optimizer_base_type), allocatable :: optimizer
 
   contains
@@ -138,7 +140,7 @@ end function predict_batch_3d
 
   interface
 
-    pure module subroutine backward(self, output)
+    pure module subroutine backward(self, output, loss)
       !! Apply one backward pass through the network.
       !! This changes the state of layers on the network.
       !! Typically used only internally from the `train` method,
@@ -147,6 +149,8 @@ pure module subroutine backward(self, output)
         !! Network instance
       real, intent(in) :: output(:)
         !! Output data
+      class(loss_type), intent(in), optional :: loss
+        !! Loss instance to use. If not provided, the default is quadratic().
     end subroutine backward
 
     pure module integer function get_num_params(self)
@@ -185,7 +189,7 @@ module subroutine print_info(self)
     end subroutine print_info
 
     module subroutine train(self, input_data, output_data, batch_size, &
-      epochs, optimizer)
+      epochs, optimizer, loss)
       class(network), intent(in out) :: self
         !! Network instance
       real, intent(in) :: input_data(:,:)
@@ -204,6 +208,8 @@ module subroutine train(self, input_data, output_data, batch_size, &
         !! Number of epochs to run
       class(optimizer_base_type), intent(in), optional :: optimizer
         !! Optimizer instance to use. If not provided, the default is sgd().
+      class(loss_type), intent(in), optional :: loss
+        !! Loss instance to use. If not provided, the default is quadratic().
     end subroutine train
 
     module subroutine update(self, optimizer, batch_size)
diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90
index a35064b..59abcf7 100644
--- a/src/nf/nf_network_submodule.f90
+++ b/src/nf/nf_network_submodule.f90
@@ -11,7 +11,7 @@
   use nf_keras, only: get_keras_h5_layers, keras_layer
   use nf_layer, only: layer
   use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape
-  use nf_loss, only: quadratic_derivative
+  use nf_loss, only: quadratic
   use nf_optimizers, only: optimizer_base_type, sgd
   use nf_parallel, only: tile_indices
   use nf_activation, only: activation_function, &
@@ -280,11 +280,27 @@ pure function get_activation_by_name(activation_name) result(res)
 
   end function get_activation_by_name
 
-  pure module subroutine backward(self, output)
+  pure module subroutine backward(self, output, loss)
     class(network), intent(in out) :: self
     real, intent(in) :: output(:)
+    class(loss_type), intent(in), optional :: loss
     integer :: n, num_layers
 
+    ! Passing the loss instance is optional. If it is not provided here, and
+    ! if it has not already been set, we default to quadratic. The loss
+    ! instance is normally allocated and initialized at the beginning of the
+    ! network % train() method. However, if the user calls
+    ! network % backward() directly, for example from a custom mini-batching
+    ! routine, we initialize the loss instance here as well. If it is
+    ! already initialized, this step is a cheap no-op.
+    if (.not. allocated(self % loss)) then
+      if (present(loss)) then
+        self % loss = loss
+      else
+        self % loss = quadratic()
+      end if
+    end if
+
     num_layers = size(self % layers)
 
     ! Iterate backward over layers, from the output layer
@@ -297,7 +313,7 @@ pure module subroutine backward(self, output)
         type is(dense_layer)
           call self % layers(n) % backward( &
             self % layers(n - 1), &
-            quadratic_derivative(output, this_layer % output) &
+            self % loss % derivative(output, this_layer % output) &
           )
       end select
     else
@@ -542,13 +558,14 @@
   end subroutine set_params
 
   module subroutine train(self, input_data, output_data, batch_size, &
-    epochs, optimizer)
+    epochs, optimizer, loss)
     class(network), intent(in out) :: self
     real, intent(in) :: input_data(:,:)
     real, intent(in) :: output_data(:,:)
     integer, intent(in) :: batch_size
     integer, intent(in) :: epochs
     class(optimizer_base_type), intent(in), optional :: optimizer
+    class(loss_type), intent(in), optional :: loss
     class(optimizer_base_type), allocatable :: optimizer_
 
     real :: pos
@@ -567,6 +584,14 @@ module subroutine train(self, input_data, output_data, batch_size, &
 
     call self % optimizer % init(self % get_num_params())
 
+    ! Passing the loss instance is optional.
+    ! If not provided, we default to quadratic().
+    if (present(loss)) then
+      self % loss = loss
+    else
+      self % loss = quadratic()
+    end if
+
     dataset_size = size(output_data, dim=2)
 
     epoch_loop: do n = 1, epochs
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 26646ec..b24262c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -16,6 +16,7 @@ foreach(execid
   cnn_from_keras
   conv2d_network
   optimizers
+  loss
 )
 add_executable(test_${execid} test_${execid}.f90)
 target_link_libraries(test_${execid} PRIVATE neural h5fortran::h5fortran jsonfortran::jsonfortran ${LIBS})
diff --git a/test/test_loss.f90 b/test/test_loss.f90
new file mode 100644
index 0000000..713b5e9
--- /dev/null
+++ b/test/test_loss.f90
@@ -0,0 +1,53 @@
+program test_loss
+
+  use iso_fortran_env, only: stderr => error_unit
+  use nf, only: mse, quadratic
+
+  implicit none
+
+  logical :: ok = .true.
+
+  block
+
+    type(mse) :: loss
+    real :: true(2) = [1., 2.]
+    real :: pred(2) = [3., 4.]
+
+    if (loss % eval(true, pred) /= 4) then
+      write(stderr, '(a)') 'expected output of mse % eval().. failed'
+      ok = .false.
+    end if
+
+    if (.not. all(loss % derivative(true, pred) == [2, 2])) then
+      write(stderr, '(a)') 'expected output of mse % derivative().. failed'
+      ok = .false.
+    end if
+
+  end block
+
+  block
+
+    type(quadratic) :: loss
+    real :: true(4) = [1., 2., 3., 4.]
+    real :: pred(4) = [3., 4., 5., 6.]
+
+    if (loss % eval(true, pred) /= 8) then
+      write(stderr, '(a)') 'expected output of quadratic % eval().. failed'
+      ok = .false.
+    end if
+
+    if (.not. all(loss % derivative(true, pred) == [2, 2, 2, 2])) then
+      write(stderr, '(a)') 'expected output of quadratic % derivative().. failed'
+      ok = .false.
+    end if
+
+  end block
+
+  if (ok) then
+    print '(a)', 'test_loss: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_loss: One or more tests failed.'
+    stop 1
+  end if
+
+end program test_loss
\ No newline at end of file
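Usage sketch, not part of the diff above: with this change, the loss is chosen at training time by passing a concrete loss instance to network % train(); omitting the argument keeps the previous behavior (quadratic). A minimal sketch assuming the nf public API as exported above; the network shape, data, and hyperparameters below are invented for illustration:

program train_with_mse
  ! Train a tiny dense network on made-up data using the new mse loss.
  use nf, only: dense, input, network, sgd, mse
  implicit none

  type(network) :: net
  real :: x(3, 100), y(1, 100)

  call random_number(x)
  y(1,:) = sum(x, dim=1) / 3  ! toy target: mean of the three inputs

  net = network([input(3), dense(5), dense(1)])

  ! Select the loss via the new optional `loss` argument; any concrete
  ! extension of loss_type is accepted. Omitting it defaults to quadratic().
  call net % train(x, y, batch_size=10, epochs=100, &
    optimizer=sgd(learning_rate=1.), loss=mse())

end program train_with_mse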
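A user-defined loss can likewise be plugged in by extending the abstract loss_type, as the nf_loss module docstring describes. Below is a hypothetical mean absolute error loss; the mae, mae_eval, and mae_derivative names are invented for this sketch, and note that loss_type must be imported from nf_loss directly, since the diff re-exports only the concrete mse and quadratic types through nf:

module mae_loss
  ! Hypothetical example of a concrete loss extending the abstract
  ! loss_type from nf_loss. Overrides must match the deferred bindings:
  ! pure, nopass, same argument names and characteristics.
  use nf_loss, only: loss_type
  implicit none

  type, extends(loss_type) :: mae
  contains
    procedure, nopass :: eval => mae_eval
    procedure, nopass :: derivative => mae_derivative
  end type mae

contains

  pure function mae_eval(true, predicted) result(res)
    real, intent(in) :: true(:)
    real, intent(in) :: predicted(:)
    real :: res
    res = sum(abs(predicted - true)) / size(true)
  end function mae_eval

  pure function mae_derivative(true, predicted) result(res)
    real, intent(in) :: true(:)
    real, intent(in) :: predicted(:)
    real :: res(size(true))
    ! Subgradient of |x|, taking sign(1., 0.) = 1 at zero
    res = sign(1., predicted - true) / size(true)
  end function mae_derivative

end module mae_loss

An instance can then be passed as loss=mae() to either train() or backward().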