From 953153d64cc96c9352821e4d9e077abc5633add2 Mon Sep 17 00:00:00 2001
From: lbventura
Date: Fri, 10 Nov 2023 11:00:34 +0100
Subject: [PATCH 1/2] first set of questions

---
 cyclic_boosting/generic_loss.py | 46 ++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/cyclic_boosting/generic_loss.py b/cyclic_boosting/generic_loss.py
index c850288..7e814e2 100644
--- a/cyclic_boosting/generic_loss.py
+++ b/cyclic_boosting/generic_loss.py
@@ -65,31 +65,51 @@ def calc_parameters(
         float, float
             estimated parameters and its uncertainties
         """
-        sorting = feature.lex_binned_data.argsort()
-        sorted_bins = feature.lex_binned_data[sorting]
-        bins, split_indices = np.unique(sorted_bins, return_index=True)
-        split_indices = split_indices[1:]
+        # ! TODO: [Q] Why these operations?
+        # I probably do not understand the CB algorithm; a high-level explainer would be great.
+        sorting = feature.lex_binned_data.argsort()  # 1. indices that order the samples from smallest to greatest bin
+        sorted_bins = feature.lex_binned_data[sorting]  # 2. the bin labels sorted from smallest to greatest
+        # I do not quite understand how this works, as my example with
+        # a3 = np.random.rand(3, 10) and a3[a3.argsort()] raised an IndexError.
+        bins, split_indices = np.unique(
+            sorted_bins, return_index=True
+        )  # 3. the unique bin values, ordered, with the index of each bin's first occurrence
+        split_indices = split_indices[1:]  # 4. drop the redundant split at index zero
         y_pred = np.hstack((y[..., np.newaxis], self.unlink_func(pred.predict_link())[..., np.newaxis]))
+        # 5. join the values of the target variable with those of the predictions
         y_pred = np.hstack((y_pred, self.weights[..., np.newaxis]))
+        # 6. join the previous matrix with the weights (one weight per sample?)
         y_pred_bins = np.split(y_pred[sorting], split_indices)
+        # 7. sort the rows by bin (of the input variable?) and split them into one block per bin
         # keep potential empty bins in multi-dimensional features
         all_bins = range(max(feature.lex_binned_data) + 1)
-        empty_bins = list(set(bins) ^ set(all_bins))
+        empty_bins = set(bins) ^ set(all_bins)
+        # 8. the symmetric difference: elements which are in either set(bins)
+        # or set(all_bins), but not in both.
+        # ! TODO: list() can be removed, as the result is only iterated over below.
+        # Why does this return the empty bins, though?
+        # Because all_bins is a superset of bins, the symmetric difference is
+        # tantamount to finding the values of all_bins which are not in bins.
+        # Check, for example, a5 = np.array([[i * j + 1 for i in range(0, 3)] for j in range(0, 3)])
+        # and bins, split_indices = np.unique(a5, return_index=True).
         for i in empty_bins:
-            y_pred_bins.insert(i, np.zeros((0, 3)))
+            y_pred_bins.insert(i, np.zeros((0, 3)))  # ! TODO: [Q] Is the (0, 3) shape due to (y, y_hat, weights)?
         n_bins = len(y_pred_bins)
         parameters = np.zeros(n_bins)
         uncertainties = np.zeros(n_bins)
+        # 9. minimize the loss function given y, y_pred, and the weights, once per bin?
         for bin in range(n_bins):
             parameters[bin], uncertainties[bin] = self.optimization(
                 y_pred_bins[bin][:, 0], y_pred_bins[bin][:, 1], y_pred_bins[bin][:, 2]
             )
+        # ! TODO: [Q] What parameters are being returned?
         neutral_factor = self.unlink_func(np.array(self.neutral_factor_link))
+        # 10. if there is one more bin, corresponding to the neutral factor, add it to the parameters
         if n_bins + 1 == feature.n_bins:
             parameters = np.append(parameters, neutral_factor)
             uncertainties = np.append(uncertainties, 0)
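To make the binning questions in steps 1-8 above concrete, here is a minimal, self-contained NumPy sketch with made-up toy data (the real inputs are feature.lex_binned_data, the unlinked predictions, and the sample weights). It also suggests an answer to the a3[a3.argsort()] experiment: lex_binned_data appears to be a 1-D array of bin labels, whereas argsort of a 2-D array sorts within each row and the resulting 2-D index array is then applied to axis 0 only, hence the IndexError.

    import numpy as np

    lex_binned_data = np.array([2, 0, 2, 4, 0, 1])  # toy bin labels; bin 3 is empty
    y = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])    # toy target values
    yhat = 0.9 * y                                   # toy predictions
    weights = np.ones_like(y)                        # toy sample weights

    sorting = lex_binned_data.argsort()              # [1, 4, 5, 0, 2, 3]: sample order by bin
    sorted_bins = lex_binned_data[sorting]           # [0, 0, 1, 2, 2, 4]
    bins, split_indices = np.unique(sorted_bins, return_index=True)  # [0, 1, 2, 4], [0, 2, 3, 5]
    split_indices = split_indices[1:]                # the split at index 0 is redundant

    # one (n_samples_in_bin, 3) block of (y, yhat, weights) rows per occupied bin
    y_pred = np.hstack((y[..., np.newaxis], yhat[..., np.newaxis], weights[..., np.newaxis]))
    y_pred_bins = np.split(y_pred[sorting], split_indices)

    # all_bins is a superset of bins, so the symmetric difference is exactly the empty bins
    all_bins = range(max(lex_binned_data) + 1)
    empty_bins = set(bins) ^ set(all_bins)           # {3}
    for i in empty_bins:
        y_pred_bins.insert(i, np.zeros((0, 3)))      # empty placeholder with (y, y_hat, weights) columns

    print([b.shape for b in y_pred_bins])            # [(2, 3), (1, 3), (2, 3), (0, 3), (1, 3)]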
@@ -127,6 +147,8 @@ def optimization(self, y: np.ndarray, yhat_others: np.ndarray, weights: np.ndarr
         res = minimize(self.objective_function, neutral_factor, args=(yhat_others, y, weights))
         return res.x, self.uncertainty(y, weights)
 
+    # TODO: [Q] Is the parameter computed for each bin, or across all bins?
+    # I would assume that it is per bin (one parameter per bin?)
     def objective_function(self, param: float, yhat_others: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample costs (potentially including sample
@@ -181,6 +203,14 @@ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         raise NotImplementedError("implement in subclass")
 
 
+# TODO: Because the only difference between CBMultiplicativeQuantileRegressor and
+# CBAdditiveQuantileRegressor lies in the model and uncertainty methods, would it
+# not be best to write a common base class CBQuantileRegressor and create the
+# two concrete classes as implementations of CBQuantileRegressor?
+# This would also allow us to include quantile_costs and quantile_global_scale
+# as static methods of CBQuantileRegressor,
+# and we would just need to define the specifics (model, uncertainty)
+# in the individual regressors.
 class CBMultiplicativeQuantileRegressor(CBGenericLoss, sklearn.base.RegressorMixin, LogLinkMixin):
     """
     Cyclic Boosting multiplicative quantile-regression mode. A quantile loss,
@@ -329,6 +359,8 @@ def __init__(
     def _check_y(self, y: np.ndarray) -> None:
         check_y_additive(y)
 
+    # ! TODO: [Q] From the examples below, the cost and the loss functions appear to be the same.
+    # Is this always true and, if so, why have two functions rather than one?
     def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample quantile loss, or to be exact costs,
@@ -404,7 +436,7 @@ def quantile_global_scale(
     weights: np.ndarray,
     prior_prediction_column: Union[str, int, None],
     link_func,
-) -> None:
+) -> Tuple:
     """
     Calculation of the global scale for quantile regression, corresponding
     to the (continuous approximation of the) respective quantile of the
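On the CBQuantileRegressor question raised in the TODO above, a hypothetical sketch of that refactor might look as follows. All class names and bodies here are illustrative assumptions rather than existing cyclic_boosting code; the real classes would additionally keep their CBGenericLoss, sklearn, and link-function mixins.

    import numpy as np


    class CBQuantileRegressor:
        """Shared quantile-regression machinery for both concrete regressors."""

        def __init__(self, quantile: float):
            self.quantile = quantile

        def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
            # shared weighted quantile (pinball) costs, written once instead of twice
            diff = y - prediction
            costs = np.where(diff >= 0, self.quantile * diff, (self.quantile - 1.0) * diff)
            return float(np.sum(weights * costs) / np.sum(weights))

        def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
            raise NotImplementedError("implement in subclass")

        def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
            raise NotImplementedError("implement in subclass")


    class CBMultiplicativeQuantileRegressor(CBQuantileRegressor):
        def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
            return param * yhat_others  # factors combine multiplicatively


    class CBAdditiveQuantileRegressor(CBQuantileRegressor):
        def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
            return param + yhat_others  # summands combine additively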
From bdd72ba133180d9e16918b09332137e8fa42077b Mon Sep 17 00:00:00 2001
From: lbventura
Date: Fri, 10 Nov 2023 11:03:11 +0100
Subject: [PATCH 2/2] clean-up questions

---
 cyclic_boosting/generic_loss.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/cyclic_boosting/generic_loss.py b/cyclic_boosting/generic_loss.py
index 7e814e2..79754bc 100644
--- a/cyclic_boosting/generic_loss.py
+++ b/cyclic_boosting/generic_loss.py
@@ -147,8 +147,6 @@ def optimization(self, y: np.ndarray, yhat_others: np.ndarray, weights: np.ndarr
         res = minimize(self.objective_function, neutral_factor, args=(yhat_others, y, weights))
         return res.x, self.uncertainty(y, weights)
 
-    # TODO: [Q] Is the parameter computed for each bin, or across all bins?
-    # I would assume that it is per bin (one parameter per bin?)
     def objective_function(self, param: float, yhat_others: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample costs (potentially including sample
@@ -203,14 +201,6 @@ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         raise NotImplementedError("implement in subclass")
 
 
-# TODO: Because the only difference between CBMultiplicativeQuantileRegressor and
-# CBAdditiveQuantileRegressor lies in the model and uncertainty methods, would it
-# not be best to write a common base class CBQuantileRegressor and create the
-# two concrete classes as implementations of CBQuantileRegressor?
-# This would also allow us to include quantile_costs and quantile_global_scale
-# as static methods of CBQuantileRegressor,
-# and we would just need to define the specifics (model, uncertainty)
-# in the individual regressors.
 class CBMultiplicativeQuantileRegressor(CBGenericLoss, sklearn.base.RegressorMixin, LogLinkMixin):
     """
     Cyclic Boosting multiplicative quantile-regression mode. A quantile loss,
@@ -359,8 +349,6 @@ def __init__(
     def _check_y(self, y: np.ndarray) -> None:
         check_y_additive(y)
 
-    # ! TODO: [Q] From the examples below, the cost and the loss functions appear to be the same.
-    # Is this always true and, if so, why have two functions rather than one?
     def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample quantile loss, or to be exact costs,
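As for the removed question on where the parameter is computed: in calc_parameters in the first patch, self.optimization is called once inside the loop over bins, so one parameter and one uncertainty are estimated per bin. A toy sketch of that per-bin pattern, using scipy.optimize.minimize with a made-up weighted squared-error objective standing in for the library's objective_function:

    import numpy as np
    from scipy.optimize import minimize


    def objective(param, yhat_others, y, weights):
        # illustrative stand-in: weighted squared error of the combined prediction
        return np.sum(weights * (y - param * yhat_others) ** 2)


    rng = np.random.default_rng(0)
    y_pred_bins = [rng.normal(1.0, 0.1, size=(n, 3)) for n in (4, 2, 5)]  # three toy bins

    for bin_data in y_pred_bins:
        y, yhat_others, weights = bin_data[:, 0], bin_data[:, 1], np.abs(bin_data[:, 2])
        res = minimize(objective, x0=np.array([1.0]), args=(yhat_others, y, weights))
        print(res.x)  # one fitted parameter per bin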