From 05f5d2606dd24fdb16d870d82b650c2865153ae8 Mon Sep 17 00:00:00 2001
From: Karol Sewilo
Date: Sun, 15 Dec 2024 21:02:44 +0100
Subject: [PATCH 1/2] Add CategoricalNB

---
 lib/scholar/naive_bayes/categorical.ex        |  556 +++++++++
 test/scholar/naive_bayes/categorical_test.exs | 1030 +++++++++++++++++
 2 files changed, 1586 insertions(+)
 create mode 100644 lib/scholar/naive_bayes/categorical.ex
 create mode 100644 test/scholar/naive_bayes/categorical_test.exs

diff --git a/lib/scholar/naive_bayes/categorical.ex b/lib/scholar/naive_bayes/categorical.ex
new file mode 100644
index 00000000..29598363
--- /dev/null
+++ b/lib/scholar/naive_bayes/categorical.ex
@@ -0,0 +1,556 @@
+defmodule Scholar.NaiveBayes.Categorical do
+  @moduledoc """
+  Naive Bayes classifier for categorical features.
+
+  The categorical Naive Bayes classifier is suitable for classification with
+  discrete features that are categorically distributed. The categories of
+  each feature are drawn from a categorical distribution.
+  """
+  require Nx.Defn.Kernel
+  import Nx.Defn
+  import Scholar.Shared
+
+  @derive {Nx.Container,
+           containers: [
+             :feature_count,
+             :class_count,
+             :class_log_priors,
+             :feature_log_probability
+           ]}
+
+  defstruct [:feature_count, :class_count, :class_log_priors, :feature_log_probability]
+
+  opts_schema = [
+    num_classes: [
+      type: :pos_integer,
+      required: true,
+      doc: ~S"""
+      Number of different classes used in training.
+      """
+    ],
+    alpha: [
+      type: {:or, [:float, {:list, :float}]},
+      default: 1.0,
+      doc: ~S"""
+      Additive (Laplace/Lidstone) smoothing parameter
+      (set `alpha` to 0.0 and `force_alpha` to `true` for no smoothing).
+      """
+    ],
+    force_alpha: [
+      type: :boolean,
+      default: true,
+      doc: ~S"""
+      If `false` and `alpha` is less than 1e-10, `alpha` is set to 1e-10.
+      If `true`, `alpha` remains unchanged, which may cause numerical
+      errors if `alpha` is too close to 0.
+      """
+    ],
+    fit_priors: [
+      type: :boolean,
+      default: true,
+      doc: ~S"""
+      Whether to learn class prior probabilities.
+      If `false`, a uniform prior is used.
+      """
+    ],
+    class_priors: [
+      type: {:custom, Scholar.Options, :weights, []},
+      doc: ~S"""
+      Prior probabilities of the classes. If specified, the priors are not
+      adjusted according to the data.
+      """
+    ],
+    sample_weights: [
+      type: {:custom, Scholar.Options, :weights, []},
+      doc: ~S"""
+      List of `num_samples` elements.
+      A list of 1.0 values is used if none is given.
+      """
+    ],
+    min_categories: [
+      type: {:custom, Scholar.Options, :weights, []},
+      doc: ~S"""
+      List of the minimum number of categories per feature.
+      If none is given, the number of categories is determined automatically
+      from the training data.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Fits a naive Bayes model. The function assumes that the targets `y` are integers
+  between 0 and `num_classes` - 1 (inclusive); samples with targets outside this
+  range do not contribute to `class_count`.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  The function returns a struct with the following parameters:
+
+    * `:class_count` - Number of samples encountered for each class during fitting. This
+      value is weighted by the sample weight when provided.
+
+    * `:class_log_priors` - Smoothed empirical log probability for each class.
+
+    * `:feature_count` - A `{num_features, num_classes, num_categories}` tensor tracking
+      the weighted count of each (feature, class, category) combination.
+ Calculated by summing the weighted occurrences of feature values for each class-label during fitting. + + * `:feature_log_probability` - Empirical log probability of features + given a class, ``P(x_i|y)``. + + ## Examples + + iex> x = Nx.tensor([[1, 2, 2], [1, 2, 1], [2, 2, 0]]) + iex> y = Nx.tensor([0, 1, 1]) + iex> Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 2) + %Scholar.NaiveBayes.Categorical{ + feature_count: Nx.tensor( + [ + [ + [0.0, 1.0, 0.0], + [0.0, 1.0, 1.0] + ], + [ + [0.0, 0.0, 1.0], + [0.0, 0.0, 2.0] + ], + [ + [0.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + ] + ), + class_count: Nx.tensor([1.0, 2.0]), + class_log_priors: Nx.tensor([-1.0986123085021973, -0.40546512603759766]), + feature_log_probability: Nx.tensor( + [ + [ + [-1.3862943649291992, -0.6931471824645996, -1.3862943649291992], + [-1.6094379425048828, -0.9162907600402832, -0.9162907600402832] + ], + [ + [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996], + [-1.6094379425048828, -1.6094379425048828, -0.5108256340026855] + ], + [ + [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996], + [-0.9162907600402832, -0.9162907600402832, -1.6094379425048828] + ] + ] + ) + } + + iex> x = Nx.tensor([[1, 2, 2], [1, 2, 1], [2, 2, 0]]) + iex> y = Nx.tensor([0, 1, 1]) + iex> Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 2, force_alpha: false, alpha: 0.0) + %Scholar.NaiveBayes.Categorical{ + feature_count: Nx.tensor( + [ + [ + [0.0, 1.0, 0.0], + [0.0, 1.0, 1.0] + ], + [ + [0.0, 0.0, 1.0], + [0.0, 0.0, 2.0] + ], + [ + [0.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + ] + ), + class_count: Nx.tensor( + [1.0, 2.0] + ), + class_log_priors: Nx.tensor( + [-1.0986123085021973, -0.40546512603759766] + ), + feature_log_probability: Nx.tensor( + [ + [ + [-23.025850296020508, 0.0, -23.025850296020508], + [-23.718997955322266, -0.6931471824645996, -0.6931471824645996] + ], + [ + [-23.025850296020508, -23.025850296020508, 0.0], + [-23.718997955322266, -23.718997955322266, 0.0] + ], + [ + [-23.025850296020508, -23.025850296020508, 0.0], + [-0.6931471824645996, -0.6931471824645996, -23.718997955322266] + ] + ] + ) + } + """ + + deftransform fit(x, y, opts \\ []) do + if Nx.rank(x) != 2 do + raise ArgumentError, + """ + expected x to have shape {num_samples, num_features}, \ + got tensor with shape: #{inspect(Nx.shape(x))}\ + """ + end + + if Nx.rank(y) != 1 do + raise ArgumentError, + """ + expected y to have shape {num_samples}, \ + got tensor with shape: #{inspect(Nx.shape(y))}\ + """ + end + + {num_samples, num_features} = Nx.shape(x) + + if num_samples != Nx.axis_size(y, 0) do + raise ArgumentError, + """ + expected first dimension of x and y to be of same size, \ + got: #{num_samples} and #{Nx.axis_size(y, 0)}\ + """ + end + + opts = NimbleOptions.validate!(opts, @opts_schema) + type = to_float_type(x) + + {alpha, opts} = Keyword.pop!(opts, :alpha) + alpha = Nx.tensor(alpha, type: type) + + if Nx.shape(alpha) not in [{}, {num_features}] do + raise ArgumentError, + """ + when alpha is list it should have length equal to num_features = #{num_features}, \ + got: #{Nx.size(alpha)}\ + """ + end + + num_classes = opts[:num_classes] + + priors_flag = opts[:class_priors] != nil + + {class_priors, opts} = Keyword.pop(opts, :class_priors, :nan) + class_priors = Nx.tensor(class_priors) + + if priors_flag and Nx.size(class_priors) != num_classes do + raise ArgumentError, + """ + expected class_priors to be list of length num_classes = #{num_classes}, \ + got: #{Nx.size(class_priors)}\ + """ + end + + sample_weights_flag = 
opts[:sample_weights] != nil
+
+    {sample_weights, opts} = Keyword.pop(opts, :sample_weights, :nan)
+    sample_weights = Nx.tensor(sample_weights, type: type)
+
+    if sample_weights_flag and Nx.shape(sample_weights) != {num_samples} do
+      raise ArgumentError,
+            """
+            expected sample_weights to be list of length num_samples = #{num_samples}, \
+            got: #{Nx.size(sample_weights)}\
+            """
+    end
+
+    min_categories_flag = opts[:min_categories] != nil
+
+    {min_categories, opts} = Keyword.pop(opts, :min_categories, :nan)
+    min_categories = Nx.tensor(min_categories, type: type)
+
+    if min_categories_flag and Nx.shape(min_categories) != {num_features} do
+      raise ArgumentError,
+            """
+            expected min_categories to be list of length num_features = #{num_features}, \
+            got: #{Nx.size(min_categories)}\
+            """
+    end
+
+    num_categories =
+      (opts[:min_categories] || x)
+      |> Nx.reduce_max()
+      |> Nx.add(1)
+      |> Nx.to_number()
+
+    opts =
+      opts ++
+        [
+          num_categories: num_categories,
+          type: type,
+          priors_flag: priors_flag,
+          sample_weights_flag: sample_weights_flag,
+          min_categories_flag: min_categories_flag
+        ]
+
+    fit_n(x, y, class_priors, sample_weights, alpha, min_categories, opts)
+  end
+
+  defnp fit_n(x, y, class_priors, sample_weights, alpha, min_categories, opts) do
+    type = opts[:type]
+    {num_samples, num_features} = Nx.shape(x)
+
+    num_classes = opts[:num_classes]
+    num_categories = opts[:num_categories]
+
+    min_categories =
+      if opts[:min_categories_flag], do: min_categories, else: Nx.reduce_max(x, axes: [0]) + 1
+
+    y_one_hot =
+      y
+      |> Nx.new_axis(1)
+      |> Nx.broadcast({num_samples, num_classes})
+      |> Nx.equal(Nx.iota({num_samples, num_classes}, axis: 1))
+      |> Nx.as_type(type)
+
+    y_weighted =
+      if opts[:sample_weights_flag],
+        do: Nx.reshape(sample_weights, {num_samples, 1}) * y_one_hot,
+        else: y_one_hot
+
+    alpha_lower_bound = Nx.tensor(1.0e-10, type: type)
+
+    alpha =
+      if opts[:force_alpha], do: alpha, else: Nx.max(alpha, alpha_lower_bound)
+
+    {class_count, feature_count} =
+      count(x, y_weighted, num_features, num_classes, num_categories, num_samples)
+
+    feature_log_probability =
+      compute_feature_log_probability(feature_count, alpha, min_categories)
+
+    class_log_priors =
+      cond do
+        opts[:priors_flag] ->
+          Nx.log(class_priors)
+
+        opts[:fit_priors] ->
+          Nx.log(class_count) - Nx.log(Nx.sum(class_count))
+
+        true ->
+          Nx.broadcast(-Nx.log(num_classes), {num_classes})
+      end
+
+    %__MODULE__{
+      class_count: class_count,
+      class_log_priors: class_log_priors,
+      feature_count: feature_count,
+      feature_log_probability: feature_log_probability
+    }
+  end
+
+  @doc """
+  Performs classification on an array of test vectors `x` using `model`.
+  The second argument must be the tensor of sorted class labels used during training.
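+
+  When the training targets are consecutive integers from 0 to `num_classes` - 1,
+  as `fit/3` assumes, one way to build this tensor is, for example:
+
+      classes = Nx.iota({num_classes})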
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Categorical.predict(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]), Nx.tensor([0, 1, 2]))
+      #Nx.Tensor<
+        s32[2]
+        [0, 2]
+      >
+  """
+
+  defn predict(%__MODULE__{} = model, x, classes) do
+    check_dim(x, Nx.axis_size(model.feature_count, 1))
+
+    if Nx.rank(classes) != 1 do
+      raise ArgumentError,
+            """
+            expected classes to be a 1D tensor, \
+            got tensor with shape: #{inspect(Nx.shape(classes))}\
+            """
+    end
+
+    if Nx.axis_size(classes, 0) != Nx.axis_size(model.class_count, 0) do
+      raise ArgumentError,
+            """
+            expected classes to have same size as the number of classes in the model, \
+            got: #{Nx.axis_size(classes, 0)} for classes and #{Nx.axis_size(model.class_count, 0)} for model\
+            """
+    end
+
+    jll = joint_log_likelihood(model, x)
+    classes[Nx.argmax(jll, axis: 1)]
+  end
+
+  @doc """
+  Returns log-probability estimates for the test vector `x` using `model`.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Categorical.predict_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]))
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [-0.8266787528991699, -1.5198254585266113, -1.0678410530090332],
+          [-1.272965431213379, -1.272965431213379, -0.8209810256958008]
+        ]
+      >
+  """
+
+  defn predict_log_probability(%__MODULE__{} = model, x) do
+    check_dim(x, Nx.axis_size(model.feature_count, 1))
+    jll = joint_log_likelihood(model, x)
+
+    log_proba_x =
+      jll
+      |> Nx.logsumexp(axes: [1])
+      |> Nx.reshape({Nx.axis_size(jll, 0), 1})
+
+    jll - log_proba_x
+  end
+
+  @doc """
+  Returns probability estimates for the test vector `x` using `model`.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Categorical.predict_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]))
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.43749991059303284, 0.21875005960464478, 0.34374985098838806],
+          [0.28000006079673767, 0.28000006079673767, 0.4399997889995575]
+        ]
+      >
+  """
+
+  defn predict_probability(%__MODULE__{} = model, x) do
+    Nx.exp(predict_log_probability(model, x))
+  end
+
+  @doc """
+  Returns joint log-probability estimates for the test vector `x` using `model`.
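+
+  For a sample `x` and a class `y`, the returned quantity is the joint log
+  likelihood
+
+      log P(x, y) = log P(y) + sum_i log P(x_i | y)
+
+  that is, the class log prior plus the sum of the per-feature log
+  probabilities ``P(x_i|y)`` estimated during fitting.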
+ + ## Examples + + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> model = Scholar.NaiveBayes.Categorical.fit(x, y, num_classes: 3) + iex> Scholar.NaiveBayes.Categorical.predict_joint_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]])) + #Nx.Tensor< + f32[2][3] + [ + [-8.140898704528809, -8.83404541015625, -8.382061004638672], + [-8.83404541015625, -8.83404541015625, -8.382061004638672] + ] + > + """ + + defn predict_joint_log_probability(%__MODULE__{} = model, x) do + check_dim(x, Nx.axis_size(model.feature_count, 1)) + joint_log_likelihood(model, x) + end + + defnp check_dim(x, dim) do + num_features = Nx.axis_size(x, 1) + + if num_features != dim do + raise ArgumentError, + """ + expected x to have same second dimension as data used for fitting model, \ + got: #{num_features} for x and #{dim} for training data\ + """ + end + end + + defnp joint_log_likelihood( + %__MODULE__{ + feature_log_probability: feature_log_probability, + class_log_priors: class_log_priors + }, + x + ) do + {_, _, _, jll} = + while {i = 0, feature_log_probability, x, jll = Nx.broadcast(0.0, Nx.shape(x))}, + i < Nx.axis_size(x, 1) do + indices = Nx.slice_along_axis(x, i, 1, axis: 1) |> Nx.squeeze(axes: [1]) + + jll = + Nx.slice_along_axis(feature_log_probability, i, 1, axis: 0) + |> Nx.squeeze() + |> Nx.take(indices, axis: 1) + |> Nx.transpose() + |> Nx.add(jll) + + {i + 1, feature_log_probability, x, jll} + end + + total_jll = jll + class_log_priors + total_jll + end + + defnp count(x, y_weighted, num_features, num_classes, num_categories, num_samples) do + class_count = Nx.sum(y_weighted, axes: [0]) + + feature_count = Nx.broadcast(0.0, {num_features, num_classes, num_categories}) + + {_, _, _, _, feature_count} = + while {i = 0, x, y_weighted, num_features, feature_count}, i < num_samples do + {_, _, _, _, _, feature_count} = + while {j = 0, x, y_weighted, i, num_features, feature_count}, j < num_features do + category_value = x[i][j] + class_label = Nx.argmax(y_weighted[i]) + + index = Nx.stack([j, class_label, category_value]) + + feature_count = Nx.indexed_add(feature_count, index, y_weighted[i][class_label]) + + {j + 1, x, y_weighted, i, num_features, feature_count} + end + + {i + 1, x, y_weighted, num_features, feature_count} + end + + {class_count, feature_count} + end + + defnp compute_feature_log_probability(feature_count, alpha, min_categories) do + feature_log_probability = Nx.broadcast(0.0, Nx.shape(feature_count)) + + {_, _, _, _, feature_log_probability} = + while {i = 0, feature_count, alpha, min_categories, feature_log_probability}, + i < Nx.axis_size(feature_count, 0) do + smoothed_class_count = + Nx.sum(feature_count[i], axes: [1]) + |> Nx.add(alpha * min_categories[i]) + |> Nx.log() + |> Nx.new_axis(1) + + smoothed_cat_count = + feature_count[i] + |> Nx.add(alpha) + |> Nx.log() + |> Nx.subtract(smoothed_class_count) + + smoothed_cat_count = + Nx.iota({Nx.axis_size(feature_count[i], 1)}) + |> Nx.less(min_categories[i]) + |> Nx.broadcast(Nx.shape(feature_count[i])) + |> Nx.select(smoothed_cat_count, feature_count[i]) + |> Nx.new_axis(0) + + feature_log_probability = + Nx.put_slice(feature_log_probability, [i, 0, 0], smoothed_cat_count) + + {i + 1, feature_count, alpha, min_categories, feature_log_probability} + end + + feature_log_probability + end +end diff --git a/test/scholar/naive_bayes/categorical_test.exs b/test/scholar/naive_bayes/categorical_test.exs new file mode 100644 index 00000000..179242aa --- /dev/null +++ b/test/scholar/naive_bayes/categorical_test.exs @@ 
-0,0 +1,1030 @@ +defmodule Scholar.NaiveBayes.CategoricalTest do + use Scholar.Case, async: true + alias Scholar.NaiveBayes.Categorical + doctest Categorical + + describe "fit" do + test "fit test - all default options" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-1.6094379124341003, -0.916290731874155, -0.916290731874155, 0.0, 0.0], + [-1.6094379124341003, -0.5108256237659905, -1.6094379124341003, 0.0, 0.0], + [-1.3862943611198906, -1.3862943611198906, -0.6931471805599453, 0.0, 0.0] + ], + [ + [ + -1.791759469228055, + -1.0986122886681096, + -1.0986122886681096, + -1.791759469228055, + 0.0 + ], + [ + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.0986122886681096, + 0.0 + ], + [ + -1.6094379124341003, + -1.6094379124341003, + -0.916290731874155, + -1.6094379124341003, + 0.0 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.916290731874155, -0.916290731874155, -1.6094379124341003]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :alpha set to a different value" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, alpha: 1.0e-6) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-14.508659238523094, -0.6931476805593202, -0.6931476805593202, 0.0, 0.0], + [-14.508659238523094, -9.999990001618997e-07, -14.508659238523094, 0.0, 0.0], + [-13.815513557959774, -13.815513557959774, -1.999996000066178e-06, 0.0, 0.0] + ], + [ + [ + -14.50865973852222, + -0.6931481805584453, + -0.6931481805584453, + -14.50865973852222, + 0.0 + ], + [ + -14.50865973852222, + -0.6931481805584453, + -14.50865973852222, + -0.6931481805584453, + 0.0 + ], + [ + -13.815514557956273, + -13.815514557956273, + -2.9999924999962455e-06, + -13.815514557956273, + 0.0 + ] + ], + [ + [ + -14.508660238521093, + -14.508660238521093, + -14.508660238521093, + -0.6931486805573204, + -0.6931486805573204 + ], + [ + -14.508660238521093, + -14.508660238521093, + -14.508660238521093, + -0.6931486805573204, + -0.6931486805573204 + ], + [ + -13.815515557951773, + -13.815515557951773, + -13.815515557951773, + 
-3.999987999934312e-06, + -13.815515557951773 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.916290731874155, -0.916290731874155, -1.6094379124341003]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :fit_priors set to false" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, fit_priors: false) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-1.6094379124341003, -0.916290731874155, -0.916290731874155, 0.0, 0.0], + [-1.6094379124341003, -0.5108256237659905, -1.6094379124341003, 0.0, 0.0], + [-1.3862943611198906, -1.3862943611198906, -0.6931471805599453, 0.0, 0.0] + ], + [ + [ + -1.791759469228055, + -1.0986122886681096, + -1.0986122886681096, + -1.791759469228055, + 0.0 + ], + [ + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.0986122886681096, + 0.0 + ], + [ + -1.6094379124341003, + -1.6094379124341003, + -0.916290731874155, + -1.6094379124341003, + 0.0 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.0986122886681098, -1.0986122886681098, -1.0986122886681098]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :priors are set as a list" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, class_priors: [0.15, 0.25, 0.4]) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-1.6094379124341003, -0.916290731874155, -0.916290731874155, 0.0, 0.0], + [-1.6094379124341003, -0.5108256237659905, -1.6094379124341003, 0.0, 0.0], + [-1.3862943611198906, -1.3862943611198906, -0.6931471805599453, 0.0, 0.0] + ], + [ + [ + -1.791759469228055, + -1.0986122886681096, + -1.0986122886681096, + -1.791759469228055, + 0.0 + ], + [ + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.0986122886681096, + 0.0 + ], + [ + -1.6094379124341003, + -1.6094379124341003, + -0.916290731874155, + -1.6094379124341003, + 0.0 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + 
-1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.8971199848858813, -1.3862943611198906, -0.916290731874155]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :priors are set as a tensor" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, class_priors: Nx.tensor([0.15, 0.25, 0.4])) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-1.6094379124341003, -0.916290731874155, -0.916290731874155, 0.0, 0.0], + [-1.6094379124341003, -0.5108256237659905, -1.6094379124341003, 0.0, 0.0], + [-1.3862943611198906, -1.3862943611198906, -0.6931471805599453, 0.0, 0.0] + ], + [ + [ + -1.791759469228055, + -1.0986122886681096, + -1.0986122886681096, + -1.791759469228055, + 0.0 + ], + [ + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.0986122886681096, + 0.0 + ], + [ + -1.6094379124341003, + -1.6094379124341003, + -0.916290731874155, + -1.6094379124341003, + 0.0 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.8971199848858813, -1.3862943611198906, -0.916290731874155]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :sample_weights are set as a list" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, sample_weights: [1.5, 4, 2, 7, 4]) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.5, 4.0, 0.0, 0.0], + [0.0, 11.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0] + ], + [ + [0.0, 4.0, 1.5, 0.0, 0.0], + [0.0, 7.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.5, 4.0], + [0.0, 0.0, 0.0, 7.0, 4.0], + [0.0, 0.0, 0.0, 2.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-2.140066146850586, -1.2237753868103027, -0.5306282043457031, 0.0, 0.0], + [-2.6390573978424072, -0.15415072441101074, -2.6390573978424072, 0.0, 0.0], + [-1.6094379425048828, -1.6094379425048828, -0.5108256340026855, 0.0, 0.0] + ], + [ + [ + -2.2512917518615723, + -0.6418538093566895, + -1.335000991821289, + -2.2512917518615723, + 0.0 + ], + [-2.70805025100708, 
-0.6286087036132812, -2.70805025100708, -1.0986123085021973, 0.0], + [ + -1.7917594909667969, + -1.7917594909667969, + -0.6931471824645996, + -1.7917594909667969, + 0.0 + ] + ], + [ + [ + -2.3513753414154053, + -2.3513753414154053, + -2.3513753414154053, + -1.435084581375122, + -0.7419373989105225 + ], + [ + -2.7725887298583984, + -2.7725887298583984, + -2.7725887298583984, + -0.6931471824645996, + -1.1631507873535156 + ], + [ + -1.945910096168518, + -1.945910096168518, + -1.945910096168518, + -0.8472977876663208, + -1.945910096168518 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.2130225896835327, -0.5198752880096436, -2.224623441696167]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([5.5, 11.0, 2.0]) + end + + test "fit test - :sample_weights are set as a tensor" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, sample_weights: Nx.tensor([1.5, 4, 2, 7, 4])) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.5, 4.0, 0.0, 0.0], + [0.0, 11.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0] + ], + [ + [0.0, 4.0, 1.5, 0.0, 0.0], + [0.0, 7.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0] + ], + [ + [0.0, 0.0, 0.0, 1.5, 4.0], + [0.0, 0.0, 0.0, 7.0, 4.0], + [0.0, 0.0, 0.0, 2.0, 0.0] + ] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [-2.140066146850586, -1.2237753868103027, -0.5306282043457031, 0.0, 0.0], + [-2.6390573978424072, -0.15415072441101074, -2.6390573978424072, 0.0, 0.0], + [-1.6094379425048828, -1.6094379425048828, -0.5108256340026855, 0.0, 0.0] + ], + [ + [ + -2.2512917518615723, + -0.6418538093566895, + -1.335000991821289, + -2.2512917518615723, + 0.0 + ], + [-2.70805025100708, -0.6286087036132812, -2.70805025100708, -1.0986123085021973, 0.0], + [ + -1.7917594909667969, + -1.7917594909667969, + -0.6931471824645996, + -1.7917594909667969, + 0.0 + ] + ], + [ + [ + -2.3513753414154053, + -2.3513753414154053, + -2.3513753414154053, + -1.435084581375122, + -0.7419373989105225 + ], + [ + -2.7725887298583984, + -2.7725887298583984, + -2.7725887298583984, + -0.6931471824645996, + -1.1631507873535156 + ], + [ + -1.945910096168518, + -1.945910096168518, + -1.945910096168518, + -0.8472977876663208, + -1.945910096168518 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.2130225896835327, -0.5198752880096436, -2.224623441696167]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([5.5, 11.0, 2.0]) + end + + test "fit test - :min_categories are set as a list" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, min_categories: [5, 5, 5]) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [[0.0, 0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 0.0]] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [ + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + 
-1.9459101490553132, + -0.8472978603872034, + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.791759469228055 + ] + ], + [ + [ + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + -1.9459101490553132, + -1.252762968495368, + -1.9459101490553132, + -1.252762968495368, + -1.9459101490553132 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.791759469228055 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.916290731874155, -0.916290731874155, -1.6094379124341003]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "fit test - :min_categories are set as a tensor" do + x = Nx.tensor([[1, 2, 3], [1, 3, 4], [2, 2, 3], [1, 1, 3], [2, 1, 4]]) + y = Nx.tensor([0, 1, 2, 1, 0]) + + model = Categorical.fit(x, y, num_classes: 3, min_categories: Nx.tensor([5.0, 5.0, 5.0])) + + assert model.feature_count == + Nx.tensor([ + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [ + [0.0, 1.0, 1.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] + ], + [[0.0, 0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 0.0]] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [ + [ + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + -1.9459101490553132, + -0.8472978603872034, + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.791759469228055 + ] + ], + [ + [ + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368, + -1.9459101490553132, + -1.9459101490553132 + ], + [ + -1.9459101490553132, + -1.252762968495368, + -1.9459101490553132, + -1.252762968495368, + -1.9459101490553132 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055, + -1.791759469228055 + ] + ], + [ + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.9459101490553132, + -1.9459101490553132, + -1.9459101490553132, + -1.252762968495368, + -1.252762968495368 + ], + [ + -1.791759469228055, + -1.791759469228055, + -1.791759469228055, + -1.0986122886681096, + -1.791759469228055 + ] + ] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.916290731874155, -0.916290731874155, -1.6094379124341003]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + end + + describe "errors" do + test "wrong input rank" do + assert_raise ArgumentError, + "expected x to have shape {num_samples, num_features}, got tensor 
with shape: {4}", + fn -> + Categorical.fit( + Nx.tensor([1, 2, 5, 8]), + Nx.tensor([1, 2, 3, 4]), + num_classes: 4 + ) + end + end + + test "wrong target rank" do + assert_raise ArgumentError, + "expected y to have shape {num_samples}, got tensor with shape: {1, 4}", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8]]), + Nx.tensor([[1, 2, 3, 4]]), + num_classes: 4 + ) + end + end + + test "wrong input shape" do + assert_raise ArgumentError, + "expected first dimension of x and y to be of same size, got: 1 and 4", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8]]), + Nx.tensor([1, 2, 3, 4]), + num_classes: 4 + ) + end + end + + test "wrong prior size" do + assert_raise ArgumentError, + "expected class_priors to be list of length num_classes = 2, got: 3", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8], [2, 5, 7, 3]]), + Nx.tensor([1, 0]), + num_classes: 2, + class_priors: [0.4, 0.4, 0.2] + ) + end + end + + test "wrong sample_weights size" do + assert_raise ArgumentError, + "expected sample_weights to be list of length num_samples = 2, got: 3", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8], [2, 5, 7, 3]]), + Nx.tensor([1, 0]), + num_classes: 2, + sample_weights: [0.4, 0.4, 0.2] + ) + end + end + + test "wrong min_categories size" do + assert_raise ArgumentError, + "expected min_categories to be list of length num_features = 4, got: 3", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8], [2, 5, 7, 3]]), + Nx.tensor([1, 0]), + num_classes: 2, + min_categories: [5.0, 5.0, 5.0] + ) + end + end + + test "wrong alpha size" do + assert_raise ArgumentError, + "when alpha is list it should have length equal to num_features = 4, got: 3", + fn -> + Categorical.fit( + Nx.tensor([[1, 2, 5, 8], [2, 5, 7, 3]]), + Nx.tensor([1, 0]), + num_classes: 2, + alpha: [0.4, 0.4, 0.2] + ) + end + end + + test "wrong input shape in training process" do + assert_raise ArgumentError, + "expected x to have same second dimension as data used for fitting model, got: 5 for x and 6 for training data", + fn -> + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 4, 3, 4, 5]) + + model = Categorical.fit(x, y, num_classes: 6) + + x_test = Nx.tensor([[1, 2, 3, 4, 5], [0, 0, 0, 0, 0]]) + + Categorical.predict(model, x_test, Nx.tensor([0, 1, 2, 3, 4, 5])) + end + end + end + + describe "predict" do + test "predicts classes correctly for new data" do + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 4, 3, 4, 5]) + + model = Categorical.fit(x, y, num_classes: 6) + + x_test = Nx.tensor([[1, 2, 3, 4, 5, 6], [0, 0, 0, 0, 0, 0]]) + + minus_infinity = Nx.Constants.infinity() |> Nx.negate() |> Nx.to_number() + + predictions = Categorical.predict(model, x_test, Nx.tensor([0, 1, 2, 3, 4, 5])) + + assert predictions == Nx.tensor([4, 1]) + + log_probability = + Categorical.predict_log_probability(model, x_test) + + assert_all_close( + log_probability, + Nx.tensor([ + [ + minus_infinity, + -1.5314763709643877, + minus_infinity, + -1.5314763709643877, + -1.0459685551826894, + -1.5314763709643877 + ], + [ + minus_infinity, + -1.0340737675305398, + minus_infinity, + -1.7272209480904834, + -1.241713132308785, + -1.7272209480904834 + ] + ]) + ) + + probability = + Categorical.predict_probability(model, x_test) + + assert_all_close( + probability, + Nx.tensor([ + [ + 0.0, + 0.2162162162162164, + 0.0, + 0.2162162162162164, + 0.35135135135135076, + 0.2162162162162164 + ], + [ + 0.0, + 0.35555555555555507, + 0.0, + 0.17777777777777784, + 0.2888888888888883, + 0.17777777777777784 + ] + ]) + ) + + joint_log_probability = + 
Categorical.predict_joint_log_probability(model, x_test) + + assert_all_close( + joint_log_probability, + Nx.tensor([ + [ + minus_infinity, + -21.69805624276889, + minus_infinity, + -21.69805624276889, + -21.21254842698719, + -21.69805624276889 + ], + [ + minus_infinity, + -21.004909062208945, + minus_infinity, + -21.69805624276889, + -21.21254842698719, + -21.69805624276889 + ] + ]) + ) + end + end +end From e38812e8ebfa11f52151dd2069c4367dfc165da4 Mon Sep 17 00:00:00 2001 From: Karol Sewilo Date: Tue, 31 Dec 2024 14:39:16 +0100 Subject: [PATCH 2/2] Apply comment suggestions --- lib/scholar/naive_bayes/categorical.ex | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/scholar/naive_bayes/categorical.ex b/lib/scholar/naive_bayes/categorical.ex index 29598363..5b864f6b 100644 --- a/lib/scholar/naive_bayes/categorical.ex +++ b/lib/scholar/naive_bayes/categorical.ex @@ -476,8 +476,8 @@ defmodule Scholar.NaiveBayes.Categorical do }, x ) do - {_, _, _, jll} = - while {i = 0, feature_log_probability, x, jll = Nx.broadcast(0.0, Nx.shape(x))}, + {_, jll} = + while {{i = 0, feature_log_probability, x}, jll = Nx.broadcast(0.0, Nx.shape(x))}, i < Nx.axis_size(x, 1) do indices = Nx.slice_along_axis(x, i, 1, axis: 1) |> Nx.squeeze(axes: [1]) @@ -488,11 +488,10 @@ defmodule Scholar.NaiveBayes.Categorical do |> Nx.transpose() |> Nx.add(jll) - {i + 1, feature_log_probability, x, jll} + {{i + 1, feature_log_probability, x}, jll} end - total_jll = jll + class_log_priors - total_jll + jll + class_log_priors end defnp count(x, y_weighted, num_features, num_classes, num_categories, num_samples) do @@ -500,10 +499,10 @@ defmodule Scholar.NaiveBayes.Categorical do feature_count = Nx.broadcast(0.0, {num_features, num_classes, num_categories}) - {_, _, _, _, feature_count} = - while {i = 0, x, y_weighted, num_features, feature_count}, i < num_samples do - {_, _, _, _, _, feature_count} = - while {j = 0, x, y_weighted, i, num_features, feature_count}, j < num_features do + {_, feature_count} = + while {{i = 0, x, y_weighted, num_features}, feature_count}, i < num_samples do + {_, feature_count} = + while {{j = 0, x, y_weighted, i, num_features}, feature_count}, j < num_features do category_value = x[i][j] class_label = Nx.argmax(y_weighted[i]) @@ -511,10 +510,10 @@ defmodule Scholar.NaiveBayes.Categorical do feature_count = Nx.indexed_add(feature_count, index, y_weighted[i][class_label]) - {j + 1, x, y_weighted, i, num_features, feature_count} + {{j + 1, x, y_weighted, i, num_features}, feature_count} end - {i + 1, x, y_weighted, num_features, feature_count} + {{i + 1, x, y_weighted, num_features}, feature_count} end {class_count, feature_count} @@ -523,8 +522,8 @@ defmodule Scholar.NaiveBayes.Categorical do defnp compute_feature_log_probability(feature_count, alpha, min_categories) do feature_log_probability = Nx.broadcast(0.0, Nx.shape(feature_count)) - {_, _, _, _, feature_log_probability} = - while {i = 0, feature_count, alpha, min_categories, feature_log_probability}, + {_, feature_log_probability} = + while {{i = 0, feature_count, alpha, min_categories}, feature_log_probability}, i < Nx.axis_size(feature_count, 0) do smoothed_class_count = Nx.sum(feature_count[i], axes: [1]) @@ -548,7 +547,7 @@ defmodule Scholar.NaiveBayes.Categorical do feature_log_probability = Nx.put_slice(feature_log_probability, [i, 0, 0], smoothed_cat_count) - {i + 1, feature_count, alpha, min_categories, feature_log_probability} + {{i + 1, feature_count, alpha, min_categories}, 
feature_log_probability} end feature_log_probability
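
As a quick end-to-end illustration of the API introduced by this series, here
is a minimal sketch (the input tensors and variable names are illustrative,
not part of the patch):

    x_train = Nx.tensor([[1, 2, 2], [1, 2, 1], [2, 2, 0]])
    y_train = Nx.tensor([0, 1, 1])

    # Fit with two classes; targets are assumed to be integers in
    # 0..num_classes - 1.
    model = Scholar.NaiveBayes.Categorical.fit(x_train, y_train, num_classes: 2)

    # predict/3 takes the sorted class labels explicitly.
    classes = Nx.iota({2})
    x_test = Nx.tensor([[1, 2, 0], [2, 2, 2]])

    Scholar.NaiveBayes.Categorical.predict(model, x_test, classes)
    Scholar.NaiveBayes.Categorical.predict_probability(model, x_test)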