From d43a45dcf89c5f3f343d23d8399919b3cf39ba96 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 4 Oct 2024 09:43:44 +0200
Subject: [PATCH 01/16] added dtype to kwargs passed to TfidfVectorizer

---
 emm/indexing/pandas_normalized_tfidf.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index 4458615..058f06e 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -37,8 +37,6 @@
 class PandasNormalizedTfidfVectorizer(TfidfVectorizer):
     """Implementation of customized TFIDF vectorizer"""
 
-    dtype = np.float32
-
     def __init__(self, **kwargs: Any) -> None:
         """Implementation of customized TFIDF vectorizer
 
@@ -53,7 +51,7 @@ def __init__(self, **kwargs: Any) -> None:
         Args:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
-        kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
+        kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True, "dtype": np.float32})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"
         super().__init__(**kwargs)

From 9525192bc5fe674854457271ca1e8ca0548f97b1 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 4 Oct 2024 09:44:34 +0200
Subject: [PATCH 02/16] added branch to triggers

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e5247e7..cacac1a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main,  PandasNormalizedTfidfVectorizer-dtype]
   pull_request:
 
 jobs:

From d575701159bc87128876bb7646828b70281867d2 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 4 Oct 2024 12:07:02 +0200
Subject: [PATCH 03/16] added print statements for debugging purposes

---
 emm/indexing/pandas_normalized_tfidf.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index 058f06e..f43433e 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -37,6 +37,8 @@
 class PandasNormalizedTfidfVectorizer(TfidfVectorizer):
     """Implementation of customized TFIDF vectorizer"""
 
+    dtype = np.float32
+
     def __init__(self, **kwargs: Any) -> None:
         """Implementation of customized TFIDF vectorizer
 
@@ -51,7 +53,7 @@ def __init__(self, **kwargs: Any) -> None:
         Args:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
-        kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True, "dtype": np.float32})
+        kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"
         super().__init__(**kwargs)
@@ -90,6 +92,12 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
                 assert self._tfidf._idf_diag.dtype == self.dtype
             else:
                 # sklearn >= 1.5
+                print("X is", X.size)
+                print("X nulls", X.isna().any())
+                print("self.dtype is", self.dtype)
+                print("n_features is", n_features)
+                print("np.ones dtype is", np.ones(n_features, dtype=self.dtype).dtype)
+                print("self.idf_ dtype is", self.idf_.dtype)
                 self.idf_ = self.idf_ - np.ones(n_features, dtype=self.dtype)
                 assert self.idf_.dtype == self.dtype
 

From 00a3b1392556cc98aca9db2a843235df33385af8 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 4 Oct 2024 13:44:37 +0200
Subject: [PATCH 04/16] added check for existence of dtype in kwargs

---
 emm/indexing/pandas_normalized_tfidf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index f43433e..b528858 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -37,8 +37,6 @@
 class PandasNormalizedTfidfVectorizer(TfidfVectorizer):
     """Implementation of customized TFIDF vectorizer"""
 
-    dtype = np.float32
-
     def __init__(self, **kwargs: Any) -> None:
         """Implementation of customized TFIDF vectorizer
 
@@ -54,6 +52,8 @@ def __init__(self, **kwargs: Any) -> None:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
         kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
+        if "dtype" not in kwargs:
+            kwargs.update({"dtype": np.float32})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"
         super().__init__(**kwargs)

From 427da4eb38ab311d60bef4f7b6ceebfbf35b7fa8 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 4 Oct 2024 13:58:40 +0200
Subject: [PATCH 05/16] added more print statements

---
 emm/indexing/pandas_cos_sim_matcher.py  | 1 +
 emm/indexing/pandas_normalized_tfidf.py | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/emm/indexing/pandas_cos_sim_matcher.py b/emm/indexing/pandas_cos_sim_matcher.py
index 21ecb85..0deddc2 100644
--- a/emm/indexing/pandas_cos_sim_matcher.py
+++ b/emm/indexing/pandas_cos_sim_matcher.py
@@ -104,6 +104,7 @@ def __init__(
         self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count()
         self.spark_session = spark_session
         # attributes below are set during fit
+        print("Printing from inside PandasCosSomIndexer: dtype is", dtype)
         self.tfidf = PandasNormalizedTfidfVectorizer(
             analyzer={"words": "word", "characters": "char"}[tokenizer],
             binary=binary_countvectorizer,
diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index b528858..e7b44de 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -52,8 +52,9 @@ def __init__(self, **kwargs: Any) -> None:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
         kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
-        if "dtype" not in kwargs:
-            kwargs.update({"dtype": np.float32})
+        print("Printing from inside PandasNormalizedTfidfVectorizer: kwargs are", kwargs)
+        # if "dtype" not in kwargs:
+        #     kwargs.update({"dtype": np.float32})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"
         super().__init__(**kwargs)
@@ -93,6 +94,7 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
             else:
                 # sklearn >= 1.5
                 print("X is", X.size)
+                print("X dtype is", X.dtype)
                 print("X nulls", X.isna().any())
                 print("self.dtype is", self.dtype)
                 print("n_features is", n_features)

From b012cebe58bbc6d22ef3cf106afa56c20ea5e9be Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 13:23:21 +0200
Subject: [PATCH 06/16] added explicit conversion of idf_ dtype

---
 emm/indexing/pandas_normalized_tfidf.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index e7b44de..7f0bf1f 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -52,9 +52,6 @@ def __init__(self, **kwargs: Any) -> None:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
         kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
-        print("Printing from inside PandasNormalizedTfidfVectorizer: kwargs are", kwargs)
-        # if "dtype" not in kwargs:
-        #     kwargs.update({"dtype": np.float32})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"
         super().__init__(**kwargs)
@@ -75,6 +72,8 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
         with Timer("CustomizedTfidfVectorizer.fit") as timer:
             timer.label("super fit")
             super().fit(X)
+            # scikit-learn's TidfVectorizer does not preserve dtype for large X, so we force it here
+            self.idf_ = self.idf_.astype(self.dtype)
 
             timer.label("normalize")
             n_features = self.idf_.shape[0]
@@ -93,13 +92,6 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
                 assert self._tfidf._idf_diag.dtype == self.dtype
             else:
                 # sklearn >= 1.5
-                print("X is", X.size)
-                print("X dtype is", X.dtype)
-                print("X nulls", X.isna().any())
-                print("self.dtype is", self.dtype)
-                print("n_features is", n_features)
-                print("np.ones dtype is", np.ones(n_features, dtype=self.dtype).dtype)
-                print("self.idf_ dtype is", self.idf_.dtype)
                 self.idf_ = self.idf_ - np.ones(n_features, dtype=self.dtype)
                 assert self.idf_.dtype == self.dtype
 

From 7fa75ad752c318c38d55a1edc900d44bd7d3e5c2 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 13:28:55 +0200
Subject: [PATCH 07/16] removed print statement

---
 emm/indexing/pandas_cos_sim_matcher.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/emm/indexing/pandas_cos_sim_matcher.py b/emm/indexing/pandas_cos_sim_matcher.py
index 0deddc2..21ecb85 100644
--- a/emm/indexing/pandas_cos_sim_matcher.py
+++ b/emm/indexing/pandas_cos_sim_matcher.py
@@ -104,7 +104,6 @@ def __init__(
         self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count()
         self.spark_session = spark_session
         # attributes below are set during fit
-        print("Printing from inside PandasCosSomIndexer: dtype is", dtype)
         self.tfidf = PandasNormalizedTfidfVectorizer(
             analyzer={"words": "word", "characters": "char"}[tokenizer],
             binary=binary_countvectorizer,

From 337bdc091ed91e83a45ec87f9ada1e81340964d2 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 14:52:39 +0200
Subject: [PATCH 08/16] added related test

---
 tests/integration/test_pandas_em.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py
index b60e132..57ce5a5 100644
--- a/tests/integration/test_pandas_em.py
+++ b/tests/integration/test_pandas_em.py
@@ -21,6 +21,7 @@
 
 import logging
 import os
+import uuid
 
 import numpy as np
 import pandas as pd
@@ -135,6 +136,17 @@ def test_pandas_tfidf(dtype):
         np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001)
 
 
+@pytest.mark.parametrize(
+    ("dtype", "data_size"), [(np.float32, 100), (np.float64, True), (np.float32, 1000000), (np.float64, 1000000)]
+)
+def test_pandas_tfidf_dtype(dtype, data_size):
+    pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype)
+    unique_names = [str(uuid.uuid4()) for i in range(data_size)]
+    gt_names = pd.Series(unique_names)
+    pandas_t.fit(gt_names)
+    assert pandas_t.idf_.dtype == dtype
+
+
 def test_pandas_tfidf_ngram():
     pandas_t = PandasNormalizedTfidfVectorizer(binary=True, analyzer="char", ngram_range=(3, 3))
     gt_names = pd.Series(["aaab", "bbbc"])

From 3a8e36cf292183b88edcedafc57006e8dd1341d9 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 14:54:40 +0200
Subject: [PATCH 09/16] corrected test argument

---
 tests/integration/test_pandas_em.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py
index 57ce5a5..a7c5ded 100644
--- a/tests/integration/test_pandas_em.py
+++ b/tests/integration/test_pandas_em.py
@@ -137,7 +137,7 @@ def test_pandas_tfidf(dtype):
 
 
 @pytest.mark.parametrize(
-    ("dtype", "data_size"), [(np.float32, 100), (np.float64, True), (np.float32, 1000000), (np.float64, 1000000)]
+    ("dtype", "data_size"), [(np.float32, 100), (np.float64, 100), (np.float32, 1000000), (np.float64, 1000000)]
 )
 def test_pandas_tfidf_dtype(dtype, data_size):
     pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype)

From c55d74f48f93af44e1ca79139cd00e7e8d93942f Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 15:34:37 +0200
Subject: [PATCH 10/16] removed branch from testing triggers

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index cacac1a..e5247e7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main,  PandasNormalizedTfidfVectorizer-dtype]
+    branches: [ main ]
   pull_request:
 
 jobs:

From 2394ff4e96f738d34673352e84972eaede9615da Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 15:40:51 +0200
Subject: [PATCH 11/16] fixed typo

---
 emm/indexing/pandas_normalized_tfidf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index 7f0bf1f..6497a2f 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -72,7 +72,7 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
         with Timer("CustomizedTfidfVectorizer.fit") as timer:
             timer.label("super fit")
             super().fit(X)
-            # scikit-learn's TidfVectorizer does not preserve dtype for large X, so we force it here
+            # scikit-learn's TfidfVectorizer does not preserve dtype for large X, so we force it here
             self.idf_ = self.idf_.astype(self.dtype)
 
             timer.label("normalize")

From c7915f28cb06c037379fcbb6929bdc30e05bbd25 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 22:12:42 +0200
Subject: [PATCH 12/16] added default np.float32 dtype argument

---
 emm/indexing/pandas_normalized_tfidf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py
index 6497a2f..3b3162b 100644
--- a/emm/indexing/pandas_normalized_tfidf.py
+++ b/emm/indexing/pandas_normalized_tfidf.py
@@ -51,6 +51,7 @@ def __init__(self, **kwargs: Any) -> None:
         Args:
             kwargs: kew-word arguments are same as TfidfVectorizer.
         """
+        kwargs.setdefault("dtype", np.float32)
         kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
         if kwargs.get("analyzer") in {"word", None}:
             kwargs["token_pattern"] = r"\w+"

From 31642ae7648c13eb421c2a2b81f24e74d90a8c24 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 22:13:33 +0200
Subject: [PATCH 13/16] added branch to test trigger

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e5247e7..f8b0604 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, PandasNormalizedTfidfVectorizer-dtype ]
   pull_request:
 
 jobs:

From 55b433c38e9de238b8e2376c941f5ccb83679e16 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 22:35:53 +0200
Subject: [PATCH 14/16] added test for default case of dtype

---
 tests/integration/test_pandas_em.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py
index a7c5ded..f14e30d 100644
--- a/tests/integration/test_pandas_em.py
+++ b/tests/integration/test_pandas_em.py
@@ -136,10 +136,18 @@ def test_pandas_tfidf(dtype):
         np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001)
 
 
+def test_pandas_tfidf_default_dtype():
+    pandas_t = PandasNormalizedTfidfVectorizer()  # no dtype argument: exercises the default
+    unique_names = [str(uuid.uuid4()) for i in range(100)]  # 100 random UUID strings as input names
+    gt_names = pd.Series(unique_names)
+    pandas_t.fit(gt_names)
+    assert pandas_t.idf_.dtype == np.float32  # fitted idf_ must be float32 when dtype is not given
+
+
 @pytest.mark.parametrize(
     ("dtype", "data_size"), [(np.float32, 100), (np.float64, 100), (np.float32, 1000000), (np.float64, 1000000)]
 )
-def test_pandas_tfidf_dtype(dtype, data_size):
+def test_pandas_tfidf_dtype_for_different_input_sizes(dtype, data_size):
     pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype)
     unique_names = [str(uuid.uuid4()) for i in range(data_size)]
     gt_names = pd.Series(unique_names)

From 722c31271a7a071b2ce6bcc07d3b93a3d1a7849f Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sun, 6 Oct 2024 22:56:46 +0200
Subject: [PATCH 15/16] removed branch from test trigger

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f8b0604..e5247e7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main, PandasNormalizedTfidfVectorizer-dtype ]
+    branches: [ main ]
   pull_request:
 
 jobs:

From 6a1592752a563c3211ea5bd0e8d2cbe6fdeec9e6 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Mon, 7 Oct 2024 09:54:56 +0200
Subject: [PATCH 16/16] bumped emm version to 2.1.6

---
 emm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emm/version.py b/emm/version.py
index 006f8eb..acb041b 100644
--- a/emm/version.py
+++ b/emm/version.py
@@ -17,6 +17,6 @@
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-VERSION = "2.1.5"
+VERSION = "2.1.6"
 
 __version__ = VERSION