From d43a45dcf89c5f3f343d23d8399919b3cf39ba96 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 4 Oct 2024 09:43:44 +0200 Subject: [PATCH 01/16] added dtype to kwargs passed to TfidfVectorizer --- emm/indexing/pandas_normalized_tfidf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index 4458615..058f06e 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -37,8 +37,6 @@ class PandasNormalizedTfidfVectorizer(TfidfVectorizer): """Implementation of customized TFIDF vectorizer""" - dtype = np.float32 - def __init__(self, **kwargs: Any) -> None: """Implementation of customized TFIDF vectorizer @@ -53,7 +51,7 @@ def __init__(self, **kwargs: Any) -> None: Args: kwargs: kew-word arguments are same as TfidfVectorizer. """ - kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) + kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True, "dtype": np.float32}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" super().__init__(**kwargs) From 9525192bc5fe674854457271ca1e8ca0548f97b1 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 4 Oct 2024 09:44:34 +0200 Subject: [PATCH 02/16] added branch to triggers --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e5247e7..cacac1a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, PandasNormalizedTfidfVectorizer-dtype] pull_request: jobs: From d575701159bc87128876bb7646828b70281867d2 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 4 Oct 2024 12:07:02 +0200 Subject: [PATCH 03/16] added print statements for debugging purposes --- emm/indexing/pandas_normalized_tfidf.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index 058f06e..f43433e 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -37,6 +37,8 @@ class PandasNormalizedTfidfVectorizer(TfidfVectorizer): """Implementation of customized TFIDF vectorizer""" + dtype = np.float32 + def __init__(self, **kwargs: Any) -> None: """Implementation of customized TFIDF vectorizer @@ -51,7 +53,7 @@ def __init__(self, **kwargs: Any) -> None: Args: kwargs: kew-word arguments are same as TfidfVectorizer. """ - kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True, "dtype": np.float32}) + kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" super().__init__(**kwargs) @@ -90,6 +92,12 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: assert self._tfidf._idf_diag.dtype == self.dtype else: # sklearn >= 1.5 + print("X is", X.size) + print("X nulls", X.isna().any()) + print("self.dtype is", self.dtype) + print("n_features is", n_features) + print("np.ones dtype is", np.ones(n_features, dtype=self.dtype).dtype) + print("self.idf_ dtype is", self.idf_.dtype) self.idf_ = self.idf_ - np.ones(n_features, dtype=self.dtype) assert self.idf_.dtype == self.dtype From 00a3b1392556cc98aca9db2a843235df33385af8 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 4 Oct 2024 13:44:37 +0200 Subject: [PATCH 04/16] added check for existence of dtype in kwargs --- emm/indexing/pandas_normalized_tfidf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index f43433e..b528858 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -37,8 +37,6 @@ class PandasNormalizedTfidfVectorizer(TfidfVectorizer): """Implementation of customized TFIDF vectorizer""" - dtype = np.float32 - def __init__(self, **kwargs: Any) -> None: """Implementation of customized TFIDF vectorizer @@ -54,6 +52,8 @@ def __init__(self, **kwargs: Any) -> None: kwargs: kew-word arguments are same as TfidfVectorizer. """ kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) + if "dtype" not in kwargs: + kwargs.update({"dtype": np.float32}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" super().__init__(**kwargs) From 427da4eb38ab311d60bef4f7b6ceebfbf35b7fa8 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 4 Oct 2024 13:58:40 +0200 Subject: [PATCH 05/16] added more print statements --- emm/indexing/pandas_cos_sim_matcher.py | 1 + emm/indexing/pandas_normalized_tfidf.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/emm/indexing/pandas_cos_sim_matcher.py b/emm/indexing/pandas_cos_sim_matcher.py index 21ecb85..0deddc2 100644 --- a/emm/indexing/pandas_cos_sim_matcher.py +++ b/emm/indexing/pandas_cos_sim_matcher.py @@ -104,6 +104,7 @@ def __init__( self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count() self.spark_session = spark_session # attributes below are set during fit + print("Printing from inside PandasCosSomIndexer: dtype is", dtype) self.tfidf = PandasNormalizedTfidfVectorizer( analyzer={"words": "word", "characters": "char"}[tokenizer], binary=binary_countvectorizer, diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index b528858..e7b44de 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -52,8 +52,9 @@ def __init__(self, **kwargs: Any) -> None: kwargs: kew-word arguments are same as TfidfVectorizer. """ kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) - if "dtype" not in kwargs: - kwargs.update({"dtype": np.float32}) + print("Printing from inside PandasNormalizedTfidfVectorizer: kwargs are", kwargs) + # if "dtype" not in kwargs: + # kwargs.update({"dtype": np.float32}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" super().__init__(**kwargs) @@ -93,6 +94,7 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: else: # sklearn >= 1.5 print("X is", X.size) + print("X dtype is", X.dtype) print("X nulls", X.isna().any()) print("self.dtype is", self.dtype) print("n_features is", n_features) From b012cebe58bbc6d22ef3cf106afa56c20ea5e9be Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 13:23:21 +0200 Subject: [PATCH 06/16] added explicit conversion of idf_ dtype --- emm/indexing/pandas_normalized_tfidf.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index e7b44de..7f0bf1f 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -52,9 +52,6 @@ def __init__(self, **kwargs: Any) -> None: kwargs: kew-word arguments are same as TfidfVectorizer. """ kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) - print("Printing from inside PandasNormalizedTfidfVectorizer: kwargs are", kwargs) - # if "dtype" not in kwargs: - # kwargs.update({"dtype": np.float32}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" super().__init__(**kwargs) @@ -75,6 +72,8 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: with Timer("CustomizedTfidfVectorizer.fit") as timer: timer.label("super fit") super().fit(X) + # scikit-learn's TidfVectorizer does not preserve dtype for large X, so we force it here + self.idf_ = self.idf_.astype(self.dtype) timer.label("normalize") n_features = self.idf_.shape[0] @@ -93,13 +92,6 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: assert self._tfidf._idf_diag.dtype == self.dtype else: # sklearn >= 1.5 - print("X is", X.size) - print("X dtype is", X.dtype) - print("X nulls", X.isna().any()) - print("self.dtype is", self.dtype) - print("n_features is", n_features) - print("np.ones dtype is", np.ones(n_features, dtype=self.dtype).dtype) - print("self.idf_ dtype is", self.idf_.dtype) self.idf_ = self.idf_ - np.ones(n_features, dtype=self.dtype) assert self.idf_.dtype == self.dtype From 7fa75ad752c318c38d55a1edc900d44bd7d3e5c2 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 13:28:55 +0200 Subject: [PATCH 07/16] removed print statement --- emm/indexing/pandas_cos_sim_matcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/emm/indexing/pandas_cos_sim_matcher.py b/emm/indexing/pandas_cos_sim_matcher.py index 0deddc2..21ecb85 100644 --- a/emm/indexing/pandas_cos_sim_matcher.py +++ b/emm/indexing/pandas_cos_sim_matcher.py @@ -104,7 +104,6 @@ def __init__( self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count() self.spark_session = spark_session # attributes below are set during fit - print("Printing from inside PandasCosSomIndexer: dtype is", dtype) self.tfidf = PandasNormalizedTfidfVectorizer( analyzer={"words": "word", "characters": "char"}[tokenizer], binary=binary_countvectorizer, From 337bdc091ed91e83a45ec87f9ada1e81340964d2 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 14:52:39 +0200 Subject: [PATCH 08/16] added related test --- tests/integration/test_pandas_em.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py index b60e132..57ce5a5 100644 --- a/tests/integration/test_pandas_em.py +++ b/tests/integration/test_pandas_em.py @@ -21,6 +21,7 @@ import logging import os +import uuid import numpy as np import pandas as pd @@ -135,6 +136,17 @@ def test_pandas_tfidf(dtype): np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001) +@pytest.mark.parametrize( + ("dtype", "data_size"), [(np.float32, 100), (np.float64, True), (np.float32, 1000000), (np.float64, 1000000)] +) +def test_pandas_tfidf_dtype(dtype, data_size): + pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype) + unique_names = [str(uuid.uuid4()) for i in range(data_size)] + gt_names = pd.Series(unique_names) + pandas_t.fit(gt_names) + assert pandas_t.idf_.dtype == dtype + + def test_pandas_tfidf_ngram(): pandas_t = PandasNormalizedTfidfVectorizer(binary=True, analyzer="char", ngram_range=(3, 3)) gt_names = pd.Series(["aaab", "bbbc"]) From 3a8e36cf292183b88edcedafc57006e8dd1341d9 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 14:54:40 +0200 Subject: [PATCH 09/16] corrected test argument --- tests/integration/test_pandas_em.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py index 57ce5a5..a7c5ded 100644 --- a/tests/integration/test_pandas_em.py +++ b/tests/integration/test_pandas_em.py @@ -137,7 +137,7 @@ def test_pandas_tfidf(dtype): @pytest.mark.parametrize( - ("dtype", "data_size"), [(np.float32, 100), (np.float64, True), (np.float32, 1000000), (np.float64, 1000000)] + ("dtype", "data_size"), [(np.float32, 100), (np.float64, 100), (np.float32, 1000000), (np.float64, 1000000)] ) def test_pandas_tfidf_dtype(dtype, data_size): pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype) From c55d74f48f93af44e1ca79139cd00e7e8d93942f Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 15:34:37 +0200 Subject: [PATCH 10/16] removed branch from testing triggers --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cacac1a..e5247e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main, PandasNormalizedTfidfVectorizer-dtype] + branches: [ main ] pull_request: jobs: From 2394ff4e96f738d34673352e84972eaede9615da Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 15:40:51 +0200 Subject: [PATCH 11/16] fixed typo --- emm/indexing/pandas_normalized_tfidf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index 7f0bf1f..6497a2f 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -72,7 +72,7 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: with Timer("CustomizedTfidfVectorizer.fit") as timer: timer.label("super fit") super().fit(X) - # scikit-learn's TidfVectorizer does not preserve dtype for large X, so we force it here + # scikit-learn's TfidfVectorizer does not preserve dtype for large X, so we force it here self.idf_ = self.idf_.astype(self.dtype) timer.label("normalize") From c7915f28cb06c037379fcbb6929bdc30e05bbd25 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 22:12:42 +0200 Subject: [PATCH 12/16] added default np.float32 dtype argument --- emm/indexing/pandas_normalized_tfidf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py index 6497a2f..3b3162b 100644 --- a/emm/indexing/pandas_normalized_tfidf.py +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -51,6 +51,7 @@ def __init__(self, **kwargs: Any) -> None: Args: kwargs: kew-word arguments are same as TfidfVectorizer. """ + kwargs.setdefault("dtype", np.float32) kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True}) if kwargs.get("analyzer") in {"word", None}: kwargs["token_pattern"] = r"\w+" From 31642ae7648c13eb421c2a2b81f24e74d90a8c24 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 22:13:33 +0200 Subject: [PATCH 13/16] added branch to test trigger --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e5247e7..f8b0604 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, PandasNormalizedTfidfVectorizer-dtype ] pull_request: jobs: From 55b433c38e9de238b8e2376c941f5ccb83679e16 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 22:35:53 +0200 Subject: [PATCH 14/16] added test for default case of dtype --- tests/integration/test_pandas_em.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py index a7c5ded..f14e30d 100644 --- a/tests/integration/test_pandas_em.py +++ b/tests/integration/test_pandas_em.py @@ -136,10 +136,18 @@ def test_pandas_tfidf(dtype): np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001) +def test_pandas_tfidf_default_dtype(): + pandas_t = PandasNormalizedTfidfVectorizer() + unique_names = [str(uuid.uuid4()) for i in range(100)] + gt_names = pd.Series(unique_names) + pandas_t.fit(gt_names) + assert pandas_t.idf_.dtype == np.float32 + + @pytest.mark.parametrize( ("dtype", "data_size"), [(np.float32, 100), (np.float64, 100), (np.float32, 1000000), (np.float64, 1000000)] ) -def test_pandas_tfidf_dtype(dtype, data_size): +def test_pandas_tfidf_dtype_for_different_input_sizes(dtype, data_size): pandas_t = PandasNormalizedTfidfVectorizer(dtype=dtype) unique_names = [str(uuid.uuid4()) for i in range(data_size)] gt_names = pd.Series(unique_names) From 722c31271a7a071b2ce6bcc07d3b93a3d1a7849f Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sun, 6 Oct 2024 22:56:46 +0200 Subject: [PATCH 15/16] removed branch from test trigger --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f8b0604..e5247e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main, PandasNormalizedTfidfVectorizer-dtype ] + branches: [ main ] pull_request: jobs: From 6a1592752a563c3211ea5bd0e8d2cbe6fdeec9e6 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Mon, 7 Oct 2024 09:54:56 +0200 Subject: [PATCH 16/16] bumped emm version to 2.1.6 --- emm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emm/version.py b/emm/version.py index 006f8eb..acb041b 100644 --- a/emm/version.py +++ b/emm/version.py @@ -17,6 +17,6 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -VERSION = "2.1.5" +VERSION = "2.1.6" __version__ = VERSION