Commit

update lenlp

Raphael Sourty committed May 26, 2024
1 parent bebb545 commit 6c7b690
Showing 19 changed files with 383 additions and 52 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
@@ -14,6 +14,8 @@ pyo3 = { version = "0.21.2", features = ["extension-module", "generate-import-li
serde = { version = "1.0.202", features = ["derive"] }
serde_json = { version = "1.0.117" }
bincode = "1.3.3"
+ndarray = "0.15"
+numpy = "0.21"

[profile.dev]
opt-level = 0
273 changes: 273 additions & 0 deletions README.md
@@ -0,0 +1,273 @@
<div align="center">
<h1>LeNLP</h1>
<p>Natural Language Processing toolbox for Python with Rust</p>
</div>

<p align="center"><img width=500 src="docs/logo.png"/></p>

<div align="center">
<!-- License -->
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-blue.svg?style=flat-square" alt="license"></a>
</div>


LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration.

## Installation

You can install LeNLP with pip:

```sh
pip install lenlp
```

## Sections

- [Installation](#installation)
- [Quick Start](#quick-start)
- [Sparse Module](#sparse-module)
- [CountVectorizer](#countvectorizer)
- [TfidfVectorizer](#tfidfvectorizer)
- [BM25Vectorizer](#bm25vectorizer)
- [FlashText](#flashtext)
- [Extras](#extras)
- [Counter](#counter)
- [Normalizer](#normalizer)

## Quick Start

### Sparse Module

The `sparse` module offers a variety of vectorizers and transformers for text data. These sparse matrices are `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed. They can be used as drop-in replacements for `scikit-learn` vectorizers.
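
As a minimal sketch of that interoperability (assuming `scikit-learn` is installed; it only uses the LeNLP API shown below, and `cosine_similarity` accepts sparse input directly):

```python
from lenlp import sparse
from sklearn.metrics.pairwise import cosine_similarity

documents = ["Hello World", "Rust based vectorizer"]

# LeNLP vectorizers return a scipy.sparse.csr_matrix, so any
# scikit-learn function that accepts sparse input works unchanged.
vectorizer = sparse.CountVectorizer(analyzer="word", normalize=True)
matrix = vectorizer.fit_transform(documents)

# Pairwise cosine similarities between the two documents.
print(cosine_similarity(matrix))
```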

#### CountVectorizer

The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`.

```python
from lenlp import sparse

vectorizer = sparse.CountVectorizer(
    ngram_range=(3, 5),  # Range of n-grams
    analyzer="char_wb",  # Options: word, char, char_wb
    normalize=True,  # Lowercase and strip accents
    stop_words=["based"],  # List of stop words
)
```

You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts:

```python
X = [
    "Hello World",
    "Rust based vectorizer",
]

matrix = vectorizer.fit_transform(X)
```

Or use separate calls:

```python
vectorizer.fit(X)
matrix = vectorizer.transform(X)
```

Benchmark:

<p align="center"><img width=500 src="docs/count_vectorizer_char.png"/></p>

LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer.

#### TfidfVectorizer

The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust.
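
For reference, tf-idf combines a term's in-document frequency with its rarity across the corpus. Shown here in scikit-learn's smoothed convention (a reasonable guess, since LeNLP mirrors the scikit-learn API; the exact variant LeNLP computes is not documented in this README):

$$
\text{tfidf}(t, d) = f(t, d) \cdot \mathrm{idf}(t), \qquad \mathrm{idf}(t) = \ln\!\left(\frac{1 + n}{1 + \mathrm{df}(t)}\right) + 1
$$

where $f(t, d)$ is the count of term $t$ in document $d$, $n$ is the number of documents, and $\mathrm{df}(t)$ is the number of documents containing $t$.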

```python
from lenlp import sparse

vectorizer = sparse.TfidfVectorizer(
    ngram_range=(3, 5),  # Range of n-grams
    analyzer="char_wb",  # Options: word, char, char_wb
    normalize=True,  # Lowercase and strip accents
    stop_words=["based"],  # List of stop words
)
```

Fit the vectorizer and transform texts:

```python
X = [
    "Hello World",
    "Rust based vectorizer",
]

matrix = vectorizer.fit_transform(X)
```

Or use separate calls:

```python
vectorizer.fit(X)
matrix = vectorizer.transform(X)
```

Benchmark:

<p align="center"><img width=500 src="docs/tfidf.png"/></p>

LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer.

#### BM25Vectorizer

The `BM25Vectorizer` converts a list of texts into a sparse matrix of BM25 weights, which typically rank relevant documents better than tf-idf or raw count weights in retrieval tasks.
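
For reference, the textbook Okapi BM25 weight of a term $t$ in a document $d$ is

$$
\text{bm25}(t, d) = \mathrm{idf}(t) \cdot \frac{f(t, d)\,(k_1 + 1)}{f(t, d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}
$$

where $f(t, d)$ is the term frequency, $|d|$ is the document length, $\mathrm{avgdl}$ is the average document length in the corpus, and $k_1 \approx 1.2$ and $b \approx 0.75$ are free parameters. This is the standard formulation; the exact variant and defaults LeNLP implements are not documented in this README.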

```python
from lenlp import sparse

vectorizer = sparse.BM25Vectorizer(
    ngram_range=(3, 5),  # Range of n-grams
    analyzer="char_wb",  # Options: word, char, char_wb
    normalize=True,  # Lowercase and strip accents
    stop_words=["based"],  # List of stop words
)
```

Fit the vectorizer and transform texts:

```python
X = [
    "Hello World",
    "Rust based vectorizer",
]

matrix = vectorizer.fit_transform(X)
```

Or use separate calls:

```python
vectorizer.fit(X)
matrix = vectorizer.transform(X)
```

Benchmark:

<p align="center"><img width=500 src="docs/bm25.png"/></p>


LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer. Scikit-learn does not provide a BM25 counterpart.

### FlashText

The `flash` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*.

```python
from lenlp import flash

flash_text = flash.FlashText(
    normalize=True  # Remove accents and lowercase
)

# Add keywords we want to retrieve:
flash_text.add(["paris", "bordeaux", "toulouse"])
```

Extract keywords and their positions from sentences:

```python
sentences = [
    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
]

flash_text.extract(sentences)
```

Output:

```python
[[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)],
 [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]]
```

The FlashText algorithm is highly efficient: it is significantly faster than regular expressions for keyword extraction. LeNLP's implementation normalizes input documents by removing accents and lowercasing them, which improves keyword matching.
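
For instance, a minimal sketch of the effect of `normalize=True` (assuming keywords are normalized the same way as documents, which is not spelled out here):

```python
from lenlp import flash

flash_text = flash.FlashText(normalize=True)
flash_text.add(["toulouse"])

# With normalize=True, "Toulouse" and "Tôulouse" are both reduced
# to "toulouse" before matching, so both variants should be found.
flash_text.extract(["Toulouse", "Tôulouse"])
```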

Benchmark:

<p align="center"><img width=500 src="docs/flashtext.png"/></p>

LeNLP FlashText is benchmarked against the official [FlashText](https://github.com/vi3k6i5/flashtext) implementation.

### Extras

#### Counter

The `counter` module converts a list of texts into dictionaries of token counts.

```python
from lenlp import counter

sentences = [
    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
]

counter.count(
    sentences,
    ngram_range=(1, 1),  # Range of n-grams
    analyzer="word",  # Options: word, char, char_wb
    normalize=True,  # Lowercase and strip accents
    stop_words=["its", "in", "is", "of", "the", "and", "to", "a"],  # List of stop words
)
```

Output:

```python
[{'compared': 1,
  'south': 1,
  'city': 1,
  'toulouse': 1,
  'bordeaux': 2,
  'france': 1},
 {'toulouse': 1,
  'france': 1,
  'capital': 1,
  'paris': 1,
  'north': 1,
  'compared': 1,
  'bordeaux': 1}]
```

#### Normalizer

The `normalizer` module normalizes a list of texts by removing accents and lowercasing.

```python
from lenlp import normalizer

sentences = [
    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
]

normalizer.normalize(sentences)
```

Output:

```python
[
    'toulouse is a city in france its in the south compared to bordeaux and bordeaux',
    'paris is the capital of france its in the north compared to bordeaux and toulouse',
]
```

## References

- *[FlashText](https://github.com/vi3k6i5/flashtext)*
- *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)*
- *[PyO3](https://github.com/PyO3/pyo3)*
- *[Maturin](https://github.com/PyO3/maturin)*

Binary file added docs/bm25.png
Binary file added docs/count_vectorizer.png
Binary file added docs/count_vectorizer_char.png
Binary file added docs/flashtext.png
Binary file added docs/logo.png
Binary file added docs/tfidf.png
2 changes: 1 addition & 1 deletion lenlp/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 2)
+VERSION = (1, 0, 3)

__version__ = ".".join(map(str, VERSION))
2 changes: 1 addition & 1 deletion lenlp/counter/count.py
@@ -46,7 +46,7 @@ def count(
    >>> counter.count("Hello, world!", sort=True)
    {'hello': 1, 'world': 1}
-    >>> counter.count("Hello, world!", ngram_range=[2, 2], sort=True, normalize=False)
+    >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False)
    {'Hello, world!': 1}
    >>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True)
8 changes: 4 additions & 4 deletions lenlp/flash/flash_text.py
@@ -23,8 +23,8 @@ class FlashText:
    >>> flash_text = flash.FlashText(normalize=True)
    >>> flash_text = flash_text.add(["hello", "world"])
-    >>> flash_text.extract(["Hello, world!", "world", "hello"], span_info=True)
-    [['hello:0:5'], [], []]
+    >>> flash_text.extract(["Hello, world!", "world", "hello"])
+    [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]]
"""

@@ -37,9 +37,9 @@ def add(self, x: str | list[str]) -> None:
        self.flash.add_keywords_many(x)
        return self

-    def extract(self, x: str | list[str], span_info: bool = False) -> list[str]:
+    def extract(self, x: str | list[str]) -> list[str]:
        """Extract keywords from a sentence."""
        is_string = isinstance(x, str)
        x = [x] if isinstance(x, str) else x
-        y = self.flash.extract_keywords_many(x, span_info=span_info)
+        y = self.flash.extract_keywords_many(x)
        return y[0] if is_string else y
4 changes: 2 additions & 2 deletions lenlp/sparse/__init__.py
@@ -1,5 +1,5 @@
from .bm25_vectorizer import BM25Vectorizer
from .count_vectorizer import CountVectorizer
-from .tfidf_vectorizer import TfIdfVectorizer
+from .tfidf_vectorizer import TfidfVectorizer

-__all__ = ["BM25Vectorizer", "CountVectorizer", "TfIdfVectorizer"]
+__all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"]
4 changes: 2 additions & 2 deletions lenlp/sparse/bm25_vectorizer.py
@@ -2,10 +2,10 @@
from scipy.sparse import csr_matrix
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2

-from .tfidf_vectorizer import TfIdfVectorizer
+from .tfidf_vectorizer import TfidfVectorizer


-class BM25Vectorizer(TfIdfVectorizer):
+class BM25Vectorizer(TfidfVectorizer):
"""BM25Vectorizer is a class that converts a collection of text documents to a sparse
bm25 matrix.
9 changes: 5 additions & 4 deletions lenlp/sparse/count_vectorizer.py
@@ -1,4 +1,3 @@
-import numpy as np
from rslenlp import SparseMatrixBuilder
from scipy.sparse import csr_matrix

@@ -48,7 +47,7 @@ class CountVectorizer:
    >>> matrix.toarray()
    array([[1, 1, 0, 0, 0],
-          [0, 0, 1, 1, 1]], dtype=int16)
+          [0, 0, 1, 1, 1]], dtype=uint64)
    >>> len(count_vectorizer.vocabulary)
    5
@@ -92,23 +91,25 @@ def transform(self, raw_documents: list[str]) -> csr_matrix:
"""Transform documents to document-term matrix."""
if not self.fitted:
raise ValueError("Call fit method before calling transform method.")

values, row_indices, column_indices = self.sparse_matrix.transform(
raw_documents
)

return csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
dtype=np.int16,
)

def fit_transform(self, raw_documents: list[str]) -> csr_matrix:
"""Learn the vocabulary dictionary and return the CountVectorizer object."""
self.fitted = True

values, row_indices, column_indices = self.sparse_matrix.fit_transform(
raw_documents
)

return csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
dtype=np.int16,
)
