diff --git a/Cargo.toml b/Cargo.toml
index 296aa98..281ee96 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,8 @@ pyo3 = { version = "0.21.2", features = ["extension-module", "generate-import-li
serde = { version = "1.0.202", features = ["derive"] }
serde_json = { version = "1.0.117" }
bincode = "1.3.3"
+ndarray = "0.15"
+numpy = "0.21"
[profile.dev]
opt-level = 0
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8d15d4d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,273 @@
+# LeNLP
+
+Natural Language Processing toolbox for Python with Rust
+
+LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration.
+
+## Installation
+
+You can install LeNLP with pip:
+
+```sh
+pip install lenlp
+```
+
+## Sections
+
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Sparse Module](#sparse-module)
+  - [CountVectorizer](#countvectorizer)
+  - [TfidfVectorizer](#tfidfvectorizer)
+  - [BM25Vectorizer](#bm25vectorizer)
+- [FlashText](#flashtext)
+- [Extras](#extras)
+  - [Counter](#counter)
+  - [Normalizer](#normalizer)
+
+## Quick Start
+
+### Sparse Module
+
+The `sparse` module offers a variety of vectorizers and transformers for text data. They output `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed, and can be used as drop-in replacements for the `scikit-learn` vectorizers.
+
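+Because the output is a standard `csr_matrix`, it can be passed directly to any `scikit-learn` estimator that accepts sparse input. The snippet below is a minimal sketch; the labels `y` are made up for illustration:
+
+```python
+from lenlp import sparse
+from sklearn.linear_model import LogisticRegression
+
+X = ["I love rust", "I love python", "rust is fast", "python is easy"]
+y = [0, 1, 0, 1]  # made-up labels, for illustration only
+
+vectorizer = sparse.CountVectorizer(analyzer="word", normalize=True)
+matrix = vectorizer.fit_transform(X)  # scipy.sparse.csr_matrix
+
+# Any estimator that accepts sparse input can consume the matrix directly.
+classifier = LogisticRegression().fit(matrix, y)
+classifier.predict(vectorizer.transform(["rust is fast"]))
+```
+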
+#### CountVectorizer
+
+The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.CountVectorizer(
+    ngram_range=(3, 5),    # Range of n-grams
+    analyzer="char_wb",    # Options: word, char, char_wb
+    normalize=True,        # Lowercase and strip accents
+    stop_words=["based"],  # List of stop words
+)
+```
+
+You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts:
+
+```python
+X = [
+ "Hello World",
+ "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
+
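+The result is a regular `scipy.sparse.csr_matrix`, and the fitted vocabulary is exposed on the vectorizer. A quick way to inspect both:
+
+```python
+matrix.shape                 # (number of documents, size of the vocabulary)
+matrix.toarray()             # dense numpy array of counts, dtype uint64
+len(vectorizer.vocabulary)   # number of n-grams learned during fit
+```
+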
+Benchmark:
+
+LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer.
+
+#### TfidfVectorizer
+
+The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.TfidfVectorizer(
+    ngram_range=(3, 5),    # Range of n-grams
+    analyzer="char_wb",    # Options: word, char, char_wb
+    normalize=True,        # Lowercase and strip accents
+    stop_words=["based"],  # List of stop words
+)
+```
+
+Fit the vectorizer and transform texts:
+
+```python
+X = [
+ "Hello World",
+ "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
+
+Benchmark:
+
+LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer.
+
+#### BM25Vectorizer
+
+The `BM25Vectorizer` converts texts into a sparse matrix of BM25 weights, which typically rank relevant documents better than raw count or tf-idf weights.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.BM25Vectorizer(
+    ngram_range=(3, 5),    # Range of n-grams
+    analyzer="char_wb",    # Options: word, char, char_wb
+    normalize=True,        # Lowercase and strip accents
+    stop_words=["based"],  # List of stop words
+)
+```
+
+Fit the vectorizer and transform texts:
+
+```python
+X = [
+ "Hello World",
+ "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
+
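+A common use of the BM25 matrix is ranking documents against a query. The sketch below encodes the query with the same vectorizer and scores documents with a sparse dot product; it is an illustration rather than a dedicated retrieval API:
+
+```python
+query = vectorizer.transform(["rust vectorizer"])
+
+# Higher score means a better match; argsort gives the ranking.
+scores = (query @ matrix.T).toarray()[0]
+ranking = scores.argsort()[::-1]
+```
+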
+Benchmark:
+
+LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer; a BM25Vectorizer counterpart is not available in Sklearn.
+
+### FlashText
+
+The `flash` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*.
+
+```python
+from lenlp import flash
+
+flash_text = flash.FlashText(
+    normalize=True  # Remove accents and lowercase
+)
+
+# Add keywords we want to retrieve:
+flash_text.add(["paris", "bordeaux", "toulouse"])
+```
+
+Extract keywords and their positions from sentences:
+
+```python
+sentences = [
+ "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+ "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+flash_text.extract(sentences)
+```
+
+Output:
+
+```python
+[[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)],
+ [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]]
+```
+
+The FlashText algorithm is highly efficient: it is significantly faster than regular expressions when many keywords are searched at once, since its cost depends on the length of the text rather than on the number of keywords. LeNLP's implementation normalizes input documents (removing accents and lowercasing) to make keyword matching more robust.
+
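+As a rough point of comparison, the same extraction written with the standard library `re` module looks like the snippet below. The alternation pattern grows with every keyword you add, whereas FlashText's cost stays tied to the length of the text:
+
+```python
+import re
+
+keywords = ["paris", "bordeaux", "toulouse"]
+pattern = re.compile(r"\b(" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)
+
+# Produces similar (keyword, start, end) triples, but slows down as keywords are added.
+[[(match.group().lower(), match.start(), match.end()) for match in pattern.finditer(sentence)]
+ for sentence in sentences]
+```
+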
+Benchmark:
+LeNLP FlashText is benchmarked versus the official implementation of [FlashText](https://github.com/vi3k6i5/flashtext).
+
+### Extras
+
+#### Counter
+
+The `counter` module converts a list of texts into dictionaries of token counts, one dictionary per text.
+
+```python
+from lenlp import counter
+
+sentences = [
+ "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+ "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+counter.count(
+    sentences,
+    ngram_range=(1, 1),  # Range of n-grams
+    analyzer="word",     # Options: word, char, char_wb
+    normalize=True,      # Lowercase and strip accents
+    stop_words=["its", "in", "is", "of", "the", "and", "to", "a"],  # List of stop words
+)
+```
+
+Output:
+
+```python
+[{'compared': 1,
+  'south': 1,
+  'city': 1,
+  'toulouse': 1,
+  'bordeaux': 2,
+  'france': 1},
+ {'toulouse': 1,
+  'france': 1,
+  'capital': 1,
+  'paris': 1,
+  'north': 1,
+  'compared': 1,
+  'bordeaux': 1}]
+```
+
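+The `analyzer` and `ngram_range` options mirror the `scikit-learn` conventions. For instance, counting character trigrams looks like this (a small sketch; it should yield the trigrams `len`, `enl` and `nlp`, each counted once):
+
+```python
+counter.count("lenlp", ngram_range=(3, 3), analyzer="char", sort=True)
+```
+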
+#### Normalizer
+
+The `normalizer` module normalizes a list of texts by removing accents and punctuation and converting to lowercase.
+
+```python
+from lenlp import normalizer
+
+sentences = [
+ "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+ "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+normalizer.normalize(sentences)
+```
+
+Output:
+
+```python
+[
+    'toulouse is a city in france its in the south compared to bordeaux and bordeaux',
+    'paris is the capital of france its in the north compared to bordeaux and toulouse',
+]
+```
+
+## References
+
+- *[FlashText](https://github.com/vi3k6i5/flashtext)*
+- *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)*
+- *[PyO3](https://github.com/PyO3/pyo3)*
+- *[Maturin](https://github.com/PyO3/maturin)*
+
diff --git a/docs/bm25.png b/docs/bm25.png
new file mode 100644
index 0000000..0a6a4b3
Binary files /dev/null and b/docs/bm25.png differ
diff --git a/docs/count_vectorizer.png b/docs/count_vectorizer.png
new file mode 100644
index 0000000..3ce01e6
Binary files /dev/null and b/docs/count_vectorizer.png differ
diff --git a/docs/count_vectorizer_char.png b/docs/count_vectorizer_char.png
new file mode 100644
index 0000000..d3493b0
Binary files /dev/null and b/docs/count_vectorizer_char.png differ
diff --git a/docs/flashtext.png b/docs/flashtext.png
new file mode 100644
index 0000000..3b3867a
Binary files /dev/null and b/docs/flashtext.png differ
diff --git a/docs/logo.png b/docs/logo.png
new file mode 100644
index 0000000..d0a6dd7
Binary files /dev/null and b/docs/logo.png differ
diff --git a/docs/tfidf.png b/docs/tfidf.png
new file mode 100644
index 0000000..ca163bc
Binary files /dev/null and b/docs/tfidf.png differ
diff --git a/lenlp/__version__.py b/lenlp/__version__.py
index 30dbba5..3e26da2 100644
--- a/lenlp/__version__.py
+++ b/lenlp/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 2)
+VERSION = (1, 0, 3)
__version__ = ".".join(map(str, VERSION))
diff --git a/lenlp/counter/count.py b/lenlp/counter/count.py
index af886d8..1cfd565 100644
--- a/lenlp/counter/count.py
+++ b/lenlp/counter/count.py
@@ -46,7 +46,7 @@ def count(
>>> counter.count("Hello, world!", sort=True)
{'hello': 1, 'world': 1}
- >>> counter.count("Hello, world!", ngram_range=[2, 2], sort=True, normalize=False)
+ >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False)
{'Hello, world!': 1}
>>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True)
diff --git a/lenlp/flash/flash_text.py b/lenlp/flash/flash_text.py
index 02f409b..670f00d 100644
--- a/lenlp/flash/flash_text.py
+++ b/lenlp/flash/flash_text.py
@@ -23,8 +23,8 @@ class FlashText:
>>> flash_text = flash.FlashText(normalize=True)
>>> flash_text = flash_text.add(["hello", "world"])
- >>> flash_text.extract(["Hello, world!", "world", "hello"], span_info=True)
- [['hello:0:5'], [], []]
+ >>> flash_text.extract(["Hello, world!", "world", "hello"])
+ [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]]
"""
@@ -37,9 +37,9 @@ def add(self, x: str | list[str]) -> None:
self.flash.add_keywords_many(x)
return self
- def extract(self, x: str | list[str], span_info: bool = False) -> list[str]:
+ def extract(self, x: str | list[str]) -> list[str]:
"""Extract keywords from a sentence."""
is_string = isinstance(x, str)
x = [x] if isinstance(x, str) else x
- y = self.flash.extract_keywords_many(x, span_info=span_info)
+ y = self.flash.extract_keywords_many(x)
return y[0] if is_string else y
diff --git a/lenlp/sparse/__init__.py b/lenlp/sparse/__init__.py
index b17fadb..606a7f6 100644
--- a/lenlp/sparse/__init__.py
+++ b/lenlp/sparse/__init__.py
@@ -1,5 +1,5 @@
from .bm25_vectorizer import BM25Vectorizer
from .count_vectorizer import CountVectorizer
-from .tfidf_vectorizer import TfIdfVectorizer
+from .tfidf_vectorizer import TfidfVectorizer
-__all__ = ["BM25Vectorizer", "CountVectorizer", "TfIdfVectorizer"]
+__all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"]
diff --git a/lenlp/sparse/bm25_vectorizer.py b/lenlp/sparse/bm25_vectorizer.py
index ca131ae..f4af771 100644
--- a/lenlp/sparse/bm25_vectorizer.py
+++ b/lenlp/sparse/bm25_vectorizer.py
@@ -2,10 +2,10 @@
from scipy.sparse import csr_matrix
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
-from .tfidf_vectorizer import TfIdfVectorizer
+from .tfidf_vectorizer import TfidfVectorizer
-class BM25Vectorizer(TfIdfVectorizer):
+class BM25Vectorizer(TfidfVectorizer):
"""BM25Vectorizer is a class that converts a collection of text documents to a sparse
bm25 matrix.
diff --git a/lenlp/sparse/count_vectorizer.py b/lenlp/sparse/count_vectorizer.py
index 41c10f1..9067b63 100644
--- a/lenlp/sparse/count_vectorizer.py
+++ b/lenlp/sparse/count_vectorizer.py
@@ -1,4 +1,3 @@
-import numpy as np
from rslenlp import SparseMatrixBuilder
from scipy.sparse import csr_matrix
@@ -48,7 +47,7 @@ class CountVectorizer:
>>> matrix.toarray()
array([[1, 1, 0, 0, 0],
- [0, 0, 1, 1, 1]], dtype=int16)
+ [0, 0, 1, 1, 1]], dtype=uint64)
>>> len(count_vectorizer.vocabulary)
5
@@ -92,23 +91,25 @@ def transform(self, raw_documents: list[str]) -> csr_matrix:
"""Transform documents to document-term matrix."""
if not self.fitted:
raise ValueError("Call fit method before calling transform method.")
+
values, row_indices, column_indices = self.sparse_matrix.transform(
raw_documents
)
+
return csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
- dtype=np.int16,
)
def fit_transform(self, raw_documents: list[str]) -> csr_matrix:
"""Learn the vocabulary dictionary and return the CountVectorizer object."""
self.fitted = True
+
values, row_indices, column_indices = self.sparse_matrix.fit_transform(
raw_documents
)
+
return csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
- dtype=np.int16,
)
diff --git a/lenlp/sparse/tfidf_vectorizer.py b/lenlp/sparse/tfidf_vectorizer.py
index 8ca9abc..b45adbd 100644
--- a/lenlp/sparse/tfidf_vectorizer.py
+++ b/lenlp/sparse/tfidf_vectorizer.py
@@ -5,8 +5,8 @@
from .count_vectorizer import CountVectorizer
-class TfIdfVectorizer(CountVectorizer):
- """TfIdfVectorizer is a class that converts a collection of text documents to a sparse
+class TfidfVectorizer(CountVectorizer):
+ """TfidfVectorizer is a class that converts a collection of text documents to a sparse
tfidf matrix.
Parameters
@@ -32,7 +32,7 @@ class TfIdfVectorizer(CountVectorizer):
--------
>>> from lenlp import sparse
- >>> tfidf_vectorizer = sparse.TfIdfVectorizer(
+ >>> tfidf_vectorizer = sparse.TfidfVectorizer(
... analyzer="word",
... normalize=True,
... stop_words=None,
@@ -101,7 +101,7 @@ def transform(self, raw_documents: list[str]) -> csr_matrix:
matrix=csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
- dtype=np.float64,
+ dtype=np.float32,
)
)
@@ -114,7 +114,7 @@ def fit_transform(self, raw_documents: list[str]) -> csr_matrix:
matrix = csr_matrix(
arg1=(values, (row_indices, column_indices)),
shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
- dtype=np.float64,
+ dtype=np.float32,
)
self.update(matrix=matrix)
diff --git a/readme.md b/readme.md
deleted file mode 100644
index e69de29..0000000
diff --git a/rust/rsflashtext.rs b/rust/rsflashtext.rs
index 8c31432..7ac7987 100644
--- a/rust/rsflashtext.rs
+++ b/rust/rsflashtext.rs
@@ -111,48 +111,59 @@ impl RSKeywordProcessor {
pub fn extract_keywords_many(
&self,
sentences: Vec<String>,
- span_info: bool,
- ) -> Vec<Vec<String>> {
+ ) -> Vec<Vec<(String, usize, usize)>> {
sentences
.par_iter()
- .map(|sentence: &String| self.extract_keywords(&sentence, span_info.clone()))
+ .map(|sentence: &String| self.extract_keywords(&sentence))
.collect()
}
- pub fn extract_keywords(&self, sentence: &str, span_info: bool) -> Vec<String> {
- let sentence: String = if self.normalize {
- unidecode(sentence)
- .to_lowercase()
- .chars()
- .filter(|c| !c.is_ascii_punctuation())
- .collect::<String>()
- .trim()
- .to_string()
+ pub fn extract_keywords(&self, sentence: &str) -> Vec<(String, usize, usize)> {
+ // Map from the index in the normalized sentence to the index in the original sentence
+ let mut index_map: Vec<usize> = Vec::with_capacity(sentence.len());
+ let mut original_idx = 0;
+
+ let normalized_sentence: String = if self.normalize {
+ let mut normalized = String::new();
+ for c in sentence.chars() {
+ if c.is_ascii_punctuation() {
+ original_idx += c.len_utf8();
+ continue;
+ }
+ let normalized_char = unidecode::unidecode_char(c).to_lowercase();
+ for nc in normalized_char.chars() {
+ normalized.push(nc);
+ index_map.push(original_idx);
+ }
+ original_idx += c.len_utf8();
+ }
+ normalized.to_string()
} else if self.lowercase {
sentence.to_lowercase()
} else {
sentence.to_string()
};
- let mut extracted_keywords: Vec<String> = Vec::new();
+ let mut extracted_keywords: Vec<(String, usize, usize)> = Vec::new();
let mut current_node: &HashMap = &self.keyword_trie_dict;
let mut start_pos: usize = 0;
let mut end_pos: usize = 0;
let mut idx: usize = 0;
- let sentence_len: usize = sentence.len();
+ let sentence_len: usize = normalized_sentence.len();
while idx < sentence_len {
- let char: char = sentence.chars().nth(idx).unwrap();
+ let char: char = normalized_sentence.chars().nth(idx).unwrap();
if !self.non_word_boundaries.contains(&char) {
if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) {
if node.is_end {
let clean_name: &String = node.clean_name.as_ref().unwrap();
- if span_info {
- extracted_keywords
- .push(format!("{}:{}:{}", clean_name, start_pos, end_pos));
- } else {
- extracted_keywords.push(clean_name.clone());
- }
+ let original_start_pos = index_map[start_pos];
+ let original_end_pos = index_map[end_pos - 1] + 1;
+ extracted_keywords.push((
+ clean_name.clone(),
+ original_start_pos,
+ original_end_pos,
+ ));
}
}
current_node = &self.keyword_trie_dict;
@@ -167,6 +178,16 @@ impl RSKeywordProcessor {
idx += 1;
}
+ // Check if the last segment is a keyword
+ if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) {
+ if node.is_end {
+ let clean_name: &String = node.clean_name.as_ref().unwrap();
+ let original_start_pos = index_map[start_pos];
+ let original_end_pos = index_map[end_pos - 1] + 1;
+ extracted_keywords.push((clean_name.clone(), original_start_pos, original_end_pos));
+ }
+ }
+
extracted_keywords
}
}
diff --git a/rust/rssparse.rs b/rust/rssparse.rs
index e83b523..0350f61 100644
--- a/rust/rssparse.rs
+++ b/rust/rssparse.rs
@@ -1,11 +1,11 @@
+use crate::rsvectorizer::rsvectorize_many;
use bincode::{deserialize, serialize};
+use numpy::PyArray1;
use pyo3::prelude::*;
use pyo3::types::PyBytes;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
-use crate::rsvectorizer::rsvectorize_many;
-
// In order to properly pickle, we need to map to the Python module.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[pyclass(module = "lenlp.sparse.count_vectorizer")]
@@ -29,15 +29,23 @@ impl SparseMatrixBuilder {
) -> Self {
SparseMatrixBuilder {
vocab: HashMap::new(),
- n_sizes: n_sizes,
- analyzer: analyzer,
- stop_words: stop_words,
- normalize: normalize,
+ n_sizes,
+ analyzer,
+ stop_words,
+ normalize,
num_cols: 0,
}
}
- pub fn fit_transform(&mut self, texts: Vec<String>) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
+ pub fn fit_transform(
+ &mut self,
+ texts: Vec<String>,
+ py: Python,
+ ) -> (
+ Py<PyArray1<usize>>,
+ Py<PyArray1<usize>>,
+ Py<PyArray1<usize>>,
+ ) {
self.vocab = HashMap::new();
let texts: Vec> = rsvectorize_many(
texts,
@@ -46,8 +54,16 @@ impl SparseMatrixBuilder {
self.stop_words.clone(),
self.normalize,
);
+
self._fit(texts.clone());
- self._transform(texts)
+
+ // Scipy csr_matrix are faster to build from numpy arrays.
+ let (vec1, vec2, vec3) = self._transform(texts);
+ (
+ PyArray1::from_vec_bound(py, vec1).into(),
+ PyArray1::from_vec_bound(py, vec2).into(),
+ PyArray1::from_vec_bound(py, vec3).into(),
+ )
}
pub fn fit(&mut self, texts: Vec<String>) {
@@ -76,7 +92,15 @@ impl SparseMatrixBuilder {
self.num_cols = col_index;
}
- pub fn transform(&self, texts: Vec<String>) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
+ pub fn transform(
+ &self,
+ texts: Vec,
+ py: Python,
+ ) -> (
+ Py<PyArray1<usize>>,
+ Py<PyArray1<usize>>,
+ Py<PyArray1<usize>>,
+ ) {
let texts: Vec> = rsvectorize_many(
texts,
self.n_sizes.clone(),
@@ -85,10 +109,16 @@ impl SparseMatrixBuilder {
self.normalize,
);
- return self._transform(texts);
+ // Scipy csr_matrix are faster to build from numpy arrays.
+ let (vec1, vec2, vec3) = self._transform(texts);
+ (
+ PyArray1::from_vec_bound(py, vec1).into(),
+ PyArray1::from_vec_bound(py, vec2).into(),
+ PyArray1::from_vec_bound(py, vec3).into(),
+ )
}
- pub fn _transform(
+ fn _transform(
&self,
texts: Vec>,
) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
diff --git a/setup.py b/setup.py
index c929aee..cd0900a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,9 @@
from lenlp.__version__ import __version__
+with open(file="README.md", mode="r", encoding="utf-8") as fh:
+ long_description = fh.read()
+
base_packages = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"]
dev = ["maturin >= 1.5.1", "pytest-cov >= 5.0.0", "pytest >= 7.4.4", "ruff >= 0.1.15"]
@@ -11,8 +14,9 @@
name="lenlp",
version=f"{__version__}",
author="Raphael Sourty",
- author_email="raphael.sourty@gmail.com",
+ long_description=long_description,
long_description_content_type="text/markdown",
+ author_email="raphael.sourty@gmail.com",
url="https://github.com/raphaelsty/lenlp",
download_url="https://github.com/raphaelsty/lenlp/archive/v_01.tar.gz",
keywords=[],