diff --git a/Cargo.toml b/Cargo.toml index 296aa98..281ee96 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ pyo3 = { version = "0.21.2", features = ["extension-module", "generate-import-li serde = { version = "1.0.202", features = ["derive"] } serde_json = { version = "1.0.117" } bincode = "1.3.3" +ndarray = "0.15" +numpy = "0.21" [profile.dev] opt-level = 0 diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d15d4d --- /dev/null +++ b/README.md @@ -0,0 +1,273 @@ +
+# LeNLP
+
+Natural Language Processing toolbox for Python with Rust
+
+*(logo and license badge)*
+
+LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration.
+
+## Installation
+
+We can install LeNLP using:
+
+```
+pip install lenlp
+```
+
+## Sections
+
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Sparse Module](#sparse-module)
+    - [CountVectorizer](#countvectorizer)
+    - [TfidfVectorizer](#tfidfvectorizer)
+    - [BM25Vectorizer](#bm25vectorizer)
+- [FlashText](#flashtext)
+- [Extras](#extras)
+    - [Counter](#counter)
+    - [Normalizer](#normalizer)
+
+## Quick Start
+
+### Sparse Module
+
+The `sparse` module offers a variety of vectorizers and transformers for text data. The vectorizers produce `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed, and can be used as drop-in replacements for `scikit-learn` vectorizers.
+
+#### CountVectorizer
+
+The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.CountVectorizer(
+    ngram_range=(3, 5), # range of n-grams
+    analyzer="char_wb", # word, char, char_wb
+    normalize=True, # lowercase and strip accents
+    stop_words=["based"], # list of stop words
+)
+```
+
+You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts:
+
+```python
+X = [
+    "Hello World",
+    "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
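Beyond building the matrix, the fitted vectorizer can be inspected directly. A minimal sketch, assuming the `vocabulary` attribute and the dense `toarray()` view shown in the `CountVectorizer` docstring, with a two-document toy corpus:

```python
from lenlp import sparse

X = [
    "Hello World",
    "Rust based vectorizer",
]

vectorizer = sparse.CountVectorizer(analyzer="word", normalize=True)
matrix = vectorizer.fit_transform(X)

# The result is a scipy.sparse.csr_matrix; densify only for small examples.
print(matrix.toarray())

# Number of features learned during fit.
print(len(vectorizer.vocabulary))
```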

+Benchmark:
+
+LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer.
+
+#### TfidfVectorizer
+
+The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.TfidfVectorizer(
+    ngram_range=(3, 5), # Range of n-grams
+    analyzer="char_wb", # Options: word, char, char_wb
+    normalize=True, # Lowercase and strip accents
+    stop_words=["based"] # List of stop words
+)
+```
+
+Fit the vectorizer and transform texts:
+
+```python
+X = [
+    "Hello World",
+    "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
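The tf-idf matrix plugs directly into sparse-aware tooling. As a sketch, pairwise document similarity can be computed with `scikit-learn` (already a LeNLP dependency); the toy corpus is made up for illustration:

```python
from lenlp import sparse
from sklearn.metrics.pairwise import cosine_similarity

X = [
    "Hello World",
    "Rust based vectorizer",
    "Hello world, a vectorizer written in Rust",
]

vectorizer = sparse.TfidfVectorizer(analyzer="word", normalize=True)
matrix = vectorizer.fit_transform(X)

# cosine_similarity accepts csr_matrix directly, no densification needed.
print(cosine_similarity(matrix))
```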

+Benchmark:
+
+LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer.
+
+#### BM25Vectorizer
+
+The `BM25Vectorizer` converts texts into a sparse matrix of BM25 weights, which typically rank documents more effectively than tf-idf or raw count weights.
+
+```python
+from lenlp import sparse
+
+vectorizer = sparse.BM25Vectorizer(
+    ngram_range=(3, 5), # Range of n-grams
+    analyzer="char_wb", # Options: word, char, char_wb
+    normalize=True, # Lowercase and strip accents
+    stop_words=["based"] # List of stop words
+)
+```
+
+Fit the vectorizer and transform texts:
+
+```python
+X = [
+    "Hello World",
+    "Rust based vectorizer"
+]
+
+matrix = vectorizer.fit_transform(X)
+```
+
+Or use separate calls:
+
+```python
+vectorizer.fit(X)
+matrix = vectorizer.transform(X)
+```
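Since BM25 weights are meant for ranking, a natural use is lightweight retrieval. The sketch below indexes a made-up corpus and scores a query with a sparse dot product; only the `fit_transform` and `transform` calls shown above are assumed:

```python
from lenlp import sparse

documents = [
    "Paris is the capital of France",
    "Toulouse is a city in the south of France",
    "Bordeaux is famous for its vineyards",
]

vectorizer = sparse.BM25Vectorizer(analyzer="word", normalize=True)
index = vectorizer.fit_transform(documents)

# Queries are transformed with the vocabulary learned on the documents.
query = vectorizer.transform(["city in the south of france"])

# Sparse dot product between every indexed document and the query.
scores = (index @ query.T).toarray().ravel()
print(scores.argsort()[::-1])  # document indices, best match first
```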

+Benchmark:
+
+LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer. Sklearn does not provide a BM25 counterpart.
+
+### FlashText
+
+The `flash` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm as described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*.
+
+```python
+from lenlp import flash
+
+flash_text = flash.FlashText(
+    normalize=True # remove accents and lowercase
+)
+
+# Add keywords we want to retrieve:
+flash_text.add(["paris", "bordeaux", "toulouse"])
+```
+
+Extract keywords and their positions from sentences:
+
+```python
+sentences = [
+    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+flash_text.extract(sentences)
+```
+
+Output:
+
+```python
+[[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)],
+ [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]]
+```
+
+The FlashText algorithm is highly efficient, significantly faster than regular expressions for keyword extraction. With `normalize=True`, LeNLP's implementation removes accents and lowercases the input to improve keyword matching.
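`extract` also accepts a single string, in which case the spans for that one sentence are returned directly. A short sketch, with a made-up sentence, that tallies keyword occurrences from the `(keyword, start, end)` tuples:

```python
from collections import Counter

from lenlp import flash

flash_text = flash.FlashText(normalize=True)
flash_text.add(["paris", "bordeaux", "toulouse"])

# A single string is accepted as well as a list of strings.
spans = flash_text.extract("Bordeaux, Toulouse and Bordeaux again, but never Lyon.")

# Count how often each keyword appears in the sentence.
print(Counter(keyword for keyword, _, _ in spans))
```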

+Benchmark:
+
+LeNLP FlashText is benchmarked against the official implementation of [FlashText](https://github.com/vi3k6i5/flashtext).
+
+### Extras
+
+#### Counter
+
+The `counter` module converts a list of texts into dictionaries of token counts, one per text.
+
+```python
+from lenlp import counter
+
+sentences = [
+    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+counter.count(
+    sentences,
+    ngram_range=(1, 1), # Range of n-grams
+    analyzer="word", # Options: word, char, char_wb
+    normalize=True, # Lowercase and strip accents
+    stop_words=["its", "in", "is", "of", "the", "and", "to", "a"] # List of stop words
+)
+```
+
+Output:
+
+```python
+[{'compared': 1,
+  'south': 1,
+  'city': 1,
+  'toulouse': 1,
+  'bordeaux': 2,
+  'france': 1},
+ {'toulouse': 1,
+  'france': 1,
+  'capital': 1,
+  'paris': 1,
+  'north': 1,
+  'compared': 1,
+  'bordeaux': 1}]
+```
+
+#### Normalizer
+
+The `normalizer` module normalizes a list of texts by removing accents and converting to lowercase.
+
+```python
+from lenlp import normalizer
+
+sentences = [
+    "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
+    "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
+]
+
+normalizer.normalize(sentences)
+```
+
+Output:
+
+```python
+[
+    'toulouse is a city in france its in the south compared to bordeaux and bordeaux',
+    'paris is the capital of france its in the north compared to bordeaux and toulouse',
+]
+```
+
+## References
+
+- *[FlashText](https://github.com/vi3k6i5/flashtext)*
+- *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)*
+- *[PyO3](https://github.com/PyO3/pyo3)*
+- *[Maturin](https://github.com/PyO3/maturin)*
+
diff --git a/docs/bm25.png b/docs/bm25.png
new file mode 100644
index 0000000..0a6a4b3
Binary files /dev/null and b/docs/bm25.png differ
diff --git a/docs/count_vectorizer.png b/docs/count_vectorizer.png
new file mode 100644
index 0000000..3ce01e6
Binary files /dev/null and b/docs/count_vectorizer.png differ
diff --git a/docs/count_vectorizer_char.png b/docs/count_vectorizer_char.png
new file mode 100644
index 0000000..d3493b0
Binary files /dev/null and b/docs/count_vectorizer_char.png differ
diff --git a/docs/flashtext.png b/docs/flashtext.png
new file mode 100644
index 0000000..3b3867a
Binary files /dev/null and b/docs/flashtext.png differ
diff --git a/docs/logo.png b/docs/logo.png
new file mode 100644
index 0000000..d0a6dd7
Binary files /dev/null and b/docs/logo.png differ
diff --git a/docs/tfidf.png b/docs/tfidf.png
new file mode 100644
index 0000000..ca163bc
Binary files /dev/null and b/docs/tfidf.png differ
diff --git a/lenlp/__version__.py b/lenlp/__version__.py
index 30dbba5..3e26da2 100644
--- a/lenlp/__version__.py
+++ b/lenlp/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 2)
+VERSION = (1, 0, 3)
 
 __version__ = ".".join(map(str, VERSION))
diff --git a/lenlp/counter/count.py b/lenlp/counter/count.py
index af886d8..1cfd565 100644
--- a/lenlp/counter/count.py
+++ b/lenlp/counter/count.py
@@ -46,7 +46,7 @@ def count(
     >>> counter.count("Hello, world!", sort=True)
     {'hello': 1, 'world': 1}
 
-    >>> counter.count("Hello, world!", ngram_range=[2, 2], sort=True, normalize=False)
+    >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False)
     {'Hello, world!': 1}
 
    >>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True)
diff --git 
a/lenlp/flash/flash_text.py b/lenlp/flash/flash_text.py index 02f409b..670f00d 100644 --- a/lenlp/flash/flash_text.py +++ b/lenlp/flash/flash_text.py @@ -23,8 +23,8 @@ class FlashText: >>> flash_text = flash.FlashText(normalize=True) >>> flash_text = flash_text.add(["hello", "world"]) - >>> flash_text.extract(["Hello, world!", "world", "hello"], span_info=True) - [['hello:0:5'], [], []] + >>> flash_text.extract(["Hello, world!", "world", "hello"]) + [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]] """ @@ -37,9 +37,9 @@ def add(self, x: str | list[str]) -> None: self.flash.add_keywords_many(x) return self - def extract(self, x: str | list[str], span_info: bool = False) -> list[str]: + def extract(self, x: str | list[str]) -> list[str]: """Extract keywords from a sentence.""" is_string = isinstance(x, str) x = [x] if isinstance(x, str) else x - y = self.flash.extract_keywords_many(x, span_info=span_info) + y = self.flash.extract_keywords_many(x) return y[0] if is_string else y diff --git a/lenlp/sparse/__init__.py b/lenlp/sparse/__init__.py index b17fadb..606a7f6 100644 --- a/lenlp/sparse/__init__.py +++ b/lenlp/sparse/__init__.py @@ -1,5 +1,5 @@ from .bm25_vectorizer import BM25Vectorizer from .count_vectorizer import CountVectorizer -from .tfidf_vectorizer import TfIdfVectorizer +from .tfidf_vectorizer import TfidfVectorizer -__all__ = ["BM25Vectorizer", "CountVectorizer", "TfIdfVectorizer"] +__all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"] diff --git a/lenlp/sparse/bm25_vectorizer.py b/lenlp/sparse/bm25_vectorizer.py index ca131ae..f4af771 100644 --- a/lenlp/sparse/bm25_vectorizer.py +++ b/lenlp/sparse/bm25_vectorizer.py @@ -2,10 +2,10 @@ from scipy.sparse import csr_matrix from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 -from .tfidf_vectorizer import TfIdfVectorizer +from .tfidf_vectorizer import TfidfVectorizer -class BM25Vectorizer(TfIdfVectorizer): +class BM25Vectorizer(TfidfVectorizer): """BM25Vectorizer is a class that converts a collection of text documents to a sparse bm25 matrix. 
diff --git a/lenlp/sparse/count_vectorizer.py b/lenlp/sparse/count_vectorizer.py index 41c10f1..9067b63 100644 --- a/lenlp/sparse/count_vectorizer.py +++ b/lenlp/sparse/count_vectorizer.py @@ -1,4 +1,3 @@ -import numpy as np from rslenlp import SparseMatrixBuilder from scipy.sparse import csr_matrix @@ -48,7 +47,7 @@ class CountVectorizer: >>> matrix.toarray() array([[1, 1, 0, 0, 0], - [0, 0, 1, 1, 1]], dtype=int16) + [0, 0, 1, 1, 1]], dtype=uint64) >>> len(count_vectorizer.vocabulary) 5 @@ -92,23 +91,25 @@ def transform(self, raw_documents: list[str]) -> csr_matrix: """Transform documents to document-term matrix.""" if not self.fitted: raise ValueError("Call fit method before calling transform method.") + values, row_indices, column_indices = self.sparse_matrix.transform( raw_documents ) + return csr_matrix( arg1=(values, (row_indices, column_indices)), shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), - dtype=np.int16, ) def fit_transform(self, raw_documents: list[str]) -> csr_matrix: """Learn the vocabulary dictionary and return the CountVectorizer object.""" self.fitted = True + values, row_indices, column_indices = self.sparse_matrix.fit_transform( raw_documents ) + return csr_matrix( arg1=(values, (row_indices, column_indices)), shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), - dtype=np.int16, ) diff --git a/lenlp/sparse/tfidf_vectorizer.py b/lenlp/sparse/tfidf_vectorizer.py index 8ca9abc..b45adbd 100644 --- a/lenlp/sparse/tfidf_vectorizer.py +++ b/lenlp/sparse/tfidf_vectorizer.py @@ -5,8 +5,8 @@ from .count_vectorizer import CountVectorizer -class TfIdfVectorizer(CountVectorizer): - """TfIdfVectorizer is a class that converts a collection of text documents to a sparse +class TfidfVectorizer(CountVectorizer): + """TfidfVectorizer is a class that converts a collection of text documents to a sparse tfidf matrix. Parameters @@ -32,7 +32,7 @@ class TfIdfVectorizer(CountVectorizer): -------- >>> from lenlp import sparse - >>> tfidf_vectorizer = sparse.TfIdfVectorizer( + >>> tfidf_vectorizer = sparse.TfidfVectorizer( ... analyzer="word", ... normalize=True, ... 
stop_words=None, @@ -101,7 +101,7 @@ def transform(self, raw_documents: list[str]) -> csr_matrix: matrix=csr_matrix( arg1=(values, (row_indices, column_indices)), shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), - dtype=np.float64, + dtype=np.float32, ) ) @@ -114,7 +114,7 @@ def fit_transform(self, raw_documents: list[str]) -> csr_matrix: matrix = csr_matrix( arg1=(values, (row_indices, column_indices)), shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), - dtype=np.float64, + dtype=np.float32, ) self.update(matrix=matrix) diff --git a/readme.md b/readme.md deleted file mode 100644 index e69de29..0000000 diff --git a/rust/rsflashtext.rs b/rust/rsflashtext.rs index 8c31432..7ac7987 100644 --- a/rust/rsflashtext.rs +++ b/rust/rsflashtext.rs @@ -111,48 +111,59 @@ impl RSKeywordProcessor { pub fn extract_keywords_many( &self, sentences: Vec, - span_info: bool, - ) -> Vec> { + ) -> Vec> { sentences .par_iter() - .map(|sentence: &String| self.extract_keywords(&sentence, span_info.clone())) + .map(|sentence: &String| self.extract_keywords(&sentence)) .collect() } - pub fn extract_keywords(&self, sentence: &str, span_info: bool) -> Vec { - let sentence: String = if self.normalize { - unidecode(sentence) - .to_lowercase() - .chars() - .filter(|c| !c.is_ascii_punctuation()) - .collect::() - .trim() - .to_string() + pub fn extract_keywords(&self, sentence: &str) -> Vec<(String, usize, usize)> { + // Map from the index in the normalized sentence to the index in the original sentence + let mut index_map: Vec = Vec::with_capacity(sentence.len()); + let mut original_idx = 0; + + let normalized_sentence: String = if self.normalize { + let mut normalized = String::new(); + for c in sentence.chars() { + if c.is_ascii_punctuation() { + original_idx += c.len_utf8(); + continue; + } + let normalized_char = unidecode::unidecode_char(c).to_lowercase(); + for nc in normalized_char.chars() { + normalized.push(nc); + index_map.push(original_idx); + } + original_idx += c.len_utf8(); + } + normalized.to_string() } else if self.lowercase { sentence.to_lowercase() } else { sentence.to_string() }; - let mut extracted_keywords: Vec = Vec::new(); + let mut extracted_keywords: Vec<(String, usize, usize)> = Vec::new(); let mut current_node: &HashMap = &self.keyword_trie_dict; let mut start_pos: usize = 0; let mut end_pos: usize = 0; let mut idx: usize = 0; - let sentence_len: usize = sentence.len(); + let sentence_len: usize = normalized_sentence.len(); while idx < sentence_len { - let char: char = sentence.chars().nth(idx).unwrap(); + let char: char = normalized_sentence.chars().nth(idx).unwrap(); if !self.non_word_boundaries.contains(&char) { if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { if node.is_end { let clean_name: &String = node.clean_name.as_ref().unwrap(); - if span_info { - extracted_keywords - .push(format!("{}:{}:{}", clean_name, start_pos, end_pos)); - } else { - extracted_keywords.push(clean_name.clone()); - } + let original_start_pos = index_map[start_pos]; + let original_end_pos = index_map[end_pos - 1] + 1; + extracted_keywords.push(( + clean_name.clone(), + original_start_pos, + original_end_pos, + )); } } current_node = &self.keyword_trie_dict; @@ -167,6 +178,16 @@ impl RSKeywordProcessor { idx += 1; } + // Check if the last segment is a keyword + if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { + if node.is_end { + let clean_name: &String = node.clean_name.as_ref().unwrap(); + let original_start_pos = 
index_map[start_pos]; + let original_end_pos = index_map[end_pos - 1] + 1; + extracted_keywords.push((clean_name.clone(), original_start_pos, original_end_pos)); + } + } + extracted_keywords } } diff --git a/rust/rssparse.rs b/rust/rssparse.rs index e83b523..0350f61 100644 --- a/rust/rssparse.rs +++ b/rust/rssparse.rs @@ -1,11 +1,11 @@ +use crate::rsvectorizer::rsvectorize_many; use bincode::{deserialize, serialize}; +use numpy::PyArray1; use pyo3::prelude::*; use pyo3::types::PyBytes; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use crate::rsvectorizer::rsvectorize_many; - // In order to properly pickle, we need to map to the Python module. #[derive(Clone, Debug, Serialize, Deserialize)] #[pyclass(module = "lenlp.sparse.count_vectorizer")] @@ -29,15 +29,23 @@ impl SparseMatrixBuilder { ) -> Self { SparseMatrixBuilder { vocab: HashMap::new(), - n_sizes: n_sizes, - analyzer: analyzer, - stop_words: stop_words, - normalize: normalize, + n_sizes, + analyzer, + stop_words, + normalize, num_cols: 0, } } - pub fn fit_transform(&mut self, texts: Vec) -> (Vec, Vec, Vec) { + pub fn fit_transform( + &mut self, + texts: Vec, + py: Python, + ) -> ( + Py>, + Py>, + Py>, + ) { self.vocab = HashMap::new(); let texts: Vec> = rsvectorize_many( texts, @@ -46,8 +54,16 @@ impl SparseMatrixBuilder { self.stop_words.clone(), self.normalize, ); + self._fit(texts.clone()); - self._transform(texts) + + // Scipy csr_matrix are faster to build from numpy arrays. + let (vec1, vec2, vec3) = self._transform(texts); + ( + PyArray1::from_vec_bound(py, vec1).into(), + PyArray1::from_vec_bound(py, vec2).into(), + PyArray1::from_vec_bound(py, vec3).into(), + ) } pub fn fit(&mut self, texts: Vec) { @@ -76,7 +92,15 @@ impl SparseMatrixBuilder { self.num_cols = col_index; } - pub fn transform(&self, texts: Vec) -> (Vec, Vec, Vec) { + pub fn transform( + &self, + texts: Vec, + py: Python, + ) -> ( + Py>, + Py>, + Py>, + ) { let texts: Vec> = rsvectorize_many( texts, self.n_sizes.clone(), @@ -85,10 +109,16 @@ impl SparseMatrixBuilder { self.normalize, ); - return self._transform(texts); + // Scipy csr_matrix are faster to build from numpy arrays. + let (vec1, vec2, vec3) = self._transform(texts); + ( + PyArray1::from_vec_bound(py, vec1).into(), + PyArray1::from_vec_bound(py, vec2).into(), + PyArray1::from_vec_bound(py, vec3).into(), + ) } - pub fn _transform( + fn _transform( &self, texts: Vec>, ) -> (Vec, Vec, Vec) { diff --git a/setup.py b/setup.py index c929aee..cd0900a 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,9 @@ from lenlp.__version__ import __version__ +with open(file="README.md", mode="r", encoding="utf-8") as fh: + long_description = fh.read() + base_packages = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"] dev = ["maturin >= 1.5.1", "pytest-cov >= 5.0.0", "pytest >= 7.4.4", "ruff >= 0.1.15"] @@ -11,8 +14,9 @@ name="lenlp", version=f"{__version__}", author="Raphael Sourty", - author_email="raphael.sourty@gmail.com", + long_description=long_description, long_description_content_type="text/markdown", + author_email="raphael.sourty@gmail.com", url="https://github.com/raphaelsty/lenlp", download_url="https://github.com/raphaelsty/lenlp/archive/v_01.tar.gz", keywords=[],