diff --git a/src/data/tokenizers/mod.rs b/src/data/tokenizers/mod.rs
new file mode 100644
index 00000000..84ac7ef4
--- /dev/null
+++ b/src/data/tokenizers/mod.rs
@@ -0,0 +1,19 @@
+//! The Tokenizer module
+//!
+//! This module contains traits used to tokenize text.
+//! It also reexports these `Tokenizer`s from child modules.
+//!
+//! The `Tokenizer`s are intended to be used with the text
+//! `Vectorizer`
+
+mod naive;
+
+pub use self::naive::NaiveTokenizer;
+
+use learning::LearningResult;
+
+/// A trait used to construct Tokenizers
+pub trait Tokenizer {
+    /// Tokenize the input string into a list of tokens
+    fn tokenize(&mut self, input: &str) -> LearningResult<Vec<String>>;
+}
diff --git a/src/data/tokenizers/naive.rs b/src/data/tokenizers/naive.rs
new file mode 100644
index 00000000..bf05bfa7
--- /dev/null
+++ b/src/data/tokenizers/naive.rs
@@ -0,0 +1,27 @@
+use super::Tokenizer;
+use learning::LearningResult;
+
+/// NaiveTokenizer tokenizer
+///
+/// This provides an implementation of `Tokenizer`
+/// which allows us to tokenize a string by splitting
+/// it on whitespace and lowercasing all the characters
+#[derive(Debug)]
+pub struct NaiveTokenizer;
+
+impl Tokenizer for NaiveTokenizer {
+    fn tokenize(&mut self, input: &str) -> LearningResult<Vec<String>> {
+        let tokens = input
+            .split_whitespace()
+            .map(|token| token.to_lowercase())
+            .collect::<Vec<String>>();
+        Ok(tokens)
+    }
+}
+
+impl NaiveTokenizer {
+    /// Constructs a new `NaiveTokenizer` tokenizer.
+    pub fn new() -> Self {
+        NaiveTokenizer
+    }
+}
diff --git a/src/data/vectorizers/mod.rs b/src/data/vectorizers/mod.rs
new file mode 100644
index 00000000..8bebaac8
--- /dev/null
+++ b/src/data/vectorizers/mod.rs
@@ -0,0 +1,19 @@
+//! The Vectorizers module
+//!
+//! This module contains traits used to vectorize data using common
+//! techniques. It also reexports these `Vectorizer`s from child modules.
+//!
+//! The `Vectorizer` trait provides a shared interface for all of the
+//! data vectorizations in rusty-machine.
+
+pub mod text;
+
+use learning::LearningResult;
+
+/// A trait used to construct Vectorizers
+///
+/// `U` is the input type to vectorize; `T` is the vectorized output type.
+pub trait Vectorizer<U, T> {
+    /// Fit the inputs to create the `Vectorizer`
+    fn fit(&mut self, inputs: &[U]) -> LearningResult<()>;
+    /// Vectorize the inputs
+    fn vectorize(&mut self, inputs: &[U]) -> LearningResult<T>;
+}
diff --git a/src/data/vectorizers/text.rs b/src/data/vectorizers/text.rs
new file mode 100644
index 00000000..3a7a75de
--- /dev/null
+++ b/src/data/vectorizers/text.rs
@@ -0,0 +1,89 @@
+//! Text Vectorizers
+//!
+//! This module contains some text vectorizers.
+//!
+//! The `FreqVectorizer` vectorizer is used to vectorize vectors of
+//! strings using a traditional Bag of Words where each string
+//! is split on whitespace and each word is transformed into
+//! lowercase
+//!
+//! # Examples
+//!
+//! ```
+//! use rusty_machine::data::vectorizers::Vectorizer;
+//! use rusty_machine::data::vectorizers::text::FreqVectorizer;
+//! use rusty_machine::data::tokenizers::NaiveTokenizer;
+//!
+//! // Constructs an empty `FreqVectorizer` vectorizer
+//! let mut freq_vectorizer = FreqVectorizer::<f64, NaiveTokenizer>::new(NaiveTokenizer::new());
+//!
+//! let inputs = vec!["This is test"];
+//!
+//! // Fit the vectorizer
+//! freq_vectorizer.fit(&inputs).unwrap();
+//!
+//! // Transform the inputs
+//! let vectorized = freq_vectorizer.vectorize(&inputs).unwrap();
+//! ```
+
+use libnum::Float;
+use std::collections::HashMap;
+
+use std::marker::PhantomData;
+use linalg::Matrix;
+use learning::LearningResult;
+use super::Vectorizer;
+use super::super::tokenizers::Tokenizer;
+
+/// FreqVectorizer vectorizer for text
+///
+/// This provides an implementation of `Vectorizer`
+/// which allows us to vectorize strings by doing a non-normalized frequency count
+///
+/// See the module description for more information.
+#[derive(Debug)]
+pub struct FreqVectorizer<U: Float, T: Tokenizer> {
+    // Vocabulary learned by `fit`: token -> column index in the output matrix
+    words: HashMap<String, usize>,
+    tokenizer: T,
+    // Marker so the struct can be generic over the output float type `U`
+    float_type: PhantomData<U>,
+}
+
+impl<'a, U: Float, T: Tokenizer> Vectorizer<&'a str, Matrix<U>> for FreqVectorizer<U, T> {
+    fn vectorize(&mut self, texts: &[&'a str]) -> LearningResult<Matrix<U>> {
+        // One row per input text, one column per known token; unknown tokens are ignored
+        let mut result = Matrix::zeros(texts.len(), self.words.len());
+        for (text, row) in texts.iter().zip(0..texts.len()) {
+            let tokens = self.tokenizer.tokenize(text).unwrap();
+            for token in tokens {
+                if let Some(&col) = self.words.get(&token) {
+                    result[[row, col]] = result[[row, col]] + U::one();
+                }
+            }
+        }
+        Ok(result)
+    }
+    fn fit(&mut self, texts: &[&str]) -> LearningResult<()> {
+        // Rebuild the vocabulary from scratch, assigning indices in first-seen order
+        self.words = HashMap::new();
+        let mut index: usize = 0;
+        for text in texts {
+            let tokens = self.tokenizer.tokenize(text).unwrap();
+            for token in tokens {
+                if !self.words.contains_key(&token) {
+                    self.words.insert(token, index);
+                    index += 1;
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<U: Float, T: Tokenizer> FreqVectorizer<U, T> {
+    /// Constructs an empty `FreqVectorizer` vectorizer.
+    pub fn new(tokenizer: T) -> Self {
+        FreqVectorizer {
+            words: HashMap::new(),
+            tokenizer: tokenizer,
+            float_type: PhantomData,
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 2c800f09..7d570604 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -124,6 +124,8 @@ pub mod linalg {
 /// Module for data handling
 pub mod data {
     pub mod transforms;
+    pub mod vectorizers;
+    pub mod tokenizers;
 }
 
 /// Module for machine learning.
diff --git a/tests/data/vectorizers/text.rs b/tests/data/vectorizers/text.rs
new file mode 100644
index 00000000..dcf20be2
--- /dev/null
+++ b/tests/data/vectorizers/text.rs
@@ -0,0 +1,17 @@
+use rm::data::vectorizers::text::FreqVectorizer;
+use rm::data::tokenizers::NaiveTokenizer;
+use rm::data::vectorizers::Vectorizer;
+use rm::prelude::Matrix;
+
+#[test]
+fn test_frequency_vectorizer() {
+    let mut freq_vectorizer = FreqVectorizer::<f64, NaiveTokenizer>::new(NaiveTokenizer::new());
+    let fit_inputs = vec!["This is fit"];
+    freq_vectorizer.fit(&fit_inputs).unwrap();
+
+    let inputs = vec!["this is vectorize",
+                      "this is not fit"];
+    let vectorized = freq_vectorizer.vectorize(&inputs).unwrap();
+    let expected = Matrix::new(2, 3, vec![1., 1., 0., 1., 1., 1.]);
+    assert_eq!(vectorized, expected);
+}
diff --git a/tests/lib.rs b/tests/lib.rs
index 11309cd6..2c6db5e8 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -10,8 +10,14 @@ pub mod learning {
     mod gp;
 
     pub mod optim {
-    mod grad_desc;
+        mod grad_desc;
     }
 }
 
-pub mod datasets;
\ No newline at end of file
+pub mod data {
+    pub mod vectorizers {
+        mod text;
+    }
+}
+
+pub mod datasets;