Skip to content
This repository has been archived by the owner on Jul 16, 2021. It is now read-only.

Adding text vectorization #178

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/data/tokenizers/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//! The Tokenizer module
//!
//! This module contains traits used to tokenize text.
//! It also reexports these `Tokenizer`s from child modules.
//!
//! The `Tokenizer`s are intended to be used with the text
//! `Vectorizer`

mod naive;

pub use self::naive::NaiveTokenizer;

use learning::LearningResult;

/// A trait for tokenizers that split raw text into tokens.
///
/// Takes `&mut self` so an implementor may keep internal state
/// between calls (e.g. a learned vocabulary).
pub trait Tokenizer {
    /// Tokenize the input string, returning the tokens in order of
    /// appearance, or an error if tokenization fails.
    fn tokenize(&mut self, input: &str) -> LearningResult<Vec<String>>;
}
27 changes: 27 additions & 0 deletions src/data/tokenizers/naive.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
use super::Tokenizer;
use learning::LearningResult;

/// NaiveTokenizer tokenizer
///
/// This provides an implementation of `Tokenizer`
/// which allows us to tokenize a string by splitting
/// it on whitespace and lowercasing all the characters
#[derive(Debug)]
pub struct NaiveTokenizer;

impl Tokenizer for NaiveTokenizer {
    /// Splits `input` on whitespace and lowercases each resulting token.
    fn tokenize(&mut self, input: &str) -> LearningResult<Vec<String>> {
        let mut tokens = Vec::new();
        for word in input.split_whitespace() {
            tokens.push(word.to_lowercase());
        }
        Ok(tokens)
    }
}

impl NaiveTokenizer {
/// Constructs a new `NaiveTokenizer` tokenizer.
pub fn new() -> Self {
NaiveTokenizer
}
}
19 changes: 19 additions & 0 deletions src/data/vectorizers/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//! The Vectorizers module
//!
//! This module contains traits used to vectorize data using common
//! techniques. It also reexports these `Vectorizer`s from child modules.
//!
//! The `Vectorizer` trait provides a shared interface for all of the
//! data vectorizations in rusty-machine.

pub mod text;

use learning::LearningResult;

/// A trait used to construct Vectorizers.
///
/// `U` is the type of a single input sample and `V` is the
/// vectorized output produced from a batch of inputs.
pub trait Vectorizer<U, V> {
    /// Fit the vectorizer to the inputs, learning any state
    /// (e.g. a vocabulary) required before `vectorize` is called.
    fn fit(&mut self, inputs: &[U]) -> LearningResult<()>;
    /// Vectorize the inputs using the previously fitted state.
    fn vectorize(&mut self, inputs: &[U]) -> LearningResult<V>;
}
89 changes: 89 additions & 0 deletions src/data/vectorizers/text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//! Text Vectorizers
//!
//! This module contains some text vectorizers.
//!
//! The `FreqVectorizer` vectorizer is used to vectorize vectors of
//! strings using a traditional Bag of Words where each string
//! is split on whitespace and each word is transformed into
//! lowercase
//!
//! # Examples
//!
//! ```
//! use rusty_machine::data::vectorizers::Vectorizer;
//! use rusty_machine::data::vectorizers::text::FreqVectorizer;
//! use rusty_machine::data::tokenizers::NaiveTokenizer;
//!
//! // Constructs an empty `FreqVectorizer` vectorizer
//! let mut freq_vectorizer = FreqVectorizer::<f32, NaiveTokenizer>::new(NaiveTokenizer::new());
//!
//! let inputs = vec!["This is test"];
//!
//! // Fit the vectorizer
//! freq_vectorizer.fit(&inputs).unwrap();
//!
//! // Transform the inputs
//! let vectorized = freq_vectorizer.vectorize(&inputs).unwrap();
//! ```

use libnum::Float;
use std::collections::HashMap;

use std::marker::PhantomData;
use linalg::Matrix;
use learning::LearningResult;
use super::Vectorizer;
use super::super::tokenizers::Tokenizer;

/// FreqVectorizer vectorizer for text
///
/// This provides an implementation of `Vectorizer`
/// which allows us to vectorize strings by doing a non-normalized frequency count
///
/// See the module description for more information.
#[derive(Debug)]
pub struct FreqVectorizer<U: Float, T: Tokenizer> {
    // Maps each known token to its column index in the output matrix.
    words: HashMap<String, usize>,
    // Tokenizer used to split each input text into tokens.
    tokenizer: T,
    // Zero-sized marker tying the output float type `U` to the struct.
    float_type: PhantomData<U>,
}

impl<'a, U: Float, T: Tokenizer> Vectorizer<&'a str, Matrix<U>> for FreqVectorizer<U, T> {
fn vectorize(&mut self, texts: &[&'a str]) -> LearningResult<Matrix<U>> {
let mut result = Matrix::zeros(texts.len(), self.words.len());
for (text, row) in texts.iter().zip((0..texts.len())) {
let tokens = self.tokenizer.tokenize(text).unwrap();
for token in tokens {
if let Some(&col) = self.words.get(&token) {
result[[row, col]] = result[[row, col]] + U::one();
}
}
}
Ok(result)
}
fn fit(&mut self, texts: &[&str]) -> LearningResult<()> {
self.words = HashMap::new();
let mut index: usize = 0;
for text in texts {
let tokens = self.tokenizer.tokenize(text).unwrap();
for token in tokens {
if !self.words.contains_key(&token) {
self.words.insert(token, index);
index += 1;
}
}
}
Ok(())
}
}

impl<U: Float, T: Tokenizer> FreqVectorizer<U, T> {
    /// Constructs an empty `FreqVectorizer` vectorizer.
    ///
    /// The vocabulary starts empty; call `fit` before `vectorize`.
    pub fn new(tokenizer: T) -> Self {
        FreqVectorizer {
            tokenizer,
            words: HashMap::new(),
            float_type: PhantomData,
        }
    }
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ pub mod linalg {
/// Module for data handling
pub mod data {
pub mod transforms;
pub mod vectorizers;
pub mod tokenizers;
}

/// Module for machine learning.
Expand Down
17 changes: 17 additions & 0 deletions tests/data/vectorizers/text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use rm::data::vectorizers::text::FreqVectorizer;
use rm::data::tokenizers::{NaiveTokenizer};
use rm::data::vectorizers::Vectorizer;
use rm::prelude::Matrix;

#[test]
fn test_frequency_vectorizer() {
    // Fit on a corpus with three distinct words: "this", "is", "fit".
    let mut vectorizer = FreqVectorizer::<f32, NaiveTokenizer>::new(NaiveTokenizer::new());
    let fit_inputs = vec!["This is fit"];
    vectorizer.fit(&fit_inputs).unwrap();

    // Words outside the fitted vocabulary ("vectorize", "not") are ignored.
    let inputs = vec!["this is vectorize",
                      "this is not fit"];
    let vectorized = vectorizer.vectorize(&inputs).unwrap();
    let expected = Matrix::new(2, 3, vec![1., 1., 0., 1., 1., 1.]);
    assert_eq!(vectorized, expected);
}
10 changes: 8 additions & 2 deletions tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ pub mod learning {
mod gp;

pub mod optim {
mod grad_desc;
mod grad_desc;
}
}

pub mod datasets;
pub mod data {
pub mod vectorizers {
mod text;
}
}

pub mod datasets;