From edccb6022ef81197eeb5618301a3f84b64729f6c Mon Sep 17 00:00:00 2001 From: NeTT Date: Sat, 7 Sep 2024 20:01:27 +0530 Subject: [PATCH] completely revamp utilities --- deno.jsonc | 2 + .../utilities/src/encoding/categorical.ts | 92 --------------- packages/utilities/src/encoding/mod.ts | 10 +- packages/utilities/src/encoding/multihot.ts | 41 +++++++ packages/utilities/src/encoding/onehot.ts | 37 ++++++ packages/utilities/src/encoding/softmax.ts | 35 ++++++ .../utilities/src/encoding/termfrequency.ts | 43 +++++++ packages/utilities/src/mapper/discrete.ts | 56 +++++++++ packages/utilities/src/mapper/mod.ts | 1 + .../src/text/{preprocess => }/cleaner.ts | 17 ++- packages/utilities/src/text/mod.ts | 3 +- packages/utilities/src/text/preprocess/mod.ts | 29 ----- .../src/text/preprocess/tokenize/mod.ts | 1 - .../src/text/preprocess/tokenize/split.ts | 109 ------------------ .../preprocess/vectorize/count_vectorizer.ts | 40 ------- .../src/text/preprocess/vectorize/mod.ts | 2 - .../text/preprocess/vectorize/multi_hot.ts | 39 ------- packages/utilities/src/text/vectorizer.ts | 60 ++++++++++ .../{text/preprocess => }/transformer/mod.ts | 0 .../preprocess => }/transformer/tfidf.ts | 15 +-- packages/utilities/src/utils/common_types.ts | 59 ++++++---- 21 files changed, 338 insertions(+), 353 deletions(-) delete mode 100644 packages/utilities/src/encoding/categorical.ts create mode 100644 packages/utilities/src/encoding/multihot.ts create mode 100644 packages/utilities/src/encoding/onehot.ts create mode 100644 packages/utilities/src/encoding/softmax.ts create mode 100644 packages/utilities/src/encoding/termfrequency.ts create mode 100644 packages/utilities/src/mapper/discrete.ts create mode 100644 packages/utilities/src/mapper/mod.ts rename packages/utilities/src/text/{preprocess => }/cleaner.ts (69%) delete mode 100644 packages/utilities/src/text/preprocess/mod.ts delete mode 100644 packages/utilities/src/text/preprocess/tokenize/mod.ts delete mode 100644 packages/utilities/src/text/preprocess/tokenize/split.ts delete mode 100644 packages/utilities/src/text/preprocess/vectorize/count_vectorizer.ts delete mode 100644 packages/utilities/src/text/preprocess/vectorize/mod.ts delete mode 100644 packages/utilities/src/text/preprocess/vectorize/multi_hot.ts create mode 100644 packages/utilities/src/text/vectorizer.ts rename packages/utilities/src/{text/preprocess => }/transformer/mod.ts (100%) rename packages/utilities/src/{text/preprocess => }/transformer/tfidf.ts (70%) diff --git a/deno.jsonc b/deno.jsonc index 2840807..8e943e8 100644 --- a/deno.jsonc +++ b/deno.jsonc @@ -22,6 +22,8 @@ "./utilities/image": "./packages/utilities/src/image/mod.ts", "./utilities/metrics": "./packages/utilities/src/metrics/mod.ts", "./utilities/encoding": "./packages/utilities/src/encoding/mod.ts", + "./utilities/mapper": "./packages/utilities/src/mapper/mod.ts", + "./utilities/transformer": "./packages/utilities/src/transformer/mod.ts", "./utilities/misc": "./packages/utilities/src/utils/mod.ts", // Tokenizers "./tokenizers": "./packages/tokenizers/mod.ts", diff --git a/packages/utilities/src/encoding/categorical.ts b/packages/utilities/src/encoding/categorical.ts deleted file mode 100644 index a7d5700..0000000 --- a/packages/utilities/src/encoding/categorical.ts +++ /dev/null @@ -1,92 +0,0 @@ -import { Matrix, type MatrixLike } from "../mod.ts"; -import type { DataType, DType, DTypeValue } from "../utils/common_types.ts"; - -/** Convert class labels into categorical variables (one-hot) */ -export class CategoricalEncoder { - 
/** Map categories to indices */ - mapping: Map; - /** An internal counter for remembering the last index in mapping. */ - #lastToken: Uint32Array; - constructor() { - this.mapping = new Map(); - this.#lastToken = new Uint32Array(1); - } - /** Construct a mapping from a given set of text. */ - fit(targets: T[]): this { - let i = 0; - while (i < targets.length) { - if (!this.mapping.has(targets[i])) { - const token = this.#incrementToken(); - this.mapping.set(targets[i], token); - } - i += 1; - } - return this; - } - /** One-hot encoding of categorical values */ - transform
<DT extends DataType>(targets: T[], dType: DT): Matrix<DT>
{ - const res = new Matrix<DT>
(dType, [targets.length, this.#lastToken[0]]); - let i = 0; - while (i < targets.length) { - const index = this.mapping.get(targets[i]); - if (index !== 0 && !index) { - i += 1; - continue; - } - res.setCell(i, index, 1); - i += 1; - } - return res; - } - untransform
<DT extends DataType>(data: MatrixLike<DT>
): T[] { - const matrix = new Matrix(data); - const res = new Array(matrix.nRows); - for (let i = 0; i < res.length; i += 1) { - const idx = matrix.row(i).findIndex((x) => x === 1); - res[i] = this.getOg(idx) || "__unknown__"; - } - return res; - } - getOg(data: number): T | undefined { - for (const [k, v] of this.mapping.entries()) { - if (v === data) { - return k; - } - } - return undefined; - } - #incrementToken(): number { - return Atomics.add(this.#lastToken, 0, 1); - } - /** - * Convert softmax outputs into categorical outputs - * This method mutates the original matrix. - * @returns The modified matrix. - */ - static fromSoftmax
<DT extends DataType>(data: MatrixLike<DT>
): Matrix<DT>
{ - const matrix = new Matrix(data); - for (let i = 0; i < matrix.nRows; i += 1) { - const max = matrix - .row(i) - // @ts-ignore It can reduce. - .reduce( - (acc: number, curr: DTypeValue
<DT>, i: number, arr: DType<DT>
) => - arr[acc] > curr ? acc : i, - 0, - ); - if ( - data.data instanceof BigInt64Array || - data.data instanceof BigUint64Array - ) { - const newR = new Array(matrix.nCols).fill(0n); - newR[max] = 1n; - matrix.setRow(i, newR); - } else { - const newR = new Array(matrix.nCols).fill(0); - newR[max] = 1; - matrix.setRow(i, newR); - } - } - return matrix; - } -} diff --git a/packages/utilities/src/encoding/mod.ts b/packages/utilities/src/encoding/mod.ts index 26408b2..741dea0 100644 --- a/packages/utilities/src/encoding/mod.ts +++ b/packages/utilities/src/encoding/mod.ts @@ -1,6 +1,4 @@ -/** - * Encoding data into different representations. - * @module - */ - -export * from "./categorical.ts"; +export { MultiHotEncoder } from "./multihot.ts"; +export { OneHotEncoder } from "./onehot.ts"; +export { TfEncoder } from "./termfrequency.ts"; +export { transformSoftmaxMut } from "./softmax.ts"; diff --git a/packages/utilities/src/encoding/multihot.ts b/packages/utilities/src/encoding/multihot.ts new file mode 100644 index 0000000..3fd3924 --- /dev/null +++ b/packages/utilities/src/encoding/multihot.ts @@ -0,0 +1,41 @@ +import { Matrix } from "../mod.ts"; +import type { DataType } from "../utils/common_types.ts"; + +/** + * Convert 2D array of indices into multi-hot encoded vectors. + */ +export class MultiHotEncoder { + /** Size of encoded vectors. */ + mappingSize: number; + constructor(mappingSize: number) { + this.mappingSize = mappingSize; + } + /** Encoding values into multi-hot vectors */ + transform
<DT extends DataType>(targets: Matrix<DT>
): Matrix<DT>
; + transform
<DT extends DataType>(targets: number[][], dType: DT): Matrix<DT>
; + transform
<DT extends DataType>( + targets: number[][] | Matrix<DT>
, + dType?: DT + ): Matrix<DT>
{ + if (!dType && !(targets instanceof Matrix)) + throw new Error("dType required when not dealing with matrices."); + const dataType = dType || (targets as Matrix
<DT>).dType; + const res = new Matrix<DT>
(dataType, [targets.length, this.mappingSize]); + let i = 0; + while (i < targets.length) { + const row = targets instanceof Matrix ? targets.row(i) : targets[i]; + let j = 0; + while (j < row.length) { + if (Number(row[j]) >= row.length) { + j += 1; + continue; + } + res.setCell(i, Number(row[j]), 1); + j += 1; + } + + i += 1; + } + return res; + } +} diff --git a/packages/utilities/src/encoding/onehot.ts b/packages/utilities/src/encoding/onehot.ts new file mode 100644 index 0000000..c09d096 --- /dev/null +++ b/packages/utilities/src/encoding/onehot.ts @@ -0,0 +1,37 @@ +import { Matrix, type MatrixLike } from "../mod.ts"; +import type { DataType } from "../utils/common_types.ts"; + +/** + * Convert an array of indices into one-hot encoded vectors. + */ +export class OneHotEncoder { + /** Size of one-hot encoded vectors. */ + mappingSize: number; + constructor(mappingSize: number) { + this.mappingSize = mappingSize; + } + /** One-hot encoding of values */ + transform
<DT extends DataType>(targets: number[], dType: DT): Matrix<DT>
{ + const res = new Matrix<DT>
(dType, [targets.length, this.mappingSize]); + let i = 0; + while (i < targets.length) { + const index = targets[i]; + if (index >= this.mappingSize) { + i += 1; + continue; + } + res.setCell(i, index, 1); + i += 1; + } + return res; + } + untransform
<DT extends DataType>(data: MatrixLike<DT>
): number[] { + const matrix = new Matrix(data); + const res = new Array(matrix.nRows); + for (let i = 0; i < res.length; i += 1) { + const idx = matrix.row(i).findIndex((x) => x === 1); + res[i] = idx; + } + return res; + } +} diff --git a/packages/utilities/src/encoding/softmax.ts b/packages/utilities/src/encoding/softmax.ts new file mode 100644 index 0000000..a1abe6c --- /dev/null +++ b/packages/utilities/src/encoding/softmax.ts @@ -0,0 +1,35 @@ +import { Matrix, type MatrixLike } from "../mod.ts"; +import type { DataType, DType, DTypeValue } from "../utils/common_types.ts"; + +/** + * Convert a softmax output into one-hot vectors. + * Mutates the input. + */ +export function transformSoftmaxMut
<DT extends DataType>( + targets: MatrixLike<DT>
): Matrix<DT>
{ + const matrix = new Matrix(targets); + for (let i = 0; i < matrix.nRows; i += 1) { + const max = matrix + .row(i) + // @ts-ignore It can reduce. + .reduce( + (acc: number, curr: DTypeValue
<DT>, i: number, arr: DType<DT>
) => + arr[acc] > curr ? acc : i, + 0 + ); + if ( + targets.data instanceof BigInt64Array || + targets.data instanceof BigUint64Array + ) { + const newR = new Array(matrix.nCols).fill(0n); + newR[max] = 1n; + matrix.setRow(i, newR); + } else { + const newR = new Array(matrix.nCols).fill(0); + newR[max] = 1; + matrix.setRow(i, newR); + } + } + return matrix; +} diff --git a/packages/utilities/src/encoding/termfrequency.ts b/packages/utilities/src/encoding/termfrequency.ts new file mode 100644 index 0000000..acb523f --- /dev/null +++ b/packages/utilities/src/encoding/termfrequency.ts @@ -0,0 +1,43 @@ +import { Matrix } from "../mod.ts"; +import type { DataType } from "../utils/common_types.ts"; + +/** + * Convert 2D array of indices into multi-hot encoded vectors + * where each index contains the number of times the respective + * value appears in a sample (term frequency encoder). + */ +export class TfEncoder { + /** Size of encoded vectors. */ + mappingSize: number; + constructor(mappingSize: number) { + this.mappingSize = mappingSize; + } + /** Encoding values into count vectors */ + transform
<DT extends DataType>(targets: Matrix<DT>
): Matrix<DT>
; + transform
<DT extends DataType>(targets: number[][], dType: DT): Matrix<DT>
; + transform
<DT extends DataType>( + targets: number[][] | Matrix<DT>
, + dType?: DT + ): Matrix<DT>
{ + if (!dType && !(targets instanceof Matrix)) + throw new Error("dType required when not dealing with matrices."); + const dataType = dType || (targets as Matrix
<DT>).dType; + const res = new Matrix<DT>
(dataType, [targets.length, this.mappingSize]); + let i = 0; + while (i < targets.length) { + const row = targets instanceof Matrix ? targets.row(i) : targets[i]; + let j = 0; + while (j < row.length) { + if (Number(row[j]) >= row.length) { + j += 1; + continue; + } + res.setAdd(i, Number(row[j]), 1); + j += 1; + } + + i += 1; + } + return res; + } +} diff --git a/packages/utilities/src/mapper/discrete.ts b/packages/utilities/src/mapper/discrete.ts new file mode 100644 index 0000000..bdfbf5e --- /dev/null +++ b/packages/utilities/src/mapper/discrete.ts @@ -0,0 +1,56 @@ +/** Map discrete values into numbers */ +export class DiscreteMapper { + /** Map categories to indices */ + mapping: Map; + /** An internal counter for remembering the last index in mapping. */ + #lastToken: Uint32Array; + constructor() { + this.mapping = new Map(); + this.#lastToken = new Uint32Array(1); + } + /** Construct a mapping from a given set of text. */ + fit(targets: T[]): this { + let i = 0; + while (i < targets.length) { + if (!this.mapping.has(targets[i])) { + const token = this.#incrementToken(); + this.mapping.set(targets[i], token); + } + i += 1; + } + return this; + } + /** + * Encode values into their respective mappings. + * Returns -1 in case of missing mapping. + */ + transform(targets: T[]): number[] { + const res = new Array(targets.length); + let i = 0; + while (i < targets.length) { + const index = this.mapping.get(targets[i]) ?? -1; + res[i] = index; + i += 1; + } + return res; + } + /** Convert mapped numbers into actual values */ + untransform(data: number[]): T[] { + const res = new Array(data.length); + for (let i = 0; i < res.length; i += 1) { + res[i] = this.getOg(data[i]) || "__unknown__"; + } + return res; + } + getOg(data: number): T | undefined { + for (const [k, v] of this.mapping.entries()) { + if (v === data) { + return k; + } + } + return undefined; + } + #incrementToken(): number { + return Atomics.add(this.#lastToken, 0, 1); + } +} diff --git a/packages/utilities/src/mapper/mod.ts b/packages/utilities/src/mapper/mod.ts new file mode 100644 index 0000000..3942fc0 --- /dev/null +++ b/packages/utilities/src/mapper/mod.ts @@ -0,0 +1 @@ +export { DiscreteMapper } from "./discrete.ts"; diff --git a/packages/utilities/src/text/preprocess/cleaner.ts b/packages/utilities/src/text/cleaner.ts similarity index 69% rename from packages/utilities/src/text/preprocess/cleaner.ts rename to packages/utilities/src/text/cleaner.ts index 575bd28..5627d5b 100644 --- a/packages/utilities/src/text/preprocess/cleaner.ts +++ b/packages/utilities/src/text/cleaner.ts @@ -1,4 +1,5 @@ -import type { StandardizeConfig } from "../../utils/common_types.ts"; +import type { StandardizeConfig } from "../utils/common_types.ts"; +import { DefaultIgnoreList } from "../constants/stop_words.ts"; /** Simple text cleaner */ export class TextCleaner implements StandardizeConfig { @@ -6,16 +7,19 @@ export class TextCleaner implements StandardizeConfig { lowercase: boolean; normalizeWhiteSpaces: boolean; stripNewlines: boolean; + removeStopWords: false | "english" | string[]; constructor({ stripHtml = false, lowercase = false, normalizeWhiteSpaces = true, stripNewlines = true, + removeStopWords = false, }: StandardizeConfig = {}) { this.stripHtml = stripHtml; this.lowercase = lowercase; this.normalizeWhiteSpaces = normalizeWhiteSpaces; this.stripNewlines = stripNewlines; + this.removeStopWords = removeStopWords; } clean(text: string): string; clean(text: string[]): string[]; @@ -35,7 +39,8 @@ export function preprocess( 
lowercase = false, normalizeWhiteSpaces = true, stripNewlines = true, - }: StandardizeConfig = {}, + removeStopWords = false, + }: StandardizeConfig = {} ): string { if (lowercase) { text = text.toLowerCase(); @@ -49,5 +54,13 @@ export function preprocess( if (normalizeWhiteSpaces) { text = text.replace(/\s\s+/g, " "); } + if (removeStopWords) { + const stopWords = + removeStopWords === "english" ? DefaultIgnoreList : removeStopWords; + text = text + .split(" ") + .filter((x) => !stopWords.includes(x)) + .join(" "); + } return text; } diff --git a/packages/utilities/src/text/mod.ts b/packages/utilities/src/text/mod.ts index 84e8bda..2e62a00 100644 --- a/packages/utilities/src/text/mod.ts +++ b/packages/utilities/src/text/mod.ts @@ -4,4 +4,5 @@ * @module */ -export * from "./preprocess/mod.ts"; +export * from "./cleaner.ts"; +export * from "./vectorizer.ts" \ No newline at end of file diff --git a/packages/utilities/src/text/preprocess/mod.ts b/packages/utilities/src/text/preprocess/mod.ts deleted file mode 100644 index 246a7c3..0000000 --- a/packages/utilities/src/text/preprocess/mod.ts +++ /dev/null @@ -1,29 +0,0 @@ -import type { - Cleaner, - Tokenizer, - Transformer, - Vectorizer, -} from "../../utils/common_types.ts"; - -// import { TextCleaner } from "./cleaner.ts"; -// import { SplitTokenizer } from "./tokenize/mod.ts"; -// import { CountVectorizer } from "./vectorize/mod.ts"; -// import { TfIdfTransformer } from "./transformer/mod.ts"; - -/** TODO */ -interface PreprocessorConfig { - vectorizer: Vectorizer; - tokenizer: Tokenizer; - cleaner: Cleaner; - transformer: Transformer; -}; - -/** TODO */ -export class TextPreprocessor implements Partial { - // todo -} - -export * from "./cleaner.ts"; -export * from "./tokenize/mod.ts"; -export * from "./transformer/mod.ts"; -export * from "./vectorize/mod.ts"; diff --git a/packages/utilities/src/text/preprocess/tokenize/mod.ts b/packages/utilities/src/text/preprocess/tokenize/mod.ts deleted file mode 100644 index 80a9941..0000000 --- a/packages/utilities/src/text/preprocess/tokenize/mod.ts +++ /dev/null @@ -1 +0,0 @@ -export { SplitTokenizer } from "./split.ts"; diff --git a/packages/utilities/src/text/preprocess/tokenize/split.ts b/packages/utilities/src/text/preprocess/tokenize/split.ts deleted file mode 100644 index ba02928..0000000 --- a/packages/utilities/src/text/preprocess/tokenize/split.ts +++ /dev/null @@ -1,109 +0,0 @@ -import { DefaultIgnoreList } from "../../../constants/stop_words.ts"; -import type { BaseTokenizerOptions } from "../../../utils/common_types.ts"; - -/** Tokenize text based on separator (whitespace) */ -export class SplitTokenizer { - /** Words to ignore from vocabulary */ - skipWords: "english" | false | string[]; - /** Configuration / Function for preprocessing */ - vocabulary: Map; - /** An internal counter for remembering the last index in vocabulary. */ - #lastToken: Uint32Array; - constructor( - options: Partial = {}, - ) { - this.skipWords = options.skipWords ?? false; - this.vocabulary = options.vocabulary ?? new Map(); - this.#lastToken = new Uint32Array(1); - if (options.indices && !this.vocabulary.size) { - this.#lastToken[0] = 2; - this.vocabulary.set("__pad__", 0); - this.vocabulary.set("__unk__", 1); - } - if (this.vocabulary.size) { - this.#lastToken[0] = this.vocabulary.size; - } - } - get lastToken(): number { - return Atomics.load(this.#lastToken, 0); - } - /** Construct a vocabulary from a given set of text. 
*/ - fit(text: string | string[]): this { - if (Array.isArray(text)) { - let i = 0; - while (i < text.length) { - this.fit(text[i]); - i += 1; - } - } else { - const words = this.split(text); - let i = 0; - while (i < words.length) { - if (!this.vocabulary.has(words[i])) { - if (this.skipWords === "english") { - if (DefaultIgnoreList.includes(words[i])) { - i += 1; - continue; - } - } else if (Array.isArray(this.skipWords)) { - if (this.skipWords.includes(words[i])) { - i += 1; - continue; - } - } - const token = this.#incrementToken(); - this.vocabulary.set(words[i], token); - } - i += 1; - } - } - return this; - } - #incrementToken(): number { - return Atomics.add(this.#lastToken, 0, 1); - } - /** - * Convert a document (string | array of strings) into vectors. - */ - transform(text: string | string[]): number[][] { - if (!this.vocabulary.size) { - throw new Error( - "Tokenizer vocabulary not initialized yet. Call `Tokenizer()` with a custom vocabulary or use `.fit()` on text.", - ); - } - if (Array.isArray(text)) { - const size = Math.max(...text.map((x) => this.split(x).length)); - const res = Array(text.length); - let i = 0; - while (i < text.length) { - res[i] = this.#transform(text[i], size); - i++; - } - return res; - } - return [this.#transform(text, 0)]; - } - #transform(text: string, size: number): number[] { - const words = this.split(text); - if (!size) size = words.length; - const res = new Array(size); - res.fill(this.vocabulary.get("__pad__") || 0); - let i = 0; - while (i < words.length && i < size) { - if (this.vocabulary.has(words[i])) { - const index = this.vocabulary.get(words[i]); - res[i] = typeof index === "number" - ? index - : this.vocabulary.get("__unk__") || 0; - } else { - res[i] = this.vocabulary.get("__unk__") || 0; - } - i++; - } - return res; - } - // TODO: Support custom split modes - split(text: string): string[] { - return text.split(" "); - } -} diff --git a/packages/utilities/src/text/preprocess/vectorize/count_vectorizer.ts b/packages/utilities/src/text/preprocess/vectorize/count_vectorizer.ts deleted file mode 100644 index 189fe0f..0000000 --- a/packages/utilities/src/text/preprocess/vectorize/count_vectorizer.ts +++ /dev/null @@ -1,40 +0,0 @@ -import type { DataType, DType } from "../../../utils/common_types.ts"; -import { getConstructor } from "../../../utils/mod.ts"; -import { Matrix } from "../../../mod.ts"; - -/** - * Convert tokens into vectors based on term frequency - */ -export class CountVectorizer { - #vocabSize: number; - constructor(vocabSize: number) { - this.#vocabSize = vocabSize; - } - /** - * Convert a document (string | array of strings) into vectors. - */ - transform(tokens: number[][], dType: T): Matrix { - if (!this.#vocabSize) { - throw new Error("Vocab not initialized."); - } - const res = new Matrix(dType, [tokens.length, this.#vocabSize]); - let i = 0; - while (i < tokens.length) { - res.setRow(i, this.#transform(tokens[i], dType)); - i += 1; - } - return res as Matrix; - } - #transform(tokens: number[], dType: T): DType { - const res = new (getConstructor(dType))(this.#vocabSize); - let i = 0; - while (i < tokens.length) { - if (tokens[i] < this.#vocabSize) { - // @ts-ignore No error here - res[tokens[i]] += typeof res[tokens[i]] === "bigint" ? 
1n : 1; - } - i += 1; - } - return res as DType; - } -} diff --git a/packages/utilities/src/text/preprocess/vectorize/mod.ts b/packages/utilities/src/text/preprocess/vectorize/mod.ts deleted file mode 100644 index 2f9fa65..0000000 --- a/packages/utilities/src/text/preprocess/vectorize/mod.ts +++ /dev/null @@ -1,2 +0,0 @@ -export { CountVectorizer } from "./count_vectorizer.ts"; -export { MultiHotVectorizer } from "./multi_hot.ts"; diff --git a/packages/utilities/src/text/preprocess/vectorize/multi_hot.ts b/packages/utilities/src/text/preprocess/vectorize/multi_hot.ts deleted file mode 100644 index ba7800a..0000000 --- a/packages/utilities/src/text/preprocess/vectorize/multi_hot.ts +++ /dev/null @@ -1,39 +0,0 @@ -import type { DataType, DType } from "../../../utils/common_types.ts"; -import { getConstructor } from "../../../utils/mod.ts"; -import { Matrix } from "../../../mod.ts"; - -/** - * Convert tokens into vectors based on term frequency - */ -export class MultiHotVectorizer { - #vocabSize: number; - constructor(vocabSize: number) { - this.#vocabSize = vocabSize; - } - /** - * Convert a document (string | array of strings) into vectors. - */ - transform(tokens: number[][], dType: T): Matrix { - if (!this.#vocabSize) { - throw new Error("Vocab not initialized."); - } - const res = new Matrix(dType, [tokens.length, this.#vocabSize]); - let i = 0; - while (i < tokens.length) { - res.setRow(i, this.#transform(tokens[i], dType)); - i += 1; - } - return res as Matrix; - } - #transform(tokens: number[], dType: T): DType { - const res = new (getConstructor(dType))(this.#vocabSize); - let i = 0; - while (i < tokens.length) { - if (tokens[i] < this.#vocabSize) { - res[tokens[i]] = typeof res[tokens[i]] === "bigint" ? 1n : 1; - } - i += 1; - } - return res as DType; - } -} diff --git a/packages/utilities/src/text/vectorizer.ts b/packages/utilities/src/text/vectorizer.ts new file mode 100644 index 0000000..0f7e3ba --- /dev/null +++ b/packages/utilities/src/text/vectorizer.ts @@ -0,0 +1,60 @@ +import { TfEncoder } from "../encoding/mod.ts"; +import { DiscreteMapper } from "../mapper/discrete.ts"; +import { Matrix } from "../mod.ts"; +import { TfIdfTransformer } from "../transformer/tfidf.ts"; +import type { DataType } from "../utils/common_types.ts"; + +export class TextVectorizer { + mode: "tf" | "tfidf" | "indices"; + mapper: DiscreteMapper; + encoder?: TfEncoder; + transformer?: TfIdfTransformer; + constructor(mode: "tf" | "tfidf" | "indices" = "indices") { + this.mode = mode; + this.mapper = new DiscreteMapper(); + } + fit(document: string | string[]) { + this.mapper.fit( + (Array.isArray(document) ? document.join(" ") : document).split(" ") + ); + const tokens = Array.isArray(document) + ? document.map((x) => this.mapper.transform(x.split(" "))) + : [this.mapper.transform(document.split(" "))]; + if (this.mode === "tf" || this.mode === "tfidf") { + this.encoder = new TfEncoder(this.mapper.mapping.size); + if (this.mode === "tfidf") { + this.transformer = new TfIdfTransformer(); + this.transformer.fit(this.encoder.transform(tokens, "f32")); + } + } + } + transform
<DT extends DataType>( + document: string | string[], + dType: DT + ): Matrix<DT>
{ + if (!this.mapper.mapping.size) + throw new Error("Text Vectorizer not trained yet. Use .fit() first."); + const tokens = Array.isArray(document) + ? document.map((x) => this.mapper.transform(x.split(" "))) + : [this.mapper.transform(document.split(" "))]; + if (this.mode === "indices") { + const res = new Matrix(dType, [ + tokens.length, + Math.max(...tokens.map((x) => x.length)), + ]); + for (let i = 0; i < res.nRows; i += 1) { + res.setRow(i, tokens[i]); + } + return res; + } + if (!this.encoder) + throw new Error("Text Vectorizer not trained yet. Use .fit() first."); + const encoded = this.encoder.transform(tokens, dType); + if (this.mode === "tf") return encoded; + else { + if (!this.transformer) + throw new Error("Text Vectorizer not trained yet. Use .fit() first."); + return this.transformer.transform
(encoded); + } + } +} diff --git a/packages/utilities/src/text/preprocess/transformer/mod.ts b/packages/utilities/src/transformer/mod.ts similarity index 100% rename from packages/utilities/src/text/preprocess/transformer/mod.ts rename to packages/utilities/src/transformer/mod.ts diff --git a/packages/utilities/src/text/preprocess/transformer/tfidf.ts b/packages/utilities/src/transformer/tfidf.ts similarity index 70% rename from packages/utilities/src/text/preprocess/transformer/tfidf.ts rename to packages/utilities/src/transformer/tfidf.ts index cef8ea4..ce185a7 100644 --- a/packages/utilities/src/text/preprocess/transformer/tfidf.ts +++ b/packages/utilities/src/transformer/tfidf.ts @@ -1,11 +1,11 @@ -import type { DataType } from "../../../utils/common_types.ts"; -import type { Matrix, MatrixLike } from "../../../mod.ts"; -import { multiplyDiags } from "../../../utils/math.ts"; +import type { DataType } from "../utils/common_types.ts"; +import type { Matrix, MatrixLike } from "../mod.ts"; +import { multiplyDiags } from "../utils/math.ts"; -/** Convert tf features (CountVectorizer) into tf-idf features. */ +/** Convert tf features (Count) into tf-idf features. */ export class TfIdfTransformer { - idf: null | Float64Array; - constructor({ idf }: { idf?: Float64Array } = {}) { + idf: null | Float32Array; + constructor({ idf }: { idf?: Float32Array } = {}) { this.idf = idf ?? null; } /** @@ -20,7 +20,7 @@ export class TfIdfTransformer { }; const freq = data.rowSum(); - const idf = new Float64Array(freq.length); + const idf = new Float32Array(freq.length); let i = 0; while (i < idf.length) { @@ -32,6 +32,7 @@ export class TfIdfTransformer { } /** * Transform an tf features into tf-idf features. + * Mutates the input. * @param data tf features from CountVectorizer * @returns Sparse matrix of Tf-Idf features */ diff --git a/packages/utilities/src/utils/common_types.ts b/packages/utilities/src/utils/common_types.ts index d373484..005e0c9 100644 --- a/packages/utilities/src/utils/common_types.ts +++ b/packages/utilities/src/utils/common_types.ts @@ -51,27 +51,34 @@ interface TypedArrayValueMapping { f64: number; } -export type DTypeValue = T extends - keyof TypedArrayValueMapping ? TypedArrayValueMapping[T] : never; +export type DTypeValue = + T extends keyof TypedArrayValueMapping ? TypedArrayValueMapping[T] : never; type AddableTypes = number | bigint; export type AddDTypeValues< T1 extends AddableTypes, - T2 extends AddableTypes, -> = T1 extends number ? T2 extends number ? number - : T2 extends bigint ? bigint - : never - : T1 extends bigint ? T2 extends number ? bigint - : T2 extends bigint ? bigint + T2 extends AddableTypes +> = T1 extends number + ? T2 extends number + ? number + : T2 extends bigint + ? bigint + : never + : T1 extends bigint + ? T2 extends number + ? bigint + : T2 extends bigint + ? bigint : never : never; -export type DType = T extends - keyof TypedArrayMapping ? TypedArrayMapping[T] : never; +export type DType = + T extends keyof TypedArrayMapping ? TypedArrayMapping[T] : never; export type DTypeConstructor = - T extends keyof TypedArrayConstructorMapping ? TypedArrayConstructorMapping[T] + T extends keyof TypedArrayConstructorMapping + ? 
TypedArrayConstructorMapping[T] : never; export type TypedArray = @@ -93,8 +100,8 @@ export interface Sliceable { predicate: ( value: unknown, index: number, - array: unknown[], - ) => value is unknown, + array: unknown[] + ) => value is unknown ): Sliceable; slice(start?: number, end?: number): Sliceable; length: number; @@ -166,27 +173,29 @@ export interface StandardizeConfig { normalizeWhiteSpaces?: boolean; /** Strip Newlines */ stripNewlines?: boolean; + /** Remove stop words from text */ + removeStopWords?: "english" | false | string[]; } export type VectorizerMode = "count" | "indices" | "multihot" | "tfidf"; export type VectorizerModeConfig = | { - mode: "count"; - config?: Partial; - } + mode: "count"; + config?: Partial; + } | { - mode: "indices"; - config?: Partial; - } + mode: "indices"; + config?: Partial; + } | { - mode: "multihot"; - config?: Partial; - } + mode: "multihot"; + config?: Partial; + } | { - mode: "tfidf"; - config?: Partial; - }; + mode: "tfidf"; + config?: Partial; + }; export interface TokenizerModeConfig { mode: "whitespace";
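Below is a minimal usage sketch of how the revamped pieces fit together. It is illustrative only and not part of the patch; the relative import paths and the DiscreteMapper<string> type argument are assumptions based on the files added above, and "f32" mirrors the dtype used inside TextVectorizer.fit.

import { OneHotEncoder } from "./packages/utilities/src/encoding/mod.ts";
import { DiscreteMapper } from "./packages/utilities/src/mapper/mod.ts";
import { TextVectorizer } from "./packages/utilities/src/text/mod.ts";

// Map string labels to integer ids, then one-hot encode them.
const mapper = new DiscreteMapper<string>();
mapper.fit(["cat", "dog", "cat", "bird"]);
const ids = mapper.transform(["cat", "dog", "bird"]); // [0, 1, 2]
const labels = new OneHotEncoder(mapper.mapping.size).transform(ids, "f32");

// Vectorize a small corpus as term-frequency vectors.
const vectorizer = new TextVectorizer("tf");
vectorizer.fit(["hello world", "hello there"]);
const tf = vectorizer.transform(["hello hello world"], "f32");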