Skip to content

Commit

Permalink
completely revamp utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
retraigo committed Sep 7, 2024
1 parent 940f2fb commit edccb60
Show file tree
Hide file tree
Showing 21 changed files with 338 additions and 353 deletions.
2 changes: 2 additions & 0 deletions deno.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
"./utilities/image": "./packages/utilities/src/image/mod.ts",
"./utilities/metrics": "./packages/utilities/src/metrics/mod.ts",
"./utilities/encoding": "./packages/utilities/src/encoding/mod.ts",
"./utilities/mapper": "./packages/utilities/src/mapper/mod.ts",
"./utilities/transformer": "./packages/utilities/src/transformer/mod.ts",
"./utilities/misc": "./packages/utilities/src/utils/mod.ts",
// Tokenizers
"./tokenizers": "./packages/tokenizers/mod.ts",
Expand Down
92 changes: 0 additions & 92 deletions packages/utilities/src/encoding/categorical.ts

This file was deleted.

10 changes: 4 additions & 6 deletions packages/utilities/src/encoding/mod.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
/**
* Encoding data into different representations.
* @module
*/

export * from "./categorical.ts";
export { MultiHotEncoder } from "./multihot.ts";
export { OneHotEncoder } from "./onehot.ts";
export { TfEncoder } from "./termfrequency.ts";
export { transformSoftmaxMut } from "./softmax.ts";
41 changes: 41 additions & 0 deletions packages/utilities/src/encoding/multihot.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { Matrix } from "../mod.ts";
import type { DataType } from "../utils/common_types.ts";

/**
 * Convert 2D array of indices into multi-hot encoded vectors.
 *
 * Every input row lists the category indices present in one sample. The
 * matching output row gets a 1 written at each of those indices; all other
 * cells keep whatever value a freshly constructed Matrix holds
 * (presumably 0 - confirm against Matrix).
 */
export class MultiHotEncoder {
  /** Size of encoded vectors, i.e. the number of columns in the output. */
  mappingSize: number;
  constructor(mappingSize: number) {
    this.mappingSize = mappingSize;
  }
  /**
   * Encode rows of indices into multi-hot vectors.
   *
   * @param targets One row of category indices per sample, either as a
   * Matrix or as a plain 2D array.
   * @param dType Data type of the result. Required for plain arrays;
   * for a Matrix input it defaults to the input's own dType.
   * @returns A `targets.length` x `mappingSize` matrix.
   * @throws Error if `targets` is a plain array and `dType` is missing.
   */
  transform<DT extends DataType>(targets: Matrix<DT>): Matrix<DT>;
  transform<DT extends DataType>(targets: number[][], dType: DT): Matrix<DT>;
  transform<DT extends DataType>(
    targets: number[][] | Matrix<DT>,
    dType?: DT
  ): Matrix<DT> {
    if (!dType && !(targets instanceof Matrix))
      throw new Error("dType required when not dealing with matrices.");
    const dataType = dType || (targets as Matrix<DT>).dType;
    const res = new Matrix<DT>(dataType, [targets.length, this.mappingSize]);
    for (let i = 0; i < targets.length; i += 1) {
      const row = targets instanceof Matrix ? targets.row(i) : targets[i];
      for (let j = 0; j < row.length; j += 1) {
        const index = Number(row[j]);
        // The valid column range is defined by the encoder's width, not by
        // how many indices this particular sample happens to contain.
        // Negative or fractional values cannot address a column either.
        if (
          !Number.isInteger(index) ||
          index < 0 ||
          index >= this.mappingSize
        ) {
          continue;
        }
        res.setCell(i, index, 1);
      }
    }
    return res;
  }
}
37 changes: 37 additions & 0 deletions packages/utilities/src/encoding/onehot.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { Matrix, type MatrixLike } from "../mod.ts";
import type { DataType } from "../utils/common_types.ts";

/**
 * Convert an array of indices into one-hot encoded vectors.
 */
export class OneHotEncoder {
  /** Size of one-hot encoded vectors, i.e. the number of output columns. */
  mappingSize: number;
  constructor(mappingSize: number) {
    this.mappingSize = mappingSize;
  }
  /**
   * One-hot encoding of values.
   *
   * @param targets One category index per sample.
   * @param dType Data type of the resulting matrix.
   * @returns A `targets.length` x `mappingSize` matrix with a 1 written at
   * `(i, targets[i])` for every index that addresses a valid column. Rows
   * whose index is out of range are left untouched.
   */
  transform<DT extends DataType>(targets: number[], dType: DT): Matrix<DT> {
    const res = new Matrix<DT>(dType, [targets.length, this.mappingSize]);
    for (let i = 0; i < targets.length; i += 1) {
      const index = targets[i];
      // Skip anything that is not a valid column: too large, negative
      // (e.g. the -1 "missing" marker produced by a mapper) or fractional.
      if (
        !Number.isInteger(index) ||
        index < 0 ||
        index >= this.mappingSize
      ) {
        continue;
      }
      res.setCell(i, index, 1);
    }
    return res;
  }
  /**
   * Recover the index of the hot column in every row.
   *
   * @param data One-hot encoded matrix (or matrix-like object).
   * @returns For each row, the position of the first cell equal to 1,
   * or -1 when the row contains no such cell.
   */
  untransform<DT extends DataType>(data: MatrixLike<DT>): number[] {
    const matrix = new Matrix(data);
    const res = new Array<number>(matrix.nRows);
    for (let i = 0; i < res.length; i += 1) {
      // Number() lets the comparison also match 1n in BigInt-backed rows,
      // where a strict `=== 1` can never be true.
      res[i] = matrix.row(i).findIndex((x) => Number(x) === 1);
    }
    return res;
  }
}
35 changes: 35 additions & 0 deletions packages/utilities/src/encoding/softmax.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { Matrix, type MatrixLike } from "../mod.ts";
import type { DataType, DType, DTypeValue } from "../utils/common_types.ts";

/**
 * Convert a softmax output into one-hot vectors.
 * Mutates the input.
 *
 * For every row the column holding the largest value is located (on ties the
 * later column wins) and the row is overwritten with zeros plus a single one
 * at that column.
 *
 * @param targets Matrix-like softmax output, one distribution per row.
 * @returns The Matrix built from `targets` after its rows were rewritten.
 */
export function transformSoftmaxMut<DT extends DataType>(
  targets: MatrixLike<DT>
): Matrix<DT> {
  const matrix = new Matrix(targets);
  // BigInt-backed storage needs bigint fill values; decide once up front.
  const usesBigInt =
    targets.data instanceof BigInt64Array ||
    targets.data instanceof BigUint64Array;
  const [zero, one] = usesBigInt ? [0n, 1n] : [0, 1];
  for (let rowIdx = 0; rowIdx < matrix.nRows; rowIdx += 1) {
    const hotCol = matrix
      .row(rowIdx)
      // @ts-ignore It can reduce.
      .reduce(
        (best: number, value: DTypeValue<DT>, col: number, arr: DType<DT>) =>
          arr[best] > value ? best : col,
        0
      );
    const oneHot = new Array(matrix.nCols).fill(zero);
    oneHot[hotCol] = one;
    matrix.setRow(rowIdx, oneHot);
  }
  return matrix;
}
43 changes: 43 additions & 0 deletions packages/utilities/src/encoding/termfrequency.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { Matrix } from "../mod.ts";
import type { DataType } from "../utils/common_types.ts";

/**
 * Convert 2D array of indices into multi-hot encoded vectors
 * where each index contains the number of times the respective
 * value appears in a sample (term frequency encoder).
 */
export class TfEncoder {
  /** Size of encoded vectors, i.e. the number of columns in the output. */
  mappingSize: number;
  constructor(mappingSize: number) {
    this.mappingSize = mappingSize;
  }
  /**
   * Encode rows of indices into count vectors.
   *
   * @param targets One row of category indices per sample, either as a
   * Matrix or as a plain 2D array.
   * @param dType Data type of the result. Required for plain arrays;
   * for a Matrix input it defaults to the input's own dType.
   * @returns A `targets.length` x `mappingSize` matrix in which cell
   * `(i, k)` has been incremented once for every occurrence of `k` in
   * sample `i`.
   * @throws Error if `targets` is a plain array and `dType` is missing.
   */
  transform<DT extends DataType>(targets: Matrix<DT>): Matrix<DT>;
  transform<DT extends DataType>(targets: number[][], dType: DT): Matrix<DT>;
  transform<DT extends DataType>(
    targets: number[][] | Matrix<DT>,
    dType?: DT
  ): Matrix<DT> {
    if (!dType && !(targets instanceof Matrix))
      throw new Error("dType required when not dealing with matrices.");
    const dataType = dType || (targets as Matrix<DT>).dType;
    const res = new Matrix<DT>(dataType, [targets.length, this.mappingSize]);
    for (let i = 0; i < targets.length; i += 1) {
      const row = targets instanceof Matrix ? targets.row(i) : targets[i];
      for (let j = 0; j < row.length; j += 1) {
        const index = Number(row[j]);
        // The valid column range is defined by the encoder's width, not by
        // how many indices this particular sample happens to contain.
        // Negative or fractional values cannot address a column either.
        if (
          !Number.isInteger(index) ||
          index < 0 ||
          index >= this.mappingSize
        ) {
          continue;
        }
        res.setAdd(i, index, 1);
      }
    }
    return res;
  }
}
56 changes: 56 additions & 0 deletions packages/utilities/src/mapper/discrete.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
 * Map discrete values into numbers.
 *
 * Each distinct value seen by `fit` is assigned the next free integer,
 * starting at 0, in order of first appearance.
 */
export class DiscreteMapper<T> {
  /** Map categories to indices */
  mapping: Map<T, number>;
  /** An internal counter holding the next index to hand out. */
  #lastToken: Uint32Array;
  constructor() {
    this.mapping = new Map();
    this.#lastToken = new Uint32Array(1);
  }
  /**
   * Extend the mapping with every value in `targets` that is not mapped yet.
   * Already known values keep their index, so `fit` can be called repeatedly.
   *
   * @returns This mapper, to allow chaining.
   */
  fit(targets: T[]): this {
    for (const target of targets) {
      if (!this.mapping.has(target)) {
        this.mapping.set(target, this.#incrementToken());
      }
    }
    return this;
  }
  /**
   * Encode values into their respective mappings.
   * Returns -1 in case of missing mapping.
   */
  transform(targets: T[]): number[] {
    const res = new Array<number>(targets.length);
    for (let i = 0; i < targets.length; i += 1) {
      res[i] = this.mapping.get(targets[i]) ?? -1;
    }
    return res;
  }
  /**
   * Convert mapped numbers into actual values.
   *
   * Indices without a mapping yield the string `"__unknown__"`, regardless
   * of `T`. Mapped values are returned as-is, including falsy ones such as
   * `0`, `""` or `false`.
   */
  untransform(data: number[]): T[] {
    // Build the reverse lookup once per call instead of scanning the whole
    // mapping for every element. The first key wins for a duplicated index,
    // matching the scan order of getOg.
    const inverse = new Map<number, T>();
    for (const [value, index] of this.mapping) {
      if (!inverse.has(index)) {
        inverse.set(index, value);
      }
    }
    const res = new Array<T | string>(data.length);
    for (let i = 0; i < res.length; i += 1) {
      const index = data[i];
      // Test for presence rather than truthiness so that falsy originals
      // are not mistaken for "missing".
      res[i] = inverse.has(index) ? (inverse.get(index) as T) : "__unknown__";
    }
    return res as T[];
  }
  /**
   * Look up the original value for a mapped index.
   *
   * @returns The first key whose index equals `data`, or `undefined` when
   * no key maps to it.
   */
  getOg(data: number): T | undefined {
    for (const [k, v] of this.mapping.entries()) {
      if (v === data) {
        return k;
      }
    }
    return undefined;
  }
  /** Hand out the current counter value and advance the counter by one. */
  #incrementToken(): number {
    // Atomics.add returns the value held *before* the addition.
    return Atomics.add(this.#lastToken, 0, 1);
  }
}
1 change: 1 addition & 0 deletions packages/utilities/src/mapper/mod.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export { DiscreteMapper } from "./discrete.ts";
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import type { StandardizeConfig } from "../../utils/common_types.ts";
import type { StandardizeConfig } from "../utils/common_types.ts";
import { DefaultIgnoreList } from "../constants/stop_words.ts";

/** Simple text cleaner */
export class TextCleaner implements StandardizeConfig {
stripHtml: boolean;
lowercase: boolean;
normalizeWhiteSpaces: boolean;
stripNewlines: boolean;
removeStopWords: false | "english" | string[];
constructor({
stripHtml = false,
lowercase = false,
normalizeWhiteSpaces = true,
stripNewlines = true,
removeStopWords = false,
}: StandardizeConfig = {}) {
this.stripHtml = stripHtml;
this.lowercase = lowercase;
this.normalizeWhiteSpaces = normalizeWhiteSpaces;
this.stripNewlines = stripNewlines;
this.removeStopWords = removeStopWords;
}
clean(text: string): string;
clean(text: string[]): string[];
Expand All @@ -35,7 +39,8 @@ export function preprocess(
lowercase = false,
normalizeWhiteSpaces = true,
stripNewlines = true,
}: StandardizeConfig = {},
removeStopWords = false,
}: StandardizeConfig = {}
): string {
if (lowercase) {
text = text.toLowerCase();
Expand All @@ -49,5 +54,13 @@ export function preprocess(
if (normalizeWhiteSpaces) {
text = text.replace(/\s\s+/g, " ");
}
if (removeStopWords) {
const stopWords =
removeStopWords === "english" ? DefaultIgnoreList : removeStopWords;
text = text
.split(" ")
.filter((x) => !stopWords.includes(x))
.join(" ");
}
return text;
}
3 changes: 2 additions & 1 deletion packages/utilities/src/text/mod.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
* @module
*/

export * from "./preprocess/mod.ts";
export * from "./cleaner.ts";
export * from "./vectorizer.ts"
Loading

0 comments on commit edccb60

Please sign in to comment.