Commit ee217d0
feat: Revamp utilities + sentiment analysis example (#62)
* chore: bump deps (#60) (#61)

* update with tfidf

* fix classifier output size

* remove dropout

* updated model

* update tester and analyzer

* add commands

* remove log

* completely revamp utilities

* delete split

---------

Co-authored-by: Dean Srebnik <[email protected]>
retraigo and load1n9 authored Sep 7, 2024
1 parent e905984 commit ee217d0
Showing 29 changed files with 396 additions and 444 deletions.
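The heart of the revamp: the sentiment example previously fed raw CountVectorizer counts straight into the network. It now fits a TfIdfTransformer on those counts, trains on the weighted matrix, and persists the vocabulary, idf vector, and label mapping as JSON artifacts so the tester and analyzer can rebuild the same feature space. A minimal sketch of the training-side flow; the jsr version pin is omitted, and the fit calls are assumed from the usual fit/transform convention, since that part of classifier.ts is collapsed below:

```ts
// Sketch of the new training-side feature pipeline (assumptions flagged inline).
import {
  CountVectorizer,
  SplitTokenizer,
  TfIdfTransformer,
} from "jsr:@denosaurs/netsaur/utilities/text"; // version pin omitted here

const trainX = ["I love this", "I hate this", "this is fine"];

const tokenizer = new SplitTokenizer({
  skipWords: "english",
  standardize: { lowercase: true, stripNewlines: true },
});
tokenizer.fit(trainX); // assumed: builds the vocabulary from the training text

// Raw term counts per document, then tf-idf weighting on top.
const counts = new CountVectorizer(tokenizer.vocabulary.size)
  .transform(tokenizer.transform(trainX), "f32");
const transformer = new TfIdfTransformer();
transformer.fit(counts); // assumed: learns the idf vector from the counts
const tfidfX = transformer.transform<"f32">(counts);

// classifier.ts then persists tokenizer.vocabulary, transformer.idf, and the
// label mapping as JSON so tester.ts and analyzer.ts can rebuild this exact
// feature space without refitting.
console.log(tfidfX.nCols); // the network's input width (size: [4, tfidfX.nCols])
```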
deno.jsonc (6 changes: 5 additions & 1 deletion)

@@ -22,6 +22,8 @@
"./utilities/image": "./packages/utilities/src/image/mod.ts",
"./utilities/metrics": "./packages/utilities/src/metrics/mod.ts",
"./utilities/encoding": "./packages/utilities/src/encoding/mod.ts",
"./utilities/mapper": "./packages/utilities/src/mapper/mod.ts",
"./utilities/transformer": "./packages/utilities/src/transformer/mod.ts",
"./utilities/misc": "./packages/utilities/src/utils/mod.ts",
// Tokenizers
"./tokenizers": "./packages/tokenizers/mod.ts",
@@ -40,7 +42,9 @@
"example:multiple-linear": "deno -A ./examples/multiple-linear/student.ts",
"example:binary": "deno -A ./examples/classification/binary_iris.ts",
"example:multiclass": "deno -A ./examples/classification/iris.ts",
"example:text-sentiment": "deno -A ./examples/sentiment-analysis/classifier.ts",
"example:sentiment-train": "deno -A ./examples/sentiment-analysis/classifier.ts",
"example:sentiment-test": "deno -A ./examples/sentiment-analysis/tester.ts",
"example:sentiment-try": "deno -A ./examples/sentiment-analysis/analyzer.ts",
"example:text-spam": "deno -A ./examples/classification/spam.ts",
"example:filters": "deno -A examples/filters/conv.ts ",
"example:train": "deno -A examples/model/train.ts ",
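The one-shot example:text-sentiment task becomes three stages, so the trained artifacts can be reused without retraining. Each runs with deno task, which resolves these names from deno.jsonc:

```sh
deno task example:sentiment-train  # classifier.ts: fit, then write sentiment.st + vocab/tfidf/mappings JSON
deno task example:sentiment-test   # tester.ts: evaluate on a held-out split, print a ClassificationReport
deno task example:sentiment-try    # analyzer.ts: prompt for a line of text and classify it
```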
examples/classification/spam.ts (2 changes: 1 addition & 1 deletion)

@@ -94,7 +94,7 @@ net.train(
   // Train for 20 epochs
   20,
   2,
-  0.01
+  0.001
 );
 
 console.log(`training time: ${performance.now() - time}ms`);
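The spam example's only change is the learning rate, cut from 0.01 to 0.001; a smaller step trades speed for stability. Reconstructed from the hunk's context, the tuned call looks roughly like this (netsaur's train takes datasets, epochs, batches, rate; the dataset argument sits outside the hunk, so its shape below is an assumption):

```ts
// Sketch only: `net`, `trainX`, and `trainY` come from the rest of spam.ts.
net.train(
  [{ inputs: tensor(trainX), outputs: tensor(trainY) }], // assumed dataset shape
  // Train for 20 epochs
  20,
  2, // batches
  0.001, // learning rate, down from 0.01
);
```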
examples/sentiment-analysis/analyzer.ts (50 changes: 17 additions & 33 deletions)

@@ -1,70 +1,54 @@
 import { CPU, setupBackend, tensor } from "jsr:@denosaurs/[email protected]";
 import { Sequential } from "jsr:@denosaurs/[email protected]/core";
 
-import {
-    useSplit,
-    ClassificationReport,
-    MatrixLike,
-} from "jsr:@denosaurs/[email protected]/utilities";
+import type { MatrixLike } from "jsr:@denosaurs/[email protected]/utilities";
 
 import { CategoricalEncoder } from "jsr:@denosaurs/[email protected]/utilities/encoding";
 import {
-    CountVectorizer,
-    SplitTokenizer,
+  CountVectorizer,
+  SplitTokenizer,
+  TfIdfTransformer,
 } from "jsr:@denosaurs/[email protected]/utilities/text";
 
-import Mappings from "./mappings.json" with {type: "json"}
-import Vocab from "./vocab.json" with {type: "json"}
-
-
-console.time("Time Elapsed");
-
-console.log("\nImports loaded.");
-
+import Mappings from "./mappings.json" with { type: "json" };
+import Vocab from "./vocab.json" with { type: "json" };
+import Idf from "./tfidf.json" with { type: "json" };
 
 const vocab = new Map();
 
 for (const entry of Vocab) {
-  vocab.set(entry[0], entry[1])
+  vocab.set(entry[0], entry[1]);
 }
 
 const tokenizer = new SplitTokenizer({
-    skipWords: "english",
-    vocabulary: vocab,
-    standardize: { lowercase: true, stripNewlines: true },
+  skipWords: "english",
+  vocabulary: vocab,
+  standardize: { lowercase: true, stripNewlines: true },
 });
 
 const vectorizer = new CountVectorizer(tokenizer.vocabulary.size);
-
-console.log("\nX vectorized");
-console.timeLog("Time Elapsed");
+const transformer = new TfIdfTransformer({ idf: Float64Array.from(Idf) });
 
 const encoder = new CategoricalEncoder<string>();
 const mappings = new Map();
 
 for (const entry of Mappings) {
-  mappings.set(entry[0], entry[1])
+  mappings.set(entry[0], entry[1]);
 }
 
 encoder.mapping = mappings;
 
-console.log("\nCPU Backend Loading");
-console.timeLog("Time Elapsed");
-
 await setupBackend(CPU);
 
-console.log("\nCPU Backend Loaded");
-console.timeLog("Time Elapsed");
-
-const net = Sequential.loadFile("examples/sentiment-analysis/sentiment.st")
+const net = Sequential.loadFile("examples/sentiment-analysis/sentiment.st");
 
-const text = prompt("Text to analyze?") || "hello world"
+const text = prompt("Text to analyze?") || "hello world";
 
 const predYSoftmax = await net.predict(
-  tensor(vectorizer.transform(tokenizer.transform([text]), "f32"))
+  tensor(transformer.transform<"f32">(vectorizer.transform(tokenizer.transform([text]), "f32"))),
 );
 
 CategoricalEncoder.fromSoftmax<"f32">(predYSoftmax as MatrixLike<"f32">);
 const predY = encoder.untransform(predYSoftmax as MatrixLike<"f32">);
 
-console.log(`The sentiment predicted is ${predY[0]}`)
+console.log(`The sentiment predicted is ${predY[0]}`);
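Worth noticing what the analyzer no longer does: no CSV parsing, no splitting, no fitting, and no useSplit/ClassificationReport imports. Vocabulary, idf weights, and the label mapping all come from the JSON artifacts the trainer wrote, and the TfIdfTransformer is rebuilt from its idf vector alone. Its weighting is elementwise, in miniature (whether the library applies any further normalization is not visible in this diff):

```ts
// One document's raw term counts, and idf weights as loaded from tfidf.json.
const counts = Float64Array.from([2, 0, 1]);
const idf = Float64Array.from([0.1, 2.3, 1.0]);

// tf-idf = tf * idf: terms common across documents are damped,
// rare-but-present terms stand out.
const tfidf = counts.map((tf, i) => tf * idf[i]);
console.log(tfidf); // Float64Array(3) [ 0.2, 0, 1 ]
```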
examples/sentiment-analysis/classifier.ts (9 changes: 3 additions & 6 deletions)

@@ -2,13 +2,11 @@ import {
   AdamOptimizer,
   Cost,
   CPU,
-  Dropout1DLayer,
   Init,
   setupBackend,
   tensor,
 } from "jsr:@denosaurs/[email protected]";
 import { Sequential } from "jsr:@denosaurs/[email protected]/core";
-import { NadamOptimizer } from "jsr:@denosaurs/[email protected]/core/optimizers";
 import {
   DenseLayer,
   ReluLayer,
@@ -18,7 +16,7 @@ import {
 import {
   useSplit,
   ClassificationReport,
-  MatrixLike,
+  type MatrixLike,
 } from "jsr:@denosaurs/[email protected]/utilities";
 
 import { CategoricalEncoder } from "jsr:@denosaurs/[email protected]/utilities/encoding";
@@ -103,7 +101,7 @@ Deno.writeTextFileSync(
 );
 Deno.writeTextFileSync(
   "examples/sentiment-analysis/tfidf.json",
-  JSON.stringify(transformer.idf)
+  JSON.stringify(Array.from(transformer.idf as Float64Array))
 );
 
 console.log("\nCPU Backend Loading");
@@ -115,7 +113,7 @@ console.log("\nCPU Backend Loaded");
 console.timeLog("Time Elapsed");
 
 const net = new Sequential({
-  size: [4, vecX.nCols],
+  size: [4, tfidfX.nCols],
   layers: [
     DenseLayer({ size: [256], init: Init.Kaiming }),
     ReluLayer(),
@@ -127,7 +125,6 @@ const net = new Sequential({
     ReluLayer(),
     DenseLayer({ size: [16], init: Init.Kaiming }),
     ReluLayer(),
-    Dropout1DLayer({ probability: 0.5 }),
     DenseLayer({ size: [encoder.mapping.size], init: Init.Kaiming }),
     SoftmaxLayer(),
   ],
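Three changes hide in this diff: the Dropout1DLayer (and the unused NadamOptimizer import) is dropped, the input layer's width now tracks the tf-idf matrix (tfidfX.nCols) instead of the raw count matrix, and the idf vector passes through Array.from before serialization. The last one matters because JSON.stringify turns a typed array into an index-keyed object, which Float64Array.from cannot round-trip:

```ts
const idf = new Float64Array([0.5, 1.25]);

// A typed array stringifies as an object, not an array:
JSON.stringify(idf); // '{"0":0.5,"1":1.25}'

// Going through a plain array keeps tfidf.json loadable by tester/analyzer:
const json = JSON.stringify(Array.from(idf)); // '[0.5,1.25]'
const restored = Float64Array.from(JSON.parse(json));
console.log(restored); // Float64Array(2) [ 0.5, 1.25 ]
```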
examples/sentiment-analysis/mappings.json (2 changes: 1 addition & 1 deletion)

@@ -1 +1 @@
[["empty",0],["sadness",1],["neutral",2],["worry",3],["surprise",4],["fun",5],["hate",6],["happiness",7],["enthusiasm",8],["love",9],["relief",10],["boredom",11],["anger",12]]
[["empty",0],["sadness",1],["enthusiasm",2],["neutral",3],["worry",4],["surprise",5],["love",6],["fun",7],["hate",8],["happiness",9],["boredom",10],["relief",11],["anger",12]]
examples/sentiment-analysis/sentiment.st (binary file modified; contents not shown)
examples/sentiment-analysis/tester.ts (79 changes: 31 additions & 48 deletions)

@@ -2,96 +2,79 @@ import { CPU, setupBackend, tensor } from "jsr:@denosaurs/[email protected]";
 import { Sequential } from "jsr:@denosaurs/[email protected]/core";
 
 import {
-    useSplit,
-    ClassificationReport,
-    MatrixLike,
+  ClassificationReport,
+  type MatrixLike,
+  useSplit,
 } from "jsr:@denosaurs/[email protected]/utilities";
 
 import { CategoricalEncoder } from "jsr:@denosaurs/[email protected]/utilities/encoding";
 import {
-    CountVectorizer,
-    SplitTokenizer,
+  CountVectorizer,
+  SplitTokenizer,
+  TfIdfTransformer,
 } from "jsr:@denosaurs/[email protected]/utilities/text";
 
-import Mappings from "./mappings.json" with {type: "json"}
-import Vocab from "./vocab.json" with {type: "json"}
+import Mappings from "./mappings.json" with { type: "json" };
+import Vocab from "./vocab.json" with { type: "json" };
+import Idf from "./tfidf.json" with { type: "json" };
 
 import { parse as parseCsv } from "jsr:@std/[email protected]/parse";
 
-
-console.time("Time Elapsed");
-
-console.log("\nImports loaded.");
-
 const file = Deno.readTextFileSync(
-  "examples/sentiment-analysis/text_emotion.csv"
-);
-
-console.log("\nData file loaded.");
-console.timeLog("Time Elapsed");
-
-const data = parseCsv(file, { skipFirstRow: true }) as {
+  "examples/sentiment-analysis/text_emotion.csv",
+);
+
+const data = parseCsv(file, { skipFirstRow: true }) as {
   sentiment: string;
   content: string;
-}[];
-const text = data.map((x) => x.content);
+}[];
+const text = data.map((x) => x.content);
 const labels = data.map((x) => x.sentiment);
 
-console.log("\nCSV Parsed");
-console.timeLog("Time Elapsed");
-
-const [[trainX, trainY], [testX, testY]] = useSplit(
-  { shuffle: true, ratio: [7, 3] },
-  text,
-  labels
+const [[_trainX, _trainY], [testX, testY]] = useSplit(
+  { shuffle: true, ratio: [7, 3] },
+  text,
+  labels,
 );
 
-console.log("Data Split");
-console.timeLog("Time Elapsed");
-
 const vocab = new Map();
 
 for (const entry of Vocab) {
-  vocab.set(entry[0], entry[1])
+  vocab.set(entry[0], entry[1]);
 }
 
 const tokenizer = new SplitTokenizer({
-    skipWords: "english",
-    vocabulary: vocab,
-    standardize: { lowercase: true, stripNewlines: true },
+  skipWords: "english",
+  vocabulary: vocab,
+  standardize: { lowercase: true, stripNewlines: true },
 });
 
 const vectorizer = new CountVectorizer(tokenizer.vocabulary.size);
-
-console.log("\nX vectorized");
-console.timeLog("Time Elapsed");
+const transformer = new TfIdfTransformer({ idf: Float64Array.from(Idf) });
 
 const encoder = new CategoricalEncoder<string>();
 const mappings = new Map();
 
 for (const entry of Mappings) {
-  mappings.set(entry[0], entry[1])
+  mappings.set(entry[0], entry[1]);
 }
 
 encoder.mapping = mappings;
 
-console.log("\nCPU Backend Loading");
-console.timeLog("Time Elapsed");
-
 await setupBackend(CPU);
 
-console.log("\nCPU Backend Loaded");
-console.timeLog("Time Elapsed");
-
-const net = Sequential.loadFile("examples/sentiment-analysis/sentiment.st")
+const net = Sequential.loadFile("examples/sentiment-analysis/sentiment.st");
 
 const predYSoftmax = await net.predict(
-  tensor(vectorizer.transform(tokenizer.transform(testX), "f32"))
+  tensor(
+    transformer.transform<"f32">(
+      vectorizer.transform(tokenizer.transform(testX), "f32"),
+    ),
+  ),
 );
 
 CategoricalEncoder.fromSoftmax<"f32">(predYSoftmax as MatrixLike<"f32">);
 const predY = encoder.untransform(predYSoftmax as MatrixLike<"f32">);
 
 console.log(new ClassificationReport(testY, predY));
 
-console.log(testY, predY)
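The tester mirrors the analyzer's artifact loading, keeps only the evaluation side of the 7:3 split (the train halves are underscore-prefixed and unused), and ends on the ClassificationReport rather than dumping raw label arrays. One caveat: useSplit re-draws a shuffled split here, independently of the training run, so this 30% is not guaranteed to be disjoint from the data the model trained on. The report itself just compares two label arrays; a toy run (module path as in the diff, version pin omitted):

```ts
import { ClassificationReport } from "jsr:@denosaurs/netsaur/utilities";

const testY = ["happy", "sad", "happy", "sad"];
const predY = ["happy", "happy", "happy", "sad"];

// Prints per-class metrics (precision/recall-style) for the two label arrays.
console.log(new ClassificationReport(testY, predY));
```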
examples/sentiment-analysis/tfidf.json (1 change: 1 addition & 0 deletions; large diff not rendered)

examples/sentiment-analysis/vocab.json (2 changes: 1 addition & 1 deletion; large diff not rendered)

packages/utilities/src/encoding/categorical.ts (92 changes: 0 additions & 92 deletions; file deleted). The examples still import CategoricalEncoder from the ./utilities/encoding export, so the class presumably moved within the revamped utilities.

(Diffs for the remaining changed files are not rendered here.)
