Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tmp] Mergefix for zeeshan/markdown-json #113

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ const result = await zerox({
openaiAPIKey: process.env.OPENAI_API_KEY,

// Optional
chunk: false, // Return JSON array of elements on each page
cleanup: true, // Clear images from tmp after run.
concurrency: 10, // Number of pages to run at a time.
correctOrientation: true, // True by default, attempts to identify and correct page orientation.
Expand All @@ -98,6 +99,7 @@ Request #3 => page_2_markdown + page_3_image

```js
{
chunks: [],
completionTime: 10038,
fileName: 'invoice_36258',
inputTokens: 25543,
Expand Down
73 changes: 54 additions & 19 deletions node-zerox/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ import {
terminateScheduler,
} from "./utils";
import { getCompletion } from "./openAI";
import { ModelOptions, ZeroxArgs, ZeroxOutput } from "./types";
import {
ModelOptions,
ProcessedNode,
ProcessPageResponseBody,
ZeroxArgs,
ZeroxOutput,
} from "./types";
import { validateLLMParams } from "./utils";
import fs from "fs-extra";
import os from "os";
Expand All @@ -20,6 +26,7 @@ import { NUM_STARTING_WORKERS } from "./constants";
import Tesseract from "tesseract.js";

export const zerox = async ({
chunk = false,
cleanup = true,
concurrency = 10,
correctOrientation = true,
Expand Down Expand Up @@ -72,8 +79,8 @@ export const zerox = async ({
tempDir || os.tmpdir(),
`zerox-temp-${rand}`
);
const sourceDirectory = path.join(tempDirectory, 'source')
const processedDirectory = path.join(tempDirectory, 'processed')
const sourceDirectory = path.join(tempDirectory, "source");
const processedDirectory = path.join(tempDirectory, "processed");
await fs.ensureDir(sourceDirectory);
await fs.ensureDir(processedDirectory);

Expand Down Expand Up @@ -140,18 +147,26 @@ export const zerox = async ({
// Get list of converted images
const files = await fs.readdir(processedDirectory);
const images = files.filter((file) => file.endsWith(".png"));
const chunks: ProcessedNode[] = [];

if (maintainFormat) {
// Use synchronous processing
for (const image of images) {
for (const [i, image] of images.entries()) {
const imagePath = path.join(processedDirectory, image);
try {
const { content, inputTokens, outputTokens } = await getCompletion({
const {
chunks: pageChunks,
content,
inputTokens,
outputTokens,
} = await getCompletion({
apiKey: openaiAPIKey,
chunk,
imagePath,
llmParams,
maintainFormat,
model,
pageNumber: i,
priorPage,
});
const formattedMarkdown = formatMarkdown(content);
Expand All @@ -163,6 +178,7 @@ export const zerox = async ({

// Add all markdown results to array
aggregatedMarkdown.push(formattedMarkdown);
chunks.push(...pageChunks);
} catch (error) {
console.error(`Failed to process image ${image}:`, error);
throw error;
Expand All @@ -173,21 +189,24 @@ export const zerox = async ({
const processPage = async (
image: string,
pageNumber: number
): Promise<string | null> => {
): Promise<ProcessPageResponseBody> => {
const imagePath = path.join(processedDirectory, image);
try {
if (onPreProcess) {
await onPreProcess({ imagePath, pageNumber });
}

const { content, inputTokens, outputTokens } = await getCompletion({
apiKey: openaiAPIKey,
imagePath,
llmParams,
maintainFormat,
model,
priorPage,
});
const { chunks, content, inputTokens, outputTokens } =
await getCompletion({
apiKey: openaiAPIKey,
chunk,
imagePath,
llmParams,
maintainFormat,
model,
pageNumber,
priorPage,
});
const formattedMarkdown = formatMarkdown(content);
inputTokenCount += inputTokens;
outputTokenCount += outputTokens;
Expand All @@ -200,7 +219,7 @@ export const zerox = async ({
}

// Add all markdown results to array
return formattedMarkdown;
return { formattedMarkdown, chunks };
} catch (error) {
console.error(`Failed to process image ${image}:`, error);
throw error;
Expand All @@ -209,7 +228,7 @@ export const zerox = async ({

// Function to process pages with concurrency limit
const processPagesInBatches = async (images: string[], limit: Limit) => {
const results: (string | null)[] = [];
const results: ProcessPageResponseBody[] = [];

const promises = images.map((image, index) =>
limit(() =>
Expand All @@ -225,8 +244,18 @@ export const zerox = async ({

const limit = pLimit(concurrency);
const results = await processPagesInBatches(images, limit);
const filteredResults = results.filter(isString);
aggregatedMarkdown.push(...filteredResults);
const filteredResults = results.filter(
(r) => r && isString(r.formattedMarkdown)
);
aggregatedMarkdown.push(
...filteredResults.map((r) => r!.formattedMarkdown)
);
chunks.push(
...filteredResults.reduce((acc: ProcessedNode[], r) => {
acc.push(...r!.chunks);
return acc;
}, [])
);
}

// Write the aggregated markdown to a file
Expand Down Expand Up @@ -256,10 +285,16 @@ export const zerox = async ({
pageNumber = pagesToConvertAsImages;
}

return { content: el, page: pageNumber, contentLength: el.length };
return {
chunks,
content: el,
contentLength: el.length,
page: pageNumber,
};
});

return {
chunks,
completionTime,
fileName,
inputTokens: inputTokenCount,
Expand Down
19 changes: 17 additions & 2 deletions node-zerox/src/openAI.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import { CompletionArgs, CompletionResponse } from "./types";
import { convertKeysToSnakeCase, encodeImageToBase64 } from "./utils";
import { CompletionArgs, CompletionResponse, ProcessedNode } from "./types";
import {
convertKeysToSnakeCase,
encodeImageToBase64,
markdownToJson,
} from "./utils";
import axios from "axios";

export const getCompletion = async ({
apiKey,
chunk,
imagePath,
llmParams,
maintainFormat,
model,
pageNumber,
priorPage,
}: CompletionArgs): Promise<CompletionResponse> => {
const systemPrompt = `
Expand Down Expand Up @@ -57,8 +63,17 @@ export const getCompletion = async ({
);

const data = response.data;
let chunks: ProcessedNode[] = [];

if (chunk) {
chunks = await markdownToJson(
data.choices[0].message.content,
pageNumber
);
}

return {
chunks,
content: data.choices[0].message.content,
inputTokens: data.usage.prompt_tokens,
outputTokens: data.usage.completion_tokens,
Expand Down
84 changes: 84 additions & 0 deletions node-zerox/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export interface ZeroxArgs {
chunk?: boolean;
cleanup?: boolean;
concurrency?: number;
correctOrientation?: boolean;
Expand Down Expand Up @@ -28,12 +29,14 @@ export enum ModelOptions {
}

/** Per-page result included in the zerox output. */
export interface Page {
// Structured nodes parsed from this page's markdown; empty unless the `chunk` option is enabled.
chunks: ProcessedNode[];
// Formatted markdown content extracted from the page.
content: string;
// Character length of `content`.
contentLength: number;
// 1-based page number within the source document.
page: number;
}

export interface ZeroxOutput {
chunks: ProcessedNode[];
completionTime: number;
fileName: string;
inputTokens: number;
Expand All @@ -42,17 +45,20 @@ export interface ZeroxOutput {
}

/** Result of a single model completion for one page image (returned by getCompletion). */
export interface CompletionResponse {
// Nodes produced by markdownToJson from the returned markdown; empty when chunking is disabled.
chunks: ProcessedNode[];
// Raw markdown content returned by the model.
content: string;
// Prompt token count reported by the API (usage.prompt_tokens).
inputTokens: number;
// Completion token count reported by the API (usage.completion_tokens).
outputTokens: number;
}

/** Arguments accepted by getCompletion for a single page request. */
export interface CompletionArgs {
// API key used to authenticate the completion request.
apiKey: string;
// When true, the returned markdown is also parsed into structured chunks.
chunk: boolean;
// Path to the page image to send with the request.
imagePath: string;
// Optional overrides for model parameters (temperature, topP, ...).
llmParams?: LLMParams;
// When true, prior page markdown is supplied to keep formatting consistent across pages.
maintainFormat: boolean;
// Model identifier to request.
model: ModelOptions | string;
// 1-based page number, attached to parsed chunks for provenance.
pageNumber: number;
// Markdown of the previously processed page (used when maintainFormat is set).
priorPage: string;
}

Expand All @@ -63,3 +69,81 @@ export interface LLMParams {
temperature?: number;
topP?: number;
}

/**
 * Result of processing one page image: the page's formatted markdown plus any
 * structured chunks, or null when no result is produced.
 */
export type ProcessPageResponseBody = {
chunks: ProcessedNode[];
formattedMarkdown: string;
} | null;

// Markdown AST node types recognized while converting markdown to JSON.
// Members marked "ignored" are skipped during conversion.
// Source: https://github.com/syntax-tree/mdast?tab=readme-ov-file
export enum MdNodeType {
blockquote = "blockquote",
break = "break", // ignored
code = "code",
definition = "definition", // ignored
emphasis = "emphasis",
heading = "heading",
html = "html",
image = "image", // ignored
imageReference = "imageReference", // ignored
inlineCode = "inlineCode",
link = "link",
linkReference = "linkReference", // ignored
list = "list",
listItem = "listItem",
paragraph = "paragraph",
root = "root",
strong = "strong",
table = "table",
tableCell = "tableCell",
tableRow = "tableRow",
text = "text",
thematicBreak = "thematicBreak", // ignored
}

/** Node kinds that the markdown-to-JSON conversion emits (the `type` discriminant of ProcessedNode). */
export enum ConvertedNodeType {
heading = "heading",
list = "list",
table = "table",
text = "text",
}
/** Fields shared by every converted node. */
export interface BaseNode {
// Unique identifier for this node.
id: string;
// 1-based page number the node came from, when known.
page?: number;
// id of the enclosing node (e.g. a parent heading), when one exists.
parentId?: string;
}

/** Plain text content node. */
export interface TextNode extends BaseNode {
type: ConvertedNodeType.text;
value: string;
}

/** Heading node; `value` is the heading text. */
export interface HeadingNode extends BaseNode {
type: ConvertedNodeType.heading;
value: string;
}

/** List node; `value` holds the list's items in order. */
export interface ListNode extends BaseNode {
type: ConvertedNodeType.list;
value: ListItem[];
}

/** A single item within a ListNode. */
export interface ListItem {
// Unique identifier for this list item.
id: string;
// Text content of the item.
value: string;
}

/** Table node; rows are keyed by the header column names. */
export interface TableNode extends BaseNode {
type: ConvertedNodeType.table;
value: {
// Column names from the table's header row.
header: string[];
// One record per body row, mapping header name to cell text.
rows: Record<string, string>[];
};
}

export type ProcessedNode = HeadingNode | ListNode | TableNode | TextNode;

// NOTE(review): presumably tracks the current ancestor heading (by depth) while
// assigning parentId during markdown-to-JSON conversion — confirm in utils.markdownToJson.
export interface ParentId {
// Heading depth at which this id applies.
depth: number;
// id of the candidate parent node.
id: string;
}
Loading