From 314af9895fafea2a36e082f295338a2278dfe7a3 Mon Sep 17 00:00:00 2001 From: nl_0 Date: Wed, 10 Jul 2024 17:39:22 +0200 Subject: [PATCH] image and document preview tool --- .../app/components/Assistant/Model/Content.ts | 27 +- .../Assistant/Model/GlobalTools/index.ts | 13 + .../Assistant/Model/GlobalTools/preview.ts | 426 ++++++++++++++++++ .../{GlobalTools.ts => GlobalTools/search.ts} | 27 +- .../app/components/Assistant/Model/Tool.ts | 6 + .../app/containers/Bucket/Qurator/Context.tsx | 74 +-- 6 files changed, 472 insertions(+), 101 deletions(-) create mode 100644 catalog/app/components/Assistant/Model/GlobalTools/index.ts create mode 100644 catalog/app/components/Assistant/Model/GlobalTools/preview.ts rename catalog/app/components/Assistant/Model/{GlobalTools.ts => GlobalTools/search.ts} (72%) diff --git a/catalog/app/components/Assistant/Model/Content.ts b/catalog/app/components/Assistant/Model/Content.ts index 398c88810b4..ab401450aa5 100644 --- a/catalog/app/components/Assistant/Model/Content.ts +++ b/catalog/app/components/Assistant/Model/Content.ts @@ -4,16 +4,18 @@ import { JsonRecord } from 'utils/types' // XXX: schema for en/decoding to/from aws bedrock types? -export type DocumentFormat = - | 'pdf' - | 'csv' - | 'doc' - | 'docx' - | 'xls' - | 'xlsx' - | 'html' - | 'txt' - | 'md' +export const DOCUMENT_FORMATS = [ + 'pdf', + 'csv', + 'doc', + 'docx', + 'xls', + 'xlsx', + 'html', + 'txt', + 'md', +] as const +export type DocumentFormat = (typeof DOCUMENT_FORMATS)[number] export interface DocumentBlock { format: DocumentFormat @@ -22,7 +24,8 @@ export interface DocumentBlock { source: Buffer | Uint8Array | Blob | string } -export type ImageFormat = 'png' | 'jpeg' | 'gif' | 'webp' +export const IMAGE_FORMATS = ['png', 'jpeg', 'gif', 'webp'] as const +export type ImageFormat = (typeof IMAGE_FORMATS)[number] export interface ImageBlock { format: ImageFormat @@ -63,7 +66,7 @@ export type ToolResultStatus = 'success' | 'error' export interface ToolResultBlock { toolUseId: string content: ToolResultContentBlock[] - status?: ToolResultStatus + status: ToolResultStatus } export type ResponseMessageContentBlock = Eff.Data.TaggedEnum<{ diff --git a/catalog/app/components/Assistant/Model/GlobalTools/index.ts b/catalog/app/components/Assistant/Model/GlobalTools/index.ts new file mode 100644 index 00000000000..77dab3424dc --- /dev/null +++ b/catalog/app/components/Assistant/Model/GlobalTools/index.ts @@ -0,0 +1,13 @@ +import * as Tool from '../Tool' + +import { useStartSearch } from './search' +import { useGetObject } from './preview' + +export function useGlobalTools(): Tool.Collection { + return { + catalog_global_startSearch: useStartSearch(), + catalog_global_getObject: useGetObject(), + } +} + +export { useGlobalTools as use } diff --git a/catalog/app/components/Assistant/Model/GlobalTools/preview.ts b/catalog/app/components/Assistant/Model/GlobalTools/preview.ts new file mode 100644 index 00000000000..8f3b8616530 --- /dev/null +++ b/catalog/app/components/Assistant/Model/GlobalTools/preview.ts @@ -0,0 +1,426 @@ +import { basename, extname } from 'path' + +import type AWSSDK from 'aws-sdk' +import * as Eff from 'effect' +import * as S from '@effect/schema/Schema' + +import cfg from 'constants/config' +import { S3ObjectLocation } from 'model/S3' +import * as AWS from 'utils/AWS' +import mkSearch from 'utils/mkSearch' + +import * as Content from '../Content' +import * as Tool from '../Tool' + +type AWSEffect = Eff.Effect.Effect + +export class S3 extends Eff.Context.Tag('S3')< + S3, + { + headObject(handle: S3ObjectLocation): AWSEffect + getObject(handle: S3ObjectLocation): AWSEffect + } +>() {} + +export const fromS3Client = (client: AWSSDK.S3) => + Eff.Layer.succeed(S3, { + headObject: (handle) => + Eff.Effect.tryPromise({ + try: () => + client + .headObject({ + Bucket: handle.bucket, + Key: handle.key, + VersionId: handle.version, + }) + .promise(), + catch: (e) => e as AWSSDK.AWSError, + }), + getObject: (handle) => + Eff.Effect.tryPromise({ + try: () => + client + .getObject({ + Bucket: handle.bucket, + Key: handle.key, + VersionId: handle.version, + }) + .promise(), + catch: (e) => e as AWSSDK.AWSError, + }), + }) + +export interface S3SignerOptions { + urlExpiration?: number // in seconds + forceProxy?: boolean +} + +export class S3Signer extends Eff.Context.Tag('S3Signer')< + S3Signer, + { + sign(handle: S3ObjectLocation, options?: S3SignerOptions): Eff.Effect.Effect + } +>() {} + +export const fromS3Signer = ( + signer: (handle: S3ObjectLocation, options?: S3SignerOptions) => string, +) => + Eff.Layer.succeed(S3Signer, { + sign: (...args) => Eff.Effect.sync(() => signer(...args)), + }) + +// The document file name can only contain: +// - alphanumeric characters +// - whitespace characters +// - hyphens +// - parentheses and square brackets +// The name can't contain more than one consecutive whitespace character +const normalizeDocumentName = (name: string) => + name + .replace(/[^a-zA-Z0-9\s\-\(\)\[\]]/g, ' ') // Remove invalid characters + .replace(/\s+/g, ' ') // Replace multiple whitespace characters with a single space + .trim() // Remove leading and trailing whitespace + +const THRESHOLD = 500 * 1024 // 500 KiB + +const GetObjectSchema = S.Struct({ + bucket: S.String, + key: S.String, + version: S.optional(S.String).annotations({}), + // XXX: force type? +}).annotations({ + description: 'Get contents and metadata of an S3 object', +}) + +// return format: +// - metadata block (text or json) +// - content block (json | text | image | document) +export function useGetObject() { + const s3Client = AWS.S3.use() + const s3Signer = AWS.Signer.useS3Signer() + + return Tool.useMakeTool( + GetObjectSchema, + Eff.flow( + getObject, + Eff.Effect.map(Eff.Option.some), + Eff.Effect.provide(fromS3Client(s3Client)), + Eff.Effect.provide(fromS3Signer(s3Signer)), + ), + [s3Client, s3Signer], + ) +} + +const getObject = (handle: S3ObjectLocation) => + Eff.Effect.gen(function* () { + const s3 = yield* S3 + yield* Eff.Console.debug('TOOL: getObjectContents', handle) + const headE = yield* Eff.Effect.either(s3.headObject(handle)) + if (Eff.Either.isLeft(headE)) { + return Tool.fail( + Content.text( + 'Error while getting S3 object metadata:\n', + `\n${headE.left}'n`, + ), + ) + } + const head = headE.right + + const metaBlock = Content.text( + 'Got S3 object metadata:\n', + `\n${JSON.stringify(head, null, 2)}\n`, + ) + + const size = head.ContentLength + if (size == null) { + return Tool.succeed( + metaBlock, + Content.text('Could not determine object content length'), + ) + } + + const fileType = detectFileType(handle.key) + + const contentBlocks: Content.ToolResultContentBlock[] = yield* FileType.$match( + fileType, + { + Image: () => + getImagePreview(handle).pipe( + Eff.Effect.map(({ format, bytes }) => + Content.ToolResultContentBlock.Image({ + format, + source: bytes as $TSFixMe, + }), + ), + Eff.Effect.catchAll((e) => + Eff.Effect.succeed( + Content.text( + 'Error while getting image preview:\n', + `\n${e}'n`, + ), + ), + ), + Eff.Effect.map(Eff.Array.of), + ), + Document: ({ format }) => + size > THRESHOLD + ? Eff.Effect.succeed([ + Content.text('Object is too large to include its contents directly'), + ]) + : getDocumentPreview(handle, format), + Unidentified: () => + Eff.Effect.succeed([ + Content.text( + 'Error while getting object contents:\n', + `\nUnidentified file type\n`, + ), + ]), + }, + ) + + return Tool.succeed(metaBlock, ...contentBlocks) + }) + +const SUPPORTED_IMAGE_EXTENSIONS = [ + '.jpg', + '.jpeg', + '.png', + '.gif', + '.webp', + '.bmp', + '.tiff', + '.tif', + '.czi', +] + +// =< 1568px as per anthropic/claude guidelines +const PREVIEW_SIZE = `w1024h768` + +interface ImagePreview { + format: Content.ImageFormat + bytes: ArrayBuffer +} + +const getImagePreview = (handle: S3ObjectLocation) => + Eff.Effect.gen(function* () { + const signer = yield* S3Signer + const url = yield* signer.sign(handle) + const src = `${cfg.apiGatewayEndpoint}/thumbnail${mkSearch({ + url, + size: PREVIEW_SIZE, + })}` + const r = yield* Eff.Effect.tryPromise(() => fetch(src)) + if (r.status !== 200) { + const text = yield* Eff.Effect.promise(() => r.text()) + return yield* new Eff.Cause.UnknownException(text, text) + } + const bytes = yield* Eff.Effect.promise(() => + r.blob().then((blob) => blob.arrayBuffer()), + ) + const format = yield* Eff.Effect.try(() => { + const info = r.headers.get('X-Quilt-Info') + if (!info) throw new Error('X-Quilt-Info header not found') + const parsed = JSON.parse(info) + switch (parsed.thumbnail_format) { + case 'JPG': + return 'jpeg' + case 'PNG': + return 'png' + case 'GIF': + return 'gif' + default: + throw new Error(`Unknown thumbnail format: ${parsed.thumbnail_format}`) + } + }) + return { format, bytes } as ImagePreview + }) + +const getDocumentPreview = (handle: S3ObjectLocation, format: Content.DocumentFormat) => + S3.pipe( + Eff.Effect.andThen((s3) => s3.getObject(handle)), + Eff.Effect.map((body) => { + const blob = body.Body + if (!blob) { + return Content.text('Could not get object contents') + } + return Content.ToolResultContentBlock.Document({ + name: normalizeDocumentName(handle.key), + format, + source: blob as $TSFixMe, + }) + }), + Eff.Effect.catchAll((e) => + Eff.Effect.succeed( + Content.text( + 'Error while getting object contents:\n', + `\n${e}'n`, + ), + ), + ), + Eff.Effect.map(Eff.Array.of), + ) + +// const hasNoExt = (key: string) => !extname(key) +// +// const isCsv = utils.extIs('.csv') +// +// const isExcel = utils.extIn(['.xls', '.xlsx']) +// +// const isJsonl = utils.extIs('.jsonl') +// +// const isParquet = R.anyPass([ +// utils.extIn(['.parquet', '.pq']), +// R.test(/.+_0$/), +// R.test(/[.-]c\d{3,5}$/gi), +// ]) +// +// const isTsv = utils.extIn(['.tsv', '.tab']) +// +// +// type TabularType = 'csv' | 'jsonl' | 'excel' | 'parquet' | 'tsv' +// +// const detectTabularType: (type: string) => TabularType = R.pipe( +// utils.stripCompression, +// R.cond([ +// [isCsv, R.always('csv')], +// [isExcel, R.always('excel')], +// [isJsonl, R.always('jsonl')], +// [isParquet, R.always('parquet')], +// [isTsv, R.always('tsv')], +// [R.T, R.always('csv')], +// ]), +// ) + +const LANGS = { + accesslog: /\.log$/, + bash: /\.(ba|z)?sh$/, + clojure: /\.clj$/, + coffeescript: /\.(coffee|cson|iced)$/, + coq: /\.v$/, + c: /\.(c|h)$/, + cpp: /\.((c(c|\+\+|pp|xx)?)|(h(\+\+|pp|xx)?))$/, + csharp: /\.cs$/, + css: /\.css$/, + diff: /\.(diff|patch)$/, + dockerfile: /^dockerfile$/, + erlang: /\.erl$/, + go: /\.go$/, + haskell: /\.hs$/, + ini: /\.(ini|toml)$/, + java: /\.(java|jsp)$/, + javascript: /\.m?jsx?$/, + json: /\.jsonl?$/, + lisp: /\.lisp$/, + makefile: /^(gnu)?makefile$/, + matlab: /\.m$/, + ocaml: /\.mli?$/, + perl: /\.pl$/, + php: /\.php[3-7]?$/, + plaintext: + /((^license)|(^readme)|(^\.\w*(ignore|rc|config))|(\.txt)|(\.(c|t)sv)|(\.(big)?bed)|(\.cef)|(\.fa)|(\.fsa)|(\.fasta)|(\.(san)?fastq)|(\.fq)|(\.sam)|(\.gff(2|3)?)|(\.gtf)|(\.index)|(\.readme)|(changelog)|(.*notes)|(\.pdbqt)|(\.results)(\.(inn|out)ie))$/, + python: /\.(py|gyp)$/, + r: /\.r$/, + ruby: /\.rb$/, + rust: /\.rs$/, + scala: /\.scala$/, + scheme: /\.s(s|ls|cm)$/, + sql: /\.sql$/, + typescript: /\.tsx?$/, + xml: /\.(xml|x?html|rss|atom|xjb|xsd|xsl|plist)$/, + yaml: /((\.ya?ml$)|(^snakefile))/, +} + +const langPairs = Object.entries(LANGS) + +function isText(name: string) { + const normalized = basename(name).toLowerCase() + return langPairs.some(([, re]) => re.test(normalized)) +} + +// const loaderChain = { +// Audio: extIn(['.flac', '.mp3', '.ogg', '.ts', '.tsa', '.wav']), +// Fcs: R.pipe(utils.stripCompression, utils.extIs('.fcs')), +// Json: 'json', +// Manifest: R.allPass([R.startsWith('.quilt/packages/'), hasNoExt]), +// NamedPackage: R.startsWith('.quilt/named_packages/'), +// Ngl: R.pipe( +// utils.stripCompression, +// utils.extIn(['.cif', '.ent', '.mol', '.mol2', '.pdb', '.sdf']),), +// Notebook: R.pipe(utils.stripCompression, utils.extIs('.ipynb')), +// Tabular: R.pipe( +// utils.stripCompression, +// R.anyPass([isCsv, isExcel, isJsonl, isParquet, isTsv]),), +// Vcf: R.pipe(utils.stripCompression, utils.extIs('.vcf')), +// Video: utils.extIn(['.m2t', '.m2ts', '.mp4', '.webm']), +// Text: R.pipe(findLang, Boolean), +// } +// TODO: convert pptx? + +type FileType = Eff.Data.TaggedEnum<{ + Image: {} + Document: { + readonly format: Content.DocumentFormat + } + Unidentified: {} +}> + +// eslint-disable-next-line @typescript-eslint/no-redeclare +const FileType = Eff.Data.taggedEnum() + +// const getExt = (key: string) => extname(key).toLowerCase().slice(1) + +// const COMPRESSION_TYPES = { gz: '.gz', bz2: '.bz2' } +// type CompressionType = keyof typeof COMPRESSION_TYPES +// +// const getCompression = (key: string): [string, CompressionType | null] => { +// for (const [type, ext] of Object.entries(COMPRESSION_TYPES)) { +// if (key.toLowerCase().endsWith(ext)) { +// return [ +// key.slice(0, -ext.length), +// type as CompressionType, +// ] +// } +// } +// return [key, null] +// } + +// TODO +const detectFileType = (key: string): FileType => { + // XXX: support compression? + // const [withoutCompression, compression] = getCompression(key) + // const ext = extname(withoutCompression).toLowerCase() + const ext = extname(key).toLowerCase() + + if (SUPPORTED_IMAGE_EXTENSIONS.includes(ext)) { + return FileType.Image() + } + if (['.htm', '.html'].includes(ext)) { + return FileType.Document({ format: 'html' }) + } + if (['.md', '.rmd'].includes(ext)) { + return FileType.Document({ format: 'md' }) + } + if (ext === '.pdf') { + return FileType.Document({ format: 'pdf' }) + } + if (ext === '.csv') { + // XXX: does it support TSV? + return FileType.Document({ format: 'csv' }) + } + if (ext === '.docx') { + return FileType.Document({ format: 'docx' }) + } + if (ext === '.doc') { + return FileType.Document({ format: 'doc' }) + } + if (ext === '.xls') { + return FileType.Document({ format: 'xls' }) + } + if (ext === '.xlsx') { + return FileType.Document({ format: 'xlsx' }) + } + if (isText(key)) { + return FileType.Document({ format: 'txt' }) + } + return FileType.Unidentified() +} diff --git a/catalog/app/components/Assistant/Model/GlobalTools.ts b/catalog/app/components/Assistant/Model/GlobalTools/search.ts similarity index 72% rename from catalog/app/components/Assistant/Model/GlobalTools.ts rename to catalog/app/components/Assistant/Model/GlobalTools/search.ts index 3289b8248ea..10c4e8dfb67 100644 --- a/catalog/app/components/Assistant/Model/GlobalTools.ts +++ b/catalog/app/components/Assistant/Model/GlobalTools/search.ts @@ -5,8 +5,8 @@ import * as S from '@effect/schema/Schema' import * as SearchModel from 'containers/Search/model' import * as Model from 'model' -import * as Content from './Content' -import * as Tool from './Tool' +import * as Content from '../Content' +import * as Tool from '../Tool' // TODO: more comprehensive params const SearchParamsSchema = S.Struct({ @@ -31,7 +31,7 @@ const SearchParamsSchema = S.Struct({ description: 'Start a search session', }) -function useStartSearch() { +export function useStartSearch() { const makeUrl = SearchModel.useMakeUrl() const history = useHistory() return Tool.useMakeTool( @@ -44,24 +44,13 @@ function useStartSearch() { const url = makeUrl({ ...defaultParams, ...params } as SearchModel.SearchUrlState) yield* Eff.Effect.sync(() => history.push(url)) return Eff.Option.some( - Tool.Result({ - status: 'success', - content: [ - Content.ToolResultContentBlock.Text({ - text: 'Navigating to the search page and starting the search session. Use catalog_search_getResults tool to get the search results.', - }), - ], - }), + Tool.succeed( + Content.ToolResultContentBlock.Text({ + text: 'Navigating to the search page and starting the search session. Use catalog_search_getResults tool to get the search results.', + }), + ), ) }), [makeUrl, history], ) } - -export function useGlobalTools(): Tool.Collection { - return { - catalog_global_startSearch: useStartSearch(), - } -} - -export { useGlobalTools as use } diff --git a/catalog/app/components/Assistant/Model/Tool.ts b/catalog/app/components/Assistant/Model/Tool.ts index 3a9247aa55c..e412002c8b2 100644 --- a/catalog/app/components/Assistant/Model/Tool.ts +++ b/catalog/app/components/Assistant/Model/Tool.ts @@ -12,6 +12,12 @@ export interface Result { // eslint-disable-next-line @typescript-eslint/no-redeclare export const Result = Eff.Data.case() +export const succeed = (...content: Content.ToolResultContentBlock[]) => + Result({ status: 'success', content }) + +export const fail = (...content: Content.ToolResultContentBlock[]) => + Result({ status: 'error', content }) + export type ResultOption = Eff.Option.Option export type Executor = (params: I) => Eff.Effect.Effect diff --git a/catalog/app/containers/Bucket/Qurator/Context.tsx b/catalog/app/containers/Bucket/Qurator/Context.tsx index c310218dbdd..bda2b7aa8cf 100644 --- a/catalog/app/containers/Bucket/Qurator/Context.tsx +++ b/catalog/app/containers/Bucket/Qurator/Context.tsx @@ -1,81 +1,15 @@ -import * as Eff from 'effect' import * as React from 'react' -import { Schema as S } from '@effect/schema' import * as Assistant from 'components/Assistant' import type * as Model from 'model' -import * as APIConnector from 'utils/APIConnector' -import mkSearch from 'utils/mkSearch' - -type EsHit = { _source?: { content?: string } } - -type EsOutput = { hits?: { hits?: EsHit[] } } | null - -interface ApiRequest { - (endpoint: string): Promise -} - -const GetObjectContentsAndMetadataSchema = S.Struct({ - bucket: S.String, - key: S.String, - version: S.optional(S.String).annotations({}), -}).annotations({ - description: 'Get full contents and metadata of an S3 object', -}) - -function useGetObjectContentsAndMetadata() { - const req: ApiRequest = APIConnector.use() - - return Assistant.Model.Tool.useMakeTool( - GetObjectContentsAndMetadataSchema, - (handle) => - Eff.Effect.gen(function* () { - yield* Eff.Console.debug('TOOL: getObjectContents', handle) - - const qs = mkSearch({ - action: 'freeform', - body: JSON.stringify({ - query: { - query_string: { - query: `key:"${handle.key}"`, - }, - }, - }), - filter_path: 'hits.hits', - index: handle.bucket, - size: 1, - }) - const res = yield* Eff.Effect.promise(() => req(`/search${qs}`)) - // eslint-disable-next-line no-underscore-dangle - const content = res?.hits?.hits?.[0]._source?.content - if (!content) throw new Error('Failed to find the content for this file') - // TODO: return appropriate Content.* type - return Eff.Option.some( - Assistant.Model.Tool.Result({ - status: 'success', - content: [ - Assistant.Model.Content.ToolResultContentBlock.Text({ text: content }), - ], - }), - ) - }), - [req], - ) -} interface QuratorContextProps { handle: Model.S3.S3ObjectLocation } export default function QuratorContext({ handle }: QuratorContextProps) { - const tools = { - getObjectContents: useGetObjectContentsAndMetadata(), - } - - const messages = React.useMemo( - () => [`You are viewing the details page for an S3 object ${JSON.stringify(handle)}`], - [handle], - ) - - return + const messages = [ + `You are viewing the details page for an S3 object ${JSON.stringify(handle)}`, + ] + return }