From 7569b27059388447352317269f933a99d6803f3d Mon Sep 17 00:00:00 2001 From: Nikita Skalkin Date: Fri, 13 Dec 2024 15:13:52 +0100 Subject: [PATCH] feat(dia-1033): infiniteDiscovery queries and temporal direct integration with OpenSearch (#6306) * feat(dia-1033): infiniteDiscovery queries and temporal direct integration with OpenSearch * delete tasteProfileVector from the schema * remove tasteProfileVector parameter * change type name * fix description * move opensearch related code into old type * remove deprecation warning * random curators picks for inital batch of artworks --- _schemaV2.graphql | 20 +++- package.json | 1 + src/config.ts | 4 + src/lib/apis/opensearch.ts | 26 ++++++ .../calculateMeanArtworksVector.ts | 30 ++++++ .../infiniteDiscovery/findSimilarArtworks.ts | 55 +++++++++++ .../getInitialArtworksSample.ts | 27 ++++++ .../v2/infiniteDiscovery/discoverArtworks.ts | 91 ++++++++----------- yarn.lock | 62 +++++++++++++ 9 files changed, 260 insertions(+), 56 deletions(-) create mode 100644 src/lib/apis/opensearch.ts create mode 100644 src/lib/infiniteDiscovery/calculateMeanArtworksVector.ts create mode 100644 src/lib/infiniteDiscovery/findSimilarArtworks.ts create mode 100644 src/lib/infiniteDiscovery/getInitialArtworksSample.ts diff --git a/_schemaV2.graphql b/_schemaV2.graphql index ae1151a89a..8662cd7b80 100644 --- a/_schemaV2.graphql +++ b/_schemaV2.graphql @@ -16851,13 +16851,19 @@ type Query { after: String before: String certainty: Float + + # (Only for when useOpenSearch is true) Exclude these artworks from the response + excludeArtworkIds: [String] first: Int last: Int + + # (Only for when useOpenSearch is true) These artworks are used to calculate the taste profile vector. Such artworks are excluded from the response + likedArtworkIds: [String] limit: Int offset: Int sort: DiscoverArtworksSort - useRelatedArtworks: Boolean = false - userId: String! + useOpenSearch: Boolean = false + userId: String ): ArtworkConnection # A namespace external partners (provided by Galaxy) @@ -21500,13 +21506,19 @@ type Viewer { after: String before: String certainty: Float + + # (Only for when useOpenSearch is true) Exclude these artworks from the response + excludeArtworkIds: [String] first: Int last: Int + + # (Only for when useOpenSearch is true) These artworks are used to calculate the taste profile vector. Such artworks are excluded from the response + likedArtworkIds: [String] limit: Int offset: Int sort: DiscoverArtworksSort - useRelatedArtworks: Boolean = false - userId: String! + useOpenSearch: Boolean = false + userId: String ): ArtworkConnection # A namespace external partners (provided by Galaxy) diff --git a/package.json b/package.json index 9f1be11a24..114f1ed779 100644 --- a/package.json +++ b/package.json @@ -86,6 +86,7 @@ "lodash": "4.17.21", "longjohn": "0.2.12", "marked": "2.0.1", + "mathjs": "^14.0.1", "memcached": "2.2.2", "moment": "2.29.4", "moment-timezone": "0.5.37", diff --git a/src/config.ts b/src/config.ts index 85de2aad24..2a6bfcbb33 100644 --- a/src/config.ts +++ b/src/config.ts @@ -63,6 +63,8 @@ const { METAPHYSICS_PRODUCTION_ENDPOINT, METAPHYSICS_STAGING_ENDPOINT, NODE_ENV, + OPENSEARCH_API_BASE, + OPENSEARCH_ARTWORKS_INFINITE_DISCOVERY_INDEX, PORT, POSITRON_API_BASE, PREDICTION_ENDPOINT, @@ -196,6 +198,8 @@ export default { METAPHYSICS_STAGING_ENDPOINT, METAPHYSICS_PRODUCTION_ENDPOINT, NODE_ENV: NODE_ENV || "development", + OPENSEARCH_API_BASE, + OPENSEARCH_ARTWORKS_INFINITE_DISCOVERY_INDEX, PORT: Number(PORT) || 3000, POSITRON_API_BASE, PREDICTION_ENDPOINT, diff --git a/src/lib/apis/opensearch.ts b/src/lib/apis/opensearch.ts new file mode 100644 index 0000000000..24b44dc76b --- /dev/null +++ b/src/lib/apis/opensearch.ts @@ -0,0 +1,26 @@ +import urljoin from "url-join" +import { assign } from "lodash" +import config from "config" +import fetch from "node-fetch" + +const { OPENSEARCH_API_BASE } = config + +export const opensearch = async ( + path, + _accessToken, + fetchOptions: any = {} +) => { + const headers = { + Accept: "application/json", + "Content-Type": "application/json", + } + + const response = await ( + await fetch( + urljoin(OPENSEARCH_API_BASE, path), + assign({}, fetchOptions, { headers }) + ) + ).json() + + return response +} diff --git a/src/lib/infiniteDiscovery/calculateMeanArtworksVector.ts b/src/lib/infiniteDiscovery/calculateMeanArtworksVector.ts new file mode 100644 index 0000000000..30dabd40ad --- /dev/null +++ b/src/lib/infiniteDiscovery/calculateMeanArtworksVector.ts @@ -0,0 +1,30 @@ +import config from "config" +import { opensearch } from "lib/apis/opensearch" +import { mean } from "mathjs" + +export const calculateMeanArtworksVector = async (artworkIds) => { + const getVectorsQuery = { + size: artworkIds.length, + _source: ["_id", "vector_embedding"], + query: { + ids: { + values: artworkIds, + }, + }, + } + + const artworksResponse = await opensearch( + `/${config.OPENSEARCH_ARTWORKS_INFINITE_DISCOVERY_INDEX}/_search`, + undefined, + { + method: "POST", + body: JSON.stringify(getVectorsQuery), + } + ) + + const vectorEmbeddings = artworksResponse.hits?.hits?.map( + (hit) => hit._source.vector_embedding + ) + + return mean(vectorEmbeddings, 0) +} diff --git a/src/lib/infiniteDiscovery/findSimilarArtworks.ts b/src/lib/infiniteDiscovery/findSimilarArtworks.ts new file mode 100644 index 0000000000..5c9877347b --- /dev/null +++ b/src/lib/infiniteDiscovery/findSimilarArtworks.ts @@ -0,0 +1,55 @@ +import config from "config" +import { opensearch } from "lib/apis/opensearch" + +/** + * Perform kNN operation to find artworks similiar to vectorEmbedding + * and then return the artworks loaded by artworksLoader + * + * @param vectorEmbedding - vector embedding of the artwork + * @param size - number of similar artworks to return + * @param excludeArtworkIds - list of artwork ids to exclude from the response + * @param artworksLoader - artworks loader + */ +export const findSimilarArtworks = async ( + vectorEmbedding: number[], + size = 10, + excludeArtworkIds: string[] = [], + artworksLoader +) => { + const knnQuery = { + size: size, + _source: ["_id"], + query: { + bool: { + must_not: { + terms: { + _id: excludeArtworkIds, + }, + }, + should: [ + { + knn: { + vector_embedding: { + vector: vectorEmbedding, + k: size, + }, + }, + }, + ], + }, + }, + } + + const knnResponse = await opensearch( + `/${config.OPENSEARCH_ARTWORKS_INFINITE_DISCOVERY_INDEX}/_search`, + undefined, + { + method: "POST", + body: JSON.stringify(knnQuery), + } + ) + + const artworkIds = knnResponse.hits?.hits?.map((hit) => hit._id) || [] + + return await artworksLoader({ ids: artworkIds }) +} diff --git a/src/lib/infiniteDiscovery/getInitialArtworksSample.ts b/src/lib/infiniteDiscovery/getInitialArtworksSample.ts new file mode 100644 index 0000000000..bbd4d49711 --- /dev/null +++ b/src/lib/infiniteDiscovery/getInitialArtworksSample.ts @@ -0,0 +1,27 @@ +import { opensearch } from "lib/apis/opensearch" + +export const getInitialArtworksSample = async (limit, artworksLoader) => { + // initial artworks sample comes from indexed curators picks, but + // in future we plan to come up with a more sophisticated approach + const curatorsPicks = await opensearch(`/curators_picks/_search`, undefined, { + method: "POST", + body: JSON.stringify({ + size: limit, + query: { + function_score: { + functions: [ + { + random_score: { + seed: Math.floor(Math.random() * 1000), + }, + }, + ], + }, + }, + }), + }) + + const artworkIds = curatorsPicks.hits?.hits?.map((hit) => hit._id) || [] + + return await artworksLoader({ ids: artworkIds }) +} diff --git a/src/schema/v2/infiniteDiscovery/discoverArtworks.ts b/src/schema/v2/infiniteDiscovery/discoverArtworks.ts index c7c5e56049..967f95bb87 100644 --- a/src/schema/v2/infiniteDiscovery/discoverArtworks.ts +++ b/src/schema/v2/infiniteDiscovery/discoverArtworks.ts @@ -4,14 +4,14 @@ import { GraphQLInt, GraphQLEnumType, GraphQLFloat, - GraphQLNonNull, GraphQLBoolean, + GraphQLList, } from "graphql" import { ResolverContext } from "types/graphql" import { artworkConnection } from "../artwork" import { connectionFromArray } from "graphql-relay" import { pageable } from "relay-cursor-paging" -import { sampleSize, shuffle, uniqBy } from "lodash" +import { sampleSize, uniqBy } from "lodash" import { insertSampleCuratedWorks, getUserFilterList, @@ -23,11 +23,14 @@ import { getArtworkIds, getFilteredIdList, } from "lib/infiniteDiscovery/weaviate" +import { getInitialArtworksSample } from "lib/infiniteDiscovery/getInitialArtworksSample" +import { calculateMeanArtworksVector } from "lib/infiniteDiscovery/calculateMeanArtworksVector" +import { findSimilarArtworks } from "lib/infiniteDiscovery/findSimilarArtworks" export const DiscoverArtworks: GraphQLFieldConfig = { type: artworkConnection.connectionType, args: pageable({ - userId: { type: GraphQLNonNull(GraphQLString) }, + userId: { type: GraphQLString }, limit: { type: GraphQLInt }, offset: { type: GraphQLInt }, certainty: { type: GraphQLFloat }, @@ -48,19 +51,22 @@ export const DiscoverArtworks: GraphQLFieldConfig = { }, }), }, - useRelatedArtworks: { type: GraphQLBoolean, defaultValue: false }, + useOpenSearch: { type: GraphQLBoolean, defaultValue: false }, + excludeArtworkIds: { + type: new GraphQLList(GraphQLString), + description: + "(Only for when useOpenSearch is true) Exclude these artworks from the response", + }, + likedArtworkIds: { + type: new GraphQLList(GraphQLString), + description: + "(Only for when useOpenSearch is true) These artworks are used to calculate the taste profile vector. Such artworks are excluded from the response", + }, }), resolve: async ( _root, args, - { - weaviateCreateObjectLoader, - weaviateGraphqlLoader, - artworksLoader, - relatedArtworksLoader, - marketingCollectionLoader, - savedArtworksLoader, - } + { weaviateCreateObjectLoader, weaviateGraphqlLoader, artworksLoader } ) => { if ( !artworksLoader || @@ -76,51 +82,32 @@ export const DiscoverArtworks: GraphQLFieldConfig = { offset = 0, certainty = 0.5, sort, - useRelatedArtworks, + useOpenSearch, } = args - if (useRelatedArtworks) { - if (!savedArtworksLoader) { - return new Error("You need to be signed in to perform this action") - } - - const { body: savedArtworks } = await savedArtworksLoader({ - size: 28, - sort: "-position", - user_id: userId, - private: true, - }) - - const savedArtworkIds = savedArtworks.map((artwork) => artwork.id) - - const curatedArtworksCollection = await marketingCollectionLoader( - "curators-picks" - ) - - const curatedArtworkIds = curatedArtworksCollection.artwork_ids + if (useOpenSearch) { + const { excludeArtworkIds, likedArtworkIds } = args - // Select two random artworks from curated artworks - const randomCuratedArtworksIds = sampleSize(curatedArtworkIds, 2) + let result = [] - const curatedArtworks = await artworksLoader({ - ids: randomCuratedArtworksIds, - }) - - // use curated artworks if there are no saved artworks - const finalArtworkIds = - savedArtworkIds.length > 0 ? [...savedArtworkIds] : curatedArtworkIds - - // Limit the number of artwork IDs to a maximum of 10 - const queryArtworkIds = finalArtworkIds.slice(0, 10) - - const relatedArtworks = await relatedArtworksLoader({ - artwork_id: queryArtworkIds, - size: 8, - }) + if (!likedArtworkIds) { + result = await getInitialArtworksSample(limit, artworksLoader) + } else { + const tasteProfileVector = await calculateMeanArtworksVector( + likedArtworkIds + ) + // we don't want to recommend the same artworks that the user already liked + excludeArtworkIds.push(...likedArtworkIds) + + result = await findSimilarArtworks( + tasteProfileVector, + limit, + excludeArtworkIds, + artworksLoader + ) + } - // inject curated artworks and shuffle the list - const shuffledArtworks = shuffle([...relatedArtworks, ...curatedArtworks]) - return connectionFromArray(shuffledArtworks, args) + return connectionFromArray(result, args) } const userQueryResponse = await weaviateGraphqlLoader({ diff --git a/yarn.lock b/yarn.lock index 5242a8044d..e96068d704 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1027,6 +1027,13 @@ dependencies: regenerator-runtime "^0.14.0" +"@babel/runtime@^7.25.7": + version "7.26.0" + resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.0.tgz#8600c2f595f277c60815256418b85356a65173c1" + integrity sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw== + dependencies: + regenerator-runtime "^0.14.0" + "@babel/template@^7.25.9", "@babel/template@^7.3.3": version "7.25.9" resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.25.9.tgz#ecb62d81a8a6f5dc5fe8abfc3901fc52ddf15016" @@ -3471,6 +3478,11 @@ commondir@^1.0.1: resolved "https://registry.yarnpkg.com/commondir/-/commondir-1.0.1.tgz#ddd800da0c66127393cca5950ea968a3aaf1253b" integrity sha512-W9pAhw0ja1Edb5GVdIF1mjZw/ASI0AlShXM83UUGe2DVr5TdAPEA1OA8m/g8zWp9x6On7gqufY+FatDbC3MDQg== +complex.js@^2.2.5: + version "2.4.2" + resolved "https://registry.yarnpkg.com/complex.js/-/complex.js-2.4.2.tgz#76f260a9e7e232d8ad26348484a9b128c13fcc9a" + integrity sha512-qtx7HRhPGSCBtGiST4/WGHuW+zeaND/6Ld+db6PbrulIB1i2Ev/2UPiqcmpQNPSyfBKraC0EOvOKCB5dGZKt3g== + component-emitter@^1.2.0, component-emitter@^1.2.1: version "1.3.1" resolved "https://registry.yarnpkg.com/component-emitter/-/component-emitter-1.3.1.tgz#ef1d5796f7d93f135ee6fb684340b26403c97d17" @@ -3814,6 +3826,11 @@ decache@^4.4.0: dependencies: callsite "^1.0.0" +decimal.js@^10.4.3: + version "10.4.3" + resolved "https://registry.yarnpkg.com/decimal.js/-/decimal.js-10.4.3.tgz#1044092884d245d1b7f65725fa4ad4c6f781cc23" + integrity sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA== + decode-uri-component@^0.2.0, decode-uri-component@^0.2.2: version "0.2.2" resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9" @@ -4211,6 +4228,11 @@ escape-html@~1.0.3: resolved "https://registry.yarnpkg.com/escape-html/-/escape-html-1.0.3.tgz#0258eae4d3d0c0974de1c169188ef0051d1d1988" integrity sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow== +escape-latex@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/escape-latex/-/escape-latex-1.2.0.tgz#07c03818cf7dac250cce517f4fda1b001ef2bca1" + integrity sha512-nV5aVWW1K0wEiUIEdZ4erkGGH8mDxGyxSeqPzRNtWP7ataw+/olFObw7hujFWlVjNsaDFw5VZ5NzVSIqRgfTiw== + escape-string-regexp@^1.0.2, escape-string-regexp@^1.0.5: version "1.0.5" resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz#1b61c0562190a8dff6ae3bb2cf0200ca130b86d4" @@ -4915,6 +4937,11 @@ forwarded@0.2.0: resolved "https://registry.yarnpkg.com/forwarded/-/forwarded-0.2.0.tgz#2269936428aad4c15c7ebe9779a84bf0b2a81811" integrity sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow== +fraction.js@^5.2.1: + version "5.2.1" + resolved "https://registry.yarnpkg.com/fraction.js/-/fraction.js-5.2.1.tgz#93ffc9507b1a68a1271883aa28e98f58a1c0c6b3" + integrity sha512-Ah6t/7YCYjrPUFUFsOsRLMXAdnYM+aQwmojD2Ayb/Ezr82SwES0vuyQ8qZ3QO8n9j7W14VJuVZZet8U3bhSdQQ== + fragment-cache@^0.2.1: version "0.2.1" resolved "https://registry.yarnpkg.com/fragment-cache/-/fragment-cache-0.2.1.tgz#4290fad27f13e89be7f33799c6bc5a0abfff0d19" @@ -6122,6 +6149,11 @@ jackspeak@^3.1.2: optionalDependencies: "@pkgjs/parseargs" "^0.11.0" +javascript-natural-sort@^0.7.1: + version "0.7.1" + resolved "https://registry.yarnpkg.com/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz#f9e2303d4507f6d74355a73664d1440fb5a0ef59" + integrity sha512-nO6jcEfZWQXDhOiBtG2KvKyEptz7RVbpGP4vTD2hLBdmNQSsCiicO2Ioinv6UI4y9ukqnBpy+XZ9H6uLNgJTlw== + jest-changed-files@^29.7.0: version "29.7.0" resolved "https://registry.yarnpkg.com/jest-changed-files/-/jest-changed-files-29.7.0.tgz#1c06d07e77c78e1585d020424dedc10d6e17ac3a" @@ -7141,6 +7173,21 @@ marked@2.0.1: resolved "https://registry.yarnpkg.com/marked/-/marked-2.0.1.tgz#5e7ed7009bfa5c95182e4eb696f85e948cefcee3" integrity sha512-5+/fKgMv2hARmMW7DOpykr2iLhl0NgjyELk5yn92iE7z8Se1IS9n3UsFm86hFXIkvMBmVxki8+ckcpjBeyo/hw== +mathjs@^14.0.1: + version "14.0.1" + resolved "https://registry.yarnpkg.com/mathjs/-/mathjs-14.0.1.tgz#b47233a3e0913ae3d2669d67f4edf7a5b6fe1fb1" + integrity sha512-yyJgLwC6UXuve724np8tHRMYaTtb5UqiOGQkjwbSXgH8y1C/LcJ0pvdNDZLI2LT7r+iExh2Y5HwfAY+oZFtGIQ== + dependencies: + "@babel/runtime" "^7.25.7" + complex.js "^2.2.5" + decimal.js "^10.4.3" + escape-latex "^1.2.0" + fraction.js "^5.2.1" + javascript-natural-sort "^0.7.1" + seedrandom "^3.0.5" + tiny-emitter "^2.1.0" + typed-function "^4.2.1" + md5@^2.3.0: version "2.3.0" resolved "https://registry.yarnpkg.com/md5/-/md5-2.3.0.tgz#c3da9a6aae3a30b46b7b0c349b87b110dc3bda4f" @@ -8800,6 +8847,11 @@ samsam@~1.1: resolved "https://registry.yarnpkg.com/samsam/-/samsam-1.1.3.tgz#9f5087419b4d091f232571e7fa52e90b0f552621" integrity sha512-t9rCPskf50hZ53eH8Z+cSWD4LfJBac+8vSSuzi1Y2HzygyXxtAl0BaR3hr6iI6A+nFQbkmJNC/brQLNEeVnrmg== +seedrandom@^3.0.5: + version "3.0.5" + resolved "https://registry.yarnpkg.com/seedrandom/-/seedrandom-3.0.5.tgz#54edc85c95222525b0c7a6f6b3543d8e0b3aa0a7" + integrity sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg== + selfsigned@^2.0.1: version "2.4.1" resolved "https://registry.yarnpkg.com/selfsigned/-/selfsigned-2.4.1.tgz#560d90565442a3ed35b674034cec4e95dceb4ae0" @@ -9532,6 +9584,11 @@ through@^2.3.6: resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5" integrity sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg== +tiny-emitter@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/tiny-emitter/-/tiny-emitter-2.1.0.tgz#1d1a56edfc51c43e863cbb5382a72330e3555423" + integrity sha512-NB6Dk1A9xgQPMoGqC5CVXn123gWyte215ONT5Pp5a0yt4nlEoO1ZWeCwpncaekPHXO60i47ihFnZPiRPjRMq4Q== + tiny-glob@^0.2.6: version "0.2.9" resolved "https://registry.yarnpkg.com/tiny-glob/-/tiny-glob-0.2.9.tgz#2212d441ac17928033b110f8b3640683129d31e2" @@ -9744,6 +9801,11 @@ typed-array-length@^1.0.6: is-typed-array "^1.1.13" possible-typed-array-names "^1.0.0" +typed-function@^4.2.1: + version "4.2.1" + resolved "https://registry.yarnpkg.com/typed-function/-/typed-function-4.2.1.tgz#19aa51847aa2dea9ef5e7fb7641c060179a74426" + integrity sha512-EGjWssW7Tsk4DGfE+5yluuljS1OGYWiI1J6e8puZz9nTMM51Oug8CD5Zo4gWMsOhq5BI+1bF+rWTm4Vbj3ivRA== + typedarray@^0.0.6: version "0.0.6" resolved "https://registry.yarnpkg.com/typedarray/-/typedarray-0.0.6.tgz#867ac74e3864187b1d3d47d996a78ec5c8830777"