-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from impresso/embeddings
Add draft embeddings-docs JSON schema. It's been a dangling issue and PR, it's been updated. Feel free to re-open.
- Loading branch information
Showing
4 changed files
with
127 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"id": "ACI-1832-01-01-a-i0008", | ||
"embedding": [-0.05433, 0.04184, -0.00738, -0.00657, -0.06034, -0.01607, -0.05556, 0.08453, 0.07772, -0.01196, -0.03563, -0.00958, -0.01124, 0.03449, -0.08652, 0.06759, 0.10285, 0.03633, 0.0571, -0.00303, 0.07577, 0.0264, 0.00647, 0.02725, -0.08444, 0.09184, -0.0081, -0.08844, -0.07227, 0.02242, -0.05825, -0.01509, 0.01469, 0.06798, 0.03602, -0.04174, 0.0307, 0.03843, -0.00755, 0.00523, -0.02464, -0.02938, 0.03339, 0.01921, -0.02223, 0.04466, 0.10512, 0.06621, -0.04799, -0.0218, -0.01523, -0.02657, 0.02348, 0.01323, 0.02457, 0.07768, -0.11698, -0.01842, 0.13253, -0.03065, 0.00352, -0.0023, -0.05941, 0.02977, -0.08459, -0.03759, 0.03472, -0.02808, 0.12777, 0.00775, -0.02081, 0.03519, -0.01566, 0.02641, 0.02085, 0.02638, -0.00894, 0.02055, 0.01598, 0.02668, -0.04014, 0.01222, -3e-05, -0.00463, -0.05493, -0.0342, -0.00623, -0.04466, 0.00919, 0.00178, -0.00946, 0.00841, -0.0293, -0.01731, 0.02215, 0.07643, -0.02389, 0.02817, 0.06009, 0.02602, -0.02615, 0.02061, 0.01523, 0.0138, 0.03734, 0.02202, -0.01992, 0.01417, -0.0016, -0.04515, -0.06166, -0.03551, -0.05689, -0.03094, -0.00566, 0.06585, 0.02034, -0.00577, 0.02996, 0.03991, -0.00438, 0.00988, -0.06948, -0.03117, -0.11828, -0.008, -0.00863, -0.04799, -0.00856, -0.05224, 0.0863, 0.02242, 0.08237, -0.02452, 0.01142, -0.01346, 0.00498, 0.09867, 0.02224, 0.02777, -0.02734, 0.03227, -0.06435, 0.00116, 0.05238, 0.02205, 0.00195, -0.03677, 0.00527, 0.05112, -0.03449, 0.01328, 0.00983, 0.0437, 0.01913, -0.02824, 0.03883, 0.01167, -0.03162, -0.04115, 0.03647, 0.0552, 0.01168, -0.03536, -0.01671, -0.01974, 0.04855, 0.00917, 0.02572, -0.01037, -0.06124, 0.03046, 0.00619, -0.02053, -0.01667, 0.04677, -0.00325, 0.02759, -0.02823, 0.06654, 0.03658, -0.05173, -0.03142, -0.03645, 0.06498, -0.03103, 0.01181, 0.00387, -0.01095, -0.03401, -0.03393, 0.00877, -0.07456, 0.04522, 0.01134, -0.10758, -0.06759, 0.02662, 0.02509, -0.04507, 0.01905, 0.03268, -0.02165, 0.02796, -0.00056, 0.0026, 0.00562, -0.05173, -0.06088, -0.02134, 0.05009, -0.02788, 0.0142, 0.01435, -0.03656, -0.01422, -0.03593, -0.01214, 0.00796, -0.06529, 0.01242, -0.00229, -0.05051, -0.03657, 0.0234, 0.02014, -0.0018, 0.00586, -0.00554, -0.02147, -0.01768, -0.03864, 0.00847, 0.03976, -0.00096, -0.06175, -0.00715, 0.00424, -0.00917, 0.00693, 0.02248, 0.03038, 0.04416, -0.01312, 0.00146, -0.01952, -0.02847, -0.01951, 0.02354, -0.02947, -0.02443, 0.03555, -0.04106, 0.0587, -0.02594, 0.02111, 0.02082, 0.02106, -0.00256, -0.02473, -0.00253, -0.01338, 0.00727, -0.042, 0.02705, -0.11198, 0.00297, 0.01143, -0.00445, -0.0386, 0.00294, 0.04692, 0.01826, 0.06643, -0.02297, 0.00826, -0.02779, 0.02786, -0.01309, 0.00668, 0.01475, -0.02275, -0.01241, -0.02125, -0.00632, -0.00459, 0.04441, -0.0013, -0.01202, 0.03816, -0.01377, -0.0057, -0.03704, 0.02819, 0.03896, -0.05356, -0.03926, -0.01965, 0.01717, -0.0066, -0.05163, -0.04674, 0.01177, -0.0154, -0.03243, -0.01316, -0.05924, -0.02726, -0.00209, -0.00126, -0.02853, 0.00604, 0.03572, -0.04777, -0.01188, 0.04027, 0.00022, 0.0402, -0.05849, -0.01346, 0.00116, 0.07419, -0.02404, 0.01465, -0.02477, -0.01619, 0.02718, 0.03736, -0.05165, -0.02273, -0.03534, 0.01807, 0.0279, -0.084, 0.02632, 0.02391, 0.0464, -0.01303, 0.00964, 0.03324, 0.05655, 0.047, -0.04677, 0.00914, 0.00818, -0.01597, 0.08388, -0.01151, 0.02417, 0.00895, 0.00644, -0.0411, -0.00416, 0.03934, 0.0611, -0.02604, -0.02569, 0.01736, -0.02914, 0.0572, -0.02528, -0.03775, 0.01626, 0.03428, 0.02094, -0.01617, -0.00501, 0.03217, -0.04425, 0.0288, -0.02014, 0.00805, -0.02903, 0.09169, -0.03683, 0.02041, 0.01651, 0.01651, 0.01081, -0.02133, 0.00965, 0.05579, 0.03764, -0.03257, 0.02838, -0.01372, 0.00091, -0.00905, -0.00732, 0.00073, 0.04101, -0.02635, -0.03142, 0.01478, 0.02056, -0.0139, -0.00147, 0.027, -0.04169, 0.01208, -0.00738, 0.03467, 0.0241, 0.0687, 0.05998, 0.06154, -0.00574, -0.04092, -0.05103, -0.01536, 0.05188, 0.01329, 0.0034, -0.04556, 0.07287, 0.0223, 0.03062, 0.06213, 0.16062, -0.01824, 0.02829, 0.02414, 0.0215, 0.00548, -0.01299, -0.00635, 0.03498, 0.04476, -0.05471, 0.02231, -0.01392, 0.03889, 0.00057, -0.05781, -0.0309, -0.0303, -0.05699, -0.00907, 0.02278, -0.0038, -0.02807, -0.00425, 0.01968, -0.07242, -0.02723, 0.02706, 0.02483, 0.03949, -0.01408, 0.02869, -0.01038, 0.00981, 0.00952, -0.02354, 0.05632, -0.04016, -0.04395, 0.01178, 0.00859, -0.02142, -0.00894, 0.03136, -0.00288, -0.01722, 0.00276, 0.0415, 0.00117, 0.01905, -0.00092, 0.03379, -0.00026, 0.02454, 0.02602, 0.00099, -0.01073, 0.07699, -0.072, 0.01123, 0.00217, 0.03956, -0.0169, 0.03267, -0.01225, -0.04287, -0.00496, 0.02931, -0.00554, -0.01867, -0.00269, 0.04555, 0.05982, 0.01631, -0.00021, -0.00367, 0.03097, 0.0241, 0.00546, 0.00281, 0.00443, 0.01515, -0.02205, -0.05541, 0.04205, 0.0195, 0.01023, 0.00682, 0.0991, 0.0079, -0.01082, -0.02739, -0.01572, -0.02014, 0.02, -0.06989, -0.03337, 0.00256, -0.01917, 0.06411, 0.00194, -0.02602, 0.02387, 0.00912, -0.00754, 0.02813, 0.06901, 0.00301, 0.01226, 0.05736, 0.03043, -0.00597, -0.00239, 0.02977, 0.01278, -0.01999, 0.02324, 0.04084, 0.00758, 0.04383, 0.01144, 0.01069, -0.02324, -0.01259, -0.03113, -0.00386, -0.00379, 0.02368, 0.02004, -0.0641, -0.01724, -0.02068, 0.01396, 0.03581, 0.03455, 0.01617, -0.04573, -0.0384, 0.04505, -0.01241, -0.04088, 0.0365, -0.00811, -0.02268, 0.04856, -0.05889, 0.00682, 0.02481, 0.0249, -0.03197, -0.00231, 0.01089, 0.05358, -0.0031, 0.03965, 0.00809, -0.0426, -0.03546, -0.02799, -0.00435, -0.05234, 0.04881, -0.03847, -0.04747, -0.02357, -0.01257, 0.02776, 0.00383, -0.02063, -0.02037, -0.01896, -0.00941, -0.00489, 0.01582, 0.04962, -0.03766, 0.02395, -0.02564, -0.00566, -0.05643, 0.03247, 0.03744, 0.00785, -0.02159, 0.04823, 0.03578, 0.03575, 0.00185, -0.01605, -0.03539, -0.01974, 0.01, -0.00304, -0.01292, 0.03748, 0.00777, -0.01643, 0.00695, 0.00362, -0.03999, -0.01774, -0.00453, -0.04929, -0.03009, 0.02241, -0.03559, -0.01247, 0.00749, -0.01612, -0.0425, -0.07628, -0.00563, -0.00723, -0.02974, -0.01406, 0.00378, -0.03151, -0.02954, -0.06575, 0.02843, -0.01653, -0.04686, -0.05174, 0.05859, 0.03222, -0.01524, 0.05375, -0.02765, 0.0208, 0.05737, -0.01105, 0.02933, 0.01295, -0.00076, 0.01235, 0.02216, -0.00092, 0.00404, -0.01133, 0.0346, -0.0001, 0.04614, 0.01448, 0.01508, 0.0558, -0.03253, -0.05316, -0.00792, 0.03195, 0.05999, 0.00632, 0.02159, 0.01859, -0.01584, -0.08758, -0.03676, 0.04501, 0.00664, -0.02137, -0.02505, -0.08833, 0.00535, -0.00257, -0.00604, -0.00924, 0.01935, -0.03717, 0.03293, 0.04251, 0.02721, 0.01805, 0.03488, -0.00588, -0.02767, -0.01797, 0.00621, -0.00871, -0.00759, 0.0033, -0.00706, 0.01897, -0.05011, -0.01175, -0.00019, 0.03811, 0.02589, 0.06485, 0.04662, 0.03656, -0.00635, 0.02969, -0.03077, -0.01858, 0.01558, -0.01998, 0.00821, -0.00609, -0.02141, -0.03912, -0.01554, -0.04338, 0.09217, -0.04458, -0.03987, -0.01138, 0.01006, -0.03676, 0.01019, -0.01662, 0.0356, 0.01285, 0.03716, -0.01335, -0.01028, -0.02966, -0.01276, 0.01826, -0.00642, 0.03965, -0.00848, -0.02455, -0.02248, 0.03888, -0.016, -0.03594, 0.03684, -0.00577, -0.06935, 0.02999, 0.02628, -0.00587, 0.03771, -0.03981, 0.01803, 0.02051, -0.01534, -0.0043, -0.03625, -0.02237, -0.00387, -0.04445, 0.01126, -0.02385, -0.01256, 0.0834, -0.01093, -0.01638, 0.03417, 0.03081, 0.00643], | ||
"size": 926, | ||
"model_id": | ||
"embeddings-average-article-page-chunk-subtoken-4096-8192-gte-multilingual-base-Alibaba-NLP/gte-multilingual-base@f7d567e", | ||
"ts": "2024-10-09T12:09:24Z" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"$id": "https://impresso.github.io/impresso-schemas/json/embeddings/embeddings-docs.schema.json", | ||
"title": "Document Embeddings JSON Schema", | ||
"description": "A representation for the vector embedding representation of content items.", | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"$id": "#/properties/id", | ||
"type": "string", | ||
"title": "The Id Schema", | ||
"description": "The unique identifier for a content item, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json", | ||
"examples": ["actionfem-1940-01-08-a-i0001"], | ||
"pattern": "^(.*)$" | ||
}, | ||
"ts": { | ||
"$id": "#/properties/ts", | ||
"type": "string", | ||
"title": "The Ts Schema", | ||
"description": "The timestamp when the embeddings were created", | ||
"examples": ["2024-08-29T06:42:53+00:00Z"], | ||
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\+00:00|Z)$" | ||
}, | ||
"embedder": { | ||
"$id": "#/properties/embedder", | ||
"type": "string", | ||
"title": "The Embedder Schema", | ||
"description": "The model or tool used to generate the embeddings", | ||
"examples": ["Alibaba-NLP/gte-multilingual-base@f7d567e"] | ||
}, | ||
"len": { | ||
"$id": "#/properties/len", | ||
"type": "integer", | ||
"title": "The Length Schema", | ||
"description": "The length of the document in characters.", | ||
"examples": [2976] | ||
}, | ||
"embedding": { | ||
"$id": "#/properties/embedding", | ||
"type": "array", | ||
"title": "The Embedding Schema", | ||
"description": "The vector embeddings of the document", | ||
"items": { | ||
"$id": "#/properties/embedding/items", | ||
"type": "number", | ||
"title": "The Items Schema", | ||
"description": "A single number in the vector embeddings of the document", | ||
"examples": [-0.11429] | ||
} | ||
} | ||
}, | ||
"required": ["id", "ts", "embedder", "embedding"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
{ | ||
"$schema": "https://json-schema.org/draft/2020-12/schema#", | ||
"$id": "https://impresso.github.io/impresso-schemas/json/embeddings/embeddings-docs.schema.json", | ||
"title": "Document Embeddings JSON Schema", | ||
"description": "A representation for the vector embedding representation of content items.", | ||
"type": "object", | ||
"properties": { | ||
"ci_id": { | ||
"type": "string", | ||
"description": "Unique identifier for the content item.", | ||
"examples": "actionfem-1940-01-08-a-i0001", | ||
}, | ||
"ci_type": | ||
{ | ||
"type": "string", | ||
"description": "Type of content item (e.g., 'article', 'adv').", | ||
"examples": "article" | ||
}, | ||
"model_id": { | ||
"type": "string", | ||
"description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." | ||
}, | ||
"embedding": { | ||
"oneOf": [ | ||
{ | ||
"type": "array", | ||
"description": "The vector embedding representation of the content item as a single list.", | ||
"items": { | ||
"type": "number", | ||
"description": "A single number in the vector embedding of the document.", | ||
"examples": [-0.11429] | ||
} | ||
}, | ||
{ | ||
"type": "array", | ||
"description": "The vector embedding representation of the content item as a list of lists (e.g., for chunked embeddings).", | ||
"items": { | ||
"type": "array", | ||
"items": { | ||
"type": "number", | ||
"description": "A single number in a chunk's vector embedding.", | ||
"examples": [-0.11429] | ||
} | ||
} | ||
} | ||
] | ||
}, | ||
"size": { | ||
"type": "integer", | ||
"description": "The size of the embedding vectors." | ||
}, | ||
"ts": { | ||
"type": "string", | ||
"description": "Timestamp indicating when the embedding was created (e.g., '2024-10-09T09:29:02Z').", | ||
"format": "date-time" | ||
} | ||
}, | ||
"required": [ | ||
"ci_id", | ||
"embedding", | ||
"model_id", | ||
"size" | ||
] | ||
} |