diff --git a/Makefile b/Makefile index 2e3063a..83f549d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ tests: jsonschema -V Draft202012Validator -i examples/entities/example1.json json/entities/entities.schema.json && $(print-test-ok)|| $(print-test-failed) jsonschema -V Draft202012Validator -i examples/entities/example2.json json/entities/entities.schema.json && $(print-test-ok)|| $(print-test-failed) jsonschema -V Draft202012Validator -i examples/versioning_manifest/canonical_v0-0-1.json json/versioning/manifest.schema.json && $(print-test-ok)|| $(print-test-failed) + jsonschema -V Draft202012Validator -i examples/doc_embeddings/example0.json json/embeddings/embeddings-docs.schema.json && $(print-test-ok)|| $(print-test-failed) jsonschema -V Draft202012Validator -i examples/text_reuse/tr_cluster_example.json json/text_reuse/cluster.schema.json && $(print-test-ok)|| $(print-test-failed) jsonschema -V Draft202012Validator -i examples/text_reuse/tr_passage_example.json json/text_reuse/passage.schema.json && $(print-test-ok)|| $(print-test-failed) @@ -30,6 +31,7 @@ documentation: jsonschema2md -d json/topic_model/ --header false -n -v 06 -o docs -x - -s propTable jsonschema2md -d json/language_identification/ --header false -n -v 06 -o docs -x - -s propTable jsonschema2md -d json/entities/ --header false -n -v 2020-12 -o docs -x - -s propTable + jsonschema2md -d json/embeddings/ --header false -n -v 2020-12 -o docs -x - -s propTable #jsonschema2md -d json/linguistic_annotation/ -n -v 06 -o docs jsonschema2md -d json/versioning/ --header false -n -v 2024-02 -o docs -x - -s propTable jsonschema2md -d json/text_reuse/ --header false -n -v 2024-09 -o docs -x - -s propTable diff --git a/examples/doc_embeddings/example0.json b/examples/doc_embeddings/example0.json new file mode 100644 index 0000000..8f6b3a9 --- /dev/null +++ b/examples/doc_embeddings/example0.json @@ -0,0 +1,8 @@ +{ + "ci_id": "ACI-1832-01-01-a-i0008", + "embedding": [-0.05433, 0.04184, -0.00738, 
-0.00657, -0.06034, -0.01607, -0.05556, 0.08453, 0.07772, -0.01196, -0.03563, -0.00958, -0.01124, 0.03449, -0.08652, 0.06759, 0.10285, 0.03633, 0.0571, -0.00303, 0.07577, 0.0264, 0.00647, 0.02725, -0.08444, 0.09184, -0.0081, -0.08844, -0.07227, 0.02242, -0.05825, -0.01509, 0.01469, 0.06798, 0.03602, -0.04174, 0.0307, 0.03843, -0.00755, 0.00523, -0.02464, -0.02938, 0.03339, 0.01921, -0.02223, 0.04466, 0.10512, 0.06621, -0.04799, -0.0218, -0.01523, -0.02657, 0.02348, 0.01323, 0.02457, 0.07768, -0.11698, -0.01842, 0.13253, -0.03065, 0.00352, -0.0023, -0.05941, 0.02977, -0.08459, -0.03759, 0.03472, -0.02808, 0.12777, 0.00775, -0.02081, 0.03519, -0.01566, 0.02641, 0.02085, 0.02638, -0.00894, 0.02055, 0.01598, 0.02668, -0.04014, 0.01222, -3e-05, -0.00463, -0.05493, -0.0342, -0.00623, -0.04466, 0.00919, 0.00178, -0.00946, 0.00841, -0.0293, -0.01731, 0.02215, 0.07643, -0.02389, 0.02817, 0.06009, 0.02602, -0.02615, 0.02061, 0.01523, 0.0138, 0.03734, 0.02202, -0.01992, 0.01417, -0.0016, -0.04515, -0.06166, -0.03551, -0.05689, -0.03094, -0.00566, 0.06585, 0.02034, -0.00577, 0.02996, 0.03991, -0.00438, 0.00988, -0.06948, -0.03117, -0.11828, -0.008, -0.00863, -0.04799, -0.00856, -0.05224, 0.0863, 0.02242, 0.08237, -0.02452, 0.01142, -0.01346, 0.00498, 0.09867, 0.02224, 0.02777, -0.02734, 0.03227, -0.06435, 0.00116, 0.05238, 0.02205, 0.00195, -0.03677, 0.00527, 0.05112, -0.03449, 0.01328, 0.00983, 0.0437, 0.01913, -0.02824, 0.03883, 0.01167, -0.03162, -0.04115, 0.03647, 0.0552, 0.01168, -0.03536, -0.01671, -0.01974, 0.04855, 0.00917, 0.02572, -0.01037, -0.06124, 0.03046, 0.00619, -0.02053, -0.01667, 0.04677, -0.00325, 0.02759, -0.02823, 0.06654, 0.03658, -0.05173, -0.03142, -0.03645, 0.06498, -0.03103, 0.01181, 0.00387, -0.01095, -0.03401, -0.03393, 0.00877, -0.07456, 0.04522, 0.01134, -0.10758, -0.06759, 0.02662, 0.02509, -0.04507, 0.01905, 0.03268, -0.02165, 0.02796, -0.00056, 0.0026, 0.00562, -0.05173, -0.06088, -0.02134, 0.05009, -0.02788, 0.0142, 0.01435, -0.03656, 
-0.01422, -0.03593, -0.01214, 0.00796, -0.06529, 0.01242, -0.00229, -0.05051, -0.03657, 0.0234, 0.02014, -0.0018, 0.00586, -0.00554, -0.02147, -0.01768, -0.03864, 0.00847, 0.03976, -0.00096, -0.06175, -0.00715, 0.00424, -0.00917, 0.00693, 0.02248, 0.03038, 0.04416, -0.01312, 0.00146, -0.01952, -0.02847, -0.01951, 0.02354, -0.02947, -0.02443, 0.03555, -0.04106, 0.0587, -0.02594, 0.02111, 0.02082, 0.02106, -0.00256, -0.02473, -0.00253, -0.01338, 0.00727, -0.042, 0.02705, -0.11198, 0.00297, 0.01143, -0.00445, -0.0386, 0.00294, 0.04692, 0.01826, 0.06643, -0.02297, 0.00826, -0.02779, 0.02786, -0.01309, 0.00668, 0.01475, -0.02275, -0.01241, -0.02125, -0.00632, -0.00459, 0.04441, -0.0013, -0.01202, 0.03816, -0.01377, -0.0057, -0.03704, 0.02819, 0.03896, -0.05356, -0.03926, -0.01965, 0.01717, -0.0066, -0.05163, -0.04674, 0.01177, -0.0154, -0.03243, -0.01316, -0.05924, -0.02726, -0.00209, -0.00126, -0.02853, 0.00604, 0.03572, -0.04777, -0.01188, 0.04027, 0.00022, 0.0402, -0.05849, -0.01346, 0.00116, 0.07419, -0.02404, 0.01465, -0.02477, -0.01619, 0.02718, 0.03736, -0.05165, -0.02273, -0.03534, 0.01807, 0.0279, -0.084, 0.02632, 0.02391, 0.0464, -0.01303, 0.00964, 0.03324, 0.05655, 0.047, -0.04677, 0.00914, 0.00818, -0.01597, 0.08388, -0.01151, 0.02417, 0.00895, 0.00644, -0.0411, -0.00416, 0.03934, 0.0611, -0.02604, -0.02569, 0.01736, -0.02914, 0.0572, -0.02528, -0.03775, 0.01626, 0.03428, 0.02094, -0.01617, -0.00501, 0.03217, -0.04425, 0.0288, -0.02014, 0.00805, -0.02903, 0.09169, -0.03683, 0.02041, 0.01651, 0.01651, 0.01081, -0.02133, 0.00965, 0.05579, 0.03764, -0.03257, 0.02838, -0.01372, 0.00091, -0.00905, -0.00732, 0.00073, 0.04101, -0.02635, -0.03142, 0.01478, 0.02056, -0.0139, -0.00147, 0.027, -0.04169, 0.01208, -0.00738, 0.03467, 0.0241, 0.0687, 0.05998, 0.06154, -0.00574, -0.04092, -0.05103, -0.01536, 0.05188, 0.01329, 0.0034, -0.04556, 0.07287, 0.0223, 0.03062, 0.06213, 0.16062, -0.01824, 0.02829, 0.02414, 0.0215, 0.00548, -0.01299, -0.00635, 0.03498, 0.04476, 
-0.05471, 0.02231, -0.01392, 0.03889, 0.00057, -0.05781, -0.0309, -0.0303, -0.05699, -0.00907, 0.02278, -0.0038, -0.02807, -0.00425, 0.01968, -0.07242, -0.02723, 0.02706, 0.02483, 0.03949, -0.01408, 0.02869, -0.01038, 0.00981, 0.00952, -0.02354, 0.05632, -0.04016, -0.04395, 0.01178, 0.00859, -0.02142, -0.00894, 0.03136, -0.00288, -0.01722, 0.00276, 0.0415, 0.00117, 0.01905, -0.00092, 0.03379, -0.00026, 0.02454, 0.02602, 0.00099, -0.01073, 0.07699, -0.072, 0.01123, 0.00217, 0.03956, -0.0169, 0.03267, -0.01225, -0.04287, -0.00496, 0.02931, -0.00554, -0.01867, -0.00269, 0.04555, 0.05982, 0.01631, -0.00021, -0.00367, 0.03097, 0.0241, 0.00546, 0.00281, 0.00443, 0.01515, -0.02205, -0.05541, 0.04205, 0.0195, 0.01023, 0.00682, 0.0991, 0.0079, -0.01082, -0.02739, -0.01572, -0.02014, 0.02, -0.06989, -0.03337, 0.00256, -0.01917, 0.06411, 0.00194, -0.02602, 0.02387, 0.00912, -0.00754, 0.02813, 0.06901, 0.00301, 0.01226, 0.05736, 0.03043, -0.00597, -0.00239, 0.02977, 0.01278, -0.01999, 0.02324, 0.04084, 0.00758, 0.04383, 0.01144, 0.01069, -0.02324, -0.01259, -0.03113, -0.00386, -0.00379, 0.02368, 0.02004, -0.0641, -0.01724, -0.02068, 0.01396, 0.03581, 0.03455, 0.01617, -0.04573, -0.0384, 0.04505, -0.01241, -0.04088, 0.0365, -0.00811, -0.02268, 0.04856, -0.05889, 0.00682, 0.02481, 0.0249, -0.03197, -0.00231, 0.01089, 0.05358, -0.0031, 0.03965, 0.00809, -0.0426, -0.03546, -0.02799, -0.00435, -0.05234, 0.04881, -0.03847, -0.04747, -0.02357, -0.01257, 0.02776, 0.00383, -0.02063, -0.02037, -0.01896, -0.00941, -0.00489, 0.01582, 0.04962, -0.03766, 0.02395, -0.02564, -0.00566, -0.05643, 0.03247, 0.03744, 0.00785, -0.02159, 0.04823, 0.03578, 0.03575, 0.00185, -0.01605, -0.03539, -0.01974, 0.01, -0.00304, -0.01292, 0.03748, 0.00777, -0.01643, 0.00695, 0.00362, -0.03999, -0.01774, -0.00453, -0.04929, -0.03009, 0.02241, -0.03559, -0.01247, 0.00749, -0.01612, -0.0425, -0.07628, -0.00563, -0.00723, -0.02974, -0.01406, 0.00378, -0.03151, -0.02954, -0.06575, 0.02843, -0.01653, -0.04686, 
-0.05174, 0.05859, 0.03222, -0.01524, 0.05375, -0.02765, 0.0208, 0.05737, -0.01105, 0.02933, 0.01295, -0.00076, 0.01235, 0.02216, -0.00092, 0.00404, -0.01133, 0.0346, -0.0001, 0.04614, 0.01448, 0.01508, 0.0558, -0.03253, -0.05316, -0.00792, 0.03195, 0.05999, 0.00632, 0.02159, 0.01859, -0.01584, -0.08758, -0.03676, 0.04501, 0.00664, -0.02137, -0.02505, -0.08833, 0.00535, -0.00257, -0.00604, -0.00924, 0.01935, -0.03717, 0.03293, 0.04251, 0.02721, 0.01805, 0.03488, -0.00588, -0.02767, -0.01797, 0.00621, -0.00871, -0.00759, 0.0033, -0.00706, 0.01897, -0.05011, -0.01175, -0.00019, 0.03811, 0.02589, 0.06485, 0.04662, 0.03656, -0.00635, 0.02969, -0.03077, -0.01858, 0.01558, -0.01998, 0.00821, -0.00609, -0.02141, -0.03912, -0.01554, -0.04338, 0.09217, -0.04458, -0.03987, -0.01138, 0.01006, -0.03676, 0.01019, -0.01662, 0.0356, 0.01285, 0.03716, -0.01335, -0.01028, -0.02966, -0.01276, 0.01826, -0.00642, 0.03965, -0.00848, -0.02455, -0.02248, 0.03888, -0.016, -0.03594, 0.03684, -0.00577, -0.06935, 0.02999, 0.02628, -0.00587, 0.03771, -0.03981, 0.01803, 0.02051, -0.01534, -0.0043, -0.03625, -0.02237, -0.00387, -0.04445, 0.01126, -0.02385, -0.01256, 0.0834, -0.01093, -0.01638, 0.03417, 0.03081, 0.00643], + "size": 926, + "model_id": + "embeddings-average-article-page-chunk-subtoken-4096-8192-gte-multilingual-base-Alibaba-NLP/gte-multilingual-base@f7d567e", + "ts": "2024-10-09T12:09:24Z" +} \ No newline at end of file diff --git a/json/embeddings/embeddings-docs-backup.schema.json b/json/embeddings/embeddings-docs-backup.schema.json new file mode 100644 index 0000000..130dd73 --- /dev/null +++ b/json/embeddings/embeddings-docs-backup.schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://impresso.github.io/impresso-schemas/json/embeddings/embeddings-docs.schema.json", + "title": "Document Embeddings JSON Schema", + "description": "A representation for the vector embedding representation of content items.", + "type": "object", + 
"properties": { + "id": { + "$id": "#/properties/id", + "type": "string", + "title": "The Id Schema", + "description": "The unique identifier for a content item, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json", + "examples": ["actionfem-1940-01-08-a-i0001"], + "pattern": "^(.*)$" + }, + "ts": { + "$id": "#/properties/ts", + "type": "string", + "title": "The Ts Schema", + "description": "The timestamp when the embeddings were created", + "examples": ["2024-08-29T06:42:53Z"], + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\+00:00|Z)$" + }, + "embedder": { + "$id": "#/properties/embedder", + "type": "string", + "title": "The Embedder Schema", + "description": "The model or tool used to generate the embeddings", + "examples": ["Alibaba-NLP/gte-multilingual-base@f7d567e"] + }, + "len": { + "$id": "#/properties/len", + "type": "integer", + "title": "The Length Schema", + "description": "The length of the document in characters.", + "examples": [2976] + }, + "embedding": { + "$id": "#/properties/embedding", + "type": "array", + "title": "The Embedding Schema", + "description": "The vector embeddings of the document", + "items": { + "$id": "#/properties/embedding/items", + "type": "number", + "title": "The Items Schema", + "description": "A single number in the vector embeddings of the document", + "examples": [-0.11429] + } + } + }, + "required": ["id", "ts", "embedder", "embedding"] +} diff --git a/json/embeddings/embeddings-docs.schema.json b/json/embeddings/embeddings-docs.schema.json new file mode 100644 index 0000000..3558635 --- /dev/null +++ b/json/embeddings/embeddings-docs.schema.json @@ -0,0 +1,64 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://impresso.github.io/impresso-schemas/json/embeddings/embeddings-docs.schema.json", + "title": "Document Embeddings JSON Schema", + "description": "A representation for the vector embedding representation 
of content items.", + "type": "object", + "properties": { + "ci_id": { + "type": "string", + "description": "Unique identifier for the content item.", + "examples": ["actionfem-1940-01-08-a-i0001"] + }, + "ci_type": + { + "type": "string", + "description": "Type of content item (e.g., 'article', 'adv').", + "examples": ["article"] + }, + "model_id": { + "type": "string", + "description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." + }, + "embedding": { + "oneOf": [ + { + "type": "array", + "description": "The vector embedding representation of the content item as a single list.", + "items": { + "type": "number", + "description": "A single number in the vector embedding of the document.", + "examples": [-0.11429] + } + }, + { + "type": "array", + "description": "The vector embedding representation of the content item as a list of lists (e.g., for chunked embeddings).", + "items": { + "type": "array", + "items": { + "type": "number", + "description": "A single number in a chunk's vector embedding.", + "examples": [-0.11429] + } + } + } + ] + }, + "size": { + "type": "integer", + "description": "The size of the embedding vectors." + }, + "ts": { + "type": "string", + "description": "Timestamp indicating when the embedding was created (e.g., '2024-10-09T09:29:02Z').", + "format": "date-time" + } + }, + "required": [ + "ci_id", + "embedding", + "model_id", + "size" + ] +}