From b4e7fba3cb863c0c60cc274e44772af3d7fa4fe3 Mon Sep 17 00:00:00 2001 From: Daniele Guido <1181642+danieleguido@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:26:16 +0100 Subject: [PATCH] fix #69 adding levels based on coding and method difficulty --- src/components/App.astro | 5 +- src/components/AuthorCard.tsx | 6 +- src/components/CollectionCard.tsx | 65 ------------------- src/components/NotebookCard.css | 2 +- src/components/NotebookCard.tsx | 28 +++----- src/components/NotebookModal.tsx | 2 +- src/components/NotebookViewer.tsx | 3 +- src/components/NotebooksModal.tsx | 2 +- src/components/RegisterForm.tsx | 1 - .../{CollectionCard.css => SeriesCard.css} | 14 ++-- src/components/SeriesCard.tsx | 54 +++++++++++++++ src/components/Wall.tsx | 64 ++++++++---------- src/constants.ts | 28 ++++++-- src/content/config.ts | 12 +++- src/content/notebooks/impresso-py-maps.mdx | 2 + src/content/notebooks/impresso-py-network.mdx | 3 + ...nguage-identification-with-impresso-hf.mdx | 53 +++++++++++---- .../search-multilingual-docs-impresso-hf.mdx | 54 ++++++++++++--- src/pages/notebooks/[...slug].astro | 16 +---- .../components/NotebookCard.stories.tsx | 2 +- src/types.ts | 41 ++++++++++++ 21 files changed, 275 insertions(+), 182 deletions(-) delete mode 100644 src/components/CollectionCard.tsx rename src/components/{CollectionCard.css => SeriesCard.css} (81%) create mode 100644 src/components/SeriesCard.tsx diff --git a/src/components/App.astro b/src/components/App.astro index 8c0f3b1..8e0c58e 100644 --- a/src/components/App.astro +++ b/src/components/App.astro @@ -7,8 +7,9 @@ import Footer from '../components/Footer'; import { getCollection, getEntry } from 'astro:content'; import { getRecursivelyEntryData } from '../logic' import Modals from './Modals'; -import type { Collection } from './CollectionCard'; + import { SeriesPositionLeadingColumn, SeriesPositionCentralColumn, SeriesPositionTrailingColumn } from '../constants'; +import type { Series } from '../types'; const authors = await getCollection('authors') @@ -26,7 +27,7 @@ for(const seriesEntry of series){ seriesDataIndex[seriesEntry.id]['body'] = seriesEntry.body } -const seriesValues:Collection[] = Object.values(seriesDataIndex) +const seriesValues:Series[] = Object.values(seriesDataIndex) const associatedPartners = await getCollection('associatedPartners') const associatedPartnersData = associatedPartners.map((entry) => { diff --git a/src/components/AuthorCard.tsx b/src/components/AuthorCard.tsx index 47abc07..02a07c4 100644 --- a/src/components/AuthorCard.tsx +++ b/src/components/AuthorCard.tsx @@ -1,8 +1,4 @@ -export interface Author { - id: string - name: string - fullName?: string -} +import type { Author } from "../types" const AuthorCard: React.FC<{ author: Author }> = ({ author }) => { return {author.fullName ?? author.name} diff --git a/src/components/CollectionCard.tsx b/src/components/CollectionCard.tsx deleted file mode 100644 index b6c3db9..0000000 --- a/src/components/CollectionCard.tsx +++ /dev/null @@ -1,65 +0,0 @@ -import React from "react" -import "./CollectionCard.css" -import NotebookCard, { type Notebook } from "./NotebookCard.tsx" -import MarkdownSnippet from "./MarkdownSnippet.tsx" - -export interface Collection { - title: string - excerpt: string - body?: string - cover?: - | { - url: string - alt: string - } - | null - | undefined - category?: string[] - position?: string - notebooks: Notebook[] -} - -/** - * Props for the CollectionCard component. - * - * @interface CollectionCardProps - * @extends {React.HTMLProps} - * - * @property {Collection} collection - The collection data to be displayed in the card. - */ -export interface CollectionCardProps extends React.HTMLProps { - collection: Collection -} - -const CollectionCard: React.FC = ({ - className = "", - collection, - children, -}) => { - const hasCover = collection.cover?.url - return ( -
-
-

{collection.title}

-

{collection.excerpt}

- {collection.body ? : null} - {children} -
-
    - {collection.notebooks.map((notebook) => ( -
  1. - -
  2. - ))} -
- {hasCover && ( -
-
- {collection.cover?.alt} -
- )} -
- ) -} - -export default CollectionCard diff --git a/src/components/NotebookCard.css b/src/components/NotebookCard.css index bb75ab2..6d3d89a 100644 --- a/src/components/NotebookCard.css +++ b/src/components/NotebookCard.css @@ -5,7 +5,7 @@ overflow: hidden; } -.CollectionCard a { +.SeriesCard a { text-decoration: none; } diff --git a/src/components/NotebookCard.tsx b/src/components/NotebookCard.tsx index 7a224df..09629ba 100644 --- a/src/components/NotebookCard.tsx +++ b/src/components/NotebookCard.tsx @@ -1,25 +1,11 @@ -import AuthorCard, { type Author } from "./AuthorCard.tsx" +import AuthorCard from "./AuthorCard.tsx" import "./NotebookCard.css" import Link from "./Link.tsx" import Avatar from "boring-avatars" import { ArrowRight } from "iconoir-react" import { DateTime } from "luxon" - -export interface Notebook { - id: string - href: string - title: string - langModel?: string - excerpt?: string - githubUrl?: string - googleColabUrl?: string - sha?: string - authors: Author[] - date?: Date - seealso?: Notebook[] - showLinks?: boolean - links?: { label: string; href: string }[] -} +import type { Notebook } from "../types.ts" +import { NotebookLevelColors } from "../constants.ts" // const AvatarVariants = ["marble", "beam", "pixel", "sunset", "ring", "bauhaus"] const NotebookCard: React.FC<{ @@ -35,7 +21,8 @@ const NotebookCard: React.FC<{ "- title:", notebook?.title, "notebook.langModel", - notebook.langModel + notebook.langModel, + notebook ) return (
@@ -43,9 +30,12 @@ const NotebookCard: React.FC<{
diff --git a/src/components/NotebookModal.tsx b/src/components/NotebookModal.tsx index 7efdf4d..7b36010 100644 --- a/src/components/NotebookModal.tsx +++ b/src/components/NotebookModal.tsx @@ -1,6 +1,6 @@ import Page from "./Page" -import type { Notebook } from "./NotebookCard" import NotebookViewer from "./NotebookViewer" +import type { Notebook } from "../types" interface NotebookModalProps { notebook: Notebook diff --git a/src/components/NotebookViewer.tsx b/src/components/NotebookViewer.tsx index ed20533..85ab9cd 100644 --- a/src/components/NotebookViewer.tsx +++ b/src/components/NotebookViewer.tsx @@ -2,14 +2,13 @@ import React from "react" import { Col, Container, Row } from "react-bootstrap" import CodeSnippet from "./CodeSnippet" import MarkdownSnipped from "./MarkdownSnippet" -import type { Notebook } from "./NotebookCard" import NotebookCard from "./NotebookCard" import AuthorCard from "./AuthorCard" import Alert from "./Alert" import { DateTime } from "luxon" import "./NotebookViewer.css" import { OverlayTrigger, Tooltip } from "react-bootstrap" -import type { CellInfo } from "../types" +import type { CellInfo, Notebook } from "../types" import { ModelLanguagesLabels } from "../constants" export interface NotebookViewerProps { diff --git a/src/components/NotebooksModal.tsx b/src/components/NotebooksModal.tsx index 55b8ddd..9a17f2b 100644 --- a/src/components/NotebooksModal.tsx +++ b/src/components/NotebooksModal.tsx @@ -1,7 +1,7 @@ import Page from "./Page" -import type { Notebook } from "./NotebookCard" import NotebookCard from "./NotebookCard" import { Col, Container, Row } from "react-bootstrap" +import type { Notebook } from "../types" interface NotebookModalProps { notebooks: Notebook[] diff --git a/src/components/RegisterForm.tsx b/src/components/RegisterForm.tsx index 55735b6..2a5d73b 100644 --- a/src/components/RegisterForm.tsx +++ b/src/components/RegisterForm.tsx @@ -7,7 +7,6 @@ import { PlanEducational, PlanLabels, BrowserViewTermsOfUse, - PlanResearcherPlus, } from "../constants" import { useBrowserStore, usePersistentStore } from "../store" import { DateTime } from "luxon" diff --git a/src/components/CollectionCard.css b/src/components/SeriesCard.css similarity index 81% rename from src/components/CollectionCard.css rename to src/components/SeriesCard.css index b9fbd78..1bfb557 100644 --- a/src/components/CollectionCard.css +++ b/src/components/SeriesCard.css @@ -1,4 +1,4 @@ -.CollectionCard { +.SeriesCard { position: relative; overflow: hidden; border-radius: var(--impresso-border-radius-lg); @@ -6,25 +6,25 @@ background-color: rgba(255, 255, 255, 0.29); } -.CollectionCard ol { +.SeriesCard ol { padding: 0; margin: 0; list-style: none; } -.CollectionCard h2, -.CollectionCard h3 { +.SeriesCard h2, +.SeriesCard h3 { font-size: inherit; font-weight: var(--impresso-wght-bold); font-variation-settings: "wght" var(--impresso-wght-bold); } -.CollectionCard h2 { +.SeriesCard h2 { font-size: 1.5rem; margin: 0; font-variation-settings: "wght" var(--impresso-wght-medium); } -.CollectionCard .map-bg img { +.SeriesCard .map-bg img { position: absolute; top: 0; left: 0; @@ -35,7 +35,7 @@ object-fit: cover; } -.CollectionCard .overlay { +.SeriesCard .overlay { position: absolute; z-index: -1; top: 0; diff --git a/src/components/SeriesCard.tsx b/src/components/SeriesCard.tsx new file mode 100644 index 0000000..cf4faf4 --- /dev/null +++ b/src/components/SeriesCard.tsx @@ -0,0 +1,54 @@ +import React from "react" +import "./SeriesCard.css" +import NotebookCard from "./NotebookCard.tsx" +import MarkdownSnippet from "./MarkdownSnippet.tsx" +import type { Series } from "../types.ts" + +/** + * Props for the SeriesCard component. + * + * @interface SeriesCardProps + * @extends {React.HTMLProps} + * + * @property {Series} series - The series data to be displayed in the card. + */ +export interface SeriesCardProps extends React.HTMLProps { + series: Series +} + +const SeriesCard: React.FC = ({ + className = "", + series, + children, +}) => { + if (!series) { + console.error("[SeriesCard] - series is not defined") + return null + } + const hasCover = series.cover?.url + return ( +
+
+

{series.title}

+

{series.excerpt}

+ {series.body ? : null} + {children} +
+
    + {series.notebooks.map((notebook) => ( +
  1. + +
  2. + ))} +
+ {hasCover && ( +
+
+ {series.cover?.alt} +
+ )} +
+ ) +} + +export default SeriesCard diff --git a/src/components/Wall.tsx b/src/components/Wall.tsx index a7bd83f..fcf472c 100644 --- a/src/components/Wall.tsx +++ b/src/components/Wall.tsx @@ -1,9 +1,9 @@ import { Col, Container, Row } from "react-bootstrap" -import CollectionCard, { type Collection } from "./CollectionCard" +import SeriesCard from "./SeriesCard" import CodeSnippet from "./CodeSnippet" import { useEffect } from "react" -import Link from "./Link" import GettingStarted from "./GettingStarted" +import type { Series } from "../types" const CodeSample = `# Install the impresso library %pip install impresso @@ -40,11 +40,11 @@ const Wall = ({ numberOfAuthors?: number numberOfNotebooks?: number numberOfSeries?: number - seriesInTrailingColumn?: Collection[] - seriesInLeadingColumn?: Collection[] - seriesInCentralColumn?: Collection[] - enterImpressoPy: Collection - enterImpressoModels: Collection + seriesInTrailingColumn?: Series[] + seriesInLeadingColumn?: Series[] + seriesInCentralColumn?: Series[] + enterImpressoPy: Series + enterImpressoModels: Series scrollToTop?: boolean }) => { useEffect(() => { @@ -52,6 +52,16 @@ const Wall = ({ window.scrollTo(0, 0) } }, [scrollToTop]) + + console.info( + "[Wall] - numberOfAuthors:", + numberOfAuthors, + "- numberOfNotebooks:", + numberOfNotebooks, + "- numberOfSeries:", + numberOfSeries + ) + return (
@@ -64,14 +74,6 @@ const Wall = ({

Programmatic access to Impresso's Corpus, Data and Models

-

- We collected {numberOfNotebooks}{" "} - - Jupyter Ipynb notebooks - {" "} - in {numberOfSeries} series, maintained and developed by{" "} - {numberOfAuthors} authors. -

@@ -114,43 +116,31 @@ const Wall = ({ - +

Copy the code below in a blank jupyter notebook to get started

-
- {seriesInLeadingColumn.map((collection) => ( - + + {seriesInLeadingColumn.map((series) => ( + ))} - {seriesInCentralColumn.map((collection) => ( - + {seriesInCentralColumn.map((series) => ( + ))} - +

Copy the code below in a blank jupyter notebook to get started

-
- {seriesInTrailingColumn.map((collection) => ( - +
+ {seriesInTrailingColumn.map((series) => ( + ))}
diff --git a/src/constants.ts b/src/constants.ts index bba5f5d..e5bb386 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -253,10 +253,28 @@ export const ModelLanguagesLabels: Record = { "en-fr-de": "English, French, German", } +export const NotebookLevelBeginner = "beginner" +export const NotebookLevelApprentice = "apprentice" +export const NotebookLevelIntermediate = "intermediate" +export const NotebookLevelAdvanced = "advanced" + export const NotebookLevels = [ - "beginner", - "expert-in-methods", - "skilled-in-methods", - "expert-in-domain", - "skilled-in-domain", + NotebookLevelBeginner, + NotebookLevelApprentice, + NotebookLevelIntermediate, + NotebookLevelAdvanced, ] + +export const NotebookLevelLabels: Record = { + [NotebookLevelBeginner]: "Beginner", + [NotebookLevelApprentice]: "Apprentice", + [NotebookLevelIntermediate]: "Intermediate", + [NotebookLevelAdvanced]: "Advanced", +} + +export const NotebookLevelColors: Record = { + [NotebookLevelBeginner]: ["#98FB98", "#c7EA46"], + [NotebookLevelApprentice]: ["#29AB87", "#C7EA46"], + [NotebookLevelIntermediate]: ["#01796F", "#29AB87"], + [NotebookLevelAdvanced]: ["#87015a"], +} diff --git a/src/content/config.ts b/src/content/config.ts index c358ff0..f03644a 100644 --- a/src/content/config.ts +++ b/src/content/config.ts @@ -13,6 +13,7 @@ import { PlanNone, PlanEducational, NotebookLevels, + NotebookLevelBeginner, } from "../constants" const CorpusAccessUserPlansToPlan: Record = { @@ -168,7 +169,16 @@ const notebooks = defineCollection({ // note: this prevents circular reference // BEFORE: seealso: z.array(z.lazy(() => reference("notebooks"))).optional(), seealso: z.array(z.string()).optional(), - level: z.enum(NotebookLevels as any).default("beginner"), + // levels + levels: z + .object({ + coding: z.enum(NotebookLevels as any).default(NotebookLevelBeginner), + method: z.enum(NotebookLevels as any).default(NotebookLevelBeginner), + }) + .default({ + coding: NotebookLevelBeginner, + method: NotebookLevelBeginner, + }), // seealso: z.array(z.lazy(() => reference("notebooks"))).optional(), }), }) diff --git a/src/content/notebooks/impresso-py-maps.mdx b/src/content/notebooks/impresso-py-maps.mdx index 04536f9..d07553d 100644 --- a/src/content/notebooks/impresso-py-maps.mdx +++ b/src/content/notebooks/impresso-py-maps.mdx @@ -14,6 +14,8 @@ links: seealso: - impresso-py-search excerpt: This notebook provides a way to analyze and explore the geographic distribution of entities mentioned in Impresso using the Impresso Python Library. +levels: + method: apprentice --- {/* cell:0 cell_type:markdown */} diff --git a/src/content/notebooks/impresso-py-network.mdx b/src/content/notebooks/impresso-py-network.mdx index b947d69..268f741 100644 --- a/src/content/notebooks/impresso-py-network.mdx +++ b/src/content/notebooks/impresso-py-network.mdx @@ -12,6 +12,9 @@ links: label: Prague Spring seealso: - impresso-py-search +levels: + coding: beginner + method: intermediate --- {/* cell:0 cell_type:markdown */} diff --git a/src/content/notebooks/language-identification-with-impresso-hf.mdx b/src/content/notebooks/language-identification-with-impresso-hf.mdx index 45d521e..a0ca314 100644 --- a/src/content/notebooks/language-identification-with-impresso-hf.mdx +++ b/src/content/notebooks/language-identification-with-impresso-hf.mdx @@ -14,10 +14,22 @@ excerpt: This notebook demonstrates language identification using a pre-trained importance of language identification in broader NLP applications. This approach allows for more accurate and effective language processing across various scenarios. +levels: + coding: apprentice + method: intermediate --- {/* cell:0 cell_type:markdown */} -Open In Colab + + + Open In Colab + {/* cell:1 cell_type:markdown */} @@ -25,41 +37,46 @@ This notebook demonstrates how to use a pre-trained Floret language identificati We'll load the model, input some text, and predict the language of the text. ## What is this notebook about? + This notebook provides a hands-on demonstration of **language identification** (LID) using our Impresso LID model from Hugging Face. We will explore how to download and use this model to predict the language of Impresso-like text inputs. This notebook walks through the necessary steps to set up dependencies, load the model, and implement it for practical language identification tasks. ## What will you learn in this notebook? + By the end of this notebook, you will: + - Understand how to install and configure the required libraries (`floret` and `huggingface_hub`). - Learn to load our trained Floret language identification model from Hugging Face. - Run the model to predict the dominant language (or the mix of languages) of a given text input. - Gain insight into the core functionality of language identification using machine learning models. {/* cell:2 cell_type:markdown */} + ## 1. Install Dependencies First, we need to install `floret` and `huggingface_hub` to work with the Floret language identification model and Hugging Face. - {/* cell:3 cell_type:code */} + ```python !pip install floret !pip install huggingface_hub ``` {/* cell:4 cell_type:markdown */} + ## 2. Model Information In this example, we are using a language identification model hosted on the Hugging Face Hub: `impresso-project/impresso-floret-langident`. The model can predict the language of a given text of a reasonable length and supports the main impresso languages: German (de), French (fr), Luxemburgish (lb), Italian (it), English (en) - {/* cell:5 cell_type:markdown */} + ## 3. Defining the FloretLangIdentifier Class This class downloads the Floret model from Hugging Face and loads it for prediction. We use `huggingface_hub` to download the model locally. - {/* cell:6 cell_type:code */} + ```python from huggingface_hub import hf_hub_download import floret @@ -146,15 +163,17 @@ class FloretLangIdentifier: ``` {/* cell:7 cell_type:markdown */} + ## 4. Using the Model for Prediction Now that the model is loaded, you can input your own text and predict the language. - {/* cell:8 cell_type:markdown */} + ### 4.1 Predict the main language of a document {/* cell:9 cell_type:code */} + ```python # Define the repository and model file repo_id = "impresso-project/impresso-floret-langident" @@ -172,10 +191,11 @@ print("Language:", result) ``` {/* cell:10 cell_type:markdown */} -### 4.2 Predict the language mix of a document +### 4.2 Predict the language mix of a document {/* cell:11 cell_type:code */} + ```python # Multi-output for predicting mixed-language documents # Example text for prediction @@ -187,10 +207,11 @@ print("Language mix:", result) ``` {/* cell:12 cell_type:markdown */} -### 4.3 Predict the language mix of an impresso document +### 4.3 Predict the language mix of an impresso document {/* cell:13 cell_type:code */} + ```python # source: https://impresso-project.ch/app/issue/onsjongen-1945-03-03-a/view?p=1&articleId=i0001&text=1 text = " Lëtzeburger Zaldoten traine'èren an England Soldats luxembourgeois à l’entraînement en Angleterre" @@ -201,9 +222,11 @@ print("Language mix:", result) ``` {/* cell:14 cell_type:markdown */} + ### 4.4 Interactive mode {/* cell:15 cell_type:code */} + ```python # Interactive text input text = input("Enter a sentence for language identification: ") @@ -212,17 +235,19 @@ print("Prediction Result:", result) ``` {/* cell:16 cell_type:markdown */} + ## 5. Why is Language identification important? An example Many NLP models are trained on data from certain languages. For applying any further NLP processing, we often need to know the language. Let us visit a concrete example: Say that we want to count the nouns in a text. For this we load a NLP-processor from the popular spacy-library, that (i.a.) splits the text and tags our words with so-called part-of-speech-tags. - {/* cell:17 cell_type:markdown */} + ### 5.1 Build a simple Noun counter class {/* cell:18 cell_type:code */} + ```python class NounCounter: @@ -254,9 +279,11 @@ class NounCounter: ``` {/* cell:19 cell_type:markdown */} + ### 5.2 Noun counter: A first naive test {/* cell:20 cell_type:code */} + ```python # Example text for prediction text = "Das ist ein Testdokument. Ein Mann geht mit einem Hund im Park spazieren." @@ -275,6 +302,7 @@ print("Text: \"{}\"\nNoun-count: {}".format(text, counter.count_nouns(text))) ``` {/* cell:21 cell_type:markdown */} + ### 5.3 Noun counter: A second test {/* cell:22 cell_type:markdown */} @@ -283,6 +311,7 @@ Now let us assume that we would know the language of the input document: German. This would let us load a default German spacy model. {/* cell:23 cell_type:code */} + ```python # Need to download the German model spacy.cli.download("de_core_news_sm") @@ -298,13 +327,14 @@ print("Text: \"{}\"\nNoun-count: {}".format(text, counter.count_nouns(text))) ``` {/* cell:24 cell_type:markdown */} -### 5.4 Noun counter: Combining our knowledge +### 5.4 Noun counter: Combining our knowledge {/* cell:25 cell_type:markdown */} We use our insights to build a language-informed spacy loader that uses our language identifier! {/* cell:26 cell_type:code */} + ```python class LanguageAwareSpacyLoader: @@ -355,6 +385,7 @@ class LanguageAwareSpacyLoader: Let's try it {/* cell:28 cell_type:code */} + ```python # We initialize our language aware spacy loader loader = LanguageAwareSpacyLoader(model) @@ -372,8 +403,8 @@ print("Noun-count: {}".format(counter.count_nouns(text))) {/* cell:29 cell_type:markdown */} Let's start the interactive mode again. Input any text in some language, and the two-step model (lang-id + nlp) will count its nouns. - {/* cell:30 cell_type:code */} + ```python text = input("Enter a sentence for Noun counting: ") nlp = loader.load(text) @@ -382,9 +413,9 @@ print("Noun-count: {}".format(counter.count_nouns(text))) ``` {/* cell:31 cell_type:markdown */} + ## 6. Summary and Next Steps In this notebook, we used a pre-trained Floret language identification model to predict the language of input text. You can modify the input or explore other models from Hugging Face. Feel free to try other texts, or languages to experiment with the model. - diff --git a/src/content/notebooks/search-multilingual-docs-impresso-hf.mdx b/src/content/notebooks/search-multilingual-docs-impresso-hf.mdx index 2f3a338..49c7cc2 100644 --- a/src/content/notebooks/search-multilingual-docs-impresso-hf.mdx +++ b/src/content/notebooks/search-multilingual-docs-impresso-hf.mdx @@ -7,6 +7,9 @@ sha: 413491d26bddf2b9c04cc45481be4664661852ae date: 2024-10-29T10:33:05Z links: [] googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/annotate/search_multilingual_docs-ImpressoHF.ipynb +levels: + method: advanced + coding: intermediate --- {/* cell:0 cell_type:markdown */} @@ -17,21 +20,20 @@ We'll load the model, embed the texts and demonstrate use cases on how to find r Reccomended Hardware: GPU support, the colab free one (T4) is sufficient. Alternatively, calculations with CPU are possible but much slower. - - - {/* cell:1 cell_type:markdown */} + ## 1. Install Dependencies First, we need to install `sentence-transformers` - {/* cell:2 cell_type:code */} + ```python !pip install sentence-transformers ``` {/* cell:3 cell_type:markdown */} + ## 2. Model Information In this example, we are using an off the shelf multilingual embedding model hosted on Huggingface: `gte-multilingual-base'. @@ -40,13 +42,14 @@ Note: Newer impresso version of the model is in the works. This model predicts an embedding representation (list of numbers that stores the "meaning") of a given text (sentence, paragraph, article) that can be used to measure similarity between two texts. - {/* cell:4 cell_type:markdown */} + ## 3. Loading the embedding model This class downloads the model from Hugging Face and loads it ready for prediction. We use the SentenceTransformers library to benefit from their functionality and documentation. {/* cell:5 cell_type:code */} + ```python from sentence_transformers import SentenceTransformer from scipy.spatial.distance import cosine @@ -55,15 +58,18 @@ embedding_model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust ``` {/* cell:6 cell_type:markdown */} + ### Simple Test {/* cell:7 cell_type:code */} + ```python sentence1_en = "This is an example test sentence" sentence2_en = "This constitutes a sample sentence" ``` {/* cell:8 cell_type:code */} + ```python embedding1_en = embedding_model.encode(sentence1_en) embedding2_en = embedding_model.encode(sentence2_en) @@ -78,6 +84,7 @@ Those numbers look intriguing, but do they really mean something? Answer: Yes, they can show us the similarity of the two texts {/* cell:10 cell_type:code */} + ```python similarity_value = round(1 - cosine(embedding1_en, embedding2_en),2) print("Sentence1 and Sentence2 have a cosine similarity of " + str(similarity_value)) @@ -93,44 +100,53 @@ The higher the cosine similarity, the more similar the two texts are. Range of w Based on our experiments on contemporary texts, cosine similarity of 0.85+ means the two texts are mostly equivalent {/* cell:12 cell_type:markdown */} + ### Simple Test Across Languages {/* cell:13 cell_type:code */} + ```python sentence1_de = "Das ist ein Beispieltestsatz" ``` {/* cell:14 cell_type:code */} + ```python embedding1_de = embedding_model.encode(sentence1_de) ``` {/* cell:15 cell_type:code */} + ```python similarity_value = round(1 - cosine(embedding1_en, embedding1_de),2) print("Sentence1 in English and Sentence1 in German have a cosine similarity of " + str(similarity_value)) ``` {/* cell:16 cell_type:markdown */} + ### Try your own similarity calculation {/* cell:17 cell_type:code */} + ```python input1 = input() ``` {/* cell:18 cell_type:code */} + ```python input2 = input() ``` {/* cell:19 cell_type:code */} + ```python embedding1 = embedding_model.encode(input1) embedding2 = embedding_model.encode(input2) ``` {/* cell:20 cell_type:code */} + ```python similarity_value = round(1 - cosine(embedding1, embedding2),2) print("Input1 and Input2 have a cosine similarity of " + str(similarity_value)) @@ -140,33 +156,35 @@ print("Input1 and Input2 have a cosine similarity of " + str(similarity_value)) Note: You can also calculate the similarity of inputs from different languages {/* cell:22 cell_type:markdown */} + ## 4. Finding similar texts within collections using the embedding model Now that we have seen how the model creates a representation and how we can use it to get the similarity of two texts, let's apply it to a couple of collections. {/* cell:23 cell_type:markdown */} + ## Setting up utility functions Here we setup utilities functions that we use for later. You can safely ignore details of the implementation. You simply need to have the code cell executed For these functions we just need to have a high level understanding of the following key information: -***create_embedding_collection(texts, embedding_model)*** +**_create_embedding_collection(texts, embedding_model)_** This function creates a collection of embeddings from a list of sentences, texts, using the embedding model which we already loaded. It outputs a list of tuples pairing each text with its embedding -***find_best_match_in_collection(source_collection, target_collection)*** +**_find_best_match_in_collection(source_collection, target_collection)_** This function finds the most similar text in a target_collection for each text in a source_collection based on their precomputed embeddings. Each collection should be a list of tuples (direct output of the create_embedding_collection function) where each tuple contains a sentence and its embedding. -***print_matches_formatted(matches, link=False, threshold=0)*** +**_print_matches_formatted(matches, link=False, threshold=0)_** This function formats and prints each match from the matches output of find_best_match_in_collection. It takes an optional threshold value to display only matches with a similarity above a certain value. - Example usage can be found on Section 4.1 {/* cell:24 cell_type:code */} + ```python from scipy.spatial.distance import cosine @@ -274,12 +292,14 @@ def print_matches_formatted(matches, link=False, threshold=0): ``` {/* cell:25 cell_type:markdown */} + ## 4.1 Searching in a Dummy Sentence Level Collection {/* cell:26 cell_type:markdown */} Here we create a sample sentence text collection to see what the model matches as most similar using minimal additional code. {/* cell:27 cell_type:code */} + ```python german_sentences = [ "Mit diesen drei Kernkraftwerken wird die Schweiz 1972 die höchste installierte nukleare Kapazität pro Kopf der Bevölkerung aller kontinentaleuropäischer Länder aufweisen .", @@ -300,6 +320,7 @@ french_collection = create_embedding_collection(french_sentences, embedding_mode ``` {/* cell:28 cell_type:code */} + ```python # Example of finding and printing the best matches matches = find_best_match_in_collection(source_collection=german_collection, target_collection=french_collection) # Find best matches @@ -307,6 +328,7 @@ print_matches_formatted(matches) # Print the matches ``` {/* cell:29 cell_type:markdown */} + ## 4.2 Searching in an Article collection exported from the interface {/* cell:30 cell_type:markdown */} @@ -314,8 +336,8 @@ Here we are working with collections exported directly from the Impresso Interfa In this example, we work on article level and filter for articles with a minimum length of 2000 characters. You can customise this filter by changing the parameter minimum_characters_in_article or pre-filter your dataframe in any way you please before providing it to the function. - {/* cell:31 cell_type:code */} + ```python def interface_exported_csv_to_collection(df, embedding_model, batch_size=16, minimum_characters_in_article=2000): """ @@ -359,6 +381,7 @@ def interface_exported_csv_to_collection(df, embedding_model, batch_size=16, min I setup the file on my google drive so you can also replicate with these files using this code. For another collection, you can simply drag and drop the files into the file interface of the notebook {/* cell:33 cell_type:code */} + ```python !pip install gdown import gdown @@ -372,6 +395,7 @@ gdown.download(f"https://drive.google.com/uc?export=download&id={file_id_german} ``` {/* cell:34 cell_type:code */} + ```python import pandas as pd @@ -381,6 +405,7 @@ marie_curie_df_french = pd.read_csv("mariecurie_french.csv", sep=";") ``` {/* cell:35 cell_type:code */} + ```python marie_curie_german_collection = interface_exported_csv_to_collection(marie_curie_df_german, embedding_model, minimum_characters_in_article=2000) print("German articles prepared: " + str(len(marie_curie_german_collection))) @@ -392,6 +417,7 @@ print("French articles prepared: " + str(len(marie_curie_french_collection))) Now we have reached the same data format and so we can use the same utility functions as before. An addition to earlier I add a hyperlink to the interface so we can browse all the details. To do so, we set the value of "link" to True. {/* cell:37 cell_type:code */} + ```python # Example of finding and printing the best matches matches = find_best_match_in_collection(source_collection=marie_curie_french_collection, target_collection=marie_curie_german_collection, link=True) # Find best matches @@ -402,6 +428,7 @@ print_matches_formatted(matches, link=True, threshold=0.70) # Print the matches U might want to save the results into a csv file, here is a utility function to do that. Within Google colab, just download the resulting file as an extra step. {/* cell:39 cell_type:code */} + ```python def save_matches_to_csv(matches, filename, link=False, threshold=0): """ @@ -433,12 +460,14 @@ save_matches_to_csv(matches, "marie_curie_first10french_matches.csv", link=True, ``` {/* cell:40 cell_type:markdown */} + ## 4.3 Searching in an Article collection sourced from Impresso Datalab {/* cell:41 cell_type:markdown */} Currently, embeddings are not available in the Impresso Datalab, so we will compute them here instead. {/* cell:42 cell_type:code */} + ```python %pip install --upgrade --force-reinstall impresso import impresso @@ -446,6 +475,7 @@ impresso_session = impresso.connect() ``` {/* cell:43 cell_type:code */} + ```python # some search and get data fr_result = impresso_session.search.find( @@ -463,6 +493,7 @@ for uri in fr_result.df.index[:40]: ``` {/* cell:44 cell_type:code */} + ```python # some search and get data de_result = impresso_session.search.find( @@ -480,12 +511,14 @@ for uri in de_result.df.index[:400]: ``` {/* cell:45 cell_type:code */} + ```python recipes_fr_collection = create_embedding_collection(fr_texts, embedding_model, uids=fr_uids) recipes_de_collection = create_embedding_collection(de_texts, embedding_model, uids=de_uids) ``` {/* cell:46 cell_type:code */} + ```python # Example of finding and printing the best matches matches = find_best_match_in_collection(source_collection=recipes_fr_collection, target_collection=recipes_de_collection, link=True) # Find best matches @@ -493,6 +526,7 @@ print_matches_formatted(matches, link=True, threshold=0.60) # Print the matches ``` {/* cell:47 cell_type:markdown */} + ## 5. Summary and Next Steps The pipeline, models and codes we provide is not the only method to find similar texts. You can always experiment with different models. pipelines and data filtering methods. Feel free to re-use our code! diff --git a/src/pages/notebooks/[...slug].astro b/src/pages/notebooks/[...slug].astro index 781202c..2884f36 100644 --- a/src/pages/notebooks/[...slug].astro +++ b/src/pages/notebooks/[...slug].astro @@ -29,20 +29,9 @@ if (notebookProps.seealso) { seealsoNotebooksProps.push(seealsoProps) } notebookProps.seealso = seealsoNotebooksProps -} -// load all the series where it has been used -// const series = await getCollection('series') -// console.log(series.map(d => d.data.notebooks)) -// const seriesWithNotebook = series.filter(s => s.data.notebooks.some((n) => n.slug === "setup")) -// const seriesWithNotebooksProps = [] -// for (const s of seriesWithNotebook) { -// const seriesProps = await getRecursivelyEntryData(s) -// seriesWithNotebooksProps.push(seriesProps) -// } -// console.log(seriesWithNotebooksProps) +} -// 3. Render the entry data --- @@ -56,7 +45,8 @@ if (notebookProps.seealso) { /> \ No newline at end of file diff --git a/src/stories/components/NotebookCard.stories.tsx b/src/stories/components/NotebookCard.stories.tsx index d9f4857..2d34f2c 100644 --- a/src/stories/components/NotebookCard.stories.tsx +++ b/src/stories/components/NotebookCard.stories.tsx @@ -1,7 +1,7 @@ import type { Meta, StoryObj } from "@storybook/react" // import { fn } from "@storybook/test" import NotebookCard from "../../components/NotebookCard" -import type { Notebook } from "../../components/NotebookCard" +import type { Notebook } from "../../types" const meta: Meta = { component: NotebookCard, diff --git a/src/types.ts b/src/types.ts index 1e8e21f..c55c1e6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -2,6 +2,47 @@ export type Group = { name: string id: number } +export type Author = { + id: string + name: string + fullName?: string +} + +export type Notebook = { + id: string + href: string + title: string + langModel?: string + excerpt?: string + githubUrl?: string + googleColabUrl?: string + sha?: string + levels: { + coding: string + method: string + } + authors: Author[] + date?: Date + seealso?: Notebook[] + showLinks?: boolean + links?: { label: string; href: string }[] +} + +export interface Series { + title: string + excerpt: string + body?: string + cover?: + | { + url: string + alt: string + } + | null + | undefined + category?: string[] + position?: string + notebooks: Notebook[] +} export type User = { username: string