From 6714dc08047335874939c54c5e86ab3707d55e82 Mon Sep 17 00:00:00 2001 From: Daniele Guido Date: Wed, 16 Oct 2024 09:49:05 +0200 Subject: [PATCH] Fix/issue 28 (#29) hello world Datalab * rephrase text * remove individual authors * make subtitle more visible * fix notebook text * Update CodeSnippet.tsx * change notebook filename * Update TermsOfUseModal.tsx * Update CodeSnippet.tsx --- src/components/CodeSnippet.tsx | 3 +- src/components/GettingStarted.tsx | 2 +- src/components/TermsOfUseModal.tsx | 2 +- src/components/Wall.tsx | 2 +- ...detect-news-agency-with-impresso-model.mdx | 14 +++++---- src/content/notebooks/generic-entity-api.mdx | 30 ++++++++++++++----- .../notebooks/impresso-py-collections.mdx | 14 ++++++++- .../{setup.mdx => impresso-py-connect.mdx} | 27 +++++++++++++---- src/content/notebooks/impresso-py-maps.mdx | 2 +- src/content/notebooks/impresso-py-network.mdx | 2 +- src/content/notebooks/impresso-py-search.mdx | 2 +- src/content/series/enter-impresso-models.mdx | 1 + src/content/series/enter-impresso.mdx | 4 +-- src/content/series/entities.mdx | 5 ++-- 14 files changed, 77 insertions(+), 33 deletions(-) rename src/content/notebooks/{setup.mdx => impresso-py-connect.mdx} (94%) diff --git a/src/components/CodeSnippet.tsx b/src/components/CodeSnippet.tsx index 77c37ae..c959449 100644 --- a/src/components/CodeSnippet.tsx +++ b/src/components/CodeSnippet.tsx @@ -1,7 +1,6 @@ import { useState, useRef, useEffect } from "react" import ReactCodeMirror, { EditorView } from "@uiw/react-codemirror" import type { ReactCodeMirrorRef } from "@uiw/react-codemirror" -import { duotoneDark } from "@uiw/codemirror-theme-duotone" import { python } from "@codemirror/lang-python" import { Copy, CheckCircle } from "iconoir-react" import { createTheme } from "@uiw/codemirror-themes" @@ -18,7 +17,7 @@ export interface CodeSnippetProps { const myTheme = createTheme({ theme: "light", settings: { - background: "#fff9f2", + background: "#fff9f250", backgroundImage: "", foreground: "#75baff", caret: "#5d00ff", diff --git a/src/components/GettingStarted.tsx b/src/components/GettingStarted.tsx index 2beac1e..e119263 100644 --- a/src/components/GettingStarted.tsx +++ b/src/components/GettingStarted.tsx @@ -44,7 +44,7 @@ const GettingStarted = ({ className = "" }) => {
{startNumAfterOptionalSteps}
{" "} - Consult our terms of use + Accept our Terms of Use
  • diff --git a/src/components/TermsOfUseModal.tsx b/src/components/TermsOfUseModal.tsx index c9c45cc..04faece 100644 --- a/src/components/TermsOfUseModal.tsx +++ b/src/components/TermsOfUseModal.tsx @@ -1,4 +1,4 @@ -import { useEffect, useRef, useState, type ChangeEvent } from "react" +import { useEffect, useState, type ChangeEvent } from "react" import AcceptTermsOfUse from "./AcceptTermsOfUse" import Page from "./Page" import { Col, Container, Row } from "react-bootstrap" diff --git a/src/components/Wall.tsx b/src/components/Wall.tsx index 1e62136..46dd053 100644 --- a/src/components/Wall.tsx +++ b/src/components/Wall.tsx @@ -73,7 +73,7 @@ const Wall = ({ {numberOfAuthors} authors.

    - +

    Join us in this early stage of development and help us to improve the platform. diff --git a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx index a2cd98d..be7fffc 100644 --- a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx +++ b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx @@ -8,20 +8,19 @@ date: 2024-09-18T10:11:47Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_02_newsagencies.ipynb authors: - impresso-team -seealso: - - setup --- {/* cell:0 cell_type:markdown */} -Delivering swift and reliable news since the 1830s and 1840s, news agencies have played a pivotal role both nationally and internationally. However, understanding their precise impact on shaping news content has remained somewhat elusive. Our goal is to illuminate this aspect by identifying news agencies within historical newspaper articles. Using data from newspapers in Switzerland and Luxembourg as part of the impresso project, we've trained our pipeline to recognize these entities. +Delivering swift and reliable news since the 1830s and 1840s, news agencies have played a pivotal role both nationally and internationally. However, understanding their precise impact on shaping news content has remained somewhat elusive. Our goal is to illuminate this aspect by identifying news agencies within historical newspaper articles. Using data from newspapers in Switzerland and Luxembourg as part of the impresso project, we've trained our pipeline to recognize these entities. If you're here, you likely seek to detect news agency entities in your own text. This notebook will guide you through the process of setting up a workflow to identify specific newspaper or agency mentions within your text. {/* cell:1 cell_type:markdown */} -Install necessary libraries (if not already installed) and +Install necessary libraries (if not already installed) and download the necessary NLTK data. {/* cell:2 cell_type:code */} + ```python !pip install python-dotenv !pip install transformers @@ -29,18 +28,20 @@ download the necessary NLTK data. ``` {/* cell:3 cell_type:markdown */} -*Note: This notebook requires `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). We use [dotenv](https://pypi.org/project/python-dotenv/) library to load the HF_TOKEN value from a local .env file* +_Note: This notebook requires `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). We use [dotenv](https://pypi.org/project/python-dotenv/) library to load the HF_TOKEN value from a local .env file_ {/* cell:4 cell_type:code */} + ```python from dotenv import load_dotenv load_dotenv() # take environment variables from .env. ``` {/* cell:5 cell_type:markdown */} -Now the fun part, this function will download the requried model and gives you the keys to successfullly detect news agencies in your text. +Now the fun part, this function will download the requried model and gives you the keys to successfullly detect news agencies in your text. {/* cell:6 cell_type:code */} + ```python from transformers import is_torch_available from transformers import pipeline @@ -56,6 +57,7 @@ nlp = pipeline("newsagency-ner", model="impresso-project/bert-newsagency-ner-fr" Run the example below to see how it works. {/* cell:8 cell_type:code */} + ```python # Example text = "Mon nom est François et j'habite à Paris. (Reuter)" diff --git a/src/content/notebooks/generic-entity-api.mdx b/src/content/notebooks/generic-entity-api.mdx index 48243cb..e896632 100644 --- a/src/content/notebooks/generic-entity-api.mdx +++ b/src/content/notebooks/generic-entity-api.mdx @@ -2,7 +2,7 @@ githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/generic-entity-api.ipynb authors: - impresso-team - - EmanuelaBoros +# - EmanuelaBoros title: Detect Entities and Link them to Wikipedia and Wikidata in a Text through the Impresso API sha: 54802fcabc0e32a4a05a1b4f2761a54b9807b0c5 @@ -14,10 +14,13 @@ googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datal Named entities such as organizations, locations, persons, and temporal expressions play a crucial role in the comprehension and analysis of both historical and contemporary texts. The HIPE-2022 project focuses on named entity recognition and classification (NERC) and entity linking (EL) in multilingual historical documents. ### About HIPE-2022 + HIPE-2022 involves processing diverse datasets from historical newspapers and classical commentaries, spanning approximately 200 years and multiple languages. The primary goal is to confront systems with challenges related to multilinguality, domain-specific entities, and varying annotation tag sets. ### Datasets + The HIPE-2022 datasets are based on six primary datasets, but this model was only trained on **hipe2020** in French and German. + - **ajmc**: Classical commentaries in German, French, and English. - **hipe2020**: Historical newspapers in German, French, and English. - **letemps**: Historical newspapers in French. @@ -26,6 +29,7 @@ The HIPE-2022 datasets are based on six primary datasets, but this model was onl - **sonar**: Historical newspapers in German. ### Annotation Types and Levels + HIPE-2022 employs an IOB tagging scheme (inside-outside-beginning format) for entity annotations. The annotation levels include: 1. **TOKEN**: The annotated token. @@ -37,6 +41,7 @@ HIPE-2022 employs an IOB tagging scheme (inside-outside-beginning format) for en 7. **NE-NESTED**: Coarse type of the nested entity. ### Getting Started + This notebook will guide you through setting up a workflow to identify named entities within your text using the HIPE-2022 trained pipeline. By leveraging this pipeline, you can detect mentions of people, places, organizations, and temporal expressions, enhancing your analysis and understanding of historical and contemporary documents. --- @@ -45,10 +50,11 @@ This updated description provides a clear overview of the HIPE-2022 project's go *Note: This notebook *might* require `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable)* {/* cell:1 cell_type:markdown */} -Install necessary libraries (if not already installed) and +Install necessary libraries (if not already installed) and download the necessary NLTK data. {/* cell:2 cell_type:code */} + ```python !pip install transformers !pip install nltk @@ -56,12 +62,13 @@ download the necessary NLTK data. ``` {/* cell:3 cell_type:code */} + ```python def print_nicely(results, text): # Print the timestamp and system ID print(f"Timestamp: {results.get('ts')}") print(f"System ID: {results.get('sys_id')}") - + entities = results.get('nes', []) if entities: print(f"\n{'Entity':<20} {'Type':<15} {'Confidence NER':<15} {'Confidence NEL':<15} {'Start':<5} {'End':<5} {'Wikidata ID':<10} {'Wikipedia Page':<20}") @@ -72,7 +79,7 @@ def print_nicely(results, text): wkd_id = entity.get('wkd_id', 'N/A') wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A') print(f"{entity['surface']:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}") - + print("*" * 100) print('Testing offsets:') print("*" * 100) @@ -84,7 +91,7 @@ def print_nicely(results, text): wkd_id = entity.get('wkd_id', 'N/A') wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A') print(f"{text[entity['lOffset']:entity['rOffset']]:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}") - + print("*" * 100) print('Testing offsets in the returned text:') print("*" * 100) @@ -96,14 +103,15 @@ def print_nicely(results, text): wkd_id = entity.get('wkd_id', 'N/A') wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A') print(f"{results['text'][entity['lOffset']:entity['rOffset']]:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}") - + ``` {/* cell:4 cell_type:markdown */} -Now the fun part, this function will download the requried model and gives you the keys to successfullly detect entities in your text. +Now the fun part, this function will download the requried model and gives you the keys to successfullly detect entities in your text. {/* cell:5 cell_type:code */} + ```python from utils import get_linked_entities import requests @@ -117,41 +125,49 @@ for sentence in sentences: ``` {/* cell:6 cell_type:code */} + ```python ``` {/* cell:7 cell_type:code */} + ```python ``` {/* cell:8 cell_type:code */} + ```python ``` {/* cell:9 cell_type:code */} + ```python ``` {/* cell:10 cell_type:code */} + ```python ``` {/* cell:11 cell_type:code */} + ```python ``` {/* cell:12 cell_type:code */} + ```python ``` {/* cell:13 cell_type:code */} + ```python ``` diff --git a/src/content/notebooks/impresso-py-collections.mdx b/src/content/notebooks/impresso-py-collections.mdx index bdb3935..af4faf8 100644 --- a/src/content/notebooks/impresso-py-collections.mdx +++ b/src/content/notebooks/impresso-py-collections.mdx @@ -1,7 +1,8 @@ --- githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/collections.ipynb authors: - - RomanKalyakin + # - RomanKalyakin + - impresso-team title: Search collections sha: fbebc19629cfc008a085283e61c0669de326add9 date: 2024-09-18T15:04:39Z @@ -9,6 +10,7 @@ googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/bl --- {/* cell:0 cell_type:code */} + ```python from impresso import connect @@ -16,28 +18,33 @@ impresso = connect() ``` {/* cell:1 cell_type:code */} + ```python result = impresso.collections.find() result ``` {/* cell:2 cell_type:markdown */} + # Get collection Get metadata of a colection by its ID. {/* cell:3 cell_type:code */} + ```python result = impresso.collections.get("local-roka-tOrwrOG3") result ``` {/* cell:4 cell_type:markdown */} + ## Get collection items Get items from a collection by its ID. {/* cell:5 cell_type:code */} + ```python colection_id = result.raw["uid"] items = impresso.collections.items(colection_id) @@ -45,23 +52,28 @@ items ``` {/* cell:6 cell_type:markdown */} + ## Remove items from collection {/* cell:7 cell_type:code */} + ```python item_id = items.pydantic.data[0].uid item_id ``` {/* cell:8 cell_type:code */} + ```python impresso.collections.remove_items(colection_id, [item_id]) ``` {/* cell:9 cell_type:markdown */} + ## Add items to collection {/* cell:10 cell_type:code */} + ```python impresso.collections.add_items(colection_id, [item_id]) ``` diff --git a/src/content/notebooks/setup.mdx b/src/content/notebooks/impresso-py-connect.mdx similarity index 94% rename from src/content/notebooks/setup.mdx rename to src/content/notebooks/impresso-py-connect.mdx index fdd9fe5..1589af6 100644 --- a/src/content/notebooks/setup.mdx +++ b/src/content/notebooks/impresso-py-connect.mdx @@ -1,7 +1,7 @@ --- -title: Initialize Impresso Client +title: How to connect to the API excerpt: This is the first notebook in the Enter Impresso series. -githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/basic.ipynb +githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/1-starter/ST_01_basics.ipynb tags: - hello-world binderUrl: https://mybinder.org/v2/gh/binder-examples/r/master?urlpath=rstudio @@ -9,12 +9,13 @@ authors: - impresso-team date: 2024-09-18T15:04:39Z seealso: - - detect-news-agency-with-impresso-model + - impresso-py-search sha: fbebc19629cfc008a085283e61c0669de326add9 googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/blob/main/examples/notebooks/basic.ipynb --- {/* cell:0 cell_type:code */} + ```python from impresso import connect @@ -22,14 +23,15 @@ impresso = connect() ``` {/* cell:1 cell_type:markdown */} + ## Search articles In this notebook, we will search for articles that contain the term "European Union" in the text. The results are ordered by date. Below the result container is rendered as an overview of what it contains. - {/* cell:2 cell_type:code */} + ```python result = impresso.search.find( q="European Union", @@ -46,6 +48,7 @@ The `pydantic` property is a [Pydantic](https://docs.pydantic.dev/latest/) model We use the `data` property of the response to iterate over the page of the results and return excerpts of the articles that contain the search term. {/* cell:4 cell_type:code */} + ```python result = impresso.search.find( q="European Union", @@ -59,36 +62,45 @@ for article in result.pydantic.data[:3]: There are several useful properties on the result object that let us know the total nubmer of results found, the current page and its size. {/* cell:6 cell_type:code */} + ```python print("%i results were found for this term. The current result object contains %i items starting from the item number %i" % (result.total, result.size, result.offset)) ``` {/* cell:7 cell_type:markdown */} + ### Pydantic + The full response from the Impresso API as a pydantic model. {/* cell:8 cell_type:code */} + ```python result.pydantic ``` {/* cell:9 cell_type:markdown */} + ### Pandas -We can also get the search results as a [Pandas](https://pandas.pydata.org/) DataFrame. + +We can also get the search results as a [Pandas](https://pandas.pydata.org/) DataFrame. This allows us to easily manipulate and analyze the data using pandas' powerful data manipulation capabilities. {/* cell:10 cell_type:code */} + ```python df = result.df df.head(2) ``` {/* cell:11 cell_type:markdown */} + ## Get an article Below we will use the `articles` resource to get an article by its ID: {/* cell:12 cell_type:code */} + ```python article = impresso.articles.get("NZZ-1794-08-09-a-i0002") article @@ -98,23 +110,26 @@ article We can also get it as a Pydantic model or as a DataFrame. {/* cell:14 cell_type:code */} + ```python article.pydantic.excerpt ``` {/* cell:15 cell_type:code */} + ```python article.df[['uid', 'country', 'language']] ``` {/* cell:16 cell_type:markdown */} + ## Search facets In this cell, we will search for facets related to the term "fromage" in the Impresso collection. This is a convenient way to see a breakdown of the search results by country. - {/* cell:17 cell_type:code */} + ```python country_facet = impresso.search.facet("country", q="fromage") country_facet.df diff --git a/src/content/notebooks/impresso-py-maps.mdx b/src/content/notebooks/impresso-py-maps.mdx index 5523c11..d4b4286 100644 --- a/src/content/notebooks/impresso-py-maps.mdx +++ b/src/content/notebooks/impresso-py-maps.mdx @@ -3,7 +3,7 @@ title: Exploring impresso with maps githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb authors: - impresso-team - - RomanKalyakin + # - RomanKalyakin sha: 168c669246385a2ec6c3e088b0081364f129d11c date: 2024-09-27T12:54:12Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb diff --git a/src/content/notebooks/impresso-py-network.mdx b/src/content/notebooks/impresso-py-network.mdx index b56ce8f..07836df 100644 --- a/src/content/notebooks/impresso-py-network.mdx +++ b/src/content/notebooks/impresso-py-network.mdx @@ -3,7 +3,7 @@ title: Network graph with Impresso Py githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb authors: - impresso-team - - RomanKalyakin + # - RomanKalyakin sha: 168c669246385a2ec6c3e088b0081364f129d11c date: 2024-09-27T12:54:12Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb diff --git a/src/content/notebooks/impresso-py-search.mdx b/src/content/notebooks/impresso-py-search.mdx index 6229d52..e750ff7 100644 --- a/src/content/notebooks/impresso-py-search.mdx +++ b/src/content/notebooks/impresso-py-search.mdx @@ -2,7 +2,7 @@ githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/search.ipynb authors: - impresso-team - - RomanKalyakin + # - RomanKalyakin seealso: - impresso-py-collections title: Search diff --git a/src/content/series/enter-impresso-models.mdx b/src/content/series/enter-impresso-models.mdx index 2599429..fad32df 100644 --- a/src/content/series/enter-impresso-models.mdx +++ b/src/content/series/enter-impresso-models.mdx @@ -2,6 +2,7 @@ title: Enrich your Data with Impresso Models excerpt: "Use Impresso’s models for the semantic indexation of your personal data collections " notebooks: + - generic-entity-api - detect-news-agency-with-impresso-model --- diff --git a/src/content/series/enter-impresso.mdx b/src/content/series/enter-impresso.mdx index c9707cb..1dc5084 100644 --- a/src/content/series/enter-impresso.mdx +++ b/src/content/series/enter-impresso.mdx @@ -1,8 +1,8 @@ --- title: Getting Started -excerpt: "Three easy steps to enter the impresso way of doing research." +excerpt: "Three simple steps to begin research with Impresso." notebooks: - - setup + - impresso-py-connect --- Create an Impresso account and learn how to access our API. diff --git a/src/content/series/entities.mdx b/src/content/series/entities.mdx index 0d5eefc..5886ccc 100644 --- a/src/content/series/entities.mdx +++ b/src/content/series/entities.mdx @@ -1,10 +1,9 @@ --- -title: Explore and Visualise your Data -excerpt: "Use Impresso notebook templates as starting points for your analysis." +title: Explore and Visualise your Impresso Data +excerpt: "Notebook templates offer complementary views on your Impresso personal collections and external datasets beyond the capabilities of the Impresso Web App." notebooks: - impresso-py-maps - impresso-py-network - - generic-entity-api category: - explorations position: central-column