From 6714dc08047335874939c54c5e86ab3707d55e82 Mon Sep 17 00:00:00 2001
From: Daniele Guido
Date: Wed, 16 Oct 2024 09:49:05 +0200
Subject: [PATCH] Fix/issue 28 (#29) hello world Datalab
* rephrase text
* remove individual authors
* make subtitle more visible
* fix notebook text
* Update CodeSnippet.tsx
* change notebook filename
* Update TermsOfUseModal.tsx
* Update CodeSnippet.tsx
---
src/components/CodeSnippet.tsx | 3 +-
src/components/GettingStarted.tsx | 2 +-
src/components/TermsOfUseModal.tsx | 2 +-
src/components/Wall.tsx | 2 +-
...detect-news-agency-with-impresso-model.mdx | 14 +++++----
src/content/notebooks/generic-entity-api.mdx | 30 ++++++++++++++-----
.../notebooks/impresso-py-collections.mdx | 14 ++++++++-
.../{setup.mdx => impresso-py-connect.mdx} | 27 +++++++++++++----
src/content/notebooks/impresso-py-maps.mdx | 2 +-
src/content/notebooks/impresso-py-network.mdx | 2 +-
src/content/notebooks/impresso-py-search.mdx | 2 +-
src/content/series/enter-impresso-models.mdx | 1 +
src/content/series/enter-impresso.mdx | 4 +--
src/content/series/entities.mdx | 5 ++--
14 files changed, 77 insertions(+), 33 deletions(-)
rename src/content/notebooks/{setup.mdx => impresso-py-connect.mdx} (94%)
diff --git a/src/components/CodeSnippet.tsx b/src/components/CodeSnippet.tsx
index 77c37ae..c959449 100644
--- a/src/components/CodeSnippet.tsx
+++ b/src/components/CodeSnippet.tsx
@@ -1,7 +1,6 @@
import { useState, useRef, useEffect } from "react"
import ReactCodeMirror, { EditorView } from "@uiw/react-codemirror"
import type { ReactCodeMirrorRef } from "@uiw/react-codemirror"
-import { duotoneDark } from "@uiw/codemirror-theme-duotone"
import { python } from "@codemirror/lang-python"
import { Copy, CheckCircle } from "iconoir-react"
import { createTheme } from "@uiw/codemirror-themes"
@@ -18,7 +17,7 @@ export interface CodeSnippetProps {
const myTheme = createTheme({
theme: "light",
settings: {
- background: "#fff9f2",
+ background: "#fff9f250",
backgroundImage: "",
foreground: "#75baff",
caret: "#5d00ff",
diff --git a/src/components/GettingStarted.tsx b/src/components/GettingStarted.tsx
index 2beac1e..e119263 100644
--- a/src/components/GettingStarted.tsx
+++ b/src/components/GettingStarted.tsx
@@ -44,7 +44,7 @@ const GettingStarted = ({ className = "" }) => {
{startNumAfterOptionalSteps}
{" "}
- Consult our terms of use
+ Accept our Terms of Use
diff --git a/src/components/TermsOfUseModal.tsx b/src/components/TermsOfUseModal.tsx
index c9c45cc..04faece 100644
--- a/src/components/TermsOfUseModal.tsx
+++ b/src/components/TermsOfUseModal.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useRef, useState, type ChangeEvent } from "react"
+import { useEffect, useState, type ChangeEvent } from "react"
import AcceptTermsOfUse from "./AcceptTermsOfUse"
import Page from "./Page"
import { Col, Container, Row } from "react-bootstrap"
diff --git a/src/components/Wall.tsx b/src/components/Wall.tsx
index 1e62136..46dd053 100644
--- a/src/components/Wall.tsx
+++ b/src/components/Wall.tsx
@@ -73,7 +73,7 @@ const Wall = ({
{numberOfAuthors} authors.
-
+
Join us in this early stage of development and help us to improve
the platform.
diff --git a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx
index a2cd98d..be7fffc 100644
--- a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx
+++ b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx
@@ -8,20 +8,19 @@ date: 2024-09-18T10:11:47Z
googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_02_newsagencies.ipynb
authors:
- impresso-team
-seealso:
- - setup
---
{/* cell:0 cell_type:markdown */}
-Delivering swift and reliable news since the 1830s and 1840s, news agencies have played a pivotal role both nationally and internationally. However, understanding their precise impact on shaping news content has remained somewhat elusive. Our goal is to illuminate this aspect by identifying news agencies within historical newspaper articles. Using data from newspapers in Switzerland and Luxembourg as part of the impresso project, we've trained our pipeline to recognize these entities.
+Delivering swift and reliable news since the 1830s and 1840s, news agencies have played a pivotal role both nationally and internationally. However, understanding their precise impact on shaping news content has remained somewhat elusive. Our goal is to illuminate this aspect by identifying news agencies within historical newspaper articles. Using data from newspapers in Switzerland and Luxembourg as part of the impresso project, we've trained our pipeline to recognize these entities.
If you're here, you likely seek to detect news agency entities in your own text. This notebook will guide you through the process of setting up a workflow to identify specific newspaper or agency mentions within your text.
{/* cell:1 cell_type:markdown */}
-Install necessary libraries (if not already installed) and
+Install necessary libraries (if not already installed) and
download the necessary NLTK data.
{/* cell:2 cell_type:code */}
+
```python
!pip install python-dotenv
!pip install transformers
@@ -29,18 +28,20 @@ download the necessary NLTK data.
```
{/* cell:3 cell_type:markdown */}
-*Note: This notebook requires `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). We use [dotenv](https://pypi.org/project/python-dotenv/) library to load the HF_TOKEN value from a local .env file*
+_Note: This notebook requires `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). We use [dotenv](https://pypi.org/project/python-dotenv/) library to load the HF_TOKEN value from a local .env file_
{/* cell:4 cell_type:code */}
+
```python
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
```
{/* cell:5 cell_type:markdown */}
-Now the fun part, this function will download the requried model and gives you the keys to successfullly detect news agencies in your text.
+Now the fun part, this function will download the required model and give you the keys to successfully detect news agencies in your text.
{/* cell:6 cell_type:code */}
+
```python
from transformers import is_torch_available
from transformers import pipeline
@@ -56,6 +57,7 @@ nlp = pipeline("newsagency-ner", model="impresso-project/bert-newsagency-ner-fr"
Run the example below to see how it works.
{/* cell:8 cell_type:code */}
+
```python
# Example
text = "Mon nom est François et j'habite à Paris. (Reuter)"
diff --git a/src/content/notebooks/generic-entity-api.mdx b/src/content/notebooks/generic-entity-api.mdx
index 48243cb..e896632 100644
--- a/src/content/notebooks/generic-entity-api.mdx
+++ b/src/content/notebooks/generic-entity-api.mdx
@@ -2,7 +2,7 @@
githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/generic-entity-api.ipynb
authors:
- impresso-team
- - EmanuelaBoros
+# - EmanuelaBoros
title: Detect Entities and Link them to Wikipedia and Wikidata in a Text through
the Impresso API
sha: 54802fcabc0e32a4a05a1b4f2761a54b9807b0c5
@@ -14,10 +14,13 @@ googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datal
Named entities such as organizations, locations, persons, and temporal expressions play a crucial role in the comprehension and analysis of both historical and contemporary texts. The HIPE-2022 project focuses on named entity recognition and classification (NERC) and entity linking (EL) in multilingual historical documents.
### About HIPE-2022
+
HIPE-2022 involves processing diverse datasets from historical newspapers and classical commentaries, spanning approximately 200 years and multiple languages. The primary goal is to confront systems with challenges related to multilinguality, domain-specific entities, and varying annotation tag sets.
### Datasets
+
The HIPE-2022 datasets are based on six primary datasets, but this model was only trained on **hipe2020** in French and German.
+
- **ajmc**: Classical commentaries in German, French, and English.
- **hipe2020**: Historical newspapers in German, French, and English.
- **letemps**: Historical newspapers in French.
@@ -26,6 +29,7 @@ The HIPE-2022 datasets are based on six primary datasets, but this model was onl
- **sonar**: Historical newspapers in German.
### Annotation Types and Levels
+
HIPE-2022 employs an IOB tagging scheme (inside-outside-beginning format) for entity annotations. The annotation levels include:
1. **TOKEN**: The annotated token.
@@ -37,6 +41,7 @@ HIPE-2022 employs an IOB tagging scheme (inside-outside-beginning format) for en
7. **NE-NESTED**: Coarse type of the nested entity.
### Getting Started
+
This notebook will guide you through setting up a workflow to identify named entities within your text using the HIPE-2022 trained pipeline. By leveraging this pipeline, you can detect mentions of people, places, organizations, and temporal expressions, enhancing your analysis and understanding of historical and contemporary documents.
---
@@ -45,10 +50,11 @@ This updated description provides a clear overview of the HIPE-2022 project's go
*Note: This notebook *might* require `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable)*
{/* cell:1 cell_type:markdown */}
-Install necessary libraries (if not already installed) and
+Install necessary libraries (if not already installed) and
download the necessary NLTK data.
{/* cell:2 cell_type:code */}
+
```python
!pip install transformers
!pip install nltk
@@ -56,12 +62,13 @@ download the necessary NLTK data.
```
{/* cell:3 cell_type:code */}
+
```python
def print_nicely(results, text):
# Print the timestamp and system ID
print(f"Timestamp: {results.get('ts')}")
print(f"System ID: {results.get('sys_id')}")
-
+
entities = results.get('nes', [])
if entities:
print(f"\n{'Entity':<20} {'Type':<15} {'Confidence NER':<15} {'Confidence NEL':<15} {'Start':<5} {'End':<5} {'Wikidata ID':<10} {'Wikipedia Page':<20}")
@@ -72,7 +79,7 @@ def print_nicely(results, text):
wkd_id = entity.get('wkd_id', 'N/A')
wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A')
print(f"{entity['surface']:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}")
-
+
print("*" * 100)
print('Testing offsets:')
print("*" * 100)
@@ -84,7 +91,7 @@ def print_nicely(results, text):
wkd_id = entity.get('wkd_id', 'N/A')
wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A')
print(f"{text[entity['lOffset']:entity['rOffset']]:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}")
-
+
print("*" * 100)
print('Testing offsets in the returned text:')
print("*" * 100)
@@ -96,14 +103,15 @@ def print_nicely(results, text):
wkd_id = entity.get('wkd_id', 'N/A')
wkpedia_pagename = entity.get('wkpedia_pagename', 'N/A')
print(f"{results['text'][entity['lOffset']:entity['rOffset']]:<20} {entity['type']:<15} {confidence_ner:<15} {confidence_nel:<15} {entity['lOffset']:<5} {entity['rOffset']:<5} {wkd_id:<10} {wkpedia_pagename:<20}")
-
+
```
{/* cell:4 cell_type:markdown */}
-Now the fun part, this function will download the requried model and gives you the keys to successfullly detect entities in your text.
+Now the fun part, this function will download the required model and give you the keys to successfully detect entities in your text.
{/* cell:5 cell_type:code */}
+
```python
from utils import get_linked_entities
import requests
@@ -117,41 +125,49 @@ for sentence in sentences:
```
{/* cell:6 cell_type:code */}
+
```python
```
{/* cell:7 cell_type:code */}
+
```python
```
{/* cell:8 cell_type:code */}
+
```python
```
{/* cell:9 cell_type:code */}
+
```python
```
{/* cell:10 cell_type:code */}
+
```python
```
{/* cell:11 cell_type:code */}
+
```python
```
{/* cell:12 cell_type:code */}
+
```python
```
{/* cell:13 cell_type:code */}
+
```python
```
diff --git a/src/content/notebooks/impresso-py-collections.mdx b/src/content/notebooks/impresso-py-collections.mdx
index bdb3935..af4faf8 100644
--- a/src/content/notebooks/impresso-py-collections.mdx
+++ b/src/content/notebooks/impresso-py-collections.mdx
@@ -1,7 +1,8 @@
---
githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/collections.ipynb
authors:
- - RomanKalyakin
+ # - RomanKalyakin
+ - impresso-team
title: Search collections
sha: fbebc19629cfc008a085283e61c0669de326add9
date: 2024-09-18T15:04:39Z
@@ -9,6 +10,7 @@ googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/bl
---
{/* cell:0 cell_type:code */}
+
```python
from impresso import connect
@@ -16,28 +18,33 @@ impresso = connect()
```
{/* cell:1 cell_type:code */}
+
```python
result = impresso.collections.find()
result
```
{/* cell:2 cell_type:markdown */}
+
# Get collection
Get metadata of a colection by its ID.
{/* cell:3 cell_type:code */}
+
```python
result = impresso.collections.get("local-roka-tOrwrOG3")
result
```
{/* cell:4 cell_type:markdown */}
+
## Get collection items
Get items from a collection by its ID.
{/* cell:5 cell_type:code */}
+
```python
colection_id = result.raw["uid"]
items = impresso.collections.items(colection_id)
@@ -45,23 +52,28 @@ items
```
{/* cell:6 cell_type:markdown */}
+
## Remove items from collection
{/* cell:7 cell_type:code */}
+
```python
item_id = items.pydantic.data[0].uid
item_id
```
{/* cell:8 cell_type:code */}
+
```python
impresso.collections.remove_items(colection_id, [item_id])
```
{/* cell:9 cell_type:markdown */}
+
## Add items to collection
{/* cell:10 cell_type:code */}
+
```python
impresso.collections.add_items(colection_id, [item_id])
```
diff --git a/src/content/notebooks/setup.mdx b/src/content/notebooks/impresso-py-connect.mdx
similarity index 94%
rename from src/content/notebooks/setup.mdx
rename to src/content/notebooks/impresso-py-connect.mdx
index fdd9fe5..1589af6 100644
--- a/src/content/notebooks/setup.mdx
+++ b/src/content/notebooks/impresso-py-connect.mdx
@@ -1,7 +1,7 @@
---
-title: Initialize Impresso Client
+title: How to connect to the API
excerpt: This is the first notebook in the Enter Impresso series.
-githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/basic.ipynb
+githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/1-starter/ST_01_basics.ipynb
tags:
- hello-world
binderUrl: https://mybinder.org/v2/gh/binder-examples/r/master?urlpath=rstudio
@@ -9,12 +9,13 @@ authors:
- impresso-team
date: 2024-09-18T15:04:39Z
seealso:
- - detect-news-agency-with-impresso-model
+ - impresso-py-search
sha: fbebc19629cfc008a085283e61c0669de326add9
googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/blob/main/examples/notebooks/basic.ipynb
---
{/* cell:0 cell_type:code */}
+
```python
from impresso import connect
@@ -22,14 +23,15 @@ impresso = connect()
```
{/* cell:1 cell_type:markdown */}
+
## Search articles
In this notebook, we will search for articles that contain the term "European Union" in the text. The results are ordered by date.
Below the result container is rendered as an overview of what it contains.
-
{/* cell:2 cell_type:code */}
+
```python
result = impresso.search.find(
q="European Union",
@@ -46,6 +48,7 @@ The `pydantic` property is a [Pydantic](https://docs.pydantic.dev/latest/) model
We use the `data` property of the response to iterate over the page of the results and return excerpts of the articles that contain the search term.
{/* cell:4 cell_type:code */}
+
```python
result = impresso.search.find(
q="European Union",
@@ -59,36 +62,45 @@ for article in result.pydantic.data[:3]:
There are several useful properties on the result object that let us know the total nubmer of results found, the current page and its size.
{/* cell:6 cell_type:code */}
+
```python
print("%i results were found for this term. The current result object contains %i items starting from the item number %i" % (result.total, result.size, result.offset))
```
{/* cell:7 cell_type:markdown */}
+
### Pydantic
+
The full response from the Impresso API as a pydantic model.
{/* cell:8 cell_type:code */}
+
```python
result.pydantic
```
{/* cell:9 cell_type:markdown */}
+
### Pandas
-We can also get the search results as a [Pandas](https://pandas.pydata.org/) DataFrame.
+
+We can also get the search results as a [Pandas](https://pandas.pydata.org/) DataFrame.
This allows us to easily manipulate and analyze the data using pandas' powerful data manipulation capabilities.
{/* cell:10 cell_type:code */}
+
```python
df = result.df
df.head(2)
```
{/* cell:11 cell_type:markdown */}
+
## Get an article
Below we will use the `articles` resource to get an article by its ID:
{/* cell:12 cell_type:code */}
+
```python
article = impresso.articles.get("NZZ-1794-08-09-a-i0002")
article
@@ -98,23 +110,26 @@ article
We can also get it as a Pydantic model or as a DataFrame.
{/* cell:14 cell_type:code */}
+
```python
article.pydantic.excerpt
```
{/* cell:15 cell_type:code */}
+
```python
article.df[['uid', 'country', 'language']]
```
{/* cell:16 cell_type:markdown */}
+
## Search facets
In this cell, we will search for facets related to the term "fromage" in the Impresso collection. This is a convenient way to see a breakdown of the search results by country.
-
{/* cell:17 cell_type:code */}
+
```python
country_facet = impresso.search.facet("country", q="fromage")
country_facet.df
diff --git a/src/content/notebooks/impresso-py-maps.mdx b/src/content/notebooks/impresso-py-maps.mdx
index 5523c11..d4b4286 100644
--- a/src/content/notebooks/impresso-py-maps.mdx
+++ b/src/content/notebooks/impresso-py-maps.mdx
@@ -3,7 +3,7 @@ title: Exploring impresso with maps
githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb
authors:
- impresso-team
- - RomanKalyakin
+ # - RomanKalyakin
sha: 168c669246385a2ec6c3e088b0081364f129d11c
date: 2024-09-27T12:54:12Z
googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb
diff --git a/src/content/notebooks/impresso-py-network.mdx b/src/content/notebooks/impresso-py-network.mdx
index b56ce8f..07836df 100644
--- a/src/content/notebooks/impresso-py-network.mdx
+++ b/src/content/notebooks/impresso-py-network.mdx
@@ -3,7 +3,7 @@ title: Network graph with Impresso Py
githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb
authors:
- impresso-team
- - RomanKalyakin
+ # - RomanKalyakin
sha: 168c669246385a2ec6c3e088b0081364f129d11c
date: 2024-09-27T12:54:12Z
googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb
diff --git a/src/content/notebooks/impresso-py-search.mdx b/src/content/notebooks/impresso-py-search.mdx
index 6229d52..e750ff7 100644
--- a/src/content/notebooks/impresso-py-search.mdx
+++ b/src/content/notebooks/impresso-py-search.mdx
@@ -2,7 +2,7 @@
githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/search.ipynb
authors:
- impresso-team
- - RomanKalyakin
+ # - RomanKalyakin
seealso:
- impresso-py-collections
title: Search
diff --git a/src/content/series/enter-impresso-models.mdx b/src/content/series/enter-impresso-models.mdx
index 2599429..fad32df 100644
--- a/src/content/series/enter-impresso-models.mdx
+++ b/src/content/series/enter-impresso-models.mdx
@@ -2,6 +2,7 @@
title: Enrich your Data with Impresso Models
excerpt: "Use Impresso’s models for the semantic indexation of your personal data collections "
notebooks:
+ - generic-entity-api
- detect-news-agency-with-impresso-model
---
diff --git a/src/content/series/enter-impresso.mdx b/src/content/series/enter-impresso.mdx
index c9707cb..1dc5084 100644
--- a/src/content/series/enter-impresso.mdx
+++ b/src/content/series/enter-impresso.mdx
@@ -1,8 +1,8 @@
---
title: Getting Started
-excerpt: "Three easy steps to enter the impresso way of doing research."
+excerpt: "Three simple steps to begin research with Impresso."
notebooks:
- - setup
+ - impresso-py-connect
---
Create an Impresso account and learn how to access our API.
diff --git a/src/content/series/entities.mdx b/src/content/series/entities.mdx
index 0d5eefc..5886ccc 100644
--- a/src/content/series/entities.mdx
+++ b/src/content/series/entities.mdx
@@ -1,10 +1,9 @@
---
-title: Explore and Visualise your Data
-excerpt: "Use Impresso notebook templates as starting points for your analysis."
+title: Explore and Visualise your Impresso Data
+excerpt: "Notebook templates offer complementary views on your Impresso personal collections and external datasets beyond the capabilities of the Impresso Web App."
notebooks:
- impresso-py-maps
- impresso-py-network
- - generic-entity-api
category:
- explorations
position: central-column