diff --git a/package-lock.json b/package-lock.json index 80b073c..5db20cb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,6 @@ "@astrojs/react": "^3.6.2", "@astrolib/seo": "^1.0.0-beta.6", "@codemirror/lang-python": "^6.1.6", - "@custom-react-hooks/use-on-screen": "^1.5.1", "@feathersjs/authentication-client": "^5.0.30", "@feathersjs/errors": "^5.0.30", "@feathersjs/feathers": "^5.0.30", @@ -793,15 +792,6 @@ "w3c-keyname": "^2.2.4" } }, - "node_modules/@custom-react-hooks/use-on-screen": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/@custom-react-hooks/use-on-screen/-/use-on-screen-1.5.1.tgz", - "integrity": "sha512-BSEgu9PvqhnqkCF73ZdG7mN6NkzJ8ISrqkKIx1g2ipb5skzenDwAb/6dUZ33jOEX3oyvWc/dVGQcboeTh3fVww==", - "license": "MIT", - "peerDependencies": { - "react": ">=16" - } - }, "node_modules/@emmetio/abbreviation": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/@emmetio/abbreviation/-/abbreviation-2.3.3.tgz", diff --git a/src/content/notebooks/impresso-py-network.mdx b/src/content/notebooks/impresso-py-network.mdx index a46fb59..e2fe505 100644 --- a/src/content/notebooks/impresso-py-network.mdx +++ b/src/content/notebooks/impresso-py-network.mdx @@ -3,8 +3,8 @@ title: Exploring Entity Co-occurrence Networks githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/explore-vis/entity_network.ipynb authors: - impresso-team -sha: 1a53c9204d6e4cc4d77363652d7991688039bdb3 -date: 2024-10-24T19:27:13Z +sha: dd13ddcc0ba2f4a2b24face9790c46595dc2ebca +date: 2024-10-27T13:19:55Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/explore-vis/entity_network.ipynb links: - href: https://en.wikipedia.org/wiki/Prague_Spring @@ -15,40 +15,39 @@ seealso: {/* cell:0 cell_type:markdown */} -## Install dependencies + + Open In Colab + -{/* cell:1 cell_type:code */} +{/* cell:1 cell_type:markdown */} +## Install dependencies +{/* cell:2 cell_type:code */} ```python %pip install -q impresso ipysigma networkx tqdm ``` -{/* cell:2 cell_type:markdown */} - +{/* cell:3 cell_type:markdown */} ## Connect to Impresso -{/* cell:3 cell_type:code */} - +{/* cell:4 cell_type:code */} ```python from impresso import connect, OR, AND impresso_session = connect() ``` -{/* cell:4 cell_type:markdown */} - +{/* cell:5 cell_type:markdown */} ## Part 1: Get entities and their co-occurrences ### First, we retrieve all person entities mentioned in all articles that talk about the [Prague Spring](https://en.wikipedia.org/wiki/Prague_Spring). -{/* cell:5 cell_type:code */} - +{/* cell:6 cell_type:code */} ```python query = OR("Prague Spring", "Prager Frühling", "Printemps de Prague") ``` -{/* cell:6 cell_type:code */} - +{/* cell:7 cell_type:code */} ```python persons = impresso_session.search.facet( facet="person", @@ -59,16 +58,14 @@ persons = impresso_session.search.facet( persons ``` -{/* cell:7 cell_type:markdown */} - +{/* cell:8 cell_type:markdown */} ### Next, we generate all unique pairs of entities with a mention count higher than `n`. - + First, entities that meet the mention threshold are selected, and then all possible pairs are generated using the `itertools.combinations` function. The `n` value can be adjusted so that we don't get too many entity combinations. A sweet spot is just under 500 combinations. -{/* cell:8 cell_type:code */} - +{/* cell:9 cell_type:code */} ```python import itertools @@ -83,8 +80,7 @@ person_ids_combinations = list(itertools.combinations(persons_ids, 2)) print(f"Total combinations: {len(person_ids_combinations)}") ``` -{/* cell:9 cell_type:code */} - +{/* cell:10 cell_type:code */} ```python if len(person_ids_combinations) > 500: msg = ( @@ -96,14 +92,13 @@ if len(person_ids_combinations) > 500: raise Exception(msg) ``` -{/* cell:10 cell_type:markdown */} +{/* cell:11 cell_type:markdown */} ### We also retrieve the dates and the number of articles where person entity pairs appear in. This piece of code gets a facet for every combination of named entities. It is a single call per combination so it may take a while for a large number of combinations. -{/* cell:11 cell_type:code */} - +{/* cell:12 cell_type:code */} ```python from impresso.util.error import ImpressoError from time import sleep @@ -135,11 +130,10 @@ for idx, combo in tqdm(enumerate(person_ids_combinations), total=len(person_ids_ connections.append((combo, items)) ``` -{/* cell:12 cell_type:markdown */} +{/* cell:13 cell_type:markdown */} We put all in a dataframe -{/* cell:13 cell_type:code */} - +{/* cell:14 cell_type:code */} ```python import pandas as pd @@ -155,11 +149,10 @@ connections_df = pd.DataFrame(connections_denormalised, columns=('node_a', 'node connections_df ``` -{/* cell:14 cell_type:markdown */} +{/* cell:15 cell_type:markdown */} And save the connections to a CSV file that can be visualised independently in Part 2. Provide a name for the file. -{/* cell:15 cell_type:code */} - +{/* cell:16 cell_type:code */} ```python from tempfile import gettempdir @@ -171,12 +164,10 @@ connections_df.to_csv(connections_csv_filepath) print(f"File saved in {connections_csv_filepath}") ``` -{/* cell:16 cell_type:markdown */} - +{/* cell:17 cell_type:markdown */} ## Part 2: visualise -{/* cell:17 cell_type:code */} - +{/* cell:18 cell_type:code */} ```python import pandas as pd @@ -184,11 +175,10 @@ connections_df = pd.read_csv(connections_csv_filepath) connections_df ``` -{/* cell:18 cell_type:markdown */} +{/* cell:19 cell_type:markdown */} Group connections counting number of mentions and preserve the URL. -{/* cell:19 cell_type:code */} - +{/* cell:20 cell_type:code */} ```python grouped_connections_df = connections_df.groupby(['node_a', 'node_b']) \ .agg({'timestamp': lambda x: ', '.join(list(x)), 'count': 'sum', 'url': lambda x: list(set(x))[0]}) \ @@ -196,8 +186,7 @@ grouped_connections_df = connections_df.groupby(['node_a', 'node_b']) \ grouped_connections_df ``` -{/* cell:20 cell_type:code */} - +{/* cell:21 cell_type:code */} ```python import networkx as nx @@ -213,11 +202,10 @@ for i in sorted(G.nodes()): G.nodes ``` -{/* cell:21 cell_type:markdown */} +{/* cell:22 cell_type:markdown */} Save the file so that it could be downloaded and used elsewhere. -{/* cell:22 cell_type:code */} - +{/* cell:23 cell_type:code */} ```python from tempfile import gettempdir @@ -231,11 +219,10 @@ nx.write_gexf(G, gefx_filepath) print(f"File saved in {gefx_filepath}") ``` -{/* cell:23 cell_type:markdown */} +{/* cell:24 cell_type:markdown */} If running in Colab - activate custom widgets to allow `ipysigma` to render the graph. -{/* cell:24 cell_type:code */} - +{/* cell:25 cell_type:code */} ```python try: from google.colab import output @@ -244,11 +231,10 @@ except: pass ``` -{/* cell:25 cell_type:markdown */} +{/* cell:26 cell_type:markdown */} Render the graph. -{/* cell:26 cell_type:code */} - +{/* cell:27 cell_type:code */} ```python import ipywidgets @@ -260,18 +246,17 @@ node_size_widget = ipywidgets.Dropdown( ) ipywidgets.Box( [ - ipywidgets.Label(value='What should represent the size of the nodes:'), + ipywidgets.Label(value='What should represent the size of the nodes:'), node_size_widget ] ) ``` -{/* cell:27 cell_type:markdown */} +{/* cell:28 cell_type:markdown */} Refresh the next cell after changing the value above. -{/* cell:28 cell_type:code */} - +{/* cell:29 cell_type:code */} ```python import networkx as nx from ipysigma import Sigma diff --git a/src/content/notebooks/ne-processing-with-impresso-hf.mdx b/src/content/notebooks/ne-processing-with-impresso-hf.mdx index 8add1bf..28c6ecd 100644 --- a/src/content/notebooks/ne-processing-with-impresso-hf.mdx +++ b/src/content/notebooks/ne-processing-with-impresso-hf.mdx @@ -7,8 +7,8 @@ excerpt: Trained on the [HIPE 2020](https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-hipe2020.md) dataset, the Impresso models recognize both coarse and fine-grained named entities, linking mentions to knowledge bases when possible. -sha: 44a3c9f14c74807de3722878701d97ed71fa3e05 -date: 2024-10-25T14:18:01Z +sha: dd13ddcc0ba2f4a2b24face9790c46595dc2ebca +date: 2024-10-27T13:19:55Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/annotate/NE-processing_ImpressoHF.ipynb seealso: - ne-processing-with-impresso-api @@ -30,6 +30,12 @@ links: --- {/* cell:0 cell_type:markdown */} + + + Open In Colab + + +{/* cell:1 cell_type:markdown */} ## What is this notebook about? This notebook demonstrates how to use Impresso models for named entity recognition (NER) and entity linking (EL). @@ -55,7 +61,7 @@ By the end of this notebook, you will know how to: **Warning**: To use this notebook, you may need to set the `HF_TOKEN` environment variable in the `.env` file (refer to `.env.example`). You can obtain a token by signing up on the [Hugging Face website](https://huggingface.co/join) and find additional information in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). If you do not want to register an account on HF, simply select Cancel when prompted for a Hugging Face token — no token is needed for this notebook. -{/* cell:1 cell_type:markdown */} +{/* cell:2 cell_type:markdown */} ## Prerequisites First, we install and download necessary libraries: @@ -71,7 +77,7 @@ Libraries can be installed from the notebook, or within your environment: pip install torch protobuf sentencepiece transformers nltk ``` -{/* cell:2 cell_type:code */} +{/* cell:3 cell_type:code */} ```python !pip install torch !pip install protobuf @@ -80,20 +86,20 @@ pip install torch protobuf sentencepiece transformers nltk !pip install nltk ``` -{/* cell:3 cell_type:markdown */} +{/* cell:4 cell_type:markdown */} ## Entity Recognition -{/* cell:4 cell_type:code */} +{/* cell:5 cell_type:code */} ```python # Import necessary Python modules from the Transformers library from transformers import AutoModelForTokenClassification, AutoTokenizer from transformers import pipeline ``` -{/* cell:5 cell_type:markdown */} +{/* cell:6 cell_type:markdown */} For NER, we use the Impresso NER model named 'ner-stacked-bert-multilingual' and published on Hugging Face: https://huggingface.co/impresso-project/ner-stacked-bert-multilingual. -{/* cell:6 cell_type:code */} +{/* cell:7 cell_type:code */} ```python # We set the model_name variable to our chosen model, enabling us to load it and use it for token classification and NER MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual" @@ -102,11 +108,11 @@ MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual" ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) ``` -{/* cell:7 cell_type:markdown */} -It is necessary to create a pipeline for our task (`generic-ner`), using the loaded model and tokenizer. This pipeline handles multiple tasks under the hood. +{/* cell:8 cell_type:markdown */} +It is necessary to create a pipeline for our task (`generic-ner`), using the loaded model and tokenizer. This pipeline handles multiple tasks under the hood. This custom NER pipeline streamlines tokenization, language-specific rules, and post-processing into a single workflow. It accurately identifies, aligns, and cleans entities while managing complexities like multilingual punctuation rules, attachment of complementary information (e.g., titles), and removal of redundant tokens. Using this pipeline simplifies our task by handling the NER within a single, streamlined process, making the workflow efficient and minimizing manual data processing. -{/* cell:8 cell_type:code */} +{/* cell:9 cell_type:code */} ```python ner_pipeline = pipeline("generic-ner", model=MODEL_NAME, tokenizer=ner_tokenizer, @@ -114,7 +120,7 @@ ner_pipeline = pipeline("generic-ner", model=MODEL_NAME, device='cpu') ``` -{/* cell:9 cell_type:code */} +{/* cell:10 cell_type:code */} ```python # We define some test input sentence = """In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles, @@ -127,7 +133,7 @@ sentence = """In the year 1789, King Louis XVI, ruler of France, convened the Es print(sentence) ``` -{/* cell:10 cell_type:code */} +{/* cell:11 cell_type:code */} ```python # A function that formats and displays the model output in a readable structure def print_nicely(data): @@ -138,10 +144,10 @@ def print_nicely(data): ``` -{/* cell:11 cell_type:markdown */} +{/* cell:12 cell_type:markdown */} We apply the pipeline on the input and print nicely the output -{/* cell:12 cell_type:code */} +{/* cell:13 cell_type:code */} ```python # Recognize stacked entities for each sentence entities = ner_pipeline(sentence) @@ -150,15 +156,54 @@ entities = ner_pipeline(sentence) print_nicely(entities) ``` -{/* cell:13 cell_type:markdown */} +{/* cell:14 cell_type:markdown */} +### Example of Entity Recognition with OCR Errors + +Below, we introduce simulated OCR errors, such as character misrecognition, missing spaces, and incorrect capitalization. + +{/* cell:15 cell_type:code */} +```python +sentence_with_ocr_errors = """In the year 1789, K1ng L0uis XVl, ruler of France, convened the Estatzs-General at the Palaceof Versailles, + where Marie Antoinette, the Qveen of France, alongside Max1milien Robespierre, a leading member of the National Assembly, + debated with JeanJacques Rousseau, the fam0us philos0pher, and Charles de Talleyrand, the B1shop of Autun, + regarding the futureoftheFrench monarchy. At the same time, across the Atlant1c in Philadelp1ia, + GeorgeWashington, the first President of the United States, and Thomas Jeffers0n, the nation’s SecretaryofState, + were drafting policies for the newly establ1shed American govemment foll0wing the sign1ng of the Const1tution.""" + +``` + +{/* cell:16 cell_type:markdown */} +Now, let’s run the OCR-affected text through the NER pipeline to observe how well the algorithm performs under OCR-induced distortions. + + +{/* cell:17 cell_type:code */} +```python +entities_with_errors = ner_pipeline(sentence_with_ocr_errors) + +print_nicely(entities_with_errors) +``` + +{/* cell:18 cell_type:code */} +```python +# Verify that the entity counts match for the original and OCR-affected sentences +original_entities = ner_pipeline(sentence) +entities_with_errors = ner_pipeline(sentence_with_ocr_errors) + +print("Number of entities in the original text:", len(original_entities)) +print("Number of entities in the OCR-affected text:", len(entities_with_errors)) +print("Are entity counts equal?", len(original_entities) == len(entities_with_errors)) + +``` + +{/* cell:19 cell_type:markdown */} ## Entity Linking -{/* cell:14 cell_type:markdown */} +{/* cell:20 cell_type:markdown */} With the EL model, we can link the previously recognised entity mentions to unique referents in Wikipedia and Wikidata. We use the Impresso model named 'nel-mgenre-multilingual' and published on Hugging Face: https://huggingface.co/impresso-project/nel-mgenre-multilingual. -{/* cell:15 cell_type:code */} +{/* cell:21 cell_type:code */} ```python # Import the necessary modules from the transformers library from transformers import AutoTokenizer, AutoModelForSeq2SeqLM @@ -171,7 +216,7 @@ NEL_MODEL_NAME = "impresso-project/nel-mgenre-multilingual" nel_tokenizer = AutoTokenizer.from_pretrained("impresso-project/nel-mgenre-multilingual") ``` -{/* cell:16 cell_type:code */} +{/* cell:22 cell_type:code */} ```python nel_pipeline = pipeline("generic-nel", model=NEL_MODEL_NAME, tokenizer=nel_tokenizer, @@ -179,7 +224,7 @@ nel_pipeline = pipeline("generic-nel", model=NEL_MODEL_NAME, device='cpu') ``` -{/* cell:17 cell_type:markdown */} +{/* cell:23 cell_type:markdown */} Our entity linker requires a specific format to correctly identify the entity that needs to be linked, as follows: ``` @@ -197,7 +242,7 @@ The event was held at the Palace of Versailles, a symbol of [START] French monar Let's take this example: -{/* cell:18 cell_type:code */} +{/* cell:24 cell_type:code */} ```python simple_sentence = "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy." @@ -206,10 +251,10 @@ linked_entity = nel_pipeline(simple_sentence) print_nicely(linked_entity) ``` -{/* cell:19 cell_type:markdown */} +{/* cell:25 cell_type:markdown */} It _could_ work without the special markers and texts mentioning only one entity, but we do not recommend it. -{/* cell:20 cell_type:code */} +{/* cell:26 cell_type:code */} ```python simple_sentence = "The event was held at the Palace of Versailles, a symbol of French monarchy." @@ -218,10 +263,10 @@ linked_entity = nel_pipeline(simple_sentence) print_nicely(linked_entity) ``` -{/* cell:21 cell_type:markdown */} +{/* cell:27 cell_type:markdown */} By using our NER tool, we can automatically generate sentences with entity markers and subsequently link each entity: -{/* cell:22 cell_type:code */} +{/* cell:28 cell_type:code */} ```python # Run the NER pipeline on the input sentence and store the results entities = ner_pipeline(sentence) @@ -259,27 +304,89 @@ for entity in entities: print_nicely(linked_entities) ``` -{/* cell:23 cell_type:markdown */} +{/* cell:29 cell_type:markdown */} +### Example of Entity Linking with OCR Errors + +To evaluate the robustness of entity linking with OCR errors, we use both the original and OCR-affected sentences. Below, the entities identified by NER are linked individually to unique Wikipedia/Wikidata entries, while OCR errors are present. + + + +{/* cell:30 cell_type:code */} +```python +print(f'{len(entities_with_errors)} entities were previously detected in OCR-affected text.') + +# List to avoid reprocessing the same entities +already_done_ocr = [] + +# Process each detected entity in OCR-affected text for linking +for entity in entities_with_errors: + if entity['surface'] not in already_done_ocr: + # Format sentence with entity markers for EL + language = 'en' + tokens = sentence_with_ocr_errors.split(' ') + start, end = entity["index"][0], entity["index"][1] + + context_start = max(0, start - 10) + context_end = min(len(tokens), end + 11) + + # Surround entity with [START] and [END] tags + nel_sentence = ( + " ".join(tokens[context_start:start]) + + " [START] " + + entity['surface'] + + " [END] " + + " ".join(tokens[end + 1:context_end]) + ) + + # Perform entity linking on OCR-affected sentence + linked_entity_ocr = nel_pipeline(nel_sentence) + print("Sentence with OCR Error:") + print(nel_sentence) + print("Linked Entity:") + print_nicely(linked_entity_ocr) + already_done_ocr.append(entity['surface']) + +``` + +{/* cell:31 cell_type:markdown */} ## Looking up entities in the Impresso Corpus -Are the previously recognised entities present in the Impresso Corpus? Let's explore using the Impresso API and Python Library: +Are the previously recognised entities present in the Impresso Corpus? For each entity, we use impresso_session.entities.find() to look it up by name. This search will attempt to find a match for the exact name provided. If OCR errors are introduced (e.g., "Max1milien Robespierre" instead of "Maximilien Robespierre"), we can observe how resilient the search function is to variations. Let's explore using the Impresso API and Python Library. -{/* cell:24 cell_type:code */} +{/* cell:32 cell_type:code */} ```python +from impresso import connect +impresso_session = connect() ``` -{/* cell:25 cell_type:code */} +{/* cell:33 cell_type:code */} ```python +entity = impresso_session.entities.find("Maximilien Robespierre") +entity ``` -{/* cell:26 cell_type:code */} +{/* cell:34 cell_type:markdown */} +This command checks if "Maximilien Robespierre" exists in the Impresso database. Similarly, we test the resilience of the search function by querying slightly altered names (e.g., "Max1milien Robespierre"). + + + +{/* cell:35 cell_type:code */} +```python +entity = impresso_session.entities.find("Max1milien Robespierre") + +entity +``` + +{/* cell:36 cell_type:code */} ```python +entity = impresso_session.entities.find("Marie Antoinette") +entity ``` -{/* cell:27 cell_type:code */} +{/* cell:37 cell_type:code */} ```python ```