From 405c46c053d53d842fbb11ac60c49db44666c396 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Sat, 27 Mar 2021 17:42:36 +0100 Subject: [PATCH] Fix #180 --- notebook/SOMEF Usage Example.ipynb | 262 +++++++++++++++++++++++------ notebook/test.json | 2 +- src/somef/cli.py | 2 +- 3 files changed, 212 insertions(+), 54 deletions(-) diff --git a/notebook/SOMEF Usage Example.ipynb b/notebook/SOMEF Usage Example.ipynb index f25d409c..69abb935 100644 --- a/notebook/SOMEF Usage Example.ipynb +++ b/notebook/SOMEF Usage Example.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -55,14 +55,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "SOftware MEtadata Extraction Framework (SOMEF) Command Line Interface\n", + "SOftware Metadata Extraction Framework (SOMEF) Command Line Interface\n", "Configuring SOMEF automatically. To assign credentials edit the configuration file or run the intearctive mode\n", "Success\n" ] @@ -91,16 +91,23 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "SOftware MEtadata Extraction Framework (SOMEF) Command Line Interface\n", - "Loading Repository https://github.com/usc-isi-i2/rltk Information....\n", - "https://api.github.com/repos/usc-isi-i2/rltk\n", + "SOftware Metadata Extraction Framework (SOMEF) Command Line Interface\n", + "Loading Repository https://github.com/usc-isi-i2/kgtk Information....\n", + "https://api.github.com/repos/usc-isi-i2/kgtk\n", + "Downloading https://github.com/usc-isi-i2/kgtk/archive/master.zip\n", + "['https://github.com/usc-isi-i2/kgtk/tree/master/docs']\n", + "['https://github.com/usc-isi-i2/kgtk/tree/master/docs', 'https://github.com/usc-isi-i2/kgtk/tree/master/examples/docs']\n", + "NOTEBOOKS:\n", + "['use-cases/Knowledge-Graph-Profiler.ipynb', 'use-cases/Generate-Triples-And-Load-Blazegraph.ipynb', 'use-cases/Wikidata Subsets.ipynb', 'use-cases/Wikidata Useful Files.ipynb', 'tutorial/3 Enhance KG.ipynb', 'tutorial/Knowledge-Graph-Profiler.out.ipynb', 'tutorial/5 Embeddings.ipynb', 'tutorial/4 Generate Triples.ipynb', 'tutorial/2 Construct KG.ipynb', 'tutorial/1 Introduction.ipynb', 'examples/Example2 - Curation and Statistics.ipynb', 'examples/CSKG Use Case.ipynb', 'examples/abbreviate_human_labels.ipynb', 'examples/Example12 - CSKG Analysis.ipynb', 'examples/Example6 - Wikipedia Tables.ipynb', 'examples/count-wikidata-entities-and-properties.ipynb', 'examples/partition-wikidata.ipynb', 'examples/Example 11 - Find Ambiguous Items.ipynb', 'examples/Example4 - Wikidata Pagerank.ipynb', 'examples/Example5 - AIDA AIF.ipynb', 'examples/generate_wikitable_anchors.ipynb', 'examples/Example8 - Wikidata Subset.ipynb', 'examples/Example3 - Reachability.ipynb', 'examples/combine-with-qualifiers.ipynb', 'examples/Example1 - Embeddings.ipynb', 'examples/Example7 - Wikidata Outputs.ipynb', 'examples/DBPedia_links.ipynb', 'examples/Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb', 'examples/commands/text_embeddings.ipynb']\n", + "DOCKERFILES:\n", + "['docker/Dockerfile', 'docker/dev/Dockerfile', 'docker/lite/Dockerfile']\n", "Repository Information Successfully Loaded. \n", "\n", "Extracting information using headers\n", @@ -112,13 +119,13 @@ "Splitting text into valid excerpts for classification\n", "Text Successfully split. \n", "\n", - "Classifying excerpts for the catgory description\n", + "Classifying excerpts for the category description\n", "Excerpt Classification Successful for the Category description\n", - "Classifying excerpts for the catgory citation\n", + "Classifying excerpts for the category citation\n", "Excerpt Classification Successful for the Category citation\n", - "Classifying excerpts for the catgory installation\n", + "Classifying excerpts for the category installation\n", "Excerpt Classification Successful for the Category installation\n", - "Classifying excerpts for the catgory invocation\n", + "Classifying excerpts for the category invocation\n", "Excerpt Classification Successful for the Category invocation\n", "\n", "\n", @@ -171,31 +178,56 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'description': [{'excerpt': 'The Record Linkage ToolKit (RLTK) is a general-purpose open-source record linkage platform that allows users to build powerful Python programs that link records referring to the same underlying entity. Record linkage is an extremely important problem that shows up in domains extending from social networks to bibliographic data and biomedicine. Current open platforms for record linkage have problems scaling even to moderately sized datasets, or are just not easy to use (even by experts). RLTK attempts to address all of these issues. \\nRLTK supports a full, scalable record linkage pipeline, including multi-core algorithms for blocking, profiling data, computing a wide variety of features, and training and applying machine learning classifiers based on Python’s sklearn library. An end-to-end RLTK pipeline can be jump-started with only a few lines of code. However, RLTK is also designed to be extensible and customizable, allowing users arbitrary degrees of control over many of the individual components. You can add new features to RLTK (e.g. a custom string similarity) very easily. \\n',\n", - " 'confidence': [0.8482897919547067, 0.9163711174405582],\n", - " 'technique': 'Supervised classification'},\n", - " {'excerpt': 'RLTK is under active maintenance and we expect to keep adding new features and state-of-the-art record linkage algorithms in the foreseeable future, in addition to continuously supporting our adopters to integrate the platform into their applications. \\n',\n", - " 'confidence': [0.8574457628244572],\n", + "{'description': [{'excerpt': 'KGTK is a Python toolkit for building applications using knowledge graphs (KG). KGTK is designed for ease of use, scalability and speed. It represents KGs as simple TSV files with four columns to represent the head, relation and tail of a triple, as well as an identifier for each triple. This simple model allows KGTK to operate on property graphs and on RDF graphs. KGTK offers a comprehensive collection of 20+ commands to import, transform, query and analyze KGs, including wrappers for state of the art graph analytics and deep learning libraries. KGTK is optimized for batch processing, making it easy to write KG pipelines that process large KGs such as Wikidata on a laptop to produce datasets for use in downstream applications. KGTK is open-source software released under the MIT license. \\n',\n", + " 'confidence': [0.9100667347076543],\n", " 'technique': 'Supervised classification'},\n", - " {'excerpt': 'Record Linkage ToolKit (Find and link entities)',\n", + " {'excerpt': 'Knowledge Graph Toolkit ',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'}],\n", - " 'citation': [],\n", - " 'installation': [],\n", + " 'citation': [{'excerpt': '```\\n@inproceedings{ilievski2020kgtk,\\n title={{KGTK}: A Toolkit for Large Knowledge Graph Manipulation and Analysis}},\\n author={Ilievski, Filip and Garijo, Daniel and Chalupsky, Hans and Divvala, Naren Teja and Yao, Yixiang and Rogers, Craig and Li, Ronpeng and Liu, Jun and Singh, Amandeep and Schwabe, Daniel and Szekely, Pedro},\\n booktitle={International Semantic Web Conference},\\n pages={278--293},\\n year={2020},\\n organization={Springer}\\n url={https://arxiv.org/pdf/2006.00088.pdf}\\n}\\n```',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'},\n", + " {'excerpt': '@inproceedings{ilievski2020kgtk,\\n title={{KGTK}: A Toolkit for Large Knowledge Graph Manipulation and Analysis}},\\n author={Ilievski, Filip and Garijo, Daniel and Chalupsky, Hans and Divvala, Naren Teja and Yao, Yixiang and Rogers, Craig and Li, Ronpeng and Liu, Jun and Singh, Amandeep and Schwabe, Daniel and Szekely, Pedro},\\n booktitle={International Semantic Web Conference},\\n pages={278--293},\\n year={2020},\\n organization={Springer}\\n url={https://arxiv.org/pdf/2006.00088.pdf}\\n}',\n", + " 'confidence': [1.0],\n", + " 'technique': 'Regular expression'}],\n", + " 'installation': [{'excerpt': 'To update your version of KGTK, just follow the instructions below:\\n\\n- If you installed KGTK with through Docker, then just pull the most recent image: `docker pull `, where `` is the tag of the image of interest (e.g. uscisii2/kgtk:latest)\\n- If you installed KGTK from pip, then type `pip install -U kgtk`.\\n- If you installed KGTK from GitHub, then type `git pull && pip install` . Alternatively, you may execute: `git pull && python setup.py install`.\\n- If you installed KGTK in development mode, (i.e., `pip install -e`); then you only need to do update your repository: `git pull`.\\n\\n',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'},\n", + " {'excerpt': \"Our installation will be in a **conda environment**. If you don't have conda installed, follow [link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) to install it. Once installed, follow the instructions below:\\n\\n1. Set up your own conda environment:\\n```\\nconda create -n kgtk-env python=3.7\\nconda activate kgtk-env\\n```\\n **Note:** Installing Graph-tool is problematic on python 3.8 and out of a virtual environment. Thus: **the advised installation path is by using a virtual environment.**\\n\\n2. Install (the dev branch at this point): `pip install kgtk`\\n\\nYou can test if `kgtk` is installed properly now with: `kgtk -h`.\\n\\n3. Download the English model of SpaCY: `python -m spacy download en_core_web_sm`\\n\\n4. Install `graph-tool`: `conda install -c conda-forge graph-tool`. If you don't use conda or run into problems, see these [instructions](https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions).\\n\\n5. Python library rdflib has a known [issue](https://github.com/RDFLib/rdflib/issues/1043), where the ttl serialization of decimal values is incorrect. The library will add a `.0` at the end of decimal values in scientific notation. This will make the ttl invalid and cannot be loaded into a triplestore.\\n\\nTo solve this issue, run the following commands after the `kgtk` installation is complete.\\n```\\npip uninstall rdflib\\npip install git+https://github.com/RDFLib/rdflib.git@master\\n```\\n\\nThe code fix for this bug is already merged into the library, but has not been released as a `pypi` package. This step will be removed after `rdflib` version 6 is released.\\n\\n\",\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'},\n", + " {'excerpt': '```\\ndocker pull uscisii2/kgtk\\n```\\n\\nTo run KGTK in the command line:\\n\\n```\\ndocker run -it --rm --user root -e NB_GID=100 -e GEN_CERT=yes -e GRANT_SUDO=yes uscisii2/kgtk:latest /bin/bash\\n```\\n\\nNote: if you want to load data from your local machine, you will need to [mount a volume](https://docs.docker.com/storage/volumes/).\\nFor example, to mount the current directory (`$PWD`) and launch KGTK in command line mode:\\n\\n```\\ndocker run -it --rm -v $PWD:/out --user root -e NB_GID=100 -e GEN_CERT=yes -e GRANT_SUDO=yes uscisii2/kgtk:latest /bin/bash\\n```\\n\\nIf you want to run KGTK in a **Jupyter notebook**, mounting the current directory (`$PWD`) as a folder called `/out` then you will have to type:\\n```\\ndocker run -it -v $PWD:/out -p 8888:8888 uscisii2/kgtk:latest /bin/bash -c \"jupyter notebook --ip=\\'*\\' --port=8888 --no-browser\"\\n```\\n\\nMore information about versions and tags is available here: https://hub.docker.com/repository/docker/uscisii2/kgtk. For example, the `dev` branch is available at `uscisii2/kgtk:latest-dev`.\\n\\nSee additional examples in [the documentation](https://kgtk.readthedocs.io/en/latest/install/).\\n\\n',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'}],\n", " 'invocation': [],\n", - " 'long_title': {'excerpt': 'RLTK: Record Linkage ToolKit',\n", + " 'long_title': {'excerpt': 'KGTK: Knowledge Graph Toolkit',\n", " 'confidence': [1.0],\n", " 'technique': 'Regular expression'},\n", - " 'usage': [{'excerpt': \"Installation (make sure prerequisites are installed)::\\n\\n pip install -U rltk\\n\\nExample::\\n\\n >>> import rltk\\n >>> rltk.levenshtein_distance('abc', 'abd')\\n 1\\n\\n\",\n", + " 'executable_example': [{'excerpt': 'https://mybinder.org/v2/gh/dgarijo/kgtk/dev?filepath=%2Fkgtk%2Fexamples%2FCSKG%20Use%20Case.ipynb',\n", + " 'confidence': [1.0],\n", + " 'technique': 'Regular expression'},\n", + " {'excerpt': 'https://mybinder.org/v2/gh/usc-isi-i2/kgtk/master?filepath=examples%2FExample5%20-%20AIDA%20AIF.ipynb',\n", + " 'confidence': [1.0],\n", + " 'technique': 'Regular expression'}],\n", + " 'documentation': [{'excerpt': 'https://kgtk.readthedocs.io/en/latest/\\n\\n',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'}],\n", + " 'run': [{'excerpt': 'To list all the available KGTK commands, run:\\n\\n```\\nkgtk -h\\n```\\n\\nTo see the arguments of a particular commands, run:\\n\\n```\\nkgtk -h\\n```\\n\\nAn example command that computes instances of the subclasses of two classes:\\n\\n```\\nkgtk instances --transitive --class Q13442814,Q12345678\\n```\\n\\n',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'},\n", + " {'excerpt': '```\\ncd kgtk/tests\\npython -W ignore -m unittest discover\\n```\\n\\n',\n", + " 'confidence': [1],\n", + " 'technique': 'Header extraction'}],\n", + " 'usage': [{'excerpt': 'The easiest, no-cost way of trying out KGTK is through [MyBinder](https://mybinder.org/). We have made available several **example notebooks** to show some of the features of KGTK, which can be run in two environments:\\n\\n* **Basic KGTK functionality**: This notebook may take **5-10 minutes** to launch, please be patient. Note that in this notebook some KGTK commands (graph analytics and embeddings) **will not run**. To launch the notebook in your browser, click on the \"Binder\" icon: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/usc-isi-i2/kgtk/master?filepath=examples%2FExample5%20-%20AIDA%20AIF.ipynb)\\n\\n* **Advanced KGTK functionality**: This notebook may take **10-20 minutes to launch**. It includes basic KGTK functionality and **graph analytics and embedding capabilities** of KGTK: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dgarijo/kgtk/dev?filepath=%2Fkgtk%2Fexamples%2FCSKG%20Use%20Case.ipynb)\\n\\nFor executing KGTK with large datasets, **we recommend a Docker/local installation**.\\n\\n',\n", " 'confidence': [1],\n", " 'technique': 'Header extraction'}],\n", - " 'codeRepository': {'excerpt': 'https://github.com/usc-isi-i2/rltk',\n", + " 'codeRepository': {'excerpt': 'https://github.com/usc-isi-i2/kgtk',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", " 'owner': {'excerpt': 'usc-isi-i2',\n", @@ -204,56 +236,189 @@ " 'ownerType': {'excerpt': 'Organization',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'dateCreated': {'excerpt': '2017-02-15T22:20:47Z',\n", + " 'dateCreated': {'excerpt': '2020-01-18T03:34:48Z',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'dateModified': {'excerpt': '2020-10-12T13:26:57Z',\n", + " 'dateModified': {'excerpt': '2021-03-17T17:03:58Z',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", " 'license': {'excerpt': {'name': 'MIT License',\n", " 'url': 'https://api.github.com/licenses/mit'},\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'name': {'excerpt': 'rltk', 'confidence': [1.0], 'technique': 'GitHub API'},\n", - " 'fullName': {'excerpt': 'usc-isi-i2/rltk',\n", + " 'name': {'excerpt': 'kgtk', 'confidence': [1.0], 'technique': 'GitHub API'},\n", + " 'fullName': {'excerpt': 'usc-isi-i2/kgtk',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'issueTracker': {'excerpt': 'https://api.github.com/repos/usc-isi-i2/rltk/issues{/number}',\n", + " 'issueTracker': {'excerpt': 'https://api.github.com/repos/usc-isi-i2/kgtk/issues{/number}',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'forks_url': {'excerpt': 'https://api.github.com/repos/usc-isi-i2/rltk/forks',\n", + " 'forks_url': {'excerpt': 'https://api.github.com/repos/usc-isi-i2/kgtk/forks',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'stargazers_count': {'excerpt': {'count': 70,\n", - " 'date': 'Wed, 30 Dec 2020 22:14:20 GMT'},\n", + " 'stargazers_count': {'excerpt': {'count': 81,\n", + " 'date': 'Sat, 27 Mar 2021 16:40:26 GMT'},\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'forks_count': {'excerpt': {'count': 20,\n", - " 'date': 'Wed, 30 Dec 2020 22:14:20 GMT'},\n", + " 'forks_count': {'excerpt': {'count': 23,\n", + " 'date': 'Sat, 27 Mar 2021 16:40:26 GMT'},\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'downloadUrl': {'excerpt': 'https://github.com/usc-isi-i2/rltk/releases',\n", + " 'downloadUrl': {'excerpt': 'https://github.com/usc-isi-i2/kgtk/releases',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'topics': {'excerpt': ['linkage',\n", - " 'similarity',\n", - " 'similarity-metric',\n", - " 'string-similarity',\n", - " 'record-linkage',\n", - " 'entity-resolution',\n", - " 'deduplication'],\n", + " 'topics': {'excerpt': ['knowledge-graphs',\n", + " 'graphs',\n", + " 'efficient',\n", + " 'triples',\n", + " 'rdf',\n", + " 'etl-framework'],\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'languages': {'excerpt': ['Python', 'Shell'],\n", + " 'languages': {'excerpt': ['Jupyter Notebook',\n", + " 'Python',\n", + " 'HTML',\n", + " 'Shell',\n", + " 'Dockerfile',\n", + " 'Makefile'],\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'readme_url': {'excerpt': 'https://github.com/usc-isi-i2/rltk/blob/master/README.rst',\n", + " 'readme_url': {'excerpt': 'https://github.com/usc-isi-i2/kgtk/blob/master/README.md',\n", " 'confidence': [1.0],\n", " 'technique': 'GitHub API'},\n", - " 'releases': {'excerpt': [], 'confidence': [1.0], 'technique': 'GitHub API'}}" + " 'hasExecutableNotebook': {'excerpt': ['https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Knowledge-Graph-Profiler.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Generate-Triples-And-Load-Blazegraph.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Wikidata%20Subsets.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Wikidata%20Useful%20Files.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/3%20Enhance%20KG.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/Knowledge-Graph-Profiler.out.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/5%20Embeddings.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/4%20Generate%20Triples.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/2%20Construct%20KG.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/1%20Introduction.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example2%20-%20Curation%20and%20Statistics.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/CSKG%20Use%20Case.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/abbreviate_human_labels.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example12%20-%20CSKG%20Analysis.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example6%20-%20Wikipedia%20Tables.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/count-wikidata-entities-and-properties.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/partition-wikidata.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example%2011%20-%20Find%20Ambiguous%20Items.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example4%20-%20Wikidata%20Pagerank.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example5%20-%20AIDA%20AIF.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/generate_wikitable_anchors.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example8%20-%20Wikidata%20Subset.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example3%20-%20Reachability.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/combine-with-qualifiers.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example1%20-%20Embeddings.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example7%20-%20Wikidata%20Outputs.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/DBPedia_links.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/commands/text_embeddings.ipynb'],\n", + " 'confidence': [1.0],\n", + " 'technique': 'File Exploration'},\n", + " 'hasBuildFile': {'excerpt': ['https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/Dockerfile',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/dev/Dockerfile',\n", + " 'https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/lite/Dockerfile'],\n", + " 'confidence': [1.0],\n", + " 'technique': 'File Exploration'},\n", + " 'hasDocumentation': {'excerpt': ['https://github.com/usc-isi-i2/kgtk/tree/master/docs',\n", + " 'https://github.com/usc-isi-i2/kgtk/tree/master/examples/docs'],\n", + " 'confidence': [1.0],\n", + " 'technique': 'File Exploration'},\n", + " 'releases': {'excerpt': [{'tag_name': '0.6.0',\n", + " 'name': 'KGTK Tutorial',\n", + " 'author_name': 'saggu',\n", + " 'authorType': 'User',\n", + " 'body': '',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.6.0',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.6.0',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.6.0',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/36946041',\n", + " 'dateCreated': '2021-01-27T01:56:30Z',\n", + " 'datePublished': '2021-01-27T01:57:13Z'},\n", + " {'tag_name': '0.5.0',\n", + " 'name': 'KGTK 0.5.0: Kypher and other improvements',\n", + " 'author_name': 'saggu',\n", + " 'authorType': 'User',\n", + " 'body': 'This release includes Kypher, an implementation of the Cypher query language over KGTK files without the need to add them to a graph store. \\r\\nOther major improvements include:\\r\\n- Bug fixes (see https://github.com/usc-isi-i2/kgtk/pull/197)\\r\\n- Parallel processing when importing triples\\r\\n- Updated Tutorial materials',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.5.0',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.5.0',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.5.0',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/35059658',\n", + " 'dateCreated': '2020-12-09T18:43:14Z',\n", + " 'datePublished': '2020-12-09T18:44:59Z'},\n", + " {'tag_name': '0.4.0',\n", + " 'name': 'Unit Test Coverage',\n", + " 'author_name': 'saggu',\n", + " 'authorType': 'User',\n", + " 'body': '',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.4.0',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.4.0',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.4.0',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/31079990',\n", + " 'dateCreated': '2020-09-10T16:42:14Z',\n", + " 'datePublished': '2020-09-10T19:06:34Z'},\n", + " {'tag_name': '0.3.2',\n", + " 'name': 'Consistent command names and options',\n", + " 'author_name': 'saggu',\n", + " 'authorType': 'User',\n", + " 'body': 'This release introduces a set of changes to homogenize the command names and make them consistent. It also addresses:\\r\\n- Updates to examples and notebooks (more notebooks).\\r\\n- Several bug fixes on commands and pipes.\\r\\n- Consistency on inputs and output \\r\\n- Docker files for accessing a lite version and a dev version of KGTK.\\r\\n- Examples on how to execute KGTK in myBinder\\r\\n- Updates and improvements to documentation\\r\\n- New commands for creating canonical representations of tabular data',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.3.2',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.3.2',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.3.2',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/28277008',\n", + " 'dateCreated': '2020-07-06T22:15:21Z',\n", + " 'datePublished': '2020-07-06T22:42:03Z'},\n", + " {'tag_name': 'v0.2.1',\n", + " 'name': 'KGTK 0.2.1: Additional commands and bug fixes',\n", + " 'author_name': 'dgarijo',\n", + " 'authorType': 'User',\n", + " 'body': 'This version of KGTK fixes:\\r\\n* Updates installation instructions to add Docker support\\r\\n* Updates stats\\r\\n* Refines filter command\\r\\n* adds expand and explode commands\\r\\n* Refines the clean and validate command with additional options\\r\\n* Bug fixes in export to WD triples (additional support for custom ns prefixes)\\r\\n* new commands: lift, rename columns\\r\\n* ...',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/v0.2.1',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/v0.2.1',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/v0.2.1',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/27512547',\n", + " 'dateCreated': '2020-06-12T00:21:42Z',\n", + " 'datePublished': '2020-06-12T22:51:46Z'},\n", + " {'tag_name': 'v0.2.0',\n", + " 'name': 'KGTK 0.2.0: Extended command support',\n", + " 'author_name': 'dgarijo',\n", + " 'authorType': 'User',\n", + " 'body': 'This release incorporates expands the previous set of commands for importing and exporting triples; curating and manipulating graphs (validating, cleaning, merging, filtering); and graph analysis (centrality, page rank, text embeddings). See the full documentation at https://kgtk.readthedocs.io/en/latest/',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/v0.2.0',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/v0.2.0',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/v0.2.0',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/26772346',\n", + " 'dateCreated': '2020-05-21T20:17:07Z',\n", + " 'datePublished': '2020-05-21T20:19:02Z'},\n", + " {'tag_name': '0.1.1',\n", + " 'name': 'KGTK 0.1.1 release',\n", + " 'author_name': 'dgarijo',\n", + " 'authorType': 'User',\n", + " 'body': 'Intermediate release in preparation for the ISWC paper',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.1.1',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.1.1',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.1.1',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/26539516',\n", + " 'dateCreated': '2020-04-28T00:24:19Z',\n", + " 'datePublished': '2020-05-15T01:14:07Z'},\n", + " {'tag_name': '0.1.0',\n", + " 'name': 'First pypi release',\n", + " 'author_name': 'saggu',\n", + " 'authorType': 'User',\n", + " 'body': '',\n", + " 'tarball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.1.0',\n", + " 'zipball_url': 'https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.1.0',\n", + " 'html_url': 'https://github.com/usc-isi-i2/kgtk/releases/tag/0.1.0',\n", + " 'url': 'https://api.github.com/repos/usc-isi-i2/kgtk/releases/25943386',\n", + " 'dateCreated': '2020-04-28T00:09:06Z',\n", + " 'datePublished': '2020-04-28T00:20:30Z'}],\n", + " 'confidence': [1.0],\n", + " 'technique': 'GitHub API'}}" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -264,13 +429,6 @@ "results = json.load(f)\n", "results" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebook/test.json b/notebook/test.json index 3c146d15..3075b1c9 100644 --- a/notebook/test.json +++ b/notebook/test.json @@ -1 +1 @@ -{"description": [{"excerpt": "The Record Linkage ToolKit (RLTK) is a general-purpose open-source record linkage platform that allows users to build powerful Python programs that link records referring to the same underlying entity. Record linkage is an extremely important problem that shows up in domains extending from social networks to bibliographic data and biomedicine. Current open platforms for record linkage have problems scaling even to moderately sized datasets, or are just not easy to use (even by experts). RLTK attempts to address all of these issues. \nRLTK supports a full, scalable record linkage pipeline, including multi-core algorithms for blocking, profiling data, computing a wide variety of features, and training and applying machine learning classifiers based on Python\u2019s sklearn library. An end-to-end RLTK pipeline can be jump-started with only a few lines of code. However, RLTK is also designed to be extensible and customizable, allowing users arbitrary degrees of control over many of the individual components. You can add new features to RLTK (e.g. a custom string similarity) very easily. \n", "confidence": [0.8482897919547067, 0.9163711174405582], "technique": "Supervised classification"}, {"excerpt": "RLTK is under active maintenance and we expect to keep adding new features and state-of-the-art record linkage algorithms in the foreseeable future, in addition to continuously supporting our adopters to integrate the platform into their applications. \n", "confidence": [0.8574457628244572], "technique": "Supervised classification"}, {"excerpt": "Record Linkage ToolKit (Find and link entities)", "confidence": [1.0], "technique": "GitHub API"}], "citation": [], "installation": [], "invocation": [], "long_title": {"excerpt": "RLTK: Record Linkage ToolKit", "confidence": [1.0], "technique": "Regular expression"}, "usage": [{"excerpt": "Installation (make sure prerequisites are installed)::\n\n pip install -U rltk\n\nExample::\n\n >>> import rltk\n >>> rltk.levenshtein_distance('abc', 'abd')\n 1\n\n", "confidence": [1], "technique": "Header extraction"}], "codeRepository": {"excerpt": "https://github.com/usc-isi-i2/rltk", "confidence": [1.0], "technique": "GitHub API"}, "owner": {"excerpt": "usc-isi-i2", "confidence": [1.0], "technique": "GitHub API"}, "ownerType": {"excerpt": "Organization", "confidence": [1.0], "technique": "GitHub API"}, "dateCreated": {"excerpt": "2017-02-15T22:20:47Z", "confidence": [1.0], "technique": "GitHub API"}, "dateModified": {"excerpt": "2020-10-12T13:26:57Z", "confidence": [1.0], "technique": "GitHub API"}, "license": {"excerpt": {"name": "MIT License", "url": "https://api.github.com/licenses/mit"}, "confidence": [1.0], "technique": "GitHub API"}, "name": {"excerpt": "rltk", "confidence": [1.0], "technique": "GitHub API"}, "fullName": {"excerpt": "usc-isi-i2/rltk", "confidence": [1.0], "technique": "GitHub API"}, "issueTracker": {"excerpt": "https://api.github.com/repos/usc-isi-i2/rltk/issues{/number}", "confidence": [1.0], "technique": "GitHub API"}, "forks_url": {"excerpt": "https://api.github.com/repos/usc-isi-i2/rltk/forks", "confidence": [1.0], "technique": "GitHub API"}, "stargazers_count": {"excerpt": {"count": 70, "date": "Wed, 30 Dec 2020 22:14:20 GMT"}, "confidence": [1.0], "technique": "GitHub API"}, "forks_count": {"excerpt": {"count": 20, "date": "Wed, 30 Dec 2020 22:14:20 GMT"}, "confidence": [1.0], "technique": "GitHub API"}, "downloadUrl": {"excerpt": "https://github.com/usc-isi-i2/rltk/releases", "confidence": [1.0], "technique": "GitHub API"}, "topics": {"excerpt": ["linkage", "similarity", "similarity-metric", "string-similarity", "record-linkage", "entity-resolution", "deduplication"], "confidence": [1.0], "technique": "GitHub API"}, "languages": {"excerpt": ["Python", "Shell"], "confidence": [1.0], "technique": "GitHub API"}, "readme_url": {"excerpt": "https://github.com/usc-isi-i2/rltk/blob/master/README.rst", "confidence": [1.0], "technique": "GitHub API"}, "releases": {"excerpt": [], "confidence": [1.0], "technique": "GitHub API"}} \ No newline at end of file +{"description": [{"excerpt": "KGTK is a Python toolkit for building applications using knowledge graphs (KG). KGTK is designed for ease of use, scalability and speed. It represents KGs as simple TSV files with four columns to represent the head, relation and tail of a triple, as well as an identifier for each triple. This simple model allows KGTK to operate on property graphs and on RDF graphs. KGTK offers a comprehensive collection of 20+ commands to import, transform, query and analyze KGs, including wrappers for state of the art graph analytics and deep learning libraries. KGTK is optimized for batch processing, making it easy to write KG pipelines that process large KGs such as Wikidata on a laptop to produce datasets for use in downstream applications. KGTK is open-source software released under the MIT license. \n", "confidence": [0.9100667347076543], "technique": "Supervised classification"}, {"excerpt": "Knowledge Graph Toolkit ", "confidence": [1.0], "technique": "GitHub API"}], "citation": [{"excerpt": "```\n@inproceedings{ilievski2020kgtk,\n title={{KGTK}: A Toolkit for Large Knowledge Graph Manipulation and Analysis}},\n author={Ilievski, Filip and Garijo, Daniel and Chalupsky, Hans and Divvala, Naren Teja and Yao, Yixiang and Rogers, Craig and Li, Ronpeng and Liu, Jun and Singh, Amandeep and Schwabe, Daniel and Szekely, Pedro},\n booktitle={International Semantic Web Conference},\n pages={278--293},\n year={2020},\n organization={Springer}\n url={https://arxiv.org/pdf/2006.00088.pdf}\n}\n```", "confidence": [1], "technique": "Header extraction"}, {"excerpt": "@inproceedings{ilievski2020kgtk,\n title={{KGTK}: A Toolkit for Large Knowledge Graph Manipulation and Analysis}},\n author={Ilievski, Filip and Garijo, Daniel and Chalupsky, Hans and Divvala, Naren Teja and Yao, Yixiang and Rogers, Craig and Li, Ronpeng and Liu, Jun and Singh, Amandeep and Schwabe, Daniel and Szekely, Pedro},\n booktitle={International Semantic Web Conference},\n pages={278--293},\n year={2020},\n organization={Springer}\n url={https://arxiv.org/pdf/2006.00088.pdf}\n}", "confidence": [1.0], "technique": "Regular expression"}], "installation": [{"excerpt": "To update your version of KGTK, just follow the instructions below:\n\n- If you installed KGTK with through Docker, then just pull the most recent image: `docker pull `, where `` is the tag of the image of interest (e.g. uscisii2/kgtk:latest)\n- If you installed KGTK from pip, then type `pip install -U kgtk`.\n- If you installed KGTK from GitHub, then type `git pull && pip install` . Alternatively, you may execute: `git pull && python setup.py install`.\n- If you installed KGTK in development mode, (i.e., `pip install -e`); then you only need to do update your repository: `git pull`.\n\n", "confidence": [1], "technique": "Header extraction"}, {"excerpt": "Our installation will be in a **conda environment**. If you don't have conda installed, follow [link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) to install it. Once installed, follow the instructions below:\n\n1. Set up your own conda environment:\n```\nconda create -n kgtk-env python=3.7\nconda activate kgtk-env\n```\n **Note:** Installing Graph-tool is problematic on python 3.8 and out of a virtual environment. Thus: **the advised installation path is by using a virtual environment.**\n\n2. Install (the dev branch at this point): `pip install kgtk`\n\nYou can test if `kgtk` is installed properly now with: `kgtk -h`.\n\n3. Download the English model of SpaCY: `python -m spacy download en_core_web_sm`\n\n4. Install `graph-tool`: `conda install -c conda-forge graph-tool`. If you don't use conda or run into problems, see these [instructions](https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions).\n\n5. Python library rdflib has a known [issue](https://github.com/RDFLib/rdflib/issues/1043), where the ttl serialization of decimal values is incorrect. The library will add a `.0` at the end of decimal values in scientific notation. This will make the ttl invalid and cannot be loaded into a triplestore.\n\nTo solve this issue, run the following commands after the `kgtk` installation is complete.\n```\npip uninstall rdflib\npip install git+https://github.com/RDFLib/rdflib.git@master\n```\n\nThe code fix for this bug is already merged into the library, but has not been released as a `pypi` package. This step will be removed after `rdflib` version 6 is released.\n\n", "confidence": [1], "technique": "Header extraction"}, {"excerpt": "```\ndocker pull uscisii2/kgtk\n```\n\nTo run KGTK in the command line:\n\n```\ndocker run -it --rm --user root -e NB_GID=100 -e GEN_CERT=yes -e GRANT_SUDO=yes uscisii2/kgtk:latest /bin/bash\n```\n\nNote: if you want to load data from your local machine, you will need to [mount a volume](https://docs.docker.com/storage/volumes/).\nFor example, to mount the current directory (`$PWD`) and launch KGTK in command line mode:\n\n```\ndocker run -it --rm -v $PWD:/out --user root -e NB_GID=100 -e GEN_CERT=yes -e GRANT_SUDO=yes uscisii2/kgtk:latest /bin/bash\n```\n\nIf you want to run KGTK in a **Jupyter notebook**, mounting the current directory (`$PWD`) as a folder called `/out` then you will have to type:\n```\ndocker run -it -v $PWD:/out -p 8888:8888 uscisii2/kgtk:latest /bin/bash -c \"jupyter notebook --ip='*' --port=8888 --no-browser\"\n```\n\nMore information about versions and tags is available here: https://hub.docker.com/repository/docker/uscisii2/kgtk. For example, the `dev` branch is available at `uscisii2/kgtk:latest-dev`.\n\nSee additional examples in [the documentation](https://kgtk.readthedocs.io/en/latest/install/).\n\n", "confidence": [1], "technique": "Header extraction"}], "invocation": [], "long_title": {"excerpt": "KGTK: Knowledge Graph Toolkit", "confidence": [1.0], "technique": "Regular expression"}, "executable_example": [{"excerpt": "https://mybinder.org/v2/gh/dgarijo/kgtk/dev?filepath=%2Fkgtk%2Fexamples%2FCSKG%20Use%20Case.ipynb", "confidence": [1.0], "technique": "Regular expression"}, {"excerpt": "https://mybinder.org/v2/gh/usc-isi-i2/kgtk/master?filepath=examples%2FExample5%20-%20AIDA%20AIF.ipynb", "confidence": [1.0], "technique": "Regular expression"}], "documentation": [{"excerpt": "https://kgtk.readthedocs.io/en/latest/\n\n", "confidence": [1], "technique": "Header extraction"}], "run": [{"excerpt": "To list all the available KGTK commands, run:\n\n```\nkgtk -h\n```\n\nTo see the arguments of a particular commands, run:\n\n```\nkgtk -h\n```\n\nAn example command that computes instances of the subclasses of two classes:\n\n```\nkgtk instances --transitive --class Q13442814,Q12345678\n```\n\n", "confidence": [1], "technique": "Header extraction"}, {"excerpt": "```\ncd kgtk/tests\npython -W ignore -m unittest discover\n```\n\n", "confidence": [1], "technique": "Header extraction"}], "usage": [{"excerpt": "The easiest, no-cost way of trying out KGTK is through [MyBinder](https://mybinder.org/). We have made available several **example notebooks** to show some of the features of KGTK, which can be run in two environments:\n\n* **Basic KGTK functionality**: This notebook may take **5-10 minutes** to launch, please be patient. Note that in this notebook some KGTK commands (graph analytics and embeddings) **will not run**. To launch the notebook in your browser, click on the \"Binder\" icon: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/usc-isi-i2/kgtk/master?filepath=examples%2FExample5%20-%20AIDA%20AIF.ipynb)\n\n* **Advanced KGTK functionality**: This notebook may take **10-20 minutes to launch**. It includes basic KGTK functionality and **graph analytics and embedding capabilities** of KGTK: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dgarijo/kgtk/dev?filepath=%2Fkgtk%2Fexamples%2FCSKG%20Use%20Case.ipynb)\n\nFor executing KGTK with large datasets, **we recommend a Docker/local installation**.\n\n", "confidence": [1], "technique": "Header extraction"}], "codeRepository": {"excerpt": "https://github.com/usc-isi-i2/kgtk", "confidence": [1.0], "technique": "GitHub API"}, "owner": {"excerpt": "usc-isi-i2", "confidence": [1.0], "technique": "GitHub API"}, "ownerType": {"excerpt": "Organization", "confidence": [1.0], "technique": "GitHub API"}, "dateCreated": {"excerpt": "2020-01-18T03:34:48Z", "confidence": [1.0], "technique": "GitHub API"}, "dateModified": {"excerpt": "2021-03-17T17:03:58Z", "confidence": [1.0], "technique": "GitHub API"}, "license": {"excerpt": {"name": "MIT License", "url": "https://api.github.com/licenses/mit"}, "confidence": [1.0], "technique": "GitHub API"}, "name": {"excerpt": "kgtk", "confidence": [1.0], "technique": "GitHub API"}, "fullName": {"excerpt": "usc-isi-i2/kgtk", "confidence": [1.0], "technique": "GitHub API"}, "issueTracker": {"excerpt": "https://api.github.com/repos/usc-isi-i2/kgtk/issues{/number}", "confidence": [1.0], "technique": "GitHub API"}, "forks_url": {"excerpt": "https://api.github.com/repos/usc-isi-i2/kgtk/forks", "confidence": [1.0], "technique": "GitHub API"}, "stargazers_count": {"excerpt": {"count": 81, "date": "Sat, 27 Mar 2021 16:40:26 GMT"}, "confidence": [1.0], "technique": "GitHub API"}, "forks_count": {"excerpt": {"count": 23, "date": "Sat, 27 Mar 2021 16:40:26 GMT"}, "confidence": [1.0], "technique": "GitHub API"}, "downloadUrl": {"excerpt": "https://github.com/usc-isi-i2/kgtk/releases", "confidence": [1.0], "technique": "GitHub API"}, "topics": {"excerpt": ["knowledge-graphs", "graphs", "efficient", "triples", "rdf", "etl-framework"], "confidence": [1.0], "technique": "GitHub API"}, "languages": {"excerpt": ["Jupyter Notebook", "Python", "HTML", "Shell", "Dockerfile", "Makefile"], "confidence": [1.0], "technique": "GitHub API"}, "readme_url": {"excerpt": "https://github.com/usc-isi-i2/kgtk/blob/master/README.md", "confidence": [1.0], "technique": "GitHub API"}, "hasExecutableNotebook": {"excerpt": ["https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Knowledge-Graph-Profiler.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Generate-Triples-And-Load-Blazegraph.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Wikidata%20Subsets.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/use-cases/Wikidata%20Useful%20Files.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/3%20Enhance%20KG.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/Knowledge-Graph-Profiler.out.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/5%20Embeddings.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/4%20Generate%20Triples.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/2%20Construct%20KG.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/tutorial/1%20Introduction.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example2%20-%20Curation%20and%20Statistics.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/CSKG%20Use%20Case.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/abbreviate_human_labels.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example12%20-%20CSKG%20Analysis.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example6%20-%20Wikipedia%20Tables.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/count-wikidata-entities-and-properties.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/partition-wikidata.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example%2011%20-%20Find%20Ambiguous%20Items.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example4%20-%20Wikidata%20Pagerank.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example5%20-%20AIDA%20AIF.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/generate_wikitable_anchors.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example8%20-%20Wikidata%20Subset.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example3%20-%20Reachability.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/combine-with-qualifiers.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example1%20-%20Embeddings.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example7%20-%20Wikidata%20Outputs.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/DBPedia_links.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/examples/commands/text_embeddings.ipynb"], "confidence": [1.0], "technique": "File Exploration"}, "hasBuildFile": {"excerpt": ["https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/Dockerfile", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/dev/Dockerfile", "https://raw.githubusercontent.com/usc-isi-i2/kgtk/master/docker/lite/Dockerfile"], "confidence": [1.0], "technique": "File Exploration"}, "hasDocumentation": {"excerpt": ["https://github.com/usc-isi-i2/kgtk/tree/master/docs", "https://github.com/usc-isi-i2/kgtk/tree/master/examples/docs"], "confidence": [1.0], "technique": "File Exploration"}, "releases": {"excerpt": [{"tag_name": "0.6.0", "name": "KGTK Tutorial", "author_name": "saggu", "authorType": "User", "body": "", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.6.0", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.6.0", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.6.0", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/36946041", "dateCreated": "2021-01-27T01:56:30Z", "datePublished": "2021-01-27T01:57:13Z"}, {"tag_name": "0.5.0", "name": "KGTK 0.5.0: Kypher and other improvements", "author_name": "saggu", "authorType": "User", "body": "This release includes Kypher, an implementation of the Cypher query language over KGTK files without the need to add them to a graph store. \r\nOther major improvements include:\r\n- Bug fixes (see https://github.com/usc-isi-i2/kgtk/pull/197)\r\n- Parallel processing when importing triples\r\n- Updated Tutorial materials", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.5.0", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.5.0", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.5.0", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/35059658", "dateCreated": "2020-12-09T18:43:14Z", "datePublished": "2020-12-09T18:44:59Z"}, {"tag_name": "0.4.0", "name": "Unit Test Coverage", "author_name": "saggu", "authorType": "User", "body": "", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.4.0", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.4.0", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.4.0", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/31079990", "dateCreated": "2020-09-10T16:42:14Z", "datePublished": "2020-09-10T19:06:34Z"}, {"tag_name": "0.3.2", "name": "Consistent command names and options", "author_name": "saggu", "authorType": "User", "body": "This release introduces a set of changes to homogenize the command names and make them consistent. It also addresses:\r\n- Updates to examples and notebooks (more notebooks).\r\n- Several bug fixes on commands and pipes.\r\n- Consistency on inputs and output \r\n- Docker files for accessing a lite version and a dev version of KGTK.\r\n- Examples on how to execute KGTK in myBinder\r\n- Updates and improvements to documentation\r\n- New commands for creating canonical representations of tabular data", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.3.2", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.3.2", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.3.2", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/28277008", "dateCreated": "2020-07-06T22:15:21Z", "datePublished": "2020-07-06T22:42:03Z"}, {"tag_name": "v0.2.1", "name": "KGTK 0.2.1: Additional commands and bug fixes", "author_name": "dgarijo", "authorType": "User", "body": "This version of KGTK fixes:\r\n* Updates installation instructions to add Docker support\r\n* Updates stats\r\n* Refines filter command\r\n* adds expand and explode commands\r\n* Refines the clean and validate command with additional options\r\n* Bug fixes in export to WD triples (additional support for custom ns prefixes)\r\n* new commands: lift, rename columns\r\n* ...", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/v0.2.1", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/v0.2.1", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/v0.2.1", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/27512547", "dateCreated": "2020-06-12T00:21:42Z", "datePublished": "2020-06-12T22:51:46Z"}, {"tag_name": "v0.2.0", "name": "KGTK 0.2.0: Extended command support", "author_name": "dgarijo", "authorType": "User", "body": "This release incorporates expands the previous set of commands for importing and exporting triples; curating and manipulating graphs (validating, cleaning, merging, filtering); and graph analysis (centrality, page rank, text embeddings). See the full documentation at https://kgtk.readthedocs.io/en/latest/", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/v0.2.0", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/v0.2.0", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/v0.2.0", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/26772346", "dateCreated": "2020-05-21T20:17:07Z", "datePublished": "2020-05-21T20:19:02Z"}, {"tag_name": "0.1.1", "name": "KGTK 0.1.1 release", "author_name": "dgarijo", "authorType": "User", "body": "Intermediate release in preparation for the ISWC paper", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.1.1", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.1.1", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.1.1", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/26539516", "dateCreated": "2020-04-28T00:24:19Z", "datePublished": "2020-05-15T01:14:07Z"}, {"tag_name": "0.1.0", "name": "First pypi release", "author_name": "saggu", "authorType": "User", "body": "", "tarball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/tarball/0.1.0", "zipball_url": "https://api.github.com/repos/usc-isi-i2/kgtk/zipball/0.1.0", "html_url": "https://github.com/usc-isi-i2/kgtk/releases/tag/0.1.0", "url": "https://api.github.com/repos/usc-isi-i2/kgtk/releases/25943386", "dateCreated": "2020-04-28T00:09:06Z", "datePublished": "2020-04-28T00:20:30Z"}], "confidence": [1.0], "technique": "GitHub API"}} \ No newline at end of file diff --git a/src/somef/cli.py b/src/somef/cli.py index 9479d7e6..e123fe0d 100644 --- a/src/somef/cli.py +++ b/src/somef/cli.py @@ -389,7 +389,7 @@ def run_classifiers(excerpts, file_paths): file_name = file_paths[category] if not path.exists(file_name): sys.exit(f"Error: File/Directory {file_name} does not exist") - print("Classifying excerpts for the catgory", category) + print("Classifying excerpts for the category", category) classifier = pickle.load(open(file_name, 'rb')) scores = classifier.predict_proba(excerpts) score_dict[category] = {'excerpt': excerpts, 'confidence': scores[:, 1]}