From 2c3f971d3f99836624be60d6c77c1c7ab08a3b56 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:15:11 +0100 Subject: [PATCH 1/2] fix: github portal documentation --- docs/portals/github.md | 140 +++++++++++------------------------------ 1 file changed, 36 insertions(+), 104 deletions(-) diff --git a/docs/portals/github.md b/docs/portals/github.md index ce62833e41..9b2fe4f87b 100644 --- a/docs/portals/github.md +++ b/docs/portals/github.md @@ -16,90 +16,47 @@ pip install 'frictionless[github]' --pre # for zsh shell You can read data from a github repository as follows: ```python tabs=Python -from pprint import pprint from frictionless import portals, Package -package = Package("https://github.com/fdtester/test-repo-without-datapackage") +package = Package("https://github.com/fdtester/test-repo-with-datapackage-json") print(package) ``` ``` -{'name': 'test-repo-without-datapackage', - 'resources': [{'name': 'capitals', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/capitals.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}, - {'name': 'countries', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/countries.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}, - {'name': 'student', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/student.xlsx', - 'scheme': 'https', - 'format': 'xlsx', - 'mediatype': 'application/vnd.ms-excel'}]} -``` -You can also use alias function instead, for example: -```python tabs=Python -from pprint import pprint -from frictionless import portals, Package - -package = Package("https://github.com/fdtester/test-repo-without-datapackage") -print(package) +{'name': 'test-package', + 'resources': [{'name': 'first-resource', + 'type': 
'table', + 'path': 'table.xls', + 'scheme': 'file', + 'format': 'xls', + 'mediatype': 'application/vnd.ms-excel', + 'schema': {'fields': [{'name': 'id', 'type': 'number'}, + {'name': 'name', 'type': 'string'}]}}]} ``` To increase the access limit, pass 'apikey' as the param to the reader function as follows: ```python tabs=Python -from pprint import pprint from frictionless import portals, Package control = portals.GithubControl(apikey=apikey) -package = Package("https://github.com/fdtester/test-repo-without-datapackage", control=control) +package = Package("https://github.com/fdtester/test-repo-with-datapackage-json", control=control) print(package) ``` -The `reader` function can read package from repos with/without data package descriptor. If the repo does not have the descriptor it will create the descriptor with the name same as the repo name as shown in the example above. By default, the function reads files of type csv, xlsx and xls but we can set the file types using control parameters. - -If the repo has a descriptor it simply returns the descriptor as shown below +The `reader` function can read a package from repos with or without a data package descriptor. If the repo does not have a descriptor, it creates one named after the repo. By default, the function reads files of type csv, xlsx and xls, but the file types can be set using control parameters. -```python tabs=Python -from pprint import pprint -from frictionless import portals, Package +If the repo has a descriptor it simply returns the descriptor as shown above. 
-package = Package("https://https://github.com/fdtester/test-repo-with-datapackage-json") -``` -``` -print(package) -{'name': 'test-tabulator', - 'resources': [{'name': 'first-resource', - 'path': 'table.xls', - 'schema': {'fields': [{'name': 'id', 'type': 'number'}, - {'name': 'name', 'type': 'string'}]}}, - {'name': 'number-two', - 'path': 'table-reverse.csv', - 'schema': {'fields': [{'name': 'id', 'type': 'integer'}, - {'name': 'name', 'type': 'string'}]}}]} -``` Once you read the package from the repo, you can then easily access the resources and its data, for example: ```python tabs=Python -from pprint import pprint from frictionless import portals, Package -package = Package("https://github.com/fdtester/test-repo-without-datapackage") -pprint(package.get_resource('capitals').read_rows()) +package = Package("https://github.com/fdtester/test-repo-with-datapackage-json") +print(package.get_resource('first-resource').read_rows()) ``` ``` -[{'id': 1, 'cid': 1, 'name': 'London'}, - {'id': 2, 'cid': 2, 'name': 'Paris'}, - {'id': 3, 'cid': 3, 'name': 'Berlin'}, - {'id': 4, 'cid': 4, 'name': 'Rome'}, - {'id': 5, 'cid': 5, 'name': 'Lisbon'}] +[{'id': 1, 'name': 'english'}, {'id': 2, 'name': '中国人'}] ``` ## Reading Catalog @@ -107,7 +64,6 @@ pprint(package.get_resource('capitals').read_rows()) Catalog is a container for the packages. We can read single/multiple repositories from github and create a catalog. ```python tabs=Python -from pprint import pprint from frictionless import portals, Catalog control = portals.GithubControl(search="'TestAction: Read' in:readme", apikey=apikey) @@ -138,11 +94,12 @@ Total packages 4 'format': 'csv', 'mediatype': 'text/csv'}]}] ``` + To read catalog, we need authenticated user so we have to pass the token as 'apikey' to the function. In the above example we are using search text to filter the repositories to small number. The search field is not mandatory. 
We can simply use 'control' parameters and get the same result as above, for example: + ```python tabs=Python -from pprint import pprint from frictionless import portals, Catalog control = portals.GithubControl(search="'TestAction: Read' in:readme", user="fdtester", apikey=apikey) @@ -150,6 +107,7 @@ catalog = Catalog(control=control) print("Total packages", len(catalog.packages)) print(catalog.packages[:2]) ``` + As shown in the example above, we can use different qualifiers to search the repos. The above example searches for all the repos which has 'TestAction: Read' text in readme files. Similary we can use many different qualifiers and combination of those. To get full list of qualifiers you can check the github document [here](https://docs.github.com/en/search-github/searching-on-github/searching-for-repositories). Some examples of the qualifiers: @@ -159,9 +117,10 @@ Some examples of the qualifiers: ‘jquery’ in:name user:name sort:updated-asc ‘TestAction: Read’ in:readme ``` + If we want to read the list of repositories of user 'fdtester' which has 'jquery' in its name then we write search query as follows: + ```python tabs=Python -from pprint import pprint from frictionless import portals, Catalog control = portals.GithubControl(apikey=apikey, search="user:fdtester jquery in:name") @@ -177,11 +136,12 @@ print(catalog.packages) 'format': 'csv', 'mediatype': 'text/csv'}]}] ``` + There is only one repository having 'jquery' in name for this user's account, so it returned only one repository. We can also read repositories in defined order using 'sort' param or qualifier. 
Here we are trying to read the repos with 'TestAction: Read' text in readme file in recently updated order, for example: + ```python tabs=Python -from pprint import pprint from frictionless import portals, Catalog control = portals.GithubControl(apikey=apikey, search="user:fdtester sort:updated-desc 'TestAction: Read' in:readme") @@ -193,27 +153,6 @@ for index,package in enumerate(catalog.packages): ``` package:0 -{'name': 'test-repo-without-datapackage', - 'resources': [{'name': 'capitals', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/capitals.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}, - {'name': 'countries', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/countries.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}, - {'name': 'student', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/student.xlsx', - 'scheme': 'https', - 'format': 'xlsx', - 'mediatype': 'application/vnd.ms-excel'}]} -package:1 - {'name': 'test-repo-jquery', 'resources': [{'name': 'country-1', 'type': 'table', @@ -221,7 +160,7 @@ package:1 'scheme': 'https', 'format': 'csv', 'mediatype': 'text/csv'}]} -package:2 +package:1 {'resources': [{'name': 'capitals', 'type': 'table', @@ -234,7 +173,7 @@ package:2 'schema': {'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'cid', 'type': 'integer'}, {'name': 'name', 'type': 'string'}]}}]} -package:3 +package:2 {'name': 'test-tabulator', 'resources': [{'name': 'first-resource', @@ -251,7 +190,6 @@ package:3 To write data to the repository, we use `Package.publish` function as follows: ```python tabs=Python -from pprint import pprint from frictionless import portals, Package package = Package('1174/datapackage.json') @@ -273,33 +211,27 @@ We can control the behavior of all the above three functions 
using various param For example, to read only 'csv' files in package we use the following code: ```python tabs=Python -from pprint import pprint from frictionless import portals, Package -control = portals.GithubControl(user="fdtester", formats=["csv"], repo="test-repo-without-datapackage", apikey=apikey) -package = Package("https://github.com/fdtester/test-repo-without-datapackage") +control = portals.GithubControl(user="fdtester", formats=["csv"], repo="test-repo-without-datapackage") +package = Package("https://github.com/fdtester/test-repo-with-datapackage-json") print(package) ``` ``` -{'name': 'test-repo-without-datapackage', - 'resources': [{'name': 'capitals', - 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/capitals.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}, - {'name': 'countries', +{'name': 'test-package', + 'resources': [{'name': 'first-resource', 'type': 'table', - 'path': 'https://raw.githubusercontent.com/fdtester/test-repo-without-datapackage/master/data/countries.csv', - 'scheme': 'https', - 'format': 'csv', - 'mediatype': 'text/csv'}]} + 'path': 'table.xls', + 'scheme': 'file', + 'format': 'xls', + 'mediatype': 'application/vnd.ms-excel', + 'schema': {'fields': [{'name': 'id', 'type': 'number'}, + {'name': 'name', 'type': 'string'}]}}]} ``` In order to read first page of the search result and create a catalog, we use `per_page` and `page` params as follows: ```python tabs=Python -from pprint import pprint from frictionless import portals, Catalog control = portals.GithubControl(apikey=apikey, search="user:fdtester sort:updated-desc 'TestAction: Read' in:readme", per_page=1, page=1) @@ -316,8 +248,8 @@ catalog = Catalog(control=control) ``` Similary, we can also control the write function using params as follows: + ``` -from pprint import pprint from frictionless import portals, Package package = Package('datapackage.json') From 
a1f582d9ce377502fabd4dbc9ff765f8c8cec601 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:20:06 +0100 Subject: [PATCH 2/2] remove unused `portals` imports --- docs/portals/github.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/portals/github.md b/docs/portals/github.md index 9b2fe4f87b..25b0f9fb82 100644 --- a/docs/portals/github.md +++ b/docs/portals/github.md @@ -16,7 +16,7 @@ pip install 'frictionless[github]' --pre # for zsh shell You can read data from a github repository as follows: ```python tabs=Python -from frictionless import portals, Package +from frictionless import Package package = Package("https://github.com/fdtester/test-repo-with-datapackage-json") print(package) @@ -50,7 +50,7 @@ If the repo has a descriptor it simply returns the descriptor as shown above. Once you read the package from the repo, you can then easily access the resources and its data, for example: ```python tabs=Python -from frictionless import portals, Package +from frictionless import Package package = Package("https://github.com/fdtester/test-repo-with-datapackage-json") print(package.get_resource('first-resource').read_rows())