diff --git a/.github/workflows/notebooks_to_exclude.txt b/.github/workflows/notebooks_to_exclude.txt index 9e94a0f5a..6e8d03dde 100644 --- a/.github/workflows/notebooks_to_exclude.txt +++ b/.github/workflows/notebooks_to_exclude.txt @@ -12,6 +12,7 @@ # This only works against lakeFS cloud ./rbac-demo.ipynb +./data-collaboration.ipynb # These notebooks include a deliberate failure as part of # the data Audit step, and so will always throw an error (by design) diff --git a/00_notebooks/00_index.ipynb b/00_notebooks/00_index.ipynb index d3cad420a..5c1371398 100644 --- a/00_notebooks/00_index.ipynb +++ b/00_notebooks/00_index.ipynb @@ -41,6 +41,7 @@ "* [**RBAC demo**](./rbac-demo.ipynb) (Requires [lakefS Cloud](https://lakefs.cloud/register) or [lakefS Enterprise](https://docs.lakefs.io/understand/enterprise/) on prem)\n", "* [**Version Control of multi-buckets pipelines**](./version-control-of-multi-buckets-pipelines.ipynb) \n", "* [**Reprocess and Backfill Data with new ETL logic**](./reprocess-backfill-data.ipynb) \n", + "* [**Data Collaboration**](./data-collaboration.ipynb)\n", "* **lakeFS and Apache Iceberg**\n", " * [Basic example](./iceberg-lakefs-basic.ipynb)\n", " * [NYC Film Permits example](./iceberg-lakefs-nyc.ipynb)\n", diff --git a/00_notebooks/data-collaboration.ipynb b/00_notebooks/data-collaboration.ipynb new file mode 100644 index 000000000..2a277ba8b --- /dev/null +++ b/00_notebooks/data-collaboration.ipynb @@ -0,0 +1,1223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b2c8fa0-1702-411a-b11c-3190679bf31c", + "metadata": {}, + "source": [ + "\"lakeFS \n", + "\n", + "# lakeFS for Data Collaboration" + ] + }, + { + "cell_type": "markdown", + "id": "ee1f1244-ba66-4560-9e0c-7b87660aff67", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "* This Notebook requires connecting to lakeFS Cloud or lakeFS Enterprise\n", + "* Register to lakeFS Cloud: https://lakefs.cloud/register or Contact Us for a lakeFS Enterprise Key: https://lakefs.io/contact-sales/" + ] + }, + { + "cell_type": "markdown", + "id": "0b6d1654-82fd-477b-900d-8bfbddf7346b", + "metadata": {}, + "source": [ + "### The image below demonstrates the setup created in this sample notebook:\n", + "* A single lakeFS repository, with a protected Main branch that stores production data.\n", + "* Three groups:\n", + " * **Admins**: including a single user\n", + " * **Data Scientists**: including a single user\n", + " * **Developers**: including two users\n", + "* A **FSBlockMergingToMain** policy which prevents users from being able to promote data to production.\n", + "* Multiple branches created by individual users" + ] + }, + { + "cell_type": "markdown", + "id": "264ff04b-9137-47b5-b4c2-f80a11246800", + "metadata": {}, + "source": [ + "![data_collaboration](./images/data_colab.png)" + ] + }, + { + "cell_type": "markdown", + "id": "2d06bdc4-9a08-48b8-bc8f-c1d7a23fcc97", + "metadata": {}, + "source": [ + "## Config" + ] + }, + { + "cell_type": "markdown", + "id": "387f4469-9708-435d-bec9-c943fd87a0bf", + "metadata": {}, + "source": [ + "### Change your lakeFS credentials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfa46799-5f86-4104-9447-d91328edbff4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'\n", + "lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'\n", + "lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'" + ] + }, + { + "cell_type": "markdown", + "id": "13d1c562-9899-4343-b887-c5c018ea79b0", + "metadata": {}, + "source": [ + "### Storage Information\n", + "##### Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25b151c2-743d-43e1-9f2f-25482967c207", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "storageNamespace = 's3://example/' # e.g. \"s3://username-lakefs-cloud/\"" + ] + }, + { + "cell_type": "markdown", + "id": "debe8733-d1f8-4b54-961b-33fc40535fe1", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "**(you shouldn't need to change anything in this section, just run it)**" + ] + }, + { + "cell_type": "markdown", + "id": "fc6d8ce9-1147-4d4c-9882-9718fde1ddde", + "metadata": {}, + "source": [ + "## You can change lakeFS repo name (it can be an existing repo or provide another repo name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6e3c546-98b9-4e4e-a9c0-c6737758ce20", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "repo_name = \"data-collaboration-repo\"" + ] + }, + { + "cell_type": "markdown", + "id": "c093db8e-68d9-409f-bde7-73ee4ceca5a5", + "metadata": {}, + "source": [ + "## Versioning Information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b839850-5954-4631-8e7e-d0dee6d17dde", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mainBranch = \"main\"\n", + "fileName = \"lakefs_test.csv\"" + ] + }, + { + "cell_type": "markdown", + "id": "b7ae3a20-86f3-411f-83b5-b77a7ad3293f", + "metadata": {}, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63c6251a-9545-4292-8663-4785edd29b4b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%xmode Minimal\n", + "import lakefs\n", + "from lakefs.client import Client\n", + "import lakefs_sdk\n", + "from lakefs_sdk.client import LakeFSClient\n", + "from lakefs_sdk import models\n", + "from assets.lakefs_demo import print_commit, print_diff" + ] + }, + { + "cell_type": "markdown", + "id": "a7ede096-1b84-4b59-bdd1-2608ced6be51", + "metadata": {}, + "source": [ + "## Working with the lakeFS Python client API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08f45f04-5425-4224-8da2-88c5e506a6ba", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if not 'superUserClient' in locals():\n", + " configuration = lakefs_sdk.Configuration(\n", + " host=lakefsEndPoint,\n", + " username=lakefsAccessKey,\n", + " password=lakefsSecretKey,\n", + " )\n", + " superUserClient = LakeFSClient(configuration)" + ] + }, + { + "cell_type": "markdown", + "id": "522255a6-b7f0-47f6-ac70-106fd4ee2a65", + "metadata": {}, + "source": [ + "### Verify lakeFS credentials by getting lakeFS version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c92d094-5a07-42d7-9167-7a14c8bfff04", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Verifying lakeFS credentials…\")\n", + "try:\n", + " v=superUserClient.internal_api.get_lake_fs_version().version\n", + "except:\n", + " print(\"🛑 failed to get lakeFS version\")\n", + "else:\n", + " print(f\"…✅lakeFS credentials verified\\n\\nℹ️lakeFS version {v}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3dd29881-80eb-495c-919d-5680abb99075", + "metadata": {}, + "source": [ + "## Super User creates an \"admin1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d874bce-215a-4dc9-828b-c14a55871ff5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "superUserClient.auth_api.create_user(\n", + " user_creation=models.UserCreation(\n", + " id='admin1'))" + ] + }, + { + "cell_type": "markdown", + "id": "b8939939-1d58-4a0e-b879-d63890a3e502", + "metadata": {}, + "source": [ + "## Super User adds \"admin1\" user to an \"Admins\" group auto-created by lakeFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29d0afcd-3426-4f4d-a064-ebf6ed9e23f3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "groupName='Admins'\n", + "\n", + "has_more = True\n", + "next_offset = \"\"\n", + "while has_more:\n", + " groups = superUserClient.auth_api.list_groups(after=next_offset)\n", + " for r in groups.results:\n", + " if r.name == groupName:\n", + " groupId = r.id\n", + " break\n", + " has_more = groups.pagination.has_more\n", + " next_offset = groups.pagination.next_offset\n", + " \n", + "superUserClient.auth_api.add_group_membership(\n", + " group_id=groupId,\n", + " user_id='admin1')" + ] + }, + { + "cell_type": "markdown", + "id": "16ab9b40-00fd-4f2a-89ea-258dd7ea2e60", + "metadata": {}, + "source": [ + "## Create credentials for \"admin1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49ed0e63-bcff-4858-a461-3764f108d049", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "credentials = superUserClient.auth_api.create_credentials(user_id='admin1')\n", + "print(credentials)\n", + "admin1AccessKey = credentials.access_key_id\n", + "admin1SecretKey = credentials.secret_access_key" + ] + }, + { + "cell_type": "markdown", + "id": "62674ac5-0f82-4331-ac98-65c6110b4c0f", + "metadata": {}, + "source": [ + "## Create a lakeFS Python client for \"admin1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f7b8c5-92a4-4634-9d05-ae7d2186e069", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "configuration = lakefs_sdk.Configuration(\n", + " host=lakefsEndPoint,\n", + " username=admin1AccessKey,\n", + " password=admin1SecretKey,\n", + ")\n", + "admin1Client = LakeFSClient(configuration)\n", + "\n", + "admin1LakefsClient = Client(\n", + " host=lakefsEndPoint,\n", + " username=admin1AccessKey,\n", + " password=admin1SecretKey,\n", + ")\n", + "\n", + "\n", + "print(\"Created lakeFS client for admin1.\")" + ] + }, + { + "cell_type": "markdown", + "id": "24c3927a-4af6-43f6-816c-f09a00af45a0", + "metadata": {}, + "source": [ + "## Verify user for \"admin1Client\" Python client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3007f6a9-9b28-48a2-9028-810de0b4d5bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.get_current_user()" + ] + }, + { + "cell_type": "markdown", + "id": "564765a0-806e-49f5-87e1-be4f4a0707df", + "metadata": {}, + "source": [ + "# The Demo Starts Here\n", + "\n", + "##### \"admin1\" will do rest of the setup to define data collaboration rules specific to the organization" + ] + }, + { + "cell_type": "markdown", + "id": "f8cc0da5-37d6-4e59-958c-de60938042fa", + "metadata": {}, + "source": [ + "#### \"admin1\" creates \"developer1\" and \"developer2\" users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c2b90b8-24d1-4552-a2e0-ef2a62064ee5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.create_user(\n", + " user_creation=models.UserCreation(\n", + " id='developer1'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f20cc073-3a90-41e6-aa1c-d24683c3fa53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.create_user(\n", + " user_creation=models.UserCreation(\n", + " id='developer2'))" + ] + }, + { + "cell_type": "markdown", + "id": "61450afe-b6ea-46ce-857f-8d6ad15d662a", + "metadata": {}, + "source": [ + "## \"admin1\" adds \"developer1\" and \"developer2\" to lakeFS created \"Developers\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf9691fa-9f96-4c39-87cd-8f1952a4e136", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "groupNameDevelopers='Developers'\n", + "\n", + "has_more = True\n", + "next_offset = \"\"\n", + "while has_more:\n", + " groups = superUserClient.auth_api.list_groups(after=next_offset)\n", + " for r in groups.results:\n", + " if r.name == groupNameDevelopers:\n", + " groupIdDevelopers = r.id\n", + " break\n", + " has_more = groups.pagination.has_more\n", + " next_offset = groups.pagination.next_offset\n", + " \n", + "admin1Client.auth_api.add_group_membership(\n", + " group_id=groupIdDevelopers,\n", + " user_id='developer1')\n", + "\n", + "admin1Client.auth_api.add_group_membership(\n", + " group_id=groupIdDevelopers,\n", + " user_id='developer2')" + ] + }, + { + "cell_type": "markdown", + "id": "4b68c0ab-c0a0-4ec4-a3c3-9630e114280f", + "metadata": {}, + "source": [ + "## Create credentials for \"developer1\" and \"developer2\" users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46c20127-2cc6-4490-853d-3b40662674cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "credentials = admin1Client.auth_api.create_credentials(user_id='developer1')\n", + "print(credentials)\n", + "developer1AccessKey = credentials.access_key_id\n", + "developer1SecretKey = credentials.secret_access_key\n", + "\n", + "credentials = admin1Client.auth_api.create_credentials(user_id='developer2')\n", + "print(credentials)\n", + "developer2AccessKey = credentials.access_key_id\n", + "developer2SecretKey = credentials.secret_access_key" + ] + }, + { + "cell_type": "markdown", + "id": "c5100037-599d-4d7a-92b4-f33f62b5c142", + "metadata": {}, + "source": [ + "## Create lakeFS Python client for \"developer1\" and \"developer2\" users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c62ee7-4fdf-4203-8446-108ec4c251cb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "configuration = lakefs_sdk.Configuration(\n", + " host=lakefsEndPoint,\n", + " username=developer1AccessKey,\n", + " password=developer1SecretKey,\n", + ")\n", + "developer1Client = LakeFSClient(configuration)\n", + "\n", + "developer1LakeFSClient = Client(\n", + " host=lakefsEndPoint,\n", + " username=developer1AccessKey,\n", + " password=developer1SecretKey,\n", + ")\n", + " \n", + "print(\"Created lakeFS client for developer1.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3439202d-e240-4165-b4df-ffcd2fef91bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "configuration = lakefs_sdk.Configuration(\n", + " host=lakefsEndPoint,\n", + " username=developer2AccessKey,\n", + " password=developer2SecretKey,\n", + ")\n", + "developer2Client = LakeFSClient(configuration)\n", + "\n", + "developer1LakeFSClient = Client(\n", + " host=lakefsEndPoint,\n", + " username=developer2AccessKey,\n", + " password=developer2SecretKey,\n", + ")\n", + " \n", + "print(\"Created lakeFS client for developer2.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3035b4b9-e03c-4d4e-9b57-a2f6c5cc7b60", + "metadata": {}, + "source": [ + "## Verify user for \"developer1\" and \"developer2\" Python clients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a4026e5-a6bc-4a25-9f53-1c331fda0e25", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "developer1Client.auth_api.get_current_user()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8756c40c-c8fa-425b-821c-644e95337f32", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "developer2Client.auth_api.get_current_user()" + ] + }, + { + "cell_type": "markdown", + "id": "2b29ae2c-694e-4944-b7c1-e53e4e4e8811", + "metadata": {}, + "source": [ + "## \"admin1\" creates \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d356d8b5-839d-4040-8f33-8e37fd4f8471", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DataScientistsGroup = admin1Client.auth_api.create_group(\n", + " group_creation=models.GroupCreation(\n", + " id='DataScientists'))" + ] + }, + { + "cell_type": "markdown", + "id": "b458c1cd-3de5-4a66-a7ab-e6532a1a9f05", + "metadata": {}, + "source": [ + "## \"admin1\" attaches lakeFS created \"AuthManageOwnCredentials\" policy to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e538c54-76ab-490b-8383-fa4c65ae0593", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.attach_policy_to_group(\n", + " group_id=DataScientistsGroup.id,\n", + " policy_id='AuthManageOwnCredentials')" + ] + }, + { + "cell_type": "markdown", + "id": "5f786518-0346-4844-83d5-1923b666a313", + "metadata": {}, + "source": [ + "## \"admin1\" attaches lakeFS created \"FSReadWriteAll\" policy to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "822d7660-b4a9-489c-9812-6efb2bf9642a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.attach_policy_to_group(\n", + " group_id=DataScientistsGroup.id,\n", + " policy_id='FSReadWriteAll')" + ] + }, + { + "cell_type": "markdown", + "id": "abb700d6-18db-427a-ad5d-aa54ca6565ca", + "metadata": {}, + "source": [ + "## \"admin1\" attaches lakeFS created \"RepoManagementReadAll\" policy to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e53e3325-e641-4fa4-af61-e275898ba885", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.attach_policy_to_group(\n", + " group_id=DataScientistsGroup.id,\n", + " policy_id='RepoManagementReadAll')" + ] + }, + { + "cell_type": "markdown", + "id": "e76e419a-fd61-4fb5-8049-1e7f7a035728", + "metadata": {}, + "source": [ + "## \"admin1\" creates \"data_scientist1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "418de2f8-d477-4ced-bb7b-9e48af9cf093", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.create_user(\n", + " user_creation=models.UserCreation(\n", + " id='data_scientist1'))" + ] + }, + { + "cell_type": "markdown", + "id": "1b4b34a4-af08-4d97-a16a-23f29cb32525", + "metadata": {}, + "source": [ + "## \"admin1\" adds \"data_scientist1\" user to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23771e33-75f3-41a1-9abc-8c37d1aa09eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.add_group_membership(\n", + " group_id=DataScientistsGroup.id,\n", + " user_id='data_scientist1')" + ] + }, + { + "cell_type": "markdown", + "id": "9f6c505a-1be1-4af4-819a-2bf61228dac8", + "metadata": {}, + "source": [ + "## Create credentials for \"data_scientist1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1d02624-1ad5-425f-a1fb-bf5b6190998f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "credentials = admin1Client.auth_api.create_credentials(user_id='data_scientist1')\n", + "print(credentials)\n", + "data_scientist1AccessKey = credentials.access_key_id\n", + "data_scientist1SecretKey = credentials.secret_access_key" + ] + }, + { + "cell_type": "markdown", + "id": "fd5c634b-2d7e-40a7-8dc5-581326cf6bd0", + "metadata": {}, + "source": [ + "## Create lakeFS Python client for \"data_scientist1\" user" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ebd9e01-311f-49f8-bdbf-a1949a5bec07", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "configuration = lakefs_sdk.Configuration(\n", + " host=lakefsEndPoint,\n", + " username=data_scientist1AccessKey,\n", + " password=data_scientist1SecretKey,\n", + ")\n", + "data_scientist1Client = LakeFSClient(configuration)\n", + "\n", + "data_scientist1LakeFSClient = Client(\n", + " host=lakefsEndPoint,\n", + " username=data_scientist1AccessKey,\n", + " password=data_scientist1SecretKey,\n", + ")\n", + " \n", + "print(\"Created lakeFS client for data_scientist1.\")" + ] + }, + { + "cell_type": "markdown", + "id": "335da665-1080-4a97-ad97-a0295ac1bd7e", + "metadata": {}, + "source": [ + "## Verify user for \"data_scientist1Client\" Python client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a8e7a27-e66a-4fd1-9de1-b319c36a2446", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data_scientist1Client.auth_api.get_current_user()" + ] + }, + { + "cell_type": "markdown", + "id": "0f5375e2-5c6d-4442-bf2e-34e528167776", + "metadata": {}, + "source": [ + "## \"admin1\" creates \"FSBlockMergingToMain\" policy to prevent commits to the main branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3e1c16-15d2-4a70-8d0a-959c8e15d28a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.create_policy(\n", + " policy=models.Policy(\n", + " id='FSBlockMergingToMain',\n", + " statement=[models.Statement(\n", + " effect=\"deny\",\n", + " resource=\"arn:lakefs:fs:::repository/*/branch/main\",\n", + " action=[\"fs:CreateCommit\"],\n", + " ),\n", + " ]\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a53eb7f6-3777-4515-b9bb-d729b4770280", + "metadata": {}, + "source": [ + "## \"admin1\" attaches \"FSBlockMergingToMain\" policy to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4e2964c-968a-476e-a636-8c2097181380", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.auth_api.attach_policy_to_group(\n", + " group_id=DataScientistsGroup.id,\n", + " policy_id='FSBlockMergingToMain')" + ] + }, + { + "cell_type": "markdown", + "id": "2422d8af-c714-4d32-aa7b-eddfacbbddb4", + "metadata": {}, + "source": [ + "## If repo already exists on your lakeFS server then you can skip following step otherwise \"admin1\" creates a new repo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7211351b-9df6-4996-bc77-96f1ad407ef4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "repo = lakefs.Repository(repo_name, client=admin1LakefsClient).create(storage_namespace=f\"{storageNamespace}/{repo_name}\", default_branch=mainBranch, exist_ok=True)\n", + "branchMain = repo.branch(mainBranch)\n", + "print(repo)" + ] + }, + { + "cell_type": "markdown", + "id": "1403069c-4ffd-459b-a5a5-316cd9033b1a", + "metadata": {}, + "source": [ + "## \"admin1\" protects main branch so no one can write directly to main branch and any subsequent writes must be done via the merge of a branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f7713d4-5b21-4f99-807e-447e9f5845c5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "admin1Client.repositories_api.set_branch_protection_rules(\n", + " repository=repo_name,\n", + " branch_protection_rule=[models.BranchProtectionRule(\n", + " pattern=mainBranch)])" + ] + }, + { + "cell_type": "markdown", + "id": "70239947-a305-4f3a-a275-c9a528bafce6", + "metadata": {}, + "source": [ + "## \"admin1\" tries to upload a file to a \"shopping_transactions/raw\" folder on the main branch, but the upload fails because main branch is protected" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f68f53e-1a7c-4c96-acee-abf6d28463e5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "contentToUpload = open(f\"/data/{fileName}\", 'r').read()\n", + "branchMain.object('shopping_transactions/raw/'+fileName).upload(data=contentToUpload, mode='wb', pre_sign=False)" + ] + }, + { + "cell_type": "markdown", + "id": "c3572f3d-229c-4991-b14c-9518ec3bf5e9", + "metadata": {}, + "source": [ + "## \"admin1\" creates an \"ingest-shopping-transactions\" branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d48e4f3-7e26-49c2-afcf-2f34fc6ba3a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "branchIngestShoppingTransactions = repo.branch('ingest-shopping-transactions').create(source_reference=mainBranch)\n", + "print(\"ingest-shopping-transactions ref:\", branchIngestShoppingTransactions.get_commit().id)" + ] + }, + { + "cell_type": "markdown", + "id": "52ade461-fd60-4f5a-b8a1-84e475ec6e77", + "metadata": {}, + "source": [ + "## \"admin1\" uploads the file to \"shopping_transactions/raw\" folder in \"ingest-shopping_transactions\" branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c243e371-cefa-4188-b170-801121f62b2e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "contentToUpload = open(f\"/data/{fileName}\", 'r').read()\n", + "branchIngestShoppingTransactions.object('shopping_transactions/raw/'+fileName).upload(data=contentToUpload, mode='wb', pre_sign=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ed1629fe-2b8d-442b-bb42-9a9e1779875e", + "metadata": {}, + "source": [ + "## \"admin1\" commits changes and attaches some metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6340672-cd51-4d83-b3af-7372739051f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ref = branchIngestShoppingTransactions.commit(message='Ingested raw shopping transactions data!', metadata={'using': 'python_sdk'})\n", + "print_commit(ref.get_commit())" + ] + }, + { + "cell_type": "markdown", + "id": "ef4d38c1-7b43-4082-bee5-7fe8639a4467", + "metadata": {}, + "source": [ + "## \"admin1\" merges \"ingest-shopping-transactions\" branch to main branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b00c062d-4a4d-4e84-9276-6e2a3c7ed791", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "res = branchIngestShoppingTransactions.merge_into(branchMain)\n", + "print(res)" + ] + }, + { + "cell_type": "markdown", + "id": "d45b72c0-e325-44b7-8577-a34b079fddb6", + "metadata": {}, + "source": [ + "## \"developer1\" works on changing a script that transforms raw shopping transactions data into datasets the user application consumes.\n", + "### \"developer1\" wants to test their change against real production data under shopping_transactions/raw. To do that, the create a branch from \"main\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e7e1d01-694f-46e3-aeba-678dd69da9aa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "branchTransformationsChange = repo.branch('transformations-change').create(source_reference=mainBranch)\n", + "print(\"transformations-change ref:\", branchTransformationsChange.get_commit().id)" + ] + }, + { + "cell_type": "markdown", + "id": "d7fe0a5b-5088-4bf5-a648-ae35087e9060", + "metadata": { + "tags": [] + }, + "source": [ + "## At the same time, \"developer2\" is ingesting new raw data into \"shopping_transactions/raw\"\n", + "### \"developer2\" creates an \"second-ingest-shopping-transactions\" branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8f3b89c-37fc-4eb9-aa94-2245637f730b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "branchSecondIngestion = repo.branch('ingest-shopping-transactions-2').create(source_reference=mainBranch)\n", + "print(\"ingest-shopping-transactions-2' ref:\", branchSecondIngestion.get_commit().id)" + ] + }, + { + "cell_type": "markdown", + "id": "5dfcf3f5-8493-47e0-a6a1-8ac3419403e8", + "metadata": {}, + "source": [ + "## \"developer2\" uploads additional data to \"shopping_transactions/raw\" folder in \"ingest-shopping-transactions-2\" branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a87c6bc4-997f-4869-9835-91be273b2f44", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "contentToUpload = open(f\"/data/{fileName}\", 'r').read()\n", + "branchSecondIngestion.object('shopping_transactions/raw/rawdata2.csv').upload(data=contentToUpload, mode='wb', pre_sign=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ee00bec3-bfe7-4222-8361-91051714e556", + "metadata": {}, + "source": [ + "## \"developer2\" commits with additional commit medata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a440567-c6e8-4f7d-8543-16ae89a8f1d2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ref = branchSecondIngestion.commit(message='Ingested raw shopping transactions data!', metadata={'using': 'python_sdk'})\n", + "print_commit(ref.get_commit())" + ] + }, + { + "cell_type": "markdown", + "id": "ebf8b6fb-9223-47dd-a09e-9185229e3180", + "metadata": { + "tags": [] + }, + "source": [ + "## \"developer2\" merges changes to \"ingest-shopping-transactions-2\" to main, and introduce new data to production" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f295f40d-27c9-4f59-86bf-a59418cadd39", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "res = branchSecondIngestion.merge_into(branchMain)\n", + "print(res)" + ] + }, + { + "cell_type": "markdown", + "id": "c14dac33-3ef7-4938-846c-57332564a76d", + "metadata": { + "tags": [] + }, + "source": [ + "## \"developer1\" branch still points to the production data version they created the branch from, and is not seeing the recent change made by \"developer2\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b003167-71e0-4288-acc4-5077be37791b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "diff = branchTransformationsChange.diff(other_ref=branchSecondIngestion)\n", + "print_diff(diff)" + ] + }, + { + "cell_type": "markdown", + "id": "29cf21fe-ccbb-4c7a-8b3e-8274f28af4b5", + "metadata": {}, + "source": [ + "## \"data_scientist1\" creates \"ds_branch\" branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ede96885-d05f-4b6b-8aec-8b212c3304f3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "branchDSBranch = lakefs.Repository(repo_name, client=data_scientist1LakeFSClient).branch('ds_branch').create(source_reference=mainBranch)\n", + "print(\"ds_branch ref:\", branchDSBranch.get_commit().id)" + ] + }, + { + "cell_type": "markdown", + "id": "6e461465-693a-4d05-94cd-2608c75ff675", + "metadata": {}, + "source": [ + "## \"data_scientist1\" uploads a new file to \"experiment1\" branch " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ee6e6f3-6dc5-4ff3-b915-b10ccddee652", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "contentToUpload = open('/data/lakefs_test_new.csv', 'r').read()\n", + "branchDSBranch.object('ds/lakefs_test_new.csv').upload(data=contentToUpload, mode='wb', pre_sign=False)" + ] + }, + { + "cell_type": "markdown", + "id": "c6e63e6f-d184-4d61-a28f-dd4c793397bc", + "metadata": {}, + "source": [ + "## \"data_scientist1\" commits changes and attaches some metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "694b9af0-738b-4f1c-9269-c295aae62343", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ref = branchDSBranch.commit(message='Added new data file!', metadata={'using': 'python_sdk'})\n", + "print_commit(ref.get_commit())" + ] + }, + { + "cell_type": "markdown", + "id": "a7c6f83c-2f23-4201-875d-be0d27e14a98", + "metadata": {}, + "source": [ + "## But \"data_scientist1\" can't merge \"ds_branch\" branch to main branch due to \"FSBlockMergingToMain\" policy attached to \"DataScientists\" group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59c5734b-ea8d-417b-8d73-225fbd31beb3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "branchDSBranch.merge_into(mainBranch)" + ] + }, + { + "cell_type": "markdown", + "id": "765fc9c6-c8f2-4b75-8e5e-f7af1dcd6e36", + "metadata": {}, + "source": [ + "## More Questions?\n", + "\n", + "###### Join the lakeFS Slack group - https://lakefs.io/slack" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/00_notebooks/images/data_colab.png b/00_notebooks/images/data_colab.png new file mode 100644 index 000000000..90d90dd1a Binary files /dev/null and b/00_notebooks/images/data_colab.png differ