} with the number of segments in the database whose
- state has been changed (that is, the segments were marked as unused) as the result
- of this API call.
-
- Reference
- ---------
- `DELETE /druid/coordinator/v1/datasources/{dataSourceName}`
- '''
- r = self.rest_client.delete(REQ_DATASOURCE, args=[ds_name])
- if if_exists and r.status_code == requests.codes.not_found:
- return
- check_error(r)
-
- def load_status_req(self, ds_name, params=None):
- return self.rest_client.get_json(REQ_DS_LOAD_STATUS, args=[ds_name], params=params)
-
- def load_status(self, ds_name):
- return self.load_status_req(ds_name, {
- 'forceMetadataRefresh': 'true',
- 'interval': '1970-01-01/2999-01-01'})
-
- def wait_until_ready(self, ds_name):
- while True:
- resp = self.load_status(ds_name)
- if dict_get(resp, ds_name) == 100.0:
- return
- time.sleep(0.5)
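
For context, a minimal usage sketch of the datasource client above; the quickstart Router endpoint and the `wikipedia` datasource name are illustrative assumptions:

```
import druidapi

# Assumptions: local quickstart Router; 'wikipedia' is a hypothetical datasource.
druid = druidapi.jupyter_client('http://localhost:8888')
druid.datasources.wait_until_ready('wikipedia')      # polls every 0.5 s until loaded
print(druid.datasources.load_status('wikipedia'))    # e.g. {'wikipedia': 100.0}
```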
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/display.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/display.py
deleted file mode 100644
index e51bff70ce45..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/display.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from druidapi import consts
-import time
-
-class DisplayClient:
- '''
- Abstract base class to display various kinds of results.
- '''
-
- def __init__(self, druid=None):
- # If the client is None, it must be backfilled by the caller.
- # This case occurs only when creating the DruidClient to avoid
-        # a circular dependency.
- self._druid = druid
-
- # Basic display operations
-
- def text(self, msg):
- raise NotImplementedError()
-
- def alert(self, msg):
- raise NotImplementedError()
-
- def error(self, msg):
- raise NotImplementedError()
-
- # Tabular formatting
-
- def new_table(self):
- raise NotImplementedError()
-
- def show_table(self, table):
- raise NotImplementedError()
-
- def data_table(self, rows, cols=None):
- '''
- Display a table of data with the optional column headings.
-
- Parameters
- ----------
-        rows: list[list]
- The data to display as a list of lists, where each inner list represents one
- row of data. Rows should be of the same width: ragged rows will display blank
- cells. Data can be of any scalar type and is formatted correctly for that type.
-
- cols: list[str]
- Optional list of column headings.
- '''
- table = self.new_table()
- table.rows(rows)
- table.headers(cols)
- self.show_table(table)
-
- def object_list(self, objects, cols=None):
- '''
- Display a list of objects represented as dictionaries with optional headings.
-
- Parameters
- ----------
- objects: list[dict]
- List of dictionaries: one dictionary for each row.
-
-        cols: dict, default = None
-            Optional mapping of object keys to column headings in the form `{'key': 'label'}`.
- '''
- table = self.new_table()
- table.from_object_list(objects, cols)
- self.show_table(table)
-
- def object(self, obj, labels=None):
- '''
- Display a single object represented as a dictionary with optional headings.
- The object is displayed in two columns: keys and values.
-
- Parameters
- ----------
-        obj: dict
-            The object to display: one key/value pair per row.
-
-        labels: list, default = None
-            A list of column headings in the form `['key', 'value']`. Default headings
-            are used if the labels are not provided.
- '''
- table = self.new_table()
- table.from_object(obj, labels)
- self.show_table(table)
-
- # SQL formatting
-
- def sql(self, sql):
- '''
- Run a query and display the result as a table.
-
- Parameters
- ----------
-        sql: str | SqlRequest
-            The query as either a string or a SqlRequest object.
- '''
- self._druid.sql.sql_query(sql).show(display=self)
-
- def table(self, table_name):
- '''
- Describe a table by returning the list of columns in the table.
-
- Parameters
- ----------
-        table_name: str
- The name of the table as either "table" or "schema.table".
- If the form is "table", then the 'druid' schema is assumed.
- '''
- self._druid.sql._schema_query(table_name).show(display=self)
-
- def function(self, table_name):
- '''
- Retrieve the list of parameters for a partial external table defined in
- the Druid catalog.
-
- Parameters
- ----------
-        table_name: str
- The name of the table as either "table" or "schema.table".
- If the form is "table", then the 'ext' schema is assumed.
- '''
- return self._druid.sql._function_args_query(table_name).show(display=self)
-
- def schemas(self):
- '''
- Display the list of schemas available in Druid.
- '''
-        self._druid.sql._schemas_query().show(display=self)
-
- def tables(self, schema=consts.DRUID_SCHEMA):
- self._druid.sql._tables_query(schema).show(display=self)
-
- def run_task(self, query):
- '''
- Run an MSQ task while displaying progress in the cell output.
- :param query: INSERT/REPLACE statement to run
- :return: None
- '''
- from tqdm import tqdm
-
- task = self._druid.sql.task(query)
- with tqdm(total=100.0) as pbar:
- previous_progress = 0.0
- while True:
-                reports = task.reports_no_wait()
-                payload = reports.get('multiStageQuery', {}).get('payload', {})
-                # Display the sort-progress metric, if available. Using .get()
-                # avoids a KeyError when 'sortProgress' is not yet present.
-                sort_progress = payload.get('counters', {}).get('0', {}).get('0', {}).get('sortProgress', {})
-                if 'progressDigest' in sort_progress:
-                    current_progress = sort_progress['progressDigest'] * 100.0
-                    pbar.update(current_progress - previous_progress)  # update() takes a relative value
-                    previous_progress = current_progress
-                # Present the task status, if available.
-                status = payload.get('status', {}).get('status')
-                if status:
-                    pbar.set_description(f'Loading data, status:[{status}]')
-                    # Stop when the task is done.
-                    if status in ('SUCCESS', 'FAILED'):
-                        break
-                else:
-                    pbar.set_description('Initializing...')
-                time.sleep(1)
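
The abstract methods above define the whole display contract; a concrete client only needs to supply the primitive hooks. A minimal plain-text sketch (the class name is hypothetical, and the table hooks are omitted for brevity):

```
from druidapi.display import DisplayClient

class PrintDisplayClient(DisplayClient):
    # Sketch only: new_table()/show_table() would also be needed
    # for the table-based methods above.
    def text(self, msg):
        print(msg)

    def alert(self, msg):
        print('ALERT:', msg)

    def error(self, msg):
        print('ERROR:', msg)
```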
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/druid.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/druid.py
deleted file mode 100644
index 79f130dcae68..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/druid.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from druidapi.rest import DruidRestClient
-from druidapi.status import StatusClient
-from druidapi.catalog import CatalogClient
-from druidapi.sql import QueryClient
-from druidapi.tasks import TaskClient
-from druidapi.datasource import DatasourceClient
-from druidapi.basic_auth import BasicAuthClient
-
-class DruidClient:
- '''
- Client for a Druid cluster. Functionality is split into a number of
- specialized "clients" that group many of Druid's REST API calls.
- '''
-
- def __init__(self, router_endpoint, display_client=None, auth=None):
- self.rest_client = DruidRestClient(router_endpoint, auth=auth)
- self.status_client = None
- self.catalog_client = None
- self.sql_client = None
- self.tasks_client = None
- self.datasource_client = None
- if display_client:
- self.display_client = display_client
- else:
- from druidapi.text_display import TextDisplayClient
- self.display_client = TextDisplayClient()
- self.display_client._druid = self
-
- @property
- def rest(self):
- '''
- Returns the low-level REST client. Useful for debugging and to access REST API
- calls not yet wrapped by the various function-specific clients.
-
- If you find you need to use this, consider creating a wrapper function in Python
- and contributing it to Druid via a pull request.
- '''
- return self.rest_client
-
- def trace(self, enable=True):
- '''
- Enable or disable tracing. When enabled, the Druid client prints the
- URL and payload for each REST API call. Useful for debugging, or if you want
- to learn what the code does so you can replicate it in your own client.
- '''
- self.rest_client.enable_trace(enable)
-
- @property
- def status(self) -> StatusClient:
- '''
- Returns the status client for the Router service.
- '''
- if not self.status_client:
- self.status_client = StatusClient(self.rest_client)
- return self.status_client
-
- def status_for(self, endpoint) -> StatusClient:
- '''
- Returns the status client for a Druid service.
-
- Parameters
- ----------
- endpoint: str
- The URL for a Druid service.
- '''
- return StatusClient(DruidRestClient(endpoint), True)
-
- @property
- def catalog(self) -> CatalogClient:
- '''
- Returns the catalog client to interact with the Druid catalog.
- '''
- if not self.catalog_client:
- self.catalog_client = CatalogClient(self.rest_client)
- return self.catalog_client
-
- @property
- def sql(self) -> QueryClient:
- '''
- Returns the SQL query client to submit interactive or MSQ queries.
- '''
- if not self.sql_client:
- self.sql_client = QueryClient(self)
- return self.sql_client
-
- @property
- def tasks(self) -> TaskClient:
- '''
- Returns the Overlord tasks client to submit and track tasks.
- '''
- if not self.tasks_client:
- self.tasks_client = TaskClient(self.rest_client)
- return self.tasks_client
-
- @property
- def datasources(self) -> DatasourceClient:
- '''
- Returns the Coordinator datasources client to manipulate datasources.
- Prefer to use the SQL client to query the INFORMATION_SCHEMA to obtain
- information about datasources.
- '''
- if not self.datasource_client:
- self.datasource_client = DatasourceClient(self.rest_client)
- return self.datasource_client
-
- def basic_security(self, authenticator, authorizer=None):
- '''
- Returns a client to work with a basic authorization authenticator/authorizer pair.
- This client assumes the typical case of one authenticator and one authorizer. If
- you have more than one, create multiple clients.
-
- The basic security API is not proxied through the Router: it must work directly with
- the Coordinator. Create an ad hoc Druid client for your Coordinator. Because you have
- basic security enabled, you must specify the admin user and password:
-
- ```
- coord = druidapi.jupyter_client('http://localhost:8081', auth=('admin', 'admin-pwd'))
- ac = coord.basic_security('yourAuthenticator', 'yourAuthorizer')
- ```
-
- Parameters
- ----------
- authenticator: str
- Authenticator name as set in the `druid.auth.authenticatorChain`
- runtime property.
-
- authorizer: str, default = same as authenticator
- Authorizer name as set in the `druid.auth.authorizers` runtime property.
- Defaults to the same name as the `authenticator` parameter for simple cases.
- '''
- return BasicAuthClient(self.rest_client, authenticator, authorizer)
-
- @property
- def display(self):
- return self.display_client
-
- def close(self):
- self.rest_client.close()
- self.rest_client = None
- self.catalog_client = None
- self.tasks_client = None
- self.datasource_client = None
- self.sql_client = None
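
A short sketch of the intended usage, assuming the local quickstart Router endpoint; `jupyter_client()` is the entry point shown in the `basic_security()` docstring above:

```
import druidapi

druid = druidapi.jupyter_client('http://localhost:8888')
druid.status.wait_until_ready()   # block until the Router reports healthy
print(druid.status.version)       # Druid version string
druid.close()
```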
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/error.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/error.py
deleted file mode 100644
index 8e1af52566e3..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/error.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-class ClientError(Exception):
- '''
- Indicates an error with usage of the Python API.
- '''
-
- def __init__(self, msg):
- self.message = msg
-
-class DruidError(Exception):
- '''
- Indicates that something went wrong on Druid, typically as the result of a
- request that this client sent.
- '''
-
- def __init__(self, msg):
- self.message = msg
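
A sketch of how callers are expected to distinguish the two exception types, assuming a client created as in `druid.py` above:

```
import druidapi
from druidapi.error import ClientError, DruidError

druid = druidapi.jupyter_client('http://localhost:8888')  # assumed endpoint
try:
    rows = druid.sql.sql('SELECT 1')
except ClientError as e:
    print('API usage problem:', e.message)
except DruidError as e:
    print('Druid-side failure:', e.message)
```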
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/html_display.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/html_display.py
deleted file mode 100644
index e63946993f9b..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/html_display.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from IPython.display import display, HTML
-from html import escape
-from druidapi.display import DisplayClient
-from druidapi.base_table import BaseTable
-
-STYLES = '''
-<style>
-  .druid table { border: 1px solid black; border-collapse: collapse; }
-  .druid th, .druid td { padding: 4px 1em; text-align: left; }
-  td.druid-right, th.druid-right { text-align: right; }
-  td.druid-center, th.druid-center { text-align: center; }
-  td.druid-left, th.druid-left { text-align: left; }
-  .druid-alert { font-weight: bold; }
-  .druid-error { color: red; }
-</style>
-'''
-
-def escape_for_html(s):
- # Annoying: IPython treats $ as the start of Latex, which is cool,
- # but not wanted here.
- return s.replace('$', '\\$')
-
-def html(s):
- display(HTML(s))
-
-initialized = False
-
-alignments = ['druid-left', 'druid-center', 'druid-right']
-
-def start_tag(tag, align):
- s = '<' + tag
- if align:
- s += ' class="{}"'.format(alignments[align])
- return s + '>'
-
-class HtmlTable(BaseTable):
-
- def __init__(self):
- BaseTable.__init__(self)
-
- def widths(self, widths):
- self._widths = widths
-
- def format(self) -> str:
- if not self._rows and not self._headers:
- return ''
- _, width = self.row_width(self._rows)
- headers = self.pad_headers(width)
- rows = self.pad_rows(self._rows, width)
-        s = '<table>\n'
-        s += self.gen_header(headers)
-        s += self.gen_rows(rows)
-        return s + '\n</table>'
-
- def gen_header(self, headers):
- if not headers:
- return ''
-        s = '<tr>'
-        for i in range(len(headers)):
-            s += start_tag('th', self.col_align(i)) + escape(headers[i]) + '</th>'
-        return s + '</tr>\n'
-
- def gen_rows(self, rows):
- html_rows = []
- for row in rows:
-            r = '<tr>'
-            for i in range(len(row)):
-                r += start_tag('td', self.col_align(i))
-                cell = row[i]
-                value = '' if cell is None else escape(str(cell))
-                r += value + '</td>'
-            html_rows.append(r + '</tr>')
-        return '\n'.join(html_rows)
-
- def col_align(self, col):
- if not self._align:
- return None
- if col >= len(self._align):
- return None
- return self._align[col]
-
-class HtmlDisplayClient(DisplayClient):
-
- def __init__(self):
- DisplayClient.__init__(self)
- global initialized
- if not initialized:
- display(HTML(STYLES))
- initialized = True
-
-    def text(self, msg):
-        html('<div class="druid">' + escape_for_html(msg) + '</div>')
-
-    def alert(self, msg):
-        html('<div class="druid-alert">' + escape_for_html(msg.replace('\n', '<br/>')) + '</div>')
-
-    def error(self, msg):
-        html('<div class="druid-error">ERROR: ' + escape_for_html(msg.replace('\n', '<br/>')) + '</div>')
-
- def new_table(self):
- return HtmlTable()
-
- def show_table(self, table):
- self.text(table.format())
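
A small sketch of the table renderer above; `headers()` and `rows()` are inherited from `BaseTable`, as used by `display.py`:

```
from druidapi.html_display import HtmlTable

table = HtmlTable()
table.headers(['name', 'value'])
table.rows([['a', 1], ['b', None]])   # None renders as an empty cell
print(table.format())                 # emits the <table> markup built above
```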
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/rest.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/rest.py
deleted file mode 100644
index 2f10681ed1ed..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/rest.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import requests
-from druidapi.util import dict_get
-from urllib.parse import quote
-from druidapi.error import ClientError
-
-def check_error(response):
- '''
-    Raises a `ClientError` or a Requests library `HTTPError` if the response code
-    is neither OK (200) nor Accepted (202).
-
-    Druid's REST API is inconsistent in how it reports errors. Some APIs return
-    an error as a JSON object. Others return a text message. Still others return
-    nothing at all. With the JSON format, the error sometimes appears in an
-    'errorMessage' field, other times only in a generic 'error' field.
-
-    This method attempts to parse these variations. If the error response JSON
-    matches one of the known error formats, raises a `ClientError` with the error
-    message. Otherwise, raises a Requests library `HTTPError` for a generic error.
-    If the response includes a JSON payload, it is attached as the `json` field
-    of the `HTTPError` object so that the client can perhaps decode it.
- '''
- code = response.status_code
- if code == requests.codes.ok or code == requests.codes.accepted:
- return
- json = None
- try:
- json = response.json()
- except Exception:
- # If we can't get the JSON, raise a Requests error
- response.raise_for_status()
-
- # Druid JSON payload. Try to make sense of the error
- msg = dict_get(json, 'errorMessage')
- if not msg:
- msg = dict_get(json, 'error')
- if msg:
- # We have an explanation from Druid. Raise a Client exception
- raise ClientError(msg)
-
- # Don't know what the Druid JSON is. Raise a Requests exception, but
- # add on the JSON in the hopes that the caller can make use of it.
- try:
- response.raise_for_status()
- except Exception as e:
- e.json = json
- raise e
-
-def build_url(endpoint, req, args=None) -> str:
- '''
- Returns the full URL for a REST call given the relative request API and
- optional parameters to fill placeholders within the request URL.
-
- Parameters
- ----------
- endpoint: str
- The base URL for the service.
-
- req: str
- Relative URL, with optional {} placeholders
-
- args: list
- Optional list of values to match {} placeholders in the URL.
- '''
- url = endpoint + req
- if args:
- quoted = [quote(arg) for arg in args]
- url = url.format(*quoted)
- return url
-
-class DruidRestClient:
- '''
- Wrapper around the basic Druid REST API operations using the
- requests Python package. Handles the grunt work of building up
- URLs, working with JSON, etc.
-
- The REST client accepts an endpoint that represents a Druid service, typically
- the Router. All requests are made to this service, which means using the service
- URL as the base. That is, if the service is http://localhost:8888, then
- a request for status is just '/status': the methods here build up the URL by
- concatenating the service endpoint with the request URL.
- '''
-
- def __init__(self, endpoint, auth=None):
- '''
- Creates a Druid rest client endpoint using the given endpoint URI and
- optional authentication.
-
- Parameters
- ----------
- endpoint: str
- The Druid router endpoint of the form `'server:port'`. Use
- `'localhost:8888'` for a Druid instance running locally.
-
- auth: str, default = None
- Optional authorization credentials in the format described
- by the Requests library. For Basic auth use
- `auth=('user', 'password')`
- '''
- self.endpoint = endpoint
- self.trace = False
- self.session = requests.Session()
- if auth:
- self.session.auth = auth
-
- def enable_trace(self, flag=True):
- self.trace = flag
-
- def build_url(self, req, args=None) -> str:
- '''
- Returns the full URL for a REST call given the relative request API and
- optional parameters to fill placeholders within the request URL.
-
- Parameters
- ----------
- req: str
- Relative URL, with optional {} placeholders
-
- args: list
- Optional list of values to match {} placeholders in the URL.
- '''
- return build_url(self.endpoint, req, args)
-
-    def get(self, req, args=None, params=None, require_ok=True) -> requests.Response:
- '''
- Generic GET request to this service.
-
- Parameters
- ----------
- req: str
- The request URL without host, port or query string.
- Example: `/status`
-
- args: [str], default = None
- Optional parameters to fill in to the URL.
- Example: `/customer/{}`
-
- params: dict, default = None
- Optional map of query variables to send in
- the URL. Query parameters are the name/value pairs
- that appear after the `?` marker.
-
- require_ok: bool, default = True
-            Whether to require an OK (200) response. If `True` and the request
-            returns a different response code, raises a `ClientError` or a
-            Requests library `HTTPError`.
-
- Returns
- -------
-        The Requests library `Response` object.
- '''
- url = self.build_url(req, args)
- if self.trace:
- print('GET:', url)
- r = self.session.get(url, params=params)
- if require_ok:
- check_error(r)
- return r
-
- def get_json(self, url_tail, args=None, params=None):
- '''
- Generic GET request which expects a JSON response.
- '''
- r = self.get(url_tail, args, params)
- return r.json()
-
- def post(self, req, body, args=None, headers=None, require_ok=True) -> requests.Response:
- '''
-        Issues a POST request for the given URL on this node, with the given
-        payload and optional headers.
- '''
- url = self.build_url(req, args)
- if self.trace:
- print('POST:', url)
- print('body:', body)
- r = self.session.post(url, data=body, headers=headers)
- if require_ok:
- check_error(r)
- return r
-
- def post_json(self, req, body, args=None, headers=None, params=None) -> requests.Response:
- '''
- Issues a POST request for the given URL on this node, with a JSON request. Returns
- the JSON response.
-
- Parameters
- ----------
- req: str
- URL relative to the service base URL.
-
- body: any
- JSON-encodable Python object to send in the request body.
-
- args: array[str], default = None
- Arguments to include in the relative URL to replace {} markers.
-
- headers: dict, default = None
- Additional HTTP header fields to send in the request.
-
- params: dict, default = None
-            Parameters to include in the URL as the `?name=value` query string.
-
- Returns
- -------
- The JSON response as a Python object.
-
- See
- ---
- `post_only_json()` for the form that returns the response object, not JSON.
- '''
- r = self.post_only_json(req, body, args, headers, params)
- check_error(r)
- return r.json()
-
-    def post_only_json(self, req, body, args=None, headers=None, params=None, require_ok=True) -> requests.Response:
- '''
- Issues a POST request for the given URL on this node, with a JSON request, returning
- the Requests library `Response` object.
-
- Parameters
- ----------
- req: str
- URL relative to the service base URL.
-
- body: any
- JSON-encodable Python object to send in the request body.
-
- args: array[str], default = None
- Arguments to include in the relative URL to replace {} markers.
-
- headers: dict, default = None
- Additional HTTP header fields to send in the request.
-
- params: dict, default = None
-            Parameters to include in the URL as the `?name=value` query string.
-
- Returns
- -------
-        The Requests library `Response` object.
-
- See
- ---
- `post_json()` for the form that returns the response JSON.
- '''
- url = self.build_url(req, args)
- if self.trace:
- print('POST:', url)
- print('body:', body)
- r = self.session.post(url, json=body, headers=headers, params=params)
- if require_ok:
- check_error(r)
- return r
-
- def delete(self, req, args=None, params=None, headers=None, require_ok=True):
- url = self.build_url(req, args)
- if self.trace:
- print('DELETE:', url)
- r = self.session.delete(url, params=params, headers=headers)
- if require_ok:
- check_error(r)
- return r
-
- def delete_json(self, req, args=None, params=None, headers=None):
- return self.delete(req, args=args, params=params, headers=headers).json()
-
- def close(self):
- '''
- Close the session. Use in scripts and tests when the system will otherwise complain
- about open sockets.
- '''
- self.session.close()
- self.session = None
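
A minimal sketch of using the REST client directly, assuming the quickstart Router endpoint:

```
from druidapi.rest import DruidRestClient

client = DruidRestClient('http://localhost:8888')
client.enable_trace()                 # print each URL before it is sent
status = client.get_json('/status')   # parsed JSON from GET /status
print(status.get('version'))
client.close()
```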
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/sql.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/sql.py
deleted file mode 100644
index 46bad764dae9..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/sql.py
+++ /dev/null
@@ -1,869 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time, requests
-from druidapi import consts
-from druidapi.util import dict_get, split_table_name
-from druidapi.error import DruidError, ClientError
-
-REQ_SQL = consts.ROUTER_BASE + '/sql'
-REQ_SQL_TASK = REQ_SQL + '/task'
-
-class SqlRequest:
-
- def __init__(self, query_client, sql):
- self.query_client = query_client
- self.sql = sql
- self.context = None
- self.params = None
- self.header = False
- self.format = consts.SQL_OBJECT
- self.headers = None
- self.types = None
- self.sql_types = None
-
- def with_format(self, result_format):
- self.format = result_format
- return self
-
- def with_headers(self, sql_types=False, druidTypes=False):
-        self.header = True
- self.types = druidTypes
- self.sql_types = sql_types
- return self
-
- def with_context(self, context):
- if not self.context:
- self.context = context
- else:
- self.context.update(context)
- return self
-
- def add_context(self, key, value):
- return self.with_context({key: value})
-
- def with_parameters(self, params):
- '''
- Set the array of parameters. Parameters must each be a map of 'type'/'value' pairs:
- {'type': the_type, 'value': the_value}. The type must be a valid SQL type
- (in upper case). See the consts module for a list.
- '''
- for param in params:
-            self.add_parameter(param)
- return self
-
- def add_parameter(self, value):
- '''
- Add one parameter value. Infers the type of the parameter from the Python type.
- '''
- if value is None:
- raise ClientError('Druid does not support null parameter values')
- data_type = None
- value_type = type(value)
- if value_type is str:
- data_type = consts.SQL_VARCHAR_TYPE
- elif value_type is int:
- data_type = consts.SQL_BIGINT_TYPE
- elif value_type is float:
- data_type = consts.SQL_DOUBLE_TYPE
- elif value_type is list:
- data_type = consts.SQL_ARRAY_TYPE
- else:
- raise ClientError('Unsupported value type')
- if not self.params:
- self.params = []
- self.params.append({'type': data_type, 'value': value})
-
- def response_header(self):
- self.header = True
- return self
-
- def request_headers(self, headers):
- self.headers = headers
- return self
-
- def to_common_format(self):
- self.header = False
- self.sql_types = False
- self.types = False
- self.format = consts.SQL_OBJECT
- return self
-
- def to_request(self):
- query_obj = {'query': self.sql}
- if self.context:
- query_obj['context'] = self.context
- if self.params:
- query_obj['parameters'] = self.params
- if self.header:
- query_obj['header'] = True
- if self.format:
- query_obj['resultFormat'] = self.format
- if self.sql_types is not None: # Note: boolean variable
- query_obj['sqlTypesHeader'] = self.sql_types
- if self.types is not None: # Note: boolean variable
- query_obj['typesHeader'] = self.types
- return query_obj
-
- def result_format(self):
- return self.format.lower()
-
- def run(self):
- return self.query_client.sql_query(self)
-
-def request_from_sql_query(query_client, sql_query):
- try:
- req = SqlRequest(query_client, sql_query['query'])
- except KeyError:
- raise ClientError('A SqlRequest dictionary must have \'query\' set')
- req.context = sql_query.get('context')
- req.params = sql_query.get('parameters')
- req.header = sql_query.get('header')
- req.format = sql_query.get('resultFormat')
- req.format = consts.SQL_OBJECT if req.format is None else req.format
- req.sql_types = sql_query.get('sqlTypesHeader')
- req.types = sql_query.get('typesHeader')
- return req
-
-def parse_rows(fmt, context, results):
- if fmt == consts.SQL_ARRAY_WITH_TRAILER:
- rows = results['results']
- elif fmt == consts.SQL_ARRAY:
- rows = results
- else:
- return results
- if not context.get('headers', False):
- return rows
- header_size = 1
- if context.get('sqlTypesHeader', False):
- header_size += 1
- if context.get('typesHeader', False):
- header_size += 1
- return rows[header_size:]
-
-def label_non_null_cols(results):
- if not results:
- return []
- is_null = {}
- for key in results[0].keys():
- is_null[key] = True
- for row in results:
- for key, value in row.items():
-            # A value is treated as null if it is falsy: a JSON null, an
-            # empty string, or a numeric 0.
-            is_null[key] = is_null[key] and not value
- return is_null
-
-def filter_null_cols(results):
- '''
- Filter columns from a Druid result set by removing all null-like
- columns. A column is considered null if all values for that column
- are null. A value is null if it is either a JSON null, an empty
- string, or a numeric 0. All rows are preserved, as is the order
- of the remaining columns.
- '''
- if not results:
- return results
- is_null = label_non_null_cols(results)
- revised = []
- for row in results:
- new_row = {}
- for key, value in row.items():
- if is_null[key]:
- continue
- new_row[key] = value
- revised.append(new_row)
- return revised
-
-def parse_object_schema(results):
- schema = []
- if len(results) == 0:
- return schema
- row = results[0]
- for k, v in row.items():
- druid_type = None
- sql_type = None
- if type(v) is str:
- druid_type = consts.DRUID_STRING_TYPE
- sql_type = consts.SQL_VARCHAR_TYPE
- elif type(v) is int or type(v) is float:
- druid_type = consts.DRUID_LONG_TYPE
- sql_type = consts.SQL_BIGINT_TYPE
- schema.append(ColumnSchema(k, sql_type, druid_type))
- return schema
-
-def parse_array_schema(context, results):
- schema = []
- if len(results) == 0:
- return schema
- has_headers = context.get(consts.HEADERS_KEY, False)
- if not has_headers:
- return schema
- has_sql_types = context.get(consts.SQL_TYPES_HEADERS_KEY, False)
- has_druid_types = context.get(consts.DRUID_TYPE_HEADERS_KEY, False)
- size = len(results[0])
- for i in range(size):
- druid_type = None
- if has_druid_types:
- druid_type = results[1][i]
- sql_type = None
- if has_sql_types:
- sql_type = results[2][i]
- schema.append(ColumnSchema(results[0][i], sql_type, druid_type))
- return schema
-
-def parse_schema(fmt, context, results):
- if fmt == consts.SQL_OBJECT:
- return parse_object_schema(results)
- elif fmt == consts.SQL_ARRAY or fmt == consts.SQL_ARRAY_WITH_TRAILER:
- return parse_array_schema(context, results)
- else:
- return []
-
-def is_response_ok(http_response):
- code = http_response.status_code
- return code == requests.codes.ok or code == requests.codes.accepted
-
-class ColumnSchema:
-
- def __init__(self, name, sql_type, druid_type):
- self.name = name
- self.sql_type = sql_type
- self.druid_type = druid_type
-
- def __str__(self):
- return '{{name={}, SQL type={}, Druid type={}}}'.format(self.name, self.sql_type, self.druid_type)
-
-class SqlQueryResult:
- '''
- Response from a classic request/response query.
- '''
-
- def __init__(self, request, response):
- self.http_response = response
- self._json = None
- self._rows = None
- self._schema = None
- self.request = request
- self._error = None
- self._id = None
- if not is_response_ok(response):
- try:
- self._error = response.json()
- except Exception:
- self._error = response.text
- if not self._error:
- self._error = 'Failed with HTTP status {}'.format(response.status_code)
- try:
- self._id = self.http_response.headers['X-Druid-SQL-Query-Id']
- except KeyError:
- self._error = 'Query returned no query ID'
-
- @property
- def _druid(self):
- return self.request.query_client.druid_client
-
- @property
- def result_format(self):
- return self.request.result_format()
-
- @property
- def ok(self):
- '''
- Reports if the query succeeded.
-
- The query rows and schema are available only if ok is True.
- '''
- return is_response_ok(self.http_response)
-
- @property
- def error(self):
- '''
- If the query fails, returns the error, if any provided by Druid.
- '''
- if self.ok:
- return None
- if self._error:
- return self._error
- if not self.http_response:
- return { 'error': 'unknown'}
- if is_response_ok(self.http_response):
- return None
- return {'error': 'HTTP {}'.format(self.http_response.status_code)}
-
- @property
- def error_message(self):
- if self.ok:
- return None
- err = self.error
- if not err:
- return 'unknown'
- if type(err) is str:
- return err
- msg = err.get('error')
- text = err.get('errorMessage')
- if not msg and not text:
- return 'unknown'
- if not msg:
- return text
- if not text:
- return msg
- return msg + ': ' + text
-
- @property
- def id(self):
- '''
- Returns the unique identifier for the query.
- '''
- return self._id
-
- @property
- def non_null(self):
- if not self.ok:
- return None
- if self.result_format != consts.SQL_OBJECT:
- return None
- return filter_null_cols(self.rows)
-
- @property
- def as_array(self):
- if self.result_format == consts.SQL_OBJECT:
- rows = []
- for obj in self.rows:
- rows.append([v for v in obj.values()])
- return rows
- else:
- return self.rows
-
- @property
- def json(self):
- if not self.ok:
- return None
- if not self._json:
- self._json = self.http_response.json()
- return self._json
-
- @property
- def rows(self):
- '''
- Returns the rows of data for the query.
-
- Druid supports many data formats. The method makes its best
- attempt to map the format into an array of rows of some sort.
- '''
- if not self._rows:
- json = self.json
- if not json:
- return self.http_response.text
- self._rows = parse_rows(self.result_format, self.request.context, json)
- return self._rows
-
- @property
- def schema(self):
- '''
- Returns the data schema as a list of ColumnSchema objects.
-
- Druid supports many data formats; not all of which provide
- schema information. This method makes a best effort to
- extract the schema from the query results.
- '''
- if not self._schema:
- self._schema = parse_schema(self.result_format, self.request.context, self.json)
- return self._schema
-
- def _display(self, display):
- return self._druid.display if not display else display
-
- def show(self, non_null=False, display=None):
- display = self._display(display)
- if not self.ok:
- display.error(self.error_message)
- return
- data = None
- if non_null:
- data = self.non_null
- if not data:
- data = self.as_array
- if not data:
- display.alert('Query returned no results')
- return
- display.data_table(data, [c.name for c in self.schema])
-
- def show_schema(self, display=None):
- display = self._display(display)
- if not self.ok:
- display.error(self.error_message)
- return
- data = []
- for c in self.schema:
- data.append([c.name, c.sql_type, c.druid_type])
- if not data:
- display.alert('Query returned no schema')
- return
- display.data_table(data, ['Name', 'SQL Type', 'Druid Type'])
-
-class QueryTaskResult:
- '''
- Response from an asynchronous MSQ query, which may be an ingestion or a retrieval
- query. Can monitor task progress and wait for the task to complete. For a SELECT query,
- obtains the rows from the task reports. There are no results for an ingestion query,
- just a success/failure status.
-
- Note that SELECT query support is preliminary. The result structure is subject to
- change. Use a version of the library that matches your version of Druid for best
- results with MSQ SELECT queries.
- '''
-
- def __init__(self, request, response):
- self._request = request
- self.http_response = response
- self._status = None
- self._results = None
- self._details = None
- self._schema = None
- self._rows = None
- self._reports = None
- self._schema = None
- self._results = None
- self._error = None
- self._id = None
- if not is_response_ok(response):
- self._state = consts.FAILED_STATE
- try:
- self._error = response.json()
- except Exception:
- self._error = response.text
- if not self._error:
- self._error = 'Failed with HTTP status {}'.format(response.status_code)
- return
-
- # Typical response:
- # {'taskId': '6f7b514a446d4edc9d26a24d4bd03ade_fd8e242b-7d93-431d-b65b-2a512116924c_bjdlojgj',
- # 'state': 'RUNNING'}
- self.response_obj = response.json()
- self._id = self.response_obj['taskId']
- self._state = self.response_obj['state']
-
- @property
- def ok(self):
- '''
- Reports if the query completed successfully or is still running.
- Use succeeded() to check if the task is done and successful.
- '''
- return not self._error
-
- @property
- def id(self):
- return self._id
-
- def _druid(self):
- return self._request.query_client.druid_client
-
- def _tasks(self):
- return self._druid().tasks
-
- @property
- def status(self):
- '''
- Polls Druid for an update on the query run status.
- '''
- self.check_valid()
- # Example:
- # {'task': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
- # 'status': {
- # 'id': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
- # 'groupId': 'talaria-sql-w000-b373b68d-2675-4035-b4d2-7a9228edead6',
- # 'type': 'talaria0', 'createdTime': '2022-04-28T23:19:50.331Z',
- # 'queueInsertionTime': '1970-01-01T00:00:00.000Z',
- # 'statusCode': 'RUNNING', 'status': 'RUNNING', 'runnerStatusCode': 'PENDING',
- # 'duration': -1, 'location': {'host': None, 'port': -1, 'tlsPort': -1},
- # 'dataSource': 'w000', 'errorMsg': None}}
- self._status = self._tasks().task_status(self._id)
- self._state = self._status['status']['status']
- if self._state == consts.FAILED_STATE:
- self._error = self._status['status']['errorMsg']
- return self._status
-
- @property
- def done(self):
- '''
- Reports whether the query is done. The query is done when the Overlord task
- that runs the query completes. A completed task is one with a status of either
- SUCCESS or FAILED.
- '''
- return self._state == consts.FAILED_STATE or self._state == consts.SUCCESS_STATE
-
- @property
- def succeeded(self):
- '''
- Reports if the query succeeded.
- '''
- return self._state == consts.SUCCESS_STATE
-
- @property
- def state(self):
- '''
- Reports the task state from the Overlord task.
-
- Updated after each call to status().
- '''
- return self._state
-
- @property
- def error(self):
- return self._error
-
- @property
- def error_message(self):
-        err = self.error
- if not err:
- return 'unknown'
- if type(err) is str:
- return err
- msg = dict_get(err, 'error')
- text = dict_get(err, 'errorMessage')
- if not msg and not text:
- return 'unknown'
- if text:
- text = text.replace('\\n', '\n')
- if not msg:
- return text
- if not text:
- return msg
- return msg + ': ' + text
-
- def join(self):
- '''
- Wait for the task to complete, if still running. Returns at task
- completion: success or failure.
-
- Returns True for success, False for failure.
- '''
- if not self.done:
- self.status
- while not self.done:
- time.sleep(0.5)
- self.status
- return self.succeeded
-
- def check_valid(self):
- if not self._id:
- raise ClientError('Operation is invalid on a failed query')
-
- def wait_until_done(self):
- '''
- Wait for the task to complete. Raises an error if the task fails.
- A caller can proceed to do something with the successful result
- once this method returns without raising an error.
- '''
- if not self.join():
-            raise DruidError('Query failed: ' + self.error_message)
-
- def wait(self):
- '''
- Wait for a SELECT query to finish running, then returns the rows from the query.
- '''
- self.wait_until_done()
- return self.rows
-
- @property
- def reports(self) -> dict:
- self.check_valid()
- if not self._reports:
- self.join()
- self._reports = self._tasks().task_reports(self._id)
- return self._reports
-
- def reports_no_wait(self) -> dict:
- return self._tasks().task_reports(self._id, require_ok=False)
-
- @property
- def results(self):
- if not self._results:
-            rpts = self.reports
- self._results = rpts['multiStageQuery']['payload']['results']
- return self._results
-
- @property
- def schema(self):
- if not self._schema:
- results = self.results
- sig = results['signature']
- sql_types = results['sqlTypeNames']
- size = len(sig)
- self._schema = []
- for i in range(size):
- self._schema.append(ColumnSchema(sig[i]['name'], sql_types[i], sig[i]['type']))
- return self._schema
-
- @property
- def rows(self):
- if not self._rows:
- results = self.results
- self._rows = results['results']
- return self._rows
-
- def _display(self, display):
- return self._druid().display if not display else display
-
- def show(self, non_null=False, display=None):
- display = self._display(display)
- if not self.done:
- display.alert('Task has not finished running')
- return
- if not self.succeeded:
- display.error(self.error_message)
- return
- data = self.rows
- if non_null:
- data = filter_null_cols(data)
- if not data:
- display.alert('Query returned no {}rows'.format("visible " if non_null else ''))
- return
- display.data_table(data, [c.name for c in self.schema])
-
-class QueryClient:
-
- def __init__(self, druid, rest_client=None):
- self.druid_client = druid
- self._rest_client = druid.rest_client if not rest_client else rest_client
-
- @property
- def rest_client(self):
- return self._rest_client
-
- def _prepare_query(self, request):
- if not request:
- raise ClientError('No query provided.')
- # If the request is a dictionary, assume it is already in SqlQuery form.
- query_obj = None
- if type(request) == dict:
- query_obj = request
- request = request_from_sql_query(self, request)
- elif type(request) == str:
- request = self.sql_request(request)
- if not request.sql:
- raise ClientError('No query provided.')
- if self.rest_client.trace:
- print(request.sql)
- if not query_obj:
- query_obj = request.to_request()
- return (request, query_obj)
-
- def sql_query(self, request) -> SqlQueryResult:
- '''
- Submits a SQL query with control over the context, parameters and other
- options. Returns a response with either a detailed error message, or
- the rows and query ID.
-
- Parameters
- ----------
- request: str | SqlRequest | dict
- If a string, then gives the SQL query to execute.
-
-            Can also be a `SqlRequest`, obtained from the `sql_request()` method,
-            with an optional query context, query parameters or other options.
-
- Can also be a dictionary that represents a `SqlQuery` object. The
- `SqlRequest` is a convenient wrapper to generate a `SqlQuery`.
-
- Note that some of the Druid SqlQuery options will return data in a format
- that this library cannot parse. In that case, obtain the raw payload from
- the response and avoid using the rows() and schema() methods.
-
- Returns
- -------
- A SqlQueryResult object that provides either the error message for a failed query,
-        or the results of a successful query. The object provides access to the schema and
- rows if data is requested in a supported format. The default request object sets the
- options to return data in the required format.
- '''
- request, query_obj = self._prepare_query(request)
- r = self.rest_client.post_only_json(REQ_SQL, query_obj, headers=request.headers)
- return SqlQueryResult(request, r)
-
- def sql(self, sql, *args) -> list:
- '''
- Run a SQL query and return the results. Typically used to receive data as part
-        of another operation, rather than to display results to the user.
-
- Parameters
- ----------
- sql: str
- The SQL statement with optional Python `{}` parameters.
-
- args: list[str], Default = None
- Array of values to insert into the parameters.
- '''
- if len(args) > 0:
- sql = sql.format(*args)
- resp = self.sql_query(sql)
- if resp.ok:
- return resp.rows
- raise ClientError(resp.error_message)
-
- def explain_sql(self, query):
- '''
- Runs an EXPLAIN PLAN FOR query for the given query.
-
- Returns
- -------
- An object with the plan JSON parsed into Python objects:
- plan: the query plan
- columns: column schema
- tables: dictionary of name/type pairs
- '''
- if not query:
- raise ClientError('No query provided.')
- results = self.sql('EXPLAIN PLAN FOR ' + query)
- return results[0]
-
- def sql_request(self, sql) -> SqlRequest:
- '''
- Creates a SqlRequest object for the given SQL query text.
- '''
- return SqlRequest(self, sql)
-
- def task(self, query) -> QueryTaskResult:
- '''
- Submits an MSQ query. Returns a QueryTaskResult to track the task.
-
- Parameters
- ----------
- query
- The query as either a string or a SqlRequest object.
- '''
- request, query_obj = self._prepare_query(query)
- r = self.rest_client.post_only_json(REQ_SQL_TASK, query_obj, headers=request.headers)
- return QueryTaskResult(request, r)
-
- def run_task(self, query):
- '''
-        Submits an MSQ query and waits for it to complete. Raises an error if the query fails.
-
- Parameters
- ----------
- query
- The query as either a string or a SqlRequest object.
- '''
- resp = self.task(query)
- if not resp.ok:
- raise ClientError(resp.error_message)
- resp.wait_until_done()
-
- def _tables_query(self, schema):
- return self.sql_query('''
- SELECT TABLE_NAME AS TableName
- FROM INFORMATION_SCHEMA.TABLES
- WHERE TABLE_SCHEMA = '{}'
- ORDER BY TABLE_NAME
- '''.format(schema))
-
- def tables(self, schema=consts.DRUID_SCHEMA):
- '''
- Returns a list of tables in the given schema.
-
- Parameters
- ----------
- schema
- The schema to query, `druid` by default.
- '''
- return self._tables_query(schema).rows
-
- def _schemas_query(self):
- return self.sql_query('''
- SELECT SCHEMA_NAME AS SchemaName
- FROM INFORMATION_SCHEMA.SCHEMATA
- ORDER BY SCHEMA_NAME
- ''')
-
- def schemas(self):
- return self._schemas_query().rows
-
- def _schema_query(self, table_name):
- parts = split_table_name(table_name, consts.DRUID_SCHEMA)
- return self.sql_query('''
- SELECT
- ORDINAL_POSITION AS "Position",
- COLUMN_NAME AS "Name",
- DATA_TYPE AS "Type"
- FROM INFORMATION_SCHEMA.COLUMNS
- WHERE TABLE_SCHEMA = '{}'
- AND TABLE_NAME = '{}'
- ORDER BY ORDINAL_POSITION
- '''.format(parts[0], parts[1]))
-
- def table_schema(self, table_name):
- '''
- Returns the schema of a table as an array of dictionaries of the
- form {"Position": "", "Name": "", "Type": ""}
-
- Parameters
- ----------
- table_name: str
- The name of the table as either "table" or "schema.table".
- If the form is "table", then the 'druid' schema is assumed.
- '''
- return self._schema_query(table_name).rows
-
- def _function_args_query(self, table_name):
- parts = split_table_name(table_name, consts.EXT_SCHEMA)
- return self.sql_query('''
- SELECT
- ORDINAL_POSITION AS "Position",
- PARAMETER_NAME AS "Parameter",
- DATA_TYPE AS "Type",
- IS_OPTIONAL AS "Optional"
- FROM INFORMATION_SCHEMA.PARAMETERS
- WHERE SCHEMA_NAME = '{}'
- AND FUNCTION_NAME = '{}'
- ORDER BY ORDINAL_POSITION
- '''.format(parts[0], parts[1]))
-
- def function_parameters(self, table_name):
- '''
-        Returns the list of parameters for a partial external table defined in
-        the Druid catalog. Returns the parameters as an array of objects in the
-        form {"Position": <n>, "Parameter": "<name>", "Type": "<type>",
-        "Optional": True|False}
-
- Parameters
- ----------
-        table_name: str
- The name of the table as either "table" or "schema.table".
- If the form is "table", then the 'ext' schema is assumed.
- '''
- return self._function_args_query(table_name).rows
-
- def wait_until_ready(self, table_name, verify_load_status=True):
- '''
- Waits for a datasource to be loaded in the cluster, and to become available to SQL.
-
- Parameters
- ----------
-        table_name: str
-            The name of a datasource in the 'druid' schema.
-
-        verify_load_status: bool, default = True
-            If True, waits until all published segments report as loaded before
-            running the test query. If False, skips the load-status check and
-            simply retries the test query until it succeeds.
- '''
- if verify_load_status:
- self.druid_client.datasources.wait_until_ready(table_name)
- while True:
- try:
-                self.sql('SELECT 1 FROM "{}" LIMIT 1'.format(table_name))
- return
- except Exception:
- time.sleep(0.5)
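
For context, a sketch of the two query paths above; the endpoint, the `wikipedia` datasource, and the `timeout` context key are illustrative assumptions:

```
import druidapi

druid = druidapi.jupyter_client('http://localhost:8888')  # assumed endpoint

# Interactive path: returns rows directly, or raises ClientError on failure.
rows = druid.sql.sql('SELECT COUNT(*) AS cnt FROM "{}"', 'wikipedia')

# Request-object path: attach a context option, run, then display the result.
req = druid.sql.sql_request('SELECT 1').add_context('timeout', 30000)
result = req.run()
if result.ok:
    result.show()
```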
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/status.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/status.py
deleted file mode 100644
index 89141d268565..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/status.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-
-STATUS_BASE = '/status'
-REQ_STATUS = STATUS_BASE
-REQ_HEALTH = STATUS_BASE + '/health'
-REQ_PROPERTIES = STATUS_BASE + '/properties'
-REQ_IN_CLUSTER = STATUS_BASE + '/selfDiscovered/status'
-
-ROUTER_BASE = '/druid/router/v1'
-REQ_BROKERS = ROUTER_BASE + '/brokers'
-
-class StatusClient:
- '''
- Client for status APIs. These APIs are available on all nodes.
- If used with the Router, they report the status of just the Router.
-    To check the status of another node, first create a REST client for that
-    node:
-
-        status_client = StatusClient(DruidRestClient('<service endpoint>'))
-
- You can find the service endpoints by querying the sys.servers table using SQL.
-
- See https://druid.apache.org/docs/latest/api-reference/api-reference.html#process-information
- '''
-
- def __init__(self, rest_client, owns_client=False):
- self.rest_client = rest_client
- self.owns_client = owns_client
-
- def close(self):
- if self.owns_client:
- self.rest_client.close()
- self.rest_client = None
-
- #-------- Common --------
-
- @property
- def status(self):
- '''
- Returns the Druid version, loaded extensions, memory used, total memory
- and other useful information about the Druid service.
-
- GET `/status`
- '''
- return self.rest_client.get_json(REQ_STATUS)
-
- @property
- def is_healthy(self) -> bool:
- '''
- Returns `True` if the node is healthy, `False` otherwise. Check service health
- before using other Druid API methods to ensure the server is ready.
-
- See also `wait_until_ready()`.
-
- GET `/status/health`
- '''
- try:
- return self.rest_client.get_json(REQ_HEALTH)
- except Exception:
- return False
-
- def wait_until_ready(self):
- '''
- Sleeps until the node reports itself as healthy. Will run forever if the node
- is down or never becomes healthy.
- '''
- while not self.is_healthy:
- time.sleep(0.5)
-
- @property
-    def properties(self) -> dict:
- '''
-        Returns the effective set of Java properties used by the service, including
-        system properties and properties from the `common.runtime.properties` and
-        `runtime.properties` files.
-
- GET `/status/properties`
- '''
- return self.rest_client.get_json(REQ_PROPERTIES)
-
- @property
- def in_cluster(self):
- '''
- Returns `True` if the node is visible within the cluster, `False` if not.
- That is, returns the value of the `{"selfDiscovered": true/false}`
- field in the response.
-
- GET `/status/selfDiscovered/status`
- '''
- try:
- result = self.rest_client.get_json(REQ_IN_CLUSTER)
- return result.get('selfDiscovered', False)
- except ConnectionError:
- return False
-
- @property
- def version(self):
- '''
- Returns the version of the Druid server. If the server is running in an IDE, the
- version will be empty.
- '''
- return self.status.get('version')
-
- @property
- def brokers(self):
- '''
- Returns the list of broker nodes known to this node. Must be called on the Router.
- '''
- return self.rest_client.get_json(REQ_BROKERS)
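
A brief sketch of the status client in use, assuming the quickstart Router endpoint:

```
import druidapi

druid = druidapi.jupyter_client('http://localhost:8888')
druid.status.wait_until_ready()   # sleeps in 0.5 s steps until healthy
print(druid.status.version)       # from GET /status
print(druid.status.in_cluster)    # from GET /status/selfDiscovered/status
```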
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/tasks.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/tasks.py
deleted file mode 100644
index b5652ba6aba1..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/tasks.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from druidapi.consts import OVERLORD_BASE
-import requests
-
-REQ_TASKS = OVERLORD_BASE + '/tasks'
-REQ_POST_TASK = OVERLORD_BASE + '/task'
-REQ_GET_TASK = REQ_POST_TASK + '/{}'
-REQ_TASK_STATUS = REQ_GET_TASK + '/status'
-REQ_TASK_REPORTS = REQ_GET_TASK + '/reports'
-REQ_END_TASK = REQ_GET_TASK + '/shutdown'
-REQ_END_DS_TASKS = OVERLORD_BASE + '/datasources/{}/shutdownAllTasks'
-
-class TaskClient:
- '''
- Client for Overlord task-related APIs.
-
- See https://druid.apache.org/docs/latest/api-reference/api-reference.html#tasks
- '''
-
- def __init__(self, rest_client):
- self.client = rest_client
-
- def tasks(self, state=None, table=None, task_type=None, max=None, created_time_interval=None):
- '''
- Retrieves the list of tasks.
-
- Parameters
- ----------
- state: str, default = None
- Filter list of tasks by task state. Valid options are "running",
- "complete", "waiting", and "pending". Constants are defined for
- each of these in the `consts` file.
-
- table: str, default = None
-            Return tasks only for one Druid table (datasource).
-
- created_time_interval: str, Default = None
- Return tasks created within the specified interval.
-
- max: int, default = None
- Maximum number of "complete" tasks to return. Only applies when state is set to "complete".
-
- task_type: str, default = None
- Filter tasks by task type.
-
- Reference
- ---------
- `GET /druid/indexer/v1/tasks`
- '''
- params = {}
- if state:
- params['state'] = state
- if table:
- params['datasource'] = table
- if task_type:
- params['type'] = task_type
- if max is not None:
- params['max'] = max
- if created_time_interval:
- params['createdTimeInterval'] = created_time_interval
- return self.client.get_json(REQ_TASKS, params=params)
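-
-    # Illustrative usage (a sketch; `task_client` is a TaskClient built on a
-    # REST client pointed at the Router, and 'wikipedia' is a hypothetical
-    # datasource name):
-    #
-    #   running = task_client.tasks(state='running', table='wikipedia')
-    #   for t in running:
-    #       print(t['id'])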
-
- def task(self, task_id) -> dict:
- '''
- Retrieves the "payload" of a task.
-
- Parameters
- ----------
- task_id: str
- The ID of the task to retrieve.
-
- Returns
- -------
- The task payload as a Python dictionary.
-
- Reference
- ---------
- `GET /druid/indexer/v1/task/{taskId}`
- '''
- return self.client.get_json(REQ_GET_TASK, args=[task_id])
-
- def task_status(self, task_id) -> dict:
- '''
- Retrieves the status of a task.
-
- Parameters
- ----------
- task_id: str
- The ID of the task to retrieve.
-
- Returns
- -------
- The task status as a Python dictionary. See the `consts` module for a list
- of status codes.
-
- Reference
- ---------
- `GET /druid/indexer/v1/task/{taskId}/status`
- '''
- return self.client.get_json(REQ_TASK_STATUS, args=[task_id])
-
- def task_reports(self, task_id, require_ok = True) -> dict:
- '''
- Retrieves the completion report for a completed task.
-
- Parameters
- ----------
- task_id: str
- The ID of the task to retrieve.
-
- Returns
- -------
- The task reports as a Python dictionary.
-
- Reference
- ---------
- `GET /druid/indexer/v1/task/{taskId}/reports`
- '''
- if require_ok:
- return self.client.get_json(REQ_TASK_REPORTS, args=[task_id])
- else:
- resp = self.client.get(REQ_TASK_REPORTS, args=[task_id], require_ok=require_ok)
- if resp.status_code == requests.codes.ok:
- try:
- result = resp.json()
- except Exception as ex:
- result = {"message":"Payload could not be converted to json.", "payload":f"{resp.content}", "exception":f"{ex}"}
- return result
- else:
- return {"message":f"Request return code:{resp.status_code}"}
-
- def submit_task(self, payload):
- '''
- Submits a task to the Overlord.
-
- Returns the `taskId` of the submitted task.
-
- Parameters
- ----------
- payload: object
- The task object represented as a Python dictionary.
-
- Returns
- -------
-        The REST response; its `task` field holds the ID of the submitted task.
-
- Reference
- ---------
- `POST /druid/indexer/v1/task`
- '''
- return self.client.post_json(REQ_POST_TASK, payload)
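-
-    # Illustrative usage (a sketch; `task_spec` stands for a valid ingestion
-    # task spec dictionary). The Overlord responds with JSON such as
-    # {'task': '<taskId>'}, which can be fed back into task_status():
-    #
-    #   resp = task_client.submit_task(task_spec)
-    #   status = task_client.task_status(resp['task'])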
-
- def shut_down_task(self, task_id):
- '''
- Shuts down a task.
-
- Parameters
- ----------
- task_id: str
- The ID of the task to shut down.
-
- Returns
- -------
- The REST response.
-
- Reference
- ---------
- `POST /druid/indexer/v1/task/{taskId}/shutdown`
- '''
- return self.client.post_json(REQ_END_TASK, args=[task_id])
-
- def shut_down_tasks_for(self, table):
- '''
- Shuts down all tasks for a table (datasource).
-
- Parameters
- ----------
- table: str
- The name of the table (datasource).
-
- Returns
- -------
- The REST response.
-
- Reference
- ---------
- `POST /druid/indexer/v1/datasources/{dataSource}/shutdownAllTasks`
- '''
- return self.client.post_json(REQ_END_DS_TASKS, args=[table])
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/text_display.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/text_display.py
deleted file mode 100644
index 45a9df2c6a45..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/text_display.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from druidapi.display import DisplayClient
-from druidapi.base_table import pad, BaseTable
-
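-# str.format alignment flags used by the table formatters below: '' keeps the
-# default alignment (left for strings), '^' centers, and '>' right-aligns.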
-alignments = ['', '^', '>']
-
-def simple_table(table_def):
- table = []
- if table_def.headers:
- table.append(' '.join(table_def.format_row(table_def.headers)))
- for row in table_def.rows:
- table.append(' '.join(table_def.format_row(row)))
- return table
-
-def border_table(table_def):
- fmt = ' | '.join(table_def.formats)
- table = []
- if table_def.headers:
- table.append(fmt.format(*table_def.headers))
- bar = ''
- for i in range(table_def.width):
- width = table_def.widths[i]
- if i > 0:
- bar += '+'
- if table_def.width == 1:
- pass
- elif i == 0:
- width += 1
- elif i == table_def.width - 1:
- width += 1
- else:
- width += 2
- bar += '-' * width
- table.append(bar)
- for row in table_def.rows:
- table.append(fmt.format(*row))
- return table
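-
-# As a rough illustration, a two-column border_table renders like:
-#
-#   name | value
-#   -----+------
-#   foo  | 1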
-
-class TableDef:
-
- def __init__(self):
- self.width = None
- self.headers = None
- self.align = None
- self.formats = None
- self.rows = None
- self.widths = None
-
- def find_widths(self):
- self.widths = [0 for i in range(self.width)]
- if self.headers:
- for i in range(len(self.headers)):
- self.widths[i] = len(self.headers[i])
- for row in self.rows:
- for i in range(len(row)):
- if row[i] is not None:
- self.widths[i] = max(self.widths[i], len(row[i]))
-
- def apply_widths(self, widths):
- if not widths:
- return
- for i in range(min(len(self.widths), len(widths))):
- if widths[i] is not None:
- self.widths[i] = widths[i]
-
- def define_row_formats(self):
- self.formats = []
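-        # Each column gets a str.format spec such as '{:>12.12}': an optional
-        # alignment flag, a field width, and a matching precision so that
-        # overlong values are truncated to the column width.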
- for i in range(self.width):
- f = '{{:{}{}.{}}}'.format(
- alignments[self.align[i]],
- self.widths[i], self.widths[i])
- self.formats.append(f)
-
- def format_header(self):
- if not self.headers:
- return None
- return self.format_row(self.headers)
-
- def format_row(self, data_row):
- row = []
- for i in range(self.width):
- value = data_row[i]
- if not value:
- row.append(' ' * self.widths[i])
- else:
- row.append(self.formats[i].format(value))
- return row
-
-class TextTable(BaseTable):
-
- def __init__(self):
- BaseTable.__init__(self)
- self.formatter = simple_table
- self._widths = None
-
- def with_border(self):
- self.formatter = border_table
-
- def widths(self, widths):
- self._widths = widths
-
- def compute_def(self, rows):
- table_def = TableDef()
- min_width, max_width = self.row_width(rows)
- table_def.width = max_width
- table_def.headers = self.pad_headers(max_width)
- table_def.rows = self.format_rows(rows, min_width, max_width)
- table_def.find_widths()
- table_def.apply_widths(self._widths)
- table_def.align = self.find_alignments(rows, max_width)
- table_def.define_row_formats()
- return table_def
-
- def format(self):
- if not self._rows:
- self._rows = []
- table_rows = self.formatter(self.compute_def(self._rows))
- return '\n'.join(table_rows)
-
- def format_rows(self, rows, min_width, max_width):
- if not self._col_fmt:
- return self.default_row_format(rows, min_width, max_width)
- else:
- return self.apply_row_formats(rows, max_width)
-
- def default_row_format(self, rows, min_width, max_width):
- new_rows = []
- if min_width <= max_width:
- rows = self.pad_rows(rows, max_width)
- for row in rows:
- new_row = ['' if v is None else str(v) for v in row]
- new_rows.append(pad(new_row, max_width, None))
- return new_rows
-
- def apply_row_formats(self, rows, max_width):
- new_rows = []
- fmts = self._col_fmt
- if len(fmts) < max_width:
- fmts = fmts.copy()
- for i in range(len(fmts), max_width):
- fmts.append(lambda v: v)
- for row in rows:
- new_row = []
- for i in range(len(row)):
- new_row.append(fmts[i](row[i]))
- new_rows.append(pad(new_row, max_width, None))
- return new_rows
-
-class TextDisplayClient(DisplayClient):
-
- def __init__(self):
- DisplayClient.__init__(self)
-
- def text(self, msg):
- print(msg)
-
- def alert(self, msg):
- print("Alert:", msg)
-
- def error(self, msg):
- print("ERROR:", msg)
-
- def new_table(self):
- return TextTable()
-
- def show_table(self, table):
- print(table.format())
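-
-# Illustrative usage (a sketch, not part of the module):
-#
-#   client = TextDisplayClient()
-#   client.data_table([['foo', 1], ['bar', 2]], cols=['name', 'value'])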
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/util.py b/examples/quickstart/jupyter-notebooks/druidapi/druidapi/util.py
deleted file mode 100644
index e2d93dad2be1..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/druidapi/util.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from druidapi.error import ClientError
-
-def dict_get(dict, key, default=None):
- '''
- Returns the value of key in the given dict, or the default value if
- the key is not found.
- '''
- if not dict:
- return default
- return dict.get(key, default)
-
-def split_table_name(table_name, default_schema):
- if not table_name:
- raise ClientError('Table name is required')
- parts = table_name.split('.')
- if len(parts) > 2:
- raise ClientError('Druid supports one or two-part table names')
- if len(parts) == 2:
- return parts
- return [default_schema, parts[0]]
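-
-# For example (illustrative):
-#
-#   split_table_name('wikipedia', 'druid')    returns ['druid', 'wikipedia']
-#   split_table_name('sys.servers', 'druid')  returns ['sys', 'servers']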
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt b/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt
deleted file mode 100644
index b67ab75d9f68..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/requirements.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ------------------------------------------------------------------------
-
-# Requirements for the druidapi library.
-# See: https://pip.pypa.io/en/stable/reference/requirements-file-format/
-#
-# Requirements are both few and simple at present.
-
-requests
diff --git a/examples/quickstart/jupyter-notebooks/druidapi/setup.py b/examples/quickstart/jupyter-notebooks/druidapi/setup.py
deleted file mode 100644
index 29841b2b9076..000000000000
--- a/examples/quickstart/jupyter-notebooks/druidapi/setup.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from setuptools import setup, find_packages
-
-setup(
- name='druidapi',
- version='0.1.0',
- description='Python API client for Apache Druid',
- url='https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/druidapi',
- author='Apache Druid project',
- author_email='dev@druid.apache.org',
- license='Apache License 2.0',
- packages=find_packages(),
- install_requires=['requests'],
-
- classifiers=[
- 'Development Status :: 3 - Alpha',
- 'Intended Audience :: Developers',
- 'Intended Audience :: End Users/Desktop',
- 'License :: OSI Approved :: Apache Software License',
- 'Operating System :: OS Independent',
- 'Programming Language :: Python :: 3',
- ],
-)
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/00-START-HERE.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/00-START-HERE.ipynb
deleted file mode 100644
index 6b47ef1f49d1..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/00-START-HERE.ipynb
+++ /dev/null
@@ -1,163 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "e415d732",
- "metadata": {},
- "source": [
- "# Jupyter Notebook tutorials for Druid\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "You can try out the Druid APIs using the Jupyter Notebook-based tutorials. These\n",
- "tutorials provide snippets of Python code that you can use to run calls against\n",
- "the Druid API to complete the tutorial."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "60015702",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "Before starting the Jupyter-based tutorials, make sure you meet the requirements listed in this section.\n",
- "The simplest way to get started is to use Docker. In this case, you only need to set up Docker Desktop.\n",
- "For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- "\n",
- "Otherwise, you need the following:\n",
- "- An available Druid instance. You can use the local quickstart configuration\n",
- " described in [Quickstart](https://druid.apache.org/docs/latest/tutorials/index.html).\n",
- " The tutorials assume that you are using the quickstart, so no authentication or authorization\n",
- " is expected unless explicitly mentioned.\n",
- "- Python 3.7 or later\n",
- "- JupyterLab (recommended) or Jupyter Notebook running on a non-default port. By default, Druid\n",
- " and Jupyter both try to use port `8888`, so start Jupyter on a different port.\n",
- "- The `requests` Python package\n",
- "- The `druidapi` Python package\n",
- "\n",
- "For setup instructions, see [Tutorial setup without using Docker](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html#tutorial-setup-without-using-docker).\n",
- "Individual tutorials may require additional Python packages, such as for visualization or streaming ingestion.\n",
- "\n",
- "## Simple Druid API\n",
- "\n",
- "The `druidapi` Python package is a REST API for Druid.\n",
- "One of the notebooks shows how to use the Druid REST API. The others focus on other\n",
- "topics and use a simple set of Python wrappers around the underlying REST API. The\n",
- "wrappers reside in the `druidapi` package within this directory. While the package\n",
- "can be used in any Python program, the key purpose, at present, is to support these\n",
- "notebooks. See the [Introduction to the Druid Python API](../01-introduction/01-druidapi-package-intro.ipynb)\n",
- "for an overview of the Python API."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d9e18342",
- "metadata": {},
- "source": [
- "## Tutorials\n",
- "\n",
- "If you run the [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html), all the notebooks are included.\n",
- "\n",
- "Otherwise, you can find the notebooks in the [apache/druid repo](\n",
- "https://github.com/apache/druid/tree/master/examples/quickstart/jupyter-notebooks/).\n",
- "You can either clone the repo or download the notebooks you want individually.\n",
- "\n",
- "The links that follow are the raw GitHub URLs, so you can use them to download the\n",
- "notebook directly, such as with `wget`, or manually through your web browser. Note\n",
- "that if you save the file from your web browser, make sure to remove the `.txt` extension.\n",
- "\n",
- "- [Introduction to the Druid REST API](../04-api/00-getting-started.ipynb) walks you through some of the\n",
- " basics related to the Druid REST API and several endpoints.\n",
- "- [Introduction to the Druid Python API](../01-introduction/01-druidapi-package-intro.ipynb) walks you through some of the\n",
- " basics related to the Druid API using the Python wrapper API.\n",
- "- [Learn the basics of Druid SQL](../03-query/00-using-sql-with-druidapi.ipynb) introduces you to the unique aspects of Druid SQL with the primary focus on the SELECT statement.\n",
- "- [Learn to use the Data Generator](./02-datagen-intro.ipynb) gets you started with streaming and batch file data generation for testing of any data schema.\n",
- "- [Ingest and query data from Apache Kafka](../02-ingestion/01-streaming-from-kafka.ipynb) walks you through ingesting an event stream from Kafka."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1a4b986a",
- "metadata": {},
- "source": [
- "## Contributing\n",
- "\n",
- "If you build a Jupyter tutorial, you need to do a few things to add it to the docs\n",
- "in addition to saving the notebook in this directory. The process requires two PRs to the repo.\n",
- "\n",
- "For the first PR, do the following:\n",
- "\n",
- "1. Depending on the goal of the notebook, you may want to clear the outputs from your notebook\n",
- " before you make the PR. You can use the following command:\n",
- "\n",
- " ```bash\n",
- " jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace ./path/to/notebook/notebookName.ipynb\n",
- " ```\n",
- " \n",
- " This can also be done in Jupyter Notebook itself: `Kernel` → `Restart & Clear Output`\n",
- "\n",
- "2. Create the PR as you normally would. Make sure to note that this PR is the one that\n",
- " contains only the Jupyter notebook and that there will be a subsequent PR that updates\n",
- " related pages.\n",
- "\n",
- "3. After this first PR is merged, grab the \"raw\" URL for the file from GitHub. For example,\n",
- " navigate to the file in the GitHub web UI and select **Raw**. Use the URL for this in the\n",
- " second PR as the download link.\n",
- "\n",
- "For the second PR, do the following:\n",
- "\n",
- "1. Update the list of [Tutorials](#tutorials) on this page and in the\n",
- " [Jupyter tutorial index page](../../../docs/tutorials/tutorial-jupyter-index.md#tutorials)\n",
- " in the `docs/tutorials` directory.\n",
- "\n",
- "2. Update `tutorial-jupyter-index.md` and provide the URL to the raw version of the file\n",
- " that becomes available after the first PR is merged.\n",
- "\n",
- "Note that you can skip the second PR, if you just copy the prefix link from one of the\n",
- "existing notebook links when doing your first PR."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/01-druidapi-package-intro.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/01-druidapi-package-intro.ipynb
deleted file mode 100644
index 88b79fd8d927..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/01-druidapi-package-intro.ipynb
+++ /dev/null
@@ -1,779 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ce2efaaa",
- "metadata": {},
- "source": [
- "# Learn the Druid Python API\n",
- "\n",
- "\n",
- "\n",
- "This notebook provides a quick introduction to the Python wrapper around the [Druid REST API](api-tutorial.ipynb). This notebook assumes you are familiar with the basics of the REST API, and the [set of operations which Druid provides](https://druid.apache.org/docs/latest/api-reference/api-reference.html). This tutorial focuses on using Python to access those APIs rather than explaining the APIs themselves. The APIs themselves are covered in other notebooks that use the Python API.\n",
- "\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
- "\n",
- "The Druid Python API is primarily intended to help with these notebook tutorials. It can also be used in your own ad-hoc notebooks, or in a regular Python program.\n",
- "\n",
- "The Druid Python API is a work in progress. The Druid team adds API wrappers as needed for the notebook tutorials. If you find you need additional wrappers, please feel free to add them, and post a PR to Apache Druid with your additions.\n",
- "\n",
- "The API provides two levels of functions. Most are simple wrappers around Druid's REST APIs. Others add additional code to make the API easier to use. The SQL query interface is a prime example: extra code translates a simple SQL query into Druid's `SQLQuery` object and interprets the results into a form that can be displayed in a notebook.\n",
- "\n",
- "This notebook contains sample output to allow it to function as a reference. To run it yourself, start by using the `Kernel` → `Restart & Clear Output` menu command to clear the sample output.\n",
- "\n",
- "Start by importing the `druidapi` package from the same folder as this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6d90ca5d",
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "fb68a838",
- "metadata": {},
- "source": [
- "Next, connect to your cluster by providing the router endpoint. The code assumes the cluster is on your local machine, using the default port. Go ahead and change this if your setup is different.\n",
- "\n",
- "The API uses the router to forward messages to each of Druid's services so that you don't have to keep track of the host and port for each service.\n",
- "\n",
- "In the Docker Compose tutorial environment, the Router service runs at \"http://router:8888\".\n",
- "If you are not using the Docker Compose environment, edit the URL for the `jupyter_client`.\n",
- "For example, to `http://localhost:8888/`.\n",
- "\n",
- "The `jupyter_client()` method waits for the cluster to be ready and sets up the client to display tables and messages as HTML. To use this code without waiting and without HTML formatting, use the `client()` method instead."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ae601081",
- "metadata": {},
- "outputs": [],
- "source": [
- "druid = druidapi.jupyter_client('http://router:8888')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8b4e774b",
- "metadata": {},
- "source": [
- "## Status Client\n",
- "\n",
- "The SDK groups Druid REST API calls into categories, with a client for each. Start with the status client."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ff16fc3b",
- "metadata": {},
- "outputs": [],
- "source": [
- "status_client = druid.status"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "be992774",
- "metadata": {},
- "source": [
- "Use the Python `help()` function to learn what methods are avaialble."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "03f26417",
- "metadata": {},
- "outputs": [],
- "source": [
- "help(status_client)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "e803c9fe",
- "metadata": {},
- "source": [
- "Check the version of your cluster. Some of these notebooks illustrate newer features available only on specific versions of Druid."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2faa0d81",
- "metadata": {},
- "outputs": [],
- "source": [
- "status_client.version"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "d78a6c35",
- "metadata": {},
- "source": [
- "You can also check which extensions are loaded in your cluster. Some notebooks require specific extensions to be available."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1001f412",
- "metadata": {},
- "outputs": [],
- "source": [
- "status_client.properties['druid.extensions.loadList']"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "012b2e61",
- "metadata": {},
- "source": [
- "## Display Client\n",
- "\n",
- "The display client performs Druid operations, then formats the results for display in a notebook. Running SQL queries in a notebook is easy with the display client.\n",
- "\n",
- "When run outside a notebook, the display client formats results as text. The display client is the most convenient way to work with Druid in a notebook. Most operations also have a form that returns results as Python objects rather than displaying them. Use these methods if you write code to work with the results. Here the goal is just to interact with Druid."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f867f1f0",
- "metadata": {},
- "outputs": [],
- "source": [
- "display = druid.display"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "d051bc5e",
- "metadata": {},
- "source": [
- "Start by getting a list of schemas."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "dd8387e0",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.schemas()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "b8261ab0",
- "metadata": {},
- "source": [
- "Then, retreive the tables (or datasources) within any schema."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "64dcb46a",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.tables('INFORMATION_SCHEMA')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ff311595",
- "metadata": {},
- "source": [
- "The above shows the list of datasources by default. You'll get an empty result if you have no datasources yet."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "616770ce",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.tables()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7392e484",
- "metadata": {},
- "source": [
- "You can easily run a query and show the results:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2c649eef",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT TABLE_NAME\n",
- "FROM INFORMATION_SCHEMA.TABLES\n",
- "WHERE TABLE_SCHEMA = 'INFORMATION_SCHEMA'\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c6c4e1d4",
- "metadata": {},
- "source": [
- "The query above showed the same results as `tables()`. That is not surprising: `tables()` just runs this query for you."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f414d145",
- "metadata": {},
- "source": [
- "## SQL Client\n",
- "\n",
- "While the display client is handy for simple queries, sometimes you need more control, or want to work with the data returned from a query. For this you use the SQL client."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9951e976",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_client = druid.sql"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7b944084",
- "metadata": {},
- "source": [
- "The SQL client allows you create a SQL request object that enables passing context parameters and query parameters. Druid will work out the query parameter type based on the Python type. Use the display client to show the query results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "dd559827",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT TABLE_NAME\n",
- "FROM INFORMATION_SCHEMA.TABLES\n",
- "WHERE TABLE_SCHEMA = ?\n",
- "'''\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_parameter('INFORMATION_SCHEMA')\n",
- "req.add_context(\"someParameter\", \"someValue\")\n",
- "display.sql(req)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "937dc6b1",
- "metadata": {},
- "source": [
- "The request has other features for advanced use cases: see the code for details. The query API actually returns a sql response object. Use this if you want to get the values directly, work with the schema, etc."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fd7a1827",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT TABLE_NAME\n",
- "FROM INFORMATION_SCHEMA.TABLES\n",
- "WHERE TABLE_SCHEMA = 'INFORMATION_SCHEMA'\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2fe6a749",
- "metadata": {},
- "outputs": [],
- "source": [
- "col1 = resp.schema[0]\n",
- "print(col1.name, col1.sql_type, col1.druid_type)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "41d27bb1",
- "metadata": {},
- "outputs": [],
- "source": [
- "resp.rows"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "481af1f2",
- "metadata": {},
- "source": [
- "The `show()` method uses this information for format an HTML table to present the results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8dba807b",
- "metadata": {},
- "outputs": [],
- "source": [
- "resp.show()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "99f8db7b",
- "metadata": {},
- "source": [
- "The display and SQL clients are intened for exploratory queries. The [pydruid](https://pythonhosted.org/pydruid/) library provides a robust way to run native queries, to run SQL queries, and to convert the results to various formats."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9e3be017",
- "metadata": {},
- "source": [
- "## MSQ Ingestion\n",
- "\n",
- "The SQL client also performs MSQ-based ingestion using `INSERT` or `REPLACE` statements. Use the extension check above to ensure that `druid-multi-stage-query` is loaded in Druid 26. (Later versions may have MSQ built in.)\n",
- "\n",
- "An MSQ query is run using a different API: `task()`. This API returns a response object that describes the Overlord task which runs the MSQ query. For tutorials, data is usually small enough you can wait for the ingestion to complete. Do that with the `run_task()` call which handles the waiting. To illustrate, here is a query that ingests a subset of columns, and includes a few data clean-up steps:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "10f1e451",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "REPLACE INTO \"myWiki1\" OVERWRITE ALL\n",
- "SELECT\n",
- " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " namespace,\n",
- " page,\n",
- " channel,\n",
- " \"user\",\n",
- " countryName,\n",
- " CASE WHEN isRobot = 'true' THEN 1 ELSE 0 END AS isRobot,\n",
- " \"added\",\n",
- " \"delta\",\n",
- " CASE WHEN isNew = 'true' THEN 1 ELSE 0 END AS isNew,\n",
- " CAST(\"deltaBucket\" AS DOUBLE) AS deltaBucket,\n",
- " \"deleted\"\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
- " '{\"type\":\"json\"}',\n",
- " '[{\"name\":\"isRobot\",\"type\":\"string\"},{\"name\":\"channel\",\"type\":\"string\"},{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"flags\",\"type\":\"string\"},{\"name\":\"isUnpatrolled\",\"type\":\"string\"},{\"name\":\"page\",\"type\":\"string\"},{\"name\":\"diffUrl\",\"type\":\"string\"},{\"name\":\"added\",\"type\":\"long\"},{\"name\":\"comment\",\"type\":\"string\"},{\"name\":\"commentLength\",\"type\":\"long\"},{\"name\":\"isNew\",\"type\":\"string\"},{\"name\":\"isMinor\",\"type\":\"string\"},{\"name\":\"delta\",\"type\":\"long\"},{\"name\":\"isAnonymous\",\"type\":\"string\"},{\"name\":\"user\",\"type\":\"string\"},{\"name\":\"deltaBucket\",\"type\":\"long\"},{\"name\":\"deleted\",\"type\":\"long\"},{\"name\":\"namespace\",\"type\":\"string\"},{\"name\":\"cityName\",\"type\":\"string\"},{\"name\":\"countryName\",\"type\":\"string\"},{\"name\":\"regionIsoCode\",\"type\":\"string\"},{\"name\":\"metroCode\",\"type\":\"long\"},{\"name\":\"countryIsoCode\",\"type\":\"string\"},{\"name\":\"regionName\",\"type\":\"string\"}]'\n",
- " )\n",
- ")\n",
- "PARTITIONED BY DAY\n",
- "CLUSTERED BY namespace, page\n",
- "'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d752b1d4",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.run_task(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ef4512f8",
- "metadata": {},
- "source": [
- "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Wait for the table to become ready."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "37fcedf2",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_client.wait_until_ready('myWiki1')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "11d9c95a",
- "metadata": {},
- "source": [
- "`display.table()` lists the columns in a table."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b662697b",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.table('myWiki1')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "936f57fb",
- "metadata": {},
- "source": [
- "You can sample a few rows of data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c4cfa5dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.sql('SELECT * FROM myWiki1 LIMIT 10')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c1152f41",
- "metadata": {},
- "source": [
- "## Datasource Client\n",
- "\n",
- "The Datasource client lets you perform operations on datasource objects. The SQL layer allows you to get metadata and do queries. The datasource client works with the underlying segments. Explaining the full functionality is the topic of another notebook. For now, you can use the datasource client to clean up the datasource created above. The `True` argument asks for \"if exists\" semantics so you don't get an error if the datasource was already deleted."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fba659ce",
- "metadata": {},
- "outputs": [],
- "source": [
- "ds_client = druid.datasources\n",
- "ds_client.drop('myWiki', True)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c96fdcc6",
- "metadata": {},
- "source": [
- "## Tasks Client\n",
- "\n",
- "Use the tasks client to work with Overlord tasks. The `run_task()` call above actually uses the task client internally to poll Overlord."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b4f5ea17",
- "metadata": {},
- "outputs": [],
- "source": [
- "task_client = druid.tasks\n",
- "task_client.tasks()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "1deaf95f",
- "metadata": {},
- "source": [
- "## REST Client\n",
- "\n",
- "The Druid Python API starts with a REST client that itself is built on the `requests` package. The REST client implements the common patterns seen in the Druid REST API. You can create a client directly:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b1e55635",
- "metadata": {},
- "outputs": [],
- "source": [
- "from druidapi.rest import DruidRestClient\n",
- "rest_client = DruidRestClient(\"http://localhost:8888\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "dcb8055f",
- "metadata": {},
- "source": [
- "Or, if you have already created the Druid client, you can reuse the existing REST client. This is how the various other clients work internally."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "370ba76a",
- "metadata": {},
- "outputs": [],
- "source": [
- "rest_client = druid.rest"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2654e72c",
- "metadata": {},
- "source": [
- "Use the REST client if you need to make calls that are not yet wrapped by the Python API, or if you want to do something special. To illustrate the client, you can make some of the same calls as in the [Druid REST API notebook](api-tutorial.ipynb).\n",
- "\n",
- "The REST API maintains the Druid host: you just provide the specifc URL tail. There are methods to get or post JSON results. For example, to get status information:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9e42dfbc",
- "metadata": {},
- "outputs": [],
- "source": [
- "rest_client.get_json('/status')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "837e08b0",
- "metadata": {},
- "source": [
- "A quick comparison of the three approaches (Requests, REST client, Python client):\n",
- "\n",
- "Status:\n",
- "\n",
- "* Requests: `session.get(druid_host + '/status').json()`\n",
- "* REST client: `rest_client.get_json('/status')`\n",
- "* Status client: `status_client.status()`\n",
- "\n",
- "Health:\n",
- "\n",
- "* Requests: `session.get(druid_host + '/status/health').json()`\n",
- "* REST client: `rest_client.get_json('/status/health')`\n",
- "* Status client: `status_client.is_healthy()`\n",
- "\n",
- "Ingest data:\n",
- "\n",
- "* Requests: See the [REST tutorial](api_tutorial.ipynb)\n",
- "* REST client: as the REST tutorial, but use `rest_client.post_json('/druid/v2/sql/task', sql_request)` and\n",
- " `rest_client.get_json(f\"/druid/indexer/v1/task/{ingestion_taskId}/status\")`\n",
- "* SQL client: `sql_client.run_task(sql)`, also a form for a full SQL request.\n",
- "\n",
- "List datasources:\n",
- "\n",
- "* Requests: `session.get(druid_host + '/druid/coordinator/v1/datasources').json()`\n",
- "* REST client: `rest_client.get_json('/druid/coordinator/v1/datasources')`\n",
- "* Datasources client: `ds_client.names()`\n",
- "\n",
- "Query data, where `sql_request` is a properly formatted `SqlRequest` dictionary:\n",
- "\n",
- "* Requests: `session.post(druid_host + '/druid/v2/sql', json=sql_request).json()`\n",
- "* REST client: `rest_client.post_json('/druid/v2/sql', sql_request)`\n",
- "* SQL Client: `sql_client.show(sql)`, where `sql` is the query text\n",
- "\n",
- "In general, you have to provide the all the details for the Requests library. The REST client handles the low-level repetitious bits. The Python clients provide methods that encapsulate the specifics of the URLs and return formats."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "edc4ee39",
- "metadata": {},
- "source": [
- "## Constants\n",
- "\n",
- "Druid has a large number of special constants: type names, options, etc. The `consts` module provides definitions for many of these:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a90187c6",
- "metadata": {},
- "outputs": [],
- "source": [
- "from druidapi import consts"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fc535898",
- "metadata": {},
- "outputs": [],
- "source": [
- "help(consts)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "b661b29f",
- "metadata": {},
- "source": [
- "Using the constants avoids typos:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3393af62",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_client.tables(consts.SYS_SCHEMA)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "5e789ca7",
- "metadata": {},
- "source": [
- "## Tracing\n",
- "\n",
- "It is often handy to see what the Druid API is doing: what messages it sends to Druid. You may need to debug some function that isn't working as expected. Or, perhaps you want to see what is sent to Druid so you can replicate it in your own code. Either way, just turn on tracing:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ac68b60e",
- "metadata": {},
- "outputs": [],
- "source": [
- "druid.trace(True)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7b9dc7e3",
- "metadata": {},
- "source": [
- "Then, each call to Druid prints what it sends:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "72c955c0",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_client.tables()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ddaf0dc2",
- "metadata": {},
- "source": [
- "## Conclusion\n",
- "\n",
- "This notebook have you a whirlwind tour of the Python Druid API: just enough to check your cluster, ingest some data with MSQ and query that data. Druid has many more APIs. As noted earlier, the Python API is a work in progress: the team adds new wrappers as needed for tutorials. Your [contributions](https://github.com/apache/druid/pulls) and [feedback](https://github.com/apache/druid/issues) are welcome."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/02-datagen-intro.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/02-datagen-intro.ipynb
deleted file mode 100644
index e3b3df2994ea..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/01-introduction/02-datagen-intro.ipynb
+++ /dev/null
@@ -1,642 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "9e07b3f5-d919-4179-91a1-0f6b66c42757",
- "metadata": {},
- "source": [
- "# Data Generator Server\n",
- "\n",
- "The default Docker Compose deployment includes a data generation service created from the published Docker image at `imply/datagen:latest`. \n",
- "This image is built by the project https://github.com/implydata/druid-datagenerator. \n",
- "\n",
- "This notebook shows you how to use the data generation service included in the Docker Compose deployment. It explains how to use predefined data generator configurations as well as how to build a custom data generator. You will also learn how to create sample data files for batch ingestion and how to generate live streaming data for streaming ingestion.\n",
- "\n",
- "## Table of contents\n",
- "\n",
- "* [Initialization](#Initialization)\n",
- "* [List available configurations](#List-available-configurations)\n",
- "* [Generate a data file for backfilling history](#Generate-a-data-file-for-backfilling-history)\n",
- "* [Batch ingestion of generated files](#Batch-ingestion-of-generated-files)\n",
- "* [Generate custom data](#Generate-custom-data)\n",
- "* [Stream generated data](#Stream-generated-data)\n",
- "* [Ingest data from a stream](#Ingest-data-from-a-stream)\n",
- "* [Cleanup](#Cleanup)\n",
- "\n",
- "\n",
- "## Initialization\n",
- "\n",
- "To interact with the data generation service, use the REST client provided in the [`druidapi` Python package](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-index.html#python-api-for-druid)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f84766c7-c6a5-4496-91a3-abdb8ddd2375",
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "import time\n",
- "\n",
- "# Datagen client \n",
- "datagen = druidapi.rest.DruidRestClient(\"http://datagen:9999\")\n",
- "\n",
- "if (os.environ['DRUID_HOST'] == None):\n",
- " druid_host=f\"http://router:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- "\n",
- "# Druid client\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "\n",
- "\n",
- "\n",
- "# these imports and constants are used by multiple cells\n",
- "from datetime import datetime, timedelta\n",
- "import json\n",
- "\n",
- "headers = {\n",
- " 'Content-Type': 'application/json'\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c54af617-0998-4010-90c3-9b5a38a09a5f",
- "metadata": {},
- "source": [
- "### List available configurations\n",
- "Use the `/list` API endpoint to get the data generator's available configuration values with predefined data generator schemas."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1ba6a80a-c49b-4abf-943b-9dad82f2ae13",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.get(f\"/list\", require_ok=False).json())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ae88a3b7-60da-405d-bcf4-fb4affcfe973",
- "metadata": {},
- "source": [
- "### Generate a data file for backfilling history\n",
- "When generating a file for backfill purposes, you can select the start time and the duration of the simulation.\n",
- "\n",
- "Configure the data generator request as follows:\n",
- "* `name`: an arbitrary name you assign to the job. Refer to the job name to get the job status or to stop the job.\n",
- "* `target.type`: \"file\" to generate a data file\n",
- "* `target.path`: identifies the name of the file to generate. The data generator ignores any path specified and creates the file in the current working directory.\n",
- "* `time_type`,`time`: The data generator simulates the time range you specify with a start timestamp in the `time_type` property and a duration in the `time` property. To specify `time`, use the `h` suffix for hours, `m` for minutes, and `s` for seconds.\n",
- "- `concurrency` indicates the maximum number of entities used concurrently to generate events. Each entity is a separate state machine that simulates things like user sessions, IoT devices, or other concurrent sources of event data.\n",
- "\n",
- "The following example uses the `clickstream.json` predefined configuration to generate data into a file called `clicks.json`. The data generator starts the sample data at one hour prior to the current time and simulates events for a duration of one hour. Since it is simulated, it does this in just a few seconds."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "811ff58f-75af-4092-a08d-5e07a51592ff",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Configure the start time to one hour prior to the current time. \n",
- "startDateTime = (datetime.now() - timedelta(hours = 1)).strftime('%Y-%m-%dT%H:%M:%S.001')\n",
- "print(f\"Starting to generate history at {startDateTime}.\")\n",
- "\n",
- "# Give the datagen job a name for use in subsequent API calls\n",
- "job_name=\"gen_clickstream1\"\n",
- "\n",
- "# Generate a data file on the datagen server\n",
- "datagen_request = {\n",
- " \"name\": job_name,\n",
- " \"target\": { \"type\": \"file\", \"path\":\"clicks.json\"},\n",
- " \"config_file\": \"clickstream/clickstream.json\", \n",
- " \"time_type\": startDateTime,\n",
- " \"time\": \"1h\",\n",
- " \"concurrency\":100\n",
- "}\n",
- "response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
- "response.json()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d407d1d9-3f01-4128-a014-6a5f371c25a5",
- "metadata": {},
- "source": [
- "#### Display jobs\n",
- "Use the `/jobs` API endpoint to get the current jobs and job statuses."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3de698c5-bcf4-40c7-b295-728fb54d1f0a",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.get(f\"/jobs\").json())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "972ebed0-34a1-4ad2-909d-69b8b27c3046",
- "metadata": {},
- "source": [
- "#### Get status of a job\n",
- "Use the `/status/JOB_NAME` API endpoint to get the current jobs and their status."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "debce4f8-9c16-476c-9593-21ec984985d2",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.get(f\"/status/{job_name}\", require_ok=False).json())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ef818d78-6aa6-4d38-8a43-83416aede96f",
- "metadata": {},
- "source": [
- "#### Stop a job\n",
- "Use the `/stop/JOB_NAME` API endpoint to stop a job."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7631b8b8-d3d6-4803-9162-587f440d2ef2",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.post(f\"/stop/{job_name}\", '').json())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a8dc7d3-64e5-41e3-8c28-c5f19c0536f5",
- "metadata": {},
- "source": [
- "#### List files created on datagen server\n",
- "Use the `/files` API endpoint to list files available on the server."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "06ee36bd-2d2b-4904-9987-10636cf52aac",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.get(f\"/files\", '').json())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "83ef9edb-98e2-45b4-88e8-578703faedc1",
- "metadata": {},
- "source": [
- "### Batch ingestion of generated files\n",
- "Use a [Druid HTTP input source](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html#http-input-source) in the [EXTERN function](https://druid.apache.org/docs/latest/multi-stage-query/reference.html#extern-function) of a [SQL-based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) to load generated files.\n",
- "You can access files by name from within Druid using the URI `http://datagen:9999/file/FILE_NAME`. Alternatively, if you run Druid outside of Docker but on the same machine, access the file with `http://localhost:9999/file/FILE_NAME`.\n",
- "The following example assumes that both Druid and the data generator server are running in Docker Compose."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0d72b015-f8ec-4713-b6f2-fe7a15afff59",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "REPLACE INTO \"clicks\" OVERWRITE ALL\n",
- "WITH \"ext\" AS (SELECT *\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"http://datagen:9999/file/clicks.json\"]}',\n",
- " '{\"type\":\"json\"}'\n",
- " )\n",
- ") EXTEND (\"time\" VARCHAR, \"user_id\" VARCHAR, \"event_type\" VARCHAR, \"client_ip\" VARCHAR, \"client_device\" VARCHAR, \"client_lang\" VARCHAR, \"client_country\" VARCHAR, \"referrer\" VARCHAR, \"keyword\" VARCHAR, \"product\" VARCHAR))\n",
- "SELECT\n",
- " TIME_PARSE(\"time\") AS \"__time\",\n",
- " \"user_id\",\n",
- " \"event_type\",\n",
- " \"client_ip\",\n",
- " \"client_device\",\n",
- " \"client_lang\",\n",
- " \"client_country\",\n",
- " \"referrer\",\n",
- " \"keyword\",\n",
- " \"product\"\n",
- "FROM \"ext\"\n",
- "PARTITIONED BY DAY\n",
- "''' \n",
- "\n",
- "druid.display.run_task(sql)\n",
- "print(\"Waiting for segment avaialbility ...\")\n",
- "druid.sql.wait_until_ready('clicks')\n",
- "print(\"Data is available for query.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b0997b38-02c2-483e-bd15-439c4bf0097a",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT \"event_type\", \"user_id\", count( DISTINCT \"client_ip\") ip_count\n",
- "FROM \"clicks\"\n",
- "GROUP BY 1,2\n",
- "ORDER BY 3 DESC\n",
- "LIMIT 10\n",
- "'''\n",
- "druid.display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "66ec013f-28e4-4d5a-94a6-06e0ed537b4e",
- "metadata": {},
- "source": [
- "## Generate custom data\n",
- "\n",
- "You can find the full set of configuration options for the data generator in the [README](https://github.com/implydata/druid-datagenerator#data-generator-configuration).\n",
- "\n",
- "This section demonstrates a simple custom configuration as an example. Notice that the emitter defined the schema as a list of dimensions, each dimension specifies how its values are generated: "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d6451310-b7dd-4b39-a23b-7b735b152d6c",
- "metadata": {},
- "outputs": [],
- "source": [
- "gen_config = {\n",
- " \"emitters\": [\n",
- " {\n",
- " \"name\": \"simple_record\",\n",
- " \"dimensions\": [\n",
- " {\n",
- " \"type\": \"string\",\n",
- " \"name\": \"random_string_column\",\n",
- " \"length_distribution\": {\n",
- " \"type\": \"constant\",\n",
- " \"value\": 13\n",
- " },\n",
- " \"cardinality\": 0,\n",
- " \"chars\": \"#.abcdefghijklmnopqrstuvwxyz\"\n",
- " },\n",
- " {\n",
- " \"type\": \"int\",\n",
- " \"name\": \"distributed_number\",\n",
- " \"distribution\": {\n",
- " \"type\": \"uniform\",\n",
- " \"min\": 0,\n",
- " \"max\": 1000\n",
- " },\n",
- " \"cardinality\": 10,\n",
- " \"cardinality_distribution\": {\n",
- " \"type\": \"exponential\",\n",
- " \"mean\": 5\n",
- " }\n",
- " }\n",
- " ]\n",
- " }\n",
- " ],\n",
- " \"interarrival\": {\n",
- " \"type\": \"constant\",\n",
- " \"value\": 1\n",
- " },\n",
- " \"states\": [\n",
- " {\n",
- " \"name\": \"state_1\",\n",
- " \"emitter\": \"simple_record\",\n",
- " \"delay\": {\n",
- " \"type\": \"constant\",\n",
- " \"value\": 1\n",
- " },\n",
- " \"transitions\": [\n",
- " {\n",
- " \"next\": \"state_1\",\n",
- " \"probability\": 1.0\n",
- " }\n",
- " ]\n",
- " }\n",
- " ]\n",
- "}\n",
- "\n",
- "target = { \"type\":\"file\", \"path\":\"sample_data.json\"}"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "89a22645-aea5-4c15-b81a-959b27df731f",
- "metadata": {},
- "source": [
- "This example uses the `config` attribute of the request to configure a new custom data generator instead of using a predefined `config_file`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e5c535-3474-42b4-9772-14279e712f3d",
- "metadata": {},
- "outputs": [],
- "source": [
- "# generate 1 hour of simulated time using custom configuration\n",
- "datagen_request = {\n",
- " \"name\": \"sample_custom\",\n",
- " \"target\": target,\n",
- " \"config\": gen_config, \n",
- " \"time\": \"1h\",\n",
- " \"concurrency\":10,\n",
- " \"time_type\": \"SIM\"\n",
- "}\n",
- "response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
- "response.json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "952386f7-8181-4325-972b-5f30dc12cf21",
- "metadata": {},
- "outputs": [],
- "source": [
- "display(datagen.get(f\"/jobs\", require_ok=False).json())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "470b3a2a-4fd9-45a2-9221-497d906f62a9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# display the first 1k characters of the generated data file\n",
- "display( datagen.get(f\"/file/sample_data.json\").content[:1024])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "350faea6-55b0-4386-830c-5160ae495012",
- "metadata": {},
- "outputs": [],
- "source": [
- "datagen.post(f\"/stop/sample_custom\",'')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "77bff054-0f16-4fd5-8ade-2d44b30d0cf2",
- "metadata": {},
- "source": [
- "## Stream generated data\n",
- "\n",
- "The data generator works exactly the same whether it is writing data to a file or publishing messages into a stream. You only need to change the target configuration.\n",
- "\n",
- "To use the Kafka container running on Docker Compose, use the host name `kafka:9092`. This tutorial uses the KAFKA_HOST environment variable from Docker Compose to specify the Kafka host. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9959b7c3-6223-479d-b0c2-115a1c555090",
- "metadata": {},
- "outputs": [],
- "source": [
- "if (os.environ['KAFKA_HOST'] == None):\n",
- " kafka_host=f\"kafka:9092\"\n",
- "else:\n",
- " kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "497abc18-6538-4536-a17f-fe10c4367611",
- "metadata": {},
- "source": [
- "The simplest `target` object for Kafka and, similarly, Confluent is:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "686a74ab-e2dd-458e-9e93-10291064e9db",
- "metadata": {},
- "outputs": [],
- "source": [
- "target = {\n",
- " \"type\":\"kafka\",\n",
- " \"endpoint\": kafka_host,\n",
- " \"topic\": \"custom_data\"\n",
- "}\n",
- "\n",
- "# Generate 1 hour of real time using custom configuration, this means that this stream will run for an hour if not stopped\n",
- "datagen_request = {\n",
- " \"name\": \"sample_custom\",\n",
- " \"target\": target,\n",
- " \"config\": gen_config, \n",
- " \"time\": \"1h\",\n",
- " \"concurrency\":10,\n",
- " \"time_type\": \"REAL\"\n",
- "}\n",
- "response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
- "response.json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ec17d0c7-a3ab-4f37-bbf0-cc02bff44cf1",
- "metadata": {},
- "outputs": [],
- "source": [
- "time.sleep(1) # avoid race condition of async job start\n",
- "display(datagen.get(f\"/jobs\", require_ok=False).json())"
- ]
- },
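- {
- "cell_type": "markdown",
- "id": "b8d20f35-7a51-4de6-9a4d-1c2f3e4a5b60",
- "metadata": {},
- "source": [
- "To wait on this specific job rather than scanning the whole job list, you can poll the generator's status endpoint. The following cell is a small sketch: it assumes the same `/status/{name}` endpoint used in the streaming tutorials and simply prints whatever the generator reports for `sample_custom`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c9e31a46-8b62-4ef7-ab5e-2d3a4b5c6d71",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Poll the generator's status endpoint for this job a few times.\n",
- "for _ in range(5):\n",
- "    print(datagen.get(\"/status/sample_custom\", require_ok=False).json())\n",
- "    time.sleep(1)"
- ]
- },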
- {
- "cell_type": "markdown",
- "id": "84d7b706-9040-4a69-a956-1b1bbb037c32",
- "metadata": {},
- "source": [
- "### Ingest data from a stream \n",
- "This example shows how to start a streaming ingestion supervisor in Apache Druid to consume your custom data:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "51912409-e4e7-48d1-b3a5-b269622b4e56",
- "metadata": {},
- "outputs": [],
- "source": [
- "ingestion_spec ={\n",
- " \"type\": \"kafka\",\n",
- " \"spec\": {\n",
- " \"ioConfig\": {\n",
- " \"type\": \"kafka\",\n",
- " \"consumerProperties\": {\n",
- " \"bootstrap.servers\": \"kafka:9092\"\n",
- " },\n",
- " \"topic\": \"custom_data\",\n",
- " \"inputFormat\": {\n",
- " \"type\": \"json\"\n",
- " },\n",
- " \"useEarliestOffset\": True\n",
- " },\n",
- " \"tuningConfig\": {\n",
- " \"type\": \"kafka\",\n",
- " \"maxRowsInMemory\": 100000,\n",
- " \"resetOffsetAutomatically\": False\n",
- " },\n",
- " \"dataSchema\": {\n",
- " \"dataSource\": \"custom_data\",\n",
- " \"timestampSpec\": {\n",
- " \"column\": \"time\",\n",
- " \"format\": \"iso\"\n",
- " },\n",
- " \"dimensionsSpec\": {\n",
- " \"dimensions\": [\n",
- " \"random_string_column\",\n",
- " {\n",
- " \"type\": \"long\",\n",
- " \"name\": \"distributed_number\"\n",
- " }\n",
- " ]\n",
- " },\n",
- " \"granularitySpec\": {\n",
- " \"queryGranularity\": \"none\",\n",
- " \"rollup\": False,\n",
- " \"segmentGranularity\": \"hour\"\n",
- " }\n",
- " }\n",
- " }\n",
- "}\n",
- "\n",
- "headers = {\n",
- " 'Content-Type': 'application/json'\n",
- "}\n",
- "\n",
- "druid.rest.post(\"/druid/indexer/v1/supervisor\", json.dumps(ingestion_spec), headers=headers)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dddfb1cc-f863-4bf4-8c5a-b261b0b9c2f0",
- "metadata": {},
- "source": [
- "Query the data on the stream, but first wait for its availability. It takes a bit of time for the streaming tasks to start, but once they are consuming you can see data very close to real time: Run the following cell multiple times to see how the data is changing:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7e1284ed-5c49-4f37-81f7-c3b720473158",
- "metadata": {},
- "outputs": [],
- "source": [
- "druid.sql.wait_until_ready('custom_data', verify_load_status=False)\n",
- "druid.display.sql('''\n",
- "SELECT SUM(distributed_number) sum_randoms, count(*) total_count\n",
- "FROM custom_data\n",
- "''')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4486e430-0776-46ad-8a8b-4f0354f17bfb",
- "metadata": {},
- "source": [
- "### Cleanup\n",
- "\n",
- "Stop the streaming ingestion and the streaming producer:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38943a92-dc23-41cf-91a4-1b68d2178033",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(f\"Stop streaming generator: [{datagen.post('/stop/sample_custom','',require_ok=False)}]\")\n",
- "print(f'Reset offsets for streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/custom_data/reset\",\"\", require_ok=False)}]')\n",
- "print(f'Stop streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/custom_data/terminate\",\"\", require_ok=False)}]')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0cf53bdc-de7f-425d-84b1-68d0cef420d8",
- "metadata": {},
- "source": [
- "Wait for streaming ingestion to complete and then remove the custom data table:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "87341e7c-f7ab-488c-9913-091f712534cb",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(f\"Drop datasource: [{druid.datasources.drop('custom_data')}]\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/01-streaming-from-kafka.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/01-streaming-from-kafka.ipynb
deleted file mode 100644
index fc36b4b19ad2..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/01-streaming-from-kafka.ipynb
+++ /dev/null
@@ -1,598 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Ingest and query data from Apache Kafka\n",
- "\n",
- "\n",
- "\n",
- "This tutorial introduces you to streaming ingestion in Apache Druid using the Apache Kafka event streaming platform.\n",
- "Follow along to learn how to create and load data into a Kafka topic, start ingesting data from the topic into Druid, and query results over time. This tutorial assumes you have a basic understanding of Druid ingestion, querying, and API requests."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Table of contents\n",
- "\n",
- "* [Prerequisites](#Prerequisites)\n",
- "* [Load Druid API client](#Load-Druid-API-client)\n",
- "* [Create Kafka topic](#Create-Kafka-topic)\n",
- "* [Load data into Kafka topic](#Load-data-into-Kafka-topic)\n",
- "* [Start Druid ingestion](#Start-Druid-ingestion)\n",
- "* [Query Druid datasource and visualize query results](#Query-Druid-datasource-and-visualize-query-results)\n",
- "* [Learn more](#Learn-more)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `all-services` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "* A running Druid instance.\n",
- " * Update the `druid_host` variable to point to your Router endpoint. For example, `druid_host = \"http://localhost:8888\"`.\n",
- " * Update the `rest_client` variable to point to your Coordinator endpoint. For example, `\"http://localhost:8081\"`.\n",
- "* A running Kafka cluster.\n",
- " * Update the Kafka bootstrap servers to point to your servers. For example, `bootstrap_servers=[\"localhost:9092\"]`.\n",
- "* A running [Data Generator server](https://github.com/implydata/druid-datagenerator) accessible to the cluster.\n",
- " * Update the data generator client. For example `datagen = druidapi.rest.DruidRestClient(\"http://localhost:9999\")`.\n",
- "* The following Python packages:\n",
- " * `druidapi`, a Python client for Apache Druid\n",
- " * `kafka`, a Python client for Apache Kafka\n",
- " * `pandas`, `matplotlib`, and `seaborn` for data visualization\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load Druid API client"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To start the tutorial, run the following cell. It imports the required Python packages and defines a variable for the Druid client, and another for the SQL client used to run SQL commands."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "import time\n",
- "\n",
- "if 'DRUID_HOST' not in os.environ.keys():\n",
- " druid_host=f\"http://localhost:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- " \n",
- "print(f\"Opening a connection to {druid_host}.\")\n",
- "druid = druidapi.jupyter_client(druid_host)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Use kafka_host variable when connecting to kafka \n",
- "if 'KAFKA_HOST' not in os.environ.keys():\n",
- " kafka_host=f\"http://localhost:9092\"\n",
- "else:\n",
- " kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\"\n",
- "\n",
- "# this is the kafka topic we will be working with:\n",
- "topic_name = \"social_media\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "\n",
- "# shortcuts for display and sql api's\n",
- "display = druid.display\n",
- "sql_client = druid.sql\n",
- "\n",
- "# client for Data Generator API\n",
- "datagen = druidapi.rest.DruidRestClient(\"http://datagen:9999\")\n",
- "\n",
- "# client for Druid API\n",
- "rest_client = druid.rest"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Publish generated data directly to Kafka topic"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this section, you use the data generator included as part of the Docker application to generate a stream of messages. The data generator creates and send messages to a Kafka topic named `social_media`. To learn more about the Druid Data Generator, see the [project](https://github.com/implydata/druid-datagenerator) and the [data generation notebook](../01-introduction/02-datagen-intro.ipynb)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Generate data\n",
- "Run the following cells to load sample data into the `social_media` Kafka topic. The data generator sends events until it reaches 50,000 messages."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "headers = {\n",
- " 'Content-Type': 'application/json'\n",
- "}\n",
- "\n",
- "datagen_request = {\n",
- " \"name\": \"social_stream\",\n",
- " \"target\": { \"type\": \"kafka\", \"endpoint\": kafka_host, \"topic\": topic_name },\n",
- " \"config_file\": \"social/social_posts.json\", \n",
- " \"total_events\":50000,\n",
- " \"concurrency\":100\n",
- "}\n",
- "datagen.post(\"/start\", json.dumps(datagen_request), headers=headers)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check the status of the job with the following cell:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "time.sleep(1) # avoid race between start of the job and its status being available\n",
- "response = datagen.get('/status/social_stream')\n",
- "response.json()"
- ]
- },
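- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Optionally, peek at a few raw messages on the topic to confirm that data is flowing. This cell is a quick sketch using the `kafka` Python client from the prerequisites; `consumer_timeout_ms` ends the loop if no message arrives within five seconds."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from kafka import KafkaConsumer\n",
- "\n",
- "# Read a handful of messages from the beginning of the topic.\n",
- "consumer = KafkaConsumer(topic_name,\n",
- "                         bootstrap_servers=kafka_host,\n",
- "                         auto_offset_reset='earliest',\n",
- "                         consumer_timeout_ms=5000)\n",
- "for i, message in enumerate(consumer):\n",
- "    print(message.value.decode('utf-8'))\n",
- "    if i >= 4:\n",
- "        break\n",
- "consumer.close()"
- ]
- },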
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Start Druid ingestion"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that you have a new Kafka topic and data being streamed into the topic, you ingest the data into Druid by submitting a Kafka ingestion spec.\n",
- "The ingestion spec describes the following:\n",
- "* where to source the data to ingest (in `spec > ioConfig`),\n",
- "* the datasource to ingest data into (in `spec > dataSchema > dataSource`), and\n",
- "* what the data looks like (in `spec > dataSchema > dimensionsSpec`).\n",
- "\n",
- "Other properties control how Druid aggregates and stores data. For more information, see the Druid documenation:\n",
- "* [Apache Kafka ingestion](https://druid.apache.org/docs/latest/development/extensions-core/kafka-ingestion.html)\n",
- "* [Ingestion spec reference](https://druid.apache.org/docs/latest/ingestion/ingestion-spec.html)\n",
- "\n",
- "Run the following cells to define and view the Kafka ingestion spec."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "kafka_ingestion_spec = {\n",
- " \"type\": \"kafka\",\n",
- " \"spec\": {\n",
- " \"ioConfig\": {\n",
- " \"type\": \"kafka\",\n",
- " \"consumerProperties\": {\n",
- " \"bootstrap.servers\": \"kafka:9092\"\n",
- " },\n",
- " \"topic\": \"social_media\",\n",
- " \"inputFormat\": {\n",
- " \"type\": \"json\"\n",
- " },\n",
- " \"useEarliestOffset\": True\n",
- " },\n",
- " \"tuningConfig\": {\n",
- " \"type\": \"kafka\"\n",
- " },\n",
- " \"dataSchema\": {\n",
- " \"dataSource\": \"social_media\",\n",
- " \"timestampSpec\": {\n",
- " \"column\": \"time\",\n",
- " \"format\": \"iso\"\n",
- " },\n",
- " \"dimensionsSpec\": {\n",
- " \"dimensions\": [\n",
- " \"username\",\n",
- " \"post_title\",\n",
- " {\n",
- " \"type\": \"long\",\n",
- " \"name\": \"views\"\n",
- " },\n",
- " {\n",
- " \"type\": \"long\",\n",
- " \"name\": \"upvotes\"\n",
- " },\n",
- " {\n",
- " \"type\": \"long\",\n",
- " \"name\": \"comments\"\n",
- " },\n",
- " \"edited\"\n",
- " ]\n",
- " },\n",
- " \"granularitySpec\": {\n",
- " \"queryGranularity\": \"none\",\n",
- " \"rollup\": False,\n",
- " \"segmentGranularity\": \"hour\"\n",
- " }\n",
- " }\n",
- " }\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Send the spec to Druid to start the streaming ingestion from Kafka:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "headers = {\n",
- " 'Content-Type': 'application/json'\n",
- "}\n",
- "\n",
- "supervisor = rest_client.post(\"/druid/indexer/v1/supervisor\", json.dumps(kafka_ingestion_spec), headers=headers)\n",
- "print(supervisor.status_code)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A `200` response indicates that the request was successful. You can view the running ingestion task and the new datasource in the web console's [ingestion view](http://localhost:8888/unified-console.html#ingestion).\n",
- "\n",
- "The following cell pauses further execution until the ingestion has started and the datasource is available for querying:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "druid.sql.wait_until_ready('social_media', verify_load_status=False)"
- ]
- },
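- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also ask Druid for the supervisor's state directly. This is a sketch against the standard supervisor status API; a healthy, consuming supervisor reports a `RUNNING` state."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Fetch the supervisor's status report and show its state.\n",
- "supervisor_status = rest_client.get_json(\"/druid/indexer/v1/supervisor/social_media/status\")\n",
- "supervisor_status['payload']['state']"
- ]
- },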
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Query Druid datasource and visualize query results"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can now query the new datasource called `social_media`. In this section, you also visualize query results using the Matplotlib and Seaborn visualization libraries. Run the following cell import these packages."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import matplotlib\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run a simple query to view a subset of rows from the new datasource:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT * FROM social_media LIMIT 5\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this social media scenario, each incoming event represents a post on social media, for which you collect the timestamp, username, and post metadata. You are interested in analyzing the total number of upvotes for all posts, compared between users. Preview this data with the following query:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " COUNT(post_title) as num_posts,\n",
- " SUM(upvotes) as total_upvotes,\n",
- " username\n",
- "FROM social_media\n",
- "GROUP BY username\n",
- "ORDER BY num_posts\n",
- "'''\n",
- "\n",
- "response = sql_client.sql_query(sql)\n",
- "response.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Visualize the total number of upvotes per user using a line plot. You sort the results by username before plotting because the order of users may vary as new results arrive."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.DataFrame(response.json)\n",
- "df = df.sort_values('username')\n",
- "\n",
- "df.plot(x='username', y='total_upvotes', marker='o')\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.ylabel(\"Total number of upvotes\")\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The total number of upvotes likely depends on the total number of posts created per user. To better assess the relative impact per user, you compare the total number of upvotes (line plot) with the total number of posts."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "matplotlib.rc_file_defaults()\n",
- "ax1 = sns.set_style(style=None, rc=None )\n",
- "\n",
- "fig, ax1 = plt.subplots()\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "\n",
- "\n",
- "sns.lineplot(\n",
- " data=df, x='username', y='total_upvotes',\n",
- " marker='o', ax=ax1, label=\"Sum of upvotes\")\n",
- "ax1.get_legend().remove()\n",
- "\n",
- "ax2 = ax1.twinx()\n",
- "sns.barplot(data=df, x='username', y='num_posts',\n",
- " order=df['username'], alpha=0.5, ax=ax2, log=True,\n",
- " color=\"orange\", label=\"Number of posts\")\n",
- "\n",
- "\n",
- "# ask matplotlib for the plotted objects and their labels\n",
- "lines, labels = ax1.get_legend_handles_labels()\n",
- "lines2, labels2 = ax2.get_legend_handles_labels()\n",
- "ax2.legend(lines + lines2, labels + labels2, bbox_to_anchor=(1.55, 1))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You should see a correlation between total number of upvotes and total number of posts. In order to track user impact on a more equal footing, normalize the total number of upvotes relative to the total number of posts, and plot the result:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['upvotes_normalized'] = df['total_upvotes']/df['num_posts']\n",
- "\n",
- "df.plot(x='username', y='upvotes_normalized', marker='o', color='green')\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.ylabel(\"Number of upvotes (normalized)\")\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You've been working with data taken at a single snapshot in time from when you ran the last query. Run the same query again, and store the output in `response2`, which you will compare with the previous results:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "response2 = sql_client.sql_query(sql)\n",
- "response2.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Normalizing the data also helps you evaluate trends over time more consistently on the same plot axes. Plot the normalized data again, this time alongside the results from the previous snapshot:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df2 = pd.DataFrame(response2.json)\n",
- "df2 = df2.sort_values('username')\n",
- "df2['upvotes_normalized'] = df2['total_upvotes']/df2['num_posts']\n",
- "\n",
- "ax = df.plot(x='username', y='upvotes_normalized', marker='o', color='green', label=\"Time 1\")\n",
- "df2.plot(x='username', y='upvotes_normalized', marker='o', color='purple', ax=ax, label=\"Time 2\")\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.ylabel(\"Number of upvotes (normalized)\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This plot shows how some users maintain relatively consistent social media impact between the two query snapshots, whereas other users grow or decline in their influence."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cleanup \n",
- "The following cells stop the data generation and ingestion jobs and removes the datasource from Druid."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(f\"Stop streaming generator: [{datagen.post('/stop/social_stream','',require_ok=False)}]\")\n",
- "print(f'Reset offsets for ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/social_media/reset\",\"\", require_ok=False)}]')\n",
- "print(f'Stop streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/social_media/terminate\",\"\", require_ok=False)}]')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once the ingestion process ends and completes any final ingestion steps, remove the datasource with the following cell:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "time.sleep(5) # wait for streaming ingestion tasks to end\n",
- "print(f\"Drop datasource: [{druid.datasources.drop('social_media')}]\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Learn more\n",
- "\n",
- "This tutorial showed you how to create a Kafka topic using a Python client for Kafka, send a simulated stream of data to Kafka using a data generator, and query and visualize results over time. For more information, see the following resources:\n",
- "\n",
- "* [Apache Kafka ingestion](https://druid.apache.org/docs/latest/development/extensions-core/kafka-ingestion.html)\n",
- "* [Querying data](https://druid.apache.org/docs/latest/tutorials/tutorial-query.html)\n",
- "* [Tutorial: Run with Docker](https://druid.apache.org/docs/latest/tutorials/docker.html)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- },
- "vscode": {
- "interpreter": {
- "hash": "a4289e5b8bae5973a6609d90f7bc464162478362b9a770893a3c5c597b0b36e7"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/02-working-with-nested-columns.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/02-working-with-nested-columns.ipynb
deleted file mode 100644
index f4538607c416..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/02-ingestion/02-working-with-nested-columns.ipynb
+++ /dev/null
@@ -1,426 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Working with nested columns\n",
- "\n",
- "\n",
- "\n",
- "This tutorial demonstrates how to work with [nested columns](https://druid.apache.org/docs/latest/querying/nested-columns.html) in Apache Druid.\n",
- "\n",
- "Druid stores nested data structures in `COMPLEX` columns. In this tutorial you perform the following tasks:\n",
- "\n",
- "- Ingest nested JSON data using SQL-based ingestion.\n",
- "- Transform nested data during ingestion using SQL JSON functions.\n",
- "- Perform queries to display, filter, and aggregate nested data.\n",
- "- Use helper operators to examine nested data and plan your queries.\n",
- "\n",
- "Druid supports directly ingesting nested data with the following formats: JSON, Parquet, Avro, ORC, Protobuf."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Table of contents\n",
- "\n",
- "- [Prerequisites](#Prerequisites)\n",
- "- [Initialization](#Initialization)\n",
- "- [Ingest nested data](#Ingest-nested-data)\n",
- "- [Transform nested data](#Transform-nested-data)\n",
- "- [Query nested data](#Query-nested-data)\n",
- "- [Group, filter, and aggregate nested data](#Group-filter-and-aggregate-nested-data)\n",
- "- [Use helper operators](#Use-helper-operators)\n",
- "- [Learn more](#Learn-more)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
- "\n",
- "### Run with Docker\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- "\n",
- "### Run without Docker\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Apache Druid instance, with a `DRUID_HOST` local environment variable containing the server name of your Druid router.\n",
- "* [druidapi](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/druidapi/README.md), a Python client for Apache Druid. Follow the instructions in the Install section of the README file."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Initialization"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run the next cell to set up the Druid Python client's connection to Apache Druid.\n",
- "\n",
- "If successful, the Druid version number will be shown in the output."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "\n",
- "if 'DRUID_HOST' not in os.environ.keys():\n",
- " druid_host=f\"http://localhost:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- " \n",
- "print(f\"Opening a connection to {druid_host}.\")\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "\n",
- "display = druid.display\n",
- "sql_client = druid.sql\n",
- "status_client = druid.status\n",
- "\n",
- "status_client.version"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Ingest nested data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run the following cell to ingest sample clickstream data from the [Koalas to the Max](https://www.koalastothemax.com/) game."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "INSERT INTO example_koalas_nesteddata\n",
- " WITH \"source\" AS\n",
- " (SELECT * FROM TABLE(EXTERN('{\"type\":\"http\",\"uris\":[\"https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz\"]}',\n",
- " '{\"type\":\"json\"}','[{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"client_ip\",\"type\":\"string\"},\n",
- " {\"name\":\"session\",\"type\":\"string\"},{\"name\":\"session_length\",\"type\":\"string\"},{\"name\":\"event\",\"type\":\"COMPLEX\"},\n",
- " {\"name\":\"agent\",\"type\":\"COMPLEX\"},{\"name\":\"geo_ip\",\"type\":\"COMPLEX\"}]')))\n",
- " SELECT TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " \"client_ip\", \n",
- " \"session\", \n",
- " \"session_length\", \n",
- " \"event\", \n",
- " \"agent\", \n",
- " \"geo_ip\"\n",
- " FROM \"source\"\n",
- " PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready(\"example_koalas_nesteddata\")\n",
- "display.table(\"example_koalas_nesteddata\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Druid reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments.\n",
- "\n",
- "Wait for the table detail to display, then run the following cell to query the data and return selected columns from 3 rows. Note the nested structure of the `event`, `agent`, and `geo_ip` columns."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT session, event, agent, geo_ip \n",
- "FROM example_koalas_nesteddata LIMIT 3\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Transform nested data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can use Druid's [SQL JSON functions](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) to transform nested data in your ingestion query.\n",
- "\n",
- "Run the following cell to insert sample data into a new datasource named `example_koalas_nesteddata_transform`. The SELECT query extracts the `country` and `city` elements from the nested `geo_ip` column and creates a composite object `sessionDetails` containing `session` and `session_length`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "INSERT INTO example_koalas_nesteddata_transform\n",
- " WITH \"source\" AS\n",
- " (SELECT * FROM TABLE(EXTERN('{\"type\":\"http\",\"uris\":[\"https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz\"]}',\n",
- " '{\"type\":\"json\"}','[{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"session\",\"type\":\"string\"},{\"name\":\"session_length\",\"type\":\"string\"},\n",
- " {\"name\":\"event\",\"type\":\"COMPLEX\"},{\"name\":\"agent\",\"type\":\"COMPLEX\"},{\"name\":\"geo_ip\",\"type\":\"COMPLEX\"}]')))\n",
- " SELECT TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " JSON_QUERY(geo_ip, '$.country') as country,\n",
- " JSON_QUERY(geo_ip, '$.city') as city,\n",
- " JSON_OBJECT('session':session, 'session_length':session_length) as sessionDetails\n",
- " FROM \"source\"\n",
- " PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready(\"example_koalas_nesteddata_transform\")\n",
- "display.table(\"example_koalas_nesteddata_transform\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When the table detail displays, run the following cell to query the data and return `country`, `city`, and `sessionDetails` from 3 rows:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT country, city, sessionDetails \n",
- "FROM example_koalas_nesteddata_transform \n",
- "LIMIT 3\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Query nested data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run the following cell to display the data types for columns in the `example_koalas_nesteddata` datasource. Note that nested columns display as `COMPLEX`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE\n",
- "FROM INFORMATION_SCHEMA.COLUMNS\n",
- "WHERE TABLE_NAME = 'example_koalas_nesteddata'\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can use [`JSON_VALUE`](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) to extract specific elements from a `COMPLEX` object.\n",
- " \n",
- "Run the following cell to extract `continent` from `geo_ip` and `category` from `agent` for 3 rows:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT JSON_VALUE(geo_ip, '$.continent') as continent,\n",
- "JSON_VALUE(agent, '$.category') as category\n",
- "FROM example_koalas_nesteddata LIMIT 3\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Group, filter, and aggregate nested data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run the following cell to see how you can use the SELECT COUNT(DISTINCT) operator with `JSON_VALUE`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COUNT(DISTINCT(JSON_VALUE(geo_ip, '$.city'))) as \"Number of cities\"\n",
- "FROM example_koalas_nesteddata\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Run the following cell to filter and group a query using `JSON_VALUE`. The query selects the `browser` element from the `agent` column and the `country` and `city` elements from the `geo_ip` column, for all rows with city `Helsinki`. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT JSON_VALUE(agent, '$.browser') as browser,\n",
- "JSON_VALUE(geo_ip, '$.country') as country,\n",
- "JSON_VALUE(geo_ip, '$.city') as city\n",
- "FROM example_koalas_nesteddata\n",
- "WHERE JSON_VALUE(geo_ip, '$.city') in ('Helsinki')\n",
- "GROUP BY 1,2,3\n",
- "ORDER BY 1\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Use helper operators"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can use SQL helper operators such as [`JSON_KEYS`](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) and [`JSON_PATHS`](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) to examine nested data and plan your queries. Run the following cell to return an array of field names and an array of paths for the `geo_ip` nested column."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT ARRAY_CONCAT_AGG(DISTINCT JSON_KEYS(geo_ip, '$.')) as \"geo_ip keys\",\n",
- "ARRAY_CONCAT_AGG(DISTINCT JSON_PATHS(geo_ip)) as \"geo_ip paths\"\n",
- "FROM example_koalas_nesteddata\n",
- "'''\n",
- "resp = sql_client.sql_query(sql)\n",
- "resp.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Learn more"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This tutorial covers the basics of working with nested data. To learn more about nested data in Druid and related Druid features, see the following topics:\n",
- "\n",
- "- [Nested columns](https://druid.apache.org/docs/latest/querying/nested-columns.html) for information about the nested columns feature, with ingestion and query examples. \n",
- "- [SQL JSON functions](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) for details on all of the functions you used in this tutorial.\n",
- "- [SQL-based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) for information on how to use Druid SQL-based ingestion."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.3"
- },
- "vscode": {
- "interpreter": {
- "hash": "a4289e5b8bae5973a6609d90f7bc464162478362b9a770893a3c5c597b0b36e7"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/03-query/00-using-sql-with-druidapi.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/03-query/00-using-sql-with-druidapi.ipynb
deleted file mode 100644
index 47291d64dacc..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/03-query/00-using-sql-with-druidapi.ipynb
+++ /dev/null
@@ -1,699 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ad4e60b6",
- "metadata": {
- "tags": []
- },
- "source": [
- "# Learn the basics of Druid SQL\n",
- "\n",
- "\n",
- " \n",
- "Apache Druid supports two query languages: Druid SQL and native queries.\n",
- "Druid SQL is a Structured Query Language (SQL) dialect that enables you to query datasources in Apache Druid using SQL statements.\n",
- "SQL and Druid SQL use similar syntax, with some notable differences.\n",
- "Not all SQL functions are supported in Druid SQL. Instead, Druid includes Druid-specific SQL functions for optimized query performance.\n",
- "\n",
- "This interactive tutorial introduces you to the unique aspects of Druid SQL with the primary focus on the SELECT statement.\n",
- "To learn about native queries, see [Native queries](https://druid.apache.org/docs/latest/querying/querying.html)."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8d6bbbcb",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `druid-jupyter` or `all-services` profiles of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Druid instance.
\n",
- " Update the `druid_host` variable to point to your Router endpoint. For example:\n",
- " ```\n",
- " druid_host = \"http://localhost:8888\"\n",
- " ```\n",
- "* The [Druid Python API](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/) to simplify access to Druid.\n",
- "\n",
- "It will also help to have a working knowledge of SQL.\n",
- "\n",
- "\n",
- "To start the tutorial, run the following cell. It imports the required Python packages and defines a variable for the Druid client, and another for the SQL client used to run SQL commands."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b7f08a52",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "import druidapi\n",
- "\n",
- "# druid_host is the hostname and port for your Druid deployment. \n",
- "# In the Docker Compose tutorial environment, this is the Router\n",
- "# service running at \"http://router:8888\".\n",
- "# If you are not using the Docker Compose environment, edit the `druid_host`.\n",
- "\n",
- "druid_host = \"http://router:8888\"\n",
- "druid_host\n",
- "\n",
- "\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "display = druid.display\n",
- "sql_client = druid.sql"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "e893ef7d-7136-442f-8bd9-31b5a5276518",
- "metadata": {},
- "source": [
- "## Druid SQL statements\n",
- "\n",
- "The following are the main Druid SQL statements:\n",
- "\n",
- "* SELECT: extract data from a datasource\n",
- "* INSERT INTO: create a new datasource or append to an existing datasource\n",
- "* REPLACE INTO: create a new datasource or overwrite data in an existing datasource\n",
- "\n",
- "Druid SQL does not support CREATE TABLE, DELETE, and DROP TABLE statements.\n",
- "\n",
- "## Ingest data\n",
- "\n",
- "You can use either INSERT INTO or REPLACE INTO to create a datasource and ingest data.\n",
- "INSERT INTO and REPLACE INTO statements both require the PARTITIONED BY clause which defines the granularity of time-based partitioning. For more information, see [Partitioning by time](https://druid.apache.org/docs/latest/multi-stage-query/concepts.html#partitioning-by-time).\n",
- "\n",
- "Run the following cell to ingest data from an external source into a table called `wikipedia-sql-tutorial`. \n",
- "If you already have a table with the same name, use REPLACE INTO instead of INSERT INTO.\n",
- "\n",
- "Note the following about the query to ingest data:\n",
- "- The query uses the TIME_PARSE function to parse ISO 8601 time strings into timestamps. See the section on [timestamp values](#timestamp-values) for more information.\n",
- "- The asterisk ( * ) tells Druid to ingest all the columns.\n",
- "- The EXTERN statement lets you define the data source type and the input schema. See [Read external data with EXTERN](https://druid.apache.org/docs/latest/multi-stage-query/concepts.html#read-external-data-with-extern) for more information.\n",
- "\n",
- "The following cell defines the query, uses MSQ to ingest the data, and waits for the MSQ task to complete. You will see an asterisk `[*]` in the left margin while the task runs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "045f782c-74d8-4447-9487-529071812b51",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "INSERT INTO \"wikipedia-sql-tutorial\" \n",
- "SELECT TIME_PARSE(\"timestamp\") AS __time, * \n",
- "FROM TABLE (EXTERN(\n",
- " '{\"type\": \"http\", \"uris\": [\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
- " '{\"type\": \"json\"}', \n",
- " '[{\"name\": \"added\", \"type\": \"long\"}, {\"name\": \"channel\", \"type\": \"string\"}, {\"name\": \"cityName\", \"type\": \"string\"}, {\"name\": \"comment\", \"type\": \"string\"}, {\"name\": \"commentLength\", \"type\": \"long\"}, {\"name\": \"countryIsoCode\", \"type\": \"string\"}, {\"name\": \"countryName\", \"type\": \"string\"}, {\"name\": \"deleted\", \"type\": \"long\"}, {\"name\": \"delta\", \"type\": \"long\"}, {\"name\": \"deltaBucket\", \"type\": \"string\"}, {\"name\": \"diffUrl\", \"type\": \"string\"}, {\"name\": \"flags\", \"type\": \"string\"}, {\"name\": \"isAnonymous\", \"type\": \"string\"}, {\"name\": \"isMinor\", \"type\": \"string\"}, {\"name\": \"isNew\", \"type\": \"string\"}, {\"name\": \"isRobot\", \"type\": \"string\"}, {\"name\": \"isUnpatrolled\", \"type\": \"string\"}, {\"name\": \"metroCode\", \"type\": \"string\"}, {\"name\": \"namespace\", \"type\": \"string\"}, {\"name\": \"page\", \"type\": \"string\"}, {\"name\": \"regionIsoCode\", \"type\": \"string\"}, {\"name\": \"regionName\", \"type\": \"string\"}, {\"name\": \"timestamp\", \"type\": \"string\"}, {\"name\": \"user\", \"type\": \"string\"}]'\n",
- " ))\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "sql_client.run_task(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "a141e962",
- "metadata": {},
- "source": [
- "MSQ reports task completion as soon as ingestion is done. However, it takes a while for Druid to load the resulting segments. Wait for the table to become ready."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cca15307",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_client.wait_until_ready('wikipedia-sql-tutorial')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "240b0ad5-48f2-4737-b12b-5fd5f98da300",
- "metadata": {},
- "source": [
- "## Datasources\n",
- "\n",
- "Druid supports a variety of datasources, with the table datasource being the most common. In Druid documentation, the word \"datasource\" often implicitly refers to the table datasource.\n",
- "The [Datasources](https://druid.apache.org/docs/latest/querying/datasource.html) topic provides a comprehensive overview of datasources supported by Druid SQL.\n",
- "\n",
- "In Druid SQL, table datasources reside in the `druid` schema. This is the default schema, so table datasources can be referenced as either `druid.DATASOURCE_NAME` or `DATASOURCE_NAME`.\n",
- "\n",
- "For example, run the next cell to return the rows of the column named `channel` from the `wikipedia-sql-tutorial` table. Because this tutorial is running in Jupyter, the cells use the LIMIT clause to limit the size of the query results for display purposes. The cell uses the built-in table formatting feature of the Python API. You can also retrieve the values as a Python object if you wish to perform additional processing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6e5d8de0",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT \"channel\" FROM \"wikipedia-sql-tutorial\" LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "cbeb5a63",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Data types\n",
- "\n",
- "Druid maps SQL data types onto native types at query runtime.\n",
- "The following native types are supported for Druid columns:\n",
- "\n",
- "* SQL: `VARCHAR`, Druid: `STRING`: UTF-8 encoded strings and string arrays\n",
- "* SQL: `BIGINT`, Druid: `LONG`: 64-bit signed int\n",
- "* SQL & Druid: `FLOAT`: 32-bit float\n",
- "* SQL & Druid: `DOUBLE`: 64-bit float\n",
- "* Druid `COMPLEX`: represents non-standard data types, such as nested JSON, hyperUnique and approxHistogram aggregators, and DataSketches aggregators\n",
- "\n",
- "For reference on how SQL data types map onto Druid native types, see [Standard types](https://druid.apache.org/docs/latest/querying/sql-data-types.html#standard-types).\n",
- "\n",
- "Druid exposes table and column metadata through [INFORMATION_SCHEMA](https://druid.apache.org/docs/latest/querying/sql-metadata-tables.html#information-schema) tables. Run the following query to retrieve metadata for the `wikipedia-sql-tutorial` datasource. In the response body, each JSON object correlates to a column in the table.\n",
- "Check the objects' `DATA_TYPE` property for SQL data types. You should see TIMESTAMP, BIGINT, and VARCHAR SQL data types. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c7a86e2e",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COLUMN_NAME, DATA_TYPE \n",
- "FROM INFORMATION_SCHEMA.COLUMNS \n",
- "WHERE \"TABLE_SCHEMA\" = 'druid' AND \"TABLE_NAME\" = 'wikipedia-sql-tutorial' \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "59f41229",
- "metadata": {},
- "source": [
- "This is such a common query that the SQL client has it built in:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1ac6c410",
- "metadata": {},
- "outputs": [],
- "source": [
- "display.table('wikipedia-sql-tutorial')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c59ca797-dd91-442b-8d02-67b711b3fcc6",
- "metadata": {},
- "source": [
- "### Timestamp values\n",
- "\n",
- "Druid stores timestamp values as the number of milliseconds since the Unix epoch.\n",
- "Primary timestamps are stored in a column named `__time`.\n",
- "If a dataset doesn't have a timestamp, Druid uses the default value of `1970-01-01 00:00:00`.\n",
- "\n",
- "Druid time functions perform best when used with the `__time` column.\n",
- "By default, time functions use the UTC time zone.\n",
- "For more information about timestamp handling, see [Date and time functions](https://druid.apache.org/docs/latest/querying/sql-scalar.html#date-and-time-functions).\n",
- "\n",
- "Run the following cell to see a time function at work. This example uses the `TIME_IN_INTERVAL` function to query the `channel` and `page` columns of the `wikipedia-sql-tutorial` for rows whose timestamp is contained within the specified interval. The cell groups the results by columns."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "16c1a31a",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, page\n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "WHERE TIME_IN_INTERVAL(__time, '2016-06-27T00:05:54.56/2016-06-27T00:06:53')\n",
- "GROUP BY channel, page\n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
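- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7d1f2b8e",
- "metadata": {},
- "source": [
- "To confirm that `__time` is stored as milliseconds since the epoch, you can convert it explicitly. This short sketch uses the `TIMESTAMP_TO_MILLIS` function from the date and time functions linked above."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8e2a3c9f",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT __time, TIMESTAMP_TO_MILLIS(__time) AS epoch_millis\n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "LIMIT 3\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },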
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f7cfdfae-ccba-49ba-a70f-63d0bd3527b2",
- "metadata": {},
- "source": [
- "### NULL values\n",
- "\n",
- "Druid supports SQL compatible NULL handling, allowing string columns to distinguish empty strings from NULL and numeric columns to contain NULL rows. To store and query data in SQL compatible mode, explicitly set the `useDefaultValueForNull` property to `false` in `_common/common.runtime.properties`. See [Configuration reference](https://druid.apache.org/docs/latest/configuration/index.html) for common configuration properties.\n",
- "\n",
- "When `useDefaultValueForNull` is set to `true` (default behavior), Druid stores NULL values as `0` for numeric columns and as `''` for string columns."
- ]
- },
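- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9f3b4d0a",
- "metadata": {},
- "source": [
- "To check which mode your cluster is running in, compare empty strings with NULLs directly. The following cell is a sketch: in the default mode, empty strings and NULLs are interchangeable, so the two counts below match; in SQL compatible mode they can differ."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a04c5e1b",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COUNT(*) FILTER (WHERE \"cityName\" = '') AS empty_strings,\n",
- "       COUNT(*) FILTER (WHERE \"cityName\" IS NULL) AS null_values\n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },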
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "29c24856",
- "metadata": {
- "tags": []
- },
- "source": [
- "## SELECT statement syntax\n",
- "\n",
- "Druid SQL supports SELECT statements with the following structure:\n",
- "\n",
- "``` mysql\n",
- "[ EXPLAIN PLAN FOR ]\n",
- "[ WITH tableName [ ( column1, column2, ... ) ] AS ( query ) ]\n",
- "SELECT [ ALL | DISTINCT ] { * | exprs }\n",
- "FROM { | () | [ INNER | LEFT ] JOIN ON condition }\n",
- "[ WHERE expr ]\n",
- "[ GROUP BY [ exprs | GROUPING SETS ( (exprs), ... ) | ROLLUP (exprs) | CUBE (exprs) ] ]\n",
- "[ HAVING expr ]\n",
- "[ ORDER BY expr [ ASC | DESC ], expr [ ASC | DESC ], ... ]\n",
- "[ LIMIT limit ]\n",
- "[ OFFSET offset ]\n",
- "[ UNION ALL ]\n",
- "```\n",
- "\n",
- "As a general rule, use the LIMIT clause with `SELECT *` to limit the number of rows returned. "
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8cf212e5-fb3f-4206-acdd-46ef1da327ab",
- "metadata": {
- "tags": []
- },
- "source": [
- "## WHERE clause\n",
- "\n",
- "Druid SQL uses the [SQL WHERE clause](https://druid.apache.org/docs/latest/querying/sql.html#where) of a SELECT statement to fetch data based on a particular condition.\n",
- "\n",
- "In most cases, filtering your results by time using the WHERE clause improves query performance.\n",
- "This is because Druid partitions data into time chunks and having a time range allows Druid to skip over unrelated data.\n",
- "At ingestion time, you can further partition segments within a time chunk using the CLUSTERED BY clause to improve locality.\n",
- "At query time, using the WHERE clause to filter on clustered dimensions can improve query performance.\n",
- "\n",
- "Druid supports range filtering on columns that contain long millisecond values, with the boundaries specified as ISO 8601 time intervals. This is suitable for the `__time` column, long metric columns, and dimensions with values that can be parsed as long milliseconds.\n",
- " \n",
- "For example, the following cell uses a comparison operator on the `__time` field to filter results from a certain time range."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "123187d3",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, page, comment \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "WHERE __time >= TIMESTAMP '2015-09-12 23:33:55' \n",
- " AND namespace = 'Main' \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "5160db26-7e8d-40f7-8588-b7eabfc08355",
- "metadata": {},
- "source": [
- "### Comparison operators\n",
- "\n",
- "Druid SQL supports the following comparison operators. You can use these operators in conjunction with the WHERE clause to compare expressions.\n",
- "\n",
- "- equal to (=)\n",
- "- greater than(>)\n",
- "- less than (<)\n",
- "- greater than or equal (>=)\n",
- "- less than or equal (<=)\n",
- "- not equal to( <>)\n",
- "\n",
- "For example, the next cell returns the first seven records that match the following criteria:\n",
- "- `cityName` is not an empty string\n",
- "- `countryIsoCode` value equals to `US`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b4656c81",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, page, comment \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "WHERE \"cityName\" <> '' AND \"countryIsoCode\" = 'US' \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "dd24470d-25c2-4031-a711-8477d69c9e94",
- "metadata": {
- "tags": []
- },
- "source": [
- "### Logical operators\n",
- "\n",
- "Druid's handling of logical operators is comparable to SQL with a few exceptions. For example, if an IN list contains NULL, the IN operator matches NULL values. This behavior is different from the SQL IN operator, which does not match NULL values. For a complete list of logical SQL operators supported by Druid SQL, see [Logical operators](https://druid.apache.org/docs/latest/querying/sql-operators.html#logical-operators)."
- ]
- },
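- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "b15d6f2c",
- "metadata": {},
- "source": [
- "For example, the following cell combines IN and AND to count non-robot edits on two channels. The channel values here are assumptions about the sample data; adjust them to match the channels in your table."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c26e7a3d",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, COUNT(*) AS counts\n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "WHERE channel IN ('#en.wikipedia', '#fr.wikipedia')\n",
- "  AND \"isRobot\" = 'false'\n",
- "GROUP BY channel\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },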
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2baf21b9-74d1-4df6-862f-afbaeef1812b",
- "metadata": {},
- "source": [
- "## GROUP BY clause\n",
- "\n",
- "Druid SQL uses the [SQL GROUP BY](https://druid.apache.org/docs/latest/querying/sql.html#group-by) clause to separate items into groups, where each group is composed of rows with identical values. \n",
- "The GROUP BY clause is often used with [aggregation functions](https://druid.apache.org/docs/latest/querying/sql-aggregations.html), such as COUNT or SUM, to produce summary values for each group.\n",
- "\n",
- "For example, the following cell counts all of the entries separated by the field `channel`. The output is limited to seven rows and has two fields: `channel` and `counts`. For each unique value of `channel`, Druid aggregates all rows having that value, counts the number of entries in the group, and assigns the results to a field called `counts`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0127e401",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, COUNT(*) AS counts \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "GROUP BY channel \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "eab67db4-a0f3-4177-b5be-1fef355bf33f",
- "metadata": {},
- "source": [
- "You can further define the groups by specifying multiple dimensions.\n",
- "Druid SQL supports using numbers in GROUP BY and ORDER BY clauses to refer to column positions in the SELECT clause.\n",
- "Similar to SQL, Druid SQL uses one-based indexing to reference elements in SQL statements.\n",
- "\n",
- "For example, the next cell aggregates entries grouped by fields `cityName` and `countryName`.\n",
- "The output has three fields: `cityName`, `countryName`, and `counts`. For each unique combination of `cityName` and `countryName`, Druid aggregates all rows and averages the entries in the group.\n",
- "The output is limited to seven rows for display purposes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d3724cc3",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT cityName, countryName, COUNT(*) AS counts \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "GROUP BY cityName, countryName \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9c2e56af-3fdf-40f1-869b-822ac8aafbc8",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Query types\n",
- "\n",
- "Druid SQL is optimized for the following [native query](https://druid.apache.org/docs/latest/querying/querying.html) types:\n",
- "- Scan\n",
- "- Timeseries\n",
- "- TopN\n",
- "- GroupBy\n",
- "\n",
- "Native queries are low-level JSON-based queries designed to be lightweight and complete very quickly.\n",
- "Druid translates SQL statements into native queries using the [Apache Calcite](https://calcite.apache.org/) data management framework. The queries are then executed by the Druid cluster.\n",
- "\n",
- "To get information about how a Druid SQL query is translated into a native query type, add [EXPLAIN PLAN FOR](https://druid.apache.org/docs/latest/querying/sql.html#explain-plan) to the beginning of the query.\n",
- "Alternatively, you can set up [request logging](https://druid.apache.org/docs/latest/configuration/index.html#request-logging)."
- ]
- },
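- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7e2b4d6f-90aa-4c3e-8f1d-2a5b6c7d8e01",
- "metadata": {},
- "source": [
- "The following cell is a minimal sketch of the `EXPLAIN PLAN FOR` approach: it prepends the keywords to one of the earlier queries and displays the plan information that Druid returns, using the `display` client set up earlier in this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7e2b4d6f-90aa-4c3e-8f1d-2a5b6c7d8e02",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "EXPLAIN PLAN FOR\n",
- "SELECT channel, COUNT(*) AS counts\n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "GROUP BY channel\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },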
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2de6e6ba-473c-4ef0-9739-a9472a4c7065",
- "metadata": {},
- "source": [
- "### Scan\n",
- "\n",
- "The Scan native query type returns raw Druid rows in streaming mode.\n",
- "Druid SQL uses the Scan query type for queries that do not aggregate—queries that do not have GROUP BY or DISTINCT clauses.\n",
- "\n",
- "For example, run the next cell to scan the `wikipedia-sql-tutorial` table for comments from Mexico City. Calcite translates this Druid SQL query into the Scan native query type."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3ed58bfd",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT comment AS \"Entry\" \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "WHERE cityName = 'Mexico City' \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "407cf489-3947-4326-9d81-18b38abaee58",
- "metadata": {},
- "source": [
- "### TopN\n",
- "\n",
- "The TopN native query type returns a sorted set of results for the values in a given dimension according to some criteria. TopN results are always computed in memory. In some cases, the TopN query type delivers approximate ranking and results. To prevent this, set the `useApproximateTopN` query context parameter to `false` when calling the [Druid SQL API](https://druid.apache.org/docs/latest/api-reference/sql-api.html). See [SQL query context](https://druid.apache.org/docs/latest/querying/sql-query-context.html) for more information.\n",
- "\n",
- "Druid SQL uses TopN for queries that meet the following criteria:\n",
- "- queries that GROUP BY a single expression\n",
- "- queries that have ORDER BY and LIMIT clauses\n",
- "- queries that do not contain HAVING\n",
- "- queries that are not nested\n",
- "\n",
- "For example, the next cell returns the channels based on the number of events for each one in ascending order. Calcite translates this Druid SQL query into the TopN native query type."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bb694442",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT channel, count(*) as \"Number of events\" \n",
- "FROM \"wikipedia-sql-tutorial\" \n",
- "GROUP BY channel \n",
- "ORDER BY \"Number of events\" ASC \n",
- "LIMIT 5\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
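- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9d3e5f7a-11bb-4c2d-8e0f-6a7b8c9d0e01",
- "metadata": {},
- "source": [
- "As a sketch of turning approximation off, the next cell reuses the `sql` string from the previous cell, wraps it in a SQL request, and sets the `useApproximateTopN` context parameter to `false`. It assumes the druidapi SQL client is available as `druid.sql`, the pattern used in other notebooks in this series; adjust if your setup differs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9d3e5f7a-11bb-4c2d-8e0f-6a7b8c9d0e02",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Assumes a druidapi client named `druid`, as used elsewhere in this series\n",
- "sql_client = druid.sql\n",
- "\n",
- "# Wrap the previous cell's query and disable TopN approximation\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_context(\"useApproximateTopN\", \"false\")\n",
- "resp = sql_client.sql_query(req)\n",
- "resp.rows"
- ]
- },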
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "38ee7d1d-2d47-4d36-b8c6-b0868c36c871",
- "metadata": {},
- "source": [
- "### Timeseries\n",
- "\n",
- "The Timeseries native query type returns an array of JSON objects, where each object represents a value asked for by the Timeseries query.\n",
- "\n",
- "Druid SQL uses Timeseries for queries that meet the following criteria:\n",
- "- queries that GROUP BY `FLOOR(__time TO unit)` or `TIME_FLOOR(__time, period)`\n",
- "- queries that do not contain other grouping expressions\n",
- "- queries that do not contain HAVING\n",
- "- queries that are not nested\n",
- "- queries that either have no ORDER BY clause or an ORDER BY clause that orders by the same expression as present in GROUP BY\n",
- "\n",
- "For example, Calcite translates the following Druid SQL query into the Timeseries native query type:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "644b0cdd",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT \n",
- " countryName AS \"Country\", \n",
- " SUM(deleted) AS deleted, \n",
- " SUM(added) AS added \n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "WHERE countryName = 'France' \n",
- "GROUP BY countryName , FLOOR(__time TO HOUR) \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "eb5cfc5c-8f58-4e56-a218-7a55867e3e0c",
- "metadata": {},
- "source": [
- "### GroupBy\n",
- "\n",
- "The GroupBy native query type returns an array of JSON objects where each object represents a grouping asked for by the GroupBy query. GroupBy delivers exact results and rankings.\n",
- "\n",
- "Druid SQL uses GroupBy for aggregations, including nested aggregation queries.\n",
- "\n",
- "For example, Calcite translates the following Druid SQL query into the GroupBy native query type:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b4f5d1dd",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT \n",
- " countryName AS \"Country\\\", \n",
- " countryIsoCode AS \"ISO\" \n",
- "FROM \"wikipedia-sql-tutorial\"\n",
- "WHERE channel = '#es.wikipedia' \n",
- "GROUP BY countryName, countryIsoCode \n",
- "LIMIT 7\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8fbfa1fa-2cde-46d5-8107-60bd436fb64e",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Learn more\n",
- "\n",
- "This tutorial covers the basics of Druid SQL. To learn more about querying datasets using Druid SQL, see the following topics:\n",
- "\n",
- "- [Druid SQL overview](https://druid.apache.org/docs/latest/querying/sql.html) to learn about Druid SQL syntax.\n",
- "- [SQL data types](https://druid.apache.org/docs/latest/querying/sql-data-types.html) for information on how SQL data types map to Druid SQL.\n",
- "- [SQL query translation](https://druid.apache.org/docs/latest/querying/sql-translation.html) for best practices that help you minimize the impact of SQL translation.\n",
- "- [Druid SQL operators](https://druid.apache.org/docs/latest/querying/sql-operators.html) for operators supported by Druid SQL.\n",
- "- [SQL aggregation functions](https://druid.apache.org/docs/latest/querying/sql-aggregations.html) for reference on the aggregation functions supported by Druid SQL. \n",
- "- [Unsupported features](https://druid.apache.org/docs/latest/querying/sql-translation.html#unsupported-features) for a list of SQL features not supported by Druid SQL.\n",
- "- [SQL keywords](https://calcite.apache.org/docs/reference.html#keywords) for a list of SQL keywords."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- },
- "toc-autonumbering": false,
- "toc-showcode": false,
- "toc-showmarkdowntxt": false,
- "toc-showtags": false,
- "vscode": {
- "interpreter": {
- "hash": "392d024d9e577b3899d42c3b7a7b6a06db0d6efdc0b44e46dc281b668e7b3887"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/03-query/02-approxRanking.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/03-query/02-approxRanking.ipynb
deleted file mode 100644
index e49ac92e84a0..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/03-query/02-approxRanking.ipynb
+++ /dev/null
@@ -1,596 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "557e06e8-9b35-4b34-8322-8a8ede6de709",
- "metadata": {},
- "source": [
- "# Using TopN approximation in Druid queries\n",
- "\n",
- "\n",
- "\n",
- "Imagine you’re building a dynamic filter in your app: you want to populate it with, say, the top most popular (COUNT) dimension values in descending order (ORDER BY). Druid speeds up this type of query using TopN approximation by default. In this tutorial, work through some examples and see the effect of turning approximation off."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c94ff5c9-ada9-4f1d-8541-649e70cfc9a3",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 26.0.0 or later.\n",
- "\n",
- "#### Run using Docker\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- " \n",
- "#### Run without using Docker\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Apache Druid instance, with a `DRUID_HOST` local environment variable containing the server name of your Druid router.\n",
- "* [druidapi](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/druidapi/README.md), a Python client for Apache Druid. Follow the instructions in the Install section of the README file.\n",
- "* [matplotlib](https://matplotlib.org/), a library for creating visualizations in Python.\n",
- "* [pandas](https://pandas.pydata.org/), a data analysis and manipulation tool."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e6b56cfc-9951-4a4e-a3f4-828e2dd5b3b5",
- "metadata": {},
- "source": [
- "### Initialize Python\n",
- "\n",
- "Run the next cell to set up the Druid Python client's connection to Apache Druid.\n",
- "\n",
- "If successful, the Druid version number will be shown in the output."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "685b872e-0d59-4100-a636-39ec93c627fb",
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "\n",
- "if 'DRUID_HOST' not in os.environ.keys():\n",
- " druid_host=f\"http://localhost:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- " \n",
- "print(f\"Opening a connection to {druid_host}.\")\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "\n",
- "display = druid.display\n",
- "sql_client = druid.sql\n",
- "status_client = druid.status\n",
- "\n",
- "status_client.version"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d6fe9c99-ee0d-4205-9ca5-a8810c977335",
- "metadata": {},
- "source": [
- "### Load example data\n",
- "\n",
- "Once your Druid environment is up and running, ingest the sample data for this tutorial.\n",
- "\n",
- "Run the following cell to create a table called `example-flights-topn`. When completed, you'll see a description of the final table.\n",
- "\n",
- "Monitor the ingestion task process in the Druid console."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e89a3000-a65e-4c4a-a917-3c37cbe975b3",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "REPLACE INTO \"example-flights-topn\" OVERWRITE ALL\n",
- "WITH \"ext\" AS (SELECT *\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"https://static.imply.io/example-data/flight_on_time/flights/On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2005_11.csv.zip\"]}',\n",
- " '{\"type\":\"csv\",\"findColumnsFromHeader\":true}'\n",
- " )\n",
- ") EXTEND (\"depaturetime\" VARCHAR, \"arrivalime\" VARCHAR, \"Year\" BIGINT, \"Quarter\" BIGINT, \"Month\" BIGINT, \"DayofMonth\" BIGINT, \"DayOfWeek\" BIGINT, \"FlightDate\" VARCHAR, \"Reporting_Airline\" VARCHAR, \"DOT_ID_Reporting_Airline\" BIGINT, \"IATA_CODE_Reporting_Airline\" VARCHAR, \"Tail_Number\" VARCHAR, \"Flight_Number_Reporting_Airline\" BIGINT, \"OriginAirportID\" BIGINT, \"OriginAirportSeqID\" BIGINT, \"OriginCityMarketID\" BIGINT, \"Origin\" VARCHAR, \"OriginCityName\" VARCHAR, \"OriginState\" VARCHAR, \"OriginStateFips\" BIGINT, \"OriginStateName\" VARCHAR, \"OriginWac\" BIGINT, \"DestAirportID\" BIGINT, \"DestAirportSeqID\" BIGINT, \"DestCityMarketID\" BIGINT, \"Dest\" VARCHAR, \"DestCityName\" VARCHAR, \"DestState\" VARCHAR, \"DestStateFips\" BIGINT, \"DestStateName\" VARCHAR, \"DestWac\" BIGINT, \"CRSDepTime\" BIGINT, \"DepTime\" BIGINT, \"DepDelay\" BIGINT, \"DepDelayMinutes\" BIGINT, \"DepDel15\" BIGINT, \"DepartureDelayGroups\" BIGINT, \"DepTimeBlk\" VARCHAR, \"TaxiOut\" BIGINT, \"WheelsOff\" BIGINT, \"WheelsOn\" BIGINT, \"TaxiIn\" BIGINT, \"CRSArrTime\" BIGINT, \"ArrTime\" BIGINT, \"ArrDelay\" BIGINT, \"ArrDelayMinutes\" BIGINT, \"ArrDel15\" BIGINT, \"ArrivalDelayGroups\" BIGINT, \"ArrTimeBlk\" VARCHAR, \"Cancelled\" BIGINT, \"CancellationCode\" VARCHAR, \"Diverted\" BIGINT, \"CRSElapsedTime\" BIGINT, \"ActualElapsedTime\" BIGINT, \"AirTime\" BIGINT, \"Flights\" BIGINT, \"Distance\" BIGINT, \"DistanceGroup\" BIGINT, \"CarrierDelay\" BIGINT, \"WeatherDelay\" BIGINT, \"NASDelay\" BIGINT, \"SecurityDelay\" BIGINT, \"LateAircraftDelay\" BIGINT, \"FirstDepTime\" VARCHAR, \"TotalAddGTime\" VARCHAR, \"LongestAddGTime\" VARCHAR, \"DivAirportLandings\" VARCHAR, \"DivReachedDest\" VARCHAR, \"DivActualElapsedTime\" VARCHAR, \"DivArrDelay\" VARCHAR, \"DivDistance\" VARCHAR, \"Div1Airport\" VARCHAR, \"Div1AirportID\" VARCHAR, \"Div1AirportSeqID\" VARCHAR, \"Div1WheelsOn\" VARCHAR, \"Div1TotalGTime\" VARCHAR, \"Div1LongestGTime\" VARCHAR, \"Div1WheelsOff\" VARCHAR, \"Div1TailNum\" VARCHAR, \"Div2Airport\" VARCHAR, \"Div2AirportID\" VARCHAR, \"Div2AirportSeqID\" VARCHAR, \"Div2WheelsOn\" VARCHAR, \"Div2TotalGTime\" VARCHAR, \"Div2LongestGTime\" VARCHAR, \"Div2WheelsOff\" VARCHAR, \"Div2TailNum\" VARCHAR, \"Div3Airport\" VARCHAR, \"Div3AirportID\" VARCHAR, \"Div3AirportSeqID\" VARCHAR, \"Div3WheelsOn\" VARCHAR, \"Div3TotalGTime\" VARCHAR, \"Div3LongestGTime\" VARCHAR, \"Div3WheelsOff\" VARCHAR, \"Div3TailNum\" VARCHAR, \"Div4Airport\" VARCHAR, \"Div4AirportID\" VARCHAR, \"Div4AirportSeqID\" VARCHAR, \"Div4WheelsOn\" VARCHAR, \"Div4TotalGTime\" VARCHAR, \"Div4LongestGTime\" VARCHAR, \"Div4WheelsOff\" VARCHAR, \"Div4TailNum\" VARCHAR, \"Div5Airport\" VARCHAR, \"Div5AirportID\" VARCHAR, \"Div5AirportSeqID\" VARCHAR, \"Div5WheelsOn\" VARCHAR, \"Div5TotalGTime\" VARCHAR, \"Div5LongestGTime\" VARCHAR, \"Div5WheelsOff\" VARCHAR, \"Div5TailNum\" VARCHAR, \"Unnamed: 109\" VARCHAR))\n",
- "SELECT\n",
- " TIME_PARSE(\"depaturetime\") AS \"__time\",\n",
- " \"Reporting_Airline\",\n",
- " \"Tail_Number\",\n",
- " \"Distance\",\n",
- " \"Origin\",\n",
- " \"Flight_Number_Reporting_Airline\"\n",
- "FROM \"ext\"\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('example-flights-topn')\n",
- "display.table('example-flights-topn')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "00141575-29b4-440e-b23f-f7c6b237ef28",
- "metadata": {},
- "source": [
- "When this is completed, run the following cell to load some Python libraries we need to explore what TopN does."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f2a19226-6abc-436d-ac3c-9c04d6026707",
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "import matplotlib\n",
- "import matplotlib.pyplot as plt\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f388633f-195b-4381-98cc-7a2f80f48690",
- "metadata": {},
- "source": [
- "## Example TopN style queries\n",
- "\n",
- "Druid looks for patterns in `SELECT` statements to determine if they would benefit from using approximation. A ranking query, like the following, matches the rules for TopN approximation, so Druid enables it by default.\n",
- "\n",
- "For Druid to automatically optimize for TopN, you need an SQL statement that has:\n",
- "* A GROUP BY on one dimension, and\n",
- "* an ORDER BY on one aggregate.\n",
- "\n",
- "Run this query to see what the results are like:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b76e5184-9fe4-4f21-a471-4e15d16515c8",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " \"Reporting_Airline\",\n",
- " COUNT(*) AS Flights,\n",
- " SUM(\"Distance\") AS SumDistance\n",
- "FROM\n",
- " \"example-flights-topn\"\n",
- "GROUP BY 1\n",
- "ORDER BY 2 DESC\n",
- "LIMIT 10\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5600fc48-c999-406f-800b-3f0f6a973aa0",
- "metadata": {},
- "source": [
- "Run the following cell, which uses the `explain_sql` method to show the [`EXPLAIN PLAN`](https://druid.apache.org/docs/latest/querying/sql-translation#interpreting-explain-plan-output) for this query."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7595eec0-a709-4cd6-985e-eec8a6e37b61",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(json.dumps(json.loads(sql_client.explain_sql(sql)['PLAN']), indent=2))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8658b26e-2f09-4a97-96e8-589168130559",
- "metadata": {},
- "source": [
- "The plan `queryType` is `topN`, showing that TopN approximation was used.\n",
- "\n",
- "Druid applied a `LIMIT` operation on the results calculated by each data service involved in the query, improving processing efficiency by minimizing the amount of data transferred to the Broker.\n",
- "\n",
- "This [pushed-down](https://druid.apache.org/docs/latest/querying/groupbyquery#limit-pushdown-optimization) `LIMIT` is the `max` of the `threshold` in the plan (which came from the `LIMIT` in the SQL) and the [`minTopNThreshold`](https://druid.apache.org/docs/latest/querying/topnquery.html#aliasing) setting in your cluster - the default being 1,000.\n",
- "\n",
- "To see the implication of this `LIMIT` in action, the cardinality of the `GROUP BY` dimension therefore needs to exceed this cap.\n",
- "\n",
- "Run the following query to discover the cardinality of the `GROUP BY` on `Reporting_Airline`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "65a968e5-d51e-47e9-af04-88181f3b865b",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COUNT (DISTINCT \"Reporting_Airline\") AS UniqueReportingAirlines\n",
- "FROM \"example-flights-topn\"\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "13c8a101-dcba-49be-8d05-0a5dbd9731ca",
- "metadata": {},
- "source": [
- "The number of unique values is below the `LIMIT` cap, meaning, there is no trimming and the results are not approximate; all the data servers will return all their results, without trimming, to be merged and passed back to us.\n",
- "\n",
- "What is the cardinality for the `Tail_Number` dimension?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "71c1816b-f090-4a3d-b476-8d40eb9c2dec",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " COUNT (DISTINCT \"Tail_Number\") AS UniqueTailNumbers\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Tail_Number\" <> ''\n",
- "'''\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "72a1a858-bda8-464e-988b-c4ed80b63f43",
- "metadata": {},
- "source": [
- "With this many distinct values to `GROUP BY`, the services involved in the query will trim their results when using the\n",
- "`topN` engine.\n",
- "\n",
- "Run the next query to visualize the distribution of unique `Tail_Number`s in the example dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "208f1463-34dd-4b0e-aa78-e582e2133a8f",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " \"Tail_Number\",\n",
- " COUNT(*) AS RecordCount\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Tail_Number\" <> ''\n",
- "GROUP BY 1\n",
- "ORDER BY 2 DESC\n",
- "LIMIT 500\n",
- "'''\n",
- "\n",
- "df4 = pd.DataFrame(sql_client.sql(sql))\n",
- "\n",
- "df4.plot(x='Tail_Number', y='RecordCount', marker='o')\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b16d9a4d-c9e7-447b-8cdb-7ad1c0f88d73",
- "metadata": {},
- "source": [
- "The plot shows that we have a long tail distribution, meaning there is a high likelihood the same `Tail_Number` will be in rank position one across the data set, and therefore across all segments. The flatter the distribution, the less reliable this assertion is.\n",
- "\n",
- "Take a look at the following cell to see a query that counts the number of records and sums total distance for each `Tail_Number`.\n",
- "\n",
- "Run the cell to execute this query in both TopN and non-TopN modes. The first run puts the results into a Dataframe `df1` running `sql_client.sql(sql)` directly. The second uses a crafted `req` object that adds the `useApproximateTopN` query context parameter to turn off approximation, storing the results in `df2`.\n",
- "\n",
- "It then runs a `compare` of `df2` against `df1` using `df3` and prints the results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "71db4746-4e8a-447e-aa58-f4c4ce3d7ffc",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " \"Tail_Number\",\n",
- " COUNT(*) AS \"count\",\n",
- " SUM(Distance) AS \"distance\"\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Tail_Number\" IS NOT NULL\n",
- "GROUP BY 1\n",
- "ORDER BY 3 DESC\n",
- "LIMIT 500\n",
- "'''\n",
- "\n",
- "# Load the results into a pandas DataFrame\n",
- "\n",
- "df1 = pd.DataFrame(sql_client.sql(sql))\n",
- "\n",
- "# Set up a sql_request to turn off TopN approximation\n",
- "\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_context(\"useApproximateTopN\", \"false\")\n",
- "resp = sql_client.sql_query(req)\n",
- "\n",
- "# Load the non-TopN results into a second pandas DataFrame using that request\n",
- "\n",
- "df2 = pd.DataFrame(sql_client.sql_query(req).rows)\n",
- "\n",
- "# Load the compare of df1 to df2 into a new dataframe and print\n",
- "\n",
- "df3 = df1.compare(df2, keep_equal=True)\n",
- "df3"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "78836242-acc8-4403-9e96-2177b96110ed",
- "metadata": {},
- "source": [
- "You can see:\n",
- "\n",
- "* The `self` (df1) and `other` (df2) rank position of each `Tail_Number` in each position\n",
- "* The self / other values for the calculated `count` and `distance`\n",
- "\n",
- "You may notice some `Tail_Number`s are in different positions depending on what the calculated `distance` is: certain data servers returned different sets of results, depending entirely on local data distribution. And some `Tail_Number`s may not appear in the list at all as they drop below the fold the cut-off applied to that specific process.\n",
- "\n",
- "Let's try this with a different dimension, `Flight_Number_Reporting_Airline`. The example dataset has more unique values, but the distribution is much flatter than `Tail_Number`. Run the following cell to see the count and a distribution plot."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a96f924c-9fc1-4000-9a54-7a951db5d2bb",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COUNT(DISTINCT \"Flight_Number_Reporting_Airline\") AS UniqueReportingAirlines\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Flight_Number_Reporting_Airline\" <> ''\n",
- "'''\n",
- "\n",
- "display.sql(sql)\n",
- "\n",
- "sql = '''\n",
- "SELECT \"Flight_Number_Reporting_Airline\", COUNT(*) AS Flights\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Flight_Number_Reporting_Airline\" <> ''\n",
- "GROUP BY 1\n",
- "ORDER BY 2 DESC\n",
- "LIMIT 500\n",
- "'''\n",
- "\n",
- "# Load the results into a pandas DataFrame\n",
- "\n",
- "df5 = pd.DataFrame(sql_client.sql(sql))\n",
- "\n",
- "# Display a plot\n",
- "\n",
- "df5.plot(x='Flight_Number_Reporting_Airline', y='Flights', kind=\"bar\", xticks=[])\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4866091d-e689-4209-8f6e-4edd526646e9",
- "metadata": {},
- "source": [
- "This dimension, unlike `Tail_Number`, has a flatter distribution. Each data process is likely to have a flatter distribution of data, too, meaning the top ranking results are less prominent. The \"voting\" across the servers as to what is in the top is less clear.\n",
- "\n",
- "Run the following cell to repeat the same test we did before, creating two sets of results, and comparing them."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "405f7a9b-ac13-4c13-8e30-42058df4cbce",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- " \"Flight_Number_Reporting_Airline\",\n",
- " AVG(\"Distance\") AS AverageDistance\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Flight_Number_Reporting_Airline\" IS NOT NULL\n",
- "GROUP BY 1\n",
- "ORDER BY 2 DESC\n",
- "LIMIT 10\n",
- "'''\n",
- "\n",
- "# Set up a sql_request to turn off TopN approximation\n",
- "\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_context(\"useApproximateTopN\", \"false\")\n",
- "resp = sql_client.sql_query(req)\n",
- "\n",
- "# Load two pandas DataFrames - one with the TopN and one with the non-TopN results\n",
- "\n",
- "df1 = pd.DataFrame(sql_client.sql(sql))\n",
- "df2 = pd.DataFrame(sql_client.sql_query(req).rows)\n",
- "\n",
- "# Load the compare of df1 to df2 into a new dataframe and print\n",
- "\n",
- "df3 = df1.compare(df2, keep_equal=True)\n",
- "df3"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "989a3e1c-cc8a-47c9-ad15-0b95fa00c7a6",
- "metadata": {},
- "source": [
- "Here, the flatter distribution exaggerates ranking and calculation error. Further issues are caused by the calculation being non-additive.\n",
- "\n",
- "The following cell contains a query that is a good example of TopN being applied: it creates a list of `Tail_Number`s within a particular period of time. Imagine that you might use this list to provide an interactive filter on `Tail_Number` to the end user when they're looking at this specific time period.\n",
- "\n",
- "Run the following cell to show the cardinality of `Tail_Number`s in that period, and then to plot the distribution:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d039c393-96f4-4847-ac60-4414477ebc3b",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT COUNT (DISTINCT \"Tail_Number\") AS UniqueTailNumbers\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Tail_Number\" <> ''\n",
- "AND (TIMESTAMP '2005-11-01' <= \"__time\" AND \"__time\" <= TIMESTAMP '2005-11-14')\n",
- "'''\n",
- "display.sql(sql)\n",
- "\n",
- "sql = '''\n",
- "SELECT\n",
- " \"Tail_Number\",\n",
- " COUNT(*) AS \"Flights\"\n",
- "FROM \"example-flights-topn\"\n",
- "WHERE \"Tail_Number\" <> ''\n",
- "AND (TIMESTAMP '2005-11-01' <= \"__time\" AND \"__time\" <= TIMESTAMP '2005-11-14')\n",
- "GROUP BY 1\n",
- "ORDER BY 2 DESC\n",
- "LIMIT 500\n",
- "'''\n",
- "\n",
- "df4 = pd.DataFrame(sql_client.sql(sql))\n",
- "\n",
- "df4.plot(x='Tail_Number', y='Flights', marker='o')\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "eeed8fa8-d1ce-41b2-955b-88fb0834ab36",
- "metadata": {},
- "source": [
- "This distribution pattern is good for TopN - the highest ranking values are very prominent.\n",
- "\n",
- "Run the following cell to compare the two styles of execution:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d47d2017-1d89-4622-a42c-d86f29a774e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT \"Tail_Number\", COUNT(*) AS \"count\", SUM(Distance) AS \"distance\"\n",
- " FROM \"example-flights-topn\"\n",
- " WHERE \"Tail_Number\" IS NOT NULL\n",
- " AND (TIMESTAMP '2005-11-01' <= \"__time\" AND \"__time\" <= TIMESTAMP '2005-11-14')\n",
- " GROUP BY 1\n",
- " ORDER BY 3 DESC\n",
- " LIMIT 500\n",
- "'''\n",
- "\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_context(\"useApproximateTopN\", \"false\")\n",
- "resp = sql_client.sql_query(req)\n",
- "\n",
- "df1 = pd.DataFrame(sql_client.sql(sql))\n",
- "df2 = pd.DataFrame(sql_client.sql_query(req).rows)\n",
- "\n",
- "df3 = df1.compare(df2, keep_equal=True)\n",
- "df3"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f58a1846-5072-4495-b840-a620de3c0442",
- "metadata": {},
- "source": [
- "The distribution, together with our filters, means that these results are useful for this kind of interactive UI element."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b43cd060-429e-4e84-b559-ad63624e7c14",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "The speed boost we receive through TopN, at the expense of some accuracy, makes it useful for interactive elements like filters or initial lists of results that people will then deep dive into.\n",
- "\n",
- "* TopN is the default execution model for `GROUP BY` queries with one dimension, an `ORDER BY` and a `LIMIT` clause\n",
- "* You can turn TopN off with a query context parameter\n",
- "* Accuracy is highly dependent on distribution of the data, after filters etc., across the database"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d08b8804-1051-4d38-88e7-2be1776934eb",
- "metadata": {},
- "source": [
- "## Learn more\n",
- "\n",
- "Read the following documentation for more information:\n",
- "\n",
- "* [TopN queries](https://druid.apache.org/docs/latest/querying/topnquery)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/03-query/04-UnionOperations.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/03-query/04-UnionOperations.ipynb
deleted file mode 100644
index 69fe16eafb6f..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/03-query/04-UnionOperations.ipynb
+++ /dev/null
@@ -1,509 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "557e06e8-9b35-4b34-8322-8a8ede6de709",
- "metadata": {},
- "source": [
- "# Using `UNION ALL` to address multiple `TABLE`s in the same query\n",
- "\n",
- "\n",
- " \n",
- "While working with Druid, you may need to bring together two different tables of results together into a single result list, or to treat multiple tables as a single input to a query. This notebook introduces the `UNION ALL` operator, walking through two ways in which this operator can be used to achieve this result: top-level and table-level `UNION ALL`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cf4554ae-6516-4e76-b202-d6e2fdf31603",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 26.0.0 or later.\n",
- "\n",
- "#### Run using Docker\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- " \n",
- "#### Run Druid without Docker\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Druid instance, with a `DRUID_HOST` local environment variable containing the servername of your Druid router\n",
- "* [druidapi](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/druidapi/README.md), a Python client for Apache Druid. Follow the instructions in the Install section of the README file."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ee0c3171-def8-4ad9-9c56-d3a67f309631",
- "metadata": {},
- "source": [
- "### Initialization\n",
- "\n",
- "Run the next cell to attempt a connection to Druid services. If successful, the output shows the Druid version number."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9fa4abfe-f878-4031-88f2-94c13e922279",
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "\n",
- "if 'DRUID_HOST' not in os.environ.keys():\n",
- " druid_host=f\"http://localhost:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- " \n",
- "print(f\"Opening a connection to {druid_host}.\")\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "\n",
- "display = druid.display\n",
- "sql_client = druid.sql\n",
- "status_client = druid.status\n",
- "\n",
- "status_client.version"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fc3001a0-27e5-4f41-876a-ce6eab2acd6a",
- "metadata": {},
- "source": [
- "Finally, run the following cell to import the Python JSON module."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6b058d8b-2bae-4929-ab0c-5a6df1850387",
- "metadata": {},
- "outputs": [],
- "source": [
- "import json"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f388633f-195b-4381-98cc-7a2f80f48690",
- "metadata": {},
- "source": [
- "## Using Top-level `UNION ALL` to concatenate result sets\n",
- "\n",
- "Run the following cell to ingest the wikipedia data example. Once completed, you will see a description of the new table.\n",
- "\n",
- "You can optionally monitor the ingestion in the Druid console while it runs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a399196b-12db-42ff-ae24-c7232f150aba",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "REPLACE INTO \"example-wikipedia-unionall\" OVERWRITE ALL\n",
- "WITH \"ext\" AS (SELECT *\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
- " '{\"type\":\"json\"}'\n",
- " )\n",
- ") EXTEND (\"isRobot\" VARCHAR, \"channel\" VARCHAR, \"timestamp\" VARCHAR, \"flags\" VARCHAR, \"isUnpatrolled\" VARCHAR, \"page\" VARCHAR, \"diffUrl\" VARCHAR, \"added\" BIGINT, \"comment\" VARCHAR, \"commentLength\" BIGINT, \"isNew\" VARCHAR, \"isMinor\" VARCHAR, \"delta\" BIGINT, \"isAnonymous\" VARCHAR, \"user\" VARCHAR, \"deltaBucket\" BIGINT, \"deleted\" BIGINT, \"namespace\" VARCHAR, \"cityName\" VARCHAR, \"countryName\" VARCHAR, \"regionIsoCode\" VARCHAR, \"metroCode\" BIGINT, \"countryIsoCode\" VARCHAR, \"regionName\" VARCHAR))\n",
- "SELECT\n",
- " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " \"isRobot\",\n",
- " \"channel\",\n",
- " \"page\",\n",
- " \"commentLength\",\n",
- " \"countryName\",\n",
- " \"user\"\n",
- "FROM \"ext\"\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('example-wikipedia-unionall')\n",
- "display.table('example-wikipedia-unionall')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "24b47cc3-68f5-4a73-b374-94bbfa32d91d",
- "metadata": {},
- "source": [
- "You can use `UNION ALL` to append the results of one query with another.\n",
- "\n",
- "The first query in the cell below, `set1`, returns the ten first edits to any \"fr\"-like `channel` between midday and 1pm on the 27th June 2016. The second query repeats this but for any \"en\"-like `channel`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b76e5184-9fe4-4f21-a471-4e15d16515c8",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "WITH\n",
- "set1 AS (\n",
- " SELECT\n",
- " __time,\n",
- " \"channel\",\n",
- " \"page\",\n",
- " \"isRobot\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE DATE_TRUNC('HOUR', __time) = TIMESTAMP '2016-06-27 12:00:00'\n",
- " AND channel LIKE '#fr%'\n",
- " ORDER BY __time\n",
- " LIMIT 10\n",
- " ),\n",
- "set2 AS (\n",
- " SELECT\n",
- " __time,\n",
- " \"channel\",\n",
- " \"page\",\n",
- " \"isRobot\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE DATE_TRUNC('HOUR', __time) = TIMESTAMP '2016-06-27 12:00:00'\n",
- " AND channel LIKE '#en%'\n",
- " ORDER BY __time\n",
- " LIMIT 10\n",
- " )\n",
- " \n",
- "SELECT * from set1\n",
- "UNION ALL\n",
- "SELECT * from set2\n",
- "'''\n",
- "\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f5e77fa9-a60c-4955-b763-58d970d7326d",
- "metadata": {},
- "source": [
- "This is a [top-level](https://druid.apache.org/docs/latest/querying/sql.html#top-level) `UNION` operation. First, Druid calculated `set1` and appended subsequent results sets.\n",
- "\n",
- "Notice that these results are not in order by time – even though the individual sets did `ORDER BY` time. Druid simply concatenated the two result sets together.\n",
- "\n",
- "Optionally, run the next cell to show the precise [`EXPLAIN PLAN`](https://druid.apache.org/docs/latest/querying/sql-translation#interpreting-explain-plan-output) for the query. You can see there are two `query` execution plans, one for each subquery. Also, Druid's planning process optimized execution of the outer query."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "97934da2-17d1-4c91-8ae3-926cc89185c1",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(json.dumps(json.loads(sql_client.explain_sql(sql)['PLAN']), indent=2))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "800add1a-d459-4796-b974-b2f094db417f",
- "metadata": {},
- "source": [
- "Run next cell to perform another top-level UNION ALL, this time where the sets use `GROUP BY`.\n",
- "\n",
- "Notice that the aggregates have `AS` to set specific field names."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8e687466-74bb-4cc0-ba17-913d1807fc60",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "WITH\n",
- "set1 AS (\n",
- " SELECT\n",
- " TIME_FLOOR(__time, 'PT1H') AS \"Period\",\n",
- " countryName,\n",
- " AVG(commentLength) AS \"Average Comment Size\",\n",
- " COUNT(DISTINCT \"page\") AS \"Pages\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE countryName='China'\n",
- " GROUP BY 1, 2\n",
- " LIMIT 10\n",
- " ),\n",
- "set2 AS (\n",
- " SELECT\n",
- " TIME_FLOOR(__time, 'PT1H') AS \"Episode\",\n",
- " countryName,\n",
- " COUNT(DISTINCT \"page\") AS \"Pages\",\n",
- " AVG(commentLength) AS \"Average Comment Length\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE countryName='Austria'\n",
- " GROUP BY 1, 2\n",
- " LIMIT 10\n",
- " )\n",
- "\n",
- "SELECT * from set1\n",
- "UNION ALL\n",
- "SELECT * from set2\n",
- "'''\n",
- "\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f2c95ffc-b260-4671-bacc-c8cc3137e9c2",
- "metadata": {},
- "source": [
- "Look carefully at these results - Druid has simply appended the results from `set2` to `set1` without introducing redundant columns.\n",
- "\n",
- "* Column name in `set2` (`Period` versus `Episode` and `Average Comment Size` versus `Average Comment Length`) did not result in new columns\n",
- "* Columns with the same name (`Pages`) did not result in that aggregate being put into same column - Austria's values are simply appended `Average Comment Size`\n",
- "\n",
- "Run the next cell, which uses explicit column names at the top-level, rather than `*`, to ensure the calculations appear in the right columns in the final result. It also aliases the columns for the results by using `AS`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "39f9be92-7b2e-417c-b16a-5060b8cd2c30",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "WITH\n",
- "set1 AS (\n",
- " SELECT\n",
- " TIME_FLOOR(__time, 'PT1H') AS \"Period\",\n",
- " countryName,\n",
- " AVG(commentLength) AS \"Average Comment Size\",\n",
- " COUNT(DISTINCT \"page\") AS \"Pages\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE countryName='China'\n",
- " GROUP BY 1, 2\n",
- " LIMIT 10\n",
- " ),\n",
- "set2 AS (\n",
- " SELECT\n",
- " TIME_FLOOR(__time, 'PT1H') AS \"Episode\",\n",
- " countryName,\n",
- " COUNT(DISTINCT \"page\") AS \"Pages\",\n",
- " AVG(commentLength) AS \"Average Comment Length\"\n",
- " FROM \"example-wikipedia-unionall\"\n",
- " WHERE countryName='Austria'\n",
- " GROUP BY 1, 2\n",
- " LIMIT 10\n",
- " )\n",
- "\n",
- "SELECT \"Period\", \"countryName\" AS \"Country\", \"Average Comment Size\" AS \"Edit Size\", \"Pages\" AS \"Unique Pages\" from set1\n",
- "UNION ALL\n",
- "SELECT \"Episode\", \"countryName\", \"Average Comment Length\", \"Pages\" from set2\n",
- "'''\n",
- "\n",
- "display.sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "25001794-e1a7-4325-adb3-2b8f26036261",
- "metadata": {},
- "source": [
- "## Using Table-level `UNION ALL` to work with multiple tables\n",
- "\n",
- "From one source of data, data engineers may create multiple `TABLE` datasources in order to:\n",
- "\n",
- "* Separate data with different levels of `__time` granularity (ie. the level of summarisation),\n",
- "* Apply different security to different parts, for example, per tenant,\n",
- "* Break up the data using filtering at ingestion time, for example, different tables for different HTTP error codes,\n",
- "* Separate upstream data by the source device or system, for example, different types of IOT device,\n",
- "* Isolate different periods of time, perhaps with different retention periods.\n",
- "\n",
- "You can use `UNION ALL` to access _all_ the source data, referencing all the `TABLE` datasources through a sub-query or a `FROM` clause.\n",
- "\n",
- "The next two cells create two new tables, `example-wikipedia-unionall-en` and `example-wikipedia-unionall-fr`. `example-wikipedia-unionall-en` contains only data for English language channel edits, while `example-wikipedia-unionall-fr` contains only French channels.\n",
- "\n",
- "Run the next two cells, monitoring the ingestion in the Druid Console as they run."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "269c6aef-c3a5-46ad-8332-30b7bf30ddfb",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "REPLACE INTO \"example-wikipedia-unionall-en\" OVERWRITE ALL\n",
- "WITH \"ext\" AS (SELECT *\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
- " '{\"type\":\"json\"}'\n",
- " )\n",
- ") EXTEND (\"isRobot\" VARCHAR, \"channel\" VARCHAR, \"timestamp\" VARCHAR, \"flags\" VARCHAR, \"isUnpatrolled\" VARCHAR, \"page\" VARCHAR, \"diffUrl\" VARCHAR, \"added\" BIGINT, \"comment\" VARCHAR, \"commentLength\" BIGINT, \"isNew\" VARCHAR, \"isMinor\" VARCHAR, \"delta\" BIGINT, \"isAnonymous\" VARCHAR, \"user\" VARCHAR, \"deltaBucket\" BIGINT, \"deleted\" BIGINT, \"namespace\" VARCHAR, \"cityName\" VARCHAR, \"countryName\" VARCHAR, \"regionIsoCode\" VARCHAR, \"metroCode\" BIGINT, \"countryIsoCode\" VARCHAR, \"regionName\" VARCHAR))\n",
- "SELECT\n",
- " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " \"isRobot\",\n",
- " \"channel\",\n",
- " \"page\",\n",
- " \"commentLength\",\n",
- " \"countryName\",\n",
- " \"user\"\n",
- "FROM \"ext\"\n",
- "WHERE \"channel\" LIKE '#en%'\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('example-wikipedia-unionall-en')\n",
- "display.table('example-wikipedia-unionall-en')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "61740d61-28fc-48e9-b026-d472bd04f390",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql='''\n",
- "REPLACE INTO \"example-wikipedia-unionall-fr\" OVERWRITE ALL\n",
- "WITH \"ext\" AS (SELECT *\n",
- "FROM TABLE(\n",
- " EXTERN(\n",
- " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
- " '{\"type\":\"json\"}'\n",
- " )\n",
- ") EXTEND (\"isRobot\" VARCHAR, \"channel\" VARCHAR, \"timestamp\" VARCHAR, \"flags\" VARCHAR, \"isUnpatrolled\" VARCHAR, \"page\" VARCHAR, \"diffUrl\" VARCHAR, \"added\" BIGINT, \"comment\" VARCHAR, \"commentLength\" BIGINT, \"isNew\" VARCHAR, \"isMinor\" VARCHAR, \"delta\" BIGINT, \"isAnonymous\" VARCHAR, \"user\" VARCHAR, \"deltaBucket\" BIGINT, \"deleted\" BIGINT, \"namespace\" VARCHAR, \"cityName\" VARCHAR, \"countryName\" VARCHAR, \"regionIsoCode\" VARCHAR, \"metroCode\" BIGINT, \"countryIsoCode\" VARCHAR, \"regionName\" VARCHAR))\n",
- "SELECT\n",
- " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
- " \"isRobot\",\n",
- " \"channel\",\n",
- " \"page\",\n",
- " \"commentLength\",\n",
- " \"countryName\",\n",
- " \"user\"\n",
- "FROM \"ext\"\n",
- "WHERE \"channel\" LIKE '#fr%'\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('example-wikipedia-unionall-fr')\n",
- "display.table('example-wikipedia-unionall-fr')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f8bbf2c6-681a-46f5-82f2-201cbbe8058d",
- "metadata": {},
- "source": [
- "The next cell uses `UNION ALL` in a `WITH` statement that creates `unifiedSource`. This will be a unified source of data for both tables that can then be used in a `SELECT` query.\n",
- "\n",
- "Druid executes these \"[top level](https://druid.apache.org/docs/26.0.0/querying/sql.html#top-level)\" `UNION ALL` queries differently to \"[table level](https://druid.apache.org/docs/26.0.0/querying/sql.html#table-level)\" queries you have used so far. Table level `UNION ALL` makes use of `union` datasources, and it's important that you read the [documentation](https://druid.apache.org/docs/26.0.0/querying/datasource.html#union) to understand the functionality available to you. Operations such as filtering, for example, can only be done in the outer `SELECT` statement on `unifiedSource` in the sample query below. \n",
- "\n",
- "Run the following cell to count the number of robot and non-robot edits by channel across both sets."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "783fe77d-2e7b-476a-9748-67ea90c8bb91",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "WITH unifiedSource AS (\n",
- " SELECT\n",
- " \"__time\",\n",
- " \"isRobot\",\n",
- " \"channel\",\n",
- " \"user\",\n",
- " \"countryName\"\n",
- " FROM \"example-wikipedia-unionall-en\"\n",
- " UNION ALL\n",
- " SELECT\n",
- " \"__time\",\n",
- " \"isRobot\",\n",
- " \"channel\",\n",
- " \"user\",\n",
- " \"countryName\"\n",
- " FROM \"example-wikipedia-unionall-fr\"\n",
- " )\n",
- "\n",
- "SELECT\n",
- " \"channel\",\n",
- " COUNT(*) FILTER (WHERE isRobot=true) AS \"Robot Edits\",\n",
- " COUNT (DISTINCT user) FILTER (WHERE isRobot=true) AS \"Robot Editors\",\n",
- " COUNT(*) FILTER (WHERE isRobot=false) AS \"Human Edits\",\n",
- " COUNT (DISTINCT user) FILTER (WHERE isRobot=false) AS \"Human Editors\"\n",
- "FROM unifiedSource\n",
- "GROUP BY 1\n",
- "'''\n",
- "\n",
- "display.sql(sql)"
- ]
- },
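- {
- "cell_type": "markdown",
- "id": "1a2b3c4d-55ee-4f6a-9b8c-7d6e5f4a3b01",
- "metadata": {},
- "source": [
- "As mentioned above, you can write the same table-level union directly in the `FROM` clause instead of a `WITH` statement. The next cell is an equivalent sketch of the previous query in that form; the inner terms are plain `SELECT * FROM` table references, matching the union datasource form described in the documentation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1a2b3c4d-55ee-4f6a-9b8c-7d6e5f4a3b02",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT\n",
- "  \"channel\",\n",
- "  COUNT(*) FILTER (WHERE isRobot=true) AS \"Robot Edits\",\n",
- "  COUNT (DISTINCT user) FILTER (WHERE isRobot=true) AS \"Robot Editors\",\n",
- "  COUNT(*) FILTER (WHERE isRobot=false) AS \"Human Edits\",\n",
- "  COUNT (DISTINCT user) FILTER (WHERE isRobot=false) AS \"Human Editors\"\n",
- "FROM (\n",
- "  SELECT * FROM \"example-wikipedia-unionall-en\"\n",
- "  UNION ALL\n",
- "  SELECT * FROM \"example-wikipedia-unionall-fr\"\n",
- ")\n",
- "GROUP BY 1\n",
- "'''\n",
- "\n",
- "display.sql(sql)"
- ]
- },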
- {
- "cell_type": "markdown",
- "id": "f58a1846-5072-4495-b840-a620de3c0442",
- "metadata": {},
- "source": [
- "## Conclusion\n",
- "\n",
- "* There are two modes for `UNION ALL` in Druid - top level and table level\n",
- "* Top level is a simple concatenation, and operations must be done on the source `TABLE`s\n",
- "* Table level uses a `union` data source, and operations must be done on the outer `SELECT`\n",
- "\n",
- "## Learn more\n",
- "\n",
- "* Watch [Plan your Druid table datasources](https://youtu.be/OpYDX4RYLV0?list=PLDZysOZKycN7MZvNxQk_6RbwSJqjSrsNR) by Peter Marshall\n",
- "* Read about [union](https://druid.apache.org/docs/26.0.0/querying/datasource.html#union) datasources in the documentation\n",
- "* Read the latest [documentation](https://druid.apache.org/docs/26.0.0/querying/sql.html#union-all) on the `UNION ALL` operator"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/04-api/00-getting-started.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/04-api/00-getting-started.ipynb
deleted file mode 100644
index c703fc5fcfa4..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/04-api/00-getting-started.ipynb
+++ /dev/null
@@ -1,719 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "ad4e60b6",
- "metadata": {
- "tags": []
- },
- "source": [
- "# Learn the basics of the Druid API\n",
- "\n",
- "\n",
- " \n",
- "This tutorial introduces you to the basics of the Druid API and some of the endpoints you might use frequently to perform tasks, such as the following:\n",
- "\n",
- "- Checking if your cluster is up\n",
- "- Ingesting data\n",
- "- Querying data\n",
- "\n",
- "Different [Druid server types](https://druid.apache.org/docs/latest/design/processes.html#server-types) are responsible for handling different APIs for the Druid services. For example, the Overlord service on the Master server provides the status of a task. You'll also interact the Broker service on the Query Server to see what datasources are available. And to run queries, you'll interact with the Broker. The Router service on the Query servers routes API calls.\n",
- "\n",
- "For more information, see the [API reference](https://druid.apache.org/docs/latest/api-reference/api-reference.html), which is organized by server type.\n",
- "\n",
- "For work within other notebooks, prefer to use the [Python API](Python_API_Tutorial.ipynb) which is a notebook-friendly wrapper around the low-level API calls shown here.\n",
- "\n",
- "## Table of contents\n",
- "\n",
- "- [Prerequisites](#Prerequisites)\n",
- "- [Get basic cluster information](#Get-basic-cluster-information)\n",
- "- [Ingest data](#Ingest-data)\n",
- "- [Query your data](#Query-your-data)\n",
- "- [Learn more](#Learn-more)\n",
- "\n",
- "For the best experience, use JupyterLab so that you can always access the table of contents."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8d6bbbcb",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
- "\n",
- "\n",
- "Launch this tutorial and all prerequisites using the `druid-jupyter` or `all-services` profiles of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Druid instance.
\n",
- " Update the `druid_host` variable to point to your Router endpoint. For example:\n",
- " ```\n",
- " druid_host = \"http://localhost:8888\"`\n",
- " ```\n",
- " "
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "12c0e1c3",
- "metadata": {},
- "source": [
- "To start this tutorial, run the next cell. It imports the Python packages you'll need and defines a variable for the the Druid host, where the Router service listens.\n",
- "\n",
- "`druid_host` is the hostname and port for your Druid deployment. In a distributed environment, you can point to other Druid services. In this tutorial, you'll use the Router service as the `druid_host`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b7f08a52",
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "\n",
- "# druid_host is the hostname and port for your Druid deployment. \n",
- "# In the Docker Compose tutorial environment, this is the Router\n",
- "# service running at \"http://router:8888\".\n",
- "# If you are not using the Docker Compose environment, edit the `druid_host`.\n",
- "\n",
- "druid_host = \"http://router:8888\"\n",
- "druid_host"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "a22c69c8",
- "metadata": {},
- "source": [
- "If your cluster is secure, you'll need to provide authorization information on each request. You can automate it by using the Requests `session` feature. Although this tutorial assumes no authorization, the configuration below defines a session as an example."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "06ea91de",
- "metadata": {},
- "outputs": [],
- "source": [
- "session = requests.Session()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2093ecf0-fb4b-405b-a216-094583580e0a",
- "metadata": {},
- "source": [
- "In the rest of this tutorial, the `endpoint` and other variables are updated in code cells to call a different Druid endpoint to accomplish a task."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "29c24856",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Get basic cluster information\n",
- "\n",
- "In this cell, you'll use the `GET /status` endpoint to return basic information about your cluster, such as the Druid version, loaded extensions, and resource consumption.\n",
- "\n",
- "The following cell sets `endpoint` to `/status` and updates the HTTP method to `GET`. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8a1b453e",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "endpoint = druid_host + '/status'\n",
- "endpoint"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "1e853795",
- "metadata": {},
- "source": [
- "The Requests `Session` has a `get()` method that posts an HTTP `GET` request. The method takes multiple arguments. Here you only need the URL. The method returns a Requests `Response` object, which can convert the returned JSON result to Python. JSON objects become Python dictionaries. JSON arrays become Python arrays. When you run the cell, you should receive a response that starts with the version number of your Druid deployment."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "baa140b8",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "response = session.get(endpoint)\n",
- "json = response.json()\n",
- "json"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "de82029e",
- "metadata": {},
- "source": [
- "Because the JSON result is converted to Python, you can use Python to pull out the information you want. For example, to see just the version:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "64e83cec",
- "metadata": {},
- "outputs": [],
- "source": [
- "json['version']"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "cbeb5a63",
- "metadata": {
- "tags": []
- },
- "source": [
- "### Get cluster health\n",
- "\n",
- "The `/status/health` endpoint returns JSON `true` if your cluster is up and running. It's useful if you want to do something like programmatically check if your cluster is available. When you run the following cell, you should receive the `True` Python value if your Druid cluster has finished starting up and is running."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "21121c13",
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint = druid_host + '/status/health'\n",
- "endpoint"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5e51170e",
- "metadata": {},
- "outputs": [],
- "source": [
- "is_healthy = session.get(endpoint).json()\n",
- "is_healthy"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "1917aace",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Ingest data\n",
- "\n",
- "Now that you've confirmed that your cluster is up and running, you can start ingesting data. There are different ways to ingest data based on what your needs are. For more information, see [Ingestion methods](https://druid.apache.org/docs/latest/ingestion/index.html#ingestion-methods).\n",
- "\n",
- "This tutorial uses the multi-stage query (MSQ) task engine and its `sql/task` endpoint to perform SQL-based ingestion.\n",
- "\n",
- "To learn more about SQL-based ingestion, see [SQL-based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html). For information about the endpoint specifically, see [SQL-based ingestion and multi-stage query task API](https://druid.apache.org/docs/latest/api-reference/sql-ingestion-api.html)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "994ba704",
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint = druid_host + '/druid/v2/sql/task'\n",
- "endpoint"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2168db3a",
- "metadata": {
- "tags": []
- },
- "source": [
- "The example uses INSERT, but you could also use REPLACE INTO. In fact, if you have an existing datasource with the name `wikipedia_api`, you need to use REPLACE INTO instead. \n",
- "\n",
- "The `/sql/task` endpoint accepts [SQL requests in the JSON-over-HTTP format](https://druid.apache.org/docs/latest/api-reference/sql-api.html#request-body) using the `query`, `context`, and `parameters` fields.\n",
- "\n",
- "The query inserts data from an external source into a table named `wikipedia_api`. \n",
- "\n",
- "Before you ingest the data, take a look at the query. Pay attention to two parts of it, `__time` and `PARTITIONED BY`, which relate to how Druid partitions data:\n",
- "\n",
- "- **`__time`**\n",
- "\n",
- " The `__time` column is a key concept for Druid. It's the default partition for Druid and is treated as the primary timestamp. Use it to help you write faster and more efficient queries. Big datasets, such as those for event data, typically have a time component. This means that instead of writing a query using only `COUNT`, you can combine that with `WHERE __time` to return results much more quickly.\n",
- "\n",
- "- **`PARTITIONED BY DAY`**\n",
- "\n",
- " If you partition by day, Druid creates segment files within the partition based on the day. You can only replace, delete and update data at the partition level. So when you're deciding how to partition data, make the partition large enough (min 500,000 rows) for good performance but not so big that those operations become impractical to run.\n",
- "\n",
- "To learn more, see [Partitioning](https://druid.apache.org/docs/latest/ingestion/partitioning.html).\n",
- "\n",
- "The query uses `INSERT INTO`. If you have an existing datasource with the name `wikipedia_api`, use `REPLACE INTO` instead."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "90c34908",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "INSERT INTO wikipedia_api \n",
- "SELECT \n",
- " TIME_PARSE(\"timestamp\") AS __time,\n",
- " * \n",
- "FROM TABLE(EXTERN(\n",
- " '{\"type\": \"http\", \"uris\": [\"https://druid.apache.org/data/wikipedia.json.gz\"]}', \n",
- " '{\"type\": \"json\"}', \n",
- " '[{\"name\": \"added\", \"type\": \"long\"}, {\"name\": \"channel\", \"type\": \"string\"}, {\"name\": \"cityName\", \"type\": \"string\"}, {\"name\": \"comment\", \"type\": \"string\"}, {\"name\": \"commentLength\", \"type\": \"long\"}, {\"name\": \"countryIsoCode\", \"type\": \"string\"}, {\"name\": \"countryName\", \"type\": \"string\"}, {\"name\": \"deleted\", \"type\": \"long\"}, {\"name\": \"delta\", \"type\": \"long\"}, {\"name\": \"deltaBucket\", \"type\": \"string\"}, {\"name\": \"diffUrl\", \"type\": \"string\"}, {\"name\": \"flags\", \"type\": \"string\"}, {\"name\": \"isAnonymous\", \"type\": \"string\"}, {\"name\": \"isMinor\", \"type\": \"string\"}, {\"name\": \"isNew\", \"type\": \"string\"}, {\"name\": \"isRobot\", \"type\": \"string\"}, {\"name\": \"isUnpatrolled\", \"type\": \"string\"}, {\"name\": \"metroCode\", \"type\": \"string\"}, {\"name\": \"namespace\", \"type\": \"string\"}, {\"name\": \"page\", \"type\": \"string\"}, {\"name\": \"regionIsoCode\", \"type\": \"string\"}, {\"name\": \"regionName\", \"type\": \"string\"}, {\"name\": \"timestamp\", \"type\": \"string\"}, {\"name\": \"user\", \"type\": \"string\"}]'\n",
- " ))\n",
- "PARTITIONED BY DAY\n",
- "'''"
- ]
- },
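- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "2b3c4d5e",
- "metadata": {},
- "source": [
- "For reference, here is a sketch of the `REPLACE` form of the same ingestion. `REPLACE` also requires an `OVERWRITE` clause; the `EXTERN` arguments are unchanged, so they are elided here:\n",
- "\n",
- "```python\n",
- "sql = '''\n",
- "REPLACE INTO wikipedia_api\n",
- "OVERWRITE ALL\n",
- "SELECT\n",
- "  TIME_PARSE(\"timestamp\") AS __time,\n",
- "  *\n",
- "FROM TABLE(EXTERN(...))  -- same EXTERN arguments as in the INSERT query\n",
- "PARTITIONED BY DAY\n",
- "'''\n",
- "```"
- ]
- },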
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f7dcddd7",
- "metadata": {},
- "source": [
- "The query is included inline here. You can also store it in a file and provide the file.\n",
- "\n",
- "The above showed how the Requests library can convert the response from JSON to Python. Requests can also convert the request from Python to JSON. The next cell builds up a Python map that represents the Druid `SqlRequest` object. In this case, you need the query and a context variable to set the task count to 3."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b6e82c0a",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql_request = {\n",
- " 'query': sql,\n",
- " 'context': {\n",
- " 'maxNumTasks': 3\n",
- " }\n",
- "}"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "6aa7f230",
- "metadata": {},
- "source": [
- "With the SQL request ready, use the the `json` parameter to the `Session` `post` method to send a `POST` request with the `sql_request` object as the payload. The result is a Requests `Response` which is saved in a variable.\n",
- "\n",
- "Now, run the next cell to start the ingestion."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e2939a07",
- "metadata": {},
- "outputs": [],
- "source": [
- "response = session.post(endpoint, json=sql_request)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9ba1821f",
- "metadata": {
- "tags": []
- },
- "source": [
- "The MSQ task engine uses a task to ingest data. The response for the API includes a `taskId` and `state` for your ingestion. You can use this `taskId` to reference this task later on to get more information about it."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f9cc2e45",
- "metadata": {},
- "source": [
- "It is good practice to ensure that the response succeeded by checking the return status. The status should be 20x. (202 means \"accepted\".) If the response is something else, such as 4xx, display `response.text` to see the error message."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "01d875f7",
- "metadata": {},
- "outputs": [],
- "source": [
- "response.status_code"
- ]
- },
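- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "3c4d5e6f",
- "metadata": {},
- "source": [
- "As a more defensive sketch, you can let Requests do the checking for you: `response.ok` is `True` for status codes below 400, and `raise_for_status()` raises an exception for 4xx and 5xx responses:\n",
- "\n",
- "```python\n",
- "if not response.ok:\n",
- "    print(response.text)  # show the error message from Druid\n",
- "response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx\n",
- "```"
- ]
- },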
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "a983270d",
- "metadata": {},
- "source": [
- "Convert the JSON response to a Python object."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "95eeb9bf",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "json = response.json()\n",
- "json"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "9f9c1b6b",
- "metadata": {},
- "source": [
- "Extract the taskId value from the taskId_response variable so that you can reference it later:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2eeb3e3f",
- "metadata": {},
- "outputs": [],
- "source": [
- "ingestion_taskId = json['taskId']\n",
- "ingestion_taskId"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f17892d9-a8c1-43d6-890c-7d68cd792c72",
- "metadata": {
- "tags": []
- },
- "source": [
- "### Get the status of your task\n",
- "\n",
- "The following cell shows you how to get the status of your ingestion task. The example continues to run API calls against the endpoint to fetch the status until the ingestion task completes. When it's done, you'll see the JSON response.\n",
- "\n",
- "You can see basic information about your query, such as when it started and whether or not it's finished.\n",
- "\n",
- "In addition to the status, you can retrieve a full report about it if you want using `GET /druid/indexer/v1/task/TASK_ID/reports`. But you won't need that information for this tutorial."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f7b65866",
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint = druid_host + f\"/druid/indexer/v1/task/{ingestion_taskId}/status\"\n",
- "endpoint"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "22b08321",
- "metadata": {},
- "outputs": [],
- "source": [
- "json = session.get(endpoint).json()\n",
- "json"
- ]
- },
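- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "4d5e6f70",
- "metadata": {},
- "source": [
- "Though this tutorial doesn't need it, the full task report mentioned above is one more `GET` away. A sketch:\n",
- "\n",
- "```python\n",
- "report_endpoint = druid_host + f\"/druid/indexer/v1/task/{ingestion_taskId}/reports\"\n",
- "report = session.get(report_endpoint).json()\n",
- "```"
- ]
- },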
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "74a9b593",
- "metadata": {},
- "source": [
- "Wait until your ingestion completes before proceeding. Depending on what else is happening in your Druid cluster and the resources available, ingestion can take some time. Pro tip: an asterisk appears next to the cell while Python runs and waits for the task."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aae5c228",
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "\n",
- "ingestion_status = json['status']['status']\n",
- "\n",
- "if ingestion_status == \"RUNNING\":\n",
- " print(\"The ingestion is running...\")\n",
- "\n",
- "while ingestion_status != \"SUCCESS\":\n",
- " time.sleep(5) # 5 seconds \n",
- " json = session.get(endpoint).json()\n",
- " ingestion_status = json['status']['status']\n",
- " \n",
- "if ingestion_status == \"SUCCESS\": \n",
- " print(\"The ingestion is complete\")\n",
- "else:\n",
- " print(\"The ingestion task failed:\", json)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "3b55af57-9c79-4e45-a22c-438c1b94112e",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Query your data\n",
- "\n",
- "When you ingest data into Druid, Druid stores the data in a datasource, and this datasource is what you run queries against.\n",
- "\n",
- "### List your datasources\n",
- "\n",
- "You can get a list of datasources from the `/druid/coordinator/v1/datasources` endpoint. Since you're just getting started, there should only be a single datasource, the `wikipedia_api` table you created earlier when you ingested external data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a482c1f0",
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint = druid_host + '/druid/coordinator/v1/datasources'\n",
- "endpoint"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a0a7799e",
- "metadata": {},
- "outputs": [],
- "source": [
- "session.get(endpoint).json()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "622f2158-75c9-4b12-bd8a-c92d30994c1f",
- "metadata": {
- "tags": []
- },
- "source": [
- "### SELECT data\n",
- "\n",
- "Now, you can query the data. Because this tutorial is running in Jupyter, make sure to limit the size of your query results using `LIMIT`. For example, the following cell selects all columns but limits the results to 3 rows for display purposes because each row is a JSON object. In actual use cases, you'll want to only select the rows that you need. For more information about the kinds of things you can do, see [Druid SQL](https://druid.apache.org/docs/latest/querying/sql.html).\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "44868ff9",
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint = druid_host + '/druid/v2/sql'\n",
- "endpoint"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "b2b366ad",
- "metadata": {},
- "source": [
- "As for ingestion, define a query, then create a `SQLRequest` object as a Python `dict`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b7c77093",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT * \n",
- "FROM wikipedia_api \n",
- "LIMIT 3\n",
- "'''\n",
- "sql_request = {'query': sql}"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "0df529d6",
- "metadata": {},
- "source": [
- "Now run the query. The result is an array of JSON objects. Each JSON object in the response represents a row in the `wikipedia_api` datasource."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ac4f95b5",
- "metadata": {},
- "outputs": [],
- "source": [
- "json = session.post(endpoint, json=sql_request).json()\n",
- "json"
- ]
- },
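- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "5e6f7081",
- "metadata": {},
- "source": [
- "Because the result is a list of dictionaries, it drops straight into a pandas DataFrame if you prefer a tabular view. A sketch, assuming pandas is installed in your notebook environment:\n",
- "\n",
- "```python\n",
- "import pandas as pd\n",
- "\n",
- "pd.DataFrame(json)  # one row per JSON object in the response\n",
- "```"
- ]
- },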
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "950b2cc4-9935-497d-a3f5-e89afcc85965",
- "metadata": {
- "tags": []
- },
- "source": [
- "In addition to the query, there are a few additional things you can define within the payload. For a full list, see [Druid SQL API](https://druid.apache.org/docs/latest/querying/sql-api.html)\n",
- "\n",
- "This tutorial uses a context parameter and a dynamic parameter.\n",
- "\n",
- "Context parameters can control certain characteristics related to a query, such as configuring a custom timeout. For information, see [Context parameters](https://druid.apache.org/docs/latest/querying/query-context.html). In the example query that follows, the context block assigns a custom `sqlQueryID` to the query. Typically, the `sqlQueryId` is autogenerated. With a custom ID, you can use it to reference the query more easily. For example, if you need to cancel a query.\n",
- "\n",
- "\n",
- "Druid supports dynamic parameters, so you can either define certain parameters within the query explicitly or insert a `?` as a placeholder and define it in a parameters block. In the following cell, the `?` gets bound to the timestmap value of `2016-06-27` at execution time. For more information, see [Dynamic parameters](https://druid.apache.org/docs/latest/querying/sql.html#dynamic-parameters).\n",
- "\n",
- "\n",
- "The following cell selects rows where the `__time` column contains a value greater than the value defined dynamically in `parameters` and sets a custom `sqlQueryId`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9a645287",
- "metadata": {},
- "outputs": [],
- "source": [
- "sql = '''\n",
- "SELECT * \n",
- "FROM wikipedia_api \n",
- "WHERE __time > ? \n",
- "LIMIT 1\n",
- "'''\n",
- "sql_request = {\n",
- " 'query': sql,\n",
- " 'context': {\n",
- " 'sqlQueryId' : 'important-query'\n",
- " },\n",
- " 'parameters': [\n",
- " { 'type': 'TIMESTAMP', 'value': '2016-06-27'}\n",
- " ]\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5cced58",
- "metadata": {},
- "outputs": [],
- "source": [
- "json = session.post(endpoint, json=sql_request).json()\n",
- "json"
- ]
- },
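- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "6f708192",
- "metadata": {},
- "source": [
- "With the custom `sqlQueryId` in place, canceling a running query is a single `DELETE` request. A sketch (by the time you run it here, the query above has already finished, so expect a 404):\n",
- "\n",
- "```python\n",
- "cancel_endpoint = druid_host + '/druid/v2/sql/important-query'\n",
- "session.delete(cancel_endpoint)  # 202 if the query was canceled; 404 if it is no longer running\n",
- "```"
- ]
- },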
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "8fbfa1fa-2cde-46d5-8107-60bd436fb64e",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Learn more\n",
- "\n",
- "This tutorial covers the some of the basics related to the Druid API. To learn more about the kinds of things you can do, see the API documentation:\n",
- "\n",
- "- [Druid SQL API](https://druid.apache.org/docs/latest/api-reference/api-reference.html)\n",
- "- [API reference](https://druid.apache.org/docs/latest/api-reference/api-reference.html)\n",
- "\n",
- "You can also try out the [druid-client](https://github.com/paul-rogers/druid-client), a Python library for Druid created by Paul Rogers, a Druid contributor. A simplified version of that library is included with these tutorials. See [the Python API Tutorial](Python_API_Tutorial.ipynb) for an overview. That tutorial shows how to do the same tasks as this one, but in a simpler form: focusing on the Druid actions and not the mechanics of the REST API."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- },
- "vscode": {
- "interpreter": {
- "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/notebooks/99-contributing/notebook-template.ipynb b/examples/quickstart/jupyter-notebooks/notebooks/99-contributing/notebook-template.ipynb
deleted file mode 100644
index 5fafa2a4a696..000000000000
--- a/examples/quickstart/jupyter-notebooks/notebooks/99-contributing/notebook-template.ipynb
+++ /dev/null
@@ -1,319 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "0cb3b009-ebde-4d56-9d59-a028d66d8309",
- "metadata": {},
- "source": [
- "# (Result) by (action) using (feature)\n",
- "\n",
- "\n",
- "Introductory paragraph - for example:\n",
- "\n",
- "This tutorial demonstrates how to work with [feature](link to feature doc). In this tutorial you perform the following tasks:\n",
- "\n",
- "- Task 1\n",
- "- Task 2\n",
- "- Task 3\n",
- "- etc\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b74aa63d-3d21-472d-8ade-8573ef3c50cf",
- "metadata": {},
- "source": [
- "## Table of contents\n",
- "\n",
- "- [Prerequisites](#Prerequisites)\n",
- "- [Initalization](#Initalization)\n",
- "- [Next section](#Nextsection)\n",
- "- etc"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bbdbf6ad-ca7b-40f5-8ca3-1070f4a3ee42",
- "metadata": {},
- "source": [
- "## Prerequisites\n",
- "\n",
- "This tutorial works with Druid XX.0.0 or later.\n",
- "\n",
- "#### Run with Docker\n",
- "\n",
- "\n",
- "\n",
- "Launch this tutorial and all prerequisites using the ....... profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n",
- " \n",
- "#### Run without Docker\n",
- "\n",
- "If you do not use the Docker Compose environment, you need the following:\n",
- "\n",
- "* A running Apache Druid instance, with a `DRUID_HOST` local environment variable containing the server name of your Druid router\n",
- "* [druidapi](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/druidapi/README.md), a Python client for Apache Druid. Follow the instructions in the Install section of the README file.\n",
- "\n",
- " \n",
- "* A running Apache Kafka instance, with a `KAFKA_HOST` local environment variable containing the broker server name.\n",
- "* [matplotlib](https://matplotlib.org/), a library for creating visualizations in Python.\n",
- "* [pandas](https://pandas.pydata.org/), a data analysis and manipulation tool."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5007a243-b81a-4601-8f57-5b14940abbff",
- "metadata": {},
- "source": [
- "### Initialization\n",
- "\n",
- "Run the next cell to set up the Druid Python client's connection to Apache Druid.\n",
- "\n",
- "If successful, the Druid version number will be shown in the output."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c1ec783b-df3f-4168-9be2-cdc6ad3e33c2",
- "metadata": {},
- "outputs": [],
- "source": [
- "import druidapi\n",
- "import os\n",
- "\n",
- "if 'DRUID_HOST' not in os.environ.keys():\n",
- " druid_host=f\"http://localhost:8888\"\n",
- "else:\n",
- " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
- " \n",
- "print(f\"Opening a connection to {druid_host}.\")\n",
- "druid = druidapi.jupyter_client(druid_host)\n",
- "\n",
- "display = druid.display\n",
- "sql_client = druid.sql\n",
- "status_client = druid.status\n",
- "\n",
- "status_client.version"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2efdbee0-62da-4fd3-84e1-f66b8c0150b3",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "Run the next cell to set up the connection to Apache Kafka."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c075de81-04c9-4b23-8253-20a15d46252e",
- "metadata": {},
- "outputs": [],
- "source": [
- "if 'KAFKA_HOST' not in os.environ.keys():\n",
- " kafka_host=f\"http://localhost:9092\"\n",
- "else:\n",
- " kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "472589e4-1026-4b3b-bb79-eedabb2b44c4",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Load example data\n",
- "\n",
- "Once your Druid environment is up and running, ingest the sample data for this tutorial.\n",
- "\n",
- "Run the following cell to create a table called `example-dataset-notebook`. Notice {the use of X as a timestamp | only required columns are ingested | WHERE / expressions / GROUP BY are front-loaded | partitions on X period and clusters by Y}.\n",
- "\n",
- "When completed, you'll see a description of the final table.\n",
- "\n",
- "\n",
- "\n",
- "Monitor the ingestion task process in the Druid console."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f52a94fb-d2e4-403f-ab10-84d3af7bf2c8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Replace `example-dataset-notebook` with your table name here.\n",
- "# Remember to apply good data modelling practice to your INSERT / REPLACE.\n",
- "\n",
- "sql='''\n",
- "'''\n",
- "\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('example-dataset-notebook')\n",
- "display.table('example-dataset-notebook')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9c3d6b39-6551-4b2a-bdfb-9606aa92c853",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "Finally, run the following cell to import additional Python modules that you will use to X, Y, Z."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "dc4c2524-0eba-4bc6-84ed-da3a25aa5fbe",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Add your modules here, remembering to align this with the prerequisites section\n",
- "\n",
- "import json\n",
- "import matplotlib\n",
- "import matplotlib.pyplot as plt\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1b6c9b88-837d-4c80-a28d-36184ba63355",
- "metadata": {},
- "source": [
- "## Awesome!\n",
- "\n",
- "The main body of your notebook goes here!\n",
- "\n",
- "### This is a step\n",
- "\n",
- "Here things get done\n",
- "\n",
- "### And so is this!\n",
- "\n",
- "Wow! Awesome!"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "54b8d5fe-ba85-4b5b-9669-0dd47dfbccd1",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "* You learned this\n",
- "* Remember this\n",
- "\n",
- "## Go further\n",
- "\n",
- "* Try this out on your own data\n",
- "* Solve for problem X that is't covered here\n",
- "\n",
- "## Learn more\n",
- "\n",
- "* Read docs pages\n",
- "* Watch or read something cool from the community\n",
- "* Do some exploratory stuff on your own"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ca4d3362-b1a4-47a4-a782-9773c216b3ba",
- "metadata": {},
- "outputs": [],
- "source": [
- "# STANDARD CODE BLOCKS\n",
- "\n",
- "# When just wanting to display some SQL results\n",
- "display.sql(sql)\n",
- "\n",
- "# When ingesting data:\n",
- "sql_client.run_task(sql)\n",
- "sql_client.wait_until_ready('wikipedia-en')\n",
- "display.table('wikipedia-en')\n",
- "\n",
- "# When you want to make an EXPLAIN look pretty\n",
- "print(json.dumps(json.loads(sql_client.explain_sql(sql)['PLAN']), indent=2))\n",
- "\n",
- "# When you want a simple plot\n",
- "df = pd.DataFrame(sql_client.sql(sql))\n",
- "df.plot(x='Tail_Number', y='Flights', marker='o')\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.gca().get_legend().remove()\n",
- "plt.show()\n",
- "\n",
- "# When you want to add some query context parameters\n",
- "req = sql_client.sql_request(sql)\n",
- "req.add_context(\"useApproximateTopN\", \"false\")\n",
- "resp = sql_client.sql_query(req)\n",
- "\n",
- "# When you want to compare two different sets of results\n",
- "df3 = df1.compare(df2, keep_equal=True)\n",
- "df3"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/quickstart/jupyter-notebooks/requirements.txt b/examples/quickstart/jupyter-notebooks/requirements.txt
deleted file mode 100644
index cecc427e1165..000000000000
--- a/examples/quickstart/jupyter-notebooks/requirements.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ------------------------------------------------------------------------
-
-# Requirements for the Jupyter Notebooks
-# See: https://pip.pypa.io/en/stable/reference/requirements-file-format/
-#
-# Requirements are both few and simple at present.
-
-requests
diff --git a/website/sidebars.json b/website/sidebars.json
index 3aa3182ac043..d007ffb0c24d 100644
--- a/website/sidebars.json
+++ b/website/sidebars.json
@@ -25,7 +25,6 @@
"tutorials/tutorial-sql-query-view",
"tutorials/tutorial-unnest-arrays",
"tutorials/tutorial-query-deep-storage",
- "tutorials/tutorial-jupyter-index",
"tutorials/tutorial-jdbc"
],
"Design": [