diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index e47497b2..86288e2d 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -75,6 +75,8 @@ jobs:
           set -x
           source venv/bin/activate
           cd docs
+          curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
+          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
           make html
       - name: Copy & push the generated HTML
diff --git a/docs/.gitignore b/docs/.gitignore
index 41e13534..6e8a53b6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,2 +1,4 @@
 pokemon.csv
 yellow_trip_data.parquet
+yellow_tripdata_2021-01.parquet
+
diff --git a/docs/build.sh b/docs/build.sh
index 5afe8581..31398d19 100755
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -19,8 +19,17 @@
 #
 
 set -e
+
+if [ ! -f pokemon.csv ]; then
+    curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
+fi
+
+if [ ! -f yellow_tripdata_2021-01.parquet ]; then
+    curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
+fi
+
 rm -rf build 2> /dev/null
 rm -rf temp 2> /dev/null
 mkdir temp
 cp -rf source/* temp/
-make SOURCEDIR=`pwd`/temp html
\ No newline at end of file
+make SOURCEDIR=`pwd`/temp html
diff --git a/docs/source/images/jupyter_lab_df_view.png b/docs/source/images/jupyter_lab_df_view.png
new file mode 100644
index 00000000..9dafb4f6
Binary files /dev/null and b/docs/source/images/jupyter_lab_df_view.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b0103a33..34eb23b2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,27 +43,13 @@ Example
 
 .. ipython:: python
 
-    import datafusion
-    from datafusion import col
-    import pyarrow
+    from datafusion import SessionContext
 
-    # create a context
-    ctx = datafusion.SessionContext()
+    ctx = SessionContext()
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]], name="batch_array")
+    df = ctx.read_csv("pokemon.csv")
 
-    # create a new statement
-    df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
-    )
-
-    df
+    df.show()
 
 .. _toc.links:
@@ -85,9 +71,10 @@ Example
 
    user-guide/introduction
    user-guide/basics
-   user-guide/configuration
+   user-guide/data-sources
    user-guide/common-operations/index
    user-guide/io/index
+   user-guide/configuration
    user-guide/sql
diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst
index 3c97d1ef..f37378a4 100644
--- a/docs/source/user-guide/basics.rst
+++ b/docs/source/user-guide/basics.rst
@@ -20,72 +20,76 @@
 Concepts
 ========
 
-In this section, we will cover a basic example to introduce a few key concepts.
+In this section, we will cover a basic example to introduce a few key concepts. We will use the
+``yellow_tripdata_2021-01.parquet`` file, a subset of the TLC Trip Record Data that is used
+throughout this guide.
 
-.. code-block:: python
+.. ipython:: python
 
-    import datafusion
-    from datafusion import col
-    import pyarrow
+    from datafusion import SessionContext, col, lit, functions as f
 
-    # create a context
-    ctx = datafusion.SessionContext()
+    ctx = SessionContext()
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]])
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
 
-    # create a new statement
     df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
+        "trip_distance",
+        col("total_amount").alias("total"),
+        (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
     )
 
-    # execute and collect the first (and only) batch
-    result = df.collect()[0]
+    df.show()
 
-The first statement group:
+Session Context
+---------------
+
+The first statement group creates a :py:class:`~datafusion.context.SessionContext`.
 
 .. code-block:: python
 
-    # create a context
-    ctx = datafusion.SessionContext()
+    ctx = SessionContext()
 
-creates a :py:class:`~datafusion.context.SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state
-of the connection between a user and an instance of the DataFusion engine. Additionally it provides the following functionality:
+A Session Context is the main interface for executing queries with DataFusion. It maintains the state
+of the connection between a user and an instance of the DataFusion engine. Additionally, it provides
+the following functionality:
 
-- Create a DataFrame from a CSV or Parquet data source.
-- Register a CSV or Parquet data source as a table that can be referenced from a SQL query.
-- Register a custom data source that can be referenced from a SQL query.
+- Create a DataFrame from a data source.
+- Register a data source as a table that can be referenced from a SQL query.
 - Execute a SQL query
 
+DataFrame
+---------
+
 The second statement group creates a :code:`DataFrame`,
 
 .. code-block:: python
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]])
+    # Create a DataFrame from a file
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
 
 A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_.
 DataFrames are typically created by calling a method on
 :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by
 calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`,
 and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
 
-The third statement uses :code:`Expressions` to build up a query definition.
+Expressions
+-----------
+
+The third statement uses :code:`Expressions` to build up a query definition. You can find
+explanations for what the functions below do in the user documentation for
+:py:func:`~datafusion.col`, :py:func:`~datafusion.lit`, :py:func:`~datafusion.functions.round`,
+and :py:func:`~datafusion.expr.Expr.alias`.
 
 .. code-block:: python
 
     df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
+        "trip_distance",
+        col("total_amount").alias("total"),
+        (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
     )
 
-Finally the :py:func:`~datafusion.dataframe.DataFrame.collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it,
-collecting all results into a list of `RecordBatch `_.
+Finally, the :py:func:`~datafusion.dataframe.DataFrame.show` method converts the logical plan
+represented by the DataFrame into a physical plan and executes it, collecting all results and
+displaying them to the user. It is important to note that DataFusion performs lazy evaluation
+of the DataFrame. Until you call a method such as :py:func:`~datafusion.dataframe.DataFrame.show`
+or :py:func:`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query.
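+
+A Session Context can also execute SQL directly, as noted in the list above. The following is a
+minimal sketch of the same idea expressed in SQL; the table name ``trips`` is an arbitrary name
+chosen for this example:
+
+.. code-block:: python
+
+    # Register the Parquet file as a table so SQL can reference it by name.
+    ctx.register_parquet("trips", "yellow_tripdata_2021-01.parquet")
+
+    # As with the DataFrame API, nothing is executed until show() is called.
+    ctx.sql("SELECT trip_distance, total_amount FROM trips LIMIT 5").show()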
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst
index 8fee26a1..e458e5fc 100644
--- a/docs/source/user-guide/common-operations/aggregations.rst
+++ b/docs/source/user-guide/common-operations/aggregations.rst
@@ -26,15 +26,7 @@ to form a single summary value. For performing an aggregation, DataFusion provid
 
 .. ipython:: python
 
-    import urllib.request
-    from datafusion import SessionContext
-    from datafusion import col, lit
-    from datafusion import functions as f
-
-    urllib.request.urlretrieve(
-        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-        "pokemon.csv",
-    )
+    from datafusion import SessionContext, col, lit, functions as f
 
     ctx = SessionContext()
     df = ctx.read_csv("pokemon.csv")
diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst
index a0b95c90..8d6a8085 100644
--- a/docs/source/user-guide/common-operations/functions.rst
+++ b/docs/source/user-guide/common-operations/functions.rst
@@ -25,14 +25,8 @@ We'll use the pokemon dataset in the following examples.
 
 .. ipython:: python
 
-    import urllib.request
     from datafusion import SessionContext
 
-    urllib.request.urlretrieve(
-        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-        "pokemon.csv",
-    )
-
     ctx = SessionContext()
     ctx.register_csv("pokemon", "pokemon.csv")
     df = ctx.table("pokemon")
diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst
index b15b04c6..d7c708c2 100644
--- a/docs/source/user-guide/common-operations/index.rst
+++ b/docs/source/user-guide/common-operations/index.rst
@@ -18,6 +18,8 @@
 Common Operations
 =================
 
+The contents of this section are designed to guide a new user through the most common operations in DataFusion.
+
 .. toctree::
    :maxdepth: 2
diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst
index 07590912..083bcbbd 100644
--- a/docs/source/user-guide/common-operations/select-and-filter.rst
+++ b/docs/source/user-guide/common-operations/select-and-filter.rst
@@ -21,18 +21,15 @@ Column Selections
 
 Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection.
 
-DataFusion can work with several file types, to start simple we can use a subset of the
-`TLC Trip Record Data `_
+DataFusion can work with several file types; to start simple, we can use a subset of the
+`TLC Trip Record Data `_,
+which you can download `here `_.
 
 .. ipython:: python
 
-    import urllib.request
-    from datafusion import SessionContext
-
-    urllib.request.urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
-                               "yellow_trip_data.parquet")
+    from datafusion import SessionContext
 
     ctx = SessionContext()
-    df = ctx.read_parquet("yellow_trip_data.parquet")
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
     df.select("trip_distance", "passenger_count")
 
 For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting
diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst
index 60917689..8225d125 100644
--- a/docs/source/user-guide/common-operations/windows.rst
+++ b/docs/source/user-guide/common-operations/windows.rst
@@ -30,16 +30,10 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
 
 .. ipython:: python
 
-    import urllib.request
     from datafusion import SessionContext
     from datafusion import col
     from datafusion import functions as f
 
-    urllib.request.urlretrieve(
-        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-        "pokemon.csv",
-    )
-
     ctx = SessionContext()
     df = ctx.read_csv("pokemon.csv")
diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst
new file mode 100644
index 00000000..ba5967c9
--- /dev/null
+++ b/docs/source/user-guide/data-sources.rst
@@ -0,0 +1,187 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _user_guide_data_sources:
+
+Data Sources
+============
+
+DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations.
+
+Local file
+----------
+
+DataFusion has the ability to read from a variety of popular file formats, such as :ref:`Parquet `,
+:ref:`CSV `, :ref:`JSON `, and :ref:`AVRO `.
+
+.. ipython:: python
+
+    from datafusion import SessionContext
+    ctx = SessionContext()
+    df = ctx.read_csv("pokemon.csv")
+    df.show()
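+
+The readers for the other formats follow the same pattern. As a brief sketch, the following reads
+the Parquet file that is downloaded for the examples in this guide:
+
+.. code-block:: python
+
+    # read_parquet, read_json, and read_avro mirror read_csv and each return a DataFrame
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
+    df.show()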
+
+Create in-memory
+----------------
+
+Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object.
+To do this in DataFusion, you can use one of the three functions
+:py:func:`~datafusion.context.SessionContext.from_pydict`,
+:py:func:`~datafusion.context.SessionContext.from_pylist`, or
+:py:func:`~datafusion.context.SessionContext.create_dataframe`.
+
+As their names suggest, ``from_pydict`` and ``from_pylist`` will create DataFrames from Python
+dictionary and list objects, respectively. ``create_dataframe`` assumes you will pass in a list
+of lists of `PyArrow Record Batches `_.
+
+The following three examples will all create identical DataFrames:
+
+.. ipython:: python
+
+    import pyarrow as pa
+
+    ctx.from_pylist([
+        { "a": 1, "b": 10.0, "c": "alpha" },
+        { "a": 2, "b": 20.0, "c": "beta" },
+        { "a": 3, "b": 30.0, "c": "gamma" },
+    ]).show()
+
+    ctx.from_pydict({
+        "a": [1, 2, 3],
+        "b": [10.0, 20.0, 30.0],
+        "c": ["alpha", "beta", "gamma"],
+    }).show()
+
+    batch = pa.RecordBatch.from_arrays(
+        [
+            pa.array([1, 2, 3]),
+            pa.array([10.0, 20.0, 30.0]),
+            pa.array(["alpha", "beta", "gamma"]),
+        ],
+        names=["a", "b", "c"],
+    )
+
+    ctx.create_dataframe([[batch]]).show()
+
+Object Store
+------------
+
+DataFusion has support for multiple storage options in addition to local files.
+The example below requires an appropriate S3 account with access credentials.
+
+Supported Object Stores are:
+
+- :py:class:`~datafusion.object_store.AmazonS3`
+- :py:class:`~datafusion.object_store.GoogleCloud`
+- :py:class:`~datafusion.object_store.Http`
+- :py:class:`~datafusion.object_store.LocalFileSystem`
+- :py:class:`~datafusion.object_store.MicrosoftAzure`
+
+.. code-block:: python
+
+    import os
+
+    from datafusion.object_store import AmazonS3
+
+    region = "us-east-1"
+    bucket_name = "yellow-trips"
+
+    s3 = AmazonS3(
+        bucket_name=bucket_name,
+        region=region,
+        access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+        secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+    )
+
+    path = f"s3://{bucket_name}/"
+    ctx.register_object_store("s3://", s3, None)
+
+    ctx.register_parquet("trips", path)
+
+    ctx.table("trips").show()
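+
+Once the object store is registered for the ``s3://`` scheme, paths under that scheme can also be
+passed to the file readers directly. This is a sketch that assumes the bucket above contains the
+trip data file:
+
+.. code-block:: python
+
+    # Read a single object from the registered store instead of
+    # registering the whole bucket as a table.
+    df = ctx.read_parquet(f"s3://{bucket_name}/yellow_tripdata_2021-01.parquet")
+    df.show()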
+
+Other DataFrame Libraries
+-------------------------
+
+DataFusion can import DataFrames directly from other libraries, such as
+`Polars `_ and `Pandas `_.
+Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule
+interface can be imported into DataFusion using the
+:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may
+not support the Arrow interface. In those cases, you can still import via the
+:py:func:`~datafusion.context.SessionContext.from_polars` function.
+
+.. code-block:: python
+
+    import pandas as pd
+
+    data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] }
+    pandas_df = pd.DataFrame(data)
+
+    datafusion_df = ctx.from_arrow(pandas_df)
+    datafusion_df.show()
+
+.. code-block:: python
+
+    import polars as pl
+
+    polars_df = pl.DataFrame(data)
+
+    datafusion_df = ctx.from_arrow(polars_df)
+    datafusion_df.show()
+
+Delta Lake
+----------
+
+DataFusion 43.0.0 and later support the ability to register table providers from sources such
+as Delta Lake. This will require a recent version of
+`deltalake `_ to provide the required interfaces.
+
+.. code-block:: python
+
+    from deltalake import DeltaTable
+
+    delta_table = DeltaTable("path_to_table")
+    ctx.register_table_provider("my_delta_table", delta_table)
+    df = ctx.table("my_delta_table")
+    df.show()
+
+On older versions of ``deltalake`` (prior to 0.22) you can use the
+`Arrow DataSet `_
+interface to import to DataFusion, but this does not support features such as filter push down,
+which can lead to a significant performance difference.
+
+.. code-block:: python
+
+    from deltalake import DeltaTable
+
+    delta_table = DeltaTable("path_to_table")
+    ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset())
+    df = ctx.table("my_delta_table")
+    df.show()
+
+Iceberg
+-------
+
+Coming soon!
+
+Custom Table Provider
+---------------------
+
+You can implement a custom Data Provider in Rust and expose it to DataFusion through
+the interface described in the :ref:`Custom Table Provider ` section. This is
+an advanced topic, but a
+`user example `_
+is provided in the DataFusion repository.
diff --git a/docs/source/user-guide/introduction.rst b/docs/source/user-guide/introduction.rst
index 8abb9113..7b30ef2b 100644
--- a/docs/source/user-guide/introduction.rst
+++ b/docs/source/user-guide/introduction.rst
@@ -39,5 +39,39 @@ You can verify the installation by running:
 
     import datafusion
     datafusion.__version__
 
+In this documentation we will also show some examples of how DataFusion integrates
+with Jupyter notebooks. To install and start a JupyterLab session, use
+
+.. code-block:: shell
+
+    pip install jupyterlab
+    jupyter lab
+
+To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show
+options for data sources. For our first example, we use a Pokemon dataset that you can download
+`here `_.
+
+With that file in place, you can use the following Python example to view the DataFrame in
+DataFusion.
+
+.. ipython:: python
+
+    from datafusion import SessionContext
+
+    ctx = SessionContext()
+
+    df = ctx.read_csv("pokemon.csv")
+
+    df.show()
+
+If you are working in a Jupyter notebook, you can also use the following to give you a table
+display that may be easier to read.
+
+.. code-block:: python
+
+    display(df)
+
+.. image:: ../images/jupyter_lab_df_view.png
+    :width: 800
+    :alt: Rendered table showing Pokemon DataFrame
diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst
index 5f1ff728..66398ac7 100644
--- a/docs/source/user-guide/io/avro.rst
+++ b/docs/source/user-guide/io/avro.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _io_avro:
+
 Avro
 ====
diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst
index d2a62bfe..144b6615 100644
--- a/docs/source/user-guide/io/csv.rst
+++ b/docs/source/user-guide/io/csv.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _io_csv:
+
 CSV
 ===
diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst
index f9da3755..39030db7 100644
--- a/docs/source/user-guide/io/json.rst
+++ b/docs/source/user-guide/io/json.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _io_json:
+
 JSON
 ====
 `JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format.
diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst
index 75bc981c..c5b9ca3d 100644
--- a/docs/source/user-guide/io/parquet.rst
+++ b/docs/source/user-guide/io/parquet.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _io_parquet:
+
 Parquet
 =======
@@ -22,7 +24,6 @@ It is quite simple to read a parquet file using the :py:func:`~datafusion.contex
 
 .. code-block:: python
 
-
     from datafusion import SessionContext
 
     ctx = SessionContext()
diff --git a/docs/source/user-guide/io/table_provider.rst b/docs/source/user-guide/io/table_provider.rst
index 2ff9ae46..bd1d6b80 100644
--- a/docs/source/user-guide/io/table_provider.rst
+++ b/docs/source/user-guide/io/table_provider.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _io_custom_table_provider:
+
 Custom Table Provider
 =====================
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index efd4038a..e283f590 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -473,7 +473,7 @@ def join_on(
         *on_exprs: Expr,
         how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner",
     ) -> DataFrame:
-        """Join two :py:class:`DataFrame`using the specified expressions.
+        """Join two :py:class:`DataFrame` using the specified expressions.
 
         On expressions are used to support in-equality predicates. Equality
         predicates are correctly optimized
diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py
index 3836edec..a71965f4 100644
--- a/python/datafusion/plan.py
+++ b/python/datafusion/plan.py
@@ -42,7 +42,7 @@ class LogicalPlan:
     (table) with a potentially different schema. Plans form a dataflow tree where data
     flows from leaves up to the root to produce the query result.
 
-    `LogicalPlan`s can be created by the SQL query planner, the DataFrame API,
+    A `LogicalPlan` can be created by the SQL query planner, the DataFrame API,
     or programmatically (for example custom query languages).
     """
 
@@ -107,7 +107,7 @@ def __init__(self, plan: df_internal.ExecutionPlan) -> None:
         self._raw_plan = plan
 
     def children(self) -> List[ExecutionPlan]:
-        """Get a list of children `ExecutionPlan`s that act as inputs to this plan.
+        """Get a list of child `ExecutionPlan` objects that act as inputs to this plan.
 
         The returned list will be empty for leaf nodes such as scans, will
         contain a single value for unary nodes, or two values for binary nodes (such as joins).