apache · andygrove · Nov 29, 2024 · Nov 23, 2024 · Nov 24, 2024 · Nov 24, 2024
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -75,6 +75,8 @@ jobs:
           set -x
           source venv/bin/activate
           cd docs
+          curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
+          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
           make html
 
       - name: Copy & push the generated HTML

diff --git a/docs/.gitignore b/docs/.gitignore
@@ -1,2 +1,4 @@
 pokemon.csv
 yellow_trip_data.parquet
+yellow_tripdata_2021-01.parquet
+
diff --git a/docs/build.sh b/docs/build.sh
@@ -19,8 +19,17 @@
 #
 
 set -e
+
+if [ ! -f pokemon.csv ]; then
+    curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
+fi
+
+if [ ! -f yellow_tripdata_2021-01.parquet ]; then
+    curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
+fi
+
 rm -rf build 2> /dev/null
 rm -rf temp 2> /dev/null
 mkdir temp
 cp -rf source/* temp/
-make SOURCEDIR=`pwd`/temp html
+make SOURCEDIR=`pwd`/temp html
diff --git a/docs/source/images/jupyter_lab_df_view.png b/docs/source/images/jupyter_lab_df_view.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -43,27 +43,13 @@ Example
 
 .. ipython:: python
 
-    import datafusion
-    from datafusion import col
-    import pyarrow
+    from datafusion import SessionContext
 
-    # create a context
-    ctx = datafusion.SessionContext()
+    ctx = SessionContext()
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]], name="batch_array")
+    df = ctx.read_csv("pokemon.csv")
 
-    # create a new statement
-    df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
-    )
-
-    df
+    df.show()
 
 
 .. _toc.links:
@@ -85,9 +71,10 @@ Example
 
    user-guide/introduction
    user-guide/basics
-   user-guide/configuration
+   user-guide/data-sources
    user-guide/common-operations/index
    user-guide/io/index
+   user-guide/configuration
    user-guide/sql
 
 

diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst
@@ -20,72 +20,76 @@
 Concepts
 ========
 
-In this section, we will cover a basic example to introduce a few key concepts.
+In this section, we will cover a basic example to introduce a few key concepts. We will use the
+January 2021 NYC yellow taxi trip data set, stored in ``yellow_tripdata_2021-01.parquet``.
 
-.. code-block:: python
+.. ipython:: python
 
-    import datafusion
-    from datafusion import col
-    import pyarrow
+    from datafusion import SessionContext, col, lit, functions as f
 
-    # create a context
-    ctx = datafusion.SessionContext()
+    ctx = SessionContext()
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]])
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
 
-    # create a new statement
     df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
+        "trip_distance",
+        col("total_amount").alias("total"),
+        (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
     )
 
-    # execute and collect the first (and only) batch
-    result = df.collect()[0]
+    df.show()
 
-The first statement group:
+Session Context
+---------------
+
+The first statement group creates a :py:class:`~datafusion.context.SessionContext`.
 
 .. code-block:: python
 
     # create a context
     ctx = datafusion.SessionContext()
 
-creates a :py:class:`~datafusion.context.SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state
-of the connection between a user and an instance of the DataFusion engine. Additionally it provides the following functionality:
+A Session Context is the main interface for executing queries with DataFusion. It maintains the state
+of the connection between a user and an instance of the DataFusion engine. Additionally it provides
+the following functionality:
 
-- Create a DataFrame from a CSV or Parquet data source.
-- Register a CSV or Parquet data source as a table that can be referenced from a SQL query.
-- Register a custom data source that can be referenced from a SQL query.
+- Create a DataFrame from a data source.
+- Register a data source as a table that can be referenced from a SQL query.
 - Execute a SQL query
 
+DataFrame
+---------
+
 The second statement group creates a :code:`DataFrame`,
 
 .. code-block:: python
 
-    # create a RecordBatch and a new DataFrame from it
-    batch = pyarrow.RecordBatch.from_arrays(
-        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
-        names=["a", "b"],
-    )
-    df = ctx.create_dataframe([[batch]])
+    # Create a DataFrame from a file
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
 
 A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html>`_.
 DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by
 calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`,
 and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
 
-The third statement uses :code:`Expressions` to build up a query definition.
+Expressions
+-----------
+
+The third statement uses :code:`Expressions` to build up a query definition. You can find
+explanations for what the functions below do in the user documentation for
+:py:func:`~datafusion.col`, :py:func:`~datafusion.lit`, :py:func:`~datafusion.functions.round`,
+and :py:func:`~datafusion.expr.Expr.alias`.
 
 .. code-block:: python
 
     df = df.select(
-        col("a") + col("b"),
-        col("a") - col("b"),
+        "trip_distance",
+        col("total_amount").alias("total"),
+        (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
     )
 
-Finally the :py:func:`~datafusion.dataframe.DataFrame.collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it,
-collecting all results into a list of `RecordBatch <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html>`_.
+Finally, the :py:func:`~datafusion.dataframe.DataFrame.show` method converts the logical plan
+represented by the DataFrame into a physical plan and executes it, collecting all results and
+displaying them to the user. It is important to note that DataFusion performs lazy evaluation
+of the DataFrame. Until you call a method such as :py:func:`~datafusion.dataframe.DataFrame.show`
+or :py:func:`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query.
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst
@@ -26,15 +26,7 @@ to form a single summary value. For performing an aggregation, DataFusion provid
 
 .. ipython:: python
 
-    import urllib.request
-    from datafusion import SessionContext
-    from datafusion import col, lit
-    from datafusion import functions as f
-
-    urllib.request.urlretrieve(
-        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-        "pokemon.csv",
-    )
+    from datafusion import SessionContext, col, lit, functions as f
 
     ctx = SessionContext()
     df = ctx.read_csv("pokemon.csv")

diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst
@@ -25,14 +25,8 @@ We'll use the pokemon dataset in the following examples.
 
 .. ipython:: python
 
-    import urllib.request
     from datafusion import SessionContext
 
-    urllib.request.urlretrieve(
-    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-    "pokemon.csv",
-    )
-
     ctx = SessionContext()
     ctx.register_csv("pokemon", "pokemon.csv")
     df = ctx.table("pokemon")

diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst
@@ -18,6 +18,8 @@
 Common Operations
 =================
 
+The contents of this section are designed to guide a new user through how to use DataFusion.
+
 .. toctree::
    :maxdepth: 2
 

diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst
@@ -21,18 +21,15 @@ Column Selections
 Use :py:func:`~datafusion.dataframe.DataFrame.select`  for basic column selection.
 
 DataFusion can work with several file types, to start simple we can use a subset of the 
-`TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_
+`TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_,
+specifically the `January 2021 yellow taxi Parquet file <https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet>`__.
 
 .. ipython:: python
-
-    import urllib.request
-    from datafusion import SessionContext
 
-    urllib.request.urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
-                               "yellow_trip_data.parquet")
+    from datafusion import SessionContext
 
     ctx = SessionContext()
-    df = ctx.read_parquet("yellow_trip_data.parquet")
+    df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
     df.select("trip_distance", "passenger_count")
 
 For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting

diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst
@@ -30,16 +30,10 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
 
 .. ipython:: python
 
-    import urllib.request
     from datafusion import SessionContext
     from datafusion import col
     from datafusion import functions as f
 
-    urllib.request.urlretrieve(
-        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
-        "pokemon.csv",
-    )
-
     ctx = SessionContext()
     df = ctx.read_csv("pokemon.csv")