From 2b133c6882c706fcedf08a8f2d26fba4004b5cc7 Mon Sep 17 00:00:00 2001
From: Vibhatha Lakmal Abeykoon
Date: Mon, 19 Oct 2020 20:50:04 -0400
Subject: [PATCH] Docs new changes (#197)

* adding docs and update python examples.

* update docs
---
 docs/docs/compile.md                        |   8 +-
 docs/docs/python.md                         |  46 ++++--
 docs/docs/release/cylon_release_0.2.0.md    |  24 +--
 python/examples/table_compute_examples.py   | 174 ++++++++++++++++++++
 python/examples/table_relational_algebra.py | 131 +++++++++++++++
 5 files changed, 351 insertions(+), 32 deletions(-)
 create mode 100644 python/examples/table_compute_examples.py
 create mode 100644 python/examples/table_relational_algebra.py

diff --git a/docs/docs/compile.md b/docs/docs/compile.md
index 5eed9f7d9..ffb033510 100644
--- a/docs/docs/compile.md
+++ b/docs/docs/compile.md
@@ -104,18 +104,22 @@ do the following,
 ./build.sh -pyenv /home//cylon/ENV -bpath /home//cylon/build --python
 ```
 
+Note: You only need to pass `--python` once, after the initial C++ build. If you are developing the
+Cython or Python APIs, use the `--cython` flag instead.
+
 ### Example
 
 Before running the code in the base path of the cloned repo
 run the following command. Or add this to your `bashrc`.
 
 ```bash
-export LD_LIBRARY_PATH=/home//twisterx/build/arrow/install/lib:/home//twisterx/build/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/home//cylon/build/arrow/install/lib:/home//cylon/build/lib:\
+$LD_LIBRARY_PATH
 ```
 
 4. Test Python API
 
 ```bash
-python3 python/test/test_pytwisterx.py
+python3 python/test/test_pycylon.py
 ```
\ No newline at end of file
diff --git a/docs/docs/python.md b/docs/docs/python.md
index a8d8d1305..d373c64d1 100644
--- a/docs/docs/python.md
+++ b/docs/docs/python.md
@@ -38,10 +38,10 @@ Note: In the current release, Cylon only supports MPI as a distributed backend
 
 Using Cylon
 
 ```python
-from pycylon.data.table import Table
-from pycylon.data.table import csv_reader
+from pycylon import Table
+from pycylon.io import read_csv
 
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+tb1: Table = read_csv(ctx, '/tmp/csv.csv', ',')
 ```
 
 Using PyArrow and convert to PyCylon Table
@@ -58,7 +58,7 @@ cylon_tb = Table.from_arrow(pyarrow_tb)
 Also a Cylon Table can be converted to a PyArrow Table
 
 ```python
-pyarrow_tb: PyArrowTable = Table.to_arrow(cylon_tb)
+pyarrow_tb: PyArrowTable = cylon_tb.to_arrow()
 ```
 
 ### Join
@@ -69,19 +69,27 @@ as using Python `str`.
 Sequential Join
 
 ```python
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
-tb2: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
+
+tb1: Table = read_csv(ctx, table1_path, csv_read_options)
+
+tb2: Table = read_csv(ctx, table2_path, csv_read_options)
 
-tb3: Table = tb1.join(ctx, table=tb2, join_type='left', algorithm='hash', left_col=0, right_col=0)
+tb3: Table = tb1.join(table=tb2, join_type='inner', algorithm='hash', left_on=[0],
+                      right_on=[0])
 ```
 
 Distributed Join
 
 ```python
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
-tb2: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
+
+tb1: Table = read_csv(ctx, table1_path, csv_read_options)
+
+tb2: Table = read_csv(ctx, table2_path, csv_read_options)
 
-tb3: Table = tb1.distributed_join(ctx, table=tb2, join_type='left', algorithm='hash', left_col=0, right_col=0)
+tb3: Table = tb1.distributed_join(table=tb2, join_type='inner', algorithm='hash', left_on=[0],
+                                  right_on=[0])
 ```
 
 ### Union
 
@@ -89,13 +97,13 @@ tb3: Table = tb1.distributed_join(ctx, table=tb2, join_type='left', algorithm='h
 Sequential Union
 
 ```python
-tb4: Table = tb1.union(ctx, table=tb2)
+tb4: Table = tb1.union(table=tb2)
 ```
 
 Distributed Union
 
 ```python
-tb5: Table = tb1.distributed_union(ctx, table=tb2)
+tb5: Table = tb1.distributed_union(table=tb2)
 ```
 
 ### Intersect
 
@@ -103,13 +111,13 @@ tb5: Table = tb1.distributed_union(ctx, table=tb2)
 Sequential Intersect
 
 ```python
-tb4: Table = tb1.intersect(ctx, table=tb2)
+tb4: Table = tb1.intersect(table=tb2)
 ```
 
 Distributed Intersect
 
 ```python
-tb5: Table = tb1.distributed_intersect(ctx, table=tb2)
+tb5: Table = tb1.distributed_intersect(table=tb2)
 ```
 
 ### Subtract
 
@@ -117,13 +125,13 @@ tb5: Table = tb1.distributed_intersect(ctx, table=tb2)
 Sequential Subtract
 
 ```python
-tb4: Table = tb1.subtract(ctx, table=tb2)
+tb4: Table = tb1.subtract(table=tb2)
 ```
 
 Distributed Subtract
 
 ```python
-tb5: Table = tb1.distributed_subtract(ctx, table=tb2)
+tb5: Table = tb1.distributed_subtract(table=tb2)
 ```
 
 
@@ -135,5 +143,7 @@ This is not yet supported from PyCylon API, but LibCylon supports this.
 
 ## Python Examples
 
-1. [Simple Data Loading Benchmark](https://github.com/cylondata/cylon/blob/master/python/examples/cylon_simple_dataloader.py)
-2. [Sequential MNIST with PyTorch](https://github.com/cylondata/cylon/blob/master/python/examples/cylon_sequential_mnist.py)
\ No newline at end of file
+1. [Relational Algebra
+   Examples](https://github.com/cylondata/cylon/blob/master/python/examples/table_relational_algebra.py)
+2. [Compute
+   Examples](https://github.com/cylondata/cylon/blob/master/python/examples/table_compute_examples.py)
\ No newline at end of file
diff --git a/docs/docs/release/cylon_release_0.2.0.md b/docs/docs/release/cylon_release_0.2.0.md
index 09eee33ef..ac2c4b116 100644
--- a/docs/docs/release/cylon_release_0.2.0.md
+++ b/docs/docs/release/cylon_release_0.2.0.md
@@ -5,36 +5,36 @@ sidebar_label: Cylon Release 0.2.0
 ---
 
 Cylon 0.2.0 adds the following features. Please note that this release may not be backward
- compatible with v0.1.0. 
+ compatible with v0.1.0.
 
-## Major Features
+## Major Features
 
-### C++
+### C++
 
-- Adding aggregates and group-by API
+- Adding aggregates and group-by API
 - Creating tables using `std::vector`s or `cylon::Column`s
-- C++ API refactoring
+- C++ API refactoring
 - Major performance improvements in the existing C++ API
 
-### Python (pycylon)
+### Python (PyCylon)
 
 - Extending Cython API for extended development for other Cython/Python libraries
 - Aggregates and Groupby addition
 - Column name-based relational algebra operations and aggregate/groupby ops addition
 - Major performance improvements in the existing Python API
 
-### Java
+### Java (JCylon)
 
-- Performance improvements
+- Performance improvements
 
 You can download source code from [Github](https://github.com/cylondata/cylon/releases)
 
-## Examples
+## Examples
 
 - [C++ examples](https://github.com/cylondata/cylon/tree/0.2.0/cpp/src/examples)
-- [Python examples](https://github.com/cylondata/cylon/tree/0.2.0/python/examples)
+- [Python examples](https://github.com/cylondata/cylon/tree/0.2.0/python/examples)
 - [Java examples](https://github.com/cylondata/cylon/tree/0.2.0/java/src/main/java/org/cylondata/cylon/examples)
 
-## License
+## License
 
-Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
+Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
\ No newline at end of file
diff --git a/python/examples/table_compute_examples.py b/python/examples/table_compute_examples.py
new file mode 100644
index 000000000..fd227a795
--- /dev/null
+++ b/python/examples/table_compute_examples.py
@@ -0,0 +1,174 @@
+##
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##
+
+
+import numpy as np
+import pyarrow as pa
+import pandas as pd
+import pycylon as cn
+from pycylon import CylonContext
+
+ctx: CylonContext = CylonContext(config=None, distributed=False)
+
+columns = 2
+
+data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
+data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32)
+
+nd_array_list = [data1, data2]
+
+ar_array: pa.Array = pa.array(nd_array_list)
+
+ar_table: pa.Table = pa.Table.from_arrays(nd_array_list, names=['x0', 'x1'])
+
+print(ar_table)
+
+ar1 = pa.array([1, 2, 3, 4])
+ar2 = pa.array(['a', 'b', 'c', 'd'])
+
+ar_tb2: pa.Table = pa.Table.from_arrays([ar1, ar2], names=['col1', 'col2'])
+
+print(ar_tb2)
+
+col_names = ['col1', 'col2']
+
+cn_tb1 = cn.Table.from_numpy(ctx, col_names, nd_array_list)
+
+cn_tb1.show()
+
+data_list = [[1, 2, 3, 4], ['p', 'q', 'r', 's']]
+
+cn_tb2 = cn.Table.from_list(ctx, col_names, data_list)
+
+cn_tb2.show()
+
+dict1 = {'col1': [1, 2], 'col2': ['a', 'b']}
+
+ar_tb3: pa.Table = pa.Table.from_pydict(dict1)
+
+print(ar_tb3)
+
+cn_tb3: cn.Table = cn.Table.from_pydict(ctx, dict1)
+
+cn_tb3.show()
+
+pdf = pd.DataFrame(dict1)
+
+# from_pandas parameters: df, schema=None, preserve_index=None, nthreads=None, columns=None, safe=True
+
+cn_tb4: cn.Table = cn.Table.from_pandas(ctx, pdf)
+
+cn_tb4.show()
+
+print(cn_tb4.to_pandas())
+
+dict2 = {'col1': [1, 2], 'col2': [2, 4]}
+
+cn_tb5: cn.Table = cn.Table.from_pydict(ctx, dict2)
+
+npy = cn_tb5.to_numpy()
+print(npy, npy.dtype)
+
+dict3 = cn_tb5.to_pydict()
+
+print(dict3)
+
+print(cn_tb5.column_names)
+
+print(ar_tb2)
+
+print(cn_tb5.to_numpy())
+
+## Aggregate Sum
+
+cn_tb6 = cn_tb5.sum('col1')
+
+cn_tb6.show()
+
+cn_tb7 = cn_tb5.sum(0)
+
+## Aggregate Count
+
+cn_tb8 = cn_tb5.count('col1')
+
+cn_tb8.show()
+
+cn_tb9 = cn_tb5.count(0)
+
+cn_tb9.show()
+
+## Aggregate Min
+
+cn_tb10 = cn_tb5.min('col1')
+
+cn_tb10.show()
+
+cn_tb11 = cn_tb5.min(0)
+
+cn_tb11.show()
+
+## Aggregate Max
+
+cn_tb12 = cn_tb5.max('col1')
+
+cn_tb12.show()
+
+cn_tb13 = cn_tb5.max(0)
+
+cn_tb13.show()
+
+from pycylon.data.aggregates import AggregationOp
+
+op1 = AggregationOp.SUM
+
+assert (op1 == AggregationOp.SUM)
+
+print(op1.name)
+
+dict3 = {'col1': [1, 2, 3, 4, 5, 1, 3, 6, 8, 1, 9, 10], 'col2': [2, 4, 0, 1, 5, 6, 8, 1, 3, 4, 0,
+                                                                 1]}
+
+cn_tb14: cn.Table = cn.Table.from_pydict(ctx, dict3)
+
+cn_tb15 = cn_tb14.groupby(0, [0], [AggregationOp.COUNT])
+
+cn_tb15.show()
+
+df = pd.DataFrame({'AnimalId': [1, 1, 2, 2, 3, 4, 4, 3],
+
+                   'Max Speed': [380., 370., 24., 26., 23.1, 300.1, 310.2, 25.2]})
+
+ar_tb_gb = pa.Table.from_pandas(df)
+
+cn_tb_gb = cn.Table.from_arrow(ctx, ar_tb_gb)
+
+
+pdf1 = df.groupby(['AnimalId']).sum()
+
+print(pdf1)
+
+cn_tb_gb_res = cn_tb_gb.groupby(0, [1], [AggregationOp.SUM]).sort(0)
+
+cn_tb_gb_res.show()
+
+cn_tb_gb_res1 = cn_tb_gb.groupby(0, ['Max Speed'], [AggregationOp.SUM]).sort(0)
+
+cn_tb_gb_res1.show()
+
+cn_tb_gb_res1 = cn_tb_gb.groupby(0, ['Max Speed', 'Max Speed', 'Max Speed'], [AggregationOp.SUM,
+                                                                              AggregationOp.MIN,
+                                                                              AggregationOp.MAX])\
+    .sort(0)
+
+cn_tb_gb_res1.show()
\ No newline at end of file
diff --git a/python/examples/table_relational_algebra.py b/python/examples/table_relational_algebra.py
new file mode 100644
index 000000000..4c7ff0594
--- /dev/null
+++ b/python/examples/table_relational_algebra.py
@@ -0,0 +1,131 @@
+##
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + ## + +from pycylon import Table +from pycylon import CylonContext +from pycylon.io import CSVReadOptions +from pycylon.io import read_csv +from pycylon.net import MPIConfig + +table1_path = '/tmp/user_device_tm_1.csv' +table2_path = '/tmp/user_usage_tm_1.csv' + + +def single_process(): + ctx: CylonContext = CylonContext(config=None, distributed=False) + + csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30) + + tb1: Table = read_csv(ctx, table1_path, csv_read_options) + + tb2: Table = read_csv(ctx, table2_path, csv_read_options) + + print(tb1.column_names) + print(tb2.column_names) + + configs = {'join_type': 'inner', 'algorithm': 'sort'} + + tb3: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=[0], + right_on=[3] + ) + + tb3.show() + + tb4: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=['use_id'], + right_on=['use_id'] + ) + + tb4.show() + + tb4: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + on=['use_id'] + ) + + tb4.show() + + # tb5: Table = tb1.join(ctx, table=tb2, + # join_type=configs['join_type'], + # algorithm=configs['algorithm'], + # on=[0] + # ) + # + # tb5.show() + + ctx.finalize() + + +def multi_process(): + mpi_config = MPIConfig() + ctx: CylonContext = CylonContext(config=mpi_config, distributed=True) + + csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30) + + tb1: Table = read_csv(ctx, table1_path, csv_read_options) + + tb2: Table = read_csv(ctx, table2_path, csv_read_options) + + print(tb1.column_names) + print(tb2.column_names) + + configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0, + 'right_col': 0} + + tb3: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=[0], + right_on=[3] + ) + + tb3.show() + + tb4: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=['use_id'], + right_on=['use_id'] + ) + + tb4.show() + + tb4: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + on=['use_id'] + ) + + tb4.show() + + # tb5: Table = tb1.distributed_join(ctx, table=tb2, + # join_type=configs['join_type'], + # algorithm=configs['algorithm'], + # on=[0] + # ) + # + # tb5.show() + + ctx.finalize() + + +single_process() + +multi_process()
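
Note for readers trying out the new examples: `table_relational_algebra.py` expects `/tmp/user_device_tm_1.csv` and `/tmp/user_usage_tm_1.csv` to exist. If those files are not at hand, the same join API can be exercised with small in-memory tables built via `Table.from_pydict`, as used in `table_compute_examples.py`. The sketch below is not part of the patch; the column names and values are made up for illustration.

```python
from pycylon import CylonContext, Table

# Local (non-distributed) context, mirroring single_process() above.
ctx: CylonContext = CylonContext(config=None, distributed=False)

# Two tiny tables that share a 'use_id' key column (illustrative data only).
tb1: Table = Table.from_pydict(ctx, {'use_id': [1, 2, 3, 4],
                                     'device_version': [4, 5, 6, 7]})
tb2: Table = Table.from_pydict(ctx, {'use_id': [2, 3, 4, 5],
                                     'monthly_mb': [100.5, 200.25, 50.0, 75.75]})

# Same keyword-style join call used in the examples above.
tb3: Table = tb1.join(table=tb2, join_type='inner', algorithm='sort', on=['use_id'])
tb3.show()

ctx.finalize()
```

The distributed variants are meant to be launched through MPI (for example `mpirun -n 2 python python/examples/table_relational_algebra.py`), since MPI is the only distributed backend supported in this release; the exact launcher options depend on your MPI installation.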