From 2b133c6882c706fcedf08a8f2d26fba4004b5cc7 Mon Sep 17 00:00:00 2001
From: Vibhatha Lakmal Abeykoon
Date: Mon, 19 Oct 2020 20:50:04 -0400
Subject: [PATCH] Docs new changes (#197)

* adding docs and update python examples.

* update docs
---
 docs/docs/compile.md                        |   8 +-
 docs/docs/python.md                         |  46 ++++--
 docs/docs/release/cylon_release_0.2.0.md    |  24 +--
 python/examples/table_compute_examples.py   | 174 ++++++++++++++++++++
 python/examples/table_relational_algebra.py | 131 +++++++++++++++
 5 files changed, 351 insertions(+), 32 deletions(-)
 create mode 100644 python/examples/table_compute_examples.py
 create mode 100644 python/examples/table_relational_algebra.py

diff --git a/docs/docs/compile.md b/docs/docs/compile.md
index 5eed9f7d9..ffb033510 100644
--- a/docs/docs/compile.md
+++ b/docs/docs/compile.md
@@ -104,18 +104,22 @@ do the following,
 ./build.sh -pyenv /home//cylon/ENV -bpath /home//cylon/build --python
 ```
 
+Note: You only need to pass `--python` once, after the initial C++ build. If you are developing the
+Cython or Python APIs, use the `--cython` flag instead.
+
 ### Example
 
 Before running the code in the base path of the cloned repo
 run the following command. Or add this to your `bashrc`.
 
 ```bash
-export LD_LIBRARY_PATH=/home//twisterx/build/arrow/install/lib:/home//twisterx/build/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/home//cylon/build/arrow/install/lib:/home//cylon/build/lib:\
+$LD_LIBRARY_PATH
 ```
 
 4. Test Python API
 
 ```bash
-python3 python/test/test_pytwisterx.py
+python3 python/test/test_pycylon.py
 ```
\ No newline at end of file
diff --git a/docs/docs/python.md b/docs/docs/python.md
index a8d8d1305..d373c64d1 100644
--- a/docs/docs/python.md
+++ b/docs/docs/python.md
@@ -38,10 +38,10 @@ Note: In the current release, Cylon only supports MPI as a distributed backend
 
 Using Cylon
 
 ```python
-from pycylon.data.table import Table
-from pycylon.data.table import csv_reader
+from pycylon import Table
+from pycylon.io import read_csv
 
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+tb1: Table = read_csv(ctx, '/tmp/csv.csv', ',')
 ```
 
 Using PyArrow and convert to PyCylon Table
@@ -58,7 +58,7 @@ cylon_tb = Table.from_arrow(pyarrow_tb)
 Also a Cylon Table can be converted to a PyArrow Table
 
 ```python
-pyarrow_tb: PyArrowTable = Table.to_arrow(cylon_tb)
+pyarrow_tb: PyArrowTable = cylon_tb.to_arrow()
 ```
 
 ### Join
@@ -69,19 +69,27 @@ as using Python `str`.
 Sequential Join
 
 ```python
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
-tb2: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
+
+tb1: Table = read_csv(ctx, table1_path, csv_read_options)
+
+tb2: Table = read_csv(ctx, table2_path, csv_read_options)
 
-tb3: Table = tb1.join(ctx, table=tb2, join_type='left', algorithm='hash', left_col=0, right_col=0)
+tb3: Table = tb1.join(table=tb2, join_type='inner', algorithm='hash', left_on=[0],
+                      right_on=[0])
 ```
 
 Distributed Join
 
 ```python
-tb1: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
-tb2: Table = csv_reader.read(ctx, '/tmp/csv.csv', ',')
+csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
+
+tb1: Table = read_csv(ctx, table1_path, csv_read_options)
+
+tb2: Table = read_csv(ctx, table2_path, csv_read_options)
 
-tb3: Table = tb1.distributed_join(ctx, table=tb2, join_type='left', algorithm='hash', left_col=0, right_col=0)
+tb3: Table = tb1.distributed_join(table=tb2, join_type='inner', algorithm='hash', left_on=[0],
+                                  right_on=[0])
 ```
 
 ### Union
 
@@ -89,13 +97,13 @@ tb3: Table = tb1.distributed_join(ctx, table=tb2, join_type='left', algorithm='h
 Sequential Union
 
 ```python
-tb4: Table = tb1.union(ctx, table=tb2)
+tb4: Table = tb1.union(table=tb2)
 ```
 
 Distributed Union
 
 ```python
-tb5: Table = tb1.distributed_union(ctx, table=tb2)
+tb5: Table = tb1.distributed_union(table=tb2)
 ```
 
 ### Intersect
 
@@ -103,13 +111,13 @@ tb5: Table = tb1.distributed_union(ctx, table=tb2)
 Sequential Intersect
 
 ```python
-tb4: Table = tb1.intersect(ctx, table=tb2)
+tb4: Table = tb1.intersect(table=tb2)
 ```
 
 Distributed Intersect
 
 ```python
-tb5: Table = tb1.distributed_intersect(ctx, table=tb2)
+tb5: Table = tb1.distributed_intersect(table=tb2)
 ```
 
 ### Subtract
 
@@ -117,13 +125,13 @@ tb5: Table = tb1.distributed_intersect(ctx, table=tb2)
 Sequential Subtract
 
 ```python
-tb4: Table = tb1.subtract(ctx, table=tb2)
+tb4: Table = tb1.subtract(table=tb2)
 ```
 
 Distributed Subtract
 
 ```python
-tb5: Table = tb1.distributed_subtract(ctx, table=tb2)
+tb5: Table = tb1.distributed_subtract(table=tb2)
 ```
 
 
@@ -135,5 +143,7 @@ This is not yet supported from PyCylon API, but LibCylon supports this.
 
 ## Python Examples
 
-1. [Simple Data Loading Benchmark](https://github.com/cylondata/cylon/blob/master/python/examples/cylon_simple_dataloader.py)
-2. [Sequential MNIST with PyTorch](https://github.com/cylondata/cylon/blob/master/python/examples/cylon_sequential_mnist.py)
\ No newline at end of file
+1. [Relational Algebra
+   Examples](https://github.com/cylondata/cylon/blob/master/python/examples/table_relational_algebra.py)
+2. [Compute
+   Examples](https://github.com/cylondata/cylon/blob/master/python/examples/table_compute_examples.py)
\ No newline at end of file
diff --git a/docs/docs/release/cylon_release_0.2.0.md b/docs/docs/release/cylon_release_0.2.0.md
index 09eee33ef..ac2c4b116 100644
--- a/docs/docs/release/cylon_release_0.2.0.md
+++ b/docs/docs/release/cylon_release_0.2.0.md
@@ -5,36 +5,36 @@ sidebar_label: Cylon Release 0.2.0
 ---
 
 Cylon 0.2.0 adds the following features. Please note that this release may not be backward
- compatible with v0.1.0. 
+ compatible with v0.1.0.
 
-## Major Features
+## Major Features
 
-### C++
+### C++
 
-- Adding aggregates and group-by API
+- Adding aggregates and group-by API
 - Creating tables using `std::vector`s or `cylon::Column`s
-- C++ API refactoring
+- C++ API refactoring
 - Major performance improvements in the existing C++ API
 
-### Python (pycylon)
+### Python (PyCylon)
 
 - Extending Cython API for extended development for other Cython/Python libraries
 - Aggregates and Groupby addition
 - Column name-based relational algebra operations and aggregate/groupby ops addition
 - Major performance improvements in the existing Python API
 
-### Java
+### Java (JCylon)
 
-- Performance improvements
+- Performance improvements
 
 You can download source code from [Github](https://github.com/cylondata/cylon/releases)
 
-## Examples
+## Examples
 
 - [C++ examples](https://github.com/cylondata/cylon/tree/0.2.0/cpp/src/examples)
-- [Python examples](https://github.com/cylondata/cylon/tree/0.2.0/python/examples)
+- [Python examples](https://github.com/cylondata/cylon/tree/0.2.0/python/examples)
 - [Java examples](https://github.com/cylondata/cylon/tree/0.2.0/java/src/main/java/org/cylondata/cylon/examples)
 
-## License
+## License
 
-Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
+Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
\ No newline at end of file
diff --git a/python/examples/table_compute_examples.py b/python/examples/table_compute_examples.py
new file mode 100644
index 000000000..fd227a795
--- /dev/null
+++ b/python/examples/table_compute_examples.py
@@ -0,0 +1,174 @@
+##
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##
+
+
+import numpy as np
+import pyarrow as pa
+import pandas as pd
+import pycylon as cn
+from pycylon import CylonContext
+
+ctx: CylonContext = CylonContext(config=None, distributed=False)
+
+columns = 2
+
+data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
+data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32)
+
+nd_array_list = [data1, data2]
+
+ar_array: pa.Array = pa.array(nd_array_list)
+
+ar_table: pa.Table = pa.Table.from_arrays(nd_array_list, names=['x0', 'x1'])
+
+print(ar_table)
+
+ar1 = pa.array([1, 2, 3, 4])
+ar2 = pa.array(['a', 'b', 'c', 'd'])
+
+ar_tb2: pa.Table = pa.Table.from_arrays([ar1, ar2], names=['col1', 'col2'])
+
+print(ar_tb2)
+
+col_names = ['col1', 'col2']
+
+cn_tb1 = cn.Table.from_numpy(ctx, col_names, nd_array_list)
+
+cn_tb1.show()
+
+data_list = [[1, 2, 3, 4], ['p', 'q', 'r', 's']]
+
+cn_tb2 = cn.Table.from_list(ctx, col_names, data_list)
+
+cn_tb2.show()
+
+dict1 = {'col1': [1, 2], 'col2': ['a', 'b']}
+
+ar_tb3: pa.Table = pa.Table.from_pydict(dict1)
+
+print(ar_tb3)
+
+cn_tb3: cn.Table = cn.Table.from_pydict(ctx, dict1)
+
+cn_tb3.show()
+
+pdf = pd.DataFrame(dict1)
+
+# from_pandas parameters: df, schema=None, preserve_index=None, nthreads=None, columns=None, safe=True
+
+cn_tb4: cn.Table = cn.Table.from_pandas(ctx, pdf)
+
+cn_tb4.show()
+
+print(cn_tb4.to_pandas())
+
+dict2 = {'col1': [1, 2], 'col2': [2, 4]}
+
+cn_tb5: cn.Table = cn.Table.from_pydict(ctx, dict2)
+
+npy = cn_tb5.to_numpy()
+print(npy, npy.dtype)
+
+dict3 = cn_tb5.to_pydict()
+
+print(dict3)
+
+print(cn_tb5.column_names)
+
+print(ar_tb2)
+
+print(cn_tb5.to_numpy())
+
+## Aggregate Sum
+
+cn_tb6 = cn_tb5.sum('col1')
+
+cn_tb6.show()
+
+cn_tb7 = cn_tb5.sum(0)
+
+## Aggregate Count
+
+cn_tb8 = cn_tb5.count('col1')
+
+cn_tb8.show()
+
+cn_tb9 = cn_tb5.count(0)
+
+cn_tb9.show()
+
+## Aggregate Min
+
+cn_tb10 = cn_tb5.min('col1')
+
+cn_tb10.show()
+
+cn_tb11 = cn_tb5.min(0)
+
+cn_tb11.show()
+
+## Aggregate Max
+
+cn_tb12 = cn_tb5.max('col1')
+
+cn_tb12.show()
+
+cn_tb13 = cn_tb5.max(0)
+
+cn_tb13.show()
+
+from pycylon.data.aggregates import AggregationOp
+
+op1 = AggregationOp.SUM
+
+assert (op1 == AggregationOp.SUM)
+
+print(op1.name)
+
+dict3 = {'col1': [1, 2, 3, 4, 5, 1, 3, 6, 8, 1, 9, 10], 'col2': [2, 4, 0, 1, 5, 6, 8, 1, 3, 4, 0,
+                                                                 1]}
+
+cn_tb14: cn.Table = cn.Table.from_pydict(ctx, dict3)
+
+cn_tb15 = cn_tb14.groupby(0, [0], [AggregationOp.COUNT])
+
+cn_tb15.show()
+
+df = pd.DataFrame({'AnimalId': [1, 1, 2, 2, 3, 4, 4, 3],
+
+                   'Max Speed': [380., 370., 24., 26., 23.1, 300.1, 310.2, 25.2]})
+
+ar_tb_gb = pa.Table.from_pandas(df)
+
+cn_tb_gb = cn.Table.from_arrow(ctx, ar_tb_gb)
+
+
+pdf1 = df.groupby(['AnimalId']).sum()
+
+print(pdf1)
+
+cn_tb_gb_res = cn_tb_gb.groupby(0, [1], [AggregationOp.SUM]).sort(0)
+
+cn_tb_gb_res.show()
+
+cn_tb_gb_res1 = cn_tb_gb.groupby(0, ['Max Speed'], [AggregationOp.SUM]).sort(0)
+
+cn_tb_gb_res1.show()
+
+cn_tb_gb_res1 = cn_tb_gb.groupby(0, ['Max Speed', 'Max Speed', 'Max Speed'], [AggregationOp.SUM,
+                                                                              AggregationOp.MIN,
+                                                                              AggregationOp.MAX])\
+    .sort(0)
+
+cn_tb_gb_res1.show()
\ No newline at end of file
diff --git a/python/examples/table_relational_algebra.py b/python/examples/table_relational_algebra.py
new file mode 100644
index 000000000..4c7ff0594
--- /dev/null
+++ b/python/examples/table_relational_algebra.py
@@ -0,0 +1,131 @@
+##
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + ## + +from pycylon import Table +from pycylon import CylonContext +from pycylon.io import CSVReadOptions +from pycylon.io import read_csv +from pycylon.net import MPIConfig + +table1_path = '/tmp/user_device_tm_1.csv' +table2_path = '/tmp/user_usage_tm_1.csv' + + +def single_process(): + ctx: CylonContext = CylonContext(config=None, distributed=False) + + csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30) + + tb1: Table = read_csv(ctx, table1_path, csv_read_options) + + tb2: Table = read_csv(ctx, table2_path, csv_read_options) + + print(tb1.column_names) + print(tb2.column_names) + + configs = {'join_type': 'inner', 'algorithm': 'sort'} + + tb3: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=[0], + right_on=[3] + ) + + tb3.show() + + tb4: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=['use_id'], + right_on=['use_id'] + ) + + tb4.show() + + tb4: Table = tb1.join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + on=['use_id'] + ) + + tb4.show() + + # tb5: Table = tb1.join(ctx, table=tb2, + # join_type=configs['join_type'], + # algorithm=configs['algorithm'], + # on=[0] + # ) + # + # tb5.show() + + ctx.finalize() + + +def multi_process(): + mpi_config = MPIConfig() + ctx: CylonContext = CylonContext(config=mpi_config, distributed=True) + + csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30) + + tb1: Table = read_csv(ctx, table1_path, csv_read_options) + + tb2: Table = read_csv(ctx, table2_path, csv_read_options) + + print(tb1.column_names) + print(tb2.column_names) + + configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0, + 'right_col': 0} + + tb3: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=[0], + right_on=[3] + ) + + tb3.show() + + tb4: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + left_on=['use_id'], + right_on=['use_id'] + ) + + tb4.show() + + tb4: Table = tb1.distributed_join(table=tb2, + join_type=configs['join_type'], + algorithm=configs['algorithm'], + on=['use_id'] + ) + + tb4.show() + + # tb5: Table = tb1.distributed_join(ctx, table=tb2, + # join_type=configs['join_type'], + # algorithm=configs['algorithm'], + # on=[0] + # ) + # + # tb5.show() + + ctx.finalize() + + +single_process() + +multi_process()
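
Note for readers trying out the new examples: `table_relational_algebra.py` expects `/tmp/user_device_tm_1.csv` and `/tmp/user_usage_tm_1.csv` to exist. If those files are not at hand, the same join API can be exercised with small in-memory tables built via `Table.from_pydict`, as used in `table_compute_examples.py`. The sketch below is not part of the patch; the column names and values are made up for illustration.

```python
from pycylon import CylonContext, Table

# Local (non-distributed) context, mirroring single_process() above.
ctx: CylonContext = CylonContext(config=None, distributed=False)

# Two tiny tables that share a 'use_id' key column (illustrative data only).
tb1: Table = Table.from_pydict(ctx, {'use_id': [1, 2, 3, 4],
                                     'device_version': [4, 5, 6, 7]})
tb2: Table = Table.from_pydict(ctx, {'use_id': [2, 3, 4, 5],
                                     'monthly_mb': [100.5, 200.25, 50.0, 75.75]})

# Same keyword-style join call used in the examples above.
tb3: Table = tb1.join(table=tb2, join_type='inner', algorithm='sort', on=['use_id'])
tb3.show()

ctx.finalize()
```

The distributed variants are meant to be launched through MPI (for example `mpirun -n 2 python python/examples/table_relational_algebra.py`), since MPI is the only distributed backend supported in this release; the exact launcher options depend on your MPI installation.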