From 5621a0b02cf8f1d6eb50dba712f5ac6c2e0ee1be Mon Sep 17 00:00:00 2001
From: Wilson Beebe
Date: Mon, 18 Sep 2023 16:04:31 -0700
Subject: [PATCH] Make ensemble's object and source fields public

---
 .../binning_slowly_changing_sources.ipynb     |   14 +-
 docs/tutorials/scaling_to_large_data.ipynb    |    2 +-
 .../tutorials/working_with_the_ensemble.ipynb | 1381 ++++++++++++++++-
 src/tape/ensemble.py                          |  170 +-
 tests/tape_tests/test_ensemble.py             |  138 +-
 5 files changed, 1501 insertions(+), 204 deletions(-)

diff --git a/docs/tutorials/binning_slowly_changing_sources.ipynb b/docs/tutorials/binning_slowly_changing_sources.ipynb
index c68fea34..0414ad94 100644
--- a/docs/tutorials/binning_slowly_changing_sources.ipynb
+++ b/docs/tutorials/binning_slowly_changing_sources.ipynb
@@ -60,7 +60,7 @@
 "outputs": [],
 "source": [
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 500)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 500)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -90,7 +90,7 @@
 "source": [
 "ens.bin_sources(time_window=7.0, offset=0.0)\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 500)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 500)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -120,7 +120,7 @@
 "source": [
 "ens.bin_sources(time_window=28.0, offset=0.0, custom_aggr={\"midPointTai\": \"min\"})\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 500)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 500)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -150,7 +150,7 @@
 "ens.from_source_dict(rows, column_mapper=cmap)\n",
 "\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 60)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 60)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -179,7 +179,7 @@
 "ens.bin_sources(time_window=1.0, offset=0.0)\n",
 "\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 60)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 60)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -209,7 +209,7 @@
 "ens.bin_sources(time_window=1.0, offset=0.5)\n",
 "\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 60)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 60)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
@@ -259,7 +259,7 @@
 "ens.bin_sources(time_window=1.0, offset=0.5)\n",
 "\n",
 "fig, ax = plt.subplots(1, 1)\n",
- "_ = ax.hist(ens._source[\"midPointTai\"].compute().tolist(), 60)\n",
+ "_ = ax.hist(ens.source[\"midPointTai\"].compute().tolist(), 60)\n",
 "_ = ax.set_xlabel(\"Time (MJD)\")\n",
 "_ = ax.set_ylabel(\"Source Count\")"
 ]
diff --git a/docs/tutorials/scaling_to_large_data.ipynb b/docs/tutorials/scaling_to_large_data.ipynb
index b1238409..6624b7e2 100644
--- a/docs/tutorials/scaling_to_large_data.ipynb
+++ b/docs/tutorials/scaling_to_large_data.ipynb
@@ -216,7 +216,7 @@
 "\n",
 "print(\"number of lightcurve results in mapres: \", len(mapres))\n",
 "print(\"number of lightcurve results in groupres: \", len(groupres))\n",
- "print(\"True number of lightcurves in the dataset:\", len(np.unique(ens._source.index)))"
+ "print(\"True number of lightcurves in the dataset:\", len(np.unique(ens.source.index)))"
 ]
 },
 {
diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb
index c5098095..3285c016 100644
--- a/docs/tutorials/working_with_the_ensemble.ipynb
+++ b/docs/tutorials/working_with_the_ensemble.ipynb
@@ -68,7 +68,9 @@
 "outputs": [
 {
 "data": {
- "text/plain": ""
+ "text/plain": [
+ ""
+ ]
 },
 "execution_count": 2,
 "metadata": {},
 "output_type": "execute_result"
@@ -119,7 +121,9 @@
 "outputs": [
 {
 "data": {
- "text/plain": ""
+ "text/plain": [
+ ""
+ ]
 },
 "execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
@@ -170,8 +174,67 @@
 "outputs": [
 {
 "data": {
- "text/plain": "Dask DataFrame Structure:\n time flux error band\nnpartitions=1 \n0 float64 float64 float64 string\n9 ... ... ... ...\nDask Name: sort_index, 4 graph layers",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ "Dask DataFrame Structure:\n",
+ " time flux error band\n",
+ "npartitions=1 \n",
+ "0 float64 float64 float64 string\n",
+ "9 ... ... ... ...\n",
+ "Dask Name: sort_index, 4 graph layers"
+ ]
 },
 "execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
- "ens._source # We have not actually loaded any data into memory"
+ "ens.source # We have not actually loaded any data into memory"
 ]
 },
@@ -201,8 +264,138 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n0 1.0 120.851100 11.633225 g\n0 2.0 136.016225 12.635291 g\n0 3.0 100.005719 14.429710 g\n0 4.0 115.116629 11.786349 g\n0 5.0 107.337795 14.542676 g\n.. ... ... ... ...\n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r\n\n[1000 rows x 4 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "0 1.0 120.851100 11.633225 g\n",
+ "0 2.0 136.016225 12.635291 g\n",
+ "0 3.0 100.005719 14.429710 g\n",
+ "0 4.0 115.116629 11.786349 g\n",
+ "0 5.0 107.337795 14.542676 g\n",
+ ".. ... ... ... ...\n",
+ "9 96.0 138.371176 12.237541 r\n",
+ "9 97.0 104.060829 10.920638 r\n",
+ "9 98.0 149.920678 14.143664 r\n",
+ "9 99.0 119.480601 10.154990 r\n",
+ "9 100.0 145.260138 14.733641 r\n",
+ "\n",
+ "[1000 rows x 4 columns]"
+ ]
 },
 "execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -306,8 +499,80 @@
 "outputs": [
 {
 "data": {
- "text/plain": "band nobs_g nobs_r nobs_total\nid \n0 50 50 100\n1 50 50 100\n2 50 50 100\n3 50 50 100\n4 50 50 100",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ "band nobs_g nobs_r nobs_total\n",
+ "id \n",
+ "0 50 50 100\n",
+ "1 50 50 100\n",
+ "2 50 50 100\n",
+ "3 50 50 100\n",
+ "4 50 50 100"
+ ]
 },
 "execution_count": 7,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -330,8 +595,87 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "9 96.0 138.371176 12.237541 r\n",
+ "9 97.0 104.060829 10.920638 r\n",
+ "9 98.0 149.920678 14.143664 r\n",
+ "9 99.0 119.480601 10.154990 r\n",
+ "9 100.0 145.260138 14.733641 r"
+ ]
 },
 "execution_count": 8,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -361,8 +705,138 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n0 1.0 120.851100 11.633225 g\n0 2.0 136.016225 12.635291 g\n0 3.0 100.005719 14.429710 g\n0 4.0 115.116629 11.786349 g\n0 5.0 107.337795 14.542676 g\n.. ... ... ... ...\n9 96.0 138.371176 12.237541 r\n9 97.0 104.060829 10.920638 r\n9 98.0 149.920678 14.143664 r\n9 99.0 119.480601 10.154990 r\n9 100.0 145.260138 14.733641 r\n\n[1000 rows x 4 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "0 1.0 120.851100 11.633225 g\n",
+ "0 2.0 136.016225 12.635291 g\n",
+ "0 3.0 100.005719 14.429710 g\n",
+ "0 4.0 115.116629 11.786349 g\n",
+ "0 5.0 107.337795 14.542676 g\n",
+ ".. ... ... ... ...\n",
+ "9 96.0 138.371176 12.237541 r\n",
+ "9 97.0 104.060829 10.920638 r\n",
+ "9 98.0 149.920678 14.143664 r\n",
+ "9 99.0 119.480601 10.154990 r\n",
+ "9 100.0 145.260138 14.733641 r\n",
+ "\n",
+ "[1000 rows x 4 columns]"
+ ]
 },
 "execution_count": 9,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -396,8 +870,138 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n0 2.0 136.016225 12.635291 g\n0 12.0 134.260975 10.685679 g\n0 14.0 143.905872 13.484091 g\n0 16.0 133.523376 13.777315 g\n0 21.0 140.037228 10.099401 g\n.. ... ... ... ...\n9 91.0 140.368263 14.320720 r\n9 92.0 148.476901 12.239495 r\n9 96.0 138.371176 12.237541 r\n9 98.0 149.920678 14.143664 r\n9 100.0 145.260138 14.733641 r\n\n[422 rows x 4 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "0 2.0 136.016225 12.635291 g\n",
+ "0 12.0 134.260975 10.685679 g\n",
+ "0 14.0 143.905872 13.484091 g\n",
+ "0 16.0 133.523376 13.777315 g\n",
+ "0 21.0 140.037228 10.099401 g\n",
+ ".. ... ... ... ...\n",
+ "9 91.0 140.368263 14.320720 r\n",
+ "9 92.0 148.476901 12.239495 r\n",
+ "9 96.0 138.371176 12.237541 r\n",
+ "9 98.0 149.920678 14.143664 r\n",
+ "9 100.0 145.260138 14.733641 r\n",
+ "\n",
+ "[422 rows x 4 columns]"
+ ]
 },
 "execution_count": 10,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -428,7 +1032,21 @@
 "outputs": [
 {
 "data": {
- "text/plain": "id\n0 False\n0 True\n0 False\n0 False\n0 True\n ... \n9 False\n9 False\n9 False\n9 False\n9 False\nName: error, Length: 422, dtype: bool"
+ "text/plain": [
+ "id\n",
+ "0 False\n",
+ "0 True\n",
+ "0 False\n",
+ "0 False\n",
+ "0 True\n",
+ " ... \n",
+ "9 False\n",
+ "9 False\n",
+ "9 False\n",
+ "9 False\n",
+ "9 False\n",
+ "Name: error, Length: 422, dtype: bool"
+ ]
 },
 "execution_count": 11,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
- "keep_rows = ens._source[\"error\"] < 12.0\n",
+ "keep_rows = ens.source[\"error\"] < 12.0\n",
 "keep_rows.compute()"
 ]
 },
@@ -459,8 +1077,138 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n0 12.0 134.260975 10.685679 g\n0 21.0 140.037228 10.099401 g\n0 22.0 148.413079 10.131055 g\n0 24.0 134.616131 11.231055 g\n0 30.0 143.907125 11.395918 g\n.. ... ... ... ...\n9 81.0 149.016644 10.755373 r\n9 85.0 130.071670 11.960329 r\n9 86.0 136.297942 11.419338 r\n9 88.0 134.215481 11.202422 r\n9 89.0 147.302751 11.271162 r\n\n[169 rows x 4 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "0 12.0 134.260975 10.685679 g\n",
+ "0 21.0 140.037228 10.099401 g\n",
+ "0 22.0 148.413079 10.131055 g\n",
+ "0 24.0 134.616131 11.231055 g\n",
+ "0 30.0 143.907125 11.395918 g\n",
+ ".. ... ... ... ...\n",
+ "9 81.0 149.016644 10.755373 r\n",
+ "9 85.0 130.071670 11.960329 r\n",
+ "9 86.0 136.297942 11.419338 r\n",
+ "9 88.0 134.215481 11.202422 r\n",
+ "9 89.0 147.302751 11.271162 r\n",
+ "\n",
+ "[169 rows x 4 columns]"
+ ]
 },
 "execution_count": 12,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -559,8 +1307,151 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band band2\nid \n0 12.0 134.260975 10.685679 g g2\n0 21.0 140.037228 10.099401 g g2\n0 22.0 148.413079 10.131055 g g2\n0 24.0 134.616131 11.231055 g g2\n0 30.0 143.907125 11.395918 g g2\n.. ... ... ... ... ...\n9 81.0 149.016644 10.755373 r r2\n9 85.0 130.071670 11.960329 r r2\n9 86.0 136.297942 11.419338 r r2\n9 88.0 134.215481 11.202422 r r2\n9 89.0 147.302751 11.271162 r r2\n\n[169 rows x 5 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band band2\n",
+ "id \n",
+ "0 12.0 134.260975 10.685679 g g2\n",
+ "0 21.0 140.037228 10.099401 g g2\n",
+ "0 22.0 148.413079 10.131055 g g2\n",
+ "0 24.0 134.616131 11.231055 g g2\n",
+ "0 30.0 143.907125 11.395918 g g2\n",
+ ".. ... ... ... ... ...\n",
+ "9 81.0 149.016644 10.755373 r r2\n",
+ "9 85.0 130.071670 11.960329 r r2\n",
+ "9 86.0 136.297942 11.419338 r r2\n",
+ "9 88.0 134.215481 11.202422 r r2\n",
+ "9 89.0 147.302751 11.271162 r r2\n",
+ "\n",
+ "[169 rows x 5 columns]"
+ ]
 },
 "execution_count": 14,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "# Add a new column so we can filter it out later.\n",
- "ens._source = ens._source.assign(band2=ens._source[\"band\"] + \"2\")\n",
+ "ens.source = ens.source.assign(band2=ens.source[\"band\"] + \"2\")\n",
 "ens.compute(\"source\")"
 ]
 },
@@ -585,8 +1476,138 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band\nid \n0 12.0 134.260975 10.685679 g\n0 21.0 140.037228 10.099401 g\n0 22.0 148.413079 10.131055 g\n0 24.0 134.616131 11.231055 g\n0 30.0 143.907125 11.395918 g\n.. ... ... ... ...\n9 81.0 149.016644 10.755373 r\n9 85.0 130.071670 11.960329 r\n9 86.0 136.297942 11.419338 r\n9 88.0 134.215481 11.202422 r\n9 89.0 147.302751 11.271162 r\n\n[169 rows x 4 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band\n",
+ "id \n",
+ "0 12.0 134.260975 10.685679 g\n",
+ "0 21.0 140.037228 10.099401 g\n",
+ "0 22.0 148.413079 10.131055 g\n",
+ "0 24.0 134.616131 11.231055 g\n",
+ "0 30.0 143.907125 11.395918 g\n",
+ ".. ... ... ... ...\n",
+ "9 81.0 149.016644 10.755373 r\n",
+ "9 85.0 130.071670 11.960329 r\n",
+ "9 86.0 136.297942 11.419338 r\n",
+ "9 88.0 134.215481 11.202422 r\n",
+ "9 89.0 147.302751 11.271162 r\n",
+ "\n",
+ "[169 rows x 4 columns]"
+ ]
 },
 "execution_count": 15,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -621,8 +1642,151 @@
 "outputs": [
 {
 "data": {
- "text/plain": " time flux error band lower_bnd\nid \n0 12.0 134.260975 10.685679 g 112.889618\n0 21.0 140.037228 10.099401 g 119.838427\n0 22.0 148.413079 10.131055 g 128.150969\n0 24.0 134.616131 11.231055 g 112.154020\n0 30.0 143.907125 11.395918 g 121.115288\n.. ... ... ... ... ...\n9 81.0 149.016644 10.755373 r 127.505899\n9 85.0 130.071670 11.960329 r 106.151012\n9 86.0 136.297942 11.419338 r 113.459267\n9 88.0 134.215481 11.202422 r 111.810638\n9 89.0 147.302751 11.271162 r 124.760428\n\n[169 rows x 5 columns]",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " time flux error band lower_bnd\n",
+ "id \n",
+ "0 12.0 134.260975 10.685679 g 112.889618\n",
+ "0 21.0 140.037228 10.099401 g 119.838427\n",
+ "0 22.0 148.413079 10.131055 g 128.150969\n",
+ "0 24.0 134.616131 11.231055 g 112.154020\n",
+ "0 30.0 143.907125 11.395918 g 121.115288\n",
+ ".. ... ... ... ... ...\n",
+ "9 81.0 149.016644 10.755373 r 127.505899\n",
+ "9 85.0 130.071670 11.960329 r 106.151012\n",
+ "9 86.0 136.297942 11.419338 r 113.459267\n",
+ "9 88.0 134.215481 11.202422 r 111.810638\n",
+ "9 89.0 147.302751 11.271162 r 124.760428\n",
+ "\n",
+ "[169 rows x 5 columns]"
+ ]
 },
 "execution_count": 16,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -656,7 +1820,20 @@
 "outputs": [
 {
 "data": {
- "text/plain": "id\n0 {'g': -0.8833723170736909, 'r': -0.81291313232...\n1 {'g': -0.7866661902102343, 'r': -0.79927945599...\n2 {'g': -0.8650811883274131, 'r': -0.87939085289...\n3 {'g': -0.9140015912865537, 'r': -0.90284371456...\n4 {'g': -0.8232578922439672, 'r': -0.81922455220...\n5 {'g': -0.668795976899231, 'r': -0.784477243304...\n6 {'g': -0.8115552290707235, 'r': -0.90666227394...\n7 {'g': -0.6217573153267577, 'r': -0.60999974938...\n8 {'g': -0.7001359525394822, 'r': -0.73620435205...\n9 {'g': -0.7266040976469818, 'r': -0.68878460237...\nName: stetsonJ, dtype: object"
+ "text/plain": [
+ "id\n",
+ "0 {'g': -0.8833723170736909, 'r': -0.81291313232...\n",
+ "1 {'g': -0.7866661902102343, 'r': -0.79927945599...\n",
+ "2 {'g': -0.8650811883274131, 'r': -0.87939085289...\n",
+ "3 {'g': -0.9140015912865537, 'r': -0.90284371456...\n",
+ "4 {'g': -0.8232578922439672, 'r': -0.81922455220...\n",
+ "5 {'g': -0.668795976899231, 'r': -0.784477243304...\n",
+ "6 {'g': -0.8115552290707235, 'r': -0.90666227394...\n",
+ "7 {'g': -0.6217573153267577, 'r': -0.60999974938...\n",
+ "8 {'g': -0.7001359525394822, 'r': -0.73620435205...\n",
+ "9 {'g': -0.7266040976469818, 'r': -0.68878460237...\n",
+ "Name: stetsonJ, dtype: object"
+ ]
 },
 "execution_count": 17,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
@@ -673,23 +1850,137 @@
 },
 {
 "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
 "source": [
 "## Using light-curve package features\n",
 "\n",
 "`Ensemble.batch` also supports the use of [light-curve](https://pypi.org/project/light-curve/) package feature extractor:"
- ],
- "metadata": {
- "collapsed": false
- }
+ ]
 },
 {
 "cell_type": "code",
 "execution_count": 18,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-08-30T14:58:37.514514Z",
+ "start_time": "2023-08-30T14:58:37.494001Z"
+ },
+ "collapsed": false
+ },
 "outputs": [
 {
 "data": {
- "text/plain": " amplitude anderson_darling_normal stetson_K\nid \n0 7.076052 0.177751 0.834036\n1 8.591493 0.513749 0.769344\n2 8.141189 0.392628 0.856307\n3 5.751674 0.295631 0.809191\n4 7.871321 0.555775 0.849305\n5 8.666473 0.342937 0.823194\n6 8.649326 0.241117 0.832815\n7 8.856443 1.141906 0.772267\n8 9.297713 0.984247 0.968132\n9 8.774109 0.335798 0.754355",
- "text/html": "[HTML table repr stripped during extraction; mirrors the text/plain repr above]"
+ "text/html": [
+ "[HTML table repr stripped during extraction; mirrors the text/plain repr below]"
+ ],
+ "text/plain": [
+ " amplitude anderson_darling_normal stetson_K\n",
+ "id \n",
+ "0 7.076052 0.177751 0.834036\n",
+ "1 8.591493 0.513749 0.769344\n",
+ "2 8.141189 0.392628 0.856307\n",
+ "3 5.751674 0.295631 0.809191\n",
+ "4 7.871321 0.555775 0.849305\n",
+ "5 8.666473 0.342937 0.823194\n",
+ "6 8.649326 0.241117 0.832815\n",
+ "7 8.856443 1.141906 0.772267\n",
+ "8 9.297713 0.984247 0.968132\n",
+ "9 8.774109 0.335798 0.754355"
+ ]
 },
 "execution_count": 18,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "import light_curve as licu\n",
 "\n",
 "extractor = licu.Extractor(licu.Amplitude(), licu.AndersonDarlingNormal(), licu.StetsonK())\n",
 "res = ens.batch(extractor, compute=True, band_to_calc=\"g\")\n",
 "res"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2023-08-30T14:58:37.514514Z",
- "start_time": "2023-08-30T14:58:37.494001Z"
- }
- }
+ ]
 },
 {
 "attachments": {},
@@ -770,7 +2054,20 @@
 "outputs": [
 {
 "data": {
- "text/plain": "id\n0 {'g': 140.03722843377682, 'r': 138.955084796142}\n1 {'g': 140.91515408243285, 'r': 141.44229039903...\n2 {'g': 139.42093950235392, 'r': 142.21649742828...\n3 {'g': 137.01337116218363, 'r': 139.05032340951...\n4 {'g': 134.61800608117045, 'r': 139.76505837028...\n5 {'g': 135.55144382138587, 'r': 139.41361800167...\n6 {'g': 142.93611137557423, 'r': 137.20679606847...\n7 {'g': 144.52647796976, 'r': 132.2470836256106}\n8 {'g': 144.7469760076462, 'r': 137.5226773361662}\n9 {'g': 136.89977482019205, 'r': 136.29794229244...\nName: id, dtype: object"
+ "text/plain": [
+ "id\n",
+ "0 {'g': 140.03722843377682, 'r': 138.955084796142}\n",
+ "1 {'g': 140.91515408243285, 'r': 141.44229039903...\n",
+ "2 {'g': 139.42093950235392, 'r': 142.21649742828...\n",
+ "3 {'g': 137.01337116218363, 'r': 139.05032340951...\n",
+ "4 {'g': 134.61800608117045, 'r': 139.76505837028...\n",
+ "5 {'g': 135.55144382138587, 'r': 139.41361800167...\n",
+ "6 {'g': 142.93611137557423, 'r': 137.20679606847...\n",
+ "7 {'g': 144.52647796976, 'r': 132.2470836256106}\n",
+ "8 {'g': 144.7469760076462, 'r': 137.5226773361662}\n",
+ "9 {'g': 136.89977482019205, 'r': 136.29794229244...\n",
+ "Name: id, dtype: object"
+ ]
 },
 "execution_count": 20,
 "metadata": {},
 "output_type": "execute_result"
diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py
index 0ba059b8..497ab8a1 100644
--- a/src/tape/ensemble.py
+++ b/src/tape/ensemble.py
@@ -33,8 +33,8 @@ def __init__(self, client=True, **kwargs):
         """
         self.result = None  # holds the latest query

-        self._source = None  # Source Table
-        self._object = None  # Object Table
+        self.source = None  # Source Table
+        self.object = None  # Object Table

         self._source_dirty = False  # Source Dirty Flag
         self._object_dirty = False  # Object Dirty Flag
@@ -156,20 +156,20 @@ def insert_sources(
         df2 = df2.set_index(self._id_col, drop=True)

         # Save the divisions and number of partitions.
-        prev_div = self._source.divisions
-        prev_num = self._source.npartitions
+        prev_div = self.source.divisions
+        prev_num = self.source.npartitions

         # Append the new rows to the correct divisions.
-        self._source = dd.concat([self._source, df2], axis=0, interleave_partitions=True)
+        self.source = dd.concat([self.source, df2], axis=0, interleave_partitions=True)
         self._source_dirty = True

         # Do the repartitioning if requested. If the divisions were set, reuse them.
         # Otherwise, use the same number of partitions.
        if force_repartition:
            if all(prev_div):
-                self._source = self._source.repartition(divisions=prev_div)
-            elif self._source.npartitions != prev_num:
-                self._source = self._source.repartition(npartitions=prev_num)
+                self.source = self.source.repartition(divisions=prev_div)
+            elif self.source.npartitions != prev_num:
+                self.source = self.source.repartition(npartitions=prev_num)

     def client_info(self):
         """Calls the Dask Client, which returns cluster information
@@ -204,9 +204,9 @@ def info(self, verbose=True, memory_usage=True, **kwargs):
         self._lazy_sync_tables(table="all")

         print("Object Table")
-        self._object.info(verbose=verbose, memory_usage=memory_usage, **kwargs)
+        self.object.info(verbose=verbose, memory_usage=memory_usage, **kwargs)
         print("Source Table")
-        self._source.info(verbose=verbose, memory_usage=memory_usage, **kwargs)
+        self.source.info(verbose=verbose, memory_usage=memory_usage, **kwargs)

     def compute(self, table=None, **kwargs):
         """Wrapper for dask.dataframe.DataFrame.compute()
@@ -227,12 +227,12 @@ def compute(self, table=None, **kwargs):
         if table:
             self._lazy_sync_tables(table)
             if table == "object":
-                return self._object.compute(**kwargs)
+                return self.object.compute(**kwargs)
             elif table == "source":
-                return self._source.compute(**kwargs)
+                return self.source.compute(**kwargs)
         else:
             self._lazy_sync_tables(table="all")
-            return (self._object.compute(**kwargs), self._source.compute(**kwargs))
+            return (self.object.compute(**kwargs), self.source.compute(**kwargs))

     def persist(self, **kwargs):
         """Wrapper for dask.dataframe.DataFrame.persist()
@@ -243,15 +243,15 @@ def persist(self, **kwargs):
         of the computation.
         """
         self._lazy_sync_tables("all")
-        self._object = self._object.persist(**kwargs)
-        self._source = self._source.persist(**kwargs)
+        self.object = self.object.persist(**kwargs)
+        self.source = self.source.persist(**kwargs)

     def columns(self, table="object"):
         """Retrieve columns from dask dataframe"""
         if table == "object":
-            return self._object.columns
+            return self.object.columns
         elif table == "source":
-            return self._source.columns
+            return self.source.columns
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -260,9 +260,9 @@ def head(self, table="object", n=5, **kwargs):
         self._lazy_sync_tables(table)

         if table == "object":
-            return self._object.head(n=n, **kwargs)
+            return self.object.head(n=n, **kwargs)
         elif table == "source":
-            return self._source.head(n=n, **kwargs)
+            return self.source.head(n=n, **kwargs)
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -271,9 +271,9 @@ def tail(self, table="object", n=5, **kwargs):
         self._lazy_sync_tables(table)

         if table == "object":
-            return self._object.tail(n=n, **kwargs)
+            return self.object.tail(n=n, **kwargs)
         elif table == "source":
-            return self._source.tail(n=n, **kwargs)
+            return self.source.tail(n=n, **kwargs)
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -296,10 +296,10 @@ def dropna(self, table="source", **kwargs):
         scheme
         """
         if table == "object":
-            self._object = self._object.dropna(**kwargs)
+            self.object = self.object.dropna(**kwargs)
             self._object_dirty = True  # This operation modifies the object table
         elif table == "source":
-            self._source = self._source.dropna(**kwargs)
+            self.source = self.source.dropna(**kwargs)
             self._source_dirty = True  # This operation modifies the source table
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -320,12 +320,12 @@ def select(self, columns, table="object"):
         """
         self._lazy_sync_tables(table)
        if table == "object":
-            cols_to_drop = [col for col in self._object.columns if col not in columns]
-            self._object = self._object.drop(cols_to_drop, axis=1)
+            cols_to_drop = [col for col in self.object.columns if col not in columns]
+            self.object = self.object.drop(cols_to_drop, axis=1)
             self._object_dirty = True
         elif table == "source":
-            cols_to_drop = [col for col in self._source.columns if col not in columns]
-            self._source = self._source.drop(cols_to_drop, axis=1)
+            cols_to_drop = [col for col in self.source.columns if col not in columns]
+            self.source = self.source.drop(cols_to_drop, axis=1)
             self._source_dirty = True
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -355,10 +355,10 @@ def query(self, expr, table="object"):
         """
         self._lazy_sync_tables(table)
         if table == "object":
-            self._object = self._object.query(expr)
+            self.object = self.object.query(expr)
             self._object_dirty = True
         elif table == "source":
-            self._source = self._source.query(expr)
+            self.source = self.source.query(expr)
             self._source_dirty = True
         return self

@@ -377,10 +377,10 @@ def filter_from_series(self, keep_series, table="object"):
         """
         self._lazy_sync_tables(table)
         if table == "object":
-            self._object = self._object[keep_series]
+            self.object = self.object[keep_series]
             self._object_dirty = True
         elif table == "source":
-            self._source = self._source[keep_series]
+            self.source = self.source[keep_series]
             self._source_dirty = True
         return self

@@ -412,10 +412,10 @@ def assign(self, table="object", **kwargs):
         self._lazy_sync_tables(table)

         if table == "object":
-            self._object = self._object.assign(**kwargs)
+            self.object = self.object.assign(**kwargs)
             self._object_dirty = True
         elif table == "source":
-            self._source = self._source.assign(**kwargs)
+            self.source = self.source.assign(**kwargs)
             self._source_dirty = True
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -447,9 +447,9 @@ def coalesce(self, input_cols, output_col, table="object", drop_inputs=False):
         """
         # we shouldn't need to sync for this
         if table == "object":
-            table_ddf = self._object
+            table_ddf = self.object
         elif table == "source":
-            table_ddf = self._source
+            table_ddf = self.source
         else:
             raise ValueError(f"{table} is not one of 'object' or 'source'")

@@ -504,9 +504,9 @@ def coalesce_partition(df, input_cols, output_col):
             table_ddf = table_ddf.drop(columns=input_cols)

         if table == "object":
-            self._object = table_ddf
+            self.object = table_ddf
         elif table == "source":
-            self._source = table_ddf
+            self.source = table_ddf

         return self

@@ -533,8 +533,8 @@ def prune(self, threshold=50, col_name=None):
         self._lazy_sync_tables(table="object")

         # Mask on object table
-        mask = self._object[col_name] >= threshold
-        self._object = self._object[mask]
+        mask = self.object[col_name] >= threshold
+        self.object = self.object[mask]

         self._object_dirty = True  # Object Table is now dirty

@@ -562,7 +562,7 @@ def find_day_gap_offset(self):
         self._lazy_sync_tables(table="source")

         # Compute a histogram of observations by hour of the day.
-        hours = self._source[self._time_col].apply(
+        hours = self.source[self._time_col].apply(
             lambda x: np.floor(x * 24.0).astype(int) % 24, meta=pd.Series(dtype=int)
         )
         hour_counts = hours.value_counts().compute()
@@ -638,9 +638,9 @@ def bin_sources(
         # Bin the time and add it as a column. We create a temporary column that
         # truncates the time into increments of `time_window`.
        tmp_time_col = "tmp_time_for_aggregation"
-        if tmp_time_col in self._source.columns:
+        if tmp_time_col in self.source.columns:
             raise KeyError(f"Column '{tmp_time_col}' already exists in source table.")
-        self._source[tmp_time_col] = self._source[self._time_col].apply(
+        self.source[tmp_time_col] = self.source[self._time_col].apply(
             lambda x: np.floor((x + offset) / time_window) * time_window, meta=pd.Series(dtype=float)
         )
@@ -648,7 +648,7 @@
         # Set up the aggregation functions for the time and flux columns.
         aggr_funs = {self._time_col: "mean", self._flux_col: "mean"}

         # If the source table has errors then add an aggregation function for it.
-        if self._err_col in self._source.columns:
+        if self._err_col in self.source.columns:
             aggr_funs[self._err_col] = dd.Aggregation(
                 name="err_agg",
                 chunk=lambda x: (x.count(), x.apply(lambda s: np.sum(np.power(s, 2)))),
@@ -660,8 +660,8 @@
         # adding an initial column of all ones if needed.
         if count_col is not None:
             self._bin_count_col = count_col
-            if self._bin_count_col not in self._source.columns:
-                self._source[self._bin_count_col] = self._source[self._time_col].apply(
+            if self._bin_count_col not in self.source.columns:
+                self.source[self._bin_count_col] = self.source[self._time_col].apply(
                     lambda x: 1, meta=pd.Series(dtype=int)
                 )
             aggr_funs[self._bin_count_col] = "sum"
@@ -675,10 +675,10 @@
                 aggr_funs[key] = custom_aggr[key]

         # Group the columns by id, band, and time bucket and aggregate.
-        self._source = self._source.groupby([self._id_col, self._band_col, tmp_time_col]).aggregate(aggr_funs)
+        self.source = self.source.groupby([self._id_col, self._band_col, tmp_time_col]).aggregate(aggr_funs)

         # Fix the indices and remove the temporary column.
-        self._source = self._source.reset_index().set_index(self._id_col).drop(tmp_time_col, axis=1)
+        self.source = self.source.reset_index().set_index(self._id_col).drop(tmp_time_col, axis=1)

         # Mark the source table as dirty.
        self._source_dirty = True

@@ -776,15 +776,15 @@ def s2n_inter_quartile_range(flux, err):
             on = [on]  # Convert to list if only one column is passed

         # Handle object columns to group on
-        source_cols = list(self._source.columns)
-        object_cols = list(self._object.columns)
+        source_cols = list(self.source.columns)
+        object_cols = list(self.object.columns)
         object_group_cols = [col for col in on if (col in object_cols) and (col not in source_cols)]

         if len(object_group_cols) > 0:
-            object_col_dd = self._object[object_group_cols]
-            source_to_batch = self._source.merge(object_col_dd, how="left")
+            object_col_dd = self.object[object_group_cols]
+            source_to_batch = self.source.merge(object_col_dd, how="left")
         else:
-            source_to_batch = self._source  # Can directly use the source table
+            source_to_batch = self.source  # Can directly use the source table

         id_col = self._id_col  # pre-compute needed for dask in lambda function
@@ -904,26 +904,26 @@ def from_dask_dataframe(
         self._load_column_mapper(column_mapper, **kwargs)

         # Set the index of the source frame and save the resulting table
-        self._source = source_frame.set_index(self._id_col, drop=True)
+        self.source = source_frame.set_index(self._id_col, drop=True)

         if object_frame is None:  # generate an indexed object table from source
-            self._object = self._generate_object_table()
-            self._nobs_bands = [col for col in list(self._object.columns) if col != self._nobs_tot_col]
+            self.object = self._generate_object_table()
+            self._nobs_bands = [col for col in list(self.object.columns) if col != self._nobs_tot_col]
         else:
-            self._object = object_frame
+            self.object = object_frame
             if self._nobs_band_cols is None:
                 # sets empty nobs cols in object
-                unq_filters = np.unique(self._source[self._band_col])
+                unq_filters = np.unique(self.source[self._band_col])
                 self._nobs_band_cols = [f"nobs_{filt}" for filt in unq_filters]
                 for col in self._nobs_band_cols:
-                    self._object[col] = np.nan
+                    self.object[col] = np.nan

             # Handle nobs_total column
             if self._nobs_tot_col is None:
-                self._object["nobs_total"] = np.nan
+                self.object["nobs_total"] = np.nan
                 self._nobs_tot_col = "nobs_total"

-            self._object = self._object.set_index(self._id_col)
+            self.object = self.object.set_index(self._id_col)

         # Optionally sync the tables, recalculates nobs columns
         if sync_tables:
@@ -932,9 +932,9 @@
             self._sync_tables()

         if npartitions and npartitions > 1:
-            self._source = self._source.repartition(npartitions=npartitions)
+            self.source = self.source.repartition(npartitions=npartitions)
         elif partition_size:
-            self._source = self._source.repartition(partition_size=partition_size)
+            self.source = self.source.repartition(partition_size=partition_size)

         return self

@@ -1287,21 +1287,21 @@ def convert_flux_to_mag(self, zero_point, zp_form="mag", out_col_name=None, flux
         if zp_form == "flux":  # mag = -2.5*np.log10(flux/zp)
             if isinstance(zero_point, str):
-                self._source = self._source.assign(
+                self.source = self.source.assign(
                     **{out_col_name: lambda x: -2.5 * np.log10(x[flux_col] / x[zero_point])}
                 )
             else:
-                self._source = self._source.assign(
+                self.source = self.source.assign(
                     **{out_col_name: lambda x: -2.5 * np.log10(x[flux_col] / zero_point)}
                 )

         elif zp_form == "magnitude" or zp_form == "mag":  # mag = -2.5*np.log10(flux) + zp
             if isinstance(zero_point, str):
-                self._source = self._source.assign(
+                self.source = self.source.assign(
                     **{out_col_name: lambda x: -2.5 * np.log10(x[flux_col]) + x[zero_point]}
                 )
             else:
-                self._source = self._source.assign(
+                self.source = self.source.assign(
                     **{out_col_name: lambda x: -2.5 * np.log10(x[flux_col]) + zero_point}
                 )
         else:
@@ -1309,7 +1309,7 @@

         # Calculate Errors
         if err_col is not None:
-            self._source = self._source.assign(
+            self.source = self.source.assign(
                 **{out_col_name + "_err": lambda x: (2.5 / np.log(10)) * (x[err_col] / x[flux_col])}
             )

@@ -1317,7 +1317,7 @@

     def _generate_object_table(self):
         """Generate the object table from the source table."""
-        counts = self._source.groupby([self._id_col, self._band_col])[self._time_col].aggregate("count")
+        counts = self.source.groupby([self._id_col, self._band_col])[self._time_col].aggregate("count")
         res = (
             counts.to_frame()
             .reset_index()
@@ -1327,14 +1327,14 @@

         # If the ensemble's keep_empty_objects attribute is True and there are previous
         # objects, then copy them into the res table with counts of zero.
-        if self.keep_empty_objects and self._object is not None:
-            prev_partitions = self._object.npartitions
+        if self.keep_empty_objects and self.object is not None:
+            prev_partitions = self.object.npartitions

             # Check that there are existing object ids.
-            object_inds = self._object.index.unique().values.compute()
+            object_inds = self.object.index.unique().values.compute()
             if len(object_inds) > 0:
                 # Determine which object IDs are missing from the source table.
-                source_inds = self._source.index.unique().values.compute()
+                source_inds = self.source.index.unique().values.compute()
                 missing_inds = np.setdiff1d(object_inds, source_inds).tolist()

                 # Create a dataframe of the missing IDs with zeros for all bands and counts.
@@ -1390,19 +1390,19 @@ def _sync_tables(self):

         if self._object_dirty:
             # Sync Object to Source; remove any missing objects from source
-            obj_idx = list(self._object.index.compute())
-            self._source = self._source.map_partitions(lambda x: x[x.index.isin(obj_idx)])
-            self._source = self._source.persist()  # persist the source frame
+            obj_idx = list(self.object.index.compute())
+            self.source = self.source.map_partitions(lambda x: x[x.index.isin(obj_idx)])
+            self.source = self.source.persist()  # persist the source frame

         if self._source_dirty:  # not elif
             # Generate a new object table; updates n_obs, removes missing ids
             new_obj = self._generate_object_table()

             # Join old obj to new obj; pulls in other existing obj columns
-            self._object = new_obj.join(self._object, on=self._id_col, how="left", lsuffix="", rsuffix="_old")
-            old_cols = [col for col in list(self._object.columns) if "_old" in col]
-            self._object = self._object.drop(old_cols, axis=1)
-            self._object = self._object.persist()  # persist object
+            self.object = new_obj.join(self.object, on=self._id_col, how="left", lsuffix="", rsuffix="_old")
+            old_cols = [col for col in list(self.object.columns) if "_old" in col]
+            self.object = self.object.drop(old_cols, axis=1)
+            self.object = self.object.persist()  # persist object

             # Now synced and clean
             self._source_dirty = False
@@ -1459,7 +1459,7 @@ def to_timeseries(
         if band_col is None:
             band_col = self._band_col

-        df = self._source.loc[target].compute()
+        df = self.source.loc[target].compute()
         ts = TimeSeries().from_dataframe(
             data=df,
             object_id=target,
@@ -1531,11 +1531,11 @@ def sf2(self, sf_method="basic", argument_container=None, use_map=True):

         if argument_container.combine:
             result = calc_sf2(
-                self._source[self._time_col],
-                self._source[self._flux_col],
-                self._source[self._err_col],
self._source[self._band_col], - self._source.index, + self.source[self._time_col], + self.source[self._flux_col], + self.source[self._err_col], + self.source[self._band_col], + self.source.index, argument_container=argument_container, ) return result diff --git a/tests/tape_tests/test_ensemble.py b/tests/tape_tests/test_ensemble.py index 5ea9d225..f91b3acd 100644 --- a/tests/tape_tests/test_ensemble.py +++ b/tests/tape_tests/test_ensemble.py @@ -45,8 +45,8 @@ def test_from_parquet(data_fixture, request): parquet_ensemble = request.getfixturevalue(data_fixture) # Check to make sure the source and object tables were created - assert parquet_ensemble._source is not None - assert parquet_ensemble._object is not None + assert parquet_ensemble.source is not None + assert parquet_ensemble.object is not None # Check that the data is not empty. obj, source = parquet_ensemble.compute() @@ -65,7 +65,7 @@ def test_from_parquet(data_fixture, request): parquet_ensemble._provenance_col, ]: # Check to make sure the critical quantity labels are bound to real columns - assert parquet_ensemble._source[col] is not None + assert parquet_ensemble.source[col] is not None @pytest.mark.parametrize( @@ -84,8 +84,8 @@ def test_from_dataframe(data_fixture, request): ens = request.getfixturevalue(data_fixture) # Check to make sure the source and object tables were created - assert ens._source is not None - assert ens._object is not None + assert ens.source is not None + assert ens.object is not None # Check that the data is not empty. obj, source = ens.compute() @@ -103,7 +103,7 @@ def test_from_dataframe(data_fixture, request): ens._band_col, ]: # Check to make sure the critical quantity labels are bound to real columns - assert ens._source[col] is not None + assert ens.source[col] is not None # Check that we can compute an analysis function on the ensemble. amplitude = ens.batch(calc_stetson_J) @@ -208,7 +208,7 @@ def test_from_source_dict(dask_client): def test_insert(parquet_ensemble): - num_partitions = parquet_ensemble._source.npartitions + num_partitions = parquet_ensemble.source.npartitions (old_object, old_source) = parquet_ensemble.compute() old_size = old_source.shape[0] @@ -230,7 +230,7 @@ def test_insert(parquet_ensemble): ) # Check we did not increase the number of partitions. - assert parquet_ensemble._source.npartitions == num_partitions + assert parquet_ensemble.source.npartitions == num_partitions # Check that all the new data points are in there. The order may be different # due to the repartitioning. @@ -259,7 +259,7 @@ def test_insert(parquet_ensemble): ) # Check we *did* increase the number of partitions and the size increased. - assert parquet_ensemble._source.npartitions != num_partitions + assert parquet_ensemble.source.npartitions != num_partitions (new_obj, new_source) = parquet_ensemble.compute() assert new_source.shape[0] == old_size + 10 @@ -281,8 +281,8 @@ def test_insert_paritioned(dask_client): # Save the old data for comparison. old_data = ens.compute("source") - old_div = copy.copy(ens._source.divisions) - old_sizes = [len(ens._source.partitions[i]) for i in range(4)] + old_div = copy.copy(ens.source.divisions) + old_sizes = [len(ens.source.partitions[i]) for i in range(4)] assert old_data.shape[0] == num_points # Test an insertion of 5 observations. @@ -295,12 +295,12 @@ def test_insert_paritioned(dask_client): # Check we did not increase the number of partitions and the points # were placed in the correct partitions. 
@@ -281,8 +281,8 @@ def test_insert_paritioned(dask_client):

     # Save the old data for comparison.
     old_data = ens.compute("source")
-    old_div = copy.copy(ens._source.divisions)
-    old_sizes = [len(ens._source.partitions[i]) for i in range(4)]
+    old_div = copy.copy(ens.source.divisions)
+    old_sizes = [len(ens.source.partitions[i]) for i in range(4)]
     assert old_data.shape[0] == num_points

     # Test an insertion of 5 observations.
@@ -295,12 +295,12 @@ def test_insert_paritioned(dask_client):

     # Check we did not increase the number of partitions and the points
     # were placed in the correct partitions.
-    assert ens._source.npartitions == 4
-    assert ens._source.divisions == old_div
-    assert len(ens._source.partitions[0]) == old_sizes[0] + 3
-    assert len(ens._source.partitions[1]) == old_sizes[1]
-    assert len(ens._source.partitions[2]) == old_sizes[2] + 2
-    assert len(ens._source.partitions[3]) == old_sizes[3]
+    assert ens.source.npartitions == 4
+    assert ens.source.divisions == old_div
+    assert len(ens.source.partitions[0]) == old_sizes[0] + 3
+    assert len(ens.source.partitions[1]) == old_sizes[1]
+    assert len(ens.source.partitions[2]) == old_sizes[2] + 2
+    assert len(ens.source.partitions[3]) == old_sizes[3]

     # Check that all the new data points are in there. The order may be different
     # due to the repartitioning.
@@ -318,12 +318,12 @@ def test_insert_paritioned(dask_client):

     # Check we did not increase the number of partitions and the points
     # were placed in the correct partitions.
-    assert ens._source.npartitions == 4
-    assert ens._source.divisions == old_div
-    assert len(ens._source.partitions[0]) == old_sizes[0] + 3
-    assert len(ens._source.partitions[1]) == old_sizes[1] + 5
-    assert len(ens._source.partitions[2]) == old_sizes[2] + 2
-    assert len(ens._source.partitions[3]) == old_sizes[3]
+    assert ens.source.npartitions == 4
+    assert ens.source.divisions == old_div
+    assert len(ens.source.partitions[0]) == old_sizes[0] + 3
+    assert len(ens.source.partitions[1]) == old_sizes[1] + 5
+    assert len(ens.source.partitions[2]) == old_sizes[2] + 2
+    assert len(ens.source.partitions[3]) == old_sizes[3]


 def test_core_wrappers(parquet_ensemble):
@@ -356,9 +356,9 @@ def test_persist(dask_client):
     ens.query("flux <= 1.5", table="source")

     # Compute the task graph size before and after the persist.
-    old_graph_size = len(ens._source.dask)
+    old_graph_size = len(ens.source.dask)
     ens.persist()
-    new_graph_size = len(ens._source.dask)
+    new_graph_size = len(ens.source.dask)
     assert new_graph_size < old_graph_size
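The persist test leans on a general Dask property rather than anything TAPE-specific: persist() executes the pending task graph and replaces it with the materialized partitions, so the graph shrinks once queued deferred operations are evaluated. A minimal sketch of that behavior with plain dask.dataframe (no TAPE involved; exact graph sizes depend on the Dask version):

    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({"flux": [0.5, 1.0, 2.0, 3.0]}), npartitions=2)
    ddf = ddf[ddf["flux"] <= 1.5]  # a deferred filter adds graph layers
    before = len(ddf.dask)
    ddf = ddf.persist()            # materializes partitions in memory
    assert len(ddf.dask) < before  # only the data layer remains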
@@ -480,12 +480,12 @@ def test_dropna(parquet_ensemble):

     # First test dropping na from the 'source' table
     #
-    source_pdf = parquet_ensemble._source.compute()
+    source_pdf = parquet_ensemble.source.compute()
     source_length = len(source_pdf.index)

     # Try dropping NaNs from source and confirm nothing is dropped (there are no NaNs).
     parquet_ensemble.dropna(table="source")
-    assert len(parquet_ensemble._source.compute().index) == source_length
+    assert len(parquet_ensemble.source.compute().index) == source_length

     # Get a valid ID to use and count its occurrences.
     valid_source_id = source_pdf.index.values[1]
@@ -495,23 +495,23 @@ def test_dropna(parquet_ensemble):
     # We do this on the instantiated source (pdf) and convert it back into a
     # Dask DataFrame.
     source_pdf.loc[valid_source_id, parquet_ensemble._flux_col] = pd.NA
-    parquet_ensemble._source = dd.from_pandas(source_pdf, npartitions=1)
+    parquet_ensemble.source = dd.from_pandas(source_pdf, npartitions=1)

     # Try dropping NaNs from source and confirm that we did.
     parquet_ensemble.dropna(table="source")
-    assert len(parquet_ensemble._source.compute().index) == source_length - occurrences_source
+    assert len(parquet_ensemble.source.compute().index) == source_length - occurrences_source

     # Sync the table and check that the number of objects decreased.
     # parquet_ensemble._sync_tables()

     # Now test dropping na from 'object' table
     #
-    object_pdf = parquet_ensemble._object.compute()
+    object_pdf = parquet_ensemble.object.compute()
     object_length = len(object_pdf.index)

     # Try dropping NaNs from object and confirm nothing is dropped (there are no NaNs).
     parquet_ensemble.dropna(table="object")
-    assert len(parquet_ensemble._object.compute().index) == object_length
+    assert len(parquet_ensemble.object.compute().index) == object_length

     # Get a valid object id and set at least two occurrences of that id in the object table
     valid_object_id = object_pdf.index.values[1]
@@ -521,14 +521,14 @@ def test_dropna(parquet_ensemble):
     # Set the nobs_g values for one object to NaN so we can drop it.
     # We do this on the instantiated object (pdf) and convert it back into a
     # Dask DataFrame.
-    object_pdf.loc[valid_object_id, parquet_ensemble._object.columns[0]] = pd.NA
-    parquet_ensemble._object = dd.from_pandas(object_pdf, npartitions=1)
+    object_pdf.loc[valid_object_id, parquet_ensemble.object.columns[0]] = pd.NA
+    parquet_ensemble.object = dd.from_pandas(object_pdf, npartitions=1)

     # Try dropping NaNs from object and confirm that we did.
     parquet_ensemble.dropna(table="object")
-    assert len(parquet_ensemble._object.compute().index) == object_length - occurrences_object
+    assert len(parquet_ensemble.object.compute().index) == object_length - occurrences_object

-    new_objects_pdf = parquet_ensemble._object.compute()
+    new_objects_pdf = parquet_ensemble.object.compute()
     assert len(new_objects_pdf.index) == len(object_pdf.index) - occurrences_object

     # Assert the filtered ID is no longer in the objects.
@@ -544,24 +544,24 @@ def test_keep_zeros(parquet_ensemble):
     """Test that we can sync the tables and keep objects with zero sources."""
     parquet_ensemble.keep_empty_objects = True

-    prev_npartitions = parquet_ensemble._object.npartitions
-    old_objects_pdf = parquet_ensemble._object.compute()
-    pdf = parquet_ensemble._source.compute()
+    prev_npartitions = parquet_ensemble.object.npartitions
+    old_objects_pdf = parquet_ensemble.object.compute()
+    pdf = parquet_ensemble.source.compute()

     # Set the psFlux values for one object to NaN so we can drop it.
     # We do this on the instantiated object (pdf) and convert it back into a
     # Dask DataFrame.
     valid_id = pdf.index.values[1]
     pdf.loc[valid_id, parquet_ensemble._flux_col] = pd.NA
-    parquet_ensemble._source = dd.from_pandas(pdf, npartitions=1)
+    parquet_ensemble.source = dd.from_pandas(pdf, npartitions=1)

     # Sync the table and check that the number of objects decreased.
     parquet_ensemble.dropna(table="source")
     parquet_ensemble._sync_tables()

-    new_objects_pdf = parquet_ensemble._object.compute()
+    new_objects_pdf = parquet_ensemble.object.compute()
     assert len(new_objects_pdf.index) == len(old_objects_pdf.index)
-    assert parquet_ensemble._object.npartitions == prev_npartitions
+    assert parquet_ensemble.object.npartitions == prev_npartitions

     # Check that all counts have stayed the same except the filtered index,
     # which should now be all zeros.
@@ -580,7 +580,7 @@ def test_prune(parquet_ensemble):
     threshold = 10
     parquet_ensemble.prune(threshold)

-    assert not np.any(parquet_ensemble._object["nobs_total"].values < threshold)
+    assert not np.any(parquet_ensemble.object["nobs_total"].values < threshold)


 def test_query(dask_client):
@@ -622,7 +622,7 @@ def test_filter_from_series(dask_client):
     ens.from_source_dict(rows, column_mapper=cmap, npartitions=2)

     # Filter the data set to late-time sources only.
-    keep_series = ens._source[ens._time_col] >= 250.0
+    keep_series = ens.source[ens._time_col] >= 250.0
     ens.filter_from_series(keep_series, table="source")

     # Check that all of the filtered rows are valid.
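In isolation, the filter_from_series pattern is: build a Dask boolean Series from the public source table, then hand it to the ensemble. A hedged sketch, reusing the toy ensemble from the earlier example (the 250.0 cutoff is arbitrary):

    # Keep only sources observed at time >= 250.0.
    keep = ens.source["time"] >= 250.0  # boolean Series aligned with source
    ens.filter_from_series(keep, table="source")
    assert bool((ens.source["time"] >= 250.0).all().compute())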
@@ -647,22 +647,22 @@ def test_select(dask_client):
     }
     cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
     ens.from_source_dict(rows, column_mapper=cmap, npartitions=2)
-    assert len(ens._source.columns) == 5
-    assert "time" in ens._source.columns
-    assert "flux" in ens._source.columns
-    assert "band" in ens._source.columns
-    assert "count" in ens._source.columns
-    assert "something_else" in ens._source.columns
+    assert len(ens.source.columns) == 5
+    assert "time" in ens.source.columns
+    assert "flux" in ens.source.columns
+    assert "band" in ens.source.columns
+    assert "count" in ens.source.columns
+    assert "something_else" in ens.source.columns

     # Select on just time and flux
     ens.select(["time", "flux"], table="source")
-    assert len(ens._source.columns) == 2
-    assert "time" in ens._source.columns
-    assert "flux" in ens._source.columns
-    assert "band" not in ens._source.columns
-    assert "count" not in ens._source.columns
-    assert "something_else" not in ens._source.columns
+    assert len(ens.source.columns) == 2
+    assert "time" in ens.source.columns
+    assert "flux" in ens.source.columns
+    assert "band" not in ens.source.columns
+    assert "count" not in ens.source.columns
+    assert "something_else" not in ens.source.columns


 def test_assign(dask_client):
@@ -679,13 +679,13 @@ def test_assign(dask_client):
     }
     cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
     ens.from_source_dict(rows, column_mapper=cmap, npartitions=1)
-    assert len(ens._source.columns) == 4
-    assert "lower_bnd" not in ens._source.columns
+    assert len(ens.source.columns) == 4
+    assert "lower_bnd" not in ens.source.columns

     # Insert a new column for the "lower bound" computation.
     ens.assign(table="source", lower_bnd=lambda x: x["flux"] - 2.0 * x["err"])
-    assert len(ens._source.columns) == 5
-    assert "lower_bnd" in ens._source.columns
+    assert len(ens.source.columns) == 5
+    assert "lower_bnd" in ens.source.columns

     # Check the values in the new column.
     new_source = ens.compute(table="source")
@@ -695,10 +695,10 @@ def test_assign(dask_client):
         assert new_source.iloc[i]["lower_bnd"] == expected

     # Create a series directly from the table.
-    res_col = ens._source["band"] + "2"
+    res_col = ens.source["band"] + "2"
     ens.assign(table="source", band2=res_col)
-    assert len(ens._source.columns) == 6
-    assert "band2" in ens._source.columns
+    assert len(ens.source.columns) == 6
+    assert "band2" in ens.source.columns

     # Check the values in the new column.
     new_source = ens.compute(table="source")
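Taken together, the assign tests cover both supported input styles: a callable evaluated lazily per partition, and a precomputed Dask Series derived from the public source table. In sketch form, against the same toy ensemble as above:

    # Style 1: a callable, applied per partition.
    ens.assign(table="source", lower_bnd=lambda x: x["flux"] - 2.0 * x["err"])
    assert "lower_bnd" in ens.source.columns

    # Style 2: a Dask Series built from the public source attribute.
    band2 = ens.source["band"] + "2"
    ens.assign(table="source", band2=band2)
    assert "band2" in ens.source.columns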
@@ -729,7 +729,7 @@ def test_coalesce(dask_client, drop_inputs):
     ens.coalesce(["flux1", "flux2", "flux3"], "flux", table="source", drop_inputs=drop_inputs)

     # Coalesce should return this exact flux array
-    assert list(ens._source["flux"].values.compute()) == [5.0, 3.0, 4.0, 10.0, 7.0]
+    assert list(ens.source["flux"].values.compute()) == [5.0, 3.0, 4.0, 10.0, 7.0]

     if drop_inputs:
         # The column mapping should be updated
@@ -737,7 +737,7 @@ def test_coalesce(dask_client, drop_inputs):

         # The columns to drop should be dropped
         for col in ["flux1", "flux2", "flux3"]:
-            assert col not in ens._source.columns
+            assert col not in ens.source.columns

         # Test for the drop warning
         with pytest.warns(UserWarning):
@@ -746,7 +746,7 @@ def test_coalesce(dask_client, drop_inputs):
     else:
         # The input columns should still be present
         for col in ["flux1", "flux2", "flux3"]:
-            assert col in ens._source.columns
+            assert col in ens.source.columns


 @pytest.mark.parametrize("zero_point", [("zp_mag", "zp_flux"), (25.0, 10**10)])
@@ -777,19 +777,19 @@ def test_convert_flux_to_mag(dask_client, zero_point, zp_form, out_col_name):

     if zp_form == "flux":
         ens.convert_flux_to_mag(zero_point[1], zp_form, out_col_name)

-        res_mag = ens._source.compute()[output_column].to_list()[0]
+        res_mag = ens.source.compute()[output_column].to_list()[0]
         assert pytest.approx(res_mag, 0.001) == 21.28925

-        res_err = ens._source.compute()[output_column + "_err"].to_list()[0]
+        res_err = ens.source.compute()[output_column + "_err"].to_list()[0]
         assert pytest.approx(res_err, 0.001) == 0.355979

     elif zp_form == "mag" or zp_form == "magnitude":
         ens.convert_flux_to_mag(zero_point[0], zp_form, out_col_name)

-        res_mag = ens._source.compute()[output_column].to_list()[0]
+        res_mag = ens.source.compute()[output_column].to_list()[0]
         assert pytest.approx(res_mag, 0.001) == 21.28925

-        res_err = ens._source.compute()[output_column + "_err"].to_list()[0]
+        res_err = ens.source.compute()[output_column + "_err"].to_list()[0]
         assert pytest.approx(res_err, 0.001) == 0.355979

     else: