diff --git a/.gitignore b/.gitignore
index 8baa3870..a1d83942 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,6 +71,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/autoapi/
 _readthedocs/
 
 # PyBuilder
diff --git a/docs/Makefile b/docs/Makefile
index a5622f10..05e02bcd 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -4,7 +4,7 @@
 # You can set these variables from the command line, and also
 # from the environment for the first two.
 SPHINXOPTS ?= -T -E -d _build/doctrees -D language=en
-EXCLUDENB ?= -D exclude_patterns="notebooks/*","_build","**.ipynb_checkpoints"
+EXCLUDENB ?= -D exclude_patterns="notebooks/*","_build","**.ipynb_checkpoints","**.ipynb"
 SPHINXBUILD ?= sphinx-build
 SOURCEDIR = .
 BUILDDIR = ../_readthedocs/
diff --git a/src/tape/analysis/base.py b/src/tape/analysis/base.py
index ee6a81a4..569c68c7 100644
--- a/src/tape/analysis/base.py
+++ b/src/tape/analysis/base.py
@@ -68,15 +68,14 @@ def meta(self, ens: "Ensemble"):
 
     @abstractmethod
     def on(self, ens: "Ensemble") -> List[str]:
-        """
-        Return the columns to group source table by.
+        """Return the columns to group source table by.
 
        Parameters
        ----------
        ens : Ensemble
            The ensemble object.
 
-        Returns:
+        Returns
        --------
        List[str]
            The column names to group by. Typically, `[ens._id_col]`.
diff --git a/src/tape/analysis/stetsonj.py b/src/tape/analysis/stetsonj.py
index dc8d8790..8d9f24a3 100644
--- a/src/tape/analysis/stetsonj.py
+++ b/src/tape/analysis/stetsonj.py
@@ -51,7 +51,7 @@ def __call__(
         stetsonJ : `dict`
             StetsonJ statistic for each of input bands.
 
-        Notes
+        Note
         ----------
         In case that no value for `band_to_calc` is passed, the function is
         executed on all available bands in `band`.
@@ -113,7 +113,7 @@ def _stetson_J_single(fluxes, errors):
     .. [1] Stetson, P. B., "On the Automatic Determination of Light-Curve
        Parameters for Cepheid Variables", PASP, 108, 851S, 1996
 
-    Notes
+    Note
     ----------
     Taken from
     https://github.com/lsst/meas_base/blob/main/python/lsst/meas/base/diaCalculationPlugins.py
@@ -168,7 +168,7 @@ def _stetson_J_mean(values, errors, mean=None, alpha=2.0, beta=2.0, n_iter=20, t
     .. [1] Stetson, P. B., "On the Automatic Determination of Light-Curve
        Parameters for Cepheid Variables", PASP, 108, 851S, 1996
 
-    Notes
+    Note
     ----------
     Taken from
     https://github.com/lsst/meas_base/blob/main/python/lsst/meas/base/diaCalculationPlugins.py
diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py
index 44d963c7..46959645 100644
--- a/src/tape/ensemble.py
+++ b/src/tape/ensemble.py
@@ -104,18 +104,19 @@ def add_frame(self, frame, label):
 
         Parameters
         ----------
-        frame: `tape.ensemble.EnsembleFrame`
+        frame: `tape.ensemble_frame.EnsembleFrame`
             The frame object for the Ensemble to track.
         label: `str`
-            | The label for the Ensemble to use to track the frame.
+            The label for the Ensemble to use to track the frame.
 
         Returns
         -------
-        self: `Ensemble`
+        Ensemble
 
         Raises
         ------
-        ValueError if the label is "source", "object", or already tracked by the Ensemble.
+        ValueError
+            if the label is "source", "object", or already tracked by the Ensemble.
         """
         if label == SOURCE_FRAME_LABEL or label == OBJECT_FRAME_LABEL:
             raise ValueError(f"Unable to add frame with reserved label " f"'{label}'")
@@ -138,12 +139,13 @@ def update_frame(self, frame):
 
         Returns
         -------
-        self: `Ensemble`
+        Ensemble
 
         Raises
         ------
-        ValueError if the `frame.label` is unpopulated, or if the frame is not a SourceFrame or ObjectFrame
-        but uses the reserved labels.
+        ValueError
+            if the `frame.label` is unpopulated, or if the frame is not a SourceFrame or ObjectFrame
+            but uses the reserved labels.
         """
         if frame.label is None:
             raise ValueError(f"Unable to update frame with no populated `EnsembleFrame.label`.")
@@ -167,16 +169,18 @@ def drop_frame(self, label):
         Parameters
         ----------
         label: `str`
-            | The label of the frame to be dropped by the Ensemble.
+            The label of the frame to be dropped by the Ensemble.
 
         Returns
         -------
-        self: `Ensemble`
+        Ensemble
 
         Raises
         ------
-        ValueError if the label is "source", or "object".
-        KeyError if the label is not tracked by the Ensemble.
+        ValueError
+            if the label is "source", or "object".
+        KeyError
+            if the label is not tracked by the Ensemble.
         """
         if label == SOURCE_FRAME_LABEL or label == OBJECT_FRAME_LABEL:
             raise ValueError(f"Unable to drop frame with reserved label " f"'{label}'")
@@ -191,15 +195,16 @@ def select_frame(self, label):
         Parameters
         ----------
         label: `str`
-            | The label of a frame tracked by the Ensemble to be selected.
+            The label of a frame tracked by the Ensemble to be selected.
 
         Returns
         -------
-        result: `tape.ensemble.EnsembleFrame`
+        tape.ensemble_frame.EnsembleFrame
 
         Raises
         ------
-        KeyError if the label is not tracked by the Ensemble.
+        KeyError
+            if the label is not tracked by the Ensemble.
         """
         if label not in self.frames:
             raise KeyError(
@@ -229,7 +234,8 @@ def frame_info(self, labels=None, verbose=True, memory_usage=True, **kwargs):
 
         Raises
         ------
-        KeyError if a label in labels is not tracked by the Ensemble.
+        KeyError
+            if a label in labels is not tracked by the Ensemble.
         """
         if labels is None:
             labels = self.frames.keys()
@@ -265,7 +271,7 @@ def insert_sources(
     ):
         """Manually insert sources into the ensemble.
 
-        Requires, at a minimum, the object’s ID and the band, timestamp,
+        Requires, at a minimum, the object's ID and the band, timestamp,
         and flux of the observation.
 
         Note
@@ -364,6 +370,7 @@ def info(self, verbose=True, memory_usage=True, **kwargs):
         memory_usage: `bool`, optional
             Specifies whether total memory usage of the DataFrame
             elements (including the index) should be displayed.
+
         Returns
         ----------
         None
@@ -377,8 +384,7 @@ def info(self, verbose=True, memory_usage=True, **kwargs):
         self.source.info(verbose=verbose, memory_usage=memory_usage, **kwargs)
 
     def check_sorted(self, table="object"):
-        """Checks to see if an Ensemble Dataframe is sorted (increasing) on
-        the index.
+        """Checks to see if an Ensemble Dataframe is sorted (increasing) on the index.
 
         Parameters
         ----------
@@ -387,8 +393,8 @@ def check_sorted(self, table="object"):
 
         Returns
         -------
-        A boolean value indicating whether the index is sorted (True)
-        or not (False)
+        boolean
+            indicating whether the index is sorted (True) or not (False)
         """
         if table == "object":
             idx = self.object.index
@@ -412,10 +418,10 @@ def check_lightcurve_cohesion(self):
 
         Returns
         -------
-        A boolean value indicating whether the sources tied to a given object
-        are only found in a single partition (True), or if they are split
-        across multiple partitions (False)
-
+        boolean
+            indicates whether the sources tied to a given object are only found
+            in a single partition (True), or if they are split across multiple
+            partitions (False)
         """
         idx = self.source.index
         counts = idx.map_partitions(lambda a: Counter(a.unique())).compute()
@@ -440,8 +446,9 @@ def compute(self, table=None, **kwargs):
 
         Returns
         -------
-        A single pandas data frame for the specified table or a tuple of (object, source)
-        data frames.
+        `pd.DataFrame`
+            A single pandas data frame for the specified table or a tuple of
+            (object, source) data frames.
         """
         if table:
             self._lazy_sync_tables(table)
@@ -559,14 +566,17 @@ def query(self, expr, table="object"):
 
         Examples
         --------
-        # Keep sources with flux above 100.0:
-        ens.query("flux > 100", table="source")
+        Keep sources with flux above 100.0::
 
-        # Keep sources in the green band:
-        ens.query("band_col_name == 'g'", table="source")
+            ens.query("flux > 100", table="source")
 
-        # Filtering on the flux column without knowing its name:
-        ens.query(f"{ens._flux_col} > 100", table="source")
+        Keep sources in the green band::
+
+            ens.query("band_col_name == 'g'", table="source")
+
+        Filtering on the flux column without knowing its name::
+
+            ens.query(f"{ens._flux_col} > 100", table="source")
         """
         self._lazy_sync_tables(table)
         if table == "object":
@@ -622,11 +632,13 @@ def assign(self, table="object", temporary=False, **kwargs):
 
         Examples
         --------
-        # Direct assignment of my_series to a column named "new_column".
-        ens.assign(table="object", new_column=my_series)
+        Direct assignment of my_series to a column named "new_column"::
+
+            ens.assign(table="object", new_column=my_series)
+
+        Subtract the value in "err" from the value in "flux"::
 
-        # Subtract the value in "err" from the value in "flux".
-        ens.assign(table="source", lower_bnd=lambda x: x["flux"] - 2.0 * x["err"])
+            ens.assign(table="source", lower_bnd=lambda x: x["flux"] - 2.0 * x["err"])
         """
         self._lazy_sync_tables(table)
 
@@ -869,12 +881,12 @@ def bin_sources(
 
         Notes
         -----
         * This should only be used for slowly varying sources where we can
-        treat the source as constant within `time_window`.
+          treat the source as constant within `time_window`.
         * As a default the function only aggregates and keeps the id, band,
-        time, flux, and flux error columns. Additional columns can be preserved
-        by providing the mapping of column name to aggregation function with the
-        `additional_cols` parameter.
+          time, flux, and flux error columns. Additional columns can be preserved
+          by providing the mapping of column name to aggregation function with the
+          `additional_cols` parameter.
 
         """
         self._lazy_sync_tables(table="source")
@@ -991,31 +1003,28 @@ def batch(self, func, *args, meta=None, by_band=False, use_map=True, on=None, la
 
         Examples
         --------
-        Run a TAPE function on the ensemble:
-        ```
-        from tape.analysis.stetsonj import calc_stetson_J
-        ens = Ensemble().from_dataset('rrlyr82')
-        ensemble.batch(calc_stetson_J, band_to_calc='i')
-        ```
-
-        Run a light-curve function on the ensemble:
-        ```
-        from light_curve import EtaE
-        ens.batch(EtaE(), band_to_calc='g')
-        ```
-
-        Run a custom function on the ensemble:
-        ```
-        def s2n_inter_quartile_range(flux, err):
-            first, third = np.quantile(flux / err, [0.25, 0.75])
-            return third - first
-
-        ens.batch(s2n_inter_quartile_range, ens._flux_col, ens._err_col)
-        ```
-        Or even a numpy built-in function:
-        ```
-        amplitudes = ens.batch(np.ptp, ens._flux_col)
-        ```
+        Run a TAPE function on the ensemble::
+
+            from tape.analysis.stetsonj import calc_stetson_J
+            ens = Ensemble().from_dataset('rrlyr82')
+            ens.batch(calc_stetson_J, band_to_calc='i')
+
+        Run a light-curve function on the ensemble::
+
+            from light_curve import EtaE
+            ens.batch(EtaE(), band_to_calc='g')
+
+        Run a custom function on the ensemble::
+
+            def s2n_inter_quartile_range(flux, err):
+                first, third = np.quantile(flux / err, [0.25, 0.75])
+                return third - first
+
+            ens.batch(s2n_inter_quartile_range, ens._flux_col, ens._err_col)
+
+        Or even a numpy built-in function::
+
+            amplitudes = ens.batch(np.ptp, ens._flux_col)
         """
         self._lazy_sync_tables(table="all")
 
@@ -1507,6 +1516,7 @@ def from_dask_dataframe(
 
     def from_hipscat(self, dir, source_subdir="source", object_subdir="object", column_mapper=None, **kwargs):
         """Read in parquet files from a hipscat-formatted directory structure
+
         Parameters
         ----------
         dir: 'str'
@@ -1900,7 +1910,7 @@ def _lazy_sync_tables_from_frame(self, frame):
 
         Parameters
         ----------
-        frame: `tape.EnsembleFrame`
+        frame: `tape.ensemble_frame.EnsembleFrame`
             The frame being modified. Only an `ObjectFrame` or `SourceFrame
             tracked by this `Ensemble` may trigger a sync.
 
@@ -2144,7 +2154,7 @@ def sf2(self, sf_method="basic", argument_container=None, use_map=True):
         result : `pandas.DataFrame`
             Structure function squared for each of input bands.
 
-        Notes
+        Note
         ----------
         In case that no value for `band_to_calc` is passed, the function is
         executed on all available bands in `band`.
@@ -2186,7 +2196,7 @@ def _translate_meta(self, meta):
         Returns
         ----------
         result : `ensemble.TapeFrame` or `ensemble.TapeSeries`
-            The appropriate meta for Dask producing an `Ensemble.EnsembleFrame` or
+            The appropriate meta for Dask producing a `tape.ensemble_frame.EnsembleFrame` or
             `Ensemble.EnsembleSeries` respectively
         """
         if isinstance(meta, TapeFrame) or isinstance(meta, TapeSeries):
diff --git a/src/tape/ensemble_frame.py b/src/tape/ensemble_frame.py
index 352285ca..b1005fdf 100644
--- a/src/tape/ensemble_frame.py
+++ b/src/tape/ensemble_frame.py
@@ -120,12 +120,12 @@ def _args(self):
         return super()._args + (self.label, self.ensemble)
 
     def _propagate_metadata(self, new_frame):
-        """Propagatees any relevant metadata to a new frame.
+        """Propagates any relevant metadata to a new frame.
 
         Parameters
         ----------
         new_frame: `_Frame`
-            | A frame to propage metadata to
+            A frame to propagate metadata to
 
         Returns
         ----------
@@ -156,7 +156,7 @@ def assign(self, **kwargs):
         **kwargs: `dict`
             The column names are keywords. If the values are callable, they are computed
             on the DataFrame and assigned to the new columns. The callable must not change input DataFrame
-            (though pandas doesn’t check it). If the values are not callable, (e.g. a Series,
+            (though pandas doesn't check it). If the values are not callable, (e.g. a Series,
             scalar, or array), they are simply assigned.
 
         Returns
@@ -256,7 +256,7 @@ def merge(self, right, **kwargs):
             Categorical-type and takes on a value of "left_only" for observations
             whose merge key only appears in `left` DataFrame, "right_only" for
             observations whose merge key only appears in `right` DataFrame,
-            and "both" if the observation’s merge key is found in both.
+            and "both" if the observation's merge key is found in both.
         npartitions: int or None, optional
             The ideal number of output partitions. This is only utilised when performing
             a hash_join (merging on columns only). If ``None`` then
@@ -384,7 +384,7 @@ def drop(self, labels=None, axis=0, columns=None, errors="raise"):
         axis : {0 or 'index', 1 or 'columns'}, default 0
             Whether to drop labels from the index (0 or 'index') or
             columns (1 or 'columns').
-            is equivalent to ``index=labels``).
+            is equivalent to ``index=labels``.
         columns : single label or list-like
             Alternative to specifying axis (``labels, axis=1``
             is equivalent to ``columns=labels``).
@@ -708,12 +708,14 @@ class EnsembleFrame(_Frame, dd.core.DataFrame):
     The underlying non-parallel dataframes are TapeFrames and TapeSeries which extend
     Pandas frames.
 
-    Example
+    Examples
     ----------
-    import tape
-    ens = tape.Ensemble()
-    data = {...} # Some data you want tracked by the Ensemble
-    ensemble_frame = tape.EnsembleFrame.from_dict(data, label="my_frame", ensemble=ens)
+    Instantiation::
+
+        import tape
+        ens = tape.Ensemble()
+        data = {...} # Some data you want tracked by the Ensemble
+        ensemble_frame = tape.EnsembleFrame.from_dict(data, label="my_frame", ensemble=ens)
     """
 
     _partition_type = TapeFrame  # Tracks the underlying data type
@@ -728,6 +730,7 @@ def __getitem__(self, key):
     @classmethod
     def from_tapeframe(cls, data, npartitions=None, chunksize=None, sort=True, label=None, ensemble=None):
         """Returns an EnsembleFrame constructed from a TapeFrame.
+
         Parameters
         ----------
         data: `TapeFrame`
@@ -741,10 +744,12 @@ def from_tapeframe(cls, data, npartitions=None, chunksize=None, sort=True, label
         sort: `bool`, optional
             Whether to sort the frame by a default index.
         label: `str`, optional
-            | The label used to by the Ensemble to identify the frame.
+            The label used by the Ensemble to identify the frame.
         ensemble: `tape.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.EnsembleFrame`
             The constructed EnsembleFrame object.
         """
@@ -756,15 +761,18 @@ def from_tapeframe(cls, data, npartitions=None, chunksize=None, sort=True, label
     @classmethod
     def from_dask_dataframe(cl, df, ensemble=None, label=None):
         """Returns an EnsembleFrame constructed from a Dask dataframe.
+
         Parameters
         ----------
         df: `dask.dataframe.DataFrame` or `list`
             a Dask dataframe to convert to an EnsembleFrame
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
         label: `str`, optional
-            | The label used to by the Ensemble to identify the frame.
+            The label used by the Ensemble to identify the frame.
 
+        Returns
+        ----------
         result: `tape.EnsembleFrame`
             The constructed EnsembleFrame object.
         """
@@ -779,6 +787,7 @@ def update_ensemble(self):
         """Updates the Ensemble linked by the `EnsembelFrame.ensemble` property
         to track this frame.
 
         Returns
+        ----------
         result: `tape.Ensemble`
             The Ensemble object which tracks this frame, `None` if no such Ensemble.
@@ -820,6 +829,7 @@ def convert_flux_to_mag(
             The name of the output magnitude column, if None then the output
             is just the flux column name + "_mag". The error column is also
             generated as the out_col_name + "_err".
+
         Returns
         ----------
         result: `tape.EnsembleFrame`
@@ -864,7 +874,6 @@ def coalesce(self, input_cols, output_col, drop_inputs=False):
         -------
         ensemble: `tape.ensemble.Ensemble`
             An ensemble object.
-
         """
 
         def coalesce_partition(df, input_cols, output_col):
@@ -923,6 +932,7 @@ def coalesce_partition(df, input_cols, output_col):
     @classmethod
     def from_parquet(cl, path, index=None, columns=None, label=None, ensemble=None, **kwargs):
         """Returns an EnsembleFrame constructed from loading a parquet file.
+
         Parameters
         ----------
         path: `str` or `list`
@@ -939,10 +949,12 @@ def from_parquet(
             be read (as determined by the pandas parquet metadata, if present). Provide a
             single field name instead of a list to read in the data as a Series.
         label: `str`, optional
-            | The label used to by the Ensemble to identify the frame.
+            The label used by the Ensemble to identify the frame.
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.EnsembleFrame`
             The constructed EnsembleFrame object.
         """
@@ -1013,6 +1025,7 @@ def from_parquet(
         ensemble=None,
     ):
         """Returns a SourceFrame constructed from loading a parquet file.
+
         Parameters
         ----------
         path: `str` or `list`
@@ -1029,8 +1042,10 @@ def from_parquet(
             inferred from the pandas parquet file metadata, if present. Use False
             to read all fields as columns.
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.EnsembleFrame`
             The constructed EnsembleFrame object.
         """
@@ -1051,14 +1066,17 @@ def from_parquet(
 
     @classmethod
     def from_dask_dataframe(cl, df, ensemble=None):
-        """Returns a SourceFrame constructed from a Dask dataframe..
+        """Returns a SourceFrame constructed from a Dask dataframe.
+
         Parameters
         ----------
         df: `dask.dataframe.DataFrame` or `list`
             a Dask dataframe to convert to a SourceFrame
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.SourceFrame`
             The constructed SourceFrame object.
         """
@@ -1089,6 +1107,7 @@ def from_parquet(
         ensemble=None,
     ):
         """Returns an ObjectFrame constructed from loading a parquet file.
+
         Parameters
         ----------
         path: `str` or `list`
@@ -1105,8 +1124,10 @@ def from_parquet(
             inferred from the pandas parquet file metadata, if present. Use False
             to read all fields as columns.
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.ObjectFrame`
             The constructed ObjectFrame object.
         """
@@ -1125,14 +1146,17 @@ def from_parquet(
 
     @classmethod
     def from_dask_dataframe(cl, df, ensemble=None):
-        """Returns an ObjectFrame constructed from a Dask dataframe..
+        """Returns an ObjectFrame constructed from a Dask dataframe.
+
         Parameters
         ----------
         df: `dask.dataframe.DataFrame` or `list`
             a Dask dataframe to convert to an ObjectFrame
         ensemble: `tape.ensemble.Ensemble`, optional
-            | A link to the Ensemble object that owns this frame.
+            A link to the Ensemble object that owns this frame.
 
+        Returns
+        ----------
         result: `tape.ObjectFrame`
             The constructed ObjectFrame object.
         """
diff --git a/src/tape/timeseries.py b/src/tape/timeseries.py
index b819a6f2..5ab88267 100644
--- a/src/tape/timeseries.py
+++ b/src/tape/timeseries.py
@@ -5,7 +5,7 @@
 
 
 class TimeSeries:
-    """represent and analyze Rubin TimeSeries data"""
+    """Represent and analyze Rubin TimeSeries data"""
 
     def __init__(self, data=None):
         self.data = data
@@ -152,7 +152,7 @@ def stetson_J(self, band=None):
         stetsonJ : `dict`
             StetsonJ statistic for each of input bands.
 
-        Notes
+        Note
         ----------
         In case that no value for band is passed, the function is
         executed on all available bands.
@@ -165,8 +165,8 @@ def sf2(self, sf_method="basic", argument_container=None):
         Parameters
         ----------
         bins : `numpy.array` or `list`
-        Manually provided bins, if not provided then bins are computed using
-        the `method` kwarg
+            Manually provided bins, if not provided then bins are computed using
+            the `method` kwarg
         band_to_calc : `str` or `list` of `str`
             Single band descriptor, or list of such descriptors.
         method : 'str'
@@ -182,7 +182,7 @@ def sf2(self, sf_method="basic", argument_container=None):
         stetsonJ : `dict`
             Structure function squared statistic for each of input bands.
 
-        Notes
+        Note
         ----------
         In case that no value for band_to_calc is passed, the function is
         executed on all available bands.