diff --git a/notebooks/outlier_detection.ipynb b/notebooks/outlier_detection.ipynb new file mode 100644 index 0000000..3d38b6b --- /dev/null +++ b/notebooks/outlier_detection.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f88d62a9", + "metadata": {}, + "outputs": [], + "source": [ + "from adler.dataclasses.AdlerPlanetoid import AdlerPlanetoid\n", + "from adler.science.PhaseCurve import PhaseCurve\n", + "from adler.dataclasses.AdlerData import AdlerData\n", + "import adler.utilities.science_utilities as utils\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.gridspec as gridspec\n", + "import astropy.units as u" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e657fe", + "metadata": {}, + "outputs": [], + "source": [ + "# ssObjectId of object to analyse\n", + "ssoid = \"8268570668335894776\"\n", + "\n", + "# load object from local database\n", + "fname = \"../tests/data/testing_database.db\"\n", + "planetoid = AdlerPlanetoid.construct_from_SQL(ssoid, sql_filename=fname)\n", + "\n", + "# retrieve observations in the r filter\n", + "obs_r = planetoid.observations_in_filter(\"r\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4131a10", + "metadata": {}, + "outputs": [], + "source": [ + "# define the phase curve model using the SSObject data\n", + "\n", + "sso_r = planetoid.SSObject_in_filter(\"r\")\n", + "r_H = sso_r.H\n", + "r_G12 = sso_r.G12\n", + "\n", + "pc = PhaseCurve(abs_mag=r_H * u.mag, phase_param=r_G12, model_name=\"HG12_Pen16\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f625b5a", + "metadata": {}, + "outputs": [], + "source": [ + "# calculate data - model residuals\n", + "res = obs_r.reduced_mag - pc.ReducedMag(obs_r.phaseAngle * u.degree).value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81e04bad", + "metadata": {}, + "outputs": [], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f573cce4", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the observations with the SSObject phase curve\n", + "x_plot = \"phaseAngle\"\n", + "y_plot = \"reduced_mag\"\n", + "\n", + "x = getattr(obs_r, x_plot)\n", + "y = getattr(obs_r, y_plot)\n", + "yerr = obs_r.magErr\n", + "\n", + "fig = plt.figure()\n", + "gs = gridspec.GridSpec(1, 1)\n", + "ax1 = plt.subplot(gs[0, 0])\n", + "\n", + "ax1.errorbar(x, y, yerr, fmt=\"o\")\n", + "\n", + "# plot the phase curve model\n", + "alpha = np.linspace(0, np.amax(obs_r.phaseAngle)) * u.deg\n", + "red_mag = pc.ReducedMag(alpha)\n", + "\n", + "# legend label for the phase curve model\n", + "pc_label = []\n", + "for x in pc.model_function.param_names:\n", + " pc_label.append(\"{}={:.2f}\".format(x, getattr(pc.model_function, x).value))\n", + "pc_label = \", \".join(pc_label)\n", + "\n", + "ax1.plot(alpha.value, red_mag.value, c=\"k\", label=pc_label)\n", + "\n", + "ax1.invert_yaxis()\n", + "ax1.set_xlabel(x_plot)\n", + "ax1.set_ylabel(y_plot)\n", + "ax1.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aee95371", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the data - model residuals\n", + "x_plot = \"phaseAngle\"\n", + "\n", + "x = getattr(obs_r, x_plot)\n", + "yerr = obs_r.magErr\n", + "\n", + "fig = plt.figure()\n", + "gs = gridspec.GridSpec(1, 1)\n", + "ax1 = plt.subplot(gs[0, 0])\n", + "\n", + "ax1.errorbar(x, res, yerr, fmt=\"o\")\n", + "\n", + "ax1.axhline(0, c=\"k\")\n", + "\n", + "# indicate the standard deviations of the residuals\n", + "res_std = np.std(res)\n", + "for i in range(1, 4):\n", + " ax1.axhline(res_std * i, ls=\":\", c=\"C{}\".format(i), label=\"{} sigma\".format(i))\n", + " ax1.axhline(-res_std * i, ls=\":\", c=\"C{}\".format(i))\n", + "\n", + "ax1.invert_yaxis()\n", + "ax1.set_xlabel(x_plot)\n", + "ax1.set_ylabel(\"data - model\")\n", + "ax1.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13e25b85", + "metadata": {}, + "outputs": [], + "source": [ + "# return a list of flags for outlying objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16c712ff", + "metadata": {}, + "outputs": [], + "source": [ + "# astropy sigma_clip normally uses the median as the central function\n", + "utils.sigma_clip(res, {\"cenfunc\": \"median\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8983c9dd", + "metadata": {}, + "outputs": [], + "source": [ + "# assuming that the model is the ground truth, we use zero as the centroid for the residuals\n", + "utils.sigma_clip(res, {\"cenfunc\": utils.zero_func})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15bdb74e", + "metadata": {}, + "outputs": [], + "source": [ + "# use the standard deviation of the residuals to identify outliers\n", + "utils.outlier_std(res, res, std_cut=3.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50276ce5", + "metadata": {}, + "outputs": [], + "source": [ + "# use a simple threshold value for residuals to find outliers\n", + "utils.outlier_diff(res, diff_cut=1.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e2f5bc", + "metadata": {}, + "outputs": [], + "source": [ + "# consider the residual compared to the uncertainty of the measurement\n", + "std_err = 5\n", + "utils.outlier_sigma_diff(res, yerr, std_err)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8cc586", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the data - model residuals\n", + "x_plot = \"phaseAngle\"\n", + "\n", + "x = getattr(obs_r, x_plot)\n", + "yerr = obs_r.magErr\n", + "\n", + "fig = plt.figure()\n", + "gs = gridspec.GridSpec(1, 1)\n", + "ax1 = plt.subplot(gs[0, 0])\n", + "\n", + "ax1.errorbar(x, res, yerr, fmt=\"o\")\n", + "\n", + "ax1.axhline(0, c=\"k\")\n", + "\n", + "# indicate the standard deviations of the residuals\n", + "res_std = np.std(res)\n", + "for i in range(1, 4):\n", + " ax1.axhline(res_std * i, ls=\":\", c=\"C{}\".format(i), label=\"{} sigma\".format(i))\n", + " ax1.axhline(-res_std * i, ls=\":\", c=\"C{}\".format(i))\n", + "\n", + "mask = utils.outlier_sigma_diff(res, yerr, std_err)\n", + "ax1.scatter(x[mask], res[mask], edgecolor=\"r\", facecolor=\"none\", marker=\"o\", zorder=3)\n", + "\n", + "ax1.invert_yaxis()\n", + "ax1.set_xlabel(x_plot)\n", + "ax1.set_ylabel(\"data - model\")\n", + "ax1.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c935b03", + "metadata": {}, + "outputs": [], + "source": [ + "# NB that for phase curve models, residuals can be much larger than the photometric uncertainty!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcccc758", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "adler-dev", + "language": "python", + "name": "adler-dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/adler/utilities/science_utilities.py b/src/adler/utilities/science_utilities.py new file mode 100644 index 0000000..c6f965d --- /dev/null +++ b/src/adler/utilities/science_utilities.py @@ -0,0 +1,108 @@ +import numpy as np +from astropy.stats import sigma_clip as astropy_sigma_clip + + +def outlier_diff(new_res, diff_cut=1.0): + """Test whether new data point(s) is an outlier compared to the model by considering the residual difference. + + new_res: array + The residuals of the new data points compared to the model + diff_cut: float + The threshold difference value for outlier detection. + + Returns + ----------- + outlier_flag : array + Array of flag indicating if data point is an outlier (True) + + """ + + if not isinstance(new_res, np.ndarray): + new_res = np.array(new_res) + + outlier_flag = np.array([False] * len(new_res)) + outlier_flag[np.abs(new_res) >= diff_cut] = True + + return outlier_flag + + +def outlier_std(new_res, data_res, std_cut=3.0): + """Test whether new data point(s) is an outlier compared to the model by considering the standard deviation of the residuals. + + new_res: array + The residuals of the new data point(s) compared to the model + data_res: array + The residuals of the data compared to the model. + std_cut: float + The threshold standard deviation for outlier detection. + + Returns + ----------- + outlier_flag : array + Array of flag indicating if data point is an outlier (True) + + """ + + # calculate the standard deviation of the data - model residuals + data_std = np.std(data_res) + + if not isinstance(new_res, np.ndarray): + new_res = np.array(new_res) + + outlier_flag = np.array([False] * len(new_res)) + outlier_flag[np.abs(new_res) >= (data_std * std_cut)] = True + + return outlier_flag + + +def zero_func(x, axis=None): + """Dummy function to return a zero. + Can be used as the centre function in astropy.stats.sigma_clip to get std relative to zero rather than median/mean value. + + x: + Dummy variable + axis: + required to match the syntax of numpy functions such as np.median + """ + return 0 + + +def sigma_clip(data_res, kwargs={"maxiters": 1, "cenfunc": zero_func}): + """Wrapper function for astropy.stats.sigma_clip, here we define the default centre of the data (the data - model residuals) to be zero + + data_res: array + The residuals of the data compared to the model. + kwargs: dict + Dictionary of keyword arguments from astropy.stats.sigma_clip + + Returns + ----------- + sig_clip_mask: array + returns only the mask from astropy.stats.sigma_clip + """ + + sig_clip = astropy_sigma_clip(data_res, **kwargs) + sig_clip_mask = sig_clip.mask + + return sig_clip_mask + + +def outlier_sigma_diff(data_res, data_sigma, std_sigma=1): + """Function to identify outliers by comparing the uncertainty of measurements to their residuals + + data_res: array + The residuals of the data compared to the model. + data_sigma: array + The uncertainties of the data points + std_sigma: float + Number of standard deviations to identify outliers, assuming the data uncertainties represent one standard deviation + + Returns + ----------- + outlier_flag : array + Array of flag indicating if data point is an outlier (True) + """ + + outlier_flag = np.abs(data_res) > (std_sigma * data_sigma) + + return outlier_flag diff --git a/tests/adler/utilities/test_science_utilities.py b/tests/adler/utilities/test_science_utilities.py new file mode 100644 index 0000000..3bee7c9 --- /dev/null +++ b/tests/adler/utilities/test_science_utilities.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest + +from numpy.testing import assert_array_equal + +import adler.utilities.science_utilities as sci_utils + +# set up test data residuals +y_res = np.array( + [ + 0.32766394, + 0.15295882, + 0.25896465, + 0.18974566, + -0.09082071, + -0.03174331, + -0.03881347, + 0.04204286, + -0.24224314, + -0.17745899, + -0.00492505, + -0.0862188, + -0.14134951, + -0.2535722, + -0.07715194, + -0.10041373, + -0.24621894, + -0.02628162, + -0.04041881, + 0.09704904, + -0.31718808, + -0.06039929, + -0.04072827, + -0.07548697, + -0.15773742, + -0.18076436, + -0.11133244, + -0.19906023, + -0.06507692, + 0.19938055, + 1.97134538, + 1.79108103, + 1.91881725, + 0.20952169, + 0.57060468, + -0.12014891, + 0.15038993, + 0.63377464, + ] +) +# uncertainties on y axis parameters +y_err = np.array( + [ + 0.095, + 0.049, + 0.07, + 0.047, + 0.036, + 0.038, + 0.034, + 0.011, + 0.052, + 0.06, + 0.042, + 0.098, + 0.1, + 0.116, + 0.046, + 0.06, + 0.082, + 0.083, + 0.063, + 0.35100001, + 0.2, + 0.057, + 0.034, + 0.036, + 0.054, + 0.051, + 0.046, + 0.17200001, + 0.199, + 0.17, + 0.264, + 0.28299999, + 0.31600001, + 0.163, + 0.248, + 0.182, + 0.211, + 0.38299999, + ] +) +# provide x data in case residual as a function of x parameter needs tested +x = np.array( + [ + 56.70839691, + 50.07988358, + 72.76689148, + 48.80239868, + 40.30448914, + 39.98022079, + 39.97434235, + 33.56268311, + 38.53084564, + 38.03194427, + 37.81433105, + 36.34851074, + 36.34802628, + 36.50101089, + 32.00559616, + 31.97466469, + 31.86707687, + 31.74268913, + 31.69605255, + 31.88496971, + 30.16068649, + 26.89824295, + 26.91216087, + 27.17892265, + 27.42977715, + 27.42983437, + 27.63732719, + 29.76596832, + 29.76592445, + 27.73990822, + 126.7827301, + 126.78705597, + 126.79136658, + 18.63666534, + 5.73195124, + 7.0533433, + 2.55333257, + 9.17568874, + ] +) +# flag True for outlying data points +outliers_y = np.array( + [ + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + True, + False, + False, + False, + False, + False, + ] +) + + +def test_outlier_diff(): + outliers = sci_utils.outlier_diff(y_res, diff_cut=1.0) + assert_array_equal(outliers, outliers_y) + + +def test_zero_func(): + assert sci_utils.zero_func(y_res) == 0 + + +def test_sigma_clip(): + outliers = sci_utils.sigma_clip(y_res, {"maxiters": 1, "cenfunc": sci_utils.zero_func}) + assert_array_equal(outliers, outliers_y) + + +def test_outlier_sigma_diff(): + outliers = sci_utils.outlier_sigma_diff(y_res, y_err, std_sigma=5.0) + assert_array_equal(outliers, outliers_y)