Skip to content

Commit

Permalink
[Fix] check_outliers() expose 'xi' parameter for OPTICS (#785)
Browse files Browse the repository at this point in the history
Co-authored-by: Daniel <[email protected]>
  • Loading branch information
DominiqueMakowski and strengejacke authored Dec 4, 2024
1 parent fe53e13 commit cd2e056
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 38 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: performance
Title: Assessment of Regression Models Performance
Version: 0.12.4.12
Version: 0.12.4.13
Authors@R:
c(person(given = "Daniel",
family = "Lüdecke",
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Breaking changes

* `check_outliers()` with `method = "optics"` now returns a further refined
cluster selection, by passing the `optics_xi` argument to `dbscan::extractXi()`.

* Deprecated arguments and alias-function-names have been removed.

* Argument names in `check_model()` that refer to plot-aesthetics (like
Expand Down
74 changes: 38 additions & 36 deletions R/check_outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,8 @@
#' extreme values), this algorithm functions in a different manner and won't
#' always detect outliers. Note that `method = "optics"` requires the
#' **dbscan** package to be installed, and that it takes some time to compute
#' the results.
#' the results. Additionally, the `optics_xi` (default to 0.05) is passed to
#' the [dbscan::extractXi()] function to further refine the cluster selection.
#'
#' - **Local Outlier Factor**:
#' Based on a K nearest neighbors algorithm, LOF compares the local density of
Expand Down Expand Up @@ -242,6 +243,7 @@
#' mcd = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
#' ics = 0.001,
#' optics = 2 * ncol(x),
#' optics_xi = 0.05,
#' lof = 0.001
#' )
#' ```
Expand Down Expand Up @@ -881,6 +883,13 @@ check_outliers.data.frame <- function(x,
} else if (is.numeric(threshold)) {
thresholds <- .check_outliers_thresholds(x)
thresholds <- lapply(thresholds, function(x) threshold)
# need to fix this manually - "optics" automatically includes method
# "optics_xi", which is allowed to range between 0 and 1 - since values
# for "optics" can be > 1, it might overwrite "optics_xi" with an invalid
# value...
if (thresholds$optics_xi > 1) {
thresholds$optics_xi <- 0.05
}
} else {
insight::format_error(
paste(
Expand All @@ -890,7 +899,13 @@ check_outliers.data.frame <- function(x,
)
}

thresholds <- thresholds[names(thresholds) %in% method]
# Keep only relevant threshold
valid <- method
if("optics" %in% valid) {

Check warning on line 904 in R/check_outliers.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/check_outliers.R,line=904,col=5,[spaces_left_parentheses_linter] Place a space before left parenthesis, except in a function call.
valid <- c(valid, "optics_xi")
method <- c(method, "optics_xi")
}
thresholds <- thresholds[names(thresholds) %in% valid]

out.meta <- .check_outliers.data.frame_method(x, method, thresholds, ID, ID.names, ...)
out <- out.meta$out
Expand Down Expand Up @@ -1207,7 +1222,8 @@ check_outliers.data.frame <- function(x,
out <- c(out, .check_outliers_optics(
x,
threshold = thresholds$optics,
ID.names = ID.names
ID.names = ID.names,
xi = thresholds$optics_xi
))

count.table <- datawizard::data_filter(
Expand Down Expand Up @@ -1506,38 +1522,23 @@ check_outliers.DHARMa <- check_outliers.performance_simres
}

.check_outliers_thresholds_nowarn <- function(x) {
zscore <- stats::qnorm(p = 1 - 0.001 / 2)
zscore_robust <- stats::qnorm(p = 1 - 0.001 / 2)
iqr <- 1.7
ci <- 1 - 0.001
eti <- 1 - 0.001
hdi <- 1 - 0.001
bci <- 1 - 0.001
cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x))
pareto <- 0.7
mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
ics <- 0.001
optics <- 2 * ncol(x)
lof <- 0.001

list(
zscore = zscore,
zscore_robust = zscore_robust,
iqr = iqr,
ci = ci,
hdi = hdi,
eti = eti,
bci = bci,
cook = cook,
pareto = pareto,
mahalanobis = mahalanobis_value,
mahalanobis_robust = mahalanobis_robust,
mcd = mcd,
ics = ics,
optics = optics,
lof = lof
zscore = stats::qnorm(p = 1 - 0.001 / 2),
zscore_robust = stats::qnorm(p = 1 - 0.001 / 2),
iqr = 1.7,
ci = 1 - 0.001,
hdi = 1 - 0.001,
eti = 1 - 0.001,
bci = 1 - 0.001,
cook = stats::qf(0.5, ncol(x), nrow(x) - ncol(x)),
pareto = 0.7,
mahalanobis = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
mahalanobis_robust = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
mcd = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
ics = 0.001,
optics = 2 * ncol(x),
optics_xi = 0.05,
lof = 0.001
)
}

Expand Down Expand Up @@ -1929,7 +1930,8 @@ check_outliers.DHARMa <- check_outliers.performance_simres

.check_outliers_optics <- function(x,
threshold = NULL,
ID.names = NULL) {
ID.names = NULL,
xi = 0.05) {
out <- data.frame(Row = seq_len(nrow(x)))

if (!is.null(ID.names)) {
Expand All @@ -1940,7 +1942,7 @@ check_outliers.DHARMa <- check_outliers.performance_simres

# Compute
rez <- dbscan::optics(x, minPts = threshold)
rez <- dbscan::extractXi(rez, xi = 0.05) # TODO: find automatic way of setting xi
rez <- dbscan::extractXi(rez, xi = xi) # TODO: find automatic way of setting xi

out$Distance_OPTICS <- rez$coredist

Expand Down
4 changes: 3 additions & 1 deletion man/check_outliers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cd2e056

Please sign in to comment.