Skip to content

Commit

Permalink
Revisited clustering-binary-data code
Browse files Browse the repository at this point in the history
  • Loading branch information
christophscheuch committed Nov 25, 2023
1 parent a3ddb9a commit 4886fb5
Show file tree
Hide file tree
Showing 21 changed files with 349 additions and 49 deletions.
Binary file modified .DS_Store
Binary file not shown.
246 changes: 246 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#| message: false
#| warning: false
library(tidyverse)
library(tidyclust)
centers <- tibble(
cluster = factor(1:3),
respondents = c(250, 500, 200),
q1 = c(1, 0, 1),
q2 = c(0, 1, 1),
q3 = c(0, 1, 1),
q4 = c(0, 0, 1)
)
set.seed(123)
labelled_respondents <- centers |>
mutate(
across(
starts_with("q"),
~map2(respondents, .x, function(x, y) {
rbinom(x, 1, max((y - 0.1), 0.1))
}),
.names = "{col}"
)
) |>
select(-respondents) |>
unnest(cols = c(q1, q2, q3, q4)) |>
sample_n(n())
labelled_respondents |>
pivot_longer(cols = -cluster, names_to = "question", values_to = "response") |>
mutate(response = response == 1) |>
ggplot(aes(x = response, y = question, color = cluster)) +
geom_jitter() +
theme_bw() +
labs(x = "Response", y = "Question", color = "Cluster",
title = "Visualization of simulated question responses by cluster")
k_means_example <- k_means(num_clusters = 3) |>
set_engine("stats", algorithm = "Lloyd") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
)
k_means_example_summary <- extract_fit_summary(k_means_example)
str(k_means_example_summary)
k_min <- 1
k_max <- 7
fit_k_means <- function(num_clusters) {
k_means(num_clusters = num_clusters) |>
set_engine("stats", algorithm = "Lloyd") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
}
fit_k_means(2)
k_min <- 1
k_max <- 7
kmeans_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~kmeans(respondents, centers = .x, iter.max = iter_max)),
)
respondents_labelled
labelled_respondents
kmeans_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~kmeans(labelled_respondents |> select(-cluster), centers = .x, iter.max = iter_max)),
)
kmeans_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~kmeans(labelled_respondents |> select(-cluster), centers = .x, iter.max = 100)),
)
kmeans_results
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, fit_k_means),
)
k_means_results
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
k_means_result = map(k, fit_k_means),
)
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, fit_k_means),
)
k_means_results
k_means_results |>
mutate(withinss = sse_within_total(kclust))
k_means_results |>
mutate(withinss = map(kclust, sse_within_total))
k_means_results
k_means_results$kclust[[1]]
k_means_results$kclust[[1]] |> sse_within_total()
k_means_results$kclust[[1]] |> summary()
k_means_results$kclust[[1]] |> sse_total()
k_means_results$kclust[[1]] |> summary() |> sse_total()
?sse_total
k_means_results$kclust[[1]] |> sse_total_vec()
k_means_results$kclust[[1]] |> extract_fit_summary()
k_means_results
k_means_results$kclust[[1]] |> summary()
k_means_results$kclust[[1]] |> extract_cluster_assignment()
k_means_results$kclust[[1]] |> extract_fit_summary()
k_means_results$kclust[1] |> extract_fit_summary()
k_means_results$kclust[1] |> str()
k_means_results$kclust[[1]] |> str()
tibble(k = k_min:k_max)
k_test <- means_results$kclust[[1]]
k_test <- k_means_results$kclust[[1]]
View(k_test)
class(k_test)
class(k_means_example)
fit_k_means <- function(num_clusters) {
k_means(num_clusters = num_clusters) |>
set_engine("stats", algorithm = "Lloyd") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
)
}
fit_k_means(3)
fit_k_means(3) |> class()
fit_k_means <- function(num_clusters) {
k_means(num_clusters = num_clusters) |>
set_engine("stats", algorithm = "Lloyd") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
}
k_means_results <- c|>
mutate(
kclust = map(k, fit_k_means),
)
k_means_results <- c|>
mutate(
kclust = map(k, fit_k_means),
)
fit_k_means <- function(num_clusters) {
k_means(num_clusters = num_clusters) |>
set_engine("stats", algorithm = "Lloyd") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
}
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, fit_k_means),
)
k_means_results
kmeans_logwithindiss <- k_means_results |>
mutate(withinss = map(kclust, sse_within_total))
k_means_results
k_means_results[[1]]
k_means_results$kclust[[1]]
k_means_results$kclust[[1]]$sse_within_total_total
k_means_results |>
mutate(withinss = map(kclust, ~.$sse_within_total_total))
k_means_results |>
mutate(withinss = map_dbl(kclust, ~.$sse_within_total_total))
k_means_results |>
mutate(withinss = map_dbl(kclust, ~sum(.$sse_within_total_total)))
kmeans_logwithindiss <- k_means_results |>
mutate(withinss = map_dbl(kclust, ~sum(.$sse_within_total_total)))
kmeans_logwithindiss <- k_means_results |>
mutate(withinss = map_dbl(kclust, ~sum(.$sse_within_total_total))) |>
mutate(logwithindiss = log(withinss) - log(withinss[k == 1]))
kmeans_logwithindiss
k_means_logwithindiss <- k_means_results |>
mutate(withinss = map_dbl(kclust, ~sum(.$sse_within_total_total))) |>
mutate(logwithindiss = log(withinss) - log(withinss[k == 1]))
k_means_metrics <- k_means_results |>
mutate(withinss = map_dbl(kclust, ~sum(.$sse_within_total_total))) |>
mutate(logwithiniss = log(withinss) - log(withinss[k == 1]))
k_means_metrics
k_means_metrics <- k_means_results |>
mutate(within_sse = map_dbl(kclust, ~sum(.$sse_within_total_total))) |>
mutate(log_within_sse = log(within_sse) - log(within_sse[k == 1]))
k_means_metrics <- k_means_results |>
mutate(
within_sse = map_dbl(kclust, ~sum(.$sse_within_total_total)),
log_within_sse = log(within_sse) - log(within_sse[k == 1])
)
k_means_metrics
k_modes_example <- k_means(num_clusters = 3) |>
set_engine("klaR") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
)
k_modes_example
k_means(num_clusters = num_clusters) |>
set_engine("stats") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
fit_k_means <- function(num_clusters) {
k_means(num_clusters = num_clusters) |>
set_engine("stats") |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
}
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, fit_k_means),
)
k_modes_example |> extract_fit_summary()
k_modes_example |> extract_fit_summary() |> str()
fit_k_means <- function(num_clusters, engine) {
k_means(num_clusters = num_clusters) |>
set_engine(engine) |>
fit(~ q1 + q2+ q3 + q4,
data = labelled_respondents
) |>
extract_fit_summary()
}
k_means_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~fit_k_means(., engine = "stats")),
)
k_modes_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~fit_k_means(., engine = "klaR")),
)
k_modes_results
k_modes_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~fit_k_means(., engine = "klaR")),
) |>
mutate(
within_sse = map_dbl(kclust, ~sum(.$sse_within_total_total)),
log_within_sse = log(within_sse) - log(within_sse[k == 1])
)
k_modes_results
tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~fit_k_means(., engine = "klaR")),
)
k_modes_results <- tibble(k = k_min:k_max) |>
mutate(
kclust = map(k, ~fit_k_means(., engine = "klaR")),
)
k_modes_results$kclust[[1]]
k_modes_results$kclust[[1]] |> str()
k_modes_results$kclust[[2]] |> str()
k_modes_results$kclust[[3]] |> str()
?sample_n
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.quarto/*
/.quarto/
.Rproj.user

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ website:
title: "Tidy Intelligence"
description: "Let your data spark joy!"
site-url: https://www.tidy-intelligence.com
favicon: images/favicon.png
search: false
page-navigation: false
navbar:
Expand All @@ -33,3 +34,4 @@ format:
toc: false
smooth-scroll: true
from: markdown+emoji
fig-align: "center"
13 changes: 13 additions & 0 deletions blog.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
5 changes: 3 additions & 2 deletions docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@


<script src="site_libs/quarto-nav/quarto-nav.js"></script>
<link href="./images/favicon.png" rel="icon" type="image/png">
<script src="site_libs/quarto-listing/list.min.js"></script>
<script src="site_libs/quarto-listing/quarto-listing.js"></script>
<script src="site_libs/clipboard/clipboard.min.js"></script>
Expand Down Expand Up @@ -137,7 +138,7 @@

<div class="quarto-listing quarto-listing-container-grid" id="listing-listing">
<div class="list grid quarto-listing-cols-3">
<div class="g-col-1" data-index="0" data-listing-date-sort="1700866800000" data-listing-file-modified-sort="1700923567106" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="16" data-listing-word-count-sort="3041">
<div class="g-col-1" data-index="0" data-listing-date-sort="1700866800000" data-listing-file-modified-sort="1700933904590" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="16" data-listing-word-count-sort="3078">
<a href="./posts/clustering-binary-data/index.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top">
Expand All @@ -154,7 +155,7 @@ <h5 class="no-anchor card-title listing-title">
</div>
</a>
</div>
<div class="g-col-1" data-index="1" data-listing-date-sort="1700780400000" data-listing-file-modified-sort="1700920380626" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="12" data-listing-word-count-sort="2247">
<div class="g-col-1" data-index="1" data-listing-date-sort="1700780400000" data-listing-file-modified-sort="1700924082986" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="12" data-listing-word-count-sort="2247">
<a href="./posts/tidy-data-a-recipe-for-efficient-data-analysis/index.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top">
Expand Down
Loading

0 comments on commit 4886fb5

Please sign in to comment.