Skip to content

Commit

Permalink
use Tucan for VegaLite plots
Browse files Browse the repository at this point in the history
  • Loading branch information
pnezis committed Oct 15, 2023
1 parent 07405d6 commit b79c134
Show file tree
Hide file tree
Showing 3 changed files with 997 additions and 1,403 deletions.
145 changes: 48 additions & 97 deletions notebooks/k_means.livemd
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ Mix.install([
{:scidata, "~> 0.1.10"},
{:req, "~> 0.3.9"},
{:kino, "~> 0.10.0"},
{:kino_vega_lite, "~> 0.1.9"}
{:kino_vega_lite, "~> 0.1.9"},
{:tucan, github: "pnezis/tucan", ref: "6549cdc"}
])
```

Expand All @@ -27,7 +28,6 @@ Mix.install([
The main purpose of this livebook is to introduce the **KMeans** clustering algorithm. We will explore KMeans in three different use cases.

```elixir
alias VegaLite, as: Vl
alias Scholar.Cluster.KMeans
require Explorer.DataFrame, as: DF
Nx.global_default_backend(EXLA.Backend)
Expand All @@ -36,14 +36,6 @@ key = Nx.Random.key(42)

<!-- livebook:{"output":true} -->

```
02:30:02.231 [info] TfrtCpuClient created.
```

<!-- livebook:{"output":true} -->

```
#Nx.Tensor<
u32[2]
Expand Down Expand Up @@ -143,15 +135,11 @@ DF.summarise(
We see that `petal_length` and `petal_width` are the most distinguishing features. Let's explore them a little bit more.

```elixir
Vl.new(title: [text: "Histograms of petal_length column by species", offset: 25, anchor: :middle])
|> Vl.data_from_values(df)
|> Vl.facet(
[field: "species"],
Vl.new(width: 200, height: 200)
|> Vl.mark(:bar)
|> Vl.encode_field(:x, "petal_length", type: :quantitative, bin: true)
|> Vl.encode(:y, aggregate: :count, scale: [domain: [0, 55]])
)
# Histogram of the petal_length column of df, with bars colored by species and
# the chart faceted into one column per species. The y domain is pinned to
# 0..55, the size is set to 200x200 and the title is centered with an offset
# of 25.
Tucan.histogram(df, "petal_length", color_by: "species")
|> Tucan.facet_by(:column, "species")
|> Tucan.Scale.set_y_domain(0, 55)
|> Tucan.set_size(200, 200)
|> Tucan.set_title("Histograms of petal_length column by species", offset: 25, anchor: :middle)
```

<!-- livebook:{"output":true} -->
Expand All @@ -161,19 +149,11 @@ Vl.new(title: [text: "Histograms of petal_length column by species", offset: 25,
```

```elixir
Vl.new(
width: 300,
height: 300,
title: [
text: "Scatterplot of data samples projected on plane petal_width x petal_length",
offset: 25
]
# Scatter plot of petal_length (x) against petal_width (y) for every row of df,
# drawn with filled points colored by species, sized 300x300.
Tucan.scatter(df, "petal_length", "petal_width", filled: true, color_by: "species")
|> Tucan.set_size(300, 300)
|> Tucan.set_title("Scatterplot of data samples projected on plane petal_width x petal_length",
offset: 25
)
|> Vl.data_from_values(df)
|> Vl.mark(:circle)
|> Vl.encode_field(:x, "petal_length", type: :quantitative)
|> Vl.encode_field(:y, "petal_width", type: :quantitative)
|> Vl.encode_field(:color, "species")
```

<!-- livebook:{"output":true} -->
Expand All @@ -183,19 +163,11 @@ Vl.new(
```

```elixir
Vl.new(
title: [
text: "Scatterplot of data samples projected on plane petal_width x petal_length by species",
offset: 25
]
)
|> Vl.data_from_values(df)
|> Vl.facet(
[field: "species"],
Vl.new(width: 200, height: 200)
|> Vl.mark(:point)
|> Vl.encode_field(:x, "petal_length", type: :quantitative)
|> Vl.encode_field(:y, "petal_width", type: :quantitative)
# Scatter plot of petal_length (x) against petal_width (y), faceted into one
# column per species. No explicit size is set here.
Tucan.scatter(df, "petal_length", "petal_width")
|> Tucan.facet_by(:column, "species")
|> Tucan.set_title(
"Scatterplot of data samples projected on plane petal_width x petal_length by species",
offset: 25
)
```

Expand Down Expand Up @@ -227,15 +199,12 @@ inertias = for model <- models, do: Nx.to_number(model.inertia)
```

```elixir
Vl.new(width: 600, height: 300, title: "Elbow Plot")
|> Vl.data_from_values(num_clusters: clusterings, inertia: inertias)
|> Vl.mark(:line)
|> Vl.encode_field(:x, "num_clusters",
title: "Number of Clusters",
type: :ordinal,
axis: [label_angle: 0]
# Elbow plot: inertia of each fitted model against the number of clusters.
# The cluster counts are an ordered quantity, so the x channel is encoded as
# :ordinal (matching the other line plots in this notebook) rather than
# :nominal; label_angle: 0 keeps the tick labels horizontal.
Tucan.lineplot([num_clusters: clusterings, inertia: inertias], "num_clusters", "inertia",
  x: [type: :ordinal, axis: [label_angle: 0]],
  title: "Elbow Plot"
)
|> Vl.encode_field(:y, "inertia", title: "Inertia", type: :quantitative)
|> Tucan.Axes.set_xy_titles("Number of Clusters", "Inertia")
|> Tucan.set_size(600, 300)
```

<!-- livebook:{"output":true} -->
Expand Down Expand Up @@ -298,28 +267,19 @@ coords = [
cluster_petal_width: best_model.clusters[[.., 3]] |> Nx.to_flat_list()
]

Vl.new(
width: 300,
height: 300,
title: [
text:
"Scatterplot of data samples projected on plane petal_width x petal_length with calculated centroids",
offset: 25
]
)
|> Vl.layers([
Vl.new()
|> Vl.data_from_values(df)
|> Vl.mark(:circle)
|> Vl.encode_field(:x, "petal_length", type: :quantitative)
|> Vl.encode_field(:y, "petal_width", type: :quantitative)
|> Vl.encode_field(:color, "species"),
Vl.new()
|> Vl.data_from_values(coords)
|> Vl.mark(:circle, color: :green, size: 100)
|> Vl.encode_field(:x, "cluster_petal_length", type: :quantitative)
|> Vl.encode_field(:y, "cluster_petal_width", type: :quantitative)
# Overlays two scatter layers on one 300x300 chart:
#   1. the samples in df (petal_length vs petal_width), filled and colored by
#      species;
#   2. the points in coords (cluster_petal_length vs cluster_petal_width),
#      drawn as green filled points of size 100 to mark the centroids.
Tucan.layers([
Tucan.scatter(df, "petal_length", "petal_width", color_by: "species", filled: true),
Tucan.scatter(coords, "cluster_petal_length", "cluster_petal_width",
filled: true,
point_size: 100,
point_color: "green"
)
])
|> Tucan.set_size(300, 300)
|> Tucan.set_title(
"Scatterplot of data samples projected on plane petal_width x petal_length with calculated centroids",
offset: 25
)
```

<!-- livebook:{"output":true} -->
Expand Down Expand Up @@ -1203,19 +1163,14 @@ data = [
inertia: for(model <- models, do: Nx.to_number(model.inertia))
]

Vl.new(width: 600, height: 300, title: "Elbow Plot")
|> Vl.data_from_values(data)
|> Vl.mark(:line)
|> Vl.encode_field(:x, "num_clusters",
title: "Number of Clusters",
type: :ordinal,
axis: [label_angle: 0]
)
|> Vl.encode_field(:y, "inertia",
title: "Inertia",
type: :quantitative,
scale: [domain: [4800, 11500]]
# Elbow plot for this dataset: inertia against num_clusters as a 600x300 line
# chart. The x channel is ordinal with horizontal tick labels, and the y domain
# is pinned to 4800..11500.
Tucan.lineplot(data, "num_clusters", "inertia",
x: [type: :ordinal, axis: [label_angle: 0]],
width: 600,
height: 300
)
|> Tucan.Axes.set_xy_titles("Number of Clusters", "Inertia")
|> Tucan.Scale.set_y_domain(4800, 11500)
|> Tucan.set_title("Elbow Plot")
```

<!-- livebook:{"output":true} -->
Expand Down Expand Up @@ -1247,19 +1202,15 @@ silhouette_scores =
```elixir
data = [num_clusters: nums_clusters, silhouette_scores: silhouette_scores]

Vl.new(width: 600, height: 300, title: "Silhouette score vs Number of Clusters")
|> Vl.data_from_values(data)
|> Vl.mark(:line, point: [point: true, color: :dark_blue])
|> Vl.encode_field(:x, "num_clusters",
title: "Number of Clusters",
type: :ordinal,
axis: [label_angle: 0]
)
|> Vl.encode_field(:y, "silhouette_scores",
title: "Silhouette score",
type: :quantitative,
scale: [domain: [0.088, 0.205]]
# Silhouette score against num_clusters as a 600x300 line chart with dark-blue
# point markers on the line. The x channel is ordinal with horizontal tick
# labels, and the y domain is pinned to 0.088..0.205.
Tucan.lineplot(data, "num_clusters", "silhouette_scores",
points: true,
point_color: "darkBlue",
x: [type: :ordinal, axis: [label_angle: 0]]
)
|> Tucan.Axes.set_xy_titles("Number of Clusters", "Silhouette score")
|> Tucan.Scale.set_y_domain(0.088, 0.205)
|> Tucan.set_size(600, 300)
|> Tucan.set_title("Silhouette score vs Number of Clusters")
```

<!-- livebook:{"output":true} -->
Expand Down
Loading

0 comments on commit b79c134

Please sign in to comment.