Skip to content

Commit

Permalink
Drafted Python code in posts/data-storage-comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
christophscheuch committed Jan 17, 2024
1 parent bf03c58 commit d557895
Show file tree
Hide file tree
Showing 13 changed files with 421 additions and 79 deletions.
109 changes: 109 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,112 @@ install.packages("duckdb")
library(duckdb)
library(RSQLite)
install.packages("arrow")
# Chunk 1: build an example data frame with one column per common type.
library(tidyverse)

data <- tibble(
  character_column = c("A", "B", "C", "D"),
  date_column = as.Date(c("2023-01-01", "2023-02-01", "2023-03-01", "2023-04-01")),
  datetime_column = ymd_hms(c("2023-01-01 10:00:00", "2023-02-01 11:00:00",
                              "2023-03-01 12:00:00", "2023-04-01 13:00:00")),
  numeric_column = c(1.5, 2.5, 3.5, 4.5),
  integer_column = as.integer(c(1, 2, 3, 4)),
  logical_column = c(TRUE, FALSE, FALSE, TRUE)
)
data |>
  glimpse()

# Chunk 2: CSV round trip. Write first, THEN read back — the original
# history called read_csv("data.csv") before the file existed.
write_csv(data, file = "data.csv")
data_csv <- read_csv("data.csv")
data_csv |>
  glimpse()

# Chunk 3: SQLite round trip via RSQLite + dbplyr.
library(RSQLite)
con_sqlite <- dbConnect(SQLite(), "data.sqlite")
# Write and read the SAME table name. The original history wrote to "dta"
# but read from "data", so the collected result came from a stale table.
copy_to(con_sqlite, data, "data", overwrite = TRUE)
data_sqlite <- tbl(con_sqlite, "data") |>
  collect()
data_sqlite |>
  glimpse()
# con_sqlite is an RSQLite connection: close it with dbDisconnect().
# The original history called duckdb_shutdown(con_sqlite), which expects
# a DuckDB driver object, not a SQLite connection.
dbDisconnect(con_sqlite)

# Chunk 4: DuckDB round trip.
library(duckdb)
con_duckdb <- dbConnect(duckdb(), "data.duckdb")
copy_to(con_duckdb, data, "data", overwrite = TRUE)
data_duckdb <- tbl(con_duckdb, "data") |>
  collect()
data_duckdb |>
  glimpse()
# Disconnect and shut down the driver that backs THIS connection.
# The original history ran duckdb_shutdown(duckdb()), which creates a
# brand-new driver instance and shuts that one down, leaving con_duckdb
# (and its file lock on data.duckdb) open.
dbDisconnect(con_duckdb, shutdown = TRUE)

# Chunk 5: Parquet round trip via arrow.
library(arrow)
write_parquet(data, "data.parquet")
data_parquet <- read_parquet("data.parquet")
data_parquet |>
  glimpse()

# Hand over to the Python REPL for the Python versions of each format.
reticulate::repl_python()

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions data.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
character_column,date_column,datetime_column,numeric_column,integer_column,logical_column
A,2023-01-01,2023-01-01T10:00:00Z,1.5,1,TRUE
B,2023-02-01,2023-02-01T11:00:00Z,2.5,2,FALSE
C,2023-03-01,2023-03-01T12:00:00Z,3.5,3,FALSE
D,2023-04-01,2023-04-01T13:00:00Z,4.5,4,TRUE
A,2023-01-01,2023-01-01 10:00:00,1.5,1,True
B,2023-02-01,2023-02-01 11:00:00,2.5,2,False
C,2023-03-01,2023-03-01 12:00:00,3.5,3,False
D,2023-04-01,2023-04-01 13:00:00,4.5,4,True
282 changes: 215 additions & 67 deletions docs/posts/data-storage-comparison/index.html

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions posts/data-storage-comparison/data_python.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
character_column,date_column,datetime_column,numeric_column,integer_column,logical_column
A,2023-01-01,2023-01-01 10:00:00,1.5,1,True
B,2023-02-01,2023-02-01 11:00:00,2.5,2,False
C,2023-03-01,2023-03-01 12:00:00,3.5,3,False
D,2023-04-01,2023-04-01 13:00:00,4.5,4,True
Binary file added posts/data-storage-comparison/data_python.parquet
Binary file not shown.
Binary file not shown.
File renamed without changes.
Binary file added posts/data-storage-comparison/data_r.duckdb
Binary file not shown.
File renamed without changes.
File renamed without changes.
92 changes: 86 additions & 6 deletions posts/data-storage-comparison/index.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ draft: true
# Create example data

```{r}
#| message: false
library(tidyverse)
data <- tibble(
Expand All @@ -26,31 +27,57 @@ data <- tibble(
)
```

```{python}
# Pull the example data frame created in the R chunk into Python via
# reticulate's `r` proxy object (the session enters Python through
# reticulate::repl_python() / a Quarto {python} chunk).
import pandas as pd
data = r.data
# Re-assert the intended dtypes: the R-to-pandas conversion hands character
# columns over as generic `object` — TODO confirm this is why astype is needed.
data["character_column"] = data["character_column"].astype("string")
data["date_column"] = pd.to_datetime(data["date_column"])
data.info()
```

# CSV

Simple, widely supported, and easy to read and write in R, Python, and Julia.

Not efficient for large datasets and doesn't support complex data structures or metadata well.

::: {.panel-tabset}
### R
```{r}
#| message: false
library(readr)
# Use an R-specific file name so this chunk does not collide with the
# data_python.csv written by the Python tab. (The scraped diff had fused
# the removed "data.csv" lines with the added "data_r.csv" lines; only the
# new version is kept here.)
write_csv(data, file = "data_r.csv")
data_csv <- read_csv("data_r.csv")
data_csv |>
  glimpse()
```

### Python
```{python}
# CSV round trip: write without the pandas index column, read it back,
# and show the dtypes pandas infers from the text file.
data.to_csv("data_python.csv", index = False)
data_csv = pd.read_csv("data_python.csv")
data_csv.info()
```

### Julia

:::

# SQLite

Lightweight, file-based SQL database. Easy to use and supported by R, Python, and Julia without the need for a separate server.

Not suitable for very large or high-concurrency applications.

::: {.panel-tabset}
### R
```{r}
library(RSQLite)
con_sqlite <- dbConnect(SQLite(), "data.sqlite")
con_sqlite <- dbConnect(SQLite(), "data_r.sqlite")
copy_to(con_sqlite, data, "data", overwrite = TRUE)
data_sqlite <- tbl(con_sqlite, "data") |>
Expand All @@ -60,6 +87,22 @@ data_sqlite |>
glimpse()
```

### Python
```{python}
# SQLite round trip using the standard-library driver (no server needed).
import sqlite3
con_sqlite = sqlite3.connect(database = "data_python.sqlite")
# Replace any existing "data" table on re-render; drop the pandas index so
# the stored schema matches the original columns.
data.to_sql("data", con_sqlite, if_exists = "replace", index = False)
data_sqlite = pd.read_sql_query("SELECT * FROM data", con_sqlite)
data_sqlite.info()
```

### Julia

:::

# DuckDB

Expand All @@ -69,11 +112,13 @@ DuckDB is an emerging database management system that's gaining attention for it

As a relatively new system, it might not have the same level of community support, tools, and integrations as more established databases.

::: {.panel-tabset}
### R
```{r}
#| warning: false
library(duckdb)
con_duckdb <- dbConnect(duckdb(), "data.duckdb")
con_duckdb <- dbConnect(duckdb(), "data_r.duckdb")
copy_to(con_duckdb, data, "data", overwrite = TRUE)
data_duckdb <- tbl(con_duckdb, "data") |>
Expand All @@ -83,6 +128,24 @@ data_duckdb |>
glimpse()
```

### Python
```{python}
#| warning: false
import duckdb
# Use a Python-specific database file: the original connected to
# "data_r.duckdb", which is the file still held open by the R chunk above
# (DuckDB allows only one writing process per file) and overwrote R's data.
con_duckdb = duckdb.connect("data_python.duckdb")
# pandas' to_sql()/read_sql_query() require a SQLAlchemy or sqlite3
# connection and do not accept a DuckDB DBAPI connection, so use DuckDB's
# native API: register the DataFrame as a view, materialize it as a table,
# and read it back as a DataFrame.
con_duckdb.register("data_df", data)
con_duckdb.execute("CREATE OR REPLACE TABLE data AS SELECT * FROM data_df")
data_duckdb = con_duckdb.execute("SELECT * FROM data").df()
data_duckdb.info()
```

### Julia

:::

# Parquet

https://r4ds.hadley.nz/arrow
Expand All @@ -91,16 +154,33 @@ Columnar storage format, which is great for analytics and large datasets. Offers

Requires additional libraries and understanding of its format.

::: {.panel-tabset}
### R
```{r}
#| message: false
library(arrow)
# R-specific file name to keep it apart from data_python.parquet written by
# the Python tab. (The scraped diff had fused the removed "data.parquet"
# lines with the added "data_r.parquet" lines; only the new version is kept.)
write_parquet(data, "data_r.parquet")
data_parquet <- read_parquet("data_r.parquet")
data_parquet |>
  glimpse()
```

### Python
```{python}
import pyarrow.parquet as pq
data.to_parquet("data_python.parquet")
data_parquet = pd.read_parquet("data_python.parquet")
data_parquet.info()
```

### Julia

:::

# Conclusion

...

0 comments on commit d557895

Please sign in to comment.