DuckDB with R

DuckDB and R - a perfect match!

Install

Never heared of DuckDB? It’s EASY, it’s FAST, it’s FUN! Check this website: https://duckdb.org/

Installation is easy:

install.packages("duckdb")

To create your first local duckdb simply run:

library(DBI)
library(duckdb)
con <- dbConnect(duckdb(), "mydatabase.duckdb")

If you don’t provide a file name, the duckdb will be in memory.

Get Started

Writing a data frame into duckdb (in this case the beer data from {explore}):

# get beer data
library(explore)
df <- use_data_beer()

# write into duckdb
dbWriteTable(con, "tbl_beer", df)

# check result
dbGetQuery(con, "SHOW TABLES")
dbGetQuery(con, "DESCRIBE tbl_beer")
         column_name column_type null  key default extra
1               name     VARCHAR  YES <NA>    <NA>    NA
2              brand     VARCHAR  YES <NA>    <NA>    NA
3            country     VARCHAR  YES <NA>    <NA>    NA
4               year      DOUBLE  YES <NA>    <NA>    NA
5               type     VARCHAR  YES <NA>    <NA>    NA
6         color_dark      DOUBLE  YES <NA>    <NA>    NA
7    alcohol_vol_pct      DOUBLE  YES <NA>    <NA>    NA
8      original_wort      DOUBLE  YES <NA>    <NA>    NA
9  energy_kcal_100ml      DOUBLE  YES <NA>    <NA>    NA
10      carb_g_100ml      DOUBLE  YES <NA>    <NA>    NA
11     sugar_g_100ml      DOUBLE  YES <NA>    <NA>    NA

Now we can run simple SQL queries:

sql <- "SELECT * FROM tbl_beer LIMIT 10"
dbGetQuery(con, sql)
sql <- "SELECT brand, count(*) FROM tbl_beer GROUP BY brand"
dbGetQuery(con, sql)

Or use dbplyr:

library(dbplyr)
tbl(con, "tbl_beer") |> 
  count(type, brand)

SQL tips & tricks

Save time and run your SQL without SELECT

sql <- "FROM tbl_beer LIMIT 10"
dbGetQuery(con, sql)

You don’t have to repeat all columns in GROUP BY, simply use GROUP BY ALL

sql <- "SELECT type, brand, count(*)
        FROM tbl_beer GROUP BY ALL"
dbGetQuery(con, sql)

Excluding columns in SELECT is (now) simple

sql <- "SELECT * EXCLUDE(year, country)
        FROM tbl_beer"
dbGetQuery(con, sql)

Select columns using regex

sql <- "SELECT COLUMNS('.*_100ml')
        FROM tb_beer"
dbGetQuery(con, sql)

No need to switch to Excel to create a pivot table

sql <- "PIVOT tb_beer
        ON country
        USING avg(alcohol_vol_pct)
        GROUP BY type"
dbGetQuery(con, sql)
         type   Austria   Germany Swizerland
1        Rest 5.1197674 4.7675676        4.8
2 Alkoholfrei 0.2366667 0.4363636        0.5
3        Bock 7.0428571 7.9000000         NA

Need summary statistics of a table or the result of a SELECT?

sql <- "SUMMARIZE tbl_beer"
dbGetQuery(con, sql)
sql <- "SUMMARIZE 
        SELECT alcohol_vol_pct, energy_kcal_100ml, country FROM tbl_beer"
dbGetQuery(con, sql)
        column_name column_type min  max approx_unique    avg   std    q25 q50    q75 count null_percentage
1   alcohol_vol_pct      DOUBLE   0  8.4            34  4.324 2.000  4.425   5  5.300   161            1.24
2 energy_kcal_100ml      DOUBLE  20 62.0            33 39.893 9.137 36.500  42 44.125   161            6.83
3           country     VARCHAR  NA   NA             3     NA    NA     NA  NA     NA   161            0.00

Tired of manual dropping a table (if it already exists) before creating it?

sql <- "CREATE OR REPLACE TABlE tbl_result AS 
        SELECT alcohol_vol_pct, energy_kcal_100ml, country FROM tbl_beer"
dbExecute(con, sql)

More tipps: https://duckdb.org/docs/sql/dialect/friendly_sql.html

Read/Write CSV

You can query a CSV using DuckDB

dbGetQuery(con, "
          SELECT * FROM read_csv(
          'myfiles/data.csv',
          AUTO_DETECT=TRUE)
          ")

In case AUTO_DETECT doesn’t work you may specify sep, nullstr, …

dbGetQuery(con, "
          SELECT * FROM read_csv(
          'myfiles/data.csv',
          AUTO_DETECT=TRUE, 
          sep='\t', 
          nullstr='?', 
          decimal_separator=',')
          ")

Read CSV and write into DuckDB:

dbExecute(con, "
          CREATE TABLE csv_import AS
          SELECT * FROM read_csv(
          'myfiles/data.csv',
          AUTO_DETECT=TRUE, 
          sep='\t', 
          nullstr='?', 
          decimal_separator=',')
          ")

dbGetQuery(con, "DESCRIBE csv_import")

Write a DuckDB table as CSV

dbExecute(con, "
          COPY (SELECT * FROM tbl_beer) 
          TO 'myfiles/out.csv' 
          WITH (HEADER, DELIMITER ';', NULLSTR 'NULL')
          ")

Read/Write Parquet

Write a DuckDB table as parquet

dbExecute(con, "
          COPY (SELECT * FROM tbl_beer) 
          TO 'myfiles/out.parquet' 
          (FORMAT PARQUET)
          ")

Query a parquet file

dbGetQuery(con, "
           SELECT * FROM read_parquet(
           'myfiles/out.parquet')
          ")

Read parquet and write into DuckDB

dbExecute(con, "
          CREATE TABLE tbl_parquet AS
          SELECT * FROM read_parquet(
            'myfiles/out.parquet')
          ")
Written on October 8, 2024