duckdb

My Note

  • Reading
sql
# <- csv ->
# delim
# nullstr
# normalize_names
select * from read_csv_auto('/path/data.csv', header=True, delim='|');

# cli reading
.mode csv
.import data.csv data_tbl

# sample
# using sample 10% -> cluster sampling
select * from read_csv_auto('/path/data.csv') using sample 4;

# persist as table
create table as select * from read_csv_auto('/path/data.csv');

# <- parquet ->
# filename
select * from read_parquet('/tmp/*.parquet');
  • Writing
sql
# copy (select * from tbl) to 'result.csv' with (header, delimiter ',');
# <- parquet ->
copy (select * from tbl) to 'result.parquet' (format 'parquet');

# cli writing
.mode jsonlines
.output results.txt
select * from tbl;
.output

# write entire db -> series of files
export database 'target_dir' (format parquet);
  • Features
sql
# remove cols
select * exclude(col_1, col_2) from tbl;

# quick col fixes
select * replace(lower(city) as city) from tbl;

# smarter groupby
select col_1, col_2, count(*) from tbl group by all;

# aliases can be used where + having

# string + list slicing -> 1-indexed
select 'hello world!'[:-2];
  • Insert using python
python
import duckdb

con = duckdb.connect(database='my-db.duckdb')

# create a table
con.execute("CREATE TABLE items(item VARCHAR, value DECIMAL(10,2), count INTEGER)")
# insert two items into the table
con.execute("INSERT INTO items VALUES ('jeans', 20.0, 1), ('hammer', 42.2, 2)")
# insert several rows using prepared statements
con.executemany("INSERT INTO items VALUES (?, ?, ?)", [['chainsaw', 500, 10], ['iphone', 300, 2]] )
  • Read sql file
sql
.read select_example.sql
  • Generate date spine
sql
select * from range(date '1992-01-01', date '1992-03-01', interval '1' month);
  • List lambdas
sql
select list_transform([1, 2, null, 4], x -> x + 1);
  • Histogram
sql
select histogram(col) from tbl;