Python DuckDB: In-Process Analytical Database

DuckDB is an in-process OLAP database — think SQLite but for analytics. It runs inside your Python process with no server to configure, queries Parquet files and Pandas DataFrames directly with full SQL, and outperforms Pandas groupby by 5–20x for analytical queries. It is the fastest way to run complex SQL analytics on local files, and increasingly the foundation of the "local data stack" replacing heavyweight Spark clusters for datasets under a few hundred GB.

Setup and First Queries

pip install duckdb pandas pyarrow
import duckdb

# In-memory connection (default)
con = duckdb.connect()

# Run SQL directly
result = con.execute("SELECT 42 AS answer, 'hello' AS greeting").fetchdf()

# Create table from values
con.execute("""
    CREATE TABLE employees AS
    SELECT * FROM (VALUES
        (1, 'Alice', 'Engineering', 95000, 5),
        (2, 'Bob',   'Engineering', 88000, 3),
        (3, 'Carol', 'Sales',       72000, 7),
        (4, 'Dave',  'Sales',       67000, 2)
    ) t(id, name, dept, salary, years)
""")

# Query and return as DataFrame, list of tuples, or Arrow table
df = con.execute("SELECT * FROM employees WHERE salary > 80000").fetchdf()
rows = con.execute("SELECT name, salary FROM employees").fetchall()
arrow = con.execute("SELECT * FROM employees").arrow()

print(df)
# Results as Python dicts
for row in con.execute("SELECT name, salary FROM employees").fetchmany(2):
    print(dict(zip(["name", "salary"], row)))

Querying Parquet and CSV

DuckDB's killer feature: query files directly without loading them into memory first. Parquet files support predicate pushdown — DuckDB reads only the row groups that match your WHERE clause.

import duckdb

con = duckdb.connect()

# Query CSV directly
df = con.execute("""
    SELECT
        dept,
        COUNT(*) as headcount,
        AVG(salary) as avg_salary,
        SUM(salary) as total_salary
    FROM read_csv_auto('employees.csv')
    WHERE years >= 3
    GROUP BY dept
    ORDER BY avg_salary DESC
""").fetchdf()

# Query Parquet with predicate pushdown
result = con.execute("""
    SELECT date, product_id, SUM(revenue) as daily_revenue
    FROM read_parquet('sales/year=2026/**/*.parquet')
    WHERE date >= '2026-06-01'
    GROUP BY date, product_id
    ORDER BY date
""").fetchdf()

# Query multiple files with glob
con.execute("""
    SELECT year, month, SUM(amount) as total
    FROM read_parquet('data/transactions_*.parquet')
    GROUP BY year, month
    ORDER BY year, month
""")

# Create view over files (like a virtual table)
con.execute("""
    CREATE VIEW sales AS
    SELECT * FROM read_parquet('data/sales/*.parquet')
""")
con.execute("SELECT * FROM sales WHERE revenue > 10000").fetchdf()

DuckDB with Pandas

DuckDB can query Pandas DataFrames directly by name — zero-copy using Arrow. This is ideal for mixing SQL analytics with Pandas preprocessing.

import duckdb
import pandas as pd

# Create DataFrames
employees = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Carol", "Dave"],
    "dept_id": [10, 10, 20, 20],
    "salary": [95000, 88000, 72000, 67000],
})
departments = pd.DataFrame({
    "id": [10, 20],
    "dept_name": ["Engineering", "Sales"],
    "budget": [500000, 300000],
})

con = duckdb.connect()

# Query DataFrames by variable name — DuckDB finds them automatically
result = con.execute("""
    SELECT
        e.name,
        d.dept_name,
        e.salary,
        e.salary / d.budget * 100 AS budget_pct
    FROM employees e
    JOIN departments d ON e.dept_id = d.id
    ORDER BY e.salary DESC
""").fetchdf()

# Register explicitly (safer for production code)
con.register("emp_view", employees)
con.register("dept_view", departments)
result = con.execute("SELECT * FROM emp_view JOIN dept_view USING (id)").fetchdf()

# Round-trip: DuckDB → Pandas → DuckDB
aggregated = con.execute("""
    SELECT dept_id, AVG(salary) as avg_sal FROM employees GROUP BY dept_id
""").fetchdf()

# Now query the result as a DataFrame
final = con.execute("""
    SELECT e.name, e.salary, a.avg_sal,
           e.salary - a.avg_sal AS salary_delta
    FROM employees e
    JOIN aggregated a ON e.dept_id = a.dept_id
""").fetchdf()

Window Functions

result = con.execute("""
    SELECT
        name,
        dept,
        salary,
        -- Running total within each dept
        SUM(salary) OVER (PARTITION BY dept ORDER BY salary) AS running_dept_total,
        -- Rank within department
        RANK() OVER (PARTITION BY dept ORDER BY salary DESC) AS salary_rank,
        -- Percentile
        PERCENT_RANK() OVER (ORDER BY salary) AS salary_percentile,
        -- Previous row value
        LAG(salary, 1) OVER (PARTITION BY dept ORDER BY years) AS prev_salary,
        -- Moving average
        AVG(salary) OVER (
            PARTITION BY dept
            ORDER BY years
            ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
        ) AS rolling_3_avg,
        -- Cumulative count
        COUNT(*) OVER (PARTITION BY dept ORDER BY years) AS running_headcount
    FROM employees
""").fetchdf()

# QUALIFY — filter on window function result (like HAVING for windows)
top_earners = con.execute("""
    SELECT name, dept, salary
    FROM employees
    QUALIFY RANK() OVER (PARTITION BY dept ORDER BY salary DESC) <= 2
""").fetchdf()

Persistent Database

import duckdb

# Persistent — saves to file
con = duckdb.connect("analytics.duckdb")

# Create tables
con.execute("""
    CREATE TABLE IF NOT EXISTS events (
        id BIGINT PRIMARY KEY,
        user_id INTEGER,
        event_type VARCHAR,
        timestamp TIMESTAMPTZ,
        properties JSON
    )
""")

# Insert data
con.execute("""
    INSERT INTO events VALUES
    (1, 101, 'page_view', NOW(), '{"page": "/home"}'),
    (2, 102, 'click',     NOW(), '{"button": "signup"}')
""")

# Transactions
with con.cursor() as cur:
    cur.execute("BEGIN")
    cur.execute("INSERT INTO events VALUES (3, 103, 'purchase', NOW(), '{}')")
    cur.execute("COMMIT")

# Export to Parquet (great for archiving)
con.execute("COPY events TO 'events_backup.parquet' (FORMAT PARQUET)")

# Import from Parquet
con.execute("COPY events FROM 'events_backup.parquet' (FORMAT PARQUET)")

con.close()

Performance Tips

import duckdb

con = duckdb.connect()

# Set threads and memory limit
con.execute("SET threads TO 8")
con.execute("SET memory_limit='4GB'")

# Use Parquet over CSV — 10x faster reads
# Write Parquet from pandas:
import pandas as pd
df = pd.read_csv("big.csv")
df.to_parquet("big.parquet", compression="snappy", index=False)

# Use EXPLAIN to understand query plans
plan = con.execute("""
    EXPLAIN ANALYZE
    SELECT dept, AVG(salary)
    FROM read_parquet('employees.parquet')
    GROUP BY dept
""").fetchdf()
print(plan)

# Partition Parquet by frequently filtered columns
con.execute("""
    COPY (SELECT * FROM read_parquet('sales.parquet'))
    TO 'partitioned/' (FORMAT PARQUET, PARTITION_BY (year, month))
""")
# Now queries filtered by year/month only scan relevant partitions

# Use prepared statements for repeated queries
stmt = con.prepare("SELECT * FROM employees WHERE dept = ? AND salary > ?")
eng = stmt.execute(["Engineering", 80000]).fetchdf()
sales = stmt.execute(["Sales", 60000]).fetchdf()

FastAPI Integration

from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager
import duckdb
import asyncio

_connection: duckdb.DuckDBPyConnection | None = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    global _connection
    _connection = duckdb.connect("analytics.duckdb")
    _connection.execute("SET threads TO 4")
    yield
    _connection.close()

app = FastAPI(lifespan=lifespan)

def get_db():
    return _connection.cursor()  # thread-safe cursor

@app.get("/analytics/revenue")
async def revenue_by_dept():
    # DuckDB is synchronous — run in thread pool
    def query():
        with get_db() as cur:
            return cur.execute("""
                SELECT dept, SUM(salary) as total_cost, COUNT(*) as headcount
                FROM read_parquet('employees.parquet')
                GROUP BY dept
                ORDER BY total_cost DESC
            """).fetchdf().to_dict("records")

    return await asyncio.to_thread(query)

@app.get("/analytics/top-earners/{dept}")
async def top_earners(dept: str, limit: int = 10):
    def query():
        with get_db() as cur:
            return cur.execute("""
                SELECT name, salary, years
                FROM employees
                WHERE dept = ?
                ORDER BY salary DESC
                LIMIT ?
            """, [dept, limit]).fetchdf().to_dict("records")

    return await asyncio.to_thread(query)

Frequently Asked Questions

DuckDB vs SQLite — when to use which?
SQLite is a row-oriented OLTP database — optimized for reads and writes of individual rows. DuckDB is a column-oriented OLAP database — optimized for analytical queries aggregating millions of rows. Use SQLite for application data (user accounts, settings). Use DuckDB for analytics, reports, and querying data files.
Can DuckDB handle multi-user access?
DuckDB supports multiple readers but only one writer at a time per database file. For concurrent write workloads, use PostgreSQL. DuckDB is designed for single-user analytics sessions, not web application transactional databases. The in-memory mode supports one connection per process.
How does DuckDB compare to Spark for big data?
DuckDB is single-machine; Spark distributes across a cluster. DuckDB is 5–50x faster than Spark for data that fits on one machine (up to ~500GB with streaming), with zero setup overhead. Use Spark when you genuinely need petabyte-scale distributed processing. Many teams have moved from Spark to DuckDB + Polars for terabyte-scale workloads on a single large EC2 instance.