Python Hypothesis: Property-Based Testing Guide

Hypothesis is a Python library for property-based testing — instead of writing specific example inputs, you describe the shape of valid inputs and Hypothesis generates many random test cases automatically (100 per test by default), including tricky edge cases like empty strings, negative numbers, unusual Unicode characters, and NaN. When it finds a failure it automatically shrinks the input to a minimal reproducing example. Property-based testing is one of the most effective techniques for finding bugs that hand-written examples miss.

Installation and First Test

pip install hypothesis pytest
# test_basics.py
from hypothesis import given
from hypothesis import strategies as st


def encode_decode(s: str) -> str:
    """Round-trip ``s`` through UTF-8: encode to bytes, then decode back to text."""
    utf8_bytes = s.encode("utf-8")
    return utf8_bytes.decode("utf-8")


@given(st.text())
def test_encode_decode_identity(s: str):
    """Property: a UTF-8 encode/decode round trip leaves any generated text unchanged."""
    round_tripped = encode_decode(s)
    assert round_tripped == s


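# Run with: pytest test_basics.py -v
# By default Hypothesis generates up to 100 strings per test, including the
# empty string, ASCII, non-ASCII Unicode, emoji, and null characters.
# (st.text() does not generate lone surrogates unless you ask for them.)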
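How Hypothesis works: It maintains a local database of previously found failures (by default under the .hypothesis/ directory), so re-runs replay known failing examples first. Committing the .hypothesis/ directory can let CI pick up the same failures without re-discovering them, but the database is a cache rather than a guarantee — pin important regressions with the @example decorator so they are always re-tested.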

Built-in Strategies

Strategies are composable generators. Hypothesis ships with strategies for all Python built-in types, and you can combine them to generate complex nested structures matching your domain model.

from hypothesis import given, assume
from hypothesis import strategies as st


@given(st.integers())
def test_integers(n: int):
    """Property: every value drawn from ``st.integers()`` is an ``int``.

    No bounds are passed, so the draws are arbitrary-precision Python ints
    (there is no fixed min/max), e.g. 0, -1, 1 and much larger magnitudes.
    """
    assert isinstance(n, int)


@given(st.integers(min_value=1, max_value=100))
def test_bounded_integer(n: int):
    """Property: draws respect the inclusive ``min_value``/``max_value`` bounds."""
    assert n >= 1 and n <= 100


@given(st.floats(min_value=-1e300, max_value=1e300, allow_nan=False, allow_infinity=False))
def test_float_arithmetic(f: float):
    """Property: doubling then halving a finite float returns the original value.

    The strategy excludes NaN and infinity and caps the magnitude at 1e300:
    without the cap, ``f * 2`` overflows to ``inf`` for values near the float
    maximum and the round trip can never recover ``f``.
    """
    doubled_then_halved = f * 2 / 2
    assert doubled_then_halved == f or abs(doubled_then_halved - f) < 1e-10


@given(st.text(min_size=1, max_size=50))
def test_non_empty_text(s: str):
    """Property: with ``min_size=1`` the generated text is never empty."""
    assert len(s) > 0


@given(st.lists(st.integers(), min_size=1))
def test_list_max(lst: list[int]):
    """Property: max of a list is always >= every element.

    ``min_size=1`` keeps the list non-empty, since ``max([])`` raises ValueError.
    """
    largest = max(lst)
    # Check every element, not just the first, so the assertion matches the property.
    assert all(largest >= item for item in lst)


@given(st.dictionaries(st.text(), st.integers()))
def test_dict_round_trip(d: dict):
    """Property: a str-keyed dict of ints survives a JSON dump/load round trip."""
    import json

    encoded = json.dumps(d)
    decoded = json.loads(encoded)
    assert decoded == d


@given(
    st.one_of(st.integers(), st.floats(allow_nan=False), st.text())
)
def test_one_of(value):
    """one_of draws from any of the provided strategies.

    Here ``value`` comes from the integers, non-NaN floats, or text strategy;
    the test only asserts that the drawn value is not None.
    """
    assert value is not None

Composite and Custom Strategies

Use @st.composite to build domain-specific strategies from multiple existing strategies. This is how you generate realistic test data matching your application's data model.

from hypothesis import given
from hypothesis import strategies as st
from dataclasses import dataclass


@dataclass
class Order:
    """Plain order record; the target type built by ``order_strategy``."""

    order_id: str  # identifier; order_strategy generates 8 chars from A-Z/0-9
    user_id: int  # order_strategy draws this from 1..1_000_000
    items: list[dict]  # line items; order_strategy emits {"price": ..., "qty": ...} dicts
    total: float  # order total; order_strategy rounds it to 2 decimal places


@st.composite
def order_strategy(draw) -> Order:
    """Generate an Order whose total is consistent with its line items.

    Args:
        draw: Callback supplied by ``@st.composite``; draws a value from a strategy.

    Returns:
        An Order with an 8-character A-Z/0-9 id, a user id in 1..1_000_000,
        1-10 items of the form ``{"price": ..., "qty": ...}``, and ``total``
        equal to the sum of ``price * qty`` over those items, rounded to
        2 decimal places.
    """
    order_id = draw(st.text(alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", min_size=8, max_size=8))
    user_id = draw(st.integers(min_value=1, max_value=1_000_000))
    n_items = draw(st.integers(min_value=1, max_value=10))
    items = []
    total = 0.0
    for _ in range(n_items):
        # Round the price before using it so the stored item and the running
        # total are computed from the same value.
        price = round(
            draw(st.floats(min_value=0.01, max_value=999.99, allow_nan=False, allow_infinity=False)),
            2,
        )
        qty = draw(st.integers(min_value=1, max_value=20))
        items.append({"price": price, "qty": qty})
        total += price * qty
    return Order(order_id=order_id, user_id=user_id, items=items, total=round(total, 2))


@given(order_strategy())
def test_order_total_is_positive(order: Order):
    """Property: every generated order has a strictly positive total."""
    assert 0 < order.total


@given(order_strategy())
def test_order_id_length(order: Order):
    """Property: every generated order id is exactly 8 characters long."""
    assert len(order.order_id) == 8


# Combining strategies with flatmap
# flatmap draws n (1..100) first, then builds a second strategy whose bounds
# depend on n, so each generated value is a pair (n, m) with n + 1 <= m <= n + 100.
positive_and_larger = st.integers(min_value=1, max_value=100).flatmap(
    lambda n: st.tuples(st.just(n), st.integers(min_value=n + 1, max_value=n + 100))
)


@given(positive_and_larger)
def test_a_less_than_b(pair):
    """Property: the dependent second element of the pair always exceeds the first."""
    smaller, larger = pair
    assert smaller < larger

Writing Good Properties

The key skill in property-based testing is identifying invariants — properties that must hold for all valid inputs. Common property patterns include round-trip, oracle, commutativity, idempotence, and metamorphic relations.

from hypothesis import given
from hypothesis import strategies as st
import json


# 1. Round-trip property
@given(st.binary())
def test_base64_round_trip(data: bytes):
    """Property: Base64-decoding the Base64 encoding of any bytes yields those bytes."""
    from base64 import b64decode, b64encode

    encoded = b64encode(data)
    assert b64decode(encoded) == data


# 2. Oracle property (compare fast impl against slow reference impl)
def sort_fast(lst: list[int]) -> list[int]:
    """Return a new ascending-sorted list (stands in for the production sort)."""
    ordered = list(lst)
    ordered.sort()
    return ordered

def sort_reference(lst: list[int]) -> list[int]:
    """Return a sorted copy of ``lst`` using a deliberately simple bubble sort."""
    # Work on a copy so the caller's list is left untouched.
    items = list(lst)
    # After each pass the largest remaining value has bubbled to the end,
    # so the unsorted region shrinks by one.
    unsorted_end = len(items) - 1
    while unsorted_end > 0:
        for pos in range(unsorted_end):
            if items[pos + 1] < items[pos]:
                items[pos], items[pos + 1] = items[pos + 1], items[pos]
        unsorted_end -= 1
    return items

@given(st.lists(st.integers()))
def test_sort_matches_reference(lst: list[int]):
    """Oracle property: the fast sort agrees with the reference bubble sort."""
    expected = sort_reference(lst)
    actual = sort_fast(lst)
    assert actual == expected


# 3. Idempotence property
@given(st.text())
def test_strip_idempotent(s: str):
    """Property: stripping an already-stripped string changes nothing."""
    stripped_once = s.strip()
    assert stripped_once == stripped_once.strip()


# 4. Metamorphic relation
@given(st.lists(st.integers(), min_size=1))
def test_max_after_adding_larger(lst: list[int]):
    """Metamorphic property: appending a value above the max makes it the new max."""
    bigger = max(lst) + 1
    extended = [*lst, bigger]
    assert max(extended) == bigger


# 5. Structural property
@given(st.lists(st.integers()))
def test_sorted_list_is_ordered(lst: list[int]):
    """Structural property: each element of ``sorted(lst)`` is <= its successor."""
    result = sorted(lst)
    # Pair every element with its successor; empty and single-item lists yield no pairs.
    assert all(left <= right for left, right in zip(result, result[1:]))

Settings and Profiles

Use @settings to control how many examples Hypothesis generates, the deadline per test, and suppressed health checks. Define profiles in conftest.py to run quick tests locally and thorough tests in CI.

from hypothesis import given, settings, HealthCheck
from hypothesis import strategies as st


# Increase examples for thorough CI testing
@settings(max_examples=1000, deadline=None)
@given(st.text())
def test_thorough(s: str):
    """Property: lower-casing any text yields a str (run with 1000 examples, no deadline)."""
    lowered = s.lower()
    assert isinstance(lowered, str)


# Suppress health check for slow data generation
@settings(suppress_health_check=[HealthCheck.too_slow], max_examples=200)
@given(st.text(min_size=100, max_size=10000))
def test_large_text(s: str):
    """Property: text generated with ``min_size=100`` has at least 100 characters.

    The ``too_slow`` health check is suppressed because these inputs are large.
    """
    assert len(s) >= 100


# conftest.py — define profiles
from hypothesis import settings, Phase

import os

# "dev": quick local feedback with a 200 ms per-example deadline.
settings.register_profile("dev", max_examples=20, deadline=200)
# "ci": more examples, no deadline, and an explicit list of phases to run.
settings.register_profile(
    "ci",
    max_examples=500,
    deadline=None,
    phases=[Phase.explicit, Phase.reuse, Phase.generate, Phase.shrink],
)

# Activate with: HYPOTHESIS_PROFILE=ci pytest
settings.load_profile(os.environ.get("HYPOTHESIS_PROFILE", "dev"))

Stateful Testing

Stateful (model-based) testing generates sequences of operations on a stateful system and checks invariants after each step. It is ideal for testing data structures, APIs, and anything where the order of operations matters.

from hypothesis.stateful import RuleBasedStateMachine, rule, invariant
from hypothesis import strategies as st


class SetMachine(RuleBasedStateMachine):
    """Model-based test comparing an implementation under test with a reference set.

    NOTE: in this example both ``model`` and ``impl`` are built-in ``set``
    objects; swap ``impl`` for the custom implementation you want to test.
    """

    def __init__(self):
        super().__init__()
        self.model = set()       # reference implementation
        self.impl = set()        # implementation under test

    @rule(value=st.integers())
    def add_value(self, value: int):
        """Rule: add the same generated integer to both model and implementation."""
        self.model.add(value)
        self.impl.add(value)

    @rule(value=st.integers())
    def discard_value(self, value: int):
        """Rule: discard the same generated integer from both (no error if absent)."""
        self.model.discard(value)
        self.impl.discard(value)

    @invariant()
    def sets_are_equal(self):
        """Invariant: model and implementation hold exactly the same elements."""
        assert self.model == self.impl

    @invariant()
    def length_matches(self):
        """Invariant: model and implementation report the same size."""
        assert len(self.model) == len(self.impl)


# Convert to a pytest test class
# The Test* name lets pytest's default collection pick up the generated TestCase.
TestSet = SetMachine.TestCase

Testing Django and FastAPI

Hypothesis integrates seamlessly with pytest and Django's test client. For FastAPI, combine Hypothesis with FastAPI's TestClient (which is built on httpx) to property-test entire endpoints.

# FastAPI property test
from hypothesis import given, settings
from hypothesis import strategies as st
from fastapi.testclient import TestClient
from app.main import app

# Module-level test client shared by every generated example in the tests below.
client = TestClient(app)


@settings(max_examples=100)
@given(
    name=st.text(min_size=1, max_size=100),
    age=st.integers(min_value=0, max_value=150),
)
def test_create_user_any_valid_input(name: str, age: int):
    """Property: posting a generated name/age payload to /users never returns 5xx."""
    payload = {"name": name, "age": age}
    response = client.post("/users", json=payload)
    # Property: valid input never returns 5xx
    assert response.status_code < 500

@given(st.text())
def test_search_never_crashes(query: str):
    """Property: /search answers any text query with 200, 400 or 422 (no 500)."""
    acceptable_statuses = {200, 400, 422}  # no 500
    response = client.get("/search", params={"q": query})
    assert response.status_code in acceptable_statuses

Frequently Asked Questions

How many examples does Hypothesis run by default?
100 examples per test. This is usually enough to find common bugs. Set max_examples=1000 in CI for more thorough checking. Hypothesis also caches previously found failures and always re-runs them first.
What is shrinking?
When Hypothesis finds a failing example, it automatically tries smaller and simpler inputs until it finds the minimal reproducible case. For example, if a list of 500 integers causes a failure, Hypothesis shrinks it to the smallest list (often 1-2 elements) that still triggers the same assertion error.
Hypothesis vs pytest-randomly?
pytest-randomly randomises test order and seeds random functions. Hypothesis generates structured inputs with domain knowledge (empty string, max integer, etc.), shrinks failures, and persists them. They complement each other — use both.