Python Memory Optimization: Slots, Generators and Profiling

Python applications can consume far more memory than necessary due to object overhead, holding entire datasets in lists, and dictionary-based instance storage. This guide covers systematic memory profiling with tracemalloc and memory_profiler, reducing object size with __slots__, processing data lazily with generators, using weak references to avoid memory leaks, and choosing memory-efficient data structures.

Understanding Python Object Memory

import sys

# Every Python object has overhead beyond its data.
# NOTE: the byte counts below are typical for 64-bit CPython and vary by version.
print(sys.getsizeof(0))           # int: 28 bytes
print(sys.getsizeof(3.14))        # float: 24 bytes
print(sys.getsizeof("hello"))     # str: 54 bytes (49-byte header + 1 byte per ASCII char)
print(sys.getsizeof([1, 2, 3]))   # list: 88 bytes (already includes an 8-byte pointer per slot)
print(sys.getsizeof({"a": 1}))    # dict: 232 bytes (CPython overhead)
print(sys.getsizeof((1, 2, 3)))   # tuple: 64 bytes on CPython 3.8+ (72 on older versions) — smaller than list

# sys.getsizeof only counts the container, not referenced objects
lst = [1, 2, 3]
print(sys.getsizeof(lst))  # 88 — does NOT count the integers inside

# Deep size: recursively count all referenced objects
from pympler import asizeof  # third-party: pip install pympler
print(asizeof.asizeof([list(range(1000))]))  # accurate total size

# Class instance overhead
class RegularPoint:
    """Plain 2-D point; attributes are stored in the per-instance __dict__."""

    def __init__(self, x, y):
        # Bind both coordinates in a single statement (x first, then y).
        self.x, self.y = x, y

import sys
p = RegularPoint(1.0, 2.0)
# Both figures are shallow: the two float values are not included.
print(sys.getsizeof(p))          # ~48 bytes for the object
print(sys.getsizeof(p.__dict__)) # ~232 bytes for the dict!
# NOTE(review): the instance-dict figure depends on the CPython version — measure on your target.
# Total: ~280+ bytes for a simple x,y point

Memory Profiling Tools

import tracemalloc

# Built-in tracemalloc — find what's allocating memory
tracemalloc.start()  # allocations are recorded from this point on

# ... your code here ...
data = [dict(x=i, y=i*2) for i in range(100_000)]

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics("lineno")  # group allocations by source file + line
for stat in top_stats[:5]:  # print the first five entries
    print(stat)
# Example output:
# main.py:5: size=12.8 MiB, count=100001, average=134 B

tracemalloc.stop()  # stop tracing once the snapshot has been reported

# Compare before/after: take one snapshot on each side of the code under
# test, then diff them to see which lines allocated the memory in between.
tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()

data = [{"x": i} for i in range(50_000)]

snapshot2 = tracemalloc.take_snapshot()
stats = snapshot2.compare_to(snapshot1, "lineno")  # per-line difference
for stat in stats[:3]:
    print(stat)
# Shows memory delta per line

# Stop tracing when done, as in the previous example; otherwise every later
# allocation in the process keeps being recorded.
tracemalloc.stop()
# memory_profiler — line-by-line memory usage
# pip install memory_profiler
from memory_profiler import profile

@profile
def load_data():
    """Build and return 100,000 small record dicts for the profiler to measure."""
    data = []
    # The append stays on its own line so it gets its own row in the report.
    for i in range(100_000):
        data.append({"id": i, "value": i * 2.5, "name": f"item_{i}"})
    return data

# The profiler only reports functions that actually run, so call it when the
# file is executed as a script.
if __name__ == "__main__":
    load_data()

# Run: python -m memory_profiler script.py
# Output shows MiB used per line

__slots__: Eliminating __dict__

By default, Python stores instance attributes in a per-instance __dict__ (a hash table). Declaring __slots__ replaces that dictionary with a fixed set of attribute slots stored directly in the object — typically saving 50–80% of the memory per instance, which adds up quickly when you have millions of objects.

import sys

class RegularPoint:
    """Plain 3-D point; attributes are stored in the per-instance __dict__."""

    def __init__(self, x, y, z=0.0):
        # Bind all three coordinates at once; z is optional.
        self.x, self.y, self.z = x, y, z

class SlottedPoint:
    """3-D point whose attributes live in fixed slots instead of a __dict__."""

    __slots__ = ("x", "y", "z")

    def __init__(self, x, y, z=0.0):
        # Bind all three coordinates at once; z is optional.
        self.x, self.y, self.z = x, y, z

# Memory comparison (shallow sizes: the float values themselves are not counted)
regular = RegularPoint(1.0, 2.0, 3.0)
slotted = SlottedPoint(1.0, 2.0, 3.0)

# A regular instance costs the object itself plus its separate __dict__.
print(f"Regular: {sys.getsizeof(regular) + sys.getsizeof(regular.__dict__)} bytes")
# ~280 bytes
# A slotted instance has no __dict__, so the object is the whole cost.
print(f"Slotted: {sys.getsizeof(slotted)} bytes")
# ~72 bytes — ~4x less

# Scale to 1 million objects.
# NOTE: this section measures construction *time* (time.perf_counter), not
# memory — slotted classes are compared here for speed of creation.
import time

n = 1_000_000
t0 = time.perf_counter()
regular_list = [RegularPoint(float(i), float(i)) for i in range(n)]
t1 = time.perf_counter()
# regular_list is still referenced here, so both lists are in memory at once.
slotted_list = [SlottedPoint(float(i), float(i)) for i in range(n)]
t2 = time.perf_counter()

print(f"Regular: {(t1-t0):.2f}s")  # seconds to build the regular instances
print(f"Slotted: {(t2-t1):.2f}s")  # seconds to build the slotted instances

# dataclass with __slots__ (Python 3.10+)
from dataclasses import dataclass

@dataclass(slots=True)  # slots=True requires Python 3.10 or newer
class Coordinate:
    # 3-D coordinate; z is optional and defaults to 0.0
    x: float
    y: float
    z: float = 0.0

c = Coordinate(1.0, 2.0)  # z falls back to its default
print(sys.getsizeof(c))  # ~72 bytes

Generators for Lazy Processing

import sys

# List — loads everything into memory
def get_squares_list(n):
    """Return the squares of 0..n-1 as a fully materialised list."""
    squares = [value ** 2 for value in range(n)]
    return squares

# Generator — produces one value at a time
def get_squares_gen(n):
    """Lazily yield the squares of 0..n-1, one value per request."""
    yield from (value * value for value in range(n))

n = 10_000_000
lst = get_squares_list(n)        # ~89 MB for the list's pointer array alone
gen = get_squares_gen(n)         # ~200 bytes (!)

# sys.getsizeof is shallow: the list figure excludes the int objects it points to.
print(sys.getsizeof(lst))        # 89095160 (~89 MB, i.e. ~85 MiB)
print(sys.getsizeof(gen))        # 208 bytes

# Process a huge file without loading it
def process_log_file(path: str):
    """Print every ERROR record found in a space-delimited log file.

    Each line is expected to look like ``<ts> <level> <message...>``. The file
    is streamed through a chain of generators, so only one line is processed
    at a time regardless of file size.

    Lines with fewer than two fields (blank lines, or fragments such as a lone
    ``}`` from a multi-line message) are skipped instead of aborting the whole
    run with an IndexError.

    Args:
        path: Location of the log file to read.
    """
    def lines():
        with open(path) as f:
            for line in f:
                yield line.strip()

    def parse(raw_lines):
        for line in raw_lines:
            # maxsplit=2 leaves the message untouched, inner spaces included.
            parts = line.split(" ", 2)
            if len(parts) < 2:
                continue  # blank or malformed: no level field to inspect
            message = parts[2] if len(parts) > 2 else ""
            yield {"ts": parts[0], "level": parts[1], "msg": message}

    def filter_errors(records):
        return (r for r in records if r["level"] == "ERROR")

    # Compose the pipeline — O(1) memory regardless of file size
    pipeline = filter_errors(parse(lines()))
    for record in pipeline:
        print(record)

# Generator expressions in sum(), any(), all(), max()
# The consumer pulls one item at a time, so no intermediate list is built.
total = sum(i * i for i in range(1_000_000))   # no list created
has_big = any(x > 999 for x in range(1_000_000))  # any() stops at the first True (x == 1000)
largest = max(abs(x) for x in [-5, 3, -8, 1])  # 8

Memory-Efficient Data Structures

import sys
from array import array
from collections import namedtuple
import numpy as np

# array.array — typed array of unboxed C values.
# The container alone is ~2x smaller than the list's pointer array (figures
# below); counting the ~28-byte int object the list keeps per element, the
# total saving is roughly 8x.
py_list = list(range(1_000_000))
typed_array = array("i", range(1_000_000))  # signed int

# Both figures are shallow (container only).
print(f"list:  {sys.getsizeof(py_list):,} bytes")    # ~8,000,056
print(f"array: {sys.getsizeof(typed_array):,} bytes") # ~4,000,064

# namedtuple — about a third the size of an equivalent dict; instances have
# no per-instance __dict__
Point = namedtuple("Point", ["x", "y", "z"])
p = Point(1.0, 2.0, 3.0)
print(sys.getsizeof(p))  # ~72 bytes (same as a plain 3-tuple; varies by CPython version)

# Tuple vs list (same five elements; figures vary by CPython version)
lst = [1, 2, 3, 4, 5]
tpl = (1, 2, 3, 4, 5)
print(sys.getsizeof(lst))  # 120
print(sys.getsizeof(tpl))  # 80

# NumPy arrays — massively more efficient than Python lists
py_floats = [float(i) for i in range(100_000)]  # ~3.5 MB (pointer array + one float object per element)
np_floats = np.arange(100_000, dtype=np.float64) # 800 KB (100,000 x 8 bytes)
np_f32 = np.arange(100_000, dtype=np.float32)   # 400 KB (100,000 x 4 bytes)

# Use int32 instead of int64 when values fit
data = np.array([1, 2, 3], dtype=np.int32)   # 4 bytes/element
data64 = np.array([1, 2, 3], dtype=np.int64) # 8 bytes/element

# Pandas: use category for low-cardinality strings
import pandas as pd
df = pd.DataFrame({"country": ["US", "IN", "UK"] * 100_000})  # 300,000 rows, 3 distinct values
# deep=True also counts the memory of the Python objects held in the column.
print(df.memory_usage(deep=True).sum())  # ~19 MB (string objects)
df["country"] = df["country"].astype("category")  # convert the column to the category dtype
print(df.memory_usage(deep=True).sum())  # ~300 KB — 60x smaller!

Weak References

Normal references keep objects alive. Weak references let you reference an object without preventing garbage collection — essential for caches that shouldn't cause memory leaks.

import weakref
import gc

class ExpensiveObject:
    """Stand-in for a costly resource that announces its own destruction."""

    def __init__(self, name):
        self.name = name
        self.data = [*range(10_000)]  # some big data

    def __del__(self):
        # Report finalisation so the examples can show when it happens.
        message = f"{self.name} was garbage collected"
        print(message)

# Strong reference — object lives as long as any strong reference to it exists
obj = ExpensiveObject("regular")
strong_ref = obj      # second strong reference to the same object
del obj               # removes only the name; object still alive via strong_ref

# Weak reference — object can be GC'd even while ref exists
obj2 = ExpensiveObject("weakly referenced")
weak_ref = weakref.ref(obj2)

print(weak_ref())     # returns obj2
del obj2              # the only strong reference is gone
# NOTE(review): CPython frees the object at the `del` above via reference
# counting; gc.collect() is presumably kept for other interpreters — confirm.
gc.collect()
print(weak_ref())     # returns None — object was collected

# WeakValueDictionary — cache whose entries disappear once their value has
# been garbage-collected
cache = weakref.WeakValueDictionary()

def get_expensive(key: str) -> ExpensiveObject:
    """Return the cached object for *key*, creating and caching it if absent.

    The cache holds its values weakly, so an entry can vanish at any moment.
    A single ``get`` both tests for and fetches the entry, and the returned
    strong reference keeps the object alive. A separate ``key in cache``
    check followed by ``cache[key]`` could raise KeyError if the value were
    collected between the two steps.

    Args:
        key: Cache key, also used as the new object's name.

    Returns:
        The existing or newly created ExpensiveObject.
    """
    obj = cache.get(key)
    if obj is None:
        obj = ExpensiveObject(key)
        cache[key] = obj  # weak reference — won't prevent GC
    return obj

a = get_expensive("report_A")
b = get_expensive("report_B")
print(len(cache))  # 2 — both values are kept alive by a and b

del a, b           # drop the names that were holding the two objects
gc.collect()
print(len(cache))  # 0 — both were collected

NumPy Memory Tricks

import numpy as np

# Views vs copies — zero-copy slicing
a = np.arange(1_000_000)
view = a[::2]       # view shares data with 'a' — no copy
copy = a[::2].copy() # explicit copy

# In-place operations — avoid creating temporaries
a = np.ones(1_000_000)
b = np.ones(1_000_000)
# BAD: creates a temporary (for a * 2) on top of the result array
result = a * 2 + b
# GOOD: in-place, no extra allocation.
# NOTE: this overwrites 'a' — only use it when the original values of 'a'
# are no longer needed.
np.multiply(a, 2, out=a)
np.add(a, b, out=a)

# Memory-mapped files — work on arrays larger than RAM.
# A .npy file begins with a header describing dtype and shape, so open it with
# np.load(mmap_mode=...): a raw np.memmap would treat the header bytes as
# array data. (np.memmap is the right tool for headerless binary files, where
# dtype and shape must be supplied by hand.)
mmap = np.load("large_data.npy", mmap_mode="r")  # e.g. float32, shape (10_000_000, 128)
# Reads only the pages you access — rest stays on disk
subset = mmap[:1000]  # still a lazy view; rows are read from disk when touched

Frequently Asked Questions

How do I find what is causing memory growth in production?
Use tracemalloc with periodic snapshots: compare two snapshots taken minutes apart to see what's accumulating. For live inspection, attach memray or use pympler.tracker.SummaryTracker to detect growing object counts over time.
Can __slots__ be used with inheritance?
Yes, but each class in the hierarchy must declare __slots__. If any parent class has __dict__ (no slots), the child inherits __dict__ and slots only partially help. Define slots on every class in the chain for full benefit.
Why does Python sometimes use more memory than expected after del?
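Python's memory allocator doesn't immediately return freed memory to the OS — it holds it in an internal pool for future allocations. This is why resident memory (RSS) often stays high after del. Note also that del only removes a name; the object is freed once no references to it remain. gc.collect() forces a collection of unreachable reference cycles, but it will not necessarily lower RSS; to see what is actually still alive, check object counts with gc.get_objects().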