Python Memory Optimization: Slots, Generators and Profiling
Python applications can consume far more memory than necessary due to object overhead, holding entire datasets in lists, and dictionary-based instance storage. This guide covers systematic memory profiling with tracemalloc and memory_profiler, reducing object size with __slots__, processing data lazily with generators, using weak references to avoid memory leaks, and choosing memory-efficient data structures.
Table of Contents
Understanding Python Object Memory
import sys
# Every Python object has overhead beyond its data.
# NOTE: the byte counts quoted in these comments are for CPython and vary
# between interpreter versions and platforms — treat them as approximate.
print(sys.getsizeof(0)) # int: 28 bytes
print(sys.getsizeof(3.14)) # float: 24 bytes
print(sys.getsizeof("hello")) # str: 54 bytes, including 1 byte per ASCII character
print(sys.getsizeof([1, 2, 3])) # list: 88 bytes for this 3-item list (8 bytes per element slot)
print(sys.getsizeof({"a": 1})) # dict: 232 bytes (CPython overhead)
print(sys.getsizeof((1, 2, 3))) # tuple: 72 bytes (smaller than list)
# sys.getsizeof is shallow: it only counts the container, not referenced objects
lst = [1, 2, 3]
print(sys.getsizeof(lst)) # 88 — does NOT count the integers inside
# Deep size: recursively count all referenced objects
# (pympler is a third-party package and must be installed separately)
from pympler import asizeof
print(asizeof.asizeof([list(range(1000))])) # total size: outer list + inner list + its ints
# Class instance overhead
class RegularPoint:
    """Plain 2-D point whose attributes are stored in a per-instance ``__dict__``.

    Args:
        x: Horizontal coordinate.
        y: Vertical coordinate.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y


import sys

p = RegularPoint(1.0, 2.0)
# getsizeof is shallow, so the instance and its attribute dict are
# measured separately and added together by hand.
print(sys.getsizeof(p))  # ~48 bytes for the object
print(sys.getsizeof(p.__dict__))  # ~232 bytes for the dict!
# Total: ~280+ bytes for a simple x,y point
Memory Profiling Tools
import tracemalloc
# Built-in tracemalloc — find what's allocating memory
tracemalloc.start()
# ... your code here ...
data = [dict(x=i, y=i*2) for i in range(100_000)]
# Freeze the allocations traced so far so they can be inspected.
snapshot = tracemalloc.take_snapshot()
# Group traced allocations by the source line that made them.
top_stats = snapshot.statistics("lineno")
for stat in top_stats[:5]:
    print(stat)
    # main.py:5: size=12.8 MiB, count=100001, average=134 B
tracemalloc.stop()
# Compare before/after
tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()
data = [{"x": i} for i in range(50_000)]
snapshot2 = tracemalloc.take_snapshot()
# Difference between the two snapshots, grouped by source line.
stats = snapshot2.compare_to(snapshot1, "lineno")
for stat in stats[:3]:
    print(stat)
    # Shows memory delta per line
# Stop tracing once the comparison is done: leaving it enabled keeps
# recording every later allocation.
tracemalloc.stop()
# memory_profiler — line-by-line memory usage
# pip install memory_profiler
from memory_profiler import profile
@profile
def load_data():
    """Build and return a list of 100,000 small record dicts.

    The explicit append loop is kept on purpose: the ``@profile``
    decorator reports memory usage line by line, so each statement
    here gets its own row in the report.

    Returns:
        A list of dicts with the keys ``"id"``, ``"value"`` and ``"name"``.
    """
    data = []
    for i in range(100_000):
        data.append({"id": i, "value": i * 2.5, "name": f"item_{i}"})
    return data
# Run: python -m memory_profiler script.py
# Output shows MiB used per line
__slots__: Eliminating __dict__
By default, Python stores instance attributes in a __dict__ (a per-instance hash table). __slots__ replaces this with fixed C-level attributes — saving 50–80% memory per instance when you have millions of objects.
import sys
class RegularPoint:
    """3-D point without ``__slots__``; attributes live in the instance ``__dict__``.

    Args:
        x: First coordinate.
        y: Second coordinate.
        z: Third coordinate, defaults to ``0.0``.
    """

    def __init__(self, x, y, z=0.0):
        self.x = x
        self.y = y
        self.z = z
class SlottedPoint:
    """3-D point that declares ``__slots__`` instead of using a ``__dict__``.

    Only the attributes named in ``__slots__`` can be set on an instance.

    Args:
        x: First coordinate.
        y: Second coordinate.
        z: Third coordinate, defaults to ``0.0``.
    """

    __slots__ = ("x", "y", "z")

    def __init__(self, x, y, z=0.0):
        self.x = x
        self.y = y
        self.z = z
# Memory comparison (shallow sizes; the regular instance needs its
# __dict__ added by hand because getsizeof does not follow references)
regular = RegularPoint(1.0, 2.0, 3.0)
slotted = SlottedPoint(1.0, 2.0, 3.0)
print(f"Regular: {sys.getsizeof(regular) + sys.getsizeof(regular.__dict__)} bytes")
# ~280 bytes
print(f"Slotted: {sys.getsizeof(slotted)} bytes")
# ~72 bytes — ~4x less
# Scale to 1 million objects.
# NOTE: this part measures construction *time*, not memory; both lists
# stay alive afterwards, so the second one is built while the first
# still occupies memory.
import time
n = 1_000_000
t0 = time.perf_counter()
regular_list = [RegularPoint(float(i), float(i)) for i in range(n)]
t1 = time.perf_counter()
slotted_list = [SlottedPoint(float(i), float(i)) for i in range(n)]
t2 = time.perf_counter()
print(f"Regular: {(t1-t0):.2f}s")
print(f"Slotted: {(t2-t1):.2f}s")
# dataclass with __slots__ (Python 3.10+)
from dataclasses import dataclass
@dataclass(slots=True)
class Coordinate:
x: float
y: float
z: float = 0.0
c = Coordinate(1.0, 2.0)
print(sys.getsizeof(c)) # ~72 bytes
Generators for Lazy Processing
import sys
# List — loads everything into memory
def get_squares_list(n):
    """Return a list holding the squares of ``0 .. n-1``.

    The whole result is materialised at once, so memory use grows
    linearly with ``n``.
    """
    return [i * i for i in range(n)]
# Generator — produces one value at a time
def get_squares_gen(n):
    """Yield the squares of ``0 .. n-1`` lazily, one per iteration.

    Nothing is computed until the generator is consumed, and no list
    of results is ever built.
    """
    for i in range(n):
        yield i * i
n = 10_000_000
lst = get_squares_list(n) # builds all 10M results up front
gen = get_squares_gen(n) # ~200 bytes (!) — nothing computed yet
print(sys.getsizeof(lst)) # 89095160 bytes ≈ 89 MB (≈ 85 MiB), container only — the int objects are extra
print(sys.getsizeof(gen)) # 208 bytes
# Process a huge file without loading it
def process_log_file(path: str) -> None:
    """Print every ERROR record found in the log file at *path*.

    Each non-empty line is expected to look like ``"<ts> <level> <msg...>"``
    with single-space separators. The file is streamed through a chain of
    generators, so only one line is held in memory at a time.

    Lines that are empty, or that have fewer than two fields, are skipped:
    without a level they cannot be ERROR records.

    Args:
        path: Path of the log file to read.

    Raises:
        OSError: If the file cannot be opened.
    """

    def lines():
        # Stream the file line by line, dropping surrounding whitespace.
        with open(path) as f:
            for line in f:
                yield line.strip()

    def parse(lines):
        # Turn each usable line into a {"ts", "level", "msg"} record.
        for line in lines:
            if not line:
                continue
            # maxsplit=2 leaves the message intact (inner spaces included).
            parts = line.split(" ", 2)
            if len(parts) < 2:
                continue  # malformed line: no level field
            msg = parts[2] if len(parts) == 3 else ""
            yield {"ts": parts[0], "level": parts[1], "msg": msg}

    def filter_errors(records):
        # Keep only the records whose level is exactly "ERROR".
        return (r for r in records if r["level"] == "ERROR")

    # Compose the pipeline — O(1) memory regardless of file size
    pipeline = filter_errors(parse(lines()))
    for record in pipeline:
        print(record)
# Generator expressions in sum(), any(), all(), max():
# the consumer pulls one value at a time, so no intermediate list exists.
total = sum(i * i for i in range(1_000_000)) # no list created
has_big = any(x > 999 for x in range(1_000_000)) # True; any() stops at the first match
largest = max(abs(x) for x in [-5, 3, -8, 1]) # 8
Memory-Efficient Data Structures
import sys
from array import array
from collections import namedtuple
import numpy as np
# array.array — typed array of raw C values.
# NOTE(review): the original comment claimed "8x smaller than list of ints";
# the shallow getsizeof figures below show ~2x for the containers alone —
# presumably the 8x figure also counts the int objects the list points to.
py_list = list(range(1_000_000))
typed_array = array("i", range(1_000_000)) # signed int
print(f"list: {sys.getsizeof(py_list):,} bytes") # ~8,000,056
print(f"array: {sys.getsizeof(typed_array):,} bytes") # ~4,000,064
# namedtuple — ~30% smaller than dict; instances are tuples with named fields
Point = namedtuple("Point", ["x", "y", "z"])
p = Point(1.0, 2.0, 3.0)
print(sys.getsizeof(p)) # ~72 bytes (same as tuple)
# Tuple vs list (shallow sizes of the containers)
lst = [1, 2, 3, 4, 5]
tpl = (1, 2, 3, 4, 5)
print(sys.getsizeof(lst)) # 120
print(sys.getsizeof(tpl)) # 80
# NumPy arrays — massively more efficient than Python lists
py_floats = [float(i) for i in range(100_000)] # ~3.5 MB
np_floats = np.arange(100_000, dtype=np.float64) # 800 KB
np_f32 = np.arange(100_000, dtype=np.float32) # 400 KB
# Use int32 instead of int64 when values fit
data = np.array([1, 2, 3], dtype=np.int32) # 4 bytes/element
data64 = np.array([1, 2, 3], dtype=np.int64) # 8 bytes/element
# Pandas: use category for low-cardinality strings
import pandas as pd
df = pd.DataFrame({"country": ["US", "IN", "UK"] * 100_000})
# deep=True makes memory_usage include the Python string objects themselves
print(df.memory_usage(deep=True).sum()) # ~19 MB (string objects)
df["country"] = df["country"].astype("category")
print(df.memory_usage(deep=True).sum()) # ~300 KB — 60x smaller!
Weak References
Normal references keep objects alive. Weak references let you reference an object without preventing garbage collection — essential for caches that shouldn't cause memory leaks.
import weakref
import gc
class ExpensiveObject:
    """Named object carrying a 10,000-element list as stand-in "big" data.

    Args:
        name: Label printed when the instance is finalised.
    """

    def __init__(self, name):
        self.name = name
        self.data = list(range(10_000))  # some big data

    def __del__(self):
        # Makes the moment of collection visible in the demo output.
        print(f"{self.name} was garbage collected")
# Strong reference — object lives as long as any strong reference exists
obj = ExpensiveObject("regular")
strong_ref = obj # second strong reference — prevents GC
del obj # only the name 'obj' is removed; the object is still alive via strong_ref
# Weak reference — object can be GC'd even while ref exists
obj2 = ExpensiveObject("weakly referenced")
weak_ref = weakref.ref(obj2)
print(weak_ref()) # calling the ref returns obj2 while it is alive
del obj2 # drops the only strong reference
gc.collect()
print(weak_ref()) # returns None — object was collected
# WeakValueDictionary — cache that auto-removes evicted entries
cache = weakref.WeakValueDictionary()


def get_expensive(key: str) -> ExpensiveObject:
    """Return the cached ``ExpensiveObject`` for *key*, creating it if needed.

    The cache holds its values weakly, so an entry disappears as soon as
    no caller keeps a strong reference to the object.

    Args:
        key: Cache key; also used as the new object's name.

    Returns:
        The existing object for *key* if it is still alive, otherwise a
        freshly created one.
    """
    # A single .get() instead of "key in cache" followed by "cache[key]":
    # the value may be collected between those two steps, which would
    # turn the second lookup into a KeyError.
    obj = cache.get(key)
    if obj is None:
        obj = ExpensiveObject(key)
        cache[key] = obj  # weak reference — won't prevent GC
    return obj


a = get_expensive("report_A")
b = get_expensive("report_B")
print(len(cache))  # 2
del a, b
gc.collect()
print(len(cache))  # 0 — both were collected
NumPy Memory Tricks
import numpy as np
# Views vs copies — zero-copy slicing
a = np.arange(1_000_000)
view = a[::2] # view shares data with 'a' — no copy
copy = a[::2].copy() # explicit copy with its own buffer
# In-place operations — avoid creating temporaries
a = np.ones(1_000_000)
b = np.ones(1_000_000)
# BAD: a * 2 is materialised as a temporary array before b is added
result = a * 2 + b
# GOOD: in-place, no extra allocation — note that this overwrites 'a'
np.multiply(a, 2, out=a)
np.add(a, b, out=a)
# Memory-mapped files — work on arrays larger than RAM
# A .npy file starts with a header that records dtype and shape, so it is
# opened with np.load(mmap_mode=...), which skips that header. A raw
# np.memmap(...) call is only correct for headerless binary files; on a
# .npy file it would read the header bytes as array data.
# Expected contents here: float32, shape (10_000_000, 128).
mmap = np.load("large_data.npy", mmap_mode="r")
# Reads only the pages you access — rest stays on disk
subset = mmap[:1000]  # still a lazy view; rows are read from disk when accessed
Frequently Asked Questions
- How do I find what is causing memory growth in production?
- Use `tracemalloc` with periodic snapshots: compare two snapshots taken minutes apart to see what's accumulating. For live inspection, attach `memray` or use `pympler.tracker.SummaryTracker` to detect growing object counts over time.
- Can `__slots__` be used with inheritance?
- Yes, but each class in the hierarchy must declare `__slots__`. If any parent class has `__dict__` (no slots), the child inherits `__dict__` and slots only partially help. Define slots on every class in the chain for full benefit.
- Why does Python sometimes use more memory than expected after `del`?
- Python's memory allocator doesn't immediately return freed memory to the OS — it holds it in an internal pool for future allocations. This is why resident memory (RSS) often stays high after `del`. Use `gc.collect()` to force garbage collection, and check actual object counts with `gc.get_objects()`.