Python NumPy: Array Operations and Scientific Computing

NumPy is the numerical computing backbone of Python. Every major data science library — Pandas, Scikit-learn, TensorFlow, PyTorch — is built on NumPy arrays. Understanding NumPy lets you write vectorized code that operates on millions of numbers in microseconds instead of seconds. This guide covers array creation, indexing, broadcasting, linear algebra, random number generation, and performance patterns every Python data engineer needs.

Array Creation and Properties

import numpy as np

# From Python lists
a = np.array([1, 2, 3, 4, 5])
b = np.array([[1, 2, 3], [4, 5, 6]])  # 2D array

# Built-in constructors
zeros = np.zeros((3, 4))             # 3×4 matrix of zeros
ones = np.ones((2, 3), dtype=np.float32)
eye = np.eye(4)                       # 4×4 identity matrix
ramp = np.arange(0, 10, 0.5)         # [0, 0.5, 1.0, ..., 9.5]
linspace = np.linspace(0, 1, 100)    # 100 evenly spaced from 0 to 1

# Array properties
print(b.shape)    # (2, 3)
print(b.ndim)     # 2
print(b.dtype)    # dtype('int64')
print(b.size)     # 6 (total elements)
print(b.nbytes)   # 48 (bytes in memory)

# Type control — use float32 instead of float64 to halve memory for ML
x = np.array([1.5, 2.5, 3.5], dtype=np.float32)
y = x.astype(np.float64)  # convert

Indexing and Slicing

arr = np.arange(24).reshape(4, 6)
# array([[ 0,  1,  2,  3,  4,  5],
#        [ 6,  7,  8,  9, 10, 11],
#        [12, 13, 14, 15, 16, 17],
#        [18, 19, 20, 21, 22, 23]])

# Basic slicing — [rows, cols]
arr[0, :]     # first row: [0, 1, 2, 3, 4, 5]
arr[:, 2]     # third column: [2, 8, 14, 20]
arr[1:3, 2:5] # sub-matrix rows 1-2, cols 2-4
arr[::2, ::2] # every other row and column

# Boolean indexing
mask = arr > 10
arr[mask]               # flat array of values > 10
arr[arr % 2 == 0]       # even numbers only
arr[(arr > 5) & (arr < 15)]

# Fancy indexing — select specific rows/cols
rows = np.array([0, 2])
cols = np.array([1, 4])
arr[rows, cols]  # elements at (0,1) and (2,4): [1, 16]

# Modify with indexing
arr[arr < 0] = 0  # clip negatives to 0
arr[0, :] = 99    # set first row to 99

Vectorized Operations

a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])

# Element-wise arithmetic (no loops needed)
a + b         # [11, 22, 33, 44]
a * b         # [10, 40, 90, 160]
a ** 2        # [1, 4, 9, 16]
np.sqrt(a)    # [1.0, 1.41, 1.73, 2.0]

# Universal functions (ufuncs)
np.exp(a)     # e^1, e^2, e^3, e^4
np.log(a)     # natural log
np.abs(np.array([-1, -2, 3]))  # [1, 2, 3]
np.sin(np.linspace(0, np.pi, 100))  # sine wave

# Reductions
a.sum()       # 10
a.mean()      # 2.5
a.std()       # standard deviation
a.min(), a.max()
a.argmin(), a.argmax()  # indices of min/max
np.cumsum(a)  # [1, 3, 6, 10]

# Axis-based reductions on 2D
m = np.arange(12).reshape(3, 4)
m.sum(axis=0)   # sum per column: [12, 15, 18, 21]
m.sum(axis=1)   # sum per row: [6, 22, 38]
m.mean(axis=0)  # mean per column

# Conditional selection
np.where(a > 2, a * 10, 0)  # [0, 0, 30, 40]
np.clip(a, 2, 3)             # [2, 2, 3, 3]

Broadcasting

Broadcasting is NumPy's rule for operating on arrays of different shapes without copying data. NumPy virtually stretches the smaller array to match the larger one. The rule: starting from trailing dimensions, sizes must match or one must be 1.

# Add a scalar to a 2D array — scalar broadcasts to all elements
m = np.ones((3, 4))
m + 10  # all elements become 11

# Add a 1D array (shape 4) to a 2D array (shape 3×4)
# 1D array broadcasts across rows
row_offsets = np.array([0, 10, 20, 30])  # shape (4,)
m + row_offsets  # each row gets [0, 10, 20, 30] added

# Add a column vector (shape 3×1) to each column
col_offsets = np.array([[100], [200], [300]])  # shape (3, 1)
m + col_offsets  # row 0 gets +100, row 1 gets +200, etc.

# Real use case: normalize each feature (column) in a dataset
data = np.random.randn(1000, 20)  # 1000 samples, 20 features
mean = data.mean(axis=0)          # shape (20,) — mean per feature
std = data.std(axis=0)            # shape (20,)
normalized = (data - mean) / std  # broadcasting: (1000,20) - (20,) / (20,)

# Outer product via broadcasting
x = np.arange(1, 4).reshape(3, 1)  # [[1],[2],[3]]
y = np.arange(1, 5).reshape(1, 4)  # [[1, 2, 3, 4]]
outer = x * y  # 3×4 matrix — multiplication table

Linear Algebra

A = np.array([[2, 1], [5, 3]])
b = np.array([4, 7])

# Matrix multiplication
C = np.array([[1, 2], [3, 4]])
D = A @ C         # preferred: matrix multiply operator (Python 3.5+)
D = np.dot(A, C)  # equivalent

# Linear algebra operations
det = np.linalg.det(A)           # determinant: 1.0
inv = np.linalg.inv(A)           # inverse
x = np.linalg.solve(A, b)        # solve Ax = b (faster than inv)
eigenvalues, eigenvectors = np.linalg.eig(A)

# SVD (used in PCA, dimensionality reduction)
U, s, Vt = np.linalg.svd(A, full_matrices=False)
# Reconstruct: U @ np.diag(s) @ Vt

# Norms
np.linalg.norm(b)           # L2 norm (Euclidean)
np.linalg.norm(b, ord=1)    # L1 norm
np.linalg.norm(A, 'fro')    # Frobenius norm

# Least squares regression
X = np.column_stack([np.ones(100), np.random.randn(100)])  # features
y = 3 + 2 * X[:, 1] + np.random.randn(100) * 0.5          # targets
coeffs, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print(f"intercept={coeffs[0]:.2f}, slope={coeffs[1]:.2f}")

Random Number Generation

# NumPy 1.17+ recommended API: np.random.default_rng()
rng = np.random.default_rng(seed=42)  # reproducible

# Distributions
uniform = rng.uniform(0, 1, size=(3, 4))
normal = rng.standard_normal(size=1000)
integers = rng.integers(0, 100, size=50)
choice = rng.choice([10, 20, 30, 40], size=5, replace=True)

# Specific distributions for ML/stats
poisson = rng.poisson(lam=5, size=100)
binomial = rng.binomial(n=10, p=0.3, size=100)
exponential = rng.exponential(scale=2.0, size=100)

# Shuffle and permutation
arr = np.arange(10)
rng.shuffle(arr)                      # in-place shuffle
perm = rng.permutation(10)            # returns shuffled array
sample = rng.permutation(arr)[:5]     # random sample without replacement

# Train/test split without sklearn
idx = rng.permutation(len(data))
split = int(0.8 * len(data))
train_idx, test_idx = idx[:split], idx[split:]

Practical ML Patterns

rng = np.random.default_rng(42)

# Softmax function
def softmax(x):
    x_shifted = x - x.max(axis=-1, keepdims=True)  # numerical stability
    e = np.exp(x_shifted)
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([[2.0, 1.0, 0.1], [0.5, 3.0, 0.2]])
probs = softmax(logits)  # each row sums to 1

# One-hot encoding
def one_hot(labels, num_classes):
    oh = np.zeros((len(labels), num_classes))
    oh[np.arange(len(labels)), labels] = 1
    return oh

labels = np.array([0, 2, 1, 3])
encoded = one_hot(labels, 4)

# Cosine similarity matrix (for embedding search)
def cosine_similarity_matrix(A):
    norms = np.linalg.norm(A, axis=1, keepdims=True)
    A_norm = A / (norms + 1e-8)
    return A_norm @ A_norm.T

embeddings = rng.standard_normal((100, 768))  # 100 sentences, 768-dim
sim = cosine_similarity_matrix(embeddings)  # 100×100 similarity matrix
top_k = np.argsort(sim[0])[-5:][::-1]       # top 5 similar to first

Performance and Memory

import numpy as np
import time

# Views vs copies — crucial for performance
a = np.arange(1_000_000)
view = a[::2]       # view — shares memory, zero-copy
copy = a[::2].copy() # explicit copy

# Contiguous arrays — much faster for many operations
a_c = np.ascontiguousarray(a.T)  # make C-contiguous after transpose

# memmap — work with arrays larger than RAM
large = np.memmap("data.bin", dtype="float32", mode="w+", shape=(100_000, 768))
large[0] = rng.standard_normal(768)

# numexpr — faster than NumPy for complex expressions
import numexpr as ne
a, b, c = np.random.randn(3, 1_000_000)
result = ne.evaluate("2*a + b/c - sqrt(abs(a))")  # ~3x faster than NumPy

# Avoid temporary arrays with out= parameter
out = np.empty_like(a)
np.multiply(a, 2, out=out)  # no temporary array created

Frequently Asked Questions

When should I use NumPy vs PyTorch/TensorFlow?
NumPy runs on CPU only and doesn't support autograd (automatic differentiation). Use NumPy for data preprocessing, feature engineering, and classical ML. Use PyTorch or TensorFlow when you need GPU acceleration, gradient computation, or neural network training.
Why is broadcasting faster than explicit loops?
Broadcasting operations are implemented in C inside NumPy and run in a tight loop that benefits from CPU SIMD (vectorized) instructions. Python loops have interpreter overhead per iteration; NumPy eliminates it by operating on entire arrays at once.
What is the difference between np.copy() and np.array()?
np.array(x) always returns a new array (a copy if the input is already an array). np.copy(x) is an explicit copy. x.view() creates a view that shares the same data buffer — modifying the view modifies the original.