Python Web Scraping: BeautifulSoup and Playwright
Web scraping in Python comes down to two scenarios: static HTML pages that return all content in the initial response, and JavaScript-rendered pages where content loads dynamically. For static pages, httpx + BeautifulSoup is fast and lightweight. For SPAs and dynamic content, Playwright provides a full browser engine with async Python support. This guide covers both approaches, plus async batch scraping, data extraction patterns, and handling anti-bot measures.
Table of Contents
BeautifulSoup: Static HTML Scraping
pip install httpx beautifulsoup4 lxml
import httpx
from bs4 import BeautifulSoup
# Default browser-like request headers shared by the scraping helpers below.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
def scrape_page(url: str) -> BeautifulSoup:
    """Fetch *url* and return its body parsed with the lxml parser.

    The request sends the module-level HEADERS, follows redirects and
    uses a 15 second timeout. ``raise_for_status()`` is called on the
    response before parsing, so HTTP error statuses propagate as
    exceptions instead of being parsed as pages.
    """
    response = httpx.get(
        url,
        headers=HEADERS,
        follow_redirects=True,
        timeout=15,
    )
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "lxml")
# CSS selector (modern, recommended)
def extract_articles(url: str) -> list[dict]:
    """Scrape the news cards on *url* into a list of dicts.

    Each ``article.news-card`` that contains a title link yields a dict
    with the keys ``title``, ``url``, ``date`` and ``summary``. Cards
    without a title link are skipped.

    ``url`` is the link's ``href`` exactly as written in the page (it is
    not resolved against the page URL) and is ``None`` when the anchor
    has no ``href``. ``date`` is ``None`` when the card has no
    ``<time datetime=...>`` element; ``summary`` is ``""`` when the card
    has no ``p.summary``.
    """
    soup = scrape_page(url)
    articles = []
    for card in soup.select("article.news-card"):
        title_el = card.select_one("h2.card-title a")
        if not title_el:
            continue
        date_el = card.select_one("time[datetime]")
        summary_el = card.select_one("p.summary")
        articles.append({
            "title": title_el.get_text(strip=True),
            # .get() rather than ["href"]: one anchor without an href must
            # not abort extraction of the whole page with a KeyError.
            "url": title_el.get("href"),
            "date": date_el["datetime"] if date_el else None,
            "summary": summary_el.get_text(strip=True) if summary_el else "",
        })
    return articles
# find() and find_all() — classic approach
def extract_table(url: str) -> list[dict]:
    """Scrape the ``table.data-table`` on *url* into one dict per body row.

    Column names are the stripped texts of the ``thead th`` cells. Each
    ``tbody tr`` that has at least one ``td`` becomes a dict pairing the
    column names with the cell texts by position (``zip`` stops at the
    shorter of the two). Returns ``[]`` when the table is not found.
    """
    table = scrape_page(url).find("table", {"class": "data-table"})
    if not table:
        return []

    column_names = [th.get_text(strip=True) for th in table.select("thead th")]
    records = []
    for tr in table.select("tbody tr"):
        values = [td.get_text(strip=True) for td in tr.find_all("td")]
        if not values:
            # Rows without any <td> contribute nothing.
            continue
        records.append(dict(zip(column_names, values)))
    return records
Async Batch Scraping with httpx
For scraping many pages, async HTTP requests fetch dozens of pages concurrently without the overhead of threads.
import asyncio
import httpx
from bs4 import BeautifulSoup
async def scrape_one(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch *url* with *client* and report the page's first ``<h1>`` text.

    Never raises: any exception during the request, the status check or
    parsing is converted into an error record so that one bad URL does
    not cancel a whole batch.

    Returns:
        ``{"url", "title", "status": "ok"}`` on success, where ``title``
        is ``""`` if the page has no ``<h1>``; otherwise
        ``{"url", "status": "error", "error": <message>}``.
    """
    try:
        resp = await client.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        # Query the heading once instead of running the selector twice.
        heading = soup.select_one("h1")
        return {
            "url": url,
            "title": heading.get_text(strip=True) if heading else "",
            "status": "ok",
        }
    except Exception as e:  # deliberate best-effort boundary for batch scraping
        return {"url": url, "status": "error", "error": str(e)}
async def scrape_all(urls: list[str], concurrency: int = 10) -> list[dict]:
    """Scrape every URL in *urls* concurrently and return the results.

    At most *concurrency* scrapes run at once. All requests share one
    ``httpx.AsyncClient`` configured with the module-level HEADERS,
    redirect following and a 20-connection limit. The results come from
    ``asyncio.gather`` over one ``scrape_one`` call per URL.
    """
    gate = asyncio.Semaphore(concurrency)

    async def bounded_scrape(client, url):
        async with gate:
            # Polite delay between requests; it is taken while holding the
            # semaphore slot, so each slot pauses before issuing its request.
            await asyncio.sleep(0.5)
            return await scrape_one(client, url)

    pool_limits = httpx.Limits(max_connections=20)
    async with httpx.AsyncClient(
        headers=HEADERS,
        follow_redirects=True,
        limits=pool_limits,
    ) as client:
        return await asyncio.gather(*(bounded_scrape(client, u) for u in urls))
# Scrape with pagination
async def scrape_paginated(base_url: str, max_pages: int = 10) -> list[dict]:
    """Collect the text of every ``.item`` element across numbered pages.

    Requests ``{base_url}?page=1``, ``?page=2``, ... up to *max_pages*,
    one at a time, and stops at the first page containing no ``.item``
    elements. Returns one ``{"text": ...}`` dict per item, in page order.
    """
    collected = []
    async with httpx.AsyncClient(headers=HEADERS) as client:
        page = 1
        while page <= max_pages:
            resp = await client.get(f"{base_url}?page={page}", timeout=10)
            matches = BeautifulSoup(resp.text, "lxml").select(".item")
            if not matches:
                # An empty page is treated as the end of the listing.
                break
            for node in matches:
                collected.append({"text": node.get_text(strip=True)})
            await asyncio.sleep(1)  # rate limiting between page requests
            page += 1
    return collected
Playwright: JavaScript-Rendered Pages
pip install playwright
playwright install chromium
import asyncio
from playwright.async_api import async_playwright, Page
async def scrape_spa(url: str) -> list[dict]:
    """Render *url* in headless Chromium and extract its product cards.

    Navigates with ``wait_until="networkidle"``, waits up to 10 seconds
    for a ``.product-card`` element, then evaluates JavaScript in the
    page to read each card.

    Returns:
        One dict per ``.product-card`` with the keys ``name``, ``price``
        and ``rating`` (the ``data-value`` attribute of ``.rating``).

    Raises:
        Whatever Playwright raises when navigation or the selector wait
        fails; the browser is closed before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                # NOTE(review): this user agent ends in a literal "..." and
                # looks like a truncated placeholder; confirm before real use.
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
                viewport={"width": 1280, "height": 720},
            )
            page = await context.new_page()
            await page.goto(url, wait_until="networkidle")
            # Wait for dynamic content to load
            await page.wait_for_selector(".product-card", timeout=10_000)
            # Extract data via JavaScript evaluation
            items = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('.product-card')).map(card => ({
                    name: card.querySelector('h2')?.textContent?.trim(),
                    price: card.querySelector('.price')?.textContent?.trim(),
                    rating: card.querySelector('.rating')?.getAttribute('data-value'),
                }));
            }""")
        finally:
            # Close explicitly even when goto/wait_for_selector raises.
            await browser.close()
        return items
async def scrape_with_login(
    url: str,
    username: str,
    password: str,
    login_url: str = "https://example.com/login",
) -> str:
    """Log in through a form, open *url* and return its rendered HTML.

    Args:
        url: Page to capture after logging in.
        username: Value typed into the ``#username`` field.
        password: Value typed into the ``#password`` field.
        login_url: Address of the login form. Defaults to the previously
            hard-coded ``https://example.com/login``.

    Returns:
        The page's HTML after scrolling to the bottom and pausing two
        seconds so lazily loaded content has a chance to appear.

    Raises:
        Whatever Playwright raises when a step fails (for example, the
        post-login URL never matching ``**/dashboard``); the browser is
        closed before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # Login
            await page.goto(login_url)
            await page.fill("#username", username)
            await page.fill("#password", password)
            await page.click("#login-btn")
            await page.wait_for_url("**/dashboard")
            # Navigate to target
            await page.goto(url)
            await page.wait_for_load_state("networkidle")
            # Scroll to load lazy content
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(2)
            html = await page.content()
        finally:
            # Close explicitly even when a login or navigation step raises.
            await browser.close()
        return html
async def intercept_api_calls(url: str, api_path: str = "/api/products") -> list[dict]:
    """Capture the API responses the page makes instead of scraping HTML.

    Args:
        url: Page to load; navigation waits for ``networkidle``.
        api_path: Substring a response URL must contain to be captured.
            Defaults to the previously hard-coded ``/api/products``.

    Returns:
        The concatenated ``"items"`` lists of every matching response
        whose JSON body is an object. Responses that cannot be decoded
        or have a different shape are skipped.
    """
    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()

            async def handle_response(response):
                if api_path not in response.url:
                    return
                try:
                    data = await response.json()
                    # Only JSON objects carry an "items" list; arrays and
                    # scalars are skipped explicitly rather than through a
                    # swallowed AttributeError.
                    if isinstance(data, dict):
                        results.extend(data.get("items", []))
                except Exception:
                    # Best-effort capture: a non-JSON or unreadable body
                    # must not break the page load.
                    pass

            page.on("response", handle_response)
            await page.goto(url, wait_until="networkidle")
        finally:
            # Close explicitly even when navigation raises.
            await browser.close()
    return results
Data Extraction Patterns
import re
from bs4 import BeautifulSoup
def safe_text(el) -> str:
    """Return the stripped text of *el*, or ``""`` when *el* is falsy."""
    if not el:
        return ""
    return el.get_text(strip=True)
def safe_attr(el, attr: str, default: str = "") -> str:
    """Return ``el.get(attr, default)``, or *default* when *el* is falsy."""
    if not el:
        return default
    return el.get(attr, default)
def extract_price(text: str) -> float | None:
"""Extract numeric price from text like '$1,299.99'."""
match = re.search(r"[\d,]+\.?\d*", text.replace(",", ""))
return float(match.group()) if match else None
def extract_structured(soup: BeautifulSoup) -> dict:
    """Extract JSON-LD structured data from a page.

    Scans the ``<script type="application/ld+json">`` elements in
    document order and returns the first one that parses as JSON.
    Scripts with no text or with invalid JSON are skipped.

    Returns:
        The decoded JSON value, or ``{}`` when no script parses. The
        value is whatever ``json.loads`` produces, so a page whose
        top-level JSON-LD is an array yields a list.
    """
    import json

    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # No single text child (or an empty one): nothing to parse.
            continue
        try:
            return json.loads(raw)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; try the next script.
            continue
    return {}
def extract_meta(soup: BeautifulSoup) -> dict:
    """Extract Open Graph and Twitter Card meta tags.

    Each ``<meta>`` tag is keyed by its ``property`` attribute, falling
    back to ``name``. Only keys starting with ``og:`` or ``twitter:``
    that have a non-empty ``content`` are kept; a later tag with the same
    key overwrites an earlier one.
    """
    wanted_prefixes = ("og:", "twitter:")
    collected = {}
    for tag in soup.find_all("meta"):
        key = tag.get("property") or tag.get("name", "")
        value = tag.get("content", "")
        if not key.startswith(wanted_prefixes) or not value:
            continue
        collected[key] = value
    return collected
Handling Anti-Bot Measures
import random
import time
# Randomized delays to appear human
async def polite_delay(min_s=1.0, max_s=3.0):
    """Sleep for a random duration drawn uniformly from [min_s, max_s] seconds."""
    pause = random.uniform(min_s, max_s)
    await asyncio.sleep(pause)
# Rotate user agents
# Pool of desktop Chrome user-agent strings (Windows, macOS, Linux) to pick from.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]
def random_headers() -> dict:
    """Return a copy of HEADERS whose User-Agent is a random USER_AGENTS entry."""
    headers = dict(HEADERS)
    headers["User-Agent"] = random.choice(USER_AGENTS)
    return headers
# Handle 429 Rate Limit with exponential backoff
async def scrape_with_retry(client, url: str, max_retries: int = 5) -> httpx.Response:
    """GET *url*, retrying with exponential backoff while the server answers 429.

    Each attempt sends a fresh ``random_headers()`` set. After a 429 the
    function waits ``2 ** attempt`` seconds plus up to one second of
    random jitter before trying again.

    Args:
        client: Async HTTP client used for the request.
        url: Address to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The first response that is not a 429 and passes
        ``raise_for_status()``. This includes successful statuses other
        than 200, which were previously re-requested until the function
        gave up.

    Raises:
        RuntimeError: every attempt was answered with 429.
        Exception: whatever ``raise_for_status()`` raises for any other
            error status.
    """
    for attempt in range(max_retries):
        resp = await client.get(url, headers=random_headers())
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp
        if attempt + 1 < max_retries:
            # Back off only when another attempt follows; sleeping after
            # the final 429 would just delay the failure.
            wait = (2 ** attempt) + random.random()
            print(f"Rate limited. Waiting {wait:.1f}s before retry {attempt+1}")
            await asyncio.sleep(wait)
    raise RuntimeError(f"Failed after {max_retries} retries")
Storing Scraped Data
import json
import csv
import sqlite3
from pathlib import Path
def save_json(data: list[dict], path: str):
    """Write *data* to *path* as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so
    the file is always encoded as UTF-8 instead of the platform default,
    which cannot represent every character on some systems.
    """
    Path(path).write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
def save_csv(data: list[dict], path: str):
    """Write *data* to *path* as a UTF-8 CSV file with a header row.

    The columns are the union of the keys of all rows, in first-seen
    order, so rows with differing keys (for example success and error
    records from the same batch) can be saved together. Keys missing from
    a row are written as empty cells. Nothing is written when *data* is
    empty.
    """
    if not data:
        return
    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def save_sqlite(data: list[dict], db_path: str, table: str = "scraped"):
    """Append *data* to a table of TEXT columns in the SQLite file *db_path*.

    The table is created if it does not exist. Its columns are the union
    of the keys of all rows, in first-seen order. Values are inserted by
    column name, so rows may list their keys in any order; keys missing
    from a row are stored as NULL. Nothing happens when *data* is empty
    or contains no keys.

    The table and column names are quoted as SQL identifiers, so names
    containing spaces or quotes are safe. A consequence is that *table*
    is always a plain table name, never a ``schema.table`` pair.
    """
    if not data:
        return

    def quote(identifier) -> str:
        # SQL identifier quoting: wrap in double quotes, double any inside.
        return '"' + str(identifier).replace('"', '""') + '"'

    columns = list(dict.fromkeys(key for row in data for key in row))
    if not columns:
        return

    table_sql = quote(table)
    column_defs = ", ".join(f"{quote(col)} TEXT" for col in columns)
    column_list = ", ".join(quote(col) for col in columns)
    placeholders = ", ".join("?" for _ in columns)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute(f"CREATE TABLE IF NOT EXISTS {table_sql} ({column_defs})")
        conn.executemany(
            f"INSERT OR REPLACE INTO {table_sql} ({column_list}) VALUES ({placeholders})",
            [tuple(row.get(col) for col in columns) for row in data],
        )
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
Ethical Scraping Guidelines
- Check robots.txt before scraping: `httpx.get("https://example.com/robots.txt")`
- Respect rate limits — add delays between requests, use exponential backoff on 429
- Identify yourself — include a contact email in your User-Agent string
- Don't scrape personal data without explicit permission or legal basis
- Use official APIs when available — they're more reliable and explicitly permitted
- Cache aggressively — don't re-scrape unchanged pages unnecessarily
Frequently Asked Questions
- BeautifulSoup vs lxml direct — which parser is faster?
- `lxml` is the fastest parser for BeautifulSoup (5-10x faster than Python's `html.parser`). Use `BeautifulSoup(html, "lxml")`. For maximum performance, use `lxml` directly via `lxml.html.fromstring()` and XPath, but BeautifulSoup's API is easier to use for most cases.
- When should I use Playwright vs httpx?
- Use httpx + BeautifulSoup for static HTML pages where all content is in the initial HTTP response. Use Playwright when the page uses React, Angular, Vue, or other frameworks that render content client-side via JavaScript, or when you need to interact with forms, infinite scroll, or dynamic loading.
- Is web scraping legal?
- It depends on jurisdiction, the site's Terms of Service, and what you do with the data. Scraping publicly available, non-personal data for research or personal use is generally legal in most countries. Scraping behind authentication, exporting personal data (GDPR), or violating ToS can create legal risk. When in doubt, use the official API or contact the site owner.