Python Web Scraping: BeautifulSoup and Playwright

Web scraping in Python comes down to two scenarios: static HTML pages that return all content in the initial response, and JavaScript-rendered pages where content loads dynamically. For static pages, httpx + BeautifulSoup is fast and lightweight. For SPAs and dynamic content, Playwright provides a full browser engine with async Python support. This guide covers both approaches, plus async batch scraping, data extraction patterns, and handling anti-bot measures.

BeautifulSoup: Static HTML Scraping

pip install httpx beautifulsoup4 lxml
import httpx
from bs4 import BeautifulSoup

# Default request headers passed to the httpx calls below
# (scrape_page, scrape_all, scrape_paginated) and used as the base
# for random_headers().
HEADERS = {
    # Browser-style User-Agent string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    # Preferred response languages.
    "Accept-Language": "en-US,en;q=0.9",
    # Accepted content types, HTML first.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

def scrape_page(url: str) -> BeautifulSoup:
    """Download *url* and return its body parsed with the ``lxml`` parser.

    The request sends ``HEADERS``, follows redirects and uses a 15 second
    timeout. ``raise_for_status()`` is called on the response before the
    body is parsed, so an error status propagates as an exception.
    """
    response = httpx.get(
        url,
        headers=HEADERS,
        follow_redirects=True,
        timeout=15,
    )
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "lxml")

# CSS selector (modern, recommended)
def extract_articles(url: str) -> list[dict]:
    """Collect the news cards found on the page at *url*.

    Each ``article.news-card`` element that contains an ``h2.card-title a``
    link produces one dict with the keys:

    * ``title``   - link text, whitespace-stripped
    * ``url``     - the link's ``href`` attribute, or ``None`` if absent
    * ``date``    - ``datetime`` attribute of the card's ``<time>`` element,
      or ``None`` when the card has no such element
    * ``summary`` - text of ``p.summary``, or ``""`` when missing

    Cards without a title link are skipped.
    """
    soup = scrape_page(url)
    articles = []
    for card in soup.select("article.news-card"):
        title_el = card.select_one("h2.card-title a")
        if not title_el:
            continue
        date_el = card.select_one("time[datetime]")
        summary_el = card.select_one("p.summary")
        articles.append({
            "title": title_el.get_text(strip=True),
            # .get() instead of ["href"]: an <a> without an href attribute
            # must not abort the whole extraction with a KeyError.
            "url": title_el.get("href"),
            "date": date_el["datetime"] if date_el else None,
            "summary": summary_el.get_text(strip=True) if summary_el else "",
        })
    return articles

# find() and find_all() — classic approach
def extract_table(url: str) -> list[dict]:
    """Return the rows of the first ``table.data-table`` on the page at *url*.

    Every data row becomes a dict mapping header text to cell text. An
    empty list is returned when the page has no matching table.

    Headers are read from ``thead th``; if the table has no ``<thead>``,
    the ``<th>`` cells of its first row are used instead. Data rows are
    read from ``tbody tr``; if that matches nothing, every ``<tr>`` in the
    table is considered. Rows without any ``<td>`` cell (for example a
    header row) are skipped.
    """
    soup = scrape_page(url)
    table = soup.find("table", {"class": "data-table"})
    if not table:
        return []

    headers = [th.get_text(strip=True) for th in table.select("thead th")]
    if not headers:
        # Fallback for markup without <thead>: take the header cells of
        # the first row.
        first_row = table.find("tr")
        if first_row:
            headers = [th.get_text(strip=True) for th in first_row.find_all("th")]

    # Fallback for markup without an explicit <tbody> wrapper.
    body_rows = table.select("tbody tr") or table.find_all("tr")

    rows = []
    for tr in body_rows:
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        if cells:
            rows.append(dict(zip(headers, cells)))
    return rows

Async Batch Scraping with httpx

For scraping many pages, async HTTP requests fetch dozens of pages concurrently without the overhead of threads.

import asyncio
import httpx
from bs4 import BeautifulSoup

async def scrape_one(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch *url* with *client* and report the page's first ``<h1>`` text.

    Returns ``{"url", "title", "status": "ok"}`` on success, where
    ``title`` is ``""`` if the page has no ``<h1>``. Any exception raised
    while fetching or parsing is caught and reported as
    ``{"url", "status": "error", "error": <message>}`` instead of
    propagating.
    """
    try:
        response = await client.get(url, timeout=10)
        response.raise_for_status()
        document = BeautifulSoup(response.text, "lxml")
        heading = document.select_one("h1")
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = ""
        return {"url": url, "title": title, "status": "ok"}
    except Exception as exc:
        return {"url": url, "status": "error", "error": str(exc)}

async def scrape_all(urls: list[str], concurrency: int = 10) -> list[dict]:
    """Scrape every URL in *urls* concurrently and return the results.

    At most *concurrency* scrapes run at once. Each one waits 0.5 seconds
    after acquiring its slot before calling ``scrape_one``. Results come
    back in the same order as *urls*.
    """
    gate = asyncio.Semaphore(concurrency)

    async def throttled(session, target):
        # Hold a semaphore slot for the pause and for the request itself.
        async with gate:
            await asyncio.sleep(0.5)  # polite delay between requests
            return await scrape_one(session, target)

    pool_limits = httpx.Limits(max_connections=20)
    async with httpx.AsyncClient(
        headers=HEADERS,
        follow_redirects=True,
        limits=pool_limits,
    ) as session:
        pending = [throttled(session, target) for target in urls]
        results = await asyncio.gather(*pending)
        return results

# Scrape with pagination
async def scrape_paginated(base_url: str, max_pages: int = 10) -> list[dict]:
    """Walk ``page=1 .. max_pages`` of *base_url* and collect ``.item`` texts.

    Pages are requested one after another with a one second pause between
    them. The walk stops early at the first page with no ``.item``
    element. Each collected element becomes ``{"text": <stripped text>}``.
    """
    # Append with "&" when base_url already has a query string; a second
    # "?" would produce a malformed URL (e.g. "/search?q=x?page=1").
    separator = "&" if "?" in base_url else "?"
    all_items = []
    async with httpx.AsyncClient(headers=HEADERS) as client:
        for page in range(1, max_pages + 1):
            url = f"{base_url}{separator}page={page}"
            resp = await client.get(url, timeout=10)
            soup = BeautifulSoup(resp.text, "lxml")
            items = soup.select(".item")
            if not items:
                break  # no more pages
            all_items.extend({"text": i.get_text(strip=True)} for i in items)
            await asyncio.sleep(1)  # rate limiting
    return all_items

Playwright: JavaScript-Rendered Pages

pip install playwright
playwright install chromium
import asyncio
from playwright.async_api import async_playwright, Page

async def scrape_spa(url: str) -> list[dict]:
    """Render *url* in headless Chromium and extract its product cards.

    The page is loaded until the network is idle, then the function waits
    up to 10 seconds for a ``.product-card`` element. Extraction runs in
    the page as JavaScript and yields one dict per card with the keys
    ``name``, ``price`` and ``rating``.
    """
    # Runs inside the page: one object per .product-card element.
    card_script = """() => {
            return Array.from(document.querySelectorAll('.product-card')).map(card => ({
                name: card.querySelector('h2')?.textContent?.trim(),
                price: card.querySelector('.price')?.textContent?.trim(),
                rating: card.querySelector('.rating')?.getAttribute('data-value'),
            }));
        }"""

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        ctx = await chromium.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
            viewport={"width": 1280, "height": 720},
        )
        tab = await ctx.new_page()

        await tab.goto(url, wait_until="networkidle")
        # Wait until the dynamically rendered cards are in the DOM.
        await tab.wait_for_selector(".product-card", timeout=10_000)

        products = await tab.evaluate(card_script)

        await chromium.close()
        return products

async def scrape_with_login(
    url: str,
    username: str,
    password: str,
    *,
    login_url: str = "https://example.com/login",
    success_url_pattern: str = "**/dashboard",
) -> str:
    """Log in through a form, then return the rendered HTML of *url*.

    Args:
        url: Page to fetch after logging in.
        username: Value typed into the ``#username`` field.
        password: Value typed into the ``#password`` field.
        login_url: Address of the login form. Defaults to the
            example.com URL that was previously hard-coded.
        success_url_pattern: URL glob waited for after submitting the
            form, used as the signal that login finished.

    Returns:
        The page's HTML after it reached network idle and was scrolled to
        the bottom once.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Login
        await page.goto(login_url)
        await page.fill("#username", username)
        await page.fill("#password", password)
        await page.click("#login-btn")
        await page.wait_for_url(success_url_pattern)

        # Navigate to target
        await page.goto(url)
        await page.wait_for_load_state("networkidle")

        # Scroll to the bottom, then pause so lazy content can load.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await asyncio.sleep(2)

        html = await page.content()
        await browser.close()
        return html

async def intercept_api_calls(url: str, api_path: str = "/api/products") -> list[dict]:
    """Capture the API responses the page makes instead of scraping HTML.

    Args:
        url: Page to open in Chromium.
        api_path: Substring a response URL must contain to be captured.
            Defaults to the path that was previously hard-coded.

    Returns:
        The concatenated ``"items"`` lists of every captured response
        whose JSON body is an object. Responses that cannot be decoded, or
        that have a different shape, are skipped.
    """
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        async def handle_response(response):
            if api_path not in response.url:
                return
            try:
                data = await response.json()
                # Only JSON objects carry an "items" key; ignore other
                # payload shapes explicitly.
                if isinstance(data, dict):
                    results.extend(data.get("items", []))
            except Exception:
                # Best effort: a body that is not JSON or has an
                # unexpected shape must not abort the page load.
                pass

        page.on("response", handle_response)
        await page.goto(url, wait_until="networkidle")
        await browser.close()

    return results

Data Extraction Patterns

import re
from bs4 import BeautifulSoup

def safe_text(el) -> str:
    """Return the stripped text of *el*, or ``""`` when *el* is falsy."""
    if not el:
        return ""
    return el.get_text(strip=True)

def safe_attr(el, attr: str, default: str = "") -> str:
    """Return ``el.get(attr, default)``, or *default* when *el* is falsy."""
    if not el:
        return default
    return el.get(attr, default)

def extract_price(text: str) -> float | None:
    """Extract numeric price from text like '$1,299.99'."""
    match = re.search(r"[\d,]+\.?\d*", text.replace(",", ""))
    return float(match.group()) if match else None

def extract_structured(soup: BeautifulSoup) -> dict:
    """Return the first JSON-LD block on the page that parses as JSON.

    ``<script type="application/ld+json">`` elements are tried in document
    order. A block that fails to parse is skipped. ``{}`` is returned when
    no block parses.
    """
    import json

    candidates = soup.find_all("script", type="application/ld+json")
    for candidate in candidates:
        try:
            parsed = json.loads(candidate.string)
        except Exception:
            continue
        else:
            return parsed
    return {}

def extract_meta(soup: BeautifulSoup) -> dict:
    """Collect Open Graph and Twitter Card ``<meta>`` tags into a dict.

    The key is the tag's ``property`` attribute, falling back to ``name``.
    Only keys starting with ``og:`` or ``twitter:`` and having non-empty
    ``content`` are kept. A later tag with the same key overwrites an
    earlier one.
    """
    wanted_prefixes = ("og:", "twitter:")
    collected = {}
    for tag in soup.find_all("meta"):
        key = tag.get("property") or tag.get("name", "")
        value = tag.get("content", "")
        if not key.startswith(wanted_prefixes) or not value:
            continue
        collected[key] = value
    return collected

Handling Anti-Bot Measures

import random
import time

# Randomized delays to appear human
async def polite_delay(min_s=1.0, max_s=3.0):
    """Sleep for a random duration drawn uniformly from [min_s, max_s] seconds."""
    pause = random.uniform(min_s, max_s)
    await asyncio.sleep(pause)

# Rotate user agents
# Pool of User-Agent strings that random_headers() picks from.
# All three are Chrome 124 strings, one each for Windows, macOS and Linux.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]

def random_headers() -> dict:
    """Return a copy of ``HEADERS`` with a randomly chosen User-Agent.

    The User-Agent is drawn from ``USER_AGENTS``. ``HEADERS`` itself is
    not modified.
    """
    headers = dict(HEADERS)
    headers["User-Agent"] = random.choice(USER_AGENTS)
    return headers

# Handle 429 Rate Limit with exponential backoff
async def scrape_with_retry(client, url: str, max_retries: int = 5) -> httpx.Response:
    """GET *url*, retrying with exponential backoff while the server answers 429.

    Each attempt sends freshly randomised headers. After the n-th rate
    limited attempt (counting from 0) the function waits ``2 ** n``
    seconds plus up to one second of jitter before trying again.

    Args:
        client: Async HTTP client whose ``get`` coroutine is awaited.
        url: Address to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The first response that is not a 429 and passes
        ``raise_for_status()``.

    Raises:
        RuntimeError: Every attempt was answered with 429.
        Exception: Whatever ``raise_for_status()`` raises for a non-429
            error status.
    """
    for attempt in range(max_retries):
        resp = await client.get(url, headers=random_headers())
        if resp.status_code != 429:
            # Any successful status is returned, not just 200. Previously
            # a 201/204 passed raise_for_status() silently and the request
            # was repeated until the retry budget ran out.
            resp.raise_for_status()
            return resp
        if attempt == max_retries - 1:
            break  # no attempt left, so do not sleep before failing
        wait = (2 ** attempt) + random.random()
        print(f"Rate limited. Waiting {wait:.1f}s before retry {attempt+1}")
        await asyncio.sleep(wait)
    raise RuntimeError(f"Failed after {max_retries} retries")

Storing Scraped Data

import json
import csv
import sqlite3
from pathlib import Path

def save_json(data: list[dict], path: str):
    """Write *data* to *path* as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so
    the encoding is fixed to UTF-8 rather than left to the platform
    default, which may be unable to encode them.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    Path(path).write_text(payload, encoding="utf-8")

def save_csv(data: list[dict], path: str):
    """Write *data* to *path* as a UTF-8 CSV file with a header row.

    The columns are the union of the keys of all rows, in first-seen
    order, so rows with differing keys (for example mixed success and
    error records) can be written together. Keys missing from a row are
    written as empty cells. Nothing is written when *data* is empty.
    """
    if not data:
        return
    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def save_sqlite(data: list[dict], db_path: str, table: str = "scraped"):
    """Store *data* in a SQLite table, one row per dict.

    The table is created if it does not exist, with one ``TEXT`` column
    per key. The columns are the union of the keys of all rows, in
    first-seen order. Values are bound by column name, so rows whose keys
    are ordered differently still land in the right columns, and keys
    missing from a row are stored as ``NULL``.

    Table and column names are written as quoted identifiers, so names
    containing spaces or quote characters are safe to use. All inserts
    run in one transaction that is rolled back on error, and the
    connection is always closed.

    Nothing is done when *data* is empty or no row has any key.
    """
    if not data:
        return

    def quote(identifier) -> str:
        # SQLite quoted identifier: wrap in double quotes, double any
        # embedded double quote.
        return '"' + str(identifier).replace('"', '""') + '"'

    columns = list(dict.fromkeys(key for row in data for key in row))
    if not columns:
        return

    table_sql = quote(table)
    column_defs = ", ".join(f"{quote(col)} TEXT" for col in columns)
    column_list = ", ".join(quote(col) for col in columns)
    placeholders = ", ".join("?" for _ in columns)

    conn = sqlite3.connect(db_path)
    try:
        # "with conn" commits on success and rolls back on an exception.
        with conn:
            conn.execute(f"CREATE TABLE IF NOT EXISTS {table_sql} ({column_defs})")
            conn.executemany(
                f"INSERT OR REPLACE INTO {table_sql} ({column_list}) "
                f"VALUES ({placeholders})",
                [tuple(row.get(col) for col in columns) for row in data],
            )
    finally:
        conn.close()

Ethical Scraping Guidelines

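  • Check robots.txt before scraping — fetch it with httpx.get("https://example.com/robots.txt") and follow its rules (the standard library's urllib.robotparser can evaluate them for you)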
  • Respect rate limits — add delays between requests, use exponential backoff on 429
  • Identify yourself — include a contact email in your User-Agent string
  • Don't scrape personal data without explicit permission or legal basis
  • Use official APIs when available — they're more reliable and explicitly permitted
  • Cache aggressively — don't re-scrape unchanged pages unnecessarily

Frequently Asked Questions

BeautifulSoup vs lxml direct — which parser is faster?
lxml is the fastest parser for BeautifulSoup (5-10x faster than Python's html.parser). Use BeautifulSoup(html, "lxml"). For maximum performance, use lxml directly via lxml.html.fromstring() and XPath, but BeautifulSoup's API is easier to use for most cases.
When should I use Playwright vs httpx?
Use httpx + BeautifulSoup for static HTML pages where all content is in the initial HTTP response. Use Playwright when the page uses React, Angular, Vue, or other frameworks that render content client-side via JavaScript, or when you need to interact with forms, infinite scroll, or dynamic loading.
Is web scraping legal?
It depends on jurisdiction, the site's Terms of Service, and what you do with the data. Scraping publicly available, non-personal data for research or personal use is often permitted, but the rules vary by country and are not settled everywhere. Scraping behind authentication, exporting personal data (which falls under laws such as the GDPR), or violating the ToS can create legal risk. When in doubt, use the official API or contact the site owner.