Chapter 9

Web Scraping — httpx + BeautifulSoup + Playwright

Chapter 9: Web Scraping — httpx + BeautifulSoup + Playwright

The web is full of data, but copying it by hand is the worst possible approach. This chapter covers tool selection, making efficient HTTP requests with httpx, parsing HTML with BeautifulSoup, handling JavaScript-rendered pages with Playwright, anti-scraping countermeasures, and persistent storage — all culminating in a competitor price monitoring system.

Tool Selection

HTTP Clients: requests vs httpx

Aspect	requests	httpx
Async support	No (sync only)	Full async/await
HTTP/2	No	Native support
API style	Very simple, beginner-friendly	Nearly identical to requests
Best for	Single pages, small batches	High-concurrency, large-scale

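Recommendation: Use httpx for all new projects. Its API is nearly identical to requests, so the migration cost is minimal (the main difference to remember is that httpx does not follow redirects unless you pass follow_redirects=True), and the ceiling is far higher thanks to async and HTTP/2 support.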

HTML Parsing

Library	Strengths	Weaknesses	Best for
BeautifulSoup4	Most docs, easiest to learn	Slowest	Learning, small projects
lxml	Fast, XPath support	C dependency install	High-volume parsing
parsel	CSS + XPath, same as Scrapy	Smaller community	Flexible needs

Dynamic Pages: Playwright vs Selenium

Playwright outperforms Selenium on every dimension: faster execution, built-in smart waits, better async support, cleaner API. For any new project, choose Playwright.

Decision Tree

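- Static HTML, small batch → httpx (sync client) + BeautifulSoup
- Static HTML, large concurrent batch → httpx.AsyncClient + asyncio
- Requires JavaScript rendering / login / click interactions → Playwright
- Large-scale production scraper → a dedicated crawling framework such as Scrapy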

httpx + BeautifulSoup Basics

pip install httpx beautifulsoup4 lxml

Requests with Headers and Session Reuse

import httpx

# Browser-like default headers attached to every request sent through the
# client created below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# The `with` block closes the client (and its connections) on exit.
# Redirect following is requested explicitly via follow_redirects=True.
with httpx.Client(headers=HEADERS, timeout=15, follow_redirects=True) as client:
    resp = client.get("https://example.com/news")
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    resp.raise_for_status()
    # Decoded response body; used as the parser input in the next snippet.
    html = resp.text

CSS Selectors vs XPath

from bs4 import BeautifulSoup

# Parse the previously fetched markup using the lxml-backed parser.
soup = BeautifulSoup(html, "lxml")

# CSS selectors (recommended — more readable)
# Matches <a> elements nested in h2.title elements under div.article-list.
for tag in soup.select("div.article-list h2.title a"):
    # tag["href"] raises KeyError when the anchor has no href attribute.
    print(tag.get_text(strip=True), tag["href"])

# XPath via lxml (better for complex paths)
from lxml import etree
tree = etree.HTML(html)
# Collects the href values of every <a> below div.article-list.
# NOTE: this is not the same query as the CSS one above — it does not require
# the h2.title ancestor, and @class="article-list" is an exact string match
# on the whole class attribute rather than a class-token match.
links = tree.xpath('//div[@class="article-list"]//a/@href')

Async Scraping: asyncio + httpx.AsyncClient

import asyncio
import httpx
from bs4 import BeautifulSoup

async def fetch_page(client: httpx.AsyncClient, url: str, sem: asyncio.Semaphore) -> dict:
    """Download one page and extract the text of its first ``<h1>``.

    The semaphore is held for the whole fetch-and-parse step, so it caps how
    many pages are processed at once.

    Args:
        client: Shared async client used to issue the GET request.
        url: Address of the page to fetch.
        sem: Semaphore limiting concurrent fetches.

    Returns:
        ``{"url", "title"}`` on success (``title`` is ``""`` when the page has
        no ``<h1>``); on an ``httpx.HTTPError`` the dict additionally carries
        an ``"error"`` key with the exception text.
    """
    async with sem:
        try:
            response = await client.get(url, timeout=15)
            response.raise_for_status()
            heading = BeautifulSoup(response.text, "lxml").select_one("h1")
            text = ""
            if heading:
                text = heading.get_text(strip=True)
            return {"url": url, "title": text}
        except httpx.HTTPError as exc:
            # Report the failure in-band so one bad URL does not abort the batch.
            return {"url": url, "title": "", "error": str(exc)}

async def scrape_many(urls: list[str], concurrency: int = 10) -> list[dict]:
    """Fetch every URL concurrently through one shared async client.

    Args:
        urls: Pages to fetch.
        concurrency: Maximum number of fetches allowed to run at once.

    Returns:
        One result dict per URL (as produced by ``fetch_page``), in the same
        order as ``urls``.
    """
    limiter = asyncio.Semaphore(concurrency)
    client_options = {
        "headers": {"User-Agent": "Mozilla/5.0 (compatible; AsyncBot/1.0)"},
        "follow_redirects": True,
    }
    async with httpx.AsyncClient(**client_options) as client:
        pending = (fetch_page(client, url, limiter) for url in urls)
        # gather() preserves argument order in its result list.
        pages = await asyncio.gather(*pending)
    return list(pages)

if __name__ == "__main__":
    # Demo run: fetch pages 1..100 with at most 10 requests in flight.
    urls = [f"https://example.com/page/{page_no}" for page_no in range(1, 101)]
    results = asyncio.run(scrape_many(urls, concurrency=10))
    # A result counts as a success only when a non-empty title was extracted.
    titled = [r for r in results if r.get("title")]
    print(f"Success: {len(titled)}/{len(urls)}")

Playwright for Dynamic Pages

pip install playwright
playwright install chromium

from playwright.sync_api import sync_playwright

# Start Playwright for the duration of the `with` block.
with sync_playwright() as p:
    # headless=True runs Chromium without a visible window.
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/spa-page")

    # Wait for a specific element (most reliable strategy)
    # The timeout is given in milliseconds (10000 ms = 10 s).
    page.wait_for_selector("div.data-table", timeout=10000)
    # Alternative: wait for network to go idle (good for SPAs)
    # page.wait_for_load_state("networkidle")

    # Capture the page's HTML after the awaited element has appeared.
    html = page.content()
    browser.close()

from playwright.sync_api import sync_playwright

def login_and_scrape(
    username: str,
    password: str,
    target_url: str,
    login_url: str = "https://example.com/login",
    state_path: str = "auth_state.json",
) -> str:
    """Log in through the site's form, then return the HTML of ``target_url``.

    Args:
        username: Value typed into the ``#username`` field.
        password: Value typed into the ``#password`` field.
        target_url: Page to open and capture once logged in.
        login_url: Address of the login form.
        state_path: File the browser context's storage state is written to so
            a later run can reuse the session. The file holds session
            credentials (cookies), so treat it as a secret.

    Returns:
        The HTML of ``target_url`` after ``div.data-container`` has appeared.

    Raises:
        Playwright errors (e.g. timeouts) propagate to the caller; the
        browser is closed first in every case.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context()
            page = context.new_page()

            page.goto(login_url)
            page.fill("#username", username)
            page.fill("#password", password)
            page.click("button[type='submit']")
            # The redirect to the dashboard is the signal that login succeeded.
            page.wait_for_url("**/dashboard**")

            # Persist login state for reuse
            context.storage_state(path=state_path)

            page.goto(target_url)
            page.wait_for_selector("div.data-container")
            return page.content()
        finally:
            # Close the browser even when a step above raises.
            browser.close()

Anti-Scraping Countermeasures

import random, time, httpx

# Pool of User-Agent strings; fetch_with_retry picks one at random for each
# attempt so successive requests do not all present the same client.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15",
]

def fetch_with_retry(url: str, max_retries: int = 3) -> str:
    """Fetch ``url`` politely, retrying with exponential backoff on HTTP 429.

    Each attempt waits a random 1-3 seconds first and sends a randomly chosen
    User-Agent from ``USER_AGENTS``.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The response body text of the first successful attempt.

    Raises:
        httpx.HTTPStatusError: For any error status other than 429 (these are
            not retried).
        RuntimeError: When every attempt was rate limited; the last 429 error
            is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        # Random pause so requests do not arrive at a fixed cadence.
        time.sleep(random.uniform(1.0, 3.0))
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            with httpx.Client(headers=headers, timeout=15) as client:
                resp = client.get(url)
                resp.raise_for_status()
                return resp.text
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429:
                raise
            last_error = e
            if attempt == max_retries - 1:
                # No attempt follows, so backing off would only delay the failure.
                break
            wait = (2 ** attempt) * 5  # 5s, 10s, 20s, ...
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
    raise RuntimeError(f"Failed after {max_retries} retries: {url}") from last_error

Data Storage

import sqlite3
from datetime import datetime

def init_db(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` and ensure ``products`` exists.

    Args:
        db_path: Database location passed to ``sqlite3.connect``.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT, price REAL,
            url TEXT UNIQUE,
            scraped_at TEXT
        )
    """
    connection = sqlite3.connect(db_path)
    # IF NOT EXISTS makes repeated initialisation harmless.
    connection.execute(schema)
    connection.commit()
    return connection

def upsert_product(conn, name: str, price: float, url: str):
    """Insert a product row, or refresh the existing row with the same URL.

    ``url`` is the conflict key. When a row for it already exists, its name,
    price and scrape timestamp are all overwritten with the new values, so a
    renamed product does not keep its stale name.

    Args:
        conn: Open SQLite connection containing the ``products`` table.
        name: Product display name.
        price: Scraped price.
        url: Product page URL (unique per row).
    """
    conn.execute(
        """INSERT INTO products (name, price, url, scraped_at) VALUES (?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
               name=excluded.name,
               price=excluded.price,
               scraped_at=excluded.scraped_at""",
        (name, price, url, datetime.now().isoformat()),
    )
    conn.commit()

Project: Competitor Price Monitor

import httpx, sqlite3, time, random
from bs4 import BeautifulSoup
from datetime import datetime

# Product pages to monitor; each entry pairs a display name with the page URL.
TARGETS = [
    {"name": "Competitor A Headphones", "url": "https://competitor-a.com/product/123"},
    {"name": "Competitor B Headphones", "url": "https://competitor-b.com/product/456"},
]
# SQLite file that stores the price history.
DB_PATH = "prices.db"
ALERT_THRESHOLD = 0.05  # Relative change (0.05 = 5%) at or above which an alert is printed

def init_db():
    """Open the database at ``DB_PATH`` and ensure ``price_history`` exists.

    Returns:
        The open ``sqlite3`` connection; the caller closes it.
    """
    ddl = """CREATE TABLE IF NOT EXISTS price_history (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT, url TEXT, price REAL, scraped_at TEXT
    )"""
    db = sqlite3.connect(DB_PATH)
    # IF NOT EXISTS makes repeated initialisation harmless.
    db.execute(ddl)
    db.commit()
    return db

def get_last_price(conn, url):
    """Return the most recently recorded price for ``url``.

    Rows are ordered by ``scraped_at`` and then by ``id``, so when two rows
    share the same timestamp the one inserted last wins instead of the result
    being left to the database's unspecified tie order.

    Args:
        conn: Open SQLite connection containing the ``price_history`` table.
        url: Product page URL to look up.

    Returns:
        The latest stored price, or ``None`` when ``url`` has no history yet.
    """
    row = conn.execute(
        "SELECT price FROM price_history WHERE url=? "
        "ORDER BY scraped_at DESC, id DESC LIMIT 1",
        (url,),
    ).fetchone()
    return row[0] if row else None

def scrape_price(url: str) -> float | None:
    """Fetch a product page and extract its price, best-effort.

    The first element matching ``span.price, [data-price]`` is used. Its
    visible text is parsed first; if that is empty or not a number, the
    element's ``data-price`` attribute is tried, so elements that carry the
    price only in the attribute are handled too.

    Args:
        url: Product page to fetch.

    Returns:
        The parsed price, or ``None`` when the request fails, no price element
        is found, or no value can be parsed. A reason is printed in each
        failure case; no exception escapes to the caller.
    """
    try:
        # Random pause so consecutive targets are not hit back-to-back.
        time.sleep(random.uniform(1, 3))
        with httpx.Client(headers={"User-Agent": "Mozilla/5.0"}, timeout=15) as client:
            resp = client.get(url)
            resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        tag = soup.select_one("span.price, [data-price]")
        if tag is None:
            print(f"Scrape failed {url}: no price element found")
            return None
        for raw in (tag.get_text(strip=True), tag.get("data-price")):
            if not raw:
                continue
            try:
                return float(raw.replace("$", "").replace(",", ""))
            except ValueError:
                continue
        print(f"Scrape failed {url}: could not parse a price from the matched element")
    except Exception as e:
        # Deliberately broad: one failing target must not stop the monitor run.
        print(f"Scrape failed {url}: {e}")
    return None

def main():
    """Scrape every target once, record its price, and report changes.

    For each entry in ``TARGETS`` the current price is scraped and appended to
    ``price_history``. The new price is compared with the previously stored
    one: a relative change of at least ``ALERT_THRESHOLD`` prints an
    ``[ALERT]`` line, anything else prints ``[OK]``. Targets whose scrape
    failed are skipped. The database connection is closed even if an error
    interrupts the run.
    """
    conn = init_db()
    try:
        for t in TARGETS:
            price = scrape_price(t["url"])
            if price is None:
                continue
            # Read the previous price before inserting the new row.
            last = get_last_price(conn, t["url"])
            conn.execute(
                "INSERT INTO price_history (name,url,price,scraped_at) VALUES (?,?,?,?)",
                (t["name"], t["url"], price, datetime.now().isoformat())
            )
            conn.commit()
            # A missing or zero previous price cannot yield a relative change
            # (and would divide by zero), so it is reported as OK.
            change = (price - last) / last if last else 0.0
            if last and abs(change) >= ALERT_THRESHOLD:
                pct = change * 100
                print(f"[ALERT] {t['name']}: ${last:.2f} -> ${price:.2f} ({pct:+.1f}%)")
            else:
                print(f"[OK] {t['name']}: ${price:.2f}")
    finally:
        conn.close()

# Run the price monitor only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

Previous

Next
Chapter 10: Email Automation

Rate this chapter

4.7 / 5 (34 ratings)