Chapter 9

Web Scraping — httpx + BeautifulSoup + Playwright

Chapter 9: Web Scraping — httpx + BeautifulSoup + Playwright

The web is full of data, but copying it by hand is the worst possible approach. This chapter covers tool selection, making efficient HTTP requests with httpx, parsing HTML with BeautifulSoup, handling JavaScript-rendered pages with Playwright, anti-scraping countermeasures, and persistent storage — all culminating in a competitor price monitoring system.

Tool Selection

HTTP Clients: requests vs httpx

Aspect requests httpx
Async support No (sync only) Full async/await
HTTP/2 No Supported (opt-in: pip install "httpx[http2]", then pass http2=True)
API style Very simple, beginner-friendly Nearly identical to requests
Best for Single pages, small batches High-concurrency, large-scale

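Recommendation: Use httpx for all new projects. Its API is nearly identical to requests, so the migration cost is low, but the ceiling is far higher. Two differences to remember when migrating: httpx does not follow redirects unless you pass follow_redirects=True, and it applies a 5-second default timeout where requests applies none.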

HTML Parsing

Library Strengths Weaknesses Best for
BeautifulSoup4 Most docs, easiest to learn Slowest Learning, small projects
lxml Fast, XPath support C dependency install High-volume parsing
parsel CSS + XPath, same as Scrapy Smaller community Flexible needs

Dynamic Pages: Playwright vs Selenium

Playwright outperforms Selenium on every dimension: faster execution, built-in smart waits, better async support, cleaner API. For any new project, choose Playwright.

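Decision Tree

- The data is already present in the raw HTML response → httpx + BeautifulSoup.
- Same as above, but across many pages → httpx.AsyncClient with a concurrency limit.
- The content is rendered by JavaScript, or sits behind an interactive login → Playwright.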

httpx + BeautifulSoup Basics

pip install httpx beautifulsoup4 lxml

Requests with Headers and Session Reuse

import httpx

# Browser-like request headers applied to every request made through the client below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# A single Client shares headers, timeout and redirect policy across requests;
# the `with` block closes it on exit.
with httpx.Client(headers=HEADERS, timeout=15, follow_redirects=True) as client:
    resp = client.get("https://example.com/news")
    # Fail loudly on an error status instead of parsing an error page.
    resp.raise_for_status()
    html = resp.text

CSS Selectors vs XPath

from bs4 import BeautifulSoup

# Parse the page with the lxml-backed parser.
# NOTE(review): `html` is presumably the response text fetched in the previous snippet.
soup = BeautifulSoup(html, "lxml")

# CSS selectors (recommended — more readable)
for tag in soup.select("div.article-list h2.title a"):
    # tag["href"] raises KeyError for an <a> without href; tag.get("href") tolerates it.
    print(tag.get_text(strip=True), tag["href"])

# XPath via lxml (better for complex paths)
from lxml import etree
tree = etree.HTML(html)
# @class="article-list" matches only when the class attribute is exactly that
# string, whereas the CSS form above also matches elements carrying extra classes.
links = tree.xpath('//div[@class="article-list"]//a/@href')

Async Scraping: asyncio + httpx.AsyncClient

import asyncio
import httpx
from bs4 import BeautifulSoup

async def fetch_page(client: httpx.AsyncClient, url: str, sem: asyncio.Semaphore) -> dict:
    """Download one page and extract the text of its first ``<h1>``.

    The request is made while holding *sem*, so the semaphore's size caps how
    many fetches run at once.

    Returns:
        ``{"url", "title"}`` on success (``title`` is ``""`` when the page has
        no ``<h1>``), or ``{"url", "title": "", "error"}`` when httpx reports
        a transport or HTTP status error.
    """
    async with sem:
        try:
            response = await client.get(url, timeout=15)
            response.raise_for_status()
            heading = BeautifulSoup(response.text, "lxml").select_one("h1")
            if not heading:
                heading_text = ""
            else:
                heading_text = heading.get_text(strip=True)
            return {"url": url, "title": heading_text}
        except httpx.HTTPError as exc:
            # Report the failure in-band so one bad URL does not abort a batch.
            return {"url": url, "title": "", "error": str(exc)}

async def scrape_many(urls: list[str], concurrency: int = 10) -> list[dict]:
    """Fetch every URL concurrently and return the results in input order.

    Args:
        urls: Pages to fetch.
        concurrency: Maximum number of requests allowed in flight at once.

    Returns:
        One result dict per URL, as produced by ``fetch_page``.
    """
    limiter = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient(
        headers={"User-Agent": "Mozilla/5.0 (compatible; AsyncBot/1.0)"},
        follow_redirects=True,
    ) as client:
        pending = []
        for url in urls:
            pending.append(fetch_page(client, url, limiter))
        # gather() preserves the order of its arguments in its result.
        pages = await asyncio.gather(*pending)
    return list(pages)

if __name__ == "__main__":
    # Demo run: 100 numbered pages, with the concurrency limit set to 10.
    urls = [f"https://example.com/page/{i}" for i in range(1, 101)]
    results = asyncio.run(scrape_many(urls, concurrency=10))
    # A result counts as a success only when a non-empty title was extracted.
    print(f"Success: {sum(1 for r in results if r.get('title'))}/{len(urls)}")

Playwright for Dynamic Pages

pip install playwright
playwright install chromium

from playwright.sync_api import sync_playwright

# Render a JavaScript-driven page in headless Chromium and capture the resulting HTML.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/spa-page")

    # Wait for a specific element (most reliable strategy)
    # The timeout is given in milliseconds (10 s here).
    page.wait_for_selector("div.data-table", timeout=10000)
    # Alternative: wait for network to go idle (good for SPAs)
    # page.wait_for_load_state("networkidle")

    # Serialized DOM after scripts have run, not the original response body.
    html = page.content()
    browser.close()

Login and Session Persistence

from playwright.sync_api import sync_playwright

def login_and_scrape(username: str, password: str, target_url: str) -> str:
    """Log in through the site's form, save the session, and return a page's HTML.

    Args:
        username: Value typed into the ``#username`` field.
        password: Value typed into the ``#password`` field.
        target_url: Page to open once the login has completed.

    Returns:
        The rendered HTML of *target_url*.

    Side effects:
        Writes the browser context's storage state to ``auth_state.json``
        in the current working directory.
    """
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        session = chromium.new_context()
        tab = session.new_page()

        # Submit the login form and block until the dashboard URL is reached.
        tab.goto("https://example.com/login")
        for selector, value in (("#username", username), ("#password", password)):
            tab.fill(selector, value)
        tab.click("button[type='submit']")
        tab.wait_for_url("**/dashboard**")

        # Save the logged-in state to disk so a later run can reuse it.
        session.storage_state(path="auth_state.json")

        tab.goto(target_url)
        tab.wait_for_selector("div.data-container")
        rendered = tab.content()
        chromium.close()
        return rendered

Anti-Scraping Countermeasures

import random, time, httpx

# Pool of User-Agent strings; fetch_with_retry picks one at random for each attempt.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15",
]

def fetch_with_retry(url: str, max_retries: int = 3) -> str:
    """Fetch *url* and return the response body, backing off when rate limited.

    Each attempt is preceded by a random 1-3 s pause and uses a randomly
    chosen User-Agent. A 429 response triggers exponential backoff
    (5 s, 10 s, 20 s, ...) before the next attempt.

    Args:
        url: Address to fetch.
        max_retries: Total number of attempts to make.

    Returns:
        The decoded response text of the first successful attempt.

    Raises:
        httpx.HTTPStatusError: For any error status other than 429.
        RuntimeError: When every attempt was answered with 429; the last
            HTTP error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        # Jitter before every attempt so requests are not evenly spaced.
        time.sleep(random.uniform(1.0, 3.0))
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            with httpx.Client(headers=headers, timeout=15) as client:
                resp = client.get(url)
                resp.raise_for_status()
                return resp.text
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429:
                raise
            last_error = e
            # Back off only when another attempt follows; sleeping after the
            # final attempt would just delay the failure.
            if attempt + 1 < max_retries:
                wait = (2 ** attempt) * 5
                print(f"Rate limited. Waiting {wait}s...")
                time.sleep(wait)
    raise RuntimeError(f"Failed after {max_retries} retries: {url}") from last_error

Data Storage

import sqlite3
from datetime import datetime

def init_db(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path* and make sure ``products`` exists.

    Args:
        db_path: Database file path (or ``":memory:"``).

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT, price REAL,
            url TEXT UNIQUE,
            scraped_at TEXT
        )
    """
    db = sqlite3.connect(db_path)
    # The connection's context manager commits on success; it does not close.
    with db:
        db.execute(schema)
    return db

def upsert_product(conn, name: str, price: float, url: str):
    """Insert a product row, or refresh the existing row that has the same URL.

    The ``url`` column is the conflict key. When a row for *url* already
    exists, its name, price and timestamp are all overwritten so the stored
    record always reflects the latest scrape.

    Args:
        conn: Open SQLite connection whose ``products`` table has a UNIQUE
            ``url`` column.
        name: Product display name.
        price: Current price.
        url: Product page URL identifying the row.
    """
    conn.execute(
        """INSERT INTO products (name, price, url, scraped_at) VALUES (?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
               name=excluded.name,
               price=excluded.price,
               scraped_at=excluded.scraped_at""",
        (name, price, url, datetime.now().isoformat()),
    )
    conn.commit()

Project: Competitor Price Monitor

import httpx, sqlite3, time, random
from bs4 import BeautifulSoup
from datetime import datetime

# Product pages to monitor; "url" is also the key used to look up price history.
TARGETS = [
    {"name": "Competitor A Headphones", "url": "https://competitor-a.com/product/123"},
    {"name": "Competitor B Headphones", "url": "https://competitor-b.com/product/456"},
]
DB_PATH = "prices.db"  # SQLite file opened by init_db()
ALERT_THRESHOLD = 0.05  # Alert on 5%+ price change

def init_db():
    """Open the database at ``DB_PATH`` and make sure ``price_history`` exists.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for closing it.
    """
    create_sql = """CREATE TABLE IF NOT EXISTS price_history (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT, url TEXT, price REAL, scraped_at TEXT
    )"""
    db = sqlite3.connect(DB_PATH)
    # The connection's context manager commits on success; it does not close.
    with db:
        db.execute(create_sql)
    return db

def get_last_price(conn, url):
    row = conn.execute(
        "SELECT price FROM price_history WHERE url=? ORDER BY scraped_at DESC LIMIT 1", (url,)
    ).fetchone()
    return row[0] if row else None

def scrape_price(url: str) -> float | None:
    """Fetch a product page and parse its displayed price (best effort).

    Pauses a random 1-3 s before the request, then reads the text of the
    first element matching ``span.price, [data-price]`` and strips ``$`` and
    thousands separators before converting it to a float.

    Returns:
        The parsed price, or ``None`` when no matching element is found or
        any step fails (the failure is printed, never raised).
    """
    try:
        time.sleep(random.uniform(1, 3))
        session = httpx.Client(headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
        with session:
            response = session.get(url)
            response.raise_for_status()
        price_node = BeautifulSoup(response.text, "lxml").select_one("span.price, [data-price]")
        if not price_node:
            return None
        # NOTE(review): only the element text is read; a price stored solely in
        # the data-price attribute would not be picked up — confirm intent.
        raw = price_node.get_text(strip=True)
        for symbol in ("$", ","):
            raw = raw.replace(symbol, "")
        return float(raw)
    except Exception as exc:
        # Deliberate catch-all: one broken target must not stop the monitor.
        print(f"Scrape failed {url}: {exc}")
        return None

def main():
    """Scrape every target once, record its price, and report large changes.

    For each entry in ``TARGETS`` the current price is scraped, compared with
    the last stored price for the same URL, and appended to ``price_history``.
    A relative change of at least ``ALERT_THRESHOLD`` prints an ``[ALERT]``
    line; otherwise an ``[OK]`` line is printed. Targets whose scrape fails
    are skipped.
    """
    conn = init_db()
    try:
        for t in TARGETS:
            price = scrape_price(t["url"])
            if price is None:
                continue
            # Read the previous price before inserting the new observation.
            last = get_last_price(conn, t["url"])
            conn.execute(
                "INSERT INTO price_history (name,url,price,scraped_at) VALUES (?,?,?,?)",
                (t["name"], t["url"], price, datetime.now().isoformat())
            )
            conn.commit()
            # `last` is falsy for both "no history" and a stored price of 0,
            # which also guards the division below.
            if last and abs(price - last) / last >= ALERT_THRESHOLD:
                pct = (price - last) / last * 100
                print(f"[ALERT] {t['name']}: ${last:.2f} -> ${price:.2f} ({pct:+.1f}%)")
            else:
                print(f"[OK] {t['name']}: ${price:.2f}")
    finally:
        # Release the database handle even if a query above raises.
        conn.close()

# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Previous

Next
Chapter 10: Email Automation
Rate this chapter
4.7  / 5  (34 ratings)

💬 Comments