Chapter 9
Web Scraping — httpx + BeautifulSoup + Playwright
Chapter 9: Web Scraping — httpx + BeautifulSoup + Playwright
The web is full of data, but copying it by hand is the worst possible approach. This chapter covers tool selection, making efficient HTTP requests with httpx, parsing HTML with BeautifulSoup, handling JavaScript-rendered pages with Playwright, anti-scraping countermeasures, and persistent storage — all culminating in a competitor price monitoring system.
Tool Selection
HTTP Clients: requests vs httpx
| Aspect | requests | httpx |
|---|---|---|
| Async support | No (sync only) | Full async/await |
| HTTP/2 | No | Supported (optional: install `httpx[http2]` and pass `http2=True`) |
| API style | Very simple, beginner-friendly | Nearly identical to requests |
| Best for | Single pages, small batches | High-concurrency, large-scale |
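Recommendation: Use httpx for all new projects. Its API is nearly identical to requests, so migration cost is minimal, and the ceiling is far higher. Two differences to remember when migrating: httpx does not follow redirects unless you pass follow_redirects=True, and it applies a default 5-second timeout to every request.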
HTML Parsing
| Library | Strengths | Weaknesses | Best for |
|---|---|---|---|
| BeautifulSoup4 | Most docs, easiest to learn | Slowest | Learning, small projects |
| lxml | Fast, XPath support | C dependency install | High-volume parsing |
| parsel | CSS + XPath, same as Scrapy | Smaller community | Flexible needs |
Dynamic Pages: Playwright vs Selenium
Playwright outperforms Selenium on every dimension: faster execution, built-in smart waits, better async support, cleaner API. For any new project, choose Playwright.
Decision Tree
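- Static HTML, small batch → httpx (or requests) + BeautifulSoup
- Static HTML, large concurrent batch → httpx.AsyncClient + asyncio, parsing with lxml
- Requires JavaScript rendering / login / click interactions → Playwright
- Large-scale production scraper → a dedicated crawling framework such as Scrapy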
httpx + BeautifulSoup Basics
pip install httpx beautifulsoup4 lxml
Requests with Headers and Session Reuse
import httpx
# Browser-like default headers attached to every request sent through the client.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# A single Client acts as the reusable session: headers, the 15-second timeout
# and redirect following are configured once and apply to every call made on it.
with httpx.Client(
    headers=HEADERS,
    timeout=15,
    follow_redirects=True,
) as session:
    resp = session.get("https://example.com/news")
    # Fail loudly on an error status instead of parsing an error page.
    resp.raise_for_status()
    html = resp.text
CSS Selectors vs XPath
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")

# CSS selectors (recommended — more readable): print the text and link target
# of every headline anchor inside the article list.
for anchor in soup.select("div.article-list h2.title a"):
    headline = anchor.get_text(strip=True)
    target = anchor["href"]
    print(headline, target)

# XPath via lxml (better for complex paths): collect the href of every link
# found anywhere below the article-list container.
from lxml import etree

tree = etree.HTML(html)
links = tree.xpath('//div[@class="article-list"]//a/@href')
Async Scraping: asyncio + httpx.AsyncClient
import asyncio
import httpx
from bs4 import BeautifulSoup
async def fetch_page(client: httpx.AsyncClient, url: str, sem: asyncio.Semaphore) -> dict:
    """Download one page and extract the text of its first ``<h1>``.

    The semaphore is held for the whole request so that callers can cap the
    number of pages being fetched at the same time.

    Args:
        client: Shared async HTTP client used to issue the request.
        url: Page to download.
        sem: Concurrency limiter shared by all fetches.

    Returns:
        ``{"url", "title"}`` on success (``title`` is ``""`` when the page has
        no ``<h1>``); on an ``httpx.HTTPError`` the dict has an empty title and
        an extra ``"error"`` key holding the exception message.
    """
    async with sem:
        try:
            response = await client.get(url, timeout=15)
            response.raise_for_status()
            heading = BeautifulSoup(response.text, "lxml").select_one("h1")
            if heading:
                heading_text = heading.get_text(strip=True)
            else:
                heading_text = ""
            return {"url": url, "title": heading_text}
        except httpx.HTTPError as exc:
            # Report the failure in-band so one bad URL does not abort the batch.
            return {"url": url, "title": "", "error": str(exc)}
async def scrape_many(urls: list[str], concurrency: int = 10) -> list[dict]:
    """Fetch every URL concurrently through one shared client.

    Args:
        urls: Pages to download.
        concurrency: Maximum number of fetches allowed to run at once.

    Returns:
        One result dict per URL, as produced by ``fetch_page``.
    """
    limiter = asyncio.Semaphore(concurrency)
    client_options = {
        "headers": {"User-Agent": "Mozilla/5.0 (compatible; AsyncBot/1.0)"},
        "follow_redirects": True,
    }
    async with httpx.AsyncClient(**client_options) as client:
        # All coroutines are scheduled together; the semaphore does the throttling.
        pages = await asyncio.gather(
            *(fetch_page(client, url, limiter) for url in urls)
        )
    return list(pages)
if __name__ == "__main__":
    # Demo run: pages 1..100, with at most 10 requests in flight.
    urls = [f"https://example.com/page/{page_no}" for page_no in range(1, 101)]
    results = asyncio.run(scrape_many(urls, concurrency=10))
    # A page counts as a success when a non-empty title was extracted.
    succeeded = len([result for result in results if result.get("title")])
    print(f"Success: {succeeded}/{len(urls)}")
Playwright for Dynamic Pages
pip install playwright
playwright install chromium
from playwright.sync_api import sync_playwright
with sync_playwright() as playwright:
    # Headless Chromium: no visible window is opened.
    browser = playwright.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/spa-page")

    # Wait for a specific element (most reliable strategy). Playwright timeouts
    # are given in milliseconds, so this allows up to 10 seconds.
    page.wait_for_selector("div.data-table", timeout=10_000)

    # Alternative: wait for network to go idle (good for SPAs)
    # page.wait_for_load_state("networkidle")

    # Snapshot the DOM after JavaScript has rendered it.
    html = page.content()
    browser.close()
Login and Session Persistence
from playwright.sync_api import sync_playwright
def login_and_scrape(username: str, password: str, target_url: str) -> str:
    """Log in through the site's form, then return the rendered target page.

    The browser context's storage state is written to ``auth_state.json``
    after a successful login so that it can be reused later.

    Args:
        username: Value typed into the ``#username`` field.
        password: Value typed into the ``#password`` field.
        target_url: Page to open once logged in.

    Returns:
        The HTML of ``target_url`` after ``div.data-container`` has appeared.
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()

        # Submit the login form and block until the dashboard URL is reached.
        page.goto("https://example.com/login")
        for selector, value in (("#username", username), ("#password", password)):
            page.fill(selector, value)
        page.click("button[type='submit']")
        page.wait_for_url("**/dashboard**")

        # Persist login state for reuse
        context.storage_state(path="auth_state.json")

        # Same context, so the page is requested with the logged-in session.
        page.goto(target_url)
        page.wait_for_selector("div.data-container")
        rendered = page.content()
        browser.close()
    return rendered
Anti-Scraping Countermeasures
import random, time, httpx
# Pool of browser User-Agent strings; fetch_with_retry picks one at random
# for every attempt so consecutive requests do not share one fingerprint.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15",
]
def fetch_with_retry(url: str, max_retries: int = 3) -> str:
    """Fetch ``url`` politely, backing off and retrying when rate limited.

    Every attempt first sleeps a random 1-3 seconds and sends a User-Agent
    chosen at random from ``USER_AGENTS``. An HTTP 429 response triggers an
    exponential backoff (5 s, 10 s, 20 s, ...) before the next attempt.

    Args:
        url: Address to download.
        max_retries: Maximum number of attempts.

    Returns:
        The response body decoded as text.

    Raises:
        httpx.HTTPStatusError: For any error status other than 429; these
            are not retried.
        RuntimeError: When every attempt was rate limited. The last 429
            error is attached as ``__cause__``.

    Other httpx errors (timeouts, connection failures) are not caught here
    and propagate to the caller unchanged.
    """
    last_error = None
    for attempt in range(max_retries):
        # Random jitter so requests do not arrive on a fixed cadence.
        time.sleep(random.uniform(1.0, 3.0))
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            with httpx.Client(headers=headers, timeout=15) as client:
                resp = client.get(url)
                resp.raise_for_status()
                return resp.text
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429:
                raise
            last_error = e
        # Back off only when another attempt will follow; sleeping after the
        # final attempt would just delay the failure.
        if attempt + 1 < max_retries:
            wait = (2 ** attempt) * 5
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
    raise RuntimeError(f"Failed after {max_retries} retries: {url}") from last_error
Data Storage
import sqlite3
from datetime import datetime
def init_db(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` and ensure ``products`` exists.

    The table has one row per product URL (``url`` is UNIQUE) with columns
    ``id``, ``name``, ``price``, ``url`` and ``scraped_at``.

    Args:
        db_path: Location of the database file.

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    schema = (
        "\n"
        "CREATE TABLE IF NOT EXISTS products (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "name TEXT, price REAL,\n"
        "url TEXT UNIQUE,\n"
        "scraped_at TEXT\n"
        ")\n"
    )
    connection = sqlite3.connect(db_path)
    connection.execute(schema)
    connection.commit()
    return connection
def upsert_product(conn, name: str, price: float, url: str):
    """Insert a product row, or refresh an existing one identified by ``url``.

    When a row with the same ``url`` already exists, only its ``price`` and
    ``scraped_at`` are overwritten; the stored ``name`` is left as it was.
    The change is committed before returning.

    Args:
        conn: Open SQLite connection whose database has the ``products`` table.
        name: Product name, stored on first insert.
        price: Current price.
        url: Product page URL; acts as the conflict key.
    """
    statement = (
        "INSERT INTO products (name, price, url, scraped_at) VALUES (?,?,?,?)\n"
        "ON CONFLICT(url) DO UPDATE SET price=excluded.price, scraped_at=excluded.scraped_at"
    )
    # Local wall-clock time in ISO 8601 form.
    scraped_at = datetime.now().isoformat()
    conn.execute(statement, (name, price, url, scraped_at))
    conn.commit()
Project: Competitor Price Monitor
import httpx, sqlite3, time, random
from bs4 import BeautifulSoup
from datetime import datetime
# Product pages to monitor. "name" is the label printed in the report and
# stored with each price row; "url" is the page that gets scraped.
TARGETS = [
    {"name": "Competitor A Headphones", "url": "https://competitor-a.com/product/123"},
    {"name": "Competitor B Headphones", "url": "https://competitor-b.com/product/456"},
]
# SQLite file that holds the price history.
DB_PATH = "prices.db"
# Relative change (fraction of the previous price) at or above which main()
# prints an alert.
ALERT_THRESHOLD = 0.05  # Alert on 5%+ price change
def init_db():
    """Open the database at ``DB_PATH`` and ensure ``price_history`` exists.

    Each row of ``price_history`` records one observation: ``name``, ``url``,
    ``price`` and ``scraped_at``, plus an auto-incrementing ``id``.

    Returns:
        An open ``sqlite3`` connection; the caller closes it.
    """
    ddl = (
        "CREATE TABLE IF NOT EXISTS price_history (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "name TEXT, url TEXT, price REAL, scraped_at TEXT\n"
        ")"
    )
    db = sqlite3.connect(DB_PATH)
    db.execute(ddl)
    db.commit()
    return db
def get_last_price(conn, url):
    """Return the most recently recorded price for ``url``.

    "Most recent" means the row with the greatest ``scraped_at`` value.

    Args:
        conn: Open SQLite connection whose database has ``price_history``.
        url: Product page URL to look up.

    Returns:
        The stored price, or ``None`` when the URL has no history yet.
    """
    query = "SELECT price FROM price_history WHERE url=? ORDER BY scraped_at DESC LIMIT 1"
    latest = conn.execute(query, (url,)).fetchone()
    if latest is None:
        return None
    return latest[0]
def _parse_price(raw) -> float | None:
    """Convert text such as ``"$1,299.00"`` to a float, or ``None`` if not numeric."""
    if not raw:
        return None
    try:
        return float(str(raw).replace("$", "").replace(",", ""))
    except ValueError:
        return None


def scrape_price(url: str) -> float | None:
    """Download a product page and extract its price.

    The first element matching ``span.price, [data-price]`` is used. Its
    visible text is tried first; if that is not a number, the element's
    ``data-price`` attribute is tried, since the selector also matches
    elements that carry the price only in that attribute.

    Args:
        url: Product page to scrape.

    Returns:
        The price as a float, or ``None`` when the page could not be fetched,
        no matching element exists, or no numeric price could be parsed.
        Failures are printed rather than raised, so one bad target does not
        stop a monitoring run.
    """
    try:
        # Random pause so repeated runs do not hit the site on a fixed cadence.
        time.sleep(random.uniform(1, 3))
        with httpx.Client(headers={"User-Agent": "Mozilla/5.0"}, timeout=15) as client:
            resp = client.get(url)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "lxml")
            tag = soup.select_one("span.price, [data-price]")
            if tag is None:
                return None
            # Text first keeps the result identical wherever text parsing
            # already worked; the attribute is only a fallback.
            candidates = [tag.get_text(strip=True), tag.get("data-price")]
            for raw in candidates:
                price = _parse_price(raw)
                if price is not None:
                    return price
            print(f"Scrape failed {url}: no numeric price in {candidates!r}")
    except Exception as e:
        # Deliberate best-effort boundary: report and move on to the next target.
        print(f"Scrape failed {url}: {e}")
    return None
def main():
    """Scrape every target once, record its price, and report notable changes.

    For each entry in ``TARGETS`` the current price is scraped; targets whose
    scrape failed are skipped. The new price is appended to ``price_history``
    and compared with the previous one: a relative change of at least
    ``ALERT_THRESHOLD`` prints an ``[ALERT]`` line, anything else an ``[OK]``
    line. The database connection is closed even if a step raises.
    """
    conn = init_db()
    try:
        for t in TARGETS:
            price = scrape_price(t["url"])
            if price is None:
                continue
            # Read the previous price BEFORE inserting the new row; otherwise
            # the lookup would return the value just written.
            last = get_last_price(conn, t["url"])
            conn.execute(
                "INSERT INTO price_history (name,url,price,scraped_at) VALUES (?,?,?,?)",
                (t["name"], t["url"], price, datetime.now().isoformat()),
            )
            conn.commit()
            # A falsy `last` covers both "no history yet" (None) and a previous
            # price of 0, which would otherwise divide by zero.
            if last and abs(price - last) / last >= ALERT_THRESHOLD:
                pct = (price - last) / last * 100
                print(f"[ALERT] {t['name']}: ${last:.2f} -> ${price:.2f} ({pct:+.1f}%)")
            else:
                print(f"[OK] {t['name']}: ${price:.2f}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
Previous
Next
Chapter 10: Email Automation