Chapter 47

Single-Machine Production Deployment: Docker Containerization

Chapter 47: Single-Machine Production Deployment — Docker Containerization

Introduction

"Works on my machine" is the oldest joke in software. Docker solves it. Containerizing Hermes Agent delivers environment consistency, clear resource isolation, clean secret management, and repeatable deployments. This chapter provides a complete Dockerfile, docker-compose.yml, health checks, data persistence configuration, and resource limits — everything you need for a production-grade single-machine deployment.

47.1 Complete Dockerfile

Multi-Stage Build (Hermes Agent)

# Dockerfile — Multi-stage build for minimal final image size
#
# Stage 1 installs the Python dependencies into /install with a full build
# toolchain available. Stage 2 copies only those installed packages into a
# slim runtime image, so compilers and headers never reach production.

# ─── Stage 1: Build dependencies ────────────────────────────────────────────
FROM python:3.11-slim AS builder

ARG DEBIAN_FRONTEND=noninteractive
ARG PIP_NO_CACHE_DIR=1

WORKDIR /app

# --no-install-recommends keeps optional packages out of the layer
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential curl git \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# requirements.txt pins only the top-level packages, so pip has to resolve and
# install their transitive dependencies as well. (--no-deps is only correct
# for a fully locked requirements file.)
# NOTE(review): resolution fails if the pins are mutually incompatible —
# verify them with `pip install --dry-run -r requirements.txt`.
RUN pip install --prefix=/install -r requirements.txt

# ─── Stage 2: Runtime image ─────────────────────────────────────────────────
FROM python:3.11-slim AS runtime

# ARG values do not carry over between stages; redeclare what this stage uses.
ARG DEBIAN_FRONTEND=noninteractive
# Declared so the BUILD_ENV build arg passed by docker-compose.yml is accepted.
ARG BUILD_ENV=production

LABEL maintainer="[email protected]"
LABEL description="Hermes Agent MCP Server"
LABEL version="1.0.0"

# Flush stdout/stderr immediately so log lines reach `docker logs` without delay
ENV PYTHONUNBUFFERED=1

# curl is needed by HEALTHCHECK; tini is the init process (see ENTRYPOINT)
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates tini \
    && rm -rf /var/lib/apt/lists/*

# Non-root user (security best practice)
RUN groupadd -r hermes && useradd -r -g hermes -m -d /home/hermes hermes

WORKDIR /app

# Copy installed packages from builder
COPY --from=builder /install /usr/local

# Copy application code
# NOTE(review): this copies the entire build context — make sure .dockerignore
# excludes secrets/ and .env.production so they are not baked into the image.
COPY --chown=hermes:hermes . /app/

# Create required directories (mount points for the logs/data/cache volumes)
RUN mkdir -p /app/logs /app/data /app/cache \
    && chown -R hermes:hermes /app/logs /app/data /app/cache

USER hermes

EXPOSE 8765

HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8765/health || exit 1

# tini as init process: correct signal handling + zombie process reaping
ENTRYPOINT ["/usr/bin/tini", "--"]
CMD ["python", "-m", "hermes_agent.server"]

requirements.txt

mcp==1.0.0
httpx==0.27.0
pydantic==2.5.0
python-dotenv==1.0.0
prometheus-client==0.20.0
tenacity==8.2.3
structlog==24.1.0
fastapi==0.110.0
uvicorn==0.27.0
psutil==5.9.8

Ollama Initialization Script

#!/bin/bash
# scripts/ollama_entrypoint.sh
#
# Starts the Ollama server, waits until it answers, makes sure the configured
# model is present (pulling it if necessary), then stays in the foreground
# for as long as the server runs.
#
# Environment:
#   OLLAMA_MODEL            model to ensure is present (default: nous-hermes3:70b)
#   OLLAMA_STARTUP_TIMEOUT  seconds to wait for the server (default: 120)
set -eu

MODEL="${OLLAMA_MODEL:-nous-hermes3:70b}"
STARTUP_TIMEOUT="${OLLAMA_STARTUP_TIMEOUT:-120}"

ollama serve &
OLLAMA_PID=$!

# Forward termination signals to the server so `docker stop` shuts it down
# cleanly instead of waiting for the kill timeout.
trap 'kill -TERM "$OLLAMA_PID" 2>/dev/null || true' TERM INT

echo "Waiting for Ollama to start..."
waited=0
# Probe with the ollama CLI so the script does not depend on curl being
# installed in the image; `ollama list` only succeeds once the server answers.
until ollama list > /dev/null 2>&1; do
    # Stop waiting if the server process died during startup.
    if ! kill -0 "$OLLAMA_PID" 2>/dev/null; then
        echo "Ollama exited before becoming ready" >&2
        exit 1
    fi
    if [ "$waited" -ge "$STARTUP_TIMEOUT" ]; then
        echo "Ollama did not become ready within ${STARTUP_TIMEOUT}s" >&2
        kill -TERM "$OLLAMA_PID" 2>/dev/null || true
        exit 1
    fi
    sleep 1
    waited=$((waited + 1))
done
echo "Ollama is ready"

# Capture the listing first, then match the model name as a fixed string
# (-F) so characters such as '.' in a tag are not treated as regex syntax.
installed_models="$(ollama list)"
if ! grep -qF -- "$MODEL" <<< "$installed_models"; then
    echo "Pulling model: $MODEL"
    ollama pull "$MODEL"
else
    echo "Model already exists: $MODEL"
fi

# `wait` returns early when a trapped signal arrives; wait a second time so
# the script only exits after the server has actually stopped.
status=0
wait "$OLLAMA_PID" || status=$?
if kill -0 "$OLLAMA_PID" 2>/dev/null; then
    wait "$OLLAMA_PID" || status=$?
fi
exit "$status"

47.2 Complete docker-compose.yml

# docker-compose.yml — Production Hermes Agent + Ollama deployment
#
# Requires Docker Compose v2 (Compose Specification): the file relies on
# `depends_on.condition: service_completed_successfully`. The legacy top-level
# `version:` key is obsolete under Compose v2 and has been dropped.
#
# Host prerequisites: /data/ollama/models and /data/hermes/data must exist
# (bind-backed volumes), and ./secrets/api_key.txt and
# ./secrets/db_password.txt must be present.

services:
  # ─── Ollama Inference Engine ───────────────────────────────────────────────
  ollama:
    # NOTE(review): consider pinning a specific tag instead of `latest` so
    # deployments are repeatable.
    image: ollama/ollama:latest
    container_name: hermes-ollama
    restart: unless-stopped

    environment:
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=1
      - OLLAMA_KEEP_ALIVE=10m
      - OLLAMA_MODELS=/models

    volumes:
      - ollama_models:/models
      - /tmp/ollama:/tmp

    ports:
      - "127.0.0.1:11434:11434"    # Local only — not exposed externally

    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
        limits:
          memory: 90G

    healthcheck:
      # Probe with the bundled CLI rather than curl; `ollama list` only
      # succeeds when the server answers.
      # NOTE(review): the stock ollama/ollama image is not known to ship
      # curl — confirm before switching back to an HTTP probe.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s

    networks:
      - hermes-internal
      # The server itself downloads models, and published ports need a
      # non-internal network; hermes-internal alone provides neither.
      - hermes-egress

    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "5"

  # ─── Model Initialization (one-shot task) ──────────────────────────────────
  ollama-init:
    image: ollama/ollama:latest
    container_name: hermes-ollama-init
    restart: "no"

    depends_on:
      ollama:
        condition: service_healthy

    environment:
      # The CLI in this container is only a client: the pull is carried out
      # by the `ollama` service, so no model volume is mounted here.
      - OLLAMA_HOST=http://ollama:11434

    # The image's default entrypoint is the ollama binary, which would receive
    # `sh -c ...` as arguments; run the command through a shell instead.
    entrypoint: ["/bin/sh", "-c"]
    command:
      - >-
        ollama pull ${OLLAMA_MODEL:-nous-hermes3:70b} &&
        echo 'Model initialization complete'

    networks:
      - hermes-internal

  # ─── Hermes Agent MCP Server ───────────────────────────────────────────────
  hermes-agent:
    # Naming the built image gives it a stable tag that can be saved and
    # restored for rollbacks (`docker tag hermes-agent:latest ...`).
    image: hermes-agent:latest
    build:
      context: .
      dockerfile: Dockerfile
      args:
        - BUILD_ENV=production

    container_name: hermes-agent
    restart: unless-stopped

    depends_on:
      ollama:
        condition: service_healthy
      ollama-init:
        condition: service_completed_successfully

    # Values listed under `environment` take precedence over the same keys in
    # `env_file`; keep a key in only one of the two places if the file's value
    # is meant to win (e.g. LOG_LEVEL).
    environment:
      - HERMES_BASE_URL=http://ollama:11434
      - HERMES_MODEL=${HERMES_MODEL:-nous-hermes3:70b}
      - MCP_PORT=8765
      - MAX_TOKENS=4096
      - TEMPERATURE=0.1
      - CONTEXT_WINDOW=65536
      - REQUEST_TIMEOUT=120
      - PROMETHEUS_PORT=9090
      - LOG_LEVEL=INFO
      - LOG_FORMAT=json
      - ENABLE_RATE_LIMITING=true
      - MAX_REQUESTS_PER_MINUTE=60

    env_file:
      - .env.production

    # Mounted read-only at /run/secrets/<name> inside the container.
    # NOTE(review): file-based secrets keep the host file's ownership and mode;
    # confirm the container's non-root user can read them.
    secrets:
      - api_key
      - db_password

    volumes:
      - hermes_logs:/app/logs
      - hermes_data:/app/data
      - hermes_cache:/app/cache

    ports:
      - "127.0.0.1:8765:8765"
      - "127.0.0.1:9090:9090"

    deploy:
      resources:
        limits:
          cpus: "4.0"
          memory: 4G
        reservations:
          cpus: "1.0"
          memory: 1G

    ulimits:
      nofile:
        soft: 65536
        hard: 65536

    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8765/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

    networks:
      - hermes-internal
      - hermes-external

    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "10"

  # ─── Nginx Reverse Proxy ───────────────────────────────────────────────────
  nginx:
    image: nginx:alpine
    container_name: hermes-nginx
    restart: unless-stopped
    depends_on:
      # Start proxying only once the agent passes its health check.
      hermes-agent:
        condition: service_healthy
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx_logs:/var/log/nginx
    ports:
      - "80:80"
      - "443:443"
    networks:
      - hermes-external

  # ─── Prometheus Monitoring ─────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:latest
    container_name: hermes-prometheus
    restart: unless-stopped
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "127.0.0.1:9091:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
    networks:
      - hermes-internal   # scrape targets
      - hermes-egress     # needed for the published UI port to be reachable

# ─── Networks ─────────────────────────────────────────────────────────────────
networks:
  hermes-internal:
    driver: bridge
    internal: true        # No external internet access
  hermes-external:
    driver: bridge
  # Outbound access and host port publishing for back-end services that must
  # not share a network with nginx.
  hermes-egress:
    driver: bridge

# ─── Volumes ──────────────────────────────────────────────────────────────────
volumes:
  ollama_models:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/ollama/models     # Bind to large host storage

  hermes_logs:
    driver: local

  hermes_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/hermes/data

  hermes_cache:
    # In-memory cache (cleared on restart). tmpfs is a mount *type* of the
    # `local` driver, not a volume driver of its own.
    # NOTE(review): verify the non-root container user can write here; add
    # uid=/gid=/mode= to `o` if it cannot.
    driver: local
    driver_opts:
      type: tmpfs
      device: tmpfs
      o: "size=2g"

  nginx_logs:
    driver: local

  prometheus_data:
    driver: local

# ─── Docker Secrets ───────────────────────────────────────────────────────────
secrets:
  api_key:
    file: ./secrets/api_key.txt
  db_password:
    file: ./secrets/db_password.txt

47.3 Environment Variable Management (Secrets)

Configuration Priority

Security hierarchy (highest → lowest):
1. Docker Secrets (/run/secrets/)     ← Safest (in-memory mount)
2. Environment variables (-e KEY=VAL)  ← Convenient but visible in docker inspect
3. .env files                          ← Dev only; avoid in production
4. Hardcoded defaults                  ← Last resort only

Secret Setup

# Create the secret files that docker-compose.yml mounts under /run/secrets/.
mkdir -p secrets

# Write inside a subshell with a restrictive umask so the files are never
# readable by other users, not even briefly before chmod runs.
# printf '%s' writes the value without a trailing newline in every shell
# (`echo -n` is not portable).
(
    umask 077
    printf '%s' "your-super-secret-api-key" > secrets/api_key.txt
    printf '%s' "db-password-here" > secrets/db_password.txt
)
# Also tightens files that already existed with looser permissions.
# NOTE(review): file-based Compose secrets keep the host file's owner and
# mode — confirm the container's non-root user can still read a 600 file.
chmod 600 secrets/*.txt

# Keep secrets out of version control; append each rule only once so the
# block can be re-run without duplicating .gitignore entries.
for entry in "secrets/" ".env.production"; do
    grep -qxF -- "$entry" .gitignore 2>/dev/null || echo "$entry" >> .gitignore
done

Reading Secrets in Python

from pathlib import Path
import os

def read_secret(secret_name: str, env_fallback: str = None) -> str:
    """
    Read from Docker Secret file first,
    fall back to environment variable.
    """
    secret_file = Path(f"/run/secrets/{secret_name}")
    if secret_file.exists():
        return secret_file.read_text().strip()

    if env_fallback:
        value = os.getenv(env_fallback)
        if value:
            return value

    value = os.getenv(secret_name.upper())
    if value:
        return value

    raise ValueError(f"Secret '{secret_name}' not found")

# Resolve both credentials once at import time; a missing secret raises
# ValueError here, so misconfiguration surfaces at startup.
API_KEY = read_secret(secret_name="api_key", env_fallback="API_KEY")
DB_PASSWORD = read_secret(secret_name="db_password", env_fallback="DB_PASSWORD")

Environment File Templates

# .env.example (commit to Git — no real values)
HERMES_MODEL=nous-hermes3:70b
CONTEXT_WINDOW=65536
MAX_TOKENS=4096
TEMPERATURE=0.1
REQUEST_TIMEOUT=120
ENABLE_RATE_LIMITING=true
MAX_REQUESTS_PER_MINUTE=60
LOG_LEVEL=INFO
PROMETHEUS_ENABLED=true
SENTRY_DSN=

# .env.production (never commit — gitignored)
HERMES_MODEL=nous-hermes3:70b
CONTEXT_WINDOW=65536
MAX_TOKENS=4096
TEMPERATURE=0.1
REQUEST_TIMEOUT=120
ENABLE_RATE_LIMITING=true
MAX_REQUESTS_PER_MINUTE=60
LOG_LEVEL=WARNING
PROMETHEUS_ENABLED=true
SENTRY_DSN=https://your-sentry-dsn

47.4 Data Persistence

Volume Strategy by Data Type

Data	Volume Type	Persistence	Notes
Model files (GGUF)	bind mount	Forever	Huge (30–140 GB); bind to large storage
Application logs	named volume	Long-term	Add logrotate
Conversation history	bind mount	Long-term	Back up regularly
Inference cache	tmpfs	Session only	Fastest; cleared on restart

Backup Script

#!/bin/bash
# backup_hermes.sh
#
# Archives the Hermes data volume and the last 7 days of logs into a dated
# directory under /backup/hermes, then prunes dated directories older than
# 30 days.
#
# Environment:
#   HERMES_DATA_VOLUME  Docker volume holding application data (default: hermes_data)
#   HERMES_LOGS_VOLUME  Docker volume holding logs             (default: hermes_logs)
#
# Compose normally prefixes volume names with the project name (for example
# myproject_hermes_data). Check `docker volume ls` and set the variables
# accordingly; mounting a name that does not exist makes Docker create a new
# empty volume, and the backup would then be empty.

# Abort on the first failing command so a broken backup is never reported
# as complete.
set -euo pipefail

BACKUP_ROOT="/backup/hermes"
BACKUP_DIR="$BACKUP_ROOT/$(date +%Y%m%d)"
DATA_VOLUME="${HERMES_DATA_VOLUME:-hermes_data}"
LOGS_VOLUME="${HERMES_LOGS_VOLUME:-hermes_logs}"

mkdir -p "$BACKUP_DIR"

# Back up application data (source volume mounted read-only)
docker run --rm \
    -v "$DATA_VOLUME":/data:ro \
    -v "$BACKUP_DIR":/backup \
    alpine tar czf /backup/hermes_data.tar.gz -C /data .

# Back up recent logs (last 7 days)
# -type f: without it, find can match the /logs directory itself, and tar
# would then archive every log regardless of age.
docker run --rm \
    -v "$LOGS_VOLUME":/logs:ro \
    -v "$BACKUP_DIR":/backup \
    alpine find /logs -type f -mtime -7 -exec tar czf /backup/recent_logs.tar.gz {} +

# Prune backups older than 30 days
# -mindepth 1 guarantees the backup root itself can never be selected.
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime +30 -exec rm -rf {} +

echo "Backup complete: $BACKUP_DIR"

47.5 Health Check Configuration

Three-Tier Health Check API

# health_check.py
from fastapi import FastAPI
import httpx, time, shutil, psutil

app = FastAPI()

@app.get("/health")
async def health():
    """Liveness probe: answers as long as the process can serve requests."""
    # No dependency checks here, so the probe stays cheap.
    now = time.time()
    payload = {"status": "ok", "timestamp": now}
    return payload

@app.get("/health/ready")
async def readiness():
    """
    Readiness — are all dependencies available?

    Checks Ollama reachability, free disk space under /app/data and host
    memory usage. Responds with HTTP 200 when ready and HTTP 503 otherwise;
    the body is ``{"status": ..., "checks": {...}}`` in both cases.
    """
    # Function-scope imports keep this handler self-contained.
    import os
    from fastapi.responses import JSONResponse

    # Same variable the compose file sets for the agent; the default matches
    # the compose service name.
    base_url = os.getenv("HERMES_BASE_URL", "http://ollama:11434").rstrip("/")

    checks = {}
    ready = True

    # Check Ollama
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            r = await client.get(f"{base_url}/api/version")
            ollama_ok = r.status_code == 200
            checks["ollama"] = {
                "status": "ok" if ollama_ok else "error",
                "latency_ms": r.elapsed.total_seconds() * 1000
            }
            # A reachable server that answers with an error is still not usable.
            if not ollama_ok:
                ready = False
    except Exception as e:
        checks["ollama"] = {"status": "error", "error": str(e)}
        ready = False

    # Check disk space
    try:
        disk = shutil.disk_usage("/app/data")
    except OSError as e:
        # A missing or unreadable data directory means "not ready", not HTTP 500.
        checks["disk"] = {"status": "error", "error": str(e)}
        ready = False
    else:
        free_gb = disk.free / 1024 ** 3
        checks["disk"] = {"status": "ok" if free_gb > 1 else "warning", "free_gb": round(free_gb, 2)}
        if free_gb < 0.5:
            ready = False

    # Check memory (informational only — never flips readiness)
    mem = psutil.virtual_memory()
    checks["memory"] = {"status": "ok" if mem.percent < 90 else "warning", "used_pct": mem.percent}

    # FastAPI does not interpret a (body, status) tuple the way Flask does —
    # it would be serialized as a JSON array with HTTP 200. Set the status
    # code explicitly on the response instead.
    return JSONResponse(
        status_code=200 if ready else 503,
        content={"status": "ready" if ready else "not_ready", "checks": checks},
    )

@app.get("/health/live")
async def liveness():
    """
    Deep liveness — can the model actually infer?

    Sends a tiny generation request to Ollama. Responds with HTTP 200 and
    ``{"status": "ok", "inference": "working"}`` on success, and HTTP 503
    with an error body when the request fails or returns a non-200 status.
    """
    # Function-scope imports keep this handler self-contained.
    import os
    from fastapi.responses import JSONResponse

    # Use the same settings the compose file passes to the agent; the
    # defaults equal the previously hard-coded values.
    base_url = os.getenv("HERMES_BASE_URL", "http://ollama:11434").rstrip("/")
    model = os.getenv("HERMES_MODEL", "nous-hermes3:70b")

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post(
                f"{base_url}/api/generate",
                # num_predict=5 keeps the probe cheap: only a few tokens are generated.
                json={"model": model, "prompt": "Hi",
                      "stream": False, "options": {"num_predict": 5}}
            )
            if r.status_code == 200:
                return {"status": "ok", "inference": "working"}
    except Exception as e:
        # Returning a (body, status) tuple would yield HTTP 200 with a JSON
        # array in FastAPI; use an explicit response to send 503.
        return JSONResponse(status_code=503, content={"status": "error", "inference": str(e)})
    return JSONResponse(status_code=503, content={"status": "error"})

47.6 Container Resource Limits

Resource Configuration

# Resource-related excerpt of docker-compose.yml (other keys omitted).
services:
  hermes-agent:
    deploy:
      resources:
        limits:
          cpus: "4.0"       # Hard CPU cap
          memory: 4G        # OOM kill threshold
        reservations:
          # Same values as the full docker-compose.yml, so the excerpt and
          # the deployed file do not drift apart.
          cpus: "1.0"       # Guaranteed minimum
          memory: 1G
    ulimits:
      nofile:
        soft: 65536
        hard: 65536

  ollama:
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1      # Specific GPU count (use `all` to expose every GPU)
              capabilities: [gpu]
        limits:
          memory: 90G

Monitoring Resource Usage

# Live per-container CPU, memory and network usage
docker stats \
    --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}"

# GPU name, memory and utilization, re-sampled every 5 seconds
nvidia-smi --loop=5 --format=csv \
    --query-gpu=name,memory.used,memory.total,utilization.gpu

47.7 Production Operations Runbook

First-Time Deployment

# 1. Prepare directory structure
#    /data/... are the host paths bound by the ollama_models and hermes_data volumes
mkdir -p /data/ollama/models /data/hermes/{data,logs} secrets

# 2. Fill in secrets
#    docker-compose.yml declares BOTH api_key and db_password; `docker compose up`
#    fails if either file is missing.
printf '%s' "your-api-key" > secrets/api_key.txt
printf '%s' "your-db-password" > secrets/db_password.txt
chmod 600 secrets/*.txt
cp .env.example .env.production && vim .env.production

# 3. Build images
docker compose build --no-cache

# 4. Start in background
docker compose up -d

# 5. Watch initialization (returns once the one-shot container exits)
docker compose logs -f ollama-init

# 6. Verify health (-f makes curl exit non-zero on HTTP errors such as 503)
curl -f http://localhost:8765/health
curl -f http://localhost:8765/health/ready

# 7. Test MCP tool call
python test_mcp.py

Minimal-Downtime Update

# Rebuild and recreate only the Hermes Agent container.
# --build rebuilds the image first; --no-deps leaves Ollama untouched.
docker compose up -d --no-deps --build hermes-agent

Rollback

# Roll hermes-agent back to the image saved under the :previous tag.
# NOTE(review): this assumes the service image is named hermes-agent:latest
# (the compose service needs `image: hermes-agent:latest`) and that the last
# good image was tagged before updating, e.g. with
#   docker tag hermes-agent:latest hermes-agent:previous
# — confirm both before relying on this procedure.
docker compose stop hermes-agent
docker compose rm -f hermes-agent
docker tag hermes-agent:previous hermes-agent:latest
# --no-build reuses the retagged image instead of rebuilding from source;
# --no-deps leaves Ollama and the init task untouched.
docker compose up -d --no-deps --no-build hermes-agent

Chapter Summary

Key elements of containerizing Hermes Agent for production:

Multi-stage build: separate builder and runtime stages; minimize image size
Non-root execution: dedicated user reduces attack surface
Secrets hierarchy: Docker Secrets > env vars > .env files (in security order)
Data persistence: bind mounts for models and application data; a named volume for logs; tmpfs for cache
Three-tier health checks: /health (alive) + /health/ready (dependencies) + /health/live (inference working)
Resource limits: always set memory limits to prevent OOM from affecting the entire host

Review Questions

The ollama-init service has restart: "no". If the model download fails midway, the service won't retry automatically. How would you modify the compose configuration to allow manually re-triggering initialization without redeploying the entire stack?
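When Hermes Agent's memory usage hits the memory: 4G limit, the kernel's OOM killer terminates the container with SIGKILL, which cannot be caught or handled. A normal docker stop, by contrast, sends SIGTERM first and only escalates to SIGKILL after a grace period. How would you modify the server code to handle SIGTERM gracefully — completing in-flight requests before shutting down?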
This chapter uses tini as the container init process. Why does a Python process (especially one that spawns child processes) need an init process? What specific problems arise without one?

Rate this chapter

4.8 / 5 (3 ratings)