Skip to main content
ship before you scale

Infrastructure Health Check Script

5 min read Chapter 41 of 42

Infrastructure Health Check Script

The Feature

A Python script runs daily (via cron or GitHub Actions) and checks every infrastructure component against the thresholds defined in Chapter 14. It produces a plain-text summary: green (healthy), yellow (investigate), or red (act now) for each component. If any component is red, the developer receives a notification.

The Decision

A script, not a service. Not a monitoring tool with a database and a web UI. A script that runs, checks, prints results, and exits. It queries the same metrics that Grafana Cloud collects, but instead of watching a dashboard, the developer receives a daily summary. If everything is green, the summary takes two seconds to read and the developer moves on.

The Implementation

Health Check Script

#!/usr/bin/env python3
"""Daily infrastructure health check for Marketflow."""
import asyncio
import json
import os
import sys

import httpx
import asyncpg


# Alert thresholds (from CH14), one row per metric: (name, yellow, red).
# A reading at or above a limit gets that colour; below yellow is green.
THRESHOLDS = {
    name: {"yellow": yellow, "red": red}
    for name, yellow, red in (
        ("cpu_percent", 70, 90),
        ("memory_percent", 80, 90),
        ("disk_percent", 70, 85),
        ("db_size_mb", 400, 480),  # 500 MB free tier
        ("db_connections_percent", 80, 95),
        ("response_time_p95_ms", 500, 2000),
        ("error_rate_percent", 2, 5),
        ("redis_memory_percent", 75, 90),
    )
}


def status_for(metric: str, value: float) -> str:
    """Classify *value* against the limits registered for *metric*.

    Returns "RED" when the value is at or above the red limit, "YELLOW"
    when it is at or above the yellow limit, and "GREEN" otherwise.
    Raises KeyError if *metric* has no entry in THRESHOLDS.
    """
    limits = THRESHOLDS[metric]
    # Test the most severe level first so it wins when both limits are met.
    for key, label in (("red", "RED"), ("yellow", "YELLOW")):
        if value >= limits[key]:
            return label
    return "GREEN"


def status_icon(status: str) -> str:
    """Return the fixed-width text marker shown in front of a report row.

    Raises KeyError for anything other than "GREEN", "YELLOW" or "RED".
    """
    markers = {
        "GREEN": "[OK]",
        "YELLOW": "[!!]",
        "RED": "[XX]",
    }
    return markers[status]


async def check_system_metrics(api_url: str) -> list[dict]:
    """Check that the API answers and reports itself healthy.

    Args:
        api_url: Base URL of the API; ``/metrics`` and ``/health`` are
            appended to it.

    Returns:
        A one-element list holding the "API Health" result dict
        (``component``, ``status``, ``value``). The status is GREEN only
        when the ``/health`` payload has ``status == "healthy"``; any
        exception raised while talking to the API yields RED.
    """
    results = []

    async with httpx.AsyncClient(timeout=10) as client:
        try:
            # Reachability probe only: the metrics body is not parsed, but a
            # failure to fetch it still marks the API as RED below.
            await client.get(f"{api_url}/metrics")

            # The health endpoint carries the status that is reported.
            health_resp = await client.get(f"{api_url}/health")
            health = health_resp.json()
            results.append({
                "component": "API Health",
                "status": "GREEN" if health["status"] == "healthy" else "RED",
                "value": health["status"],
            })
        except Exception as e:
            # Best-effort by design: a broken API must show up as a RED row
            # in the report rather than crash the whole health check.
            results.append({
                "component": "API Health",
                "status": "RED",
                "value": f"Unreachable: {e}",
            })

    return results


async def check_database(database_url: str) -> list[dict]:
    """Check database size and connection count.

    Args:
        database_url: Connection string passed to ``asyncpg.connect``.

    Returns:
        One result dict (``component``, ``status``, ``value``) per check
        that completed. If connecting or any query fails, a RED "Database"
        entry is appended after whatever was already collected.
    """
    results = []

    try:
        conn = await asyncpg.connect(database_url)
        try:
            # Database size
            size_bytes = await conn.fetchval(
                "SELECT pg_database_size(current_database())"
            )
            size_mb = size_bytes / (1024 * 1024)
            s = status_for("db_size_mb", size_mb)
            results.append({
                "component": "Database Size",
                "status": s,
                "value": f"{size_mb:.0f} MB / 500 MB",
            })

            # Connection count, as a share of the server's max_connections
            active = await conn.fetchval(
                "SELECT count(*) FROM pg_stat_activity"
            )
            max_conn = await conn.fetchval(
                "SELECT setting::int FROM pg_settings WHERE name = 'max_connections'"
            )
            conn_pct = (active / max_conn) * 100
            s = status_for("db_connections_percent", conn_pct)
            results.append({
                "component": "DB Connections",
                "status": s,
                "value": f"{active}/{max_conn} ({conn_pct:.0f}%)",
            })
        finally:
            # Release the connection even when a query above raised, so the
            # health check itself never leaves a session open.
            await conn.close()
    except Exception as e:
        # Best-effort by design: report the failure as a RED row instead of
        # aborting the remaining checks.
        results.append({
            "component": "Database",
            "status": "RED",
            "value": f"Connection failed: {e}",
        })

    return results


async def check_redis(redis_url: str) -> list[dict]:
    """Check Redis memory usage.

    Args:
        redis_url: URL passed to ``redis.asyncio.from_url``.

    Returns:
        A one-element list with the "Redis Memory" result dict
        (``component``, ``status``, ``value``). When the server reports no
        ``maxmemory`` limit, the row is GREEN and shows only the used
        amount. Any exception (including a missing ``redis`` package)
        yields a RED "Redis" entry instead.
    """
    results = []

    try:
        # Imported here so a missing package is reported as a RED row
        # rather than breaking the import of the whole script.
        import redis.asyncio as aioredis

        r = aioredis.from_url(redis_url)
        try:
            info = await r.info("memory")

            used_mb = info["used_memory"] / (1024 * 1024)
            max_mb = info.get("maxmemory", 0) / (1024 * 1024)

            if max_mb > 0:
                pct = (used_mb / max_mb) * 100
                s = status_for("redis_memory_percent", pct)
                results.append({
                    "component": "Redis Memory",
                    "status": s,
                    "value": f"{used_mb:.0f} MB / {max_mb:.0f} MB ({pct:.0f}%)",
                })
            else:
                # Without a limit there is nothing to compute a percentage
                # against, so only the absolute usage is shown.
                results.append({
                    "component": "Redis Memory",
                    "status": "GREEN",
                    "value": f"{used_mb:.0f} MB (no limit set)",
                })
        finally:
            # Close the client even when the INFO call or parsing failed.
            await r.aclose()
    except Exception as e:
        results.append({
            "component": "Redis",
            "status": "RED",
            "value": f"Connection failed: {e}",
        })

    return results


def print_report(results: list[dict]) -> bool:
    """Write the formatted report to stdout and flag RED entries.

    Every entry in *results* must provide the keys ``status``,
    ``component`` and ``value``. One row is printed per entry, followed by
    an overall verdict: ACTION REQUIRED if any entry is RED, INVESTIGATE
    if any is YELLOW, otherwise ALL HEALTHY.

    Returns True when at least one entry is RED.
    """
    banner = "=" * 60
    print(banner)
    print("  MARKETFLOW INFRASTRUCTURE HEALTH CHECK")
    print(banner)

    for entry in results:
        marker = status_icon(entry["status"])
        # Component names are left-aligned and dot-padded to 30 columns.
        print(f"  {marker} {entry['component']:.<30} {entry['value']}")

    seen = {entry["status"] for entry in results}
    has_red = "RED" in seen

    if has_red:
        verdict = "  STATUS: ACTION REQUIRED"
    elif "YELLOW" in seen:
        verdict = "  STATUS: INVESTIGATE"
    else:
        verdict = "  STATUS: ALL HEALTHY"

    print(banner)
    print(verdict)
    print(banner)

    return has_red


async def main():
    """Run every check in turn, print the report and exit.

    Targets come from the environment: API_URL and REDIS_URL fall back to
    localhost defaults, while the database check is skipped entirely when
    DATABASE_URL is unset or empty. The process exits with status 1 if
    any result is RED, otherwise 0.
    """
    api_url = os.getenv("API_URL", "http://localhost:8000")
    database_url = os.getenv("DATABASE_URL", "")
    redis_url = os.getenv("REDIS_URL", "redis://localhost:6379")

    # Checks run one after another; report rows keep this order.
    results = []
    results += await check_system_metrics(api_url)
    if database_url:
        results += await check_database(database_url)
    results += await check_redis(redis_url)

    any_red = print_report(results)
    exit_code = 1 if any_red else 0
    sys.exit(exit_code)


if __name__ == "__main__":
    # Script entry point: main() ends by calling sys.exit(), so the process
    # exit status is 1 when any check is RED and 0 otherwise.
    asyncio.run(main())

Scheduled Execution with GitHub Actions

# .github/workflows/health-check.yml
# Runs scripts/health_check.py on a daily schedule and on manual dispatch.
name: Daily Infrastructure Health Check

on:
  schedule:
    - cron: "0 8 * * *" # 8 AM UTC daily
  workflow_dispatch: # Allow manual trigger

jobs:
  health-check:
    runs-on: ubuntu-latest
    steps:
      # The script lives in the repository, so check it out first
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # The third-party packages the script imports
      - name: Install dependencies
        run: pip install httpx asyncpg redis

      # Targets are injected from secrets as the environment variables
      # the script reads (API_URL, DATABASE_URL, REDIS_URL)
      - name: Run health check
        env:
          API_URL: ${{ secrets.PRODUCTION_API_URL }}
          DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
          REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
        run: python scripts/health_check.py

      # If the health check exits with code 1 (RED status),
      # the GitHub Actions step fails and sends a notification
      # via GitHub's built-in email notifications for failed workflows.

Alternative: Cron on the VPS

# On the Hetzner VPS, add to crontab:
# crontab -e
# Runs daily at 08:00 and appends both stdout and stderr to health.log.
# NOTE(review): nothing here rotates health.log, and with all output
# redirected cron presumably has nothing to mail - confirm this matches
# the notification approach the chapter relies on.
0 8 * * * cd /opt/marketflow && python3 scripts/health_check.py >> /var/log/marketflow/health.log 2>&1

The Trap

# TRAP: The health check script that checks everything and alerts on nothing
# It logs 50 metrics to a file that nobody reads
# The file grows to 10 MB and fills the disk
# The very thing the script should detect (disk usage) is caused by the script

# SAFE: Check only what matters, alert only on actionable thresholds
# 6 components, 2 thresholds each
# Non-zero exit code triggers existing notification (GitHub Actions, cron mail)
# No persistent log storage needed

A health check script is only useful if someone acts on its output. Non-zero exit codes integrate with existing notification systems. GitHub Actions sends an email when a workflow fails. Cron emails whatever a job writes to stdout or stderr to the address set in MAILTO, so the report reaches the developer as long as that output is not redirected to a file. No additional notification infrastructure needed.

The Cost

| Item | Cost |
| --- | --- |
| Health check script | $0 |
| GitHub Actions (scheduled) | Free for public repos, 2,000 min/month for private |
| Cron on VPS | $0 |

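The script runs in under 10 seconds. GitHub Actions free tier provides 2,000 minutes per month for private repositories. Because GitHub rounds each job up to the next whole minute (and the job also spends time on checkout and dependency installation), a daily health check uses roughly 30 minutes per month, still a small fraction of the allowance.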