Files
internet-id/docker-compose.monitoring.yml
Copilot b6c4dc984a Configure production monitoring and alerting infrastructure (#105)
* Initial plan

* Add production monitoring and alerting infrastructure

- Create Prometheus alert rules for all critical thresholds
- Add Alertmanager configuration with PagerDuty, Slack, and email routing
- Create docker-compose.monitoring.yml with full monitoring stack
- Add Sentry error tracking service integration
- Create comprehensive alerting runbook documentation
- Add monitoring setup guide with detailed instructions
- Configure blackbox exporter for uptime monitoring
- Update .env.example with monitoring and alerting configuration

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Add blockchain transaction and health check metrics, integrate Sentry error tracking

- Enhanced metrics service with blockchain transaction tracking
- Added health check status metrics to Prometheus
- Added queue depth gauge for future queue implementation
- Integrated Sentry error tracking in Express app
- Updated health check endpoint to export metrics
- Fixed linting issues in new code
- Created monitoring directory README

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Add comprehensive monitoring implementation summary documentation

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Address code review feedback: improve query string filtering, add error params

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Fix Sentry error handling: remove duplicate capture, fix fallback handler

- Remove redundant sentryService.captureException call in global error handler
  (Sentry's error handler already captures all errors)
- Fix fallback error handler to pass error to next handler with next(_err)
  instead of swallowing it with next()

Addresses review feedback from @copilot-pull-request-reviewer

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
2025-10-31 18:32:10 -05:00

225 lines
6.8 KiB
YAML

# NOTE: the top-level `version` key is obsolete in Compose V2 (it is ignored);
# kept only for compatibility with older docker-compose v1 tooling.
version: "3.9"
# Docker Compose configuration for Monitoring Stack
# This file adds monitoring services to the Internet-ID infrastructure
# Usage: docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d
services:
  # Prometheus - Metrics collection and alerting
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      # Allows config reload via HTTP POST to /-/reload
      - '--web.enable-lifecycle'
    volumes:
      - ./ops/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./ops/monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring
      # Joins the app's default network so it can scrape application targets
      - default
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Alertmanager - Alert routing and management
  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    volumes:
      - ./ops/monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    environment:
      # PagerDuty configuration
      - PAGERDUTY_SERVICE_KEY=${PAGERDUTY_SERVICE_KEY}
      - PAGERDUTY_ROUTING_KEY=${PAGERDUTY_ROUTING_KEY}
      - PAGERDUTY_DATABASE_KEY=${PAGERDUTY_DATABASE_KEY}
      - PAGERDUTY_DBA_ROUTING_KEY=${PAGERDUTY_DBA_ROUTING_KEY}
      # Slack configuration
      - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
      # Quoted: the default values contain '#', which would start a YAML
      # comment if a space ever preceded it in a plain scalar.
      - "SLACK_CRITICAL_CHANNEL=${SLACK_CRITICAL_CHANNEL:-#alerts-critical}"
      - "SLACK_WARNINGS_CHANNEL=${SLACK_WARNINGS_CHANNEL:-#alerts-warnings}"
      # Email configuration
      - ALERT_EMAIL=${ALERT_EMAIL:-ops@example.com}
      - INFO_EMAIL=${INFO_EMAIL:-team@example.com}
      - ALERT_FROM_EMAIL=${ALERT_FROM_EMAIL:-alerts@internet-id.com}
      - SMTP_HOST=${SMTP_HOST:-smtp.gmail.com}
      - SMTP_PORT=${SMTP_PORT:-587}
      - SMTP_USERNAME=${SMTP_USERNAME}
      - SMTP_PASSWORD=${SMTP_PASSWORD}
    ports:
      - "9093:9093"
    networks:
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Grafana - Metrics visualization and dashboards
  grafana:
    image: grafana/grafana:10.2.2
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./ops/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./ops/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Unified alerting only: legacy alerting (GF_ALERTING_ENABLED) was
      # removed in Grafana 10, and enabling both prevents the server from
      # starting on this image version (10.2.2).
      - GF_UNIFIED_ALERTING_ENABLED=true
      # Anonymous access for public dashboards (optional)
      - GF_AUTH_ANONYMOUS_ENABLED=${GRAFANA_ANONYMOUS_ENABLED:-false}
    ports:
      # Host port 3001 avoids clashing with the app on 3000
      - "3001:3000"
    networks:
      - monitoring
      - default
    restart: unless-stopped
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # PostgreSQL Exporter - Database metrics
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: postgres-exporter
    environment:
      # Targets the `db` service from the base docker-compose.yml
      - DATA_SOURCE_NAME=postgresql://${POSTGRES_USER:-internetid}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-internetid}?sslmode=disable
    ports:
      - "9187:9187"
    networks:
      - monitoring
      - default
    restart: unless-stopped
    depends_on:
      - db
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9187/"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Redis Exporter - Cache metrics
  redis-exporter:
    image: oliver006/redis_exporter:v1.55.0
    container_name: redis-exporter
    environment:
      # Targets the `redis` service from the base docker-compose.yml
      - REDIS_ADDR=redis://redis:6379
    ports:
      - "9121:9121"
    networks:
      - monitoring
      - default
    restart: unless-stopped
    depends_on:
      - redis
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9121/"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Node Exporter - System metrics (CPU, memory, disk, network)
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # '$$' escapes '$' for Compose variable interpolation; the exporter
      # receives the regex anchor '($|/)' as intended.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9100/"]
      interval: 30s
      timeout: 10s
      retries: 3

  # cAdvisor - Container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # NOTE(review): /cgroup does not exist on most modern hosts (cgroups are
      # mounted at /sys/fs/cgroup, already covered by the /sys mount above) —
      # confirm this mount is actually needed on the target host.
      - /cgroup:/cgroup:ro
    ports:
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Blackbox Exporter - External endpoint monitoring
  blackbox-exporter:
    image: prom/blackbox-exporter:v0.24.0
    container_name: blackbox-exporter
    command:
      - '--config.file=/etc/blackbox/blackbox.yml'
    volumes:
      - ./ops/monitoring/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
    ports:
      - "9115:9115"
    networks:
      - monitoring
      - default
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9115/"]
      interval: 30s
      timeout: 10s
      retries: 3

networks:
  # Internal network for the monitoring stack itself
  monitoring:
    driver: bridge
  # The application's network, created by the base docker-compose.yml
  default:
    external: true
    name: internet-id_default

volumes:
  prometheus_data:
  alertmanager_data:
  grafana_data: