# * Initial plan
#
# * Add production monitoring and alerting infrastructure
#   - Create Prometheus alert rules for all critical thresholds
#   - Add Alertmanager configuration with PagerDuty, Slack, and email routing
#   - Create docker-compose.monitoring.yml with full monitoring stack
#   - Add Sentry error tracking service integration
#   - Create comprehensive alerting runbook documentation
#   - Add monitoring setup guide with detailed instructions
#   - Configure blackbox exporter for uptime monitoring
#   - Update .env.example with monitoring and alerting configuration
#   Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# * Add blockchain transaction and health check metrics, integrate Sentry error tracking
#   - Enhanced metrics service with blockchain transaction tracking
#   - Added health check status metrics to Prometheus
#   - Added queue depth gauge for future queue implementation
#   - Integrated Sentry error tracking in Express app
#   - Updated health check endpoint to export metrics
#   - Fixed linting issues in new code
#   - Created monitoring directory README
#   Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# * Add comprehensive monitoring implementation summary documentation
#   Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# * Address code review feedback: improve query string filtering, add error params
#   Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# * Fix Sentry error handling: remove duplicate capture, fix fallback handler
#   - Remove redundant sentryService.captureException call in global error handler
#     (Sentry's error handler already captures all errors)
#   - Fix fallback error handler to pass error to next handler with next(_err)
#     instead of swallowing it with next()
#   Addresses review feedback from @copilot-pull-request-reviewer
#   Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# ---------
# Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
# Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
#
# 225 lines, 6.8 KiB, YAML
version: "3.9"

# Docker Compose configuration for Monitoring Stack
# This file adds monitoring services to the Internet-ID infrastructure
# Usage: docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d

services:
# Prometheus - Metrics collection and alerting
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Retain 30 days of time-series data
      - '--storage.tsdb.retention.time=30d'
      # Turns on the HTTP lifecycle endpoints (/-/reload, /-/quit); port 9090
      # is published below, so restrict access to it on the host
      - '--web.enable-lifecycle'
    volumes:
      # Scrape configuration and alert rules are mounted read-only
      - ./ops/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./ops/monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      # Named volume backing --storage.tsdb.path
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring
      # Also attached to the application network (presumably to scrape the
      # application services - confirm against prometheus.yml)
      - default
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

# Alertmanager - Alert routing and management
  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    volumes:
      - ./ops/monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      # Named volume backing --storage.path
      - alertmanager_data:/alertmanager
    # NOTE(review): Alertmanager does not expand environment variables inside
    # alertmanager.yml by itself - confirm the mounted config is rendered from
    # these values by some other step, otherwise they have no effect.
    environment:
      # PagerDuty configuration
      - PAGERDUTY_SERVICE_KEY=${PAGERDUTY_SERVICE_KEY}
      - PAGERDUTY_ROUTING_KEY=${PAGERDUTY_ROUTING_KEY}
      - PAGERDUTY_DATABASE_KEY=${PAGERDUTY_DATABASE_KEY}
      - PAGERDUTY_DBA_ROUTING_KEY=${PAGERDUTY_DBA_ROUTING_KEY}
      # Slack configuration
      - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
      - SLACK_CRITICAL_CHANNEL=${SLACK_CRITICAL_CHANNEL:-#alerts-critical}
      - SLACK_WARNINGS_CHANNEL=${SLACK_WARNINGS_CHANNEL:-#alerts-warnings}
      # Email configuration
      - ALERT_EMAIL=${ALERT_EMAIL:-ops@example.com}
      - INFO_EMAIL=${INFO_EMAIL:-team@example.com}
      - ALERT_FROM_EMAIL=${ALERT_FROM_EMAIL:-alerts@internet-id.com}
      - SMTP_HOST=${SMTP_HOST:-smtp.gmail.com}
      - SMTP_PORT=${SMTP_PORT:-587}
      - SMTP_USERNAME=${SMTP_USERNAME}
      - SMTP_PASSWORD=${SMTP_PASSWORD}
    ports:
      - "9093:9093"
    networks:
      # Only on the monitoring network; not attached to the application network
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

# Grafana - Metrics visualization and dashboards
  grafana:
    image: grafana/grafana:10.2.2
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      # Provisioning files and dashboards are mounted read-only
      - ./ops/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./ops/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    environment:
      # Credentials fall back to admin/admin when unset; override them in .env
      # for any deployment reachable by others
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Enable alerting.
      # Only unified alerting is switched on: Grafana refuses to start when the
      # legacy flag (GF_ALERTING_ENABLED) and unified alerting are both true.
      - GF_UNIFIED_ALERTING_ENABLED=true
      # Anonymous access for public dashboards (optional)
      - GF_AUTH_ANONYMOUS_ENABLED=${GRAFANA_ANONYMOUS_ENABLED:-false}
    ports:
      # Host port 3001 maps to Grafana's container port 3000
      - "3001:3000"
    networks:
      - monitoring
      - default
    restart: unless-stopped
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

# PostgreSQL Exporter - Database metrics
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: postgres-exporter
    environment:
      # Credentials are passed separately from the host/database part of the
      # connection string, so a password containing URL-reserved characters
      # (@, :, /, ?) cannot corrupt the connection URL.
      - DATA_SOURCE_URI=db:5432/${POSTGRES_DB:-internetid}?sslmode=disable
      - DATA_SOURCE_USER=${POSTGRES_USER:-internetid}
      - DATA_SOURCE_PASS=${POSTGRES_PASSWORD}
    ports:
      - "9187:9187"
    networks:
      - monitoring
      # Needs the application network to reach the `db` service
      - default
    restart: unless-stopped
    depends_on:
      # `db` is not defined in this file; it must come from the base compose file
      - db
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9187/"]
      interval: 30s
      timeout: 10s
      retries: 3

# Redis Exporter - Cache metrics
  redis-exporter:
    # The -alpine variant is used because the default image is built from
    # scratch and ships no wget, which the healthcheck below needs.
    image: oliver006/redis_exporter:v1.55.0-alpine
    container_name: redis-exporter
    environment:
      - REDIS_ADDR=redis://redis:6379
    ports:
      - "9121:9121"
    networks:
      - monitoring
      # Needs the application network to reach the `redis` service
      - default
    restart: unless-stopped
    depends_on:
      # `redis` is not defined in this file; it must come from the base compose file
      - redis
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9121/"]
      interval: 30s
      timeout: 10s
      retries: 3

# Node Exporter - System metrics (CPU, memory, disk, network)
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    command:
      # Point the collectors at the host filesystems mounted under volumes below
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # `$$` is Compose's escape for a literal `$` (the regex end anchor)
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      # Host filesystems, all mounted read-only
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9100/"]
      interval: 30s
      timeout: 10s
      retries: 3

# cAdvisor - Container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    # Runs privileged with the host kernel log device passed through
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      # Host paths, all mounted read-only
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /cgroup:/cgroup:ro
    ports:
      # NOTE(review): 8080 is a commonly used host port - confirm it does not
      # clash with another service on the host
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

# Blackbox Exporter - External endpoint monitoring
  blackbox-exporter:
    image: prom/blackbox-exporter:v0.24.0
    container_name: blackbox-exporter
    command:
      - '--config.file=/etc/blackbox/blackbox.yml'
    volumes:
      # Probe module definitions, mounted read-only
      - ./ops/monitoring/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
    ports:
      - "9115:9115"
    networks:
      - monitoring
      # Also attached to the application network (presumably so probes can
      # target application services directly - confirm against blackbox.yml)
      - default
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9115/"]
      interval: 30s
      timeout: 10s
      retries: 3

networks:
  # Bridge network shared by the monitoring services
  monitoring:
    driver: bridge
  # Maps this file's `default` network onto the existing application network
  # instead of creating a new one.
  # NOTE(review): an external network must already exist when `up` runs -
  # confirm `internet-id_default` is created beforehand (e.g. by an earlier
  # run of the base stack) and that the project name matches.
  default:
    external: true
    name: internet-id_default

# Named volumes with default driver settings, used by the services above
volumes:
  prometheus_data: {}
  alertmanager_data: {}
  grafana_data: {}