Add production deployment infrastructure with Kubernetes, Terraform, and multi-strategy CI/CD (#145)

* Initial plan

* Add Kubernetes manifests and Terraform infrastructure modules

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Add Helm charts, deployment scripts, CI/CD workflows, and documentation

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Add infrastructure documentation and update README

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Fix code review issues and security vulnerabilities

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

* Address PR review comments: improve security, fix API versions, and enhance deployment reliability

Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
This commit was merged in pull request #145.
This commit is contained in:
Copilot
2025-11-02 17:27:49 -06:00
committed by GitHub
parent ca1f33b734
commit d3111dfbdf
56 changed files with 5663 additions and 0 deletions

291
.github/workflows/deploy-production.yml vendored Normal file
View File

@@ -0,0 +1,291 @@
name: Deploy to Production
on:
push:
branches: [main]
workflow_dispatch:
inputs:
deployment_strategy:
description: 'Deployment strategy'
required: true
default: 'rolling'
type: choice
options:
- rolling
- blue-green
- canary
env:
AWS_REGION: us-east-1
EKS_CLUSTER_NAME: spywatcher-production
REGISTRY: ghcr.io
IMAGE_NAME_BACKEND: ${{ github.repository_owner }}/spywatcher-backend
IMAGE_NAME_FRONTEND: ${{ github.repository_owner }}/spywatcher-frontend
jobs:
build-and-push:
name: Build and Push Docker Images
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
outputs:
backend-tag: ${{ steps.meta-backend.outputs.tags }}
frontend-tag: ${{ steps.meta-frontend.outputs.tags }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for backend
id: meta-backend
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}
tags: |
type=sha,prefix={{branch}}-
type=ref,event=branch
type=semver,pattern={{version}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push backend image
uses: docker/build-push-action@v5
with:
context: ./backend
file: ./backend/Dockerfile
push: true
tags: ${{ steps.meta-backend.outputs.tags }}
labels: ${{ steps.meta-backend.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Extract metadata for frontend
id: meta-frontend
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}
tags: |
type=sha,prefix={{branch}}-
type=ref,event=branch
type=semver,pattern={{version}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push frontend image
uses: docker/build-push-action@v5
with:
context: ./frontend
file: ./frontend/Dockerfile
push: true
tags: ${{ steps.meta-frontend.outputs.tags }}
labels: ${{ steps.meta-frontend.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
deploy:
name: Deploy to Kubernetes
runs-on: ubuntu-latest
needs: build-and-push
environment: production
permissions:
contents: read
id-token: write # Required for AWS OIDC authentication
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Update kubeconfig
run: |
aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
- name: Install kubectl
uses: azure/setup-kubectl@v4
with:
version: 'v1.28.0'
- name: Verify cluster access
run: |
kubectl cluster-info
kubectl get nodes
- name: Run database migrations
run: |
# Create unique migration job name
JOB_NAME="db-migration-$(date +%s)"
# Update the migration job manifest with unique name and latest image
kubectl get job spywatcher-db-migration -n spywatcher -o yaml 2>/dev/null | \
sed "s/name: spywatcher-db-migration/name: $JOB_NAME/" | \
sed "s|image: ghcr.io/subculture-collective/spywatcher-backend:.*|image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest|" | \
kubectl apply -f - || \
kubectl create job $JOB_NAME --image=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
-n spywatcher -- sh -c "npx prisma migrate deploy"
# Set DATABASE_URL secret for the job if created via kubectl create
kubectl set env job/$JOB_NAME -n spywatcher --from=secret/spywatcher-secrets DATABASE_URL=database-url || true
# Wait for migration to complete
kubectl wait --for=condition=complete --timeout=300s job/$JOB_NAME -n spywatcher
# Show migration logs
kubectl logs job/$JOB_NAME -n spywatcher
- name: Deploy with Rolling Update
if: github.event.inputs.deployment_strategy == 'rolling' || github.event.inputs.deployment_strategy == ''
run: |
# Update backend deployment
kubectl set image deployment/spywatcher-backend \
backend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
-n spywatcher
# Update frontend deployment
kubectl set image deployment/spywatcher-frontend \
frontend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}:latest \
-n spywatcher
# Wait for rollout to complete
kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=10m
kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=10m
- name: Deploy with Blue-Green
if: github.event.inputs.deployment_strategy == 'blue-green'
run: |
chmod +x ./scripts/deployment/blue-green-deploy.sh
IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
- name: Deploy with Canary
if: github.event.inputs.deployment_strategy == 'canary'
run: |
chmod +x ./scripts/deployment/canary-deploy.sh
IMAGE_TAG=latest ./scripts/deployment/canary-deploy.sh
- name: Run smoke tests
run: |
# Test via ingress if available, otherwise use port-forward
INGRESS_HOST=$(kubectl get ingress spywatcher-ingress -n spywatcher -o jsonpath='{.spec.rules[1].host}' 2>/dev/null || echo "")
if [ -n "$INGRESS_HOST" ]; then
echo "Testing via ingress: $INGRESS_HOST"
BACKEND_URL="https://${INGRESS_HOST}"
# Test health endpoints
echo "Testing liveness endpoint..."
curl -f "${BACKEND_URL}/health/live" || exit 1
echo "Testing readiness endpoint..."
curl -f "${BACKEND_URL}/health/ready" || exit 1
else
echo "No ingress found, testing via port-forward"
# Port-forward backend service to localhost:8080
kubectl port-forward svc/spywatcher-backend 8080:80 -n spywatcher &
PORT_FORWARD_PID=$!
# Wait for port-forward to be ready
sleep 5
# Test health endpoints
echo "Testing liveness endpoint..."
curl -f "http://localhost:8080/health/live" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
echo "Testing readiness endpoint..."
curl -f "http://localhost:8080/health/ready" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
# Kill port-forward process
kill $PORT_FORWARD_PID 2>/dev/null
fi
echo "Smoke tests passed!"
- name: Verify deployment
run: |
echo "=== Deployment Status ==="
kubectl get deployments -n spywatcher
kubectl get pods -n spywatcher
kubectl get services -n spywatcher
echo "=== Recent Events ==="
kubectl get events -n spywatcher --sort-by='.lastTimestamp' | tail -20
- name: Rollback on failure
if: failure()
run: |
echo "Deployment failed, rolling back..."
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
kubectl rollout undo deployment/spywatcher-frontend -n spywatcher
kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=5m
kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=5m
- name: Notify on success
if: success()
run: |
echo "✅ Production deployment successful!"
echo "Deployed commit: ${{ github.sha }}"
echo "Deployment strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}"
- name: Notify on failure
if: failure()
uses: 8398a7/action-slack@v3
with:
status: failure
text: |
Production deployment failed!
Commit: ${{ github.sha }}
Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}
webhook_url: ${{ secrets.SLACK_WEBHOOK }}
continue-on-error: true
post-deployment:
name: Post-Deployment Tasks
runs-on: ubuntu-latest
needs: deploy
if: success()
permissions:
contents: read
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Update kubeconfig
run: |
aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
- name: Clean up old resources
run: |
# Clean up completed migration jobs older than 1 hour
kubectl delete jobs -n spywatcher --field-selector status.successful=1 \
--ignore-not-found=true || true
# Clean up old replica sets
kubectl delete replicaset -n spywatcher --field-selector status.replicas=0 \
--ignore-not-found=true || true
- name: Update deployment documentation
run: |
echo "Deployment completed at $(date)" >> deployment-log.txt
echo "Commit: ${{ github.sha }}" >> deployment-log.txt
echo "Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}" >> deployment-log.txt
echo "---" >> deployment-log.txt

413
DEPLOYMENT.md Normal file
View File

@@ -0,0 +1,413 @@
# Deployment Guide
This document describes the production deployment strategy for Spywatcher, including infrastructure setup, deployment procedures, and rollback strategies.
## Table of Contents
- [Overview](#overview)
- [Infrastructure Setup](#infrastructure-setup)
- [Deployment Strategies](#deployment-strategies)
- [Kubernetes Deployment](#kubernetes-deployment)
- [Terraform Infrastructure](#terraform-infrastructure)
- [Helm Charts](#helm-charts)
- [CI/CD Pipeline](#cicd-pipeline)
- [Rollback Procedures](#rollback-procedures)
- [Monitoring and Alerts](#monitoring-and-alerts)
- [Troubleshooting](#troubleshooting)
## Overview
Spywatcher uses a multi-strategy deployment approach with:
- **Infrastructure as Code**: Terraform for AWS infrastructure
- **Container Orchestration**: Kubernetes (EKS) for application deployment
- **Package Management**: Helm charts for simplified deployments
- **Deployment Strategies**: Rolling, Blue-Green, and Canary deployments
- **CI/CD**: GitHub Actions for automated deployments
## Infrastructure Setup
### Prerequisites
1. AWS Account with appropriate permissions
2. AWS CLI configured
3. kubectl installed
4. Terraform installed (>= 1.5.0)
5. Helm installed (>= 3.0)
### Terraform Infrastructure
The infrastructure is defined in Terraform modules:
```bash
cd terraform
# Initialize Terraform
terraform init
# Review the plan
terraform plan -var-file="environments/production/terraform.tfvars"
# Apply infrastructure
terraform apply -var-file="environments/production/terraform.tfvars"
```
#### Infrastructure Components
- **VPC**: Isolated network with public, private, and database subnets across 3 AZs
- **EKS Cluster**: Kubernetes cluster with managed node groups
- **RDS PostgreSQL**: Managed database with encryption and automated backups
- **ElastiCache Redis**: In-memory cache with cluster mode
- **Application Load Balancer**: With WAF for security
- **Security Groups**: Least-privilege network access
- **IAM Roles**: Service accounts and node permissions
### Configure kubectl
After infrastructure deployment:
```bash
aws eks update-kubeconfig --name spywatcher-production --region us-east-1
kubectl cluster-info
```
## Deployment Strategies
### Rolling Deployment (Default)
Updates pods gradually, maintaining service availability.
```bash
# Triggered automatically on push to main branch
# Or manually via GitHub Actions UI
```
**Advantages:**
- Simple and predictable
- Zero downtime
- Automatic rollback on failure
**Disadvantages:**
- Gradual rollout may take time
- Both versions run simultaneously during update
### Blue-Green Deployment
Maintains two identical environments, switching traffic instantly.
```bash
# Via GitHub Actions
# Select "blue-green" as deployment strategy
# Or manually
IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
# Rollback if needed
./scripts/deployment/blue-green-deploy.sh --rollback
```
**Advantages:**
- Instant traffic switch
- Easy rollback
- Full environment testing before switch
**Disadvantages:**
- Requires double resources temporarily
- Database migrations must be compatible with both versions
### Canary Deployment
Gradually shifts traffic to new version while monitoring metrics.
```bash
# Via GitHub Actions
# Select "canary" as deployment strategy
# Or manually
IMAGE_TAG=latest CANARY_STEPS="5 25 50 100" ./scripts/deployment/canary-deploy.sh
```
**Advantages:**
- Risk mitigation through gradual rollout
- Real-world testing with subset of users
- Automated rollback on errors
**Disadvantages:**
- Longer deployment time
- Requires robust monitoring
## Kubernetes Deployment
### Using Kustomize
Deploy to different environments:
```bash
# Production
kubectl apply -k k8s/overlays/production
# Staging
kubectl apply -k k8s/overlays/staging
# Development (base)
kubectl apply -k k8s/base
```
### Manual Deployment
```bash
# Create namespace
kubectl apply -f k8s/base/namespace.yaml
# Apply configurations
kubectl apply -f k8s/base/configmap.yaml
kubectl apply -f k8s/base/secrets.yaml
# Deploy databases
kubectl apply -f k8s/base/postgres-statefulset.yaml
kubectl apply -f k8s/base/redis-statefulset.yaml
# Deploy applications
kubectl apply -f k8s/base/backend-deployment.yaml
kubectl apply -f k8s/base/frontend-deployment.yaml
# Create services
kubectl apply -f k8s/base/backend-service.yaml
kubectl apply -f k8s/base/frontend-service.yaml
# Configure ingress
kubectl apply -f k8s/base/ingress.yaml
```
### Scaling
```bash
# Manual scaling
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
# Auto-scaling is configured via HPA
kubectl get hpa -n spywatcher
```
## Helm Charts
### Installation
```bash
# Install with default values
helm install spywatcher ./helm/spywatcher -n spywatcher --create-namespace
# Install with custom values
helm install spywatcher ./helm/spywatcher \
-n spywatcher \
--create-namespace \
-f helm/spywatcher/values-production.yaml
```
### Upgrade
```bash
helm upgrade spywatcher ./helm/spywatcher -n spywatcher
```
### Rollback
```bash
# List releases
helm history spywatcher -n spywatcher
# Rollback to previous version
helm rollback spywatcher -n spywatcher
# Rollback to specific revision
helm rollback spywatcher 2 -n spywatcher
```
## CI/CD Pipeline
### GitHub Actions Workflow
The deployment pipeline is triggered by:
1. Push to `main` branch (automatic)
2. Manual workflow dispatch
#### Pipeline Steps
1. **Build and Push**
- Build Docker images for backend and frontend
- Push to GitHub Container Registry
- Tag with commit SHA and latest
2. **Database Migration**
- Run Prisma migrations
- Verify migration success
3. **Deploy**
- Apply selected deployment strategy
- Update Kubernetes deployments
- Monitor rollout status
4. **Smoke Tests**
- Health check endpoints
- Basic functionality tests
5. **Rollback on Failure**
- Automatic rollback if deployment fails
- Notification to team
### Required Secrets
Configure in GitHub repository settings:
```
AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY
DATABASE_URL
REDIS_URL
JWT_SECRET
JWT_REFRESH_SECRET
DISCORD_BOT_TOKEN
DISCORD_CLIENT_ID
DISCORD_CLIENT_SECRET
SLACK_WEBHOOK (optional)
```
## Rollback Procedures
### Kubernetes Rollback
```bash
# View rollout history
kubectl rollout history deployment/spywatcher-backend -n spywatcher
# Rollback to previous version
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
# Rollback to specific revision
kubectl rollout undo deployment/spywatcher-backend --to-revision=2 -n spywatcher
# Check rollback status
kubectl rollout status deployment/spywatcher-backend -n spywatcher
```
### Blue-Green Rollback
```bash
./scripts/deployment/blue-green-deploy.sh --rollback
```
### Database Rollback
```bash
# If migration needs to be rolled back
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- npx prisma migrate resolve --rolled-back <migration_name>
```
## Monitoring and Alerts
### Health Checks
```bash
# Liveness probe
curl https://api.spywatcher.example.com/health/live
# Readiness probe
curl https://api.spywatcher.example.com/health/ready
```
### Kubernetes Monitoring
```bash
# Check pod status
kubectl get pods -n spywatcher
# View pod logs
kubectl logs -f deployment/spywatcher-backend -n spywatcher
# Check events
kubectl get events -n spywatcher --sort-by='.lastTimestamp'
# Resource usage
kubectl top pods -n spywatcher
kubectl top nodes
```
### CloudWatch Metrics
Monitor via AWS CloudWatch:
- EKS cluster metrics
- RDS performance metrics
- ElastiCache metrics
- ALB request metrics
## Troubleshooting
### Pod Not Starting
```bash
# Describe pod to see events
kubectl describe pod <pod-name> -n spywatcher
# Check logs
kubectl logs <pod-name> -n spywatcher
# Check resource constraints
kubectl describe node <node-name>
```
### Database Connection Issues
```bash
# Verify database secret
kubectl get secret spywatcher-secrets -n spywatcher -o yaml
# Test database connection
kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -n spywatcher -- \
psql -h <rds-endpoint> -U spywatcher -d spywatcher
```
### Traffic Not Routing
```bash
# Check service endpoints
kubectl get endpoints -n spywatcher
# Check ingress
kubectl describe ingress spywatcher-ingress -n spywatcher
# Check ALB target groups
aws elbv2 describe-target-health --target-group-arn <arn>
```
### High Resource Usage
```bash
# Check HPA status
kubectl get hpa -n spywatcher
# Scale manually if needed
kubectl scale deployment spywatcher-backend --replicas=10 -n spywatcher
# Check resource limits
kubectl describe deployment spywatcher-backend -n spywatcher
```
## Best Practices
1. **Always test in staging first**
2. **Run database migrations before deploying code**
3. **Use feature flags for risky changes**
4. **Monitor error rates during deployment**
5. **Keep rollback scripts ready**
6. **Document all configuration changes**
7. **Regular backup testing**
8. **Security patches applied promptly**
## Support
For deployment issues:
- Check GitHub Actions logs
- Review CloudWatch logs
- Contact DevOps team
- Create incident in issue tracker

351
INFRASTRUCTURE.md Normal file
View File

@@ -0,0 +1,351 @@
# Infrastructure Overview
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────┐
│ AWS Cloud │
│ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ VPC (10.0.0.0/16) │ │
│ │ │ │
│ │ ┌────────────────────────────────────────────────────┐ │ │
│ │ │ Application Load Balancer (ALB) │ │ │
│ │ │ with WAF Protection │ │ │
│ │ └──────────────────┬─────────────────────────────────┘ │ │
│ │ │ │ │
│ │ ┌──────────────────┴────────────────────────┐ │ │
│ │ │ EKS Cluster (Kubernetes) │ │ │
│ │ │ │ │ │
│ │ │ ┌────────────────┐ ┌─────────────────┐ │ │ │
│ │ │ │ Backend │ │ Frontend │ │ │ │
│ │ │ │ Pods (3) │ │ Pods (2) │ │ │ │
│ │ │ │ │ │ │ │ │ │
│ │ │ │ - Auto-scaling │ │ - Auto-scaling │ │ │ │
│ │ │ │ - Health checks│ │ - Health checks │ │ │ │
│ │ │ └────────┬───────┘ └────────┬────────┘ │ │ │
│ │ │ │ │ │ │ │
│ │ │ └───────┬───────────┘ │ │ │
│ │ │ │ │ │ │
│ │ └───────────────────┼──────────────────────┘ │ │
│ │ │ │ │
│ │ ┌───────────────────┼──────────────────────────────┐ │ │
│ │ │ Database Subnets │ │ │ │
│ │ │ │ │ │ │
│ │ │ ┌────────────────▼────────┐ ┌───────────────┐ │ │ │
│ │ │ │ RDS PostgreSQL 15 │ │ ElastiCache │ │ │ │
│ │ │ │ │ │ Redis │ │ │ │
│ │ │ │ - Multi-AZ │ │ │ │ │ │
│ │ │ │ - Encrypted │ │ - Encrypted │ │ │ │
│ │ │ │ - Automated Backups │ │ - Failover │ │ │ │
│ │ │ └─────────────────────────┘ └───────────────┘ │ │ │
│ │ └──────────────────────────────────────────────────┘ │ │
│ │ │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────┐ ┌──────────────────┐ │
│ │ CloudWatch │ │ Secrets Manager │ │
│ │ Monitoring │ │ Credentials │ │
│ └──────────────────┘ └──────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ GitHub Actions │
│ │
│ Build → Test → Deploy → Smoke Tests → Monitor │
│ │
└─────────────────────────────────────────────────────────────────┘
```
## Components
### Compute
- **EKS Cluster**: Managed Kubernetes cluster (v1.28)
- **Node Groups**: Auto-scaling EC2 instances (t3.large)
- **Pods**: Containerized applications with health checks
### Networking
- **VPC**: Isolated network (10.0.0.0/16)
- **Subnets**: Public, Private, and Database across 3 AZs
- **NAT Gateways**: Internet access for private subnets
- **ALB**: HTTPS termination and routing
### Data Storage
- **RDS PostgreSQL**: Managed database (15.3)
- Multi-AZ for high availability
- Automated backups (7 days retention)
- Encryption at rest (KMS)
- **ElastiCache Redis**: In-memory cache (7.0)
- Authentication token
- Encryption in transit
- Automatic failover
### Security
- **WAF**: Web Application Firewall with rate limiting
- **Security Groups**: Network-level access control
- **IAM Roles**: Fine-grained permissions
- **Secrets Manager**: Secure credential storage
- **TLS/SSL**: End-to-end encryption
### Monitoring
- **CloudWatch**: Metrics, logs, and alarms
- **Health Checks**: Liveness and readiness probes
- **Resource Metrics**: CPU, memory, network usage
## Resource Sizing
### Production Environment
| Component | Type | Specs | Replicas | Scaling |
|-----------|------|-------|----------|---------|
| Backend | Pod | 512Mi RAM, 500m CPU | 3 | 2-10 |
| Frontend | Pod | 128Mi RAM, 100m CPU | 2 | 2-5 |
| PostgreSQL | RDS | db.t3.large | 1 (Multi-AZ) | Manual |
| Redis | ElastiCache | cache.t3.medium | 2 | Manual |
| EKS Nodes | EC2 | t3.large | 3 | 2-10 |
### Staging Environment
| Component | Type | Specs | Replicas | Scaling |
|-----------|------|-------|----------|---------|
| Backend | Pod | 256Mi RAM, 250m CPU | 1 | 1-3 |
| Frontend | Pod | 128Mi RAM, 100m CPU | 1 | 1-2 |
| PostgreSQL | RDS | db.t3.medium | 1 | N/A |
| Redis | ElastiCache | cache.t3.small | 1 | N/A |
| EKS Nodes | EC2 | t3.medium | 2 | 1-4 |
## Cost Estimation
### Monthly Costs (US East 1)
#### Production
- EKS Cluster: $73
- EC2 Nodes (3x t3.large): ~$150
- RDS PostgreSQL (db.t3.large, Multi-AZ): ~$290
- ElastiCache Redis (cache.t3.medium x2): ~$100
- ALB: ~$25
- Data Transfer: ~$50
- Backups & Monitoring: ~$30
**Total: ~$718/month**
#### Staging
- EKS Cluster: $73
- EC2 Nodes (2x t3.medium): ~$60
- RDS PostgreSQL (db.t3.medium): ~$70
- ElastiCache Redis (cache.t3.small): ~$25
- ALB: ~$25
- Data Transfer: ~$20
**Total: ~$273/month**
*Note: Costs are estimates and may vary based on usage*
## Deployment Strategies
### 1. Rolling Update (Default)
- **Use Case**: Standard deployments
- **Downtime**: Zero
- **Risk**: Low
- **Duration**: 5-10 minutes
### 2. Blue-Green
- **Use Case**: Major releases, critical changes
- **Downtime**: Zero
- **Risk**: Very Low (instant rollback)
- **Duration**: 10-15 minutes
### 3. Canary
- **Use Case**: High-risk changes, gradual rollout
- **Downtime**: Zero
- **Risk**: Minimal (gradual exposure)
- **Duration**: 30-60 minutes
## High Availability
### Application Layer
- Multiple replicas across availability zones
- Pod anti-affinity rules
- Pod disruption budgets (min 1 available)
- Health checks with automatic restart
### Database Layer
- Multi-AZ deployment for RDS
- Automated failover (< 60 seconds)
- Read replicas for scaling (optional)
- Point-in-time recovery
### Network Layer
- Multi-AZ load balancing
- Health checks on targets
- Automatic target deregistration
- DDoS protection (AWS Shield)
## Disaster Recovery
### RTO (Recovery Time Objective)
- Application: < 5 minutes
- Database: < 1 minute (automated failover)
- Full Infrastructure: < 30 minutes (Terraform redeploy)
### RPO (Recovery Point Objective)
- Database: < 5 minutes (automated backups)
- Application: 0 (stateless, recreatable)
### Backup Strategy
- **Database**: Daily automated backups (7 days retention)
- **Configuration**: Git repository (versioned)
- **Infrastructure**: Terraform state (versioned in S3)
## Security Measures
### Network Security
- Private subnets for application and database
- Security groups with least-privilege rules
- Network ACLs
- VPC Flow Logs
### Application Security
- Containers run as non-root
- Read-only root filesystems where possible
- No privilege escalation
- Security scanning in CI/CD
### Data Security
- Encryption at rest (KMS)
- Encryption in transit (TLS 1.2+)
- Secrets stored in AWS Secrets Manager
- Database credentials auto-rotated
### Access Control
- IAM roles with least privilege
- RBAC in Kubernetes
- MFA for admin access
- Audit logging enabled
## Scaling Strategy
### Horizontal Scaling
- **Triggers**:
- CPU > 70%
- Memory > 80%
- Custom metrics (request rate)
- **Limits**:
- Backend: 2-10 pods
- Frontend: 2-5 pods
- Nodes: 2-10 instances
### Vertical Scaling
- Database: Manual scaling with downtime
- Redis: Manual scaling with failover
- Pods: Update resource limits and restart
## Monitoring Strategy
### Application Metrics
- Request rate and latency
- Error rate
- Active connections
- Cache hit rate
### Infrastructure Metrics
- CPU utilization
- Memory utilization
- Network throughput
- Disk I/O
### Business Metrics
- Active users
- API usage per tier
- Feature usage
- User sessions
### Alerting
- Critical: Page immediately
- Service down
- Database unavailable
- High error rate
- Warning: Notify during business hours
- High CPU/memory
- Low disk space
- Elevated response time
## Maintenance Windows
### Planned Maintenance
- **Schedule**: Sundays 02:00-04:00 UTC
- **Notification**: 7 days advance notice
- **Activities**:
- OS patches
- Database maintenance
- Kubernetes upgrades
- SSL certificate renewal
### Emergency Maintenance
- Immediate security patches
- Critical bug fixes
- Infrastructure failures
## Compliance & Governance
### Tagging Strategy
All resources tagged with:
- `Environment`: production/staging
- `Project`: spywatcher
- `ManagedBy`: terraform
- `CostCenter`: engineering
### Resource Naming
- Pattern: `{project}-{environment}-{resource}`
- Example: `spywatcher-production-backend`
### Access Audit
- CloudTrail enabled
- Quarterly access review
- Regular security audits
## Quick Reference
### Useful Commands
```bash
# Check cluster status
kubectl cluster-info
kubectl get nodes
# View application status
kubectl get all -n spywatcher
# View logs
kubectl logs -f deployment/spywatcher-backend -n spywatcher
# Scale application
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
# Rollback deployment
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
# Database backup
aws rds create-db-snapshot --db-instance-identifier spywatcher-production
# View CloudWatch alarms
aws cloudwatch describe-alarms --state-value ALARM
```
### Important URLs
- Production: https://spywatcher.example.com
- API: https://api.spywatcher.example.com
- Staging: https://staging.spywatcher.example.com
- Grafana: https://grafana.spywatcher.example.com
- AWS Console: https://console.aws.amazon.com
### Support Contacts
- On-Call: oncall@spywatcher.example.com
- DevOps: devops@spywatcher.example.com
- Security: security@spywatcher.example.com

View File

@@ -453,6 +453,69 @@ Git hooks are automatically installed when you run `npm install` in the root dir
See [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines.
## 🚀 Production Deployment
Spywatcher includes comprehensive production deployment infrastructure with Kubernetes, Terraform, and CI/CD automation.
### Deployment Strategies
- **Rolling Updates**: Zero-downtime gradual deployment (default)
- **Blue-Green**: Instant traffic switching with quick rollback
- **Canary**: Gradual rollout with automated error detection
### Infrastructure as Code
- **Terraform**: Complete AWS infrastructure modules
- VPC with multi-AZ setup
- EKS Kubernetes cluster
- RDS PostgreSQL (Multi-AZ, encrypted)
- ElastiCache Redis (encrypted, failover)
- Application Load Balancer with WAF
- **Kubernetes**: Production-ready manifests
- Auto-scaling with HorizontalPodAutoscaler
- Health checks and pod disruption budgets
- Security contexts and network policies
- **Helm Charts**: Simplified deployment and configuration
### Quick Deployment
```bash
# Deploy with Terraform
cd terraform
terraform init
terraform apply -var-file="environments/production/terraform.tfvars"
# Deploy with Kubernetes
kubectl apply -k k8s/overlays/production
# Deploy with Helm
helm install spywatcher ./helm/spywatcher -n spywatcher
# Blue-green deployment
./scripts/deployment/blue-green-deploy.sh
# Canary deployment
./scripts/deployment/canary-deploy.sh
```
### Documentation
- **[DEPLOYMENT.md](./DEPLOYMENT.md)** - Complete deployment guide
- **[INFRASTRUCTURE.md](./INFRASTRUCTURE.md)** - Architecture overview
- **[terraform/README.md](./terraform/README.md)** - Infrastructure as Code guide
- **[k8s/README.md](./k8s/README.md)** - Kubernetes manifests guide
### CI/CD Pipeline
GitHub Actions workflows for automated deployment:
- Docker image building and pushing to GHCR
- Database migrations
- Multiple deployment strategy support
- Automated smoke tests and health checks
- Rollback on failure
See [.github/workflows/deploy-production.yml](./.github/workflows/deploy-production.yml) for the complete pipeline.
## 👥 Contributions
See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines on contributing to this project.

View File

@@ -0,0 +1,15 @@
apiVersion: v2
name: spywatcher
description: A Helm chart for Spywatcher Discord surveillance and analytics application
type: application
version: 1.0.0
appVersion: "1.0.0"
keywords:
- discord
- monitoring
- analytics
maintainers:
- name: Spywatcher Team
home: https://github.com/subculture-collective/discord-spywatcher
sources:
- https://github.com/subculture-collective/discord-spywatcher

View File

@@ -0,0 +1,65 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "spywatcher.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
*/}}
{{- define "spywatcher.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "spywatcher.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "spywatcher.labels" -}}
helm.sh/chart: {{ include "spywatcher.chart" . }}
{{ include "spywatcher.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "spywatcher.selectorLabels" -}}
app.kubernetes.io/name: {{ include "spywatcher.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Backend labels
*/}}
{{- define "spywatcher.backend.labels" -}}
{{ include "spywatcher.labels" . }}
app.kubernetes.io/component: backend
{{- end }}
{{/*
Frontend labels
*/}}
{{- define "spywatcher.frontend.labels" -}}
{{ include "spywatcher.labels" . }}
app.kubernetes.io/component: frontend
{{- end }}

View File

@@ -0,0 +1,11 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "spywatcher.fullname" . }}-config
namespace: {{ .Values.namespace }}
labels:
{{- include "spywatcher.labels" . | nindent 4 }}
data:
{{- range $key, $value := .Values.configMap.data }}
{{ $key }}: {{ $value | quote }}
{{- end }}

View File

@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: {{ .Values.namespace }}
labels:
{{- include "spywatcher.labels" . | nindent 4 }}

View File

@@ -0,0 +1,20 @@
apiVersion: v1
kind: Secret
metadata:
name: {{ include "spywatcher.fullname" . }}-secrets
namespace: {{ .Values.namespace }}
labels:
{{- include "spywatcher.labels" . | nindent 4 }}
type: Opaque
stringData:
database-url: {{ .Values.secrets.databaseUrl | quote }}
redis-url: {{ .Values.secrets.redisUrl | quote }}
jwt-secret: {{ .Values.secrets.jwtSecret | quote }}
jwt-refresh-secret: {{ .Values.secrets.jwtRefreshSecret | quote }}
discord-bot-token: {{ .Values.secrets.discordBotToken | quote }}
discord-client-id: {{ .Values.secrets.discordClientId | quote }}
discord-client-secret: {{ .Values.secrets.discordClientSecret | quote }}
discord-guild-id: {{ .Values.secrets.discordGuildId | quote }}
discord-redirect-uri: {{ .Values.secrets.discordRedirectUri | quote }}
admin-discord-ids: {{ .Values.secrets.adminDiscordIds | quote }}
bot-guild-ids: {{ .Values.secrets.botGuildIds | quote }}

View File

@@ -0,0 +1,115 @@
# Production Environment Values
# Override default values for production deployment
global:
environment: production
namespace: spywatcher
image:
backend:
repository: ghcr.io/subculture-collective/spywatcher-backend
tag: latest
pullPolicy: Always
frontend:
repository: ghcr.io/subculture-collective/spywatcher-frontend
tag: latest
pullPolicy: Always
backend:
enabled: true
replicaCount: 3
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
env:
NODE_ENV: production
PORT: "3001"
LOG_LEVEL: info
frontend:
enabled: true
replicaCount: 2
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
env:
VITE_API_URL: "https://api.spywatcher.example.com"
# Use managed services instead of in-cluster databases
postgresql:
enabled: false
redis:
enabled: false
ingress:
enabled: true
className: nginx
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
nginx.ingress.kubernetes.io/rate-limit: "100"
hosts:
- host: spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-frontend
port: 80
- host: api.spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-backend
port: 80
tls:
- secretName: spywatcher-tls-cert
hosts:
- spywatcher.example.com
- api.spywatcher.example.com
podDisruptionBudget:
enabled: true
minAvailable: 2
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001
# Production-specific node affinity
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: node.kubernetes.io/instance-type
operator: In
values:
- t3.large
- t3a.large

View File

@@ -0,0 +1,104 @@
# Staging Environment Values
# Override default values for staging deployment
global:
environment: staging
namespace: spywatcher-staging
image:
backend:
repository: ghcr.io/subculture-collective/spywatcher-backend
tag: staging
pullPolicy: Always
frontend:
repository: ghcr.io/subculture-collective/spywatcher-frontend
tag: staging
pullPolicy: Always
backend:
enabled: true
replicaCount: 1
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 3
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
env:
NODE_ENV: staging
PORT: "3001"
LOG_LEVEL: debug
frontend:
enabled: true
replicaCount: 1
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
env:
VITE_API_URL: "https://api-staging.spywatcher.example.com"
# Use in-cluster databases for staging
postgresql:
enabled: true
primary:
persistence:
size: 10Gi
redis:
enabled: true
master:
persistence:
size: 5Gi
ingress:
enabled: true
className: nginx
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-staging"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
hosts:
- host: staging.spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-frontend
port: 80
- host: api-staging.spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-backend
port: 80
tls:
- secretName: spywatcher-staging-tls-cert
hosts:
- staging.spywatcher.example.com
- api-staging.spywatcher.example.com
podDisruptionBudget:
enabled: false
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001

212
helm/spywatcher/values.yaml Normal file
View File

@@ -0,0 +1,212 @@
# Default values for spywatcher
# This is a YAML-formatted file.
# Global settings
global:
environment: production
# Namespace
namespace: spywatcher
# Image settings
image:
backend:
repository: ghcr.io/subculture-collective/spywatcher-backend
tag: latest
pullPolicy: Always
frontend:
repository: ghcr.io/subculture-collective/spywatcher-frontend
tag: latest
pullPolicy: Always
imagePullSecrets: []
# Backend configuration
backend:
enabled: true
replicaCount: 3
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
autoscaling:
enabled: true
minReplicas: 2
maxReplicas: 10
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
service:
type: ClusterIP
port: 80
targetPort: 3001
env:
NODE_ENV: production
PORT: "3001"
LOG_LEVEL: info
# Health check configuration
livenessProbe:
httpGet:
path: /health/live
port: 3001
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health/ready
port: 3001
initialDelaySeconds: 10
periodSeconds: 5
# Frontend configuration
frontend:
enabled: true
replicaCount: 2
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
service:
type: ClusterIP
port: 80
targetPort: 80
env:
VITE_API_URL: "https://api.spywatcher.example.com"
# PostgreSQL configuration
postgresql:
enabled: true
image: postgres:15-alpine
auth:
username: spywatcher
database: spywatcher
# Password should be set via --set or separate values file
existingSecret: postgres-secret
secretKeys:
adminPasswordKey: password
primary:
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
persistence:
enabled: true
size: 20Gi
storageClass: ""
# Redis configuration
redis:
enabled: true
image: redis:7-alpine
auth:
enabled: true
existingSecret: redis-secret
existingSecretPasswordKey: password
master:
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
persistence:
enabled: true
size: 10Gi
storageClass: ""
# Ingress configuration
ingress:
enabled: true
className: nginx
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
hosts:
- host: spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-frontend
port: 80
- host: api.spywatcher.example.com
paths:
- path: /
pathType: Prefix
service: spywatcher-backend
port: 80
tls:
- secretName: spywatcher-tls-cert
hosts:
- spywatcher.example.com
- api.spywatcher.example.com
# ConfigMap data
configMap:
data:
NODE_ENV: "production"
PORT: "3001"
LOG_LEVEL: "info"
RATE_LIMIT_WINDOW_MS: "900000"
RATE_LIMIT_MAX_REQUESTS: "100"
# Secrets (should be provided externally)
secrets:
# Database
databaseUrl: ""
# Redis
redisUrl: ""
# JWT
jwtSecret: ""
jwtRefreshSecret: ""
# Discord
discordBotToken: ""
discordClientId: ""
discordClientSecret: ""
discordGuildId: ""
discordRedirectUri: ""
adminDiscordIds: ""
botGuildIds: ""
# Pod Disruption Budget
podDisruptionBudget:
enabled: true
minAvailable: 1
# Security context
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001
# Node affinity and tolerations
affinity: {}
tolerations: []
nodeSelector: {}

11
k8s/.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Ignore secret files that contain sensitive data
secrets/
*.secret.yaml
*-secrets.yaml
# Ignore generated manifests
generated/
# Ignore local development files
*.local.yaml
local/

377
k8s/README.md Normal file
View File

@@ -0,0 +1,377 @@
# Kubernetes Manifests
This directory contains Kubernetes manifests for deploying Spywatcher.
## Directory Structure
```
k8s/
├── base/ # Base manifests
│ ├── namespace.yaml # Namespace and resource quotas
│ ├── configmap.yaml # Application configuration
│ ├── secrets.yaml # Secrets template (DO NOT commit actual secrets)
│ ├── migration-job.yaml # Database migration job
│ ├── backend-deployment.yaml
│ ├── backend-service.yaml
│ ├── backend-hpa.yaml # Horizontal Pod Autoscaler
│ ├── frontend-deployment.yaml
│ ├── frontend-service.yaml
│ ├── postgres-statefulset.yaml
│ ├── redis-statefulset.yaml
│ ├── ingress.yaml
│ ├── pdb.yaml # Pod Disruption Budget
│ └── kustomization.yaml
├── overlays/ # Environment-specific overlays
│ ├── production/
│ └── staging/
└── secrets/ # Actual secrets (gitignored)
```
## Quick Start
### Prerequisites
- kubectl configured with cluster access
- kustomize (built into kubectl >= 1.14)
### Deploy to Production
```bash
# Review what will be deployed
kubectl kustomize k8s/overlays/production
# Apply manifests
kubectl apply -k k8s/overlays/production
# Check deployment status
kubectl get all -n spywatcher
```
### Deploy to Staging
```bash
kubectl apply -k k8s/overlays/staging
kubectl get all -n spywatcher-staging
```
## Configuration Management
### Secrets
**IMPORTANT**: Never commit actual secrets to git!
1. Copy the secrets template:
```bash
cp k8s/base/secrets.yaml k8s/secrets/secrets.yaml
```
2. Edit with actual values:
```bash
vim k8s/secrets/secrets.yaml
```
3. Apply separately:
```bash
kubectl apply -f k8s/secrets/secrets.yaml
```
### ConfigMap
Application configuration is in `k8s/base/configmap.yaml`. Environment-specific values can be patched in overlays.
## Deployment Strategies
### Rolling Update (Default)
```bash
# Update image
kubectl set image deployment/spywatcher-backend \
backend=ghcr.io/subculture-collective/spywatcher-backend:v2.0.0 \
-n spywatcher
# Watch rollout
kubectl rollout status deployment/spywatcher-backend -n spywatcher
```
### Blue-Green Deployment
Use the provided script:
```bash
./scripts/deployment/blue-green-deploy.sh
```
### Canary Deployment
Use the provided script:
```bash
./scripts/deployment/canary-deploy.sh
```
## Scaling
### Manual Scaling
```bash
# Scale backend
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
# Scale frontend
kubectl scale deployment spywatcher-frontend --replicas=3 -n spywatcher
```
### Auto-scaling
HorizontalPodAutoscaler is configured to scale based on:
- CPU utilization (target: 70%)
- Memory utilization (target: 80%)
```bash
# Check HPA status
kubectl get hpa -n spywatcher
# Describe HPA
kubectl describe hpa spywatcher-backend-hpa -n spywatcher
```
## Monitoring
### Check Pod Status
```bash
# List all pods
kubectl get pods -n spywatcher
# Describe pod
kubectl describe pod <pod-name> -n spywatcher
# View logs
kubectl logs -f <pod-name> -n spywatcher
# View logs from all replicas
kubectl logs -f deployment/spywatcher-backend -n spywatcher
```
### Health Checks
```bash
# Test liveness probe
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
wget -qO- http://localhost:3001/health/live
# Test readiness probe
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
wget -qO- http://localhost:3001/health/ready
```
### Resource Usage
```bash
# Pod resource usage
kubectl top pods -n spywatcher
# Node resource usage
kubectl top nodes
```
## Troubleshooting
### Pod Not Starting
```bash
# Check events
kubectl get events -n spywatcher --sort-by='.lastTimestamp'
# Describe pod
kubectl describe pod <pod-name> -n spywatcher
# Check logs
kubectl logs <pod-name> -n spywatcher --previous # Previous container
```
### Network Issues
```bash
# Check services
kubectl get services -n spywatcher
# Check endpoints
kubectl get endpoints -n spywatcher
# Test service from within cluster
kubectl run -it --rm debug --image=busybox --restart=Never -n spywatcher -- \
wget -qO- http://spywatcher-backend/health/live
```
### Database Connection
```bash
# Check database pod
kubectl get pods -n spywatcher | grep postgres
# Test database connection
kubectl exec -it postgres-0 -n spywatcher -- \
psql -U spywatcher -d spywatcher -c "SELECT version();"
# Check database logs
kubectl logs postgres-0 -n spywatcher
```
### Redis Connection
```bash
# Check Redis pod
kubectl get pods -n spywatcher | grep redis
# Test Redis connection
kubectl exec -it redis-0 -n spywatcher -- redis-cli ping
# Check Redis logs
kubectl logs redis-0 -n spywatcher
```
## Maintenance
### Update Configuration
```bash
# Edit configmap
kubectl edit configmap spywatcher-config -n spywatcher
# Restart pods to pick up changes
kubectl rollout restart deployment/spywatcher-backend -n spywatcher
```
### Database Migrations
Database migrations are run as a separate Kubernetes Job to avoid race conditions.
Migrations should be run before deploying new application versions.
```bash
# Create a unique migration job
JOB_NAME="db-migration-$(date +%s)"
kubectl create job $JOB_NAME --from=job/spywatcher-db-migration -n spywatcher
# Or apply the migration job directly (it will run once)
kubectl apply -f k8s/base/migration-job.yaml
# Check migration status
kubectl get jobs -n spywatcher
# View migration logs
kubectl logs job/$JOB_NAME -n spywatcher
# Delete completed migration jobs (optional, they auto-delete after 1 hour)
kubectl delete job $JOB_NAME -n spywatcher
```
**Important:** The migration job uses `completions: 1` and `parallelism: 1` to ensure
only one migration runs at a time, preventing race conditions and deadlocks.
### Backup
```bash
# Backup PostgreSQL
kubectl exec postgres-0 -n spywatcher -- \
pg_dump -U spywatcher spywatcher > backup.sql
# Backup Redis
kubectl exec redis-0 -n spywatcher -- \
redis-cli BGSAVE
```
## Security
### Network Policies
Network policies restrict traffic between pods:
- Backend can connect to: PostgreSQL, Redis
- Frontend can connect to: Backend
- External traffic: Ingress only
### RBAC
Service accounts with minimal permissions:
- `spywatcher-backend`: Access to secrets, configmaps
- `spywatcher-frontend`: Read-only access
### Secrets
- Use Sealed Secrets or External Secrets Operator for production
- Never commit unencrypted secrets
- Rotate secrets regularly
## Ingress
### NGINX Ingress Controller
Install if not already present:
```bash
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm install nginx-ingress ingress-nginx/ingress-nginx
```
### Cert-Manager
Install for automatic SSL certificates:
```bash
helm repo add jetstack https://charts.jetstack.io
helm install cert-manager jetstack/cert-manager \
--namespace cert-manager \
--create-namespace \
--set installCRDs=true
```
Create ClusterIssuer:
```yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
server: https://acme-v02.api.letsencrypt.org/directory
email: your-email@example.com
privateKeySecretRef:
name: letsencrypt-prod
solvers:
- http01:
ingress:
class: nginx
```
## Clean Up
### Delete Resources
```bash
# Delete all resources in namespace
kubectl delete namespace spywatcher
# Or use kustomize
kubectl delete -k k8s/overlays/production
```
### Persistent Data
⚠️ **WARNING**: Deleting PVCs will delete all data!
```bash
# List PVCs
kubectl get pvc -n spywatcher
# Delete specific PVC
kubectl delete pvc postgres-data-postgres-0 -n spywatcher
```
## Best Practices
1. **Use namespaces**: Separate environments with namespaces
2. **Resource limits**: Always set requests and limits
3. **Health checks**: Configure liveness and readiness probes
4. **Security context**: Run containers as non-root
5. **Pod disruption budgets**: Ensure high availability
6. **Horizontal scaling**: Use HPA for dynamic scaling
7. **Rolling updates**: Use for zero-downtime deployments
8. **Monitoring**: Integrate with Prometheus/Grafana
9. **Logging**: Centralize logs with ELK or Loki
10. **Backups**: Regular backups of persistent data

View File

@@ -0,0 +1,193 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-backend
namespace: spywatcher
labels:
app: spywatcher
tier: backend
version: v1
spec:
replicas: 3
revisionHistoryLimit: 10
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: spywatcher
tier: backend
template:
metadata:
labels:
app: spywatcher
tier: backend
version: v1
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3001"
prometheus.io/path: "/metrics"
spec:
# Security context for pod
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001
# Anti-affinity to spread pods across nodes
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- spywatcher
- key: tier
operator: In
values:
- backend
topologyKey: kubernetes.io/hostname
containers:
- name: backend
image: ghcr.io/subculture-collective/spywatcher-backend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 3001
protocol: TCP
# Environment variables from ConfigMap
envFrom:
- configMapRef:
name: spywatcher-config
# Environment variables from Secrets
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: database-url
- name: REDIS_URL
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: redis-url
- name: JWT_SECRET
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: jwt-secret
- name: JWT_REFRESH_SECRET
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: jwt-refresh-secret
- name: DISCORD_BOT_TOKEN
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-bot-token
- name: DISCORD_CLIENT_ID
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-client-id
- name: DISCORD_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-client-secret
- name: DISCORD_GUILD_ID
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-guild-id
- name: DISCORD_REDIRECT_URI
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-redirect-uri
- name: ADMIN_DISCORD_IDS
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: admin-discord-ids
- name: BOT_GUILD_IDS
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: bot-guild-ids
# Resource limits and requests
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
# Liveness probe - checks if container is alive
livenessProbe:
httpGet:
path: /health/live
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
# Readiness probe - checks if container is ready to serve traffic
readinessProbe:
httpGet:
path: /health/ready
port: http
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 3
successThreshold: 1
failureThreshold: 3
# Startup probe - allows slow starting containers more time
startupProbe:
httpGet:
path: /health/live
port: http
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 3
successThreshold: 1
failureThreshold: 30
# Security context for container
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 1001
capabilities:
drop:
- ALL
# Volume mounts
volumeMounts:
- name: logs
mountPath: /app/logs
# Volumes
volumes:
- name: logs
emptyDir: {}
# Image pull secrets if using private registry
# imagePullSecrets:
# - name: ghcr-secret

49
k8s/base/backend-hpa.yaml Normal file
View File

@@ -0,0 +1,49 @@
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: spywatcher-backend-hpa
namespace: spywatcher
labels:
app: spywatcher
tier: backend
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: spywatcher-backend
minReplicas: 2
maxReplicas: 10
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 50
periodSeconds: 60
- type: Pods
value: 1
periodSeconds: 60
selectPolicy: Min
scaleUp:
stabilizationWindowSeconds: 0
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 2
periodSeconds: 30
selectPolicy: Max
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80

View File

@@ -0,0 +1,24 @@
apiVersion: v1
kind: Service
metadata:
name: spywatcher-backend
namespace: spywatcher
labels:
app: spywatcher
tier: backend
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
spec:
type: ClusterIP
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 10800
selector:
app: spywatcher
tier: backend
ports:
- name: http
port: 80
targetPort: http
protocol: TCP

29
k8s/base/configmap.yaml Normal file
View File

@@ -0,0 +1,29 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: spywatcher-config
namespace: spywatcher
labels:
app: spywatcher
data:
# Application settings
NODE_ENV: "production"
PORT: "3001"
# Logging settings
LOG_LEVEL: "info"
# Rate limiting settings
RATE_LIMIT_WINDOW_MS: "900000"
RATE_LIMIT_MAX_REQUESTS: "100"
# Health check settings
HEALTH_CHECK_INTERVAL: "30"
# Database pool settings
DB_POOL_MIN: "2"
DB_POOL_MAX: "10"
# Redis settings
REDIS_MAX_RETRIES: "3"
REDIS_RETRY_DELAY: "1000"

View File

@@ -0,0 +1,124 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-frontend
namespace: spywatcher
labels:
app: spywatcher
tier: frontend
version: v1
spec:
replicas: 2
revisionHistoryLimit: 10
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: spywatcher
tier: frontend
template:
metadata:
labels:
app: spywatcher
tier: frontend
version: v1
spec:
# Security context for pod
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001
# Anti-affinity to spread pods across nodes
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- spywatcher
- key: tier
operator: In
values:
- frontend
topologyKey: kubernetes.io/hostname
containers:
- name: frontend
image: ghcr.io/subculture-collective/spywatcher-frontend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 80
protocol: TCP
env:
- name: VITE_API_URL
value: "https://api.spywatcher.example.com"
- name: VITE_DISCORD_CLIENT_ID
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: discord-client-id
# Resource limits and requests
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
# Liveness probe
livenessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
# Readiness probe
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
successThreshold: 1
failureThreshold: 3
# Security context for container
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1001
capabilities:
drop:
- ALL
# Volume mounts for nginx cache and temp files
volumeMounts:
- name: cache
mountPath: /var/cache/nginx
- name: run
mountPath: /var/run
# Volumes
volumes:
- name: cache
emptyDir: {}
- name: run
emptyDir: {}

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
name: spywatcher-frontend
namespace: spywatcher
labels:
app: spywatcher
tier: frontend
spec:
type: ClusterIP
selector:
app: spywatcher
tier: frontend
ports:
- name: http
port: 80
targetPort: http
protocol: TCP

76
k8s/base/ingress.yaml Normal file
View File

@@ -0,0 +1,76 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: spywatcher-ingress
namespace: spywatcher
labels:
app: spywatcher
annotations:
# SSL/TLS configuration
cert-manager.io/cluster-issuer: "letsencrypt-prod"
# AWS ALB annotations (if using AWS)
alb.ingress.kubernetes.io/scheme: internet-facing
alb.ingress.kubernetes.io/target-type: ip
alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=60
alb.ingress.kubernetes.io/healthcheck-path: /health/live
alb.ingress.kubernetes.io/healthcheck-interval-seconds: "30"
alb.ingress.kubernetes.io/healthcheck-timeout-seconds: "5"
alb.ingress.kubernetes.io/healthy-threshold-count: "2"
alb.ingress.kubernetes.io/unhealthy-threshold-count: "3"
# NGINX Ingress annotations (if using NGINX)
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
# WebSocket support
nginx.ingress.kubernetes.io/websocket-services: spywatcher-backend
nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
nginx.ingress.kubernetes.io/configuration-snippet: |
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
# Security headers
nginx.ingress.kubernetes.io/server-snippet: |
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
add_header X-Frame-Options "SAMEORIGIN" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-XSS-Protection "1; mode=block" always;
# Rate limiting
nginx.ingress.kubernetes.io/limit-rps: "100"
spec:
ingressClassName: nginx
tls:
- hosts:
- spywatcher.example.com
- api.spywatcher.example.com
secretName: spywatcher-tls-cert
rules:
# Frontend
- host: spywatcher.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: spywatcher-frontend
port:
number: 80
# Backend API
- host: api.spywatcher.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: spywatcher-backend
port:
number: 80

View File

@@ -0,0 +1,23 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: spywatcher
resources:
- namespace.yaml
- configmap.yaml
- secrets.yaml
- postgres-statefulset.yaml
- redis-statefulset.yaml
- migration-job.yaml
- backend-deployment.yaml
- backend-service.yaml
- backend-hpa.yaml
- frontend-deployment.yaml
- frontend-service.yaml
- ingress.yaml
- pdb.yaml
commonLabels:
app.kubernetes.io/name: spywatcher
app.kubernetes.io/managed-by: kustomize

View File

@@ -0,0 +1,65 @@
apiVersion: batch/v1
kind: Job
metadata:
name: spywatcher-db-migration
namespace: spywatcher
labels:
app: spywatcher
component: migration
spec:
# Only keep successful jobs for 1 hour
ttlSecondsAfterFinished: 3600
# Prevent concurrent migrations
completions: 1
parallelism: 1
backoffLimit: 3
template:
metadata:
labels:
app: spywatcher
component: migration
spec:
restartPolicy: Never
# Security context for pod
securityContext:
runAsNonRoot: true
runAsUser: 1001
fsGroup: 1001
containers:
- name: migrate
image: ghcr.io/subculture-collective/spywatcher-backend:latest
imagePullPolicy: Always
command:
- sh
- -c
- |
echo "Starting database migration..."
npx prisma migrate deploy
echo "Migration completed successfully"
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: spywatcher-secrets
key: database-url
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
# Security context for container
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 1001
capabilities:
drop:
- ALL

35
k8s/base/namespace.yaml Normal file
View File

@@ -0,0 +1,35 @@
apiVersion: v1
kind: Namespace
metadata:
name: spywatcher
labels:
name: spywatcher
environment: production
---
apiVersion: v1
kind: ResourceQuota
metadata:
name: spywatcher-quota
namespace: spywatcher
spec:
hard:
requests.cpu: "20"
requests.memory: 40Gi
limits.cpu: "40"
limits.memory: 80Gi
persistentvolumeclaims: "10"
---
apiVersion: v1
kind: LimitRange
metadata:
name: spywatcher-limit-range
namespace: spywatcher
spec:
limits:
- default:
cpu: "1"
memory: 1Gi
defaultRequest:
cpu: "500m"
memory: 512Mi
type: Container

29
k8s/base/pdb.yaml Normal file
View File

@@ -0,0 +1,29 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: spywatcher-backend-pdb
namespace: spywatcher
labels:
app: spywatcher
tier: backend
spec:
minAvailable: 1
selector:
matchLabels:
app: spywatcher
tier: backend
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: spywatcher-frontend-pdb
namespace: spywatcher
labels:
app: spywatcher
tier: frontend
spec:
minAvailable: 1
selector:
matchLabels:
app: spywatcher
tier: frontend

View File

@@ -0,0 +1,113 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
namespace: spywatcher
labels:
app: spywatcher
tier: database
spec:
serviceName: postgres
replicas: 1
selector:
matchLabels:
app: spywatcher
tier: database
template:
metadata:
labels:
app: spywatcher
tier: database
spec:
containers:
- name: postgres
image: postgres:15-alpine
ports:
- name: postgres
containerPort: 5432
protocol: TCP
env:
- name: POSTGRES_DB
value: spywatcher
- name: POSTGRES_USER
value: spywatcher
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: postgres-secret
key: password
- name: PGDATA
value: /var/lib/postgresql/data/pgdata
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
volumeMounts:
- name: postgres-data
mountPath: /var/lib/postgresql/data
livenessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U spywatcher
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- pg_isready -U spywatcher
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
volumeClaimTemplates:
- metadata:
name: postgres-data
labels:
app: spywatcher
tier: database
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 20Gi
---
apiVersion: v1
kind: Service
metadata:
name: postgres
namespace: spywatcher
labels:
app: spywatcher
tier: database
spec:
type: ClusterIP
clusterIP: None
selector:
app: spywatcher
tier: database
ports:
- name: postgres
port: 5432
targetPort: postgres
protocol: TCP
---
apiVersion: v1
kind: Secret
metadata:
name: postgres-secret
namespace: spywatcher
labels:
app: spywatcher
tier: database
type: Opaque
stringData:
password: "CHANGE_ME_IN_PRODUCTION"

View File

@@ -0,0 +1,117 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: redis
namespace: spywatcher
labels:
app: spywatcher
tier: cache
spec:
serviceName: redis
replicas: 1
selector:
matchLabels:
app: spywatcher
tier: cache
template:
metadata:
labels:
app: spywatcher
tier: cache
spec:
containers:
- name: redis
image: redis:7-alpine
command:
- redis-server
- --appendonly
- "yes"
- --requirepass
- $(REDIS_PASSWORD)
ports:
- name: redis
containerPort: 6379
protocol: TCP
env:
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: redis-secret
key: password
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumeMounts:
- name: redis-data
mountPath: /data
livenessProbe:
exec:
command:
- redis-cli
- --no-auth-warning
- -a
- $(REDIS_PASSWORD)
- ping
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- redis-cli
- --no-auth-warning
- -a
- $(REDIS_PASSWORD)
- ping
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
volumeClaimTemplates:
- metadata:
name: redis-data
labels:
app: spywatcher
tier: cache
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 10Gi
---
apiVersion: v1
kind: Service
metadata:
name: redis
namespace: spywatcher
labels:
app: spywatcher
tier: cache
spec:
type: ClusterIP
clusterIP: None
selector:
app: spywatcher
tier: cache
ports:
- name: redis
port: 6379
targetPort: redis
protocol: TCP
---
apiVersion: v1
kind: Secret
metadata:
name: redis-secret
namespace: spywatcher
labels:
app: spywatcher
tier: cache
type: Opaque
stringData:
password: "CHANGE_ME_IN_PRODUCTION"

32
k8s/base/secrets.yaml Normal file
View File

@@ -0,0 +1,32 @@
# This is a template file for secrets
# In production, use sealed-secrets, external-secrets, or your cloud provider's secret management
# DO NOT commit actual secrets to git
apiVersion: v1
kind: Secret
metadata:
name: spywatcher-secrets
namespace: spywatcher
labels:
app: spywatcher
type: Opaque
stringData:
# Database connection
database-url: "postgresql://user:password@postgres-service:5432/spywatcher"
# Redis connection
redis-url: "redis://redis-service:6379"
# JWT secrets
jwt-secret: "CHANGE_ME_IN_PRODUCTION"
jwt-refresh-secret: "CHANGE_ME_IN_PRODUCTION"
# Discord credentials
discord-bot-token: "CHANGE_ME_IN_PRODUCTION"
discord-client-id: "CHANGE_ME_IN_PRODUCTION"
discord-client-secret: "CHANGE_ME_IN_PRODUCTION"
discord-guild-id: "CHANGE_ME_IN_PRODUCTION"
discord-redirect-uri: "https://spywatcher.example.com/auth/callback"
# Admin settings
admin-discord-ids: ""
bot-guild-ids: ""

View File

@@ -0,0 +1,29 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: spywatcher
bases:
- ../../base
namePrefix: prod-
commonLabels:
environment: production
patchesStrategicMerge:
- replicas-patch.yaml
- resources-patch.yaml
configMapGenerator:
- name: spywatcher-config
behavior: merge
literals:
- NODE_ENV=production
- LOG_LEVEL=info
images:
- name: ghcr.io/subculture-collective/spywatcher-backend
newTag: latest
- name: ghcr.io/subculture-collective/spywatcher-frontend
newTag: latest

View File

@@ -0,0 +1,15 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-backend
namespace: spywatcher
spec:
replicas: 3
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-frontend
namespace: spywatcher
spec:
replicas: 2

View File

@@ -0,0 +1,35 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-backend
namespace: spywatcher
spec:
template:
spec:
containers:
- name: backend
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-frontend
namespace: spywatcher
spec:
template:
spec:
containers:
- name: frontend
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"

View File

@@ -0,0 +1,28 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: spywatcher-staging
bases:
- ../../base
namePrefix: staging-
commonLabels:
environment: staging
patchesStrategicMerge:
- replicas-patch.yaml
configMapGenerator:
- name: spywatcher-config
behavior: merge
literals:
- NODE_ENV=staging
- LOG_LEVEL=debug
images:
- name: ghcr.io/subculture-collective/spywatcher-backend
newTag: staging
- name: ghcr.io/subculture-collective/spywatcher-frontend
newTag: staging

View File

@@ -0,0 +1,15 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-backend
namespace: spywatcher-staging
spec:
replicas: 1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: spywatcher-frontend
namespace: spywatcher-staging
spec:
replicas: 1

View File

@@ -0,0 +1,198 @@
#!/bin/bash
set -e
# Blue-Green Deployment Script for Spywatcher
# This script performs zero-downtime deployments by maintaining two identical environments
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Configuration
NAMESPACE="${NAMESPACE:-spywatcher}"
APP_NAME="${APP_NAME:-spywatcher-backend}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
HEALTH_CHECK_PATH="${HEALTH_CHECK_PATH:-/health/ready}"
HEALTH_CHECK_RETRIES="${HEALTH_CHECK_RETRIES:-10}"
HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-10}"
# Function to print colored output
print_info() {
echo -e "${GREEN}[INFO]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Function to check if a deployment exists
deployment_exists() {
kubectl get deployment "$1" -n "$NAMESPACE" &> /dev/null
}
# Function to get current active environment
get_active_environment() {
local service_selector=$(kubectl get service "$APP_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.selector.version}')
echo "$service_selector"
}
# Function to perform health check
health_check() {
local deployment=$1
local retries=$HEALTH_CHECK_RETRIES
print_info "Performing health check on $deployment..."
while [ $retries -gt 0 ]; do
# Get pod name
local pod=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=$deployment -o jsonpath='{.items[0].metadata.name}')
if [ -z "$pod" ]; then
print_warning "No pod found for $deployment, retrying..."
sleep $HEALTH_CHECK_INTERVAL
retries=$((retries - 1))
continue
fi
# Check if pod is running
local pod_status=$(kubectl get pod "$pod" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
if [ "$pod_status" != "Running" ]; then
print_warning "Pod $pod is not running (status: $pod_status), retrying..."
sleep $HEALTH_CHECK_INTERVAL
retries=$((retries - 1))
continue
fi
# Perform HTTP health check
if kubectl exec "$pod" -n "$NAMESPACE" -- wget -q -O- "http://localhost:3001$HEALTH_CHECK_PATH" &> /dev/null; then
print_info "Health check passed for $deployment"
return 0
else
print_warning "Health check failed for $deployment, retrying..."
sleep $HEALTH_CHECK_INTERVAL
retries=$((retries - 1))
fi
done
print_error "Health check failed after $HEALTH_CHECK_RETRIES retries"
return 1
}
# Main deployment logic
main() {
print_info "Starting Blue-Green deployment for $APP_NAME"
print_info "Namespace: $NAMESPACE"
print_info "Image Tag: $IMAGE_TAG"
# Determine current active environment
local current_env=$(get_active_environment)
if [ -z "$current_env" ]; then
# No active environment, default to blue
current_env="blue"
new_env="green"
print_info "No active environment found, will deploy to green"
elif [ "$current_env" = "blue" ]; then
new_env="green"
else
new_env="blue"
fi
print_info "Current active environment: $current_env"
print_info "Deploying to: $new_env"
# Create or update new environment deployment
local new_deployment="$APP_NAME-$new_env"
# Apply deployment
kubectl set image "deployment/$new_deployment" \
backend="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
-n "$NAMESPACE" 2>/dev/null || \
kubectl create deployment "$new_deployment" \
--image="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
-n "$NAMESPACE"
# Label the deployment
kubectl label deployment "$new_deployment" app=spywatcher version=$new_env -n "$NAMESPACE" --overwrite
# Wait for deployment to be ready
print_info "Waiting for deployment $new_deployment to be ready..."
kubectl rollout status "deployment/$new_deployment" -n "$NAMESPACE" --timeout=5m
# Perform health checks
if ! health_check "$new_env"; then
print_error "Health check failed for $new_env environment"
print_error "Keeping traffic on $current_env environment"
exit 1
fi
# Update service selector to point to new environment
print_info "Switching traffic to $new_env environment..."
kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
-p "{\"spec\":{\"selector\":{\"version\":\"$new_env\"}}}"
print_info "Traffic successfully switched to $new_env"
# Wait a bit before considering old environment for removal
print_info "Waiting 60 seconds before cleaning up old environment..."
sleep 60
# Optional: Scale down old environment instead of deleting
if deployment_exists "$APP_NAME-$current_env"; then
print_info "Scaling down old environment: $current_env"
kubectl scale deployment "$APP_NAME-$current_env" --replicas=0 -n "$NAMESPACE"
print_info "Old environment scaled to 0 replicas (can be used for quick rollback)"
fi
print_info "Blue-Green deployment completed successfully!"
print_info "Active environment: $new_env"
}
# Rollback function
rollback() {
print_warning "Rolling back deployment..."
local current_env=$(get_active_environment)
local previous_env
if [ "$current_env" = "blue" ]; then
previous_env="green"
else
previous_env="blue"
fi
# Check if previous environment exists
if ! deployment_exists "$APP_NAME-$previous_env"; then
print_error "Previous environment $previous_env does not exist, cannot rollback"
exit 1
fi
# Scale up previous environment if it's scaled down
local replicas=$(kubectl get deployment "$APP_NAME-$previous_env" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
if [ "$replicas" -eq 0 ]; then
print_info "Scaling up previous environment: $previous_env"
kubectl scale deployment "$APP_NAME-$previous_env" --replicas=3 -n "$NAMESPACE"
kubectl rollout status "deployment/$APP_NAME-$previous_env" -n "$NAMESPACE" --timeout=5m
fi
# Switch traffic back
print_info "Switching traffic back to $previous_env"
kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
-p "{\"spec\":{\"selector\":{\"version\":\"$previous_env\"}}}"
print_info "Rollback completed successfully!"
print_info "Active environment: $previous_env"
}
# Check if rollback flag is set
if [ "$1" = "--rollback" ]; then
rollback
else
main
fi

View File

@@ -0,0 +1,218 @@
#!/bin/bash
set -e
# Canary Deployment Script for Spywatcher
# This script gradually shifts traffic to a new version while monitoring for errors
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Configuration
NAMESPACE="${NAMESPACE:-spywatcher}"
APP_NAME="${APP_NAME:-spywatcher-backend}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
HEALTH_CHECK_PATH="${HEALTH_CHECK_PATH:-/health/ready}"
# Canary rollout percentages
CANARY_STEPS="${CANARY_STEPS:-5 25 50 75 100}"
CANARY_WAIT="${CANARY_WAIT:-60}" # Wait time between steps in seconds
# Error thresholds
ERROR_THRESHOLD="${ERROR_THRESHOLD:-5}" # Max error percentage before rollback
# Function to print colored output
print_info() {
echo -e "${GREEN}[INFO]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Function to check deployment health
check_health() {
local deployment=$1
local replicas=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}')
local desired=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
if [ "$replicas" = "$desired" ] && [ "$replicas" -gt 0 ]; then
return 0
else
return 1
fi
}
# Function to check error rate (simplified - you should integrate with your monitoring system)
check_error_rate() {
local deployment=$1
# Get pod logs and check for errors
local pods=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=canary -o jsonpath='{.items[*].metadata.name}')
if [ -z "$pods" ]; then
print_warning "No canary pods found"
return 0
fi
# Simple error check - count ERROR log entries
local error_count=0
for pod in $pods; do
local pod_errors=$(kubectl logs "$pod" -n "$NAMESPACE" --tail=100 | grep -c "ERROR" || true)
error_count=$((error_count + pod_errors))
done
print_info "Detected $error_count errors in canary pods"
if [ "$error_count" -gt "$ERROR_THRESHOLD" ]; then
return 1
else
return 0
fi
}
# Function to update traffic weights
# NOTE: This implementation uses replica counts to approximate traffic splitting,
# which is not precise. For accurate percentage-based traffic splitting,
# consider using a service mesh (Istio, Linkerd) or an ingress controller
# that supports weighted traffic splitting (like NGINX Ingress with canary annotations).
update_traffic_weight() {
local canary_weight=$1
local stable_weight=$((100 - canary_weight))
print_info "Adjusting traffic: Canary $canary_weight%, Stable $stable_weight%"
print_warning "Note: Replica-based traffic splitting is approximate. Actual traffic may not match percentages exactly."
# Calculate replica counts based on percentages
local total_replicas=3
local canary_replicas=$(( (total_replicas * canary_weight + 50) / 100 ))
local stable_replicas=$((total_replicas - canary_replicas))
# Ensure at least 1 replica
[ "$canary_replicas" -eq 0 ] && canary_replicas=1
[ "$stable_replicas" -eq 0 ] && stable_replicas=1
# Scale deployments
kubectl scale deployment "$APP_NAME-canary" --replicas=$canary_replicas -n "$NAMESPACE"
kubectl scale deployment "$APP_NAME-stable" --replicas=$stable_replicas -n "$NAMESPACE"
# Wait for scaling to complete
kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=2m
kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=2m
}
# Function to promote canary to stable
promote_canary() {
print_info "Promoting canary to stable..."
# Update stable deployment with canary image
local canary_image=$(kubectl get deployment "$APP_NAME-canary" -n "$NAMESPACE" -o jsonpath='{.spec.template.spec.containers[0].image}')
kubectl set image "deployment/$APP_NAME-stable" backend="$canary_image" -n "$NAMESPACE"
kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=5m
# Scale stable back to full capacity
kubectl scale deployment "$APP_NAME-stable" --replicas=3 -n "$NAMESPACE"
# Remove canary deployment
kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true
print_info "Canary promoted to stable successfully!"
}
# Function to rollback canary
rollback_canary() {
print_error "Rolling back canary deployment..."
# Delete canary deployment
kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true
# Ensure stable is at full capacity
kubectl scale deployment "$APP_NAME-stable" --replicas=3 -n "$NAMESPACE"
print_info "Canary deployment rolled back"
}
# Main deployment logic
main() {
print_info "Starting Canary deployment for $APP_NAME"
print_info "Namespace: $NAMESPACE"
print_info "Image Tag: $IMAGE_TAG"
print_info "Canary steps: $CANARY_STEPS"
# Ensure stable deployment exists
if ! kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" &> /dev/null; then
# If stable doesn't exist, copy from existing deployment
if kubectl get deployment "$APP_NAME" -n "$NAMESPACE" &> /dev/null; then
print_info "Creating stable deployment from existing deployment"
kubectl get deployment "$APP_NAME" -n "$NAMESPACE" -o yaml | \
sed "s/name: $APP_NAME$/name: $APP_NAME-stable/" | \
kubectl apply -f -
# Update the original deployment name if needed
kubectl label deployment "$APP_NAME-stable" version=stable -n "$NAMESPACE" --overwrite
else
print_error "No existing deployment found"
exit 1
fi
fi
# Create canary deployment
print_info "Creating canary deployment..."
kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" -o yaml | \
sed "s/$APP_NAME-stable/$APP_NAME-canary/g" | \
sed "s/version: stable/version: canary/g" | \
kubectl apply -f -
# Update canary image
kubectl set image "deployment/$APP_NAME-canary" \
backend="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
-n "$NAMESPACE"
kubectl label deployment "$APP_NAME-canary" version=canary -n "$NAMESPACE" --overwrite
# Start with minimal canary traffic
kubectl scale deployment "$APP_NAME-canary" --replicas=1 -n "$NAMESPACE"
kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=5m
# Gradually shift traffic
for step in $CANARY_STEPS; do
print_info "Canary rollout: ${step}%"
# Update traffic weights
update_traffic_weight "$step"
# Wait for the step duration
print_info "Waiting ${CANARY_WAIT}s before next step..."
sleep "$CANARY_WAIT"
# Check health
if ! check_health "$APP_NAME-canary"; then
print_error "Canary health check failed"
rollback_canary
exit 1
fi
# Check error rate
if ! check_error_rate "$APP_NAME-canary"; then
print_error "Canary error rate exceeded threshold"
rollback_canary
exit 1
fi
print_info "Step ${step}% completed successfully"
done
# Promote canary to stable
promote_canary
print_info "Canary deployment completed successfully!"
}
# Run main function
main

336
terraform/README.md Normal file
View File

@@ -0,0 +1,336 @@
# Spywatcher Infrastructure as Code
This directory contains Terraform configurations for deploying Spywatcher infrastructure on AWS.
## Prerequisites
- AWS CLI configured with appropriate credentials
- Terraform >= 1.5.0
- kubectl
- Helm (optional)
## Infrastructure Components
### Modules
- **VPC**: Virtual Private Cloud with public, private, and database subnets
- **EKS**: Elastic Kubernetes Service cluster
- **RDS**: PostgreSQL database
- **Redis**: ElastiCache Redis cluster
- **ALB**: Application Load Balancer with WAF
### Directory Structure
```
terraform/
├── main.tf # Root module configuration
├── variables.tf # Root module variables
├── outputs.tf # Root module outputs
├── modules/ # Reusable modules
│ ├── vpc/
│ ├── eks/
│ ├── rds/
│ ├── redis/
│ └── alb/
└── environments/ # Environment-specific configurations
├── production/
│ └── terraform.tfvars
└── staging/
└── terraform.tfvars
```
## Quick Start
### 1. Configure Backend
First, create an S3 bucket and DynamoDB table for state management:
```bash
# Create S3 bucket for state
aws s3api create-bucket \
--bucket spywatcher-terraform-state \
--region us-east-1
# Enable versioning
aws s3api put-bucket-versioning \
--bucket spywatcher-terraform-state \
--versioning-configuration Status=Enabled
# Create DynamoDB table for state locking
aws dynamodb create-table \
--table-name terraform-state-lock \
--attribute-definitions AttributeName=LockID,AttributeType=S \
--key-schema AttributeName=LockID,KeyType=HASH \
--billing-mode PAY_PER_REQUEST \
--region us-east-1
```
### 2. Initialize Terraform
```bash
cd terraform
terraform init
```
### 3. Review and Customize
Edit the appropriate `terraform.tfvars` file:
```bash
# For production
vim environments/production/terraform.tfvars
# For staging
vim environments/staging/terraform.tfvars
```
Key configurations to update:
- `certificate_arn`: SSL certificate ARN from AWS Certificate Manager
- VPC CIDR blocks (if needed)
- Instance types and sizes
- Database credentials (use environment variables or AWS Secrets Manager)
### 4. Plan Infrastructure
```bash
# Production
terraform plan -var-file="environments/production/terraform.tfvars"
# Staging
terraform plan -var-file="environments/staging/terraform.tfvars"
```
### 5. Apply Infrastructure
```bash
# Production
terraform apply -var-file="environments/production/terraform.tfvars"
# Staging
terraform apply -var-file="environments/staging/terraform.tfvars"
```
This will create:
- VPC with NAT gateways
- EKS cluster with node groups
- RDS PostgreSQL instance
- ElastiCache Redis cluster
- Application Load Balancer
- Security groups and IAM roles
### 6. Configure kubectl
After infrastructure is created:
```bash
# Get the cluster name from outputs
terraform output eks_cluster_name
# Configure kubectl
aws eks update-kubeconfig \
--name $(terraform output -raw eks_cluster_name) \
--region us-east-1
# Verify connection
kubectl cluster-info
kubectl get nodes
```
## Outputs
After applying, Terraform will output important values:
```bash
# View all outputs
terraform output
# View specific output
terraform output rds_endpoint
terraform output eks_cluster_endpoint
```
## Secrets Management
### Database Password
The RDS password is auto-generated and stored in AWS Secrets Manager:
```bash
# Retrieve database password
aws secretsmanager get-secret-value \
--secret-id spywatcher-production-db-password \
--query SecretString \
--output text
```
### Redis Auth Token
Redis authentication token is also in Secrets Manager:
```bash
# Retrieve Redis auth token
aws secretsmanager get-secret-value \
--secret-id spywatcher-production-auth-token \
--query SecretString \
--output text
```
## Updating Infrastructure
```bash
# Make changes to .tf files or terraform.tfvars
# Plan changes
terraform plan -var-file="environments/production/terraform.tfvars"
# Apply changes
terraform apply -var-file="environments/production/terraform.tfvars"
```
## Destroying Infrastructure
⚠️ **WARNING**: This will destroy all resources. Make sure you have backups!
```bash
# Destroy infrastructure
terraform destroy -var-file="environments/production/terraform.tfvars"
```
## Module Documentation
### VPC Module
Creates a VPC with:
- 3 availability zones
- Public, private, and database subnets
- NAT gateways for private subnet internet access
- VPC Flow Logs
### EKS Module
Creates an EKS cluster with:
- Managed node groups
- OIDC provider for IRSA
- Essential add-ons (VPC CNI, CoreDNS, kube-proxy)
- Security groups
### RDS Module
Creates a PostgreSQL database with:
- Encryption at rest
- Automated backups
- Multi-AZ deployment (production)
- Performance Insights
- CloudWatch alarms
### Redis Module
Creates an ElastiCache Redis cluster with:
- Encryption in transit and at rest
- Authentication token
- Automatic failover (if multi-node)
- CloudWatch alarms
### ALB Module
Creates an Application Load Balancer with:
- HTTPS termination
- HTTP to HTTPS redirect
- WAF with rate limiting
- AWS Managed Rules
## Cost Optimization
### Development/Testing
For cost savings in non-production:
- Use smaller instance types
- Single-AZ deployments
- Spot instances for EKS nodes
- Reduce backup retention periods
### Production
- Use Reserved Instances for steady-state workload
- Enable auto-scaling
- Right-size instances based on metrics
- Use S3 lifecycle policies for backups
## Monitoring
### CloudWatch Alarms
The modules create CloudWatch alarms for:
- RDS CPU utilization
- RDS storage space
- Redis CPU utilization
- Redis memory usage
Configure SNS topics for notifications:
```bash
# Create SNS topic
aws sns create-topic --name spywatcher-alerts
# Subscribe to topic
aws sns subscribe \
--topic-arn arn:aws:sns:us-east-1:123456789012:spywatcher-alerts \
--protocol email \
--notification-endpoint your-email@example.com
```
## Troubleshooting
### State Lock Issues
If you encounter state lock errors:
```bash
# Force unlock (use carefully)
terraform force-unlock <LOCK_ID>
```
### EKS Access Issues
If you can't access the cluster:
```bash
# Ensure your AWS credentials are correct
aws sts get-caller-identity
# Update kubeconfig
aws eks update-kubeconfig --name <cluster-name> --region us-east-1
# Check IAM authentication
kubectl auth can-i get pods --all-namespaces
```
### RDS Connection Issues
```bash
# Check security group rules
aws ec2 describe-security-groups --group-ids <sg-id>
# Test connection from EKS node
kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -- \
psql -h <rds-endpoint> -U spywatcher -d spywatcher
```
## Security Best Practices
1. **Never commit secrets**: Use AWS Secrets Manager or environment variables
2. **Enable MFA**: For AWS account access
3. **Use IAM roles**: Instead of access keys where possible
4. **Regular updates**: Keep Terraform and providers up to date
5. **Review changes**: Always review `terraform plan` output
6. **Backup state**: S3 versioning is enabled for state files
7. **Least privilege**: IAM policies follow least privilege principle
## Support
For infrastructure issues:
- Check Terraform state: `terraform show`
- Review CloudWatch logs
- Check AWS CloudTrail for API calls
- Consult AWS documentation
- Create issue in repository

View File

@@ -0,0 +1,54 @@
# Production Environment Configuration
environment = "production"
aws_region = "us-east-1"
project_name = "spywatcher"
# VPC Configuration
vpc_cidr = "10.0.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
private_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
public_subnet_cidrs = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
database_subnet_cidrs = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]
# EKS Configuration
eks_cluster_version = "1.28"
eks_node_groups = {
general = {
desired_size = 3
min_size = 2
max_size = 10
instance_types = ["t3.large"]
capacity_type = "ON_DEMAND"
}
spot = {
desired_size = 2
min_size = 0
max_size = 5
instance_types = ["t3.large", "t3a.large"]
capacity_type = "SPOT"
}
}
# RDS Configuration
rds_engine_version = "15.3"
rds_instance_class = "db.t3.large"
rds_allocated_storage = 100
database_name = "spywatcher"
database_username = "spywatcher"
# Redis Configuration
redis_node_type = "cache.t3.medium"
redis_num_cache_nodes = 2
# SSL Certificate
# Replace with actual certificate ARN after creating in ACM
certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# Additional Tags
tags = {
Terraform = "true"
Environment = "production"
Project = "spywatcher"
CostCenter = "engineering"
}

View File

@@ -0,0 +1,45 @@
# Staging Environment Configuration
environment = "staging"
aws_region = "us-east-1"
project_name = "spywatcher"
# VPC Configuration
vpc_cidr = "10.1.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b"]
private_subnet_cidrs = ["10.1.1.0/24", "10.1.2.0/24"]
public_subnet_cidrs = ["10.1.101.0/24", "10.1.102.0/24"]
database_subnet_cidrs = ["10.1.201.0/24", "10.1.202.0/24"]
# EKS Configuration
eks_cluster_version = "1.28"
eks_node_groups = {
general = {
desired_size = 2
min_size = 1
max_size = 4
instance_types = ["t3.medium"]
capacity_type = "ON_DEMAND"
}
}
# RDS Configuration
rds_engine_version = "15.3"
rds_instance_class = "db.t3.medium"
rds_allocated_storage = 50
database_name = "spywatcher"
database_username = "spywatcher"
# Redis Configuration
redis_node_type = "cache.t3.small"
redis_num_cache_nodes = 1
# SSL Certificate
certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# Additional Tags
tags = {
Terraform = "true"
Environment = "staging"
Project = "spywatcher"
}

141
terraform/main.tf Normal file
View File

@@ -0,0 +1,141 @@
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.23"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.11"
}
}
# Backend configuration for state storage
# Note: The key should be set dynamically using -backend-config flag:
# terraform init -backend-config="key=<environment>/terraform.tfstate"
backend "s3" {
bucket = "spywatcher-terraform-state"
key = "terraform.tfstate" # Override with -backend-config flag
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-state-lock"
}
}
provider "aws" {
region = var.aws_region
default_tags {
tags = {
Project = "spywatcher"
Environment = var.environment
ManagedBy = "terraform"
}
}
}
# VPC Module
module "vpc" {
source = "./modules/vpc"
environment = var.environment
vpc_cidr = var.vpc_cidr
availability_zones = var.availability_zones
private_subnet_cidrs = var.private_subnet_cidrs
public_subnet_cidrs = var.public_subnet_cidrs
database_subnet_cidrs = var.database_subnet_cidrs
}
# EKS Module
module "eks" {
source = "./modules/eks"
environment = var.environment
cluster_name = "${var.project_name}-${var.environment}"
cluster_version = var.eks_cluster_version
vpc_id = module.vpc.vpc_id
private_subnet_ids = module.vpc.private_subnet_ids
node_groups = var.eks_node_groups
}
# RDS PostgreSQL Module
module "rds" {
source = "./modules/rds"
environment = var.environment
identifier = "${var.project_name}-${var.environment}"
engine_version = var.rds_engine_version
instance_class = var.rds_instance_class
allocated_storage = var.rds_allocated_storage
database_name = var.database_name
master_username = var.database_username
vpc_id = module.vpc.vpc_id
database_subnet_ids = module.vpc.database_subnet_ids
allowed_security_group_ids = [module.eks.cluster_security_group_id]
}
# ElastiCache Redis Module
module "redis" {
source = "./modules/redis"
environment = var.environment
cluster_id = "${var.project_name}-${var.environment}"
node_type = var.redis_node_type
num_cache_nodes = var.redis_num_cache_nodes
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
allowed_security_group_ids = [module.eks.cluster_security_group_id]
}
# Application Load Balancer Module
module "alb" {
source = "./modules/alb"
environment = var.environment
vpc_id = module.vpc.vpc_id
public_subnet_ids = module.vpc.public_subnet_ids
certificate_arn = var.certificate_arn
}
# Configure Kubernetes provider
provider "kubernetes" {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)
exec {
api_version = "client.authentication.k8s.io/v1"
command = "aws"
args = [
"eks",
"get-token",
"--cluster-name",
module.eks.cluster_name
]
}
}
# Configure Helm provider
provider "helm" {
kubernetes {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)
exec {
api_version = "client.authentication.k8s.io/v1"
command = "aws"
args = [
"eks",
"get-token",
"--cluster-name",
module.eks.cluster_name
]
}
}
}

View File

@@ -0,0 +1,262 @@
resource "aws_security_group" "alb" {
name = "${var.environment}-alb-sg"
description = "Security group for Application Load Balancer"
vpc_id = var.vpc_id
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
description = "HTTP access"
}
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
description = "HTTPS access"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "${var.environment}-alb-sg"
Environment = var.environment
}
}
resource "aws_lb" "main" {
name = "${var.environment}-alb"
internal = false
load_balancer_type = "application"
security_groups = [aws_security_group.alb.id]
subnets = var.public_subnet_ids
enable_deletion_protection = var.enable_deletion_protection
enable_http2 = true
enable_cross_zone_load_balancing = true
drop_invalid_header_fields = true
tags = {
Name = "${var.environment}-alb"
Environment = var.environment
}
}
resource "aws_lb_target_group" "backend" {
name = "${var.environment}-backend-tg"
port = 80
protocol = "HTTP"
vpc_id = var.vpc_id
health_check {
enabled = true
healthy_threshold = 2
unhealthy_threshold = 3
timeout = 5
interval = 30
path = "/health/live"
matcher = "200"
}
deregistration_delay = 30
stickiness {
type = "lb_cookie"
cookie_duration = 86400
enabled = true
}
tags = {
Name = "${var.environment}-backend-tg"
Environment = var.environment
}
}
resource "aws_lb_target_group" "frontend" {
name = "${var.environment}-frontend-tg"
port = 80
protocol = "HTTP"
vpc_id = var.vpc_id
health_check {
enabled = true
healthy_threshold = 2
unhealthy_threshold = 3
timeout = 5
interval = 30
path = "/"
matcher = "200"
}
deregistration_delay = 30
tags = {
Name = "${var.environment}-frontend-tg"
Environment = var.environment
}
}
# HTTP Listener - Redirect to HTTPS
resource "aws_lb_listener" "http" {
load_balancer_arn = aws_lb.main.arn
port = "80"
protocol = "HTTP"
default_action {
type = "redirect"
redirect {
port = "443"
protocol = "HTTPS"
status_code = "HTTP_301"
}
}
}
# HTTPS Listener
resource "aws_lb_listener" "https" {
load_balancer_arn = aws_lb.main.arn
port = "443"
protocol = "HTTPS"
ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01"
certificate_arn = var.certificate_arn
default_action {
type = "forward"
forward {
target_group {
arn = aws_lb_target_group.frontend.arn
weight = 100
}
}
}
}
# Listener Rules for API routing
resource "aws_lb_listener_rule" "api" {
listener_arn = aws_lb_listener.https.arn
priority = 100
action {
type = "forward"
forward {
target_group {
arn = aws_lb_target_group.backend.arn
weight = 100
}
}
}
condition {
path_pattern {
values = ["/api/*", "/health/*"]
}
}
}
# WAF Web ACL (optional but recommended)
resource "aws_wafv2_web_acl" "main" {
name = "${var.environment}-waf"
scope = "REGIONAL"
default_action {
allow {}
}
# Rate limiting rule
rule {
name = "RateLimitRule"
priority = 1
action {
block {}
}
statement {
rate_based_statement {
limit = 2000
aggregate_key_type = "IP"
}
}
visibility_config {
cloudwatch_metrics_enabled = true
metric_name = "RateLimitRule"
sampled_requests_enabled = true
}
}
# AWS Managed Rules - Core Rule Set
rule {
name = "AWSManagedRulesCommonRuleSet"
priority = 2
override_action {
none {}
}
statement {
managed_rule_group_statement {
name = "AWSManagedRulesCommonRuleSet"
vendor_name = "AWS"
}
}
visibility_config {
cloudwatch_metrics_enabled = true
metric_name = "AWSManagedRulesCommonRuleSetMetric"
sampled_requests_enabled = true
}
}
# Known Bad Inputs Rule Set
rule {
name = "AWSManagedRulesKnownBadInputsRuleSet"
priority = 3
override_action {
none {}
}
statement {
managed_rule_group_statement {
name = "AWSManagedRulesKnownBadInputsRuleSet"
vendor_name = "AWS"
}
}
visibility_config {
cloudwatch_metrics_enabled = true
metric_name = "AWSManagedRulesKnownBadInputsRuleSetMetric"
sampled_requests_enabled = true
}
}
visibility_config {
cloudwatch_metrics_enabled = true
metric_name = "${var.environment}-waf"
sampled_requests_enabled = true
}
tags = {
Name = "${var.environment}-waf"
Environment = var.environment
}
}
# Associate WAF with ALB
resource "aws_wafv2_web_acl_association" "main" {
resource_arn = aws_lb.main.arn
web_acl_arn = aws_wafv2_web_acl.main.arn
}

View File

@@ -0,0 +1,29 @@
output "alb_arn" {
description = "ALB ARN"
value = aws_lb.main.arn
}
output "alb_dns_name" {
description = "ALB DNS name"
value = aws_lb.main.dns_name
}
output "alb_zone_id" {
description = "ALB zone ID"
value = aws_lb.main.zone_id
}
output "backend_target_group_arn" {
description = "Backend target group ARN"
value = aws_lb_target_group.backend.arn
}
output "frontend_target_group_arn" {
description = "Frontend target group ARN"
value = aws_lb_target_group.frontend.arn
}
output "alb_security_group_id" {
description = "ALB security group ID"
value = aws_security_group.alb.id
}

View File

@@ -0,0 +1,25 @@
variable "environment" {
description = "Environment name"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "public_subnet_ids" {
description = "Public subnet IDs for ALB"
type = list(string)
}
variable "certificate_arn" {
description = "ARN of SSL certificate"
type = string
}
variable "enable_deletion_protection" {
description = "Enable deletion protection for ALB"
type = bool
default = true
}

View File

@@ -0,0 +1,178 @@
resource "aws_eks_cluster" "main" {
name = var.cluster_name
role_arn = aws_iam_role.cluster.arn
version = var.cluster_version
vpc_config {
subnet_ids = var.private_subnet_ids
endpoint_private_access = true
endpoint_public_access = true
security_group_ids = [aws_security_group.cluster.id]
}
enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
depends_on = [
aws_iam_role_policy_attachment.cluster_policy,
aws_iam_role_policy_attachment.vpc_resource_controller
]
tags = {
Name = var.cluster_name
Environment = var.environment
}
}
# EKS Cluster IAM Role
resource "aws_iam_role" "cluster" {
name = "${var.cluster_name}-cluster-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "eks.amazonaws.com"
}
}]
})
tags = {
Name = "${var.cluster_name}-cluster-role"
Environment = var.environment
}
}
resource "aws_iam_role_policy_attachment" "cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.cluster.name
}
resource "aws_iam_role_policy_attachment" "vpc_resource_controller" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
role = aws_iam_role.cluster.name
}
# Cluster Security Group
resource "aws_security_group" "cluster" {
name = "${var.cluster_name}-cluster-sg"
description = "Security group for EKS cluster"
vpc_id = var.vpc_id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "${var.cluster_name}-cluster-sg"
Environment = var.environment
}
}
# EKS Node Groups
resource "aws_eks_node_group" "main" {
for_each = var.node_groups
cluster_name = aws_eks_cluster.main.name
node_group_name = "${var.cluster_name}-${each.key}"
node_role_arn = aws_iam_role.node.arn
subnet_ids = var.private_subnet_ids
capacity_type = each.value.capacity_type
instance_types = each.value.instance_types
scaling_config {
desired_size = each.value.desired_size
max_size = each.value.max_size
min_size = each.value.min_size
}
update_config {
max_unavailable = 1
}
depends_on = [
aws_iam_role_policy_attachment.node_policy,
aws_iam_role_policy_attachment.cni_policy,
aws_iam_role_policy_attachment.container_registry_policy,
]
tags = {
Name = "${var.cluster_name}-${each.key}"
Environment = var.environment
}
}
# Node IAM Role
resource "aws_iam_role" "node" {
name = "${var.cluster_name}-node-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "ec2.amazonaws.com"
}
}]
})
tags = {
Name = "${var.cluster_name}-node-role"
Environment = var.environment
}
}
resource "aws_iam_role_policy_attachment" "node_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.node.name
}
resource "aws_iam_role_policy_attachment" "cni_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.node.name
}
resource "aws_iam_role_policy_attachment" "container_registry_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.node.name
}
# OIDC Provider for IRSA (IAM Roles for Service Accounts)
data "tls_certificate" "cluster" {
url = aws_eks_cluster.main.identity[0].oidc[0].issuer
}
resource "aws_iam_openid_connect_provider" "cluster" {
client_id_list = ["sts.amazonaws.com"]
thumbprint_list = [data.tls_certificate.cluster.certificates[0].sha1_fingerprint]
url = aws_eks_cluster.main.identity[0].oidc[0].issuer
tags = {
Name = "${var.cluster_name}-oidc"
Environment = var.environment
}
}
# EKS Add-ons
resource "aws_eks_addon" "vpc_cni" {
cluster_name = aws_eks_cluster.main.name
addon_name = "vpc-cni"
}
resource "aws_eks_addon" "coredns" {
cluster_name = aws_eks_cluster.main.name
addon_name = "coredns"
depends_on = [aws_eks_node_group.main]
}
resource "aws_eks_addon" "kube_proxy" {
cluster_name = aws_eks_cluster.main.name
addon_name = "kube-proxy"
}

View File

@@ -0,0 +1,29 @@
output "cluster_name" {
description = "EKS cluster name"
value = aws_eks_cluster.main.name
}
output "cluster_endpoint" {
description = "EKS cluster endpoint"
value = aws_eks_cluster.main.endpoint
}
output "cluster_ca_certificate" {
description = "EKS cluster CA certificate"
value = aws_eks_cluster.main.certificate_authority[0].data
}
output "cluster_security_group_id" {
description = "Security group ID attached to the EKS cluster"
value = aws_security_group.cluster.id
}
output "cluster_oidc_issuer_url" {
description = "OIDC issuer URL"
value = aws_eks_cluster.main.identity[0].oidc[0].issuer
}
output "node_role_arn" {
description = "IAM role ARN for EKS nodes"
value = aws_iam_role.node.arn
}

View File

@@ -0,0 +1,35 @@
variable "environment" {
description = "Environment name"
type = string
}
variable "cluster_name" {
description = "EKS cluster name"
type = string
}
variable "cluster_version" {
description = "Kubernetes version"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "private_subnet_ids" {
description = "Private subnet IDs for EKS"
type = list(string)
}
variable "node_groups" {
description = "Node groups configuration"
type = map(object({
desired_size = number
min_size = number
max_size = number
instance_types = list(string)
capacity_type = string
}))
}

View File

@@ -0,0 +1,164 @@
resource "aws_db_subnet_group" "main" {
name = "${var.identifier}-subnet-group"
subnet_ids = var.database_subnet_ids
tags = {
Name = "${var.identifier}-subnet-group"
Environment = var.environment
}
}
resource "aws_security_group" "rds" {
name = "${var.identifier}-rds-sg"
description = "Security group for RDS PostgreSQL"
vpc_id = var.vpc_id
ingress {
from_port = 5432
to_port = 5432
protocol = "tcp"
security_groups = var.allowed_security_group_ids
description = "PostgreSQL access from EKS"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "${var.identifier}-rds-sg"
Environment = var.environment
}
}
resource "random_password" "master" {
length = 32
special = true
}
resource "aws_db_instance" "postgres" {
identifier = var.identifier
engine = "postgres"
engine_version = var.engine_version
instance_class = var.instance_class
allocated_storage = var.allocated_storage
max_allocated_storage = var.max_allocated_storage
storage_type = "gp3"
storage_encrypted = true
kms_key_id = aws_kms_key.rds.arn
db_name = var.database_name
username = var.master_username
password = random_password.master.result
db_subnet_group_name = aws_db_subnet_group.main.name
vpc_security_group_ids = [aws_security_group.rds.id]
publicly_accessible = false
multi_az = var.multi_az
backup_retention_period = var.backup_retention_period
backup_window = var.backup_window
maintenance_window = var.maintenance_window
enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"]
performance_insights_enabled = true
performance_insights_retention_period = 7
deletion_protection = var.deletion_protection
skip_final_snapshot = var.skip_final_snapshot
final_snapshot_identifier = var.skip_final_snapshot ? null : "${var.identifier}-final-snapshot"
auto_minor_version_upgrade = true
apply_immediately = false
tags = {
Name = var.identifier
Environment = var.environment
}
}
resource "aws_kms_key" "rds" {
description = "KMS key for RDS encryption"
deletion_window_in_days = 10
enable_key_rotation = true
tags = {
Name = "${var.identifier}-kms"
Environment = var.environment
}
}
resource "aws_kms_alias" "rds" {
name = "alias/${var.identifier}-rds"
target_key_id = aws_kms_key.rds.key_id
}
# Store password in Secrets Manager
resource "aws_secretsmanager_secret" "db_password" {
name = "${var.identifier}-db-password"
description = "Database master password"
tags = {
Name = "${var.identifier}-db-password"
Environment = var.environment
}
}
resource "aws_secretsmanager_secret_version" "db_password" {
secret_id = aws_secretsmanager_secret.db_password.id
secret_string = jsonencode({
username = var.master_username
password = random_password.master.result
engine = "postgres"
host = aws_db_instance.postgres.address
port = aws_db_instance.postgres.port
dbname = var.database_name
})
}
# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "cpu" {
alarm_name = "${var.identifier}-cpu-utilization"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "CPUUtilization"
namespace = "AWS/RDS"
period = "300"
statistic = "Average"
threshold = "80"
alarm_description = "This metric monitors RDS CPU utilization"
dimensions = {
DBInstanceIdentifier = aws_db_instance.postgres.id
}
tags = {
Name = "${var.identifier}-cpu-alarm"
Environment = var.environment
}
}
resource "aws_cloudwatch_metric_alarm" "storage" {
alarm_name = "${var.identifier}-free-storage-space"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
metric_name = "FreeStorageSpace"
namespace = "AWS/RDS"
period = "300"
statistic = "Average"
threshold = "10000000000" # 10GB in bytes
alarm_description = "This metric monitors RDS free storage space"
dimensions = {
DBInstanceIdentifier = aws_db_instance.postgres.id
}
tags = {
Name = "${var.identifier}-storage-alarm"
Environment = var.environment
}
}

View File

@@ -0,0 +1,35 @@
output "db_endpoint" {
description = "RDS instance endpoint"
value = aws_db_instance.postgres.endpoint
}
output "db_address" {
description = "RDS instance address"
value = aws_db_instance.postgres.address
}
output "db_port" {
description = "RDS instance port"
value = aws_db_instance.postgres.port
}
output "db_name" {
description = "Database name"
value = aws_db_instance.postgres.db_name
}
output "db_username" {
description = "Master username"
value = aws_db_instance.postgres.username
sensitive = true
}
output "security_group_id" {
description = "RDS security group ID"
value = aws_security_group.rds.id
}
output "secret_arn" {
description = "ARN of the secret containing database credentials"
value = aws_secretsmanager_secret.db_password.arn
}

View File

@@ -0,0 +1,94 @@
variable "environment" {
description = "Environment name"
type = string
}
variable "identifier" {
description = "Identifier for RDS instance"
type = string
}
variable "engine_version" {
description = "PostgreSQL engine version"
type = string
default = "15.3"
}
variable "instance_class" {
description = "RDS instance class"
type = string
default = "db.t3.medium"
}
variable "allocated_storage" {
description = "Allocated storage in GB"
type = number
default = 100
}
variable "max_allocated_storage" {
description = "Maximum allocated storage for autoscaling in GB"
type = number
default = 500
}
variable "database_name" {
description = "Name of the database"
type = string
}
variable "master_username" {
description = "Master username"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "database_subnet_ids" {
description = "Database subnet IDs"
type = list(string)
}
variable "allowed_security_group_ids" {
description = "Security group IDs allowed to access RDS"
type = list(string)
}
variable "multi_az" {
description = "Enable Multi-AZ deployment"
type = bool
default = true
}
variable "backup_retention_period" {
description = "Backup retention period in days"
type = number
default = 7
}
variable "backup_window" {
description = "Preferred backup window"
type = string
default = "03:00-04:00"
}
variable "maintenance_window" {
description = "Preferred maintenance window"
type = string
default = "mon:04:00-mon:05:00"
}
variable "deletion_protection" {
description = "Enable deletion protection"
type = bool
default = true
}
variable "skip_final_snapshot" {
description = "Skip final snapshot when destroying"
type = bool
default = false
}

View File

@@ -0,0 +1,174 @@
resource "aws_elasticache_subnet_group" "main" {
name = "${var.cluster_id}-subnet-group"
subnet_ids = var.subnet_ids
tags = {
Name = "${var.cluster_id}-subnet-group"
Environment = var.environment
}
}
resource "aws_security_group" "redis" {
name = "${var.cluster_id}-redis-sg"
description = "Security group for ElastiCache Redis"
vpc_id = var.vpc_id
ingress {
from_port = 6379
to_port = 6379
protocol = "tcp"
security_groups = var.allowed_security_group_ids
description = "Redis access from EKS"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "${var.cluster_id}-redis-sg"
Environment = var.environment
}
}
resource "random_password" "auth_token" {
length = 32
special = false
}
resource "aws_elasticache_replication_group" "redis" {
replication_group_id = var.cluster_id
replication_group_description = "Redis cluster for ${var.environment}"
engine = "redis"
engine_version = var.engine_version
node_type = var.node_type
num_cache_clusters = var.num_cache_nodes
port = 6379
subnet_group_name = aws_elasticache_subnet_group.main.name
security_group_ids = [aws_security_group.redis.id]
parameter_group_name = aws_elasticache_parameter_group.main.name
at_rest_encryption_enabled = true
transit_encryption_enabled = true
auth_token = random_password.auth_token.result
automatic_failover_enabled = var.num_cache_nodes > 1 ? true : false
multi_az_enabled = var.num_cache_nodes > 1 ? true : false
snapshot_retention_limit = var.snapshot_retention_limit
snapshot_window = var.snapshot_window
maintenance_window = var.maintenance_window
auto_minor_version_upgrade = true
apply_immediately = false
log_delivery_configuration {
destination = aws_cloudwatch_log_group.redis.name
destination_type = "cloudwatch-logs"
log_format = "json"
log_type = "slow-log"
}
tags = {
Name = var.cluster_id
Environment = var.environment
}
}
resource "aws_elasticache_parameter_group" "main" {
name = "${var.cluster_id}-params"
family = "redis7"
parameter {
name = "maxmemory-policy"
value = "allkeys-lru"
}
parameter {
name = "timeout"
value = "300"
}
tags = {
Name = "${var.cluster_id}-params"
Environment = var.environment
}
}
resource "aws_cloudwatch_log_group" "redis" {
name = "/aws/elasticache/${var.cluster_id}"
retention_in_days = 7
tags = {
Name = "${var.cluster_id}-logs"
Environment = var.environment
}
}
# Store auth token in Secrets Manager
resource "aws_secretsmanager_secret" "redis_auth" {
name = "${var.cluster_id}-auth-token"
description = "Redis authentication token"
tags = {
Name = "${var.cluster_id}-auth-token"
Environment = var.environment
}
}
resource "aws_secretsmanager_secret_version" "redis_auth" {
secret_id = aws_secretsmanager_secret.redis_auth.id
secret_string = jsonencode({
auth_token = random_password.auth_token.result
endpoint = aws_elasticache_replication_group.redis.primary_endpoint_address
port = 6379
})
}
# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "cpu" {
alarm_name = "${var.cluster_id}-cpu-utilization"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "2"
metric_name = "CPUUtilization"
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
threshold = "75"
alarm_description = "This metric monitors Redis CPU utilization"
dimensions = {
ReplicationGroupId = var.cluster_id
}
tags = {
Name = "${var.cluster_id}-cpu-alarm"
Environment = var.environment
}
}
resource "aws_cloudwatch_metric_alarm" "memory" {
alarm_name = "${var.cluster_id}-database-memory-usage"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "DatabaseMemoryUsagePercentage"
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
threshold = "90"
alarm_description = "This metric monitors Redis memory usage"
dimensions = {
ReplicationGroupId = var.cluster_id
}
tags = {
Name = "${var.cluster_id}-memory-alarm"
Environment = var.environment
}
}

View File

@@ -0,0 +1,19 @@
output "redis_endpoint" {
description = "Redis primary endpoint address"
value = aws_elasticache_replication_group.redis.primary_endpoint_address
}
output "redis_port" {
description = "Redis port"
value = 6379
}
output "security_group_id" {
description = "Redis security group ID"
value = aws_security_group.redis.id
}
output "secret_arn" {
description = "ARN of the secret containing Redis auth token"
value = aws_secretsmanager_secret.redis_auth.arn
}

View File

@@ -0,0 +1,60 @@
variable "environment" {
description = "Environment name"
type = string
}
variable "cluster_id" {
description = "ElastiCache cluster ID"
type = string
}
variable "engine_version" {
description = "Redis engine version"
type = string
default = "7.0"
}
variable "node_type" {
description = "ElastiCache node type"
type = string
default = "cache.t3.medium"
}
variable "num_cache_nodes" {
description = "Number of cache nodes"
type = number
default = 1
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "subnet_ids" {
description = "Subnet IDs for ElastiCache"
type = list(string)
}
variable "allowed_security_group_ids" {
description = "Security group IDs allowed to access Redis"
type = list(string)
}
variable "snapshot_retention_limit" {
description = "Number of days to retain automatic snapshots"
type = number
default = 5
}
variable "snapshot_window" {
description = "Daily time range for snapshots"
type = string
default = "03:00-05:00"
}
variable "maintenance_window" {
description = "Weekly time range for maintenance"
type = string
default = "sun:05:00-sun:07:00"
}

View File

@@ -0,0 +1,213 @@
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = {
Name = "${var.environment}-vpc"
Environment = var.environment
"kubernetes.io/cluster/${var.environment}-cluster" = "shared"
}
}
# Internet Gateway
resource "aws_internet_gateway" "main" {
vpc_id = aws_vpc.main.id
tags = {
Name = "${var.environment}-igw"
Environment = var.environment
}
}
# Public Subnets
resource "aws_subnet" "public" {
count = length(var.public_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.public_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
map_public_ip_on_launch = true
tags = {
Name = "${var.environment}-public-subnet-${count.index + 1}"
Environment = var.environment
Type = "public"
"kubernetes.io/role/elb" = "1"
"kubernetes.io/cluster/${var.environment}-cluster" = "shared"
}
}
# Private Subnets
resource "aws_subnet" "private" {
count = length(var.private_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.private_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
tags = {
Name = "${var.environment}-private-subnet-${count.index + 1}"
Environment = var.environment
Type = "private"
"kubernetes.io/role/internal-elb" = "1"
"kubernetes.io/cluster/${var.environment}-cluster" = "shared"
}
}
# Database Subnets
resource "aws_subnet" "database" {
count = length(var.database_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.database_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
tags = {
Name = "${var.environment}-database-subnet-${count.index + 1}"
Environment = var.environment
Type = "database"
}
}
# Elastic IPs for NAT Gateways
resource "aws_eip" "nat" {
count = length(var.availability_zones)
domain = "vpc"
tags = {
Name = "${var.environment}-nat-eip-${count.index + 1}"
Environment = var.environment
}
}
# NAT Gateways
resource "aws_nat_gateway" "main" {
count = length(var.availability_zones)
allocation_id = aws_eip.nat[count.index].id
subnet_id = aws_subnet.public[count.index].id
tags = {
Name = "${var.environment}-nat-gateway-${count.index + 1}"
Environment = var.environment
}
depends_on = [aws_internet_gateway.main]
}
# Public Route Table
resource "aws_route_table" "public" {
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.main.id
}
tags = {
Name = "${var.environment}-public-rt"
Environment = var.environment
}
}
# Private Route Tables
resource "aws_route_table" "private" {
count = length(var.availability_zones)
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.main[count.index].id
}
tags = {
Name = "${var.environment}-private-rt-${count.index + 1}"
Environment = var.environment
}
}
# Route Table Associations - Public
resource "aws_route_table_association" "public" {
count = length(var.public_subnet_cidrs)
subnet_id = aws_subnet.public[count.index].id
route_table_id = aws_route_table.public.id
}
# Route Table Associations - Private
resource "aws_route_table_association" "private" {
count = length(var.private_subnet_cidrs)
subnet_id = aws_subnet.private[count.index].id
route_table_id = aws_route_table.private[count.index].id
}
# Route Table Associations - Database
resource "aws_route_table_association" "database" {
count = length(var.database_subnet_cidrs)
subnet_id = aws_subnet.database[count.index].id
route_table_id = aws_route_table.private[count.index].id
}
# VPC Flow Logs
resource "aws_flow_log" "main" {
iam_role_arn = aws_iam_role.flow_log.arn
log_destination = aws_cloudwatch_log_group.flow_log.arn
traffic_type = "ALL"
vpc_id = aws_vpc.main.id
tags = {
Name = "${var.environment}-flow-log"
Environment = var.environment
}
}
resource "aws_cloudwatch_log_group" "flow_log" {
name = "/aws/vpc/${var.environment}-flow-log"
retention_in_days = 30
tags = {
Name = "${var.environment}-flow-log"
Environment = var.environment
}
}
resource "aws_iam_role" "flow_log" {
name = "${var.environment}-vpc-flow-log-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "vpc-flow-logs.amazonaws.com"
}
}
]
})
tags = {
Name = "${var.environment}-flow-log-role"
Environment = var.environment
}
}
resource "aws_iam_role_policy" "flow_log" {
name = "${var.environment}-vpc-flow-log-policy"
role = aws_iam_role.flow_log.id
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents",
"logs:DescribeLogGroups",
"logs:DescribeLogStreams"
]
Effect = "Allow"
Resource = "${aws_cloudwatch_log_group.flow_log.arn}:*"
}
]
})
}

View File

@@ -0,0 +1,29 @@
output "vpc_id" {
description = "VPC ID"
value = aws_vpc.main.id
}
output "vpc_cidr" {
description = "VPC CIDR block"
value = aws_vpc.main.cidr_block
}
output "public_subnet_ids" {
description = "Public subnet IDs"
value = aws_subnet.public[*].id
}
output "private_subnet_ids" {
description = "Private subnet IDs"
value = aws_subnet.private[*].id
}
output "database_subnet_ids" {
description = "Database subnet IDs"
value = aws_subnet.database[*].id
}
output "nat_gateway_ids" {
description = "NAT Gateway IDs"
value = aws_nat_gateway.main[*].id
}

View File

@@ -0,0 +1,29 @@
variable "environment" {
description = "Environment name"
type = string
}
variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
}
variable "availability_zones" {
description = "List of availability zones"
type = list(string)
}
variable "private_subnet_cidrs" {
description = "CIDR blocks for private subnets"
type = list(string)
}
variable "public_subnet_cidrs" {
description = "CIDR blocks for public subnets"
type = list(string)
}
variable "database_subnet_cidrs" {
description = "CIDR blocks for database subnets"
type = list(string)
}

61
terraform/outputs.tf Normal file
View File

@@ -0,0 +1,61 @@
output "vpc_id" {
description = "VPC ID"
value = module.vpc.vpc_id
}
output "private_subnet_ids" {
description = "Private subnet IDs"
value = module.vpc.private_subnet_ids
}
output "public_subnet_ids" {
description = "Public subnet IDs"
value = module.vpc.public_subnet_ids
}
output "eks_cluster_name" {
description = "EKS cluster name"
value = module.eks.cluster_name
}
output "eks_cluster_endpoint" {
description = "EKS cluster endpoint"
value = module.eks.cluster_endpoint
}
output "eks_cluster_security_group_id" {
description = "EKS cluster security group ID"
value = module.eks.cluster_security_group_id
}
output "rds_endpoint" {
description = "RDS database endpoint"
value = module.rds.db_endpoint
sensitive = true
}
output "rds_database_name" {
description = "RDS database name"
value = module.rds.db_name
}
output "redis_endpoint" {
description = "Redis cluster endpoint"
value = module.redis.redis_endpoint
sensitive = true
}
output "alb_dns_name" {
description = "ALB DNS name"
value = module.alb.alb_dns_name
}
output "alb_zone_id" {
description = "ALB zone ID"
value = module.alb.alb_zone_id
}
output "configure_kubectl" {
description = "Command to configure kubectl"
value = "aws eks update-kubeconfig --name ${module.eks.cluster_name} --region ${var.aws_region}"
}

132
terraform/variables.tf Normal file
View File

@@ -0,0 +1,132 @@
variable "aws_region" {
description = "AWS region to deploy resources"
type = string
default = "us-east-1"
}
variable "environment" {
description = "Environment name (staging, production)"
type = string
}
variable "project_name" {
description = "Project name"
type = string
default = "spywatcher"
}
# VPC Variables
variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
default = "10.0.0.0/16"
}
variable "availability_zones" {
description = "List of availability zones"
type = list(string)
default = ["us-east-1a", "us-east-1b", "us-east-1c"]
}
variable "private_subnet_cidrs" {
description = "CIDR blocks for private subnets"
type = list(string)
default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}
variable "public_subnet_cidrs" {
description = "CIDR blocks for public subnets"
type = list(string)
default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}
variable "database_subnet_cidrs" {
description = "CIDR blocks for database subnets"
type = list(string)
default = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]
}
# EKS Variables
variable "eks_cluster_version" {
description = "Kubernetes version for EKS cluster"
type = string
default = "1.28"
}
variable "eks_node_groups" {
description = "EKS node groups configuration"
type = map(object({
desired_size = number
min_size = number
max_size = number
instance_types = list(string)
capacity_type = string
}))
default = {
general = {
desired_size = 3
min_size = 2
max_size = 10
instance_types = ["t3.medium"]
capacity_type = "ON_DEMAND"
}
}
}
# RDS Variables
variable "rds_engine_version" {
description = "PostgreSQL engine version"
type = string
default = "15.3"
}
variable "rds_instance_class" {
description = "RDS instance class"
type = string
default = "db.t3.medium"
}
variable "rds_allocated_storage" {
description = "Allocated storage in GB"
type = number
default = 100
}
variable "database_name" {
description = "Name of the database"
type = string
default = "spywatcher"
}
variable "database_username" {
description = "Database master username"
type = string
default = "spywatcher"
}
# Redis Variables
variable "redis_node_type" {
description = "ElastiCache node type"
type = string
default = "cache.t3.medium"
}
variable "redis_num_cache_nodes" {
description = "Number of cache nodes"
type = number
default = 1
}
# SSL Certificate
variable "certificate_arn" {
description = "ARN of SSL certificate for ALB"
type = string
default = ""
}
# Tags
variable "tags" {
description = "Additional tags for resources"
type = map(string)
default = {}
}