Add production deployment infrastructure with Kubernetes, Terraform, and multi-strategy CI/CD (#145)

* Initial plan * Add Kubernetes manifests and Terraform infrastructure modules Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Add Helm charts, deployment scripts, CI/CD workflows, and documentation Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Add infrastructure documentation and update README Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Fix code review issues and security vulnerabilities Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Address PR review comments: improve security, fix API versions, and enhance deployment reliability Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
2025-11-02 17:27:49 -06:00
parent ca1f33b734
commit d3111dfbdf
56 changed files with 5663 additions and 0 deletions
--- a/.github/workflows/deploy-production.yml
+++ b/.github/workflows/deploy-production.yml
@@ -0,0 +1,291 @@
+name: Deploy to Production
+
+on:
+  push:
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      deployment_strategy:
+        description: 'Deployment strategy'
+        required: true
+        default: 'rolling'
+        type: choice
+        options:
+          - rolling
+          - blue-green
+          - canary
+
+env:
+  AWS_REGION: us-east-1
+  EKS_CLUSTER_NAME: spywatcher-production
+  REGISTRY: ghcr.io
+  IMAGE_NAME_BACKEND: ${{ github.repository_owner }}/spywatcher-backend
+  IMAGE_NAME_FRONTEND: ${{ github.repository_owner }}/spywatcher-frontend
+
+jobs:
+  build-and-push:
+    name: Build and Push Docker Images
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    
+    outputs:
+      backend-tag: ${{ steps.meta-backend.outputs.tags }}
+      frontend-tag: ${{ steps.meta-frontend.outputs.tags }}
+    
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      
+      - name: Extract metadata for backend
+        id: meta-backend
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}
+          tags: |
+            type=sha,prefix={{branch}}-
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+            type=raw,value=latest,enable={{is_default_branch}}
+      
+      - name: Build and push backend image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile
+          push: true
+          tags: ${{ steps.meta-backend.outputs.tags }}
+          labels: ${{ steps.meta-backend.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+      
+      - name: Extract metadata for frontend
+        id: meta-frontend
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}
+          tags: |
+            type=sha,prefix={{branch}}-
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+            type=raw,value=latest,enable={{is_default_branch}}
+      
+      - name: Build and push frontend image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./frontend
+          file: ./frontend/Dockerfile
+          push: true
+          tags: ${{ steps.meta-frontend.outputs.tags }}
+          labels: ${{ steps.meta-frontend.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  deploy:
+    name: Deploy to Kubernetes
+    runs-on: ubuntu-latest
+    needs: build-and-push
+    environment: production
+    permissions:
+      contents: read
+      id-token: write  # Required for AWS OIDC authentication
+    
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      
+      - name: Update kubeconfig
+        run: |
+          aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
+      
+      - name: Install kubectl
+        uses: azure/setup-kubectl@v4
+        with:
+          version: 'v1.28.0'
+      
+      - name: Verify cluster access
+        run: |
+          kubectl cluster-info
+          kubectl get nodes
+      
+      - name: Run database migrations
+        run: |
+          # Create unique migration job name
+          JOB_NAME="db-migration-$(date +%s)"
+          
+          # Update the migration job manifest with unique name and latest image
+          kubectl get job spywatcher-db-migration -n spywatcher -o yaml 2>/dev/null | \
+            sed "s/name: spywatcher-db-migration/name: $JOB_NAME/" | \
+            sed "s|image: ghcr.io/subculture-collective/spywatcher-backend:.*|image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest|" | \
+            kubectl apply -f - || \
+          kubectl create job $JOB_NAME --image=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
+            -n spywatcher -- sh -c "npx prisma migrate deploy"
+          
+          # Set DATABASE_URL secret for the job if created via kubectl create
+          kubectl set env job/$JOB_NAME -n spywatcher --from=secret/spywatcher-secrets DATABASE_URL=database-url || true
+          
+          # Wait for migration to complete
+          kubectl wait --for=condition=complete --timeout=300s job/$JOB_NAME -n spywatcher
+          
+          # Show migration logs
+          kubectl logs job/$JOB_NAME -n spywatcher
+      
+      - name: Deploy with Rolling Update
+        if: github.event.inputs.deployment_strategy == 'rolling' || github.event.inputs.deployment_strategy == ''
+        run: |
+          # Update backend deployment
+          kubectl set image deployment/spywatcher-backend \
+            backend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
+            -n spywatcher
+          
+          # Update frontend deployment
+          kubectl set image deployment/spywatcher-frontend \
+            frontend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}:latest \
+            -n spywatcher
+          
+          # Wait for rollout to complete
+          kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=10m
+          kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=10m
+      
+      - name: Deploy with Blue-Green
+        if: github.event.inputs.deployment_strategy == 'blue-green'
+        run: |
+          chmod +x ./scripts/deployment/blue-green-deploy.sh
+          IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
+      
+      - name: Deploy with Canary
+        if: github.event.inputs.deployment_strategy == 'canary'
+        run: |
+          chmod +x ./scripts/deployment/canary-deploy.sh
+          IMAGE_TAG=latest ./scripts/deployment/canary-deploy.sh
+      
+      - name: Run smoke tests
+        run: |
+          # Test via ingress if available, otherwise use port-forward
+          INGRESS_HOST=$(kubectl get ingress spywatcher-ingress -n spywatcher -o jsonpath='{.spec.rules[1].host}' 2>/dev/null || echo "")
+          
+          if [ -n "$INGRESS_HOST" ]; then
+            echo "Testing via ingress: $INGRESS_HOST"
+            BACKEND_URL="https://${INGRESS_HOST}"
+            
+            # Test health endpoints
+            echo "Testing liveness endpoint..."
+            curl -f "${BACKEND_URL}/health/live" || exit 1
+            
+            echo "Testing readiness endpoint..."
+            curl -f "${BACKEND_URL}/health/ready" || exit 1
+          else
+            echo "No ingress found, testing via port-forward"
+            # Port-forward backend service to localhost:8080
+            kubectl port-forward svc/spywatcher-backend 8080:80 -n spywatcher &
+            PORT_FORWARD_PID=$!
+            
+            # Wait for port-forward to be ready
+            sleep 5
+            
+            # Test health endpoints
+            echo "Testing liveness endpoint..."
+            curl -f "http://localhost:8080/health/live" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
+            
+            echo "Testing readiness endpoint..."
+            curl -f "http://localhost:8080/health/ready" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
+            
+            # Kill port-forward process
+            kill $PORT_FORWARD_PID 2>/dev/null
+          fi
+          
+          echo "Smoke tests passed!"
+      
+      - name: Verify deployment
+        run: |
+          echo "=== Deployment Status ==="
+          kubectl get deployments -n spywatcher
+          kubectl get pods -n spywatcher
+          kubectl get services -n spywatcher
+          
+          echo "=== Recent Events ==="
+          kubectl get events -n spywatcher --sort-by='.lastTimestamp' | tail -20
+      
+      - name: Rollback on failure
+        if: failure()
+        run: |
+          echo "Deployment failed, rolling back..."
+          kubectl rollout undo deployment/spywatcher-backend -n spywatcher
+          kubectl rollout undo deployment/spywatcher-frontend -n spywatcher
+          
+          kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=5m
+          kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=5m
+      
+      - name: Notify on success
+        if: success()
+        run: |
+          echo "✅ Production deployment successful!"
+          echo "Deployed commit: ${{ github.sha }}"
+          echo "Deployment strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}"
+      
+      - name: Notify on failure
+        if: failure()
+        uses: 8398a7/action-slack@v3
+        with:
+          status: failure
+          text: |
+            Production deployment failed!
+            Commit: ${{ github.sha }}
+            Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}
+          webhook_url: ${{ secrets.SLACK_WEBHOOK }}
+        continue-on-error: true
+
+  post-deployment:
+    name: Post-Deployment Tasks
+    runs-on: ubuntu-latest
+    needs: deploy
+    if: success()
+    permissions:
+      contents: read
+    
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      
+      - name: Update kubeconfig
+        run: |
+          aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
+      
+      - name: Clean up old resources
+        run: |
+          # Clean up completed migration jobs older than 1 hour
+          kubectl delete jobs -n spywatcher --field-selector status.successful=1 \
+            --ignore-not-found=true || true
+          
+          # Clean up old replica sets
+          kubectl delete replicaset -n spywatcher --field-selector status.replicas=0 \
+            --ignore-not-found=true || true
+      
+      - name: Update deployment documentation
+        run: |
+          echo "Deployment completed at $(date)" >> deployment-log.txt
+          echo "Commit: ${{ github.sha }}" >> deployment-log.txt
+          echo "Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}" >> deployment-log.txt
+          echo "---" >> deployment-log.txt
--- a/DEPLOYMENT.md
+++ b/DEPLOYMENT.md
@@ -0,0 +1,413 @@
+# Deployment Guide
+
+This document describes the production deployment strategy for Spywatcher, including infrastructure setup, deployment procedures, and rollback strategies.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Infrastructure Setup](#infrastructure-setup)
+- [Deployment Strategies](#deployment-strategies)
+- [Kubernetes Deployment](#kubernetes-deployment)
+- [Terraform Infrastructure](#terraform-infrastructure)
+- [Helm Charts](#helm-charts)
+- [CI/CD Pipeline](#cicd-pipeline)
+- [Rollback Procedures](#rollback-procedures)
+- [Monitoring and Alerts](#monitoring-and-alerts)
+- [Troubleshooting](#troubleshooting)
+
+## Overview
+
+Spywatcher uses a multi-strategy deployment approach with:
+
+- **Infrastructure as Code**: Terraform for AWS infrastructure
+- **Container Orchestration**: Kubernetes (EKS) for application deployment
+- **Package Management**: Helm charts for simplified deployments
+- **Deployment Strategies**: Rolling, Blue-Green, and Canary deployments
+- **CI/CD**: GitHub Actions for automated deployments
+
+## Infrastructure Setup
+
+### Prerequisites
+
+1. AWS Account with appropriate permissions
+2. AWS CLI configured
+3. kubectl installed
+4. Terraform installed (>= 1.5.0)
+5. Helm installed (>= 3.0)
+
+### Terraform Infrastructure
+
+The infrastructure is defined in Terraform modules:
+
+```bash
+cd terraform
+
+# Initialize Terraform
+terraform init
+
+# Review the plan
+terraform plan -var-file="environments/production/terraform.tfvars"
+
+# Apply infrastructure
+terraform apply -var-file="environments/production/terraform.tfvars"
+```
+
+#### Infrastructure Components
+
+- **VPC**: Isolated network with public, private, and database subnets across 3 AZs
+- **EKS Cluster**: Kubernetes cluster with managed node groups
+- **RDS PostgreSQL**: Managed database with encryption and automated backups
+- **ElastiCache Redis**: In-memory cache with cluster mode
+- **Application Load Balancer**: With WAF for security
+- **Security Groups**: Least-privilege network access
+- **IAM Roles**: Service accounts and node permissions
+
+### Configure kubectl
+
+After infrastructure deployment:
+
+```bash
+aws eks update-kubeconfig --name spywatcher-production --region us-east-1
+kubectl cluster-info
+```
+
+## Deployment Strategies
+
+### Rolling Deployment (Default)
+
+Updates pods gradually, maintaining service availability.
+
+```bash
+# Triggered automatically on push to main branch
+# Or manually via GitHub Actions UI
+```
+
+**Advantages:**
+- Simple and predictable
+- Zero downtime
+- Automatic rollback on failure
+
+**Disadvantages:**
+- Gradual rollout may take time
+- Both versions run simultaneously during update
+
+### Blue-Green Deployment
+
+Maintains two identical environments, switching traffic instantly.
+
+```bash
+# Via GitHub Actions
+# Select "blue-green" as deployment strategy
+
+# Or manually
+IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
+
+# Rollback if needed
+./scripts/deployment/blue-green-deploy.sh --rollback
+```
+
+**Advantages:**
+- Instant traffic switch
+- Easy rollback
+- Full environment testing before switch
+
+**Disadvantages:**
+- Requires double resources temporarily
+- Database migrations must be compatible with both versions
+
+### Canary Deployment
+
+Gradually shifts traffic to new version while monitoring metrics.
+
+```bash
+# Via GitHub Actions
+# Select "canary" as deployment strategy
+
+# Or manually
+IMAGE_TAG=latest CANARY_STEPS="5 25 50 100" ./scripts/deployment/canary-deploy.sh
+```
+
+**Advantages:**
+- Risk mitigation through gradual rollout
+- Real-world testing with subset of users
+- Automated rollback on errors
+
+**Disadvantages:**
+- Longer deployment time
+- Requires robust monitoring
+
+## Kubernetes Deployment
+
+### Using Kustomize
+
+Deploy to different environments:
+
+```bash
+# Production
+kubectl apply -k k8s/overlays/production
+
+# Staging
+kubectl apply -k k8s/overlays/staging
+
+# Development (base)
+kubectl apply -k k8s/base
+```
+
+### Manual Deployment
+
+```bash
+# Create namespace
+kubectl apply -f k8s/base/namespace.yaml
+
+# Apply configurations
+kubectl apply -f k8s/base/configmap.yaml
+kubectl apply -f k8s/base/secrets.yaml
+
+# Deploy databases
+kubectl apply -f k8s/base/postgres-statefulset.yaml
+kubectl apply -f k8s/base/redis-statefulset.yaml
+
+# Deploy applications
+kubectl apply -f k8s/base/backend-deployment.yaml
+kubectl apply -f k8s/base/frontend-deployment.yaml
+
+# Create services
+kubectl apply -f k8s/base/backend-service.yaml
+kubectl apply -f k8s/base/frontend-service.yaml
+
+# Configure ingress
+kubectl apply -f k8s/base/ingress.yaml
+```
+
+### Scaling
+
+```bash
+# Manual scaling
+kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
+
+# Auto-scaling is configured via HPA
+kubectl get hpa -n spywatcher
+```
+
+## Helm Charts
+
+### Installation
+
+```bash
+# Install with default values
+helm install spywatcher ./helm/spywatcher -n spywatcher --create-namespace
+
+# Install with custom values
+helm install spywatcher ./helm/spywatcher \
+  -n spywatcher \
+  --create-namespace \
+  -f helm/spywatcher/values-production.yaml
+```
+
+### Upgrade
+
+```bash
+helm upgrade spywatcher ./helm/spywatcher -n spywatcher
+```
+
+### Rollback
+
+```bash
+# List releases
+helm history spywatcher -n spywatcher
+
+# Rollback to previous version
+helm rollback spywatcher -n spywatcher
+
+# Rollback to specific revision
+helm rollback spywatcher 2 -n spywatcher
+```
+
+## CI/CD Pipeline
+
+### GitHub Actions Workflow
+
+The deployment pipeline is triggered by:
+
+1. Push to `main` branch (automatic)
+2. Manual workflow dispatch
+
+#### Pipeline Steps
+
+1. **Build and Push**
+   - Build Docker images for backend and frontend
+   - Push to GitHub Container Registry
+   - Tag with commit SHA and latest
+
+2. **Database Migration**
+   - Run Prisma migrations
+   - Verify migration success
+
+3. **Deploy**
+   - Apply selected deployment strategy
+   - Update Kubernetes deployments
+   - Monitor rollout status
+
+4. **Smoke Tests**
+   - Health check endpoints
+   - Basic functionality tests
+
+5. **Rollback on Failure**
+   - Automatic rollback if deployment fails
+   - Notification to team
+
+### Required Secrets
+
+Configure in GitHub repository settings:
+
+```
+AWS_ACCESS_KEY_ID
+AWS_SECRET_ACCESS_KEY
+DATABASE_URL
+REDIS_URL
+JWT_SECRET
+JWT_REFRESH_SECRET
+DISCORD_BOT_TOKEN
+DISCORD_CLIENT_ID
+DISCORD_CLIENT_SECRET
+SLACK_WEBHOOK (optional)
+```
+
+## Rollback Procedures
+
+### Kubernetes Rollback
+
+```bash
+# View rollout history
+kubectl rollout history deployment/spywatcher-backend -n spywatcher
+
+# Rollback to previous version
+kubectl rollout undo deployment/spywatcher-backend -n spywatcher
+
+# Rollback to specific revision
+kubectl rollout undo deployment/spywatcher-backend --to-revision=2 -n spywatcher
+
+# Check rollback status
+kubectl rollout status deployment/spywatcher-backend -n spywatcher
+```
+
+### Blue-Green Rollback
+
+```bash
+./scripts/deployment/blue-green-deploy.sh --rollback
+```
+
+### Database Rollback
+
+```bash
+# If migration needs to be rolled back
+kubectl exec -it deployment/spywatcher-backend -n spywatcher -- npx prisma migrate resolve --rolled-back <migration_name>
+```
+
+## Monitoring and Alerts
+
+### Health Checks
+
+```bash
+# Liveness probe
+curl https://api.spywatcher.example.com/health/live
+
+# Readiness probe
+curl https://api.spywatcher.example.com/health/ready
+```
+
+### Kubernetes Monitoring
+
+```bash
+# Check pod status
+kubectl get pods -n spywatcher
+
+# View pod logs
+kubectl logs -f deployment/spywatcher-backend -n spywatcher
+
+# Check events
+kubectl get events -n spywatcher --sort-by='.lastTimestamp'
+
+# Resource usage
+kubectl top pods -n spywatcher
+kubectl top nodes
+```
+
+### CloudWatch Metrics
+
+Monitor via AWS CloudWatch:
+- EKS cluster metrics
+- RDS performance metrics
+- ElastiCache metrics
+- ALB request metrics
+
+## Troubleshooting
+
+### Pod Not Starting
+
+```bash
+# Describe pod to see events
+kubectl describe pod <pod-name> -n spywatcher
+
+# Check logs
+kubectl logs <pod-name> -n spywatcher
+
+# Check resource constraints
+kubectl describe node <node-name>
+```
+
+### Database Connection Issues
+
+```bash
+# Verify database secret
+kubectl get secret spywatcher-secrets -n spywatcher -o yaml
+
+# Test database connection
+kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -n spywatcher -- \
+  psql -h <rds-endpoint> -U spywatcher -d spywatcher
+```
+
+### Traffic Not Routing
+
+```bash
+# Check service endpoints
+kubectl get endpoints -n spywatcher
+
+# Check ingress
+kubectl describe ingress spywatcher-ingress -n spywatcher
+
+# Check ALB target groups
+aws elbv2 describe-target-health --target-group-arn <arn>
+```
+
+### High Resource Usage
+
+```bash
+# Check HPA status
+kubectl get hpa -n spywatcher
+
+# Scale manually if needed
+kubectl scale deployment spywatcher-backend --replicas=10 -n spywatcher
+
+# Check resource limits
+kubectl describe deployment spywatcher-backend -n spywatcher
+```
+
+## Best Practices
+
+1. **Always test in staging first**
+2. **Run database migrations before deploying code**
+3. **Use feature flags for risky changes**
+4. **Monitor error rates during deployment**
+5. **Keep rollback scripts ready**
+6. **Document all configuration changes**
+7. **Regular backup testing**
+8. **Security patches applied promptly**
+
+## Support
+
+For deployment issues:
+- Check GitHub Actions logs
+- Review CloudWatch logs
+- Contact DevOps team
+- Create incident in issue tracker
--- a/INFRASTRUCTURE.md
+++ b/INFRASTRUCTURE.md
@@ -0,0 +1,351 @@
+# Infrastructure Overview
+
+## Architecture Diagram
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                          AWS Cloud                               │
+│                                                                   │
+│  ┌──────────────────────────────────────────────────────────┐   │
+│  │                    VPC (10.0.0.0/16)                     │   │
+│  │                                                           │   │
+│  │  ┌────────────────────────────────────────────────────┐  │   │
+│  │  │           Application Load Balancer (ALB)          │  │   │
+│  │  │              with WAF Protection                    │  │   │
+│  │  └──────────────────┬─────────────────────────────────┘  │   │
+│  │                     │                                     │   │
+│  │  ┌──────────────────┴────────────────────────┐           │   │
+│  │  │          EKS Cluster (Kubernetes)         │           │   │
+│  │  │                                            │           │   │
+│  │  │  ┌────────────────┐  ┌─────────────────┐ │           │   │
+│  │  │  │   Backend      │  │   Frontend      │ │           │   │
+│  │  │  │   Pods (3)     │  │   Pods (2)      │ │           │   │
+│  │  │  │                │  │                 │ │           │   │
+│  │  │  │ - Auto-scaling │  │ - Auto-scaling  │ │           │   │
+│  │  │  │ - Health checks│  │ - Health checks │ │           │   │
+│  │  │  └────────┬───────┘  └────────┬────────┘ │           │   │
+│  │  │           │                   │          │           │   │
+│  │  │           └───────┬───────────┘          │           │   │
+│  │  │                   │                      │           │   │
+│  │  └───────────────────┼──────────────────────┘           │   │
+│  │                      │                                  │   │
+│  │  ┌───────────────────┼──────────────────────────────┐  │   │
+│  │  │  Database Subnets │                              │  │   │
+│  │  │                   │                              │  │   │
+│  │  │  ┌────────────────▼────────┐  ┌───────────────┐ │  │   │
+│  │  │  │  RDS PostgreSQL 15      │  │ ElastiCache   │ │  │   │
+│  │  │  │                         │  │ Redis         │ │  │   │
+│  │  │  │ - Multi-AZ              │  │               │ │  │   │
+│  │  │  │ - Encrypted             │  │ - Encrypted   │ │  │   │
+│  │  │  │ - Automated Backups     │  │ - Failover    │ │  │   │
+│  │  │  └─────────────────────────┘  └───────────────┘ │  │   │
+│  │  └──────────────────────────────────────────────────┘  │   │
+│  │                                                         │   │
+│  └─────────────────────────────────────────────────────────┘   │
+│                                                                 │
+│  ┌──────────────────┐  ┌──────────────────┐                   │
+│  │  CloudWatch      │  │  Secrets Manager │                   │
+│  │  Monitoring      │  │  Credentials     │                   │
+│  └──────────────────┘  └──────────────────┘                   │
+│                                                                 │
+└─────────────────────────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────────────────────────┐
+│                        GitHub Actions                            │
+│                                                                   │
+│  Build → Test → Deploy → Smoke Tests → Monitor                  │
+│                                                                   │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## Components
+
+### Compute
+- **EKS Cluster**: Managed Kubernetes cluster (v1.28)
+- **Node Groups**: Auto-scaling EC2 instances (t3.large)
+- **Pods**: Containerized applications with health checks
+
+### Networking
+- **VPC**: Isolated network (10.0.0.0/16)
+- **Subnets**: Public, Private, and Database across 3 AZs
+- **NAT Gateways**: Internet access for private subnets
+- **ALB**: HTTPS termination and routing
+
+### Data Storage
+- **RDS PostgreSQL**: Managed database (15.3)
+  - Multi-AZ for high availability
+  - Automated backups (7 days retention)
+  - Encryption at rest (KMS)
+  
+- **ElastiCache Redis**: In-memory cache (7.0)
+  - Authentication token
+  - Encryption in transit
+  - Automatic failover
+
+### Security
+- **WAF**: Web Application Firewall with rate limiting
+- **Security Groups**: Network-level access control
+- **IAM Roles**: Fine-grained permissions
+- **Secrets Manager**: Secure credential storage
+- **TLS/SSL**: End-to-end encryption
+
+### Monitoring
+- **CloudWatch**: Metrics, logs, and alarms
+- **Health Checks**: Liveness and readiness probes
+- **Resource Metrics**: CPU, memory, network usage
+
+## Resource Sizing
+
+### Production Environment
+
+| Component | Type | Specs | Replicas | Scaling |
+|-----------|------|-------|----------|---------|
+| Backend | Pod | 512Mi RAM, 500m CPU | 3 | 2-10 |
+| Frontend | Pod | 128Mi RAM, 100m CPU | 2 | 2-5 |
+| PostgreSQL | RDS | db.t3.large | 1 (Multi-AZ) | Manual |
+| Redis | ElastiCache | cache.t3.medium | 2 | Manual |
+| EKS Nodes | EC2 | t3.large | 3 | 2-10 |
+
+### Staging Environment
+
+| Component | Type | Specs | Replicas | Scaling |
+|-----------|------|-------|----------|---------|
+| Backend | Pod | 256Mi RAM, 250m CPU | 1 | 1-3 |
+| Frontend | Pod | 128Mi RAM, 100m CPU | 1 | 1-2 |
+| PostgreSQL | RDS | db.t3.medium | 1 | N/A |
+| Redis | ElastiCache | cache.t3.small | 1 | N/A |
+| EKS Nodes | EC2 | t3.medium | 2 | 1-4 |
+
+## Cost Estimation
+
+### Monthly Costs (US East 1)
+
+#### Production
+- EKS Cluster: $73
+- EC2 Nodes (3x t3.large): ~$150
+- RDS PostgreSQL (db.t3.large, Multi-AZ): ~$290
+- ElastiCache Redis (cache.t3.medium x2): ~$100
+- ALB: ~$25
+- Data Transfer: ~$50
+- Backups & Monitoring: ~$30
+
+**Total: ~$718/month**
+
+#### Staging
+- EKS Cluster: $73
+- EC2 Nodes (2x t3.medium): ~$60
+- RDS PostgreSQL (db.t3.medium): ~$70
+- ElastiCache Redis (cache.t3.small): ~$25
+- ALB: ~$25
+- Data Transfer: ~$20
+
+**Total: ~$273/month**
+
+*Note: Costs are estimates and may vary based on usage*
+
+## Deployment Strategies
+
+### 1. Rolling Update (Default)
+- **Use Case**: Standard deployments
+- **Downtime**: Zero
+- **Risk**: Low
+- **Duration**: 5-10 minutes
+
+### 2. Blue-Green
+- **Use Case**: Major releases, critical changes
+- **Downtime**: Zero
+- **Risk**: Very Low (instant rollback)
+- **Duration**: 10-15 minutes
+
+### 3. Canary
+- **Use Case**: High-risk changes, gradual rollout
+- **Downtime**: Zero
+- **Risk**: Minimal (gradual exposure)
+- **Duration**: 30-60 minutes
+
+## High Availability
+
+### Application Layer
+- Multiple replicas across availability zones
+- Pod anti-affinity rules
+- Pod disruption budgets (min 1 available)
+- Health checks with automatic restart
+
+### Database Layer
+- Multi-AZ deployment for RDS
+- Automated failover (< 60 seconds)
+- Read replicas for scaling (optional)
+- Point-in-time recovery
+
+### Network Layer
+- Multi-AZ load balancing
+- Health checks on targets
+- Automatic target deregistration
+- DDoS protection (AWS Shield)
+
+## Disaster Recovery
+
+### RTO (Recovery Time Objective)
+- Application: < 5 minutes
+- Database: < 1 minute (automated failover)
+- Full Infrastructure: < 30 minutes (Terraform redeploy)
+
+### RPO (Recovery Point Objective)
+- Database: < 5 minutes (automated backups)
+- Application: 0 (stateless, recreatable)
+
+### Backup Strategy
+- **Database**: Daily automated backups (7 days retention)
+- **Configuration**: Git repository (versioned)
+- **Infrastructure**: Terraform state (versioned in S3)
+
+## Security Measures
+
+### Network Security
+- Private subnets for application and database
+- Security groups with least-privilege rules
+- Network ACLs
+- VPC Flow Logs
+
+### Application Security
+- Containers run as non-root
+- Read-only root filesystems where possible
+- No privilege escalation
+- Security scanning in CI/CD
+
+### Data Security
+- Encryption at rest (KMS)
+- Encryption in transit (TLS 1.2+)
+- Secrets stored in AWS Secrets Manager
+- Database credentials auto-rotated
+
+### Access Control
+- IAM roles with least privilege
+- RBAC in Kubernetes
+- MFA for admin access
+- Audit logging enabled
+
+## Scaling Strategy
+
+### Horizontal Scaling
+- **Triggers**:
+  - CPU > 70%
+  - Memory > 80%
+  - Custom metrics (request rate)
+  
+- **Limits**:
+  - Backend: 2-10 pods
+  - Frontend: 2-5 pods
+  - Nodes: 2-10 instances
+
+### Vertical Scaling
+- Database: Manual scaling with downtime
+- Redis: Manual scaling with failover
+- Pods: Update resource limits and restart
+
+## Monitoring Strategy
+
+### Application Metrics
+- Request rate and latency
+- Error rate
+- Active connections
+- Cache hit rate
+
+### Infrastructure Metrics
+- CPU utilization
+- Memory utilization
+- Network throughput
+- Disk I/O
+
+### Business Metrics
+- Active users
+- API usage per tier
+- Feature usage
+- User sessions
+
+### Alerting
+- Critical: Page immediately
+  - Service down
+  - Database unavailable
+  - High error rate
+  
+- Warning: Notify during business hours
+  - High CPU/memory
+  - Low disk space
+  - Elevated response time
+
+## Maintenance Windows
+
+### Planned Maintenance
+- **Schedule**: Sundays 02:00-04:00 UTC
+- **Notification**: 7 days advance notice
+- **Activities**:
+  - OS patches
+  - Database maintenance
+  - Kubernetes upgrades
+  - SSL certificate renewal
+
+### Emergency Maintenance
+- Immediate security patches
+- Critical bug fixes
+- Infrastructure failures
+
+## Compliance & Governance
+
+### Tagging Strategy
+All resources tagged with:
+- `Environment`: production/staging
+- `Project`: spywatcher
+- `ManagedBy`: terraform
+- `CostCenter`: engineering
+
+### Resource Naming
+- Pattern: `{project}-{environment}-{resource}`
+- Example: `spywatcher-production-backend`
+
+### Access Audit
+- CloudTrail enabled
+- Quarterly access review
+- Regular security audits
+
+## Quick Reference
+
+### Useful Commands
+
+```bash
+# Check cluster status
+kubectl cluster-info
+kubectl get nodes
+
+# View application status
+kubectl get all -n spywatcher
+
+# View logs
+kubectl logs -f deployment/spywatcher-backend -n spywatcher
+
+# Scale application
+kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
+
+# Rollback deployment
+kubectl rollout undo deployment/spywatcher-backend -n spywatcher
+
+# Database backup
+aws rds create-db-snapshot --db-instance-identifier spywatcher-production
+
+# View CloudWatch alarms
+aws cloudwatch describe-alarms --state-value ALARM
+```
+
+### Important URLs
+
+- Production: https://spywatcher.example.com
+- API: https://api.spywatcher.example.com
+- Staging: https://staging.spywatcher.example.com
+- Grafana: https://grafana.spywatcher.example.com
+- AWS Console: https://console.aws.amazon.com
+
+### Support Contacts
+
+- On-Call: oncall@spywatcher.example.com
+- DevOps: devops@spywatcher.example.com
+- Security: security@spywatcher.example.com
--- a/README.md
+++ b/README.md
@@ -453,6 +453,69 @@ Git hooks are automatically installed when you run `npm install` in the root dir

 See [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines.

+## 🚀 Production Deployment
+
+Spywatcher includes comprehensive production deployment infrastructure with Kubernetes, Terraform, and CI/CD automation.
+
+### Deployment Strategies
+
+- **Rolling Updates**: Zero-downtime gradual deployment (default)
+- **Blue-Green**: Instant traffic switching with quick rollback
+- **Canary**: Gradual rollout with automated error detection
+
+### Infrastructure as Code
+
+- **Terraform**: Complete AWS infrastructure modules
+  - VPC with multi-AZ setup
+  - EKS Kubernetes cluster
+  - RDS PostgreSQL (Multi-AZ, encrypted)
+  - ElastiCache Redis (encrypted, failover)
+  - Application Load Balancer with WAF
+- **Kubernetes**: Production-ready manifests
+  - Auto-scaling with HorizontalPodAutoscaler
+  - Health checks and pod disruption budgets
+  - Security contexts and network policies
+- **Helm Charts**: Simplified deployment and configuration
+
+### Quick Deployment
+
+```bash
+# Deploy with Terraform
+cd terraform
+terraform init
+terraform apply -var-file="environments/production/terraform.tfvars"
+
+# Deploy with Kubernetes
+kubectl apply -k k8s/overlays/production
+
+# Deploy with Helm
+helm install spywatcher ./helm/spywatcher -n spywatcher
+
+# Blue-green deployment
+./scripts/deployment/blue-green-deploy.sh
+
+# Canary deployment
+./scripts/deployment/canary-deploy.sh
+```
+
+### Documentation
+
+- **[DEPLOYMENT.md](./DEPLOYMENT.md)** - Complete deployment guide
+- **[INFRASTRUCTURE.md](./INFRASTRUCTURE.md)** - Architecture overview
+- **[terraform/README.md](./terraform/README.md)** - Infrastructure as Code guide
+- **[k8s/README.md](./k8s/README.md)** - Kubernetes manifests guide
+
+### CI/CD Pipeline
+
+GitHub Actions workflows for automated deployment:
+- Docker image building and pushing to GHCR
+- Database migrations
+- Multiple deployment strategy support
+- Automated smoke tests and health checks
+- Rollback on failure
+
+See [.github/workflows/deploy-production.yml](./.github/workflows/deploy-production.yml) for the complete pipeline.
+
 ## 👥 Contributions

 See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines on contributing to this project.
--- a/helm/spywatcher/Chart.yaml
+++ b/helm/spywatcher/Chart.yaml
@@ -0,0 +1,15 @@
+apiVersion: v2
+name: spywatcher
+description: A Helm chart for Spywatcher Discord surveillance and analytics application
+type: application
+version: 1.0.0
+appVersion: "1.0.0"
+keywords:
+  - discord
+  - monitoring
+  - analytics
+maintainers:
+  - name: Spywatcher Team
+home: https://github.com/subculture-collective/discord-spywatcher
+sources:
+  - https://github.com/subculture-collective/discord-spywatcher
--- a/helm/spywatcher/templates/_helpers.tpl
+++ b/helm/spywatcher/templates/_helpers.tpl
@@ -0,0 +1,65 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "spywatcher.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+*/}}
+{{- define "spywatcher.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "spywatcher.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "spywatcher.labels" -}}
+helm.sh/chart: {{ include "spywatcher.chart" . }}
+{{ include "spywatcher.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "spywatcher.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "spywatcher.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Backend labels
+*/}}
+{{- define "spywatcher.backend.labels" -}}
+{{ include "spywatcher.labels" . }}
+app.kubernetes.io/component: backend
+{{- end }}
+
+{{/*
+Frontend labels
+*/}}
+{{- define "spywatcher.frontend.labels" -}}
+{{ include "spywatcher.labels" . }}
+app.kubernetes.io/component: frontend
+{{- end }}
--- a/helm/spywatcher/templates/configmap.yaml
+++ b/helm/spywatcher/templates/configmap.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "spywatcher.fullname" . }}-config
+  namespace: {{ .Values.namespace }}
+  labels:
+    {{- include "spywatcher.labels" . | nindent 4 }}
+data:
+  {{- range $key, $value := .Values.configMap.data }}
+  {{ $key }}: {{ $value | quote }}
+  {{- end }}
--- a/helm/spywatcher/templates/namespace.yaml
+++ b/helm/spywatcher/templates/namespace.yaml
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {{ .Values.namespace }}
+  labels:
+    {{- include "spywatcher.labels" . | nindent 4 }}
--- a/helm/spywatcher/templates/secrets.yaml
+++ b/helm/spywatcher/templates/secrets.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "spywatcher.fullname" . }}-secrets
+  namespace: {{ .Values.namespace }}
+  labels:
+    {{- include "spywatcher.labels" . | nindent 4 }}
+type: Opaque
+stringData:
+  database-url: {{ .Values.secrets.databaseUrl | quote }}
+  redis-url: {{ .Values.secrets.redisUrl | quote }}
+  jwt-secret: {{ .Values.secrets.jwtSecret | quote }}
+  jwt-refresh-secret: {{ .Values.secrets.jwtRefreshSecret | quote }}
+  discord-bot-token: {{ .Values.secrets.discordBotToken | quote }}
+  discord-client-id: {{ .Values.secrets.discordClientId | quote }}
+  discord-client-secret: {{ .Values.secrets.discordClientSecret | quote }}
+  discord-guild-id: {{ .Values.secrets.discordGuildId | quote }}
+  discord-redirect-uri: {{ .Values.secrets.discordRedirectUri | quote }}
+  admin-discord-ids: {{ .Values.secrets.adminDiscordIds | quote }}
+  bot-guild-ids: {{ .Values.secrets.botGuildIds | quote }}
--- a/helm/spywatcher/values-production.yaml
+++ b/helm/spywatcher/values-production.yaml
@@ -0,0 +1,115 @@
+# Production Environment Values
+# Override default values for production deployment
+
+global:
+  environment: production
+
+namespace: spywatcher
+
+image:
+  backend:
+    repository: ghcr.io/subculture-collective/spywatcher-backend
+    tag: latest
+    pullPolicy: Always
+  frontend:
+    repository: ghcr.io/subculture-collective/spywatcher-frontend
+    tag: latest
+    pullPolicy: Always
+
+backend:
+  enabled: true
+  replicaCount: 3
+  
+  resources:
+    requests:
+      memory: "512Mi"
+      cpu: "500m"
+    limits:
+      memory: "1Gi"
+      cpu: "1000m"
+  
+  autoscaling:
+    enabled: true
+    minReplicas: 3
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 70
+    targetMemoryUtilizationPercentage: 80
+  
+  env:
+    NODE_ENV: production
+    PORT: "3001"
+    LOG_LEVEL: info
+
+frontend:
+  enabled: true
+  replicaCount: 2
+  
+  resources:
+    requests:
+      memory: "128Mi"
+      cpu: "100m"
+    limits:
+      memory: "256Mi"
+      cpu: "500m"
+  
+  env:
+    VITE_API_URL: "https://api.spywatcher.example.com"
+
+# Use managed services instead of in-cluster databases
+postgresql:
+  enabled: false
+
+redis:
+  enabled: false
+
+ingress:
+  enabled: true
+  className: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
+    nginx.ingress.kubernetes.io/rate-limit: "100"
+  
+  hosts:
+    - host: spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-frontend
+          port: 80
+    - host: api.spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-backend
+          port: 80
+  
+  tls:
+    - secretName: spywatcher-tls-cert
+      hosts:
+        - spywatcher.example.com
+        - api.spywatcher.example.com
+
+podDisruptionBudget:
+  enabled: true
+  minAvailable: 2
+
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 1001
+  fsGroup: 1001
+
+# Production-specific node affinity
+affinity:
+  nodeAffinity:
+    preferredDuringSchedulingIgnoredDuringExecution:
+    - weight: 100
+      preference:
+        matchExpressions:
+        - key: node.kubernetes.io/instance-type
+          operator: In
+          values:
+          - t3.large
+          - t3a.large
--- a/helm/spywatcher/values-staging.yaml
+++ b/helm/spywatcher/values-staging.yaml
@@ -0,0 +1,104 @@
+# Staging Environment Values
+# Override default values for staging deployment
+
+global:
+  environment: staging
+
+namespace: spywatcher-staging
+
+image:
+  backend:
+    repository: ghcr.io/subculture-collective/spywatcher-backend
+    tag: staging
+    pullPolicy: Always
+  frontend:
+    repository: ghcr.io/subculture-collective/spywatcher-frontend
+    tag: staging
+    pullPolicy: Always
+
+backend:
+  enabled: true
+  replicaCount: 1
+  
+  resources:
+    requests:
+      memory: "256Mi"
+      cpu: "250m"
+    limits:
+      memory: "512Mi"
+      cpu: "500m"
+  
+  autoscaling:
+    enabled: true
+    minReplicas: 1
+    maxReplicas: 3
+    targetCPUUtilizationPercentage: 70
+    targetMemoryUtilizationPercentage: 80
+  
+  env:
+    NODE_ENV: staging
+    PORT: "3001"
+    LOG_LEVEL: debug
+
+frontend:
+  enabled: true
+  replicaCount: 1
+  
+  resources:
+    requests:
+      memory: "128Mi"
+      cpu: "100m"
+    limits:
+      memory: "256Mi"
+      cpu: "500m"
+  
+  env:
+    VITE_API_URL: "https://api-staging.spywatcher.example.com"
+
+# Use in-cluster databases for staging
+postgresql:
+  enabled: true
+  primary:
+    persistence:
+      size: 10Gi
+
+redis:
+  enabled: true
+  master:
+    persistence:
+      size: 5Gi
+
+ingress:
+  enabled: true
+  className: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: "letsencrypt-staging"
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+  
+  hosts:
+    - host: staging.spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-frontend
+          port: 80
+    - host: api-staging.spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-backend
+          port: 80
+  
+  tls:
+    - secretName: spywatcher-staging-tls-cert
+      hosts:
+        - staging.spywatcher.example.com
+        - api-staging.spywatcher.example.com
+
+podDisruptionBudget:
+  enabled: false
+
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 1001
+  fsGroup: 1001
--- a/helm/spywatcher/values.yaml
+++ b/helm/spywatcher/values.yaml
@@ -0,0 +1,212 @@
+# Default values for spywatcher
+# This is a YAML-formatted file.
+
+# Global settings
+global:
+  environment: production
+
+# Namespace
+namespace: spywatcher
+
+# Image settings
+image:
+  backend:
+    repository: ghcr.io/subculture-collective/spywatcher-backend
+    tag: latest
+    pullPolicy: Always
+  frontend:
+    repository: ghcr.io/subculture-collective/spywatcher-frontend
+    tag: latest
+    pullPolicy: Always
+
+imagePullSecrets: []
+
+# Backend configuration
+backend:
+  enabled: true
+  replicaCount: 3
+  
+  resources:
+    requests:
+      memory: "512Mi"
+      cpu: "500m"
+    limits:
+      memory: "1Gi"
+      cpu: "1000m"
+  
+  autoscaling:
+    enabled: true
+    minReplicas: 2
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 70
+    targetMemoryUtilizationPercentage: 80
+  
+  service:
+    type: ClusterIP
+    port: 80
+    targetPort: 3001
+  
+  env:
+    NODE_ENV: production
+    PORT: "3001"
+    LOG_LEVEL: info
+  
+  # Health check configuration
+  livenessProbe:
+    httpGet:
+      path: /health/live
+      port: 3001
+    initialDelaySeconds: 30
+    periodSeconds: 10
+  
+  readinessProbe:
+    httpGet:
+      path: /health/ready
+      port: 3001
+    initialDelaySeconds: 10
+    periodSeconds: 5
+
+# Frontend configuration
+frontend:
+  enabled: true
+  replicaCount: 2
+  
+  resources:
+    requests:
+      memory: "128Mi"
+      cpu: "100m"
+    limits:
+      memory: "256Mi"
+      cpu: "500m"
+  
+  service:
+    type: ClusterIP
+    port: 80
+    targetPort: 80
+  
+  env:
+    VITE_API_URL: "https://api.spywatcher.example.com"
+
+# PostgreSQL configuration
+postgresql:
+  enabled: true
+  image: postgres:15-alpine
+  
+  auth:
+    username: spywatcher
+    database: spywatcher
+    # Password should be set via --set or separate values file
+    existingSecret: postgres-secret
+    secretKeys:
+      adminPasswordKey: password
+  
+  primary:
+    resources:
+      requests:
+        memory: "512Mi"
+        cpu: "500m"
+      limits:
+        memory: "1Gi"
+        cpu: "1000m"
+    
+    persistence:
+      enabled: true
+      size: 20Gi
+      storageClass: ""
+
+# Redis configuration
+redis:
+  enabled: true
+  image: redis:7-alpine
+  
+  auth:
+    enabled: true
+    existingSecret: redis-secret
+    existingSecretPasswordKey: password
+  
+  master:
+    resources:
+      requests:
+        memory: "256Mi"
+        cpu: "250m"
+      limits:
+        memory: "512Mi"
+        cpu: "500m"
+    
+    persistence:
+      enabled: true
+      size: 10Gi
+      storageClass: ""
+
+# Ingress configuration
+ingress:
+  enabled: true
+  className: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
+  
+  hosts:
+    - host: spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-frontend
+          port: 80
+    - host: api.spywatcher.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+          service: spywatcher-backend
+          port: 80
+  
+  tls:
+    - secretName: spywatcher-tls-cert
+      hosts:
+        - spywatcher.example.com
+        - api.spywatcher.example.com
+
+# ConfigMap data
+configMap:
+  data:
+    NODE_ENV: "production"
+    PORT: "3001"
+    LOG_LEVEL: "info"
+    RATE_LIMIT_WINDOW_MS: "900000"
+    RATE_LIMIT_MAX_REQUESTS: "100"
+
+# Secrets (should be provided externally)
+secrets:
+  # Database
+  databaseUrl: ""
+  # Redis
+  redisUrl: ""
+  # JWT
+  jwtSecret: ""
+  jwtRefreshSecret: ""
+  # Discord
+  discordBotToken: ""
+  discordClientId: ""
+  discordClientSecret: ""
+  discordGuildId: ""
+  discordRedirectUri: ""
+  adminDiscordIds: ""
+  botGuildIds: ""
+
+# Pod Disruption Budget
+podDisruptionBudget:
+  enabled: true
+  minAvailable: 1
+
+# Security context
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 1001
+  fsGroup: 1001
+
+# Node affinity and tolerations
+affinity: {}
+tolerations: []
+nodeSelector: {}
--- a/k8s/.gitignore
+++ b/k8s/.gitignore
@@ -0,0 +1,11 @@
+# Ignore secret files that contain sensitive data
+secrets/
+*.secret.yaml
+*-secrets.yaml
+
+# Ignore generated manifests
+generated/
+
+# Ignore local development files
+*.local.yaml
+local/
--- a/k8s/README.md
+++ b/k8s/README.md
@@ -0,0 +1,377 @@
+# Kubernetes Manifests
+
+This directory contains Kubernetes manifests for deploying Spywatcher.
+
+## Directory Structure
+
+```
+k8s/
+├── base/                    # Base manifests
+│   ├── namespace.yaml       # Namespace and resource quotas
+│   ├── configmap.yaml       # Application configuration
+│   ├── secrets.yaml         # Secrets template (DO NOT commit actual secrets)
+│   ├── migration-job.yaml   # Database migration job
+│   ├── backend-deployment.yaml
+│   ├── backend-service.yaml
+│   ├── backend-hpa.yaml     # Horizontal Pod Autoscaler
+│   ├── frontend-deployment.yaml
+│   ├── frontend-service.yaml
+│   ├── postgres-statefulset.yaml
+│   ├── redis-statefulset.yaml
+│   ├── ingress.yaml
+│   ├── pdb.yaml             # Pod Disruption Budget
+│   └── kustomization.yaml
+├── overlays/                # Environment-specific overlays
+│   ├── production/
+│   └── staging/
+└── secrets/                 # Actual secrets (gitignored)
+```
+
+## Quick Start
+
+### Prerequisites
+
+- kubectl configured with cluster access
+- kustomize (built into kubectl >= 1.14)
+
+### Deploy to Production
+
+```bash
+# Review what will be deployed
+kubectl kustomize k8s/overlays/production
+
+# Apply manifests
+kubectl apply -k k8s/overlays/production
+
+# Check deployment status
+kubectl get all -n spywatcher
+```
+
+### Deploy to Staging
+
+```bash
+kubectl apply -k k8s/overlays/staging
+kubectl get all -n spywatcher-staging
+```
+
+## Configuration Management
+
+### Secrets
+
+**IMPORTANT**: Never commit actual secrets to git!
+
+1. Copy the secrets template:
+```bash
+cp k8s/base/secrets.yaml k8s/secrets/secrets.yaml
+```
+
+2. Edit with actual values:
+```bash
+vim k8s/secrets/secrets.yaml
+```
+
+3. Apply separately:
+```bash
+kubectl apply -f k8s/secrets/secrets.yaml
+```
+
+### ConfigMap
+
+Application configuration is in `k8s/base/configmap.yaml`. Environment-specific values can be patched in overlays.
+
+## Deployment Strategies
+
+### Rolling Update (Default)
+
+```bash
+# Update image
+kubectl set image deployment/spywatcher-backend \
+  backend=ghcr.io/subculture-collective/spywatcher-backend:v2.0.0 \
+  -n spywatcher
+
+# Watch rollout
+kubectl rollout status deployment/spywatcher-backend -n spywatcher
+```
+
+### Blue-Green Deployment
+
+Use the provided script:
+```bash
+./scripts/deployment/blue-green-deploy.sh
+```
+
+### Canary Deployment
+
+Use the provided script:
+```bash
+./scripts/deployment/canary-deploy.sh
+```
+
+## Scaling
+
+### Manual Scaling
+
+```bash
+# Scale backend
+kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
+
+# Scale frontend
+kubectl scale deployment spywatcher-frontend --replicas=3 -n spywatcher
+```
+
+### Auto-scaling
+
+HorizontalPodAutoscaler is configured to scale based on:
+- CPU utilization (target: 70%)
+- Memory utilization (target: 80%)
+
+```bash
+# Check HPA status
+kubectl get hpa -n spywatcher
+
+# Describe HPA
+kubectl describe hpa spywatcher-backend-hpa -n spywatcher
+```
+
+## Monitoring
+
+### Check Pod Status
+
+```bash
+# List all pods
+kubectl get pods -n spywatcher
+
+# Describe pod
+kubectl describe pod <pod-name> -n spywatcher
+
+# View logs
+kubectl logs -f <pod-name> -n spywatcher
+
+# View logs from all replicas
+kubectl logs -f deployment/spywatcher-backend -n spywatcher
+```
+
+### Health Checks
+
+```bash
+# Test liveness probe
+kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
+  wget -qO- http://localhost:3001/health/live
+
+# Test readiness probe
+kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
+  wget -qO- http://localhost:3001/health/ready
+```
+
+### Resource Usage
+
+```bash
+# Pod resource usage
+kubectl top pods -n spywatcher
+
+# Node resource usage
+kubectl top nodes
+```
+
+## Troubleshooting
+
+### Pod Not Starting
+
+```bash
+# Check events
+kubectl get events -n spywatcher --sort-by='.lastTimestamp'
+
+# Describe pod
+kubectl describe pod <pod-name> -n spywatcher
+
+# Check logs
+kubectl logs <pod-name> -n spywatcher --previous  # Previous container
+```
+
+### Network Issues
+
+```bash
+# Check services
+kubectl get services -n spywatcher
+
+# Check endpoints
+kubectl get endpoints -n spywatcher
+
+# Test service from within cluster
+kubectl run -it --rm debug --image=busybox --restart=Never -n spywatcher -- \
+  wget -qO- http://spywatcher-backend/health/live
+```
+
+### Database Connection
+
+```bash
+# Check database pod
+kubectl get pods -n spywatcher | grep postgres
+
+# Test database connection
+kubectl exec -it postgres-0 -n spywatcher -- \
+  psql -U spywatcher -d spywatcher -c "SELECT version();"
+
+# Check database logs
+kubectl logs postgres-0 -n spywatcher
+```
+
+### Redis Connection
+
+```bash
+# Check Redis pod
+kubectl get pods -n spywatcher | grep redis
+
+# Test Redis connection
+kubectl exec -it redis-0 -n spywatcher -- redis-cli ping
+
+# Check Redis logs
+kubectl logs redis-0 -n spywatcher
+```
+
+## Maintenance
+
+### Update Configuration
+
+```bash
+# Edit configmap
+kubectl edit configmap spywatcher-config -n spywatcher
+
+# Restart pods to pick up changes
+kubectl rollout restart deployment/spywatcher-backend -n spywatcher
+```
+
+### Database Migrations
+
+Database migrations are run as a separate Kubernetes Job to avoid race conditions.
+Migrations should be run before deploying new application versions.
+
+```bash
+# Create a unique migration job
+JOB_NAME="db-migration-$(date +%s)"
+kubectl create job $JOB_NAME --from=job/spywatcher-db-migration -n spywatcher
+
+# Or apply the migration job directly (it will run once)
+kubectl apply -f k8s/base/migration-job.yaml
+
+# Check migration status
+kubectl get jobs -n spywatcher
+
+# View migration logs
+kubectl logs job/$JOB_NAME -n spywatcher
+
+# Delete completed migration jobs (optional, they auto-delete after 1 hour)
+kubectl delete job $JOB_NAME -n spywatcher
+```
+
+**Important:** The migration job uses `completions: 1` and `parallelism: 1` to ensure 
+only one migration runs at a time, preventing race conditions and deadlocks.
+
+### Backup
+
+```bash
+# Backup PostgreSQL
+kubectl exec postgres-0 -n spywatcher -- \
+  pg_dump -U spywatcher spywatcher > backup.sql
+
+# Backup Redis
+kubectl exec redis-0 -n spywatcher -- \
+  redis-cli BGSAVE
+```
+
+## Security
+
+### Network Policies
+
+Network policies restrict traffic between pods:
+- Backend can connect to: PostgreSQL, Redis
+- Frontend can connect to: Backend
+- External traffic: Ingress only
+
+### RBAC
+
+Service accounts with minimal permissions:
+- `spywatcher-backend`: Access to secrets, configmaps
+- `spywatcher-frontend`: Read-only access
+
+### Secrets
+
+- Use Sealed Secrets or External Secrets Operator for production
+- Never commit unencrypted secrets
+- Rotate secrets regularly
+
+## Ingress
+
+### NGINX Ingress Controller
+
+Install if not already present:
+```bash
+helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
+helm install nginx-ingress ingress-nginx/ingress-nginx
+```
+
+### Cert-Manager
+
+Install for automatic SSL certificates:
+```bash
+helm repo add jetstack https://charts.jetstack.io
+helm install cert-manager jetstack/cert-manager \
+  --namespace cert-manager \
+  --create-namespace \
+  --set installCRDs=true
+```
+
+Create ClusterIssuer:
+```yaml
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-prod
+spec:
+  acme:
+    server: https://acme-v02.api.letsencrypt.org/directory
+    email: your-email@example.com
+    privateKeySecretRef:
+      name: letsencrypt-prod
+    solvers:
+    - http01:
+        ingress:
+          class: nginx
+```
+
+## Clean Up
+
+### Delete Resources
+
+```bash
+# Delete all resources in namespace
+kubectl delete namespace spywatcher
+
+# Or use kustomize
+kubectl delete -k k8s/overlays/production
+```
+
+### Persistent Data
+
+⚠️ **WARNING**: Deleting PVCs will delete all data!
+
+```bash
+# List PVCs
+kubectl get pvc -n spywatcher
+
+# Delete specific PVC
+kubectl delete pvc postgres-data-postgres-0 -n spywatcher
+```
+
+## Best Practices
+
+1. **Use namespaces**: Separate environments with namespaces
+2. **Resource limits**: Always set requests and limits
+3. **Health checks**: Configure liveness and readiness probes
+4. **Security context**: Run containers as non-root
+5. **Pod disruption budgets**: Ensure high availability
+6. **Horizontal scaling**: Use HPA for dynamic scaling
+7. **Rolling updates**: Use for zero-downtime deployments
+8. **Monitoring**: Integrate with Prometheus/Grafana
+9. **Logging**: Centralize logs with ELK or Loki
+10. **Backups**: Regular backups of persistent data
--- a/k8s/base/backend-deployment.yaml
+++ b/k8s/base/backend-deployment.yaml
@@ -0,0 +1,193 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-backend
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: backend
+    version: v1
+spec:
+  replicas: 3
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: backend
+  template:
+    metadata:
+      labels:
+        app: spywatcher
+        tier: backend
+        version: v1
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "3001"
+        prometheus.io/path: "/metrics"
+    spec:
+      # Security context for pod
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        fsGroup: 1001
+      
+      # Anti-affinity to spread pods across nodes
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: app
+                  operator: In
+                  values:
+                  - spywatcher
+                - key: tier
+                  operator: In
+                  values:
+                  - backend
+              topologyKey: kubernetes.io/hostname
+      
+      containers:
+      - name: backend
+        image: ghcr.io/subculture-collective/spywatcher-backend:latest
+        imagePullPolicy: Always
+        ports:
+        - name: http
+          containerPort: 3001
+          protocol: TCP
+        
+        # Environment variables from ConfigMap
+        envFrom:
+        - configMapRef:
+            name: spywatcher-config
+        
+        # Environment variables from Secrets
+        env:
+        - name: DATABASE_URL
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: database-url
+        - name: REDIS_URL
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: redis-url
+        - name: JWT_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: jwt-secret
+        - name: JWT_REFRESH_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: jwt-refresh-secret
+        - name: DISCORD_BOT_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-bot-token
+        - name: DISCORD_CLIENT_ID
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-client-id
+        - name: DISCORD_CLIENT_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-client-secret
+        - name: DISCORD_GUILD_ID
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-guild-id
+        - name: DISCORD_REDIRECT_URI
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-redirect-uri
+        - name: ADMIN_DISCORD_IDS
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: admin-discord-ids
+        - name: BOT_GUILD_IDS
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: bot-guild-ids
+        
+        # Resource limits and requests
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "500m"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+        
+        # Liveness probe - checks if container is alive
+        livenessProbe:
+          httpGet:
+            path: /health/live
+            port: http
+          initialDelaySeconds: 30
+          periodSeconds: 10
+          timeoutSeconds: 5
+          successThreshold: 1
+          failureThreshold: 3
+        
+        # Readiness probe - checks if container is ready to serve traffic
+        readinessProbe:
+          httpGet:
+            path: /health/ready
+            port: http
+          initialDelaySeconds: 10
+          periodSeconds: 5
+          timeoutSeconds: 3
+          successThreshold: 1
+          failureThreshold: 3
+        
+        # Startup probe - allows slow starting containers more time
+        startupProbe:
+          httpGet:
+            path: /health/live
+            port: http
+          initialDelaySeconds: 0
+          periodSeconds: 10
+          timeoutSeconds: 3
+          successThreshold: 1
+          failureThreshold: 30
+        
+        # Security context for container
+        securityContext:
+          allowPrivilegeEscalation: false
+          readOnlyRootFilesystem: false
+          runAsNonRoot: true
+          runAsUser: 1001
+          capabilities:
+            drop:
+            - ALL
+        
+        # Volume mounts
+        volumeMounts:
+        - name: logs
+          mountPath: /app/logs
+      
+      # Volumes
+      volumes:
+      - name: logs
+        emptyDir: {}
+      
+      # Image pull secrets if using private registry
+      # imagePullSecrets:
+      # - name: ghcr-secret
--- a/k8s/base/backend-hpa.yaml
+++ b/k8s/base/backend-hpa.yaml
@@ -0,0 +1,49 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: spywatcher-backend-hpa
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: backend
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: spywatcher-backend
+  minReplicas: 2
+  maxReplicas: 10
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 300
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 60
+      - type: Pods
+        value: 1
+        periodSeconds: 60
+      selectPolicy: Min
+    scaleUp:
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 100
+        periodSeconds: 30
+      - type: Pods
+        value: 2
+        periodSeconds: 30
+      selectPolicy: Max
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
+  - type: Resource
+    resource:
+      name: memory
+      target:
+        type: Utilization
+        averageUtilization: 80
--- a/k8s/base/backend-service.yaml
+++ b/k8s/base/backend-service.yaml
@@ -0,0 +1,24 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: spywatcher-backend
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: backend
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
+spec:
+  type: ClusterIP
+  sessionAffinity: ClientIP
+  sessionAffinityConfig:
+    clientIP:
+      timeoutSeconds: 10800
+  selector:
+    app: spywatcher
+    tier: backend
+  ports:
+  - name: http
+    port: 80
+    targetPort: http
+    protocol: TCP
--- a/k8s/base/configmap.yaml
+++ b/k8s/base/configmap.yaml
@@ -0,0 +1,29 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: spywatcher-config
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+data:
+  # Application settings
+  NODE_ENV: "production"
+  PORT: "3001"
+  
+  # Logging settings
+  LOG_LEVEL: "info"
+  
+  # Rate limiting settings
+  RATE_LIMIT_WINDOW_MS: "900000"
+  RATE_LIMIT_MAX_REQUESTS: "100"
+  
+  # Health check settings
+  HEALTH_CHECK_INTERVAL: "30"
+  
+  # Database pool settings
+  DB_POOL_MIN: "2"
+  DB_POOL_MAX: "10"
+  
+  # Redis settings
+  REDIS_MAX_RETRIES: "3"
+  REDIS_RETRY_DELAY: "1000"
--- a/k8s/base/frontend-deployment.yaml
+++ b/k8s/base/frontend-deployment.yaml
@@ -0,0 +1,124 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-frontend
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: frontend
+    version: v1
+spec:
+  replicas: 2
+  revisionHistoryLimit: 10
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: frontend
+  template:
+    metadata:
+      labels:
+        app: spywatcher
+        tier: frontend
+        version: v1
+    spec:
+      # Security context for pod
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        fsGroup: 1001
+      
+      # Anti-affinity to spread pods across nodes
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: app
+                  operator: In
+                  values:
+                  - spywatcher
+                - key: tier
+                  operator: In
+                  values:
+                  - frontend
+              topologyKey: kubernetes.io/hostname
+      
+      containers:
+      - name: frontend
+        image: ghcr.io/subculture-collective/spywatcher-frontend:latest
+        imagePullPolicy: Always
+        ports:
+        - name: http
+          containerPort: 80
+          protocol: TCP
+        
+        env:
+        - name: VITE_API_URL
+          value: "https://api.spywatcher.example.com"
+        - name: VITE_DISCORD_CLIENT_ID
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: discord-client-id
+        
+        # Resource limits and requests
+        resources:
+          requests:
+            memory: "128Mi"
+            cpu: "100m"
+          limits:
+            memory: "256Mi"
+            cpu: "500m"
+        
+        # Liveness probe
+        livenessProbe:
+          httpGet:
+            path: /
+            port: http
+          initialDelaySeconds: 10
+          periodSeconds: 10
+          timeoutSeconds: 5
+          successThreshold: 1
+          failureThreshold: 3
+        
+        # Readiness probe
+        readinessProbe:
+          httpGet:
+            path: /
+            port: http
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+          successThreshold: 1
+          failureThreshold: 3
+        
+        # Security context for container
+        securityContext:
+          allowPrivilegeEscalation: false
+          readOnlyRootFilesystem: true
+          runAsNonRoot: true
+          runAsUser: 1001
+          capabilities:
+            drop:
+            - ALL
+        
+        # Volume mounts for nginx cache and temp files
+        volumeMounts:
+        - name: cache
+          mountPath: /var/cache/nginx
+        - name: run
+          mountPath: /var/run
+      
+      # Volumes
+      volumes:
+      - name: cache
+        emptyDir: {}
+      - name: run
+        emptyDir: {}
--- a/k8s/base/frontend-service.yaml
+++ b/k8s/base/frontend-service.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: spywatcher-frontend
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: frontend
+spec:
+  type: ClusterIP
+  selector:
+    app: spywatcher
+    tier: frontend
+  ports:
+  - name: http
+    port: 80
+    targetPort: http
+    protocol: TCP
--- a/k8s/base/ingress.yaml
+++ b/k8s/base/ingress.yaml
@@ -0,0 +1,76 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: spywatcher-ingress
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+  annotations:
+    # SSL/TLS configuration
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+    
+    # AWS ALB annotations (if using AWS)
+    alb.ingress.kubernetes.io/scheme: internet-facing
+    alb.ingress.kubernetes.io/target-type: ip
+    alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=60
+    alb.ingress.kubernetes.io/healthcheck-path: /health/live
+    alb.ingress.kubernetes.io/healthcheck-interval-seconds: "30"
+    alb.ingress.kubernetes.io/healthcheck-timeout-seconds: "5"
+    alb.ingress.kubernetes.io/healthy-threshold-count: "2"
+    alb.ingress.kubernetes.io/unhealthy-threshold-count: "3"
+    
+    # NGINX Ingress annotations (if using NGINX)
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
+    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
+    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
+    
+    # WebSocket support
+    nginx.ingress.kubernetes.io/websocket-services: spywatcher-backend
+    nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
+    nginx.ingress.kubernetes.io/configuration-snippet: |
+      proxy_set_header Upgrade $http_upgrade;
+      proxy_set_header Connection "upgrade";
+    
+    # Security headers
+    nginx.ingress.kubernetes.io/server-snippet: |
+      add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
+      add_header X-Frame-Options "SAMEORIGIN" always;
+      add_header X-Content-Type-Options "nosniff" always;
+      add_header X-XSS-Protection "1; mode=block" always;
+    
+    # Rate limiting
+    nginx.ingress.kubernetes.io/limit-rps: "100"
+spec:
+  ingressClassName: nginx
+  tls:
+  - hosts:
+    - spywatcher.example.com
+    - api.spywatcher.example.com
+    secretName: spywatcher-tls-cert
+  rules:
+  # Frontend
+  - host: spywatcher.example.com
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: spywatcher-frontend
+            port:
+              number: 80
+  
+  # Backend API
+  - host: api.spywatcher.example.com
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: spywatcher-backend
+            port:
+              number: 80
--- a/k8s/base/kustomization.yaml
+++ b/k8s/base/kustomization.yaml
@@ -0,0 +1,23 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: spywatcher
+
+resources:
+  - namespace.yaml
+  - configmap.yaml
+  - secrets.yaml
+  - postgres-statefulset.yaml
+  - redis-statefulset.yaml
+  - migration-job.yaml
+  - backend-deployment.yaml
+  - backend-service.yaml
+  - backend-hpa.yaml
+  - frontend-deployment.yaml
+  - frontend-service.yaml
+  - ingress.yaml
+  - pdb.yaml
+
+commonLabels:
+  app.kubernetes.io/name: spywatcher
+  app.kubernetes.io/managed-by: kustomize
--- a/k8s/base/migration-job.yaml
+++ b/k8s/base/migration-job.yaml
@@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: spywatcher-db-migration
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    component: migration
+spec:
+  # Only keep successful jobs for 1 hour
+  ttlSecondsAfterFinished: 3600
+  # Prevent concurrent migrations
+  completions: 1
+  parallelism: 1
+  backoffLimit: 3
+  template:
+    metadata:
+      labels:
+        app: spywatcher
+        component: migration
+    spec:
+      restartPolicy: Never
+      
+      # Security context for pod
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        fsGroup: 1001
+      
+      containers:
+      - name: migrate
+        image: ghcr.io/subculture-collective/spywatcher-backend:latest
+        imagePullPolicy: Always
+        command:
+        - sh
+        - -c
+        - |
+          echo "Starting database migration..."
+          npx prisma migrate deploy
+          echo "Migration completed successfully"
+        
+        env:
+        - name: DATABASE_URL
+          valueFrom:
+            secretKeyRef:
+              name: spywatcher-secrets
+              key: database-url
+        
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "250m"
+          limits:
+            memory: "512Mi"
+            cpu: "500m"
+        
+        # Security context for container
+        securityContext:
+          allowPrivilegeEscalation: false
+          readOnlyRootFilesystem: false
+          runAsNonRoot: true
+          runAsUser: 1001
+          capabilities:
+            drop:
+            - ALL
--- a/k8s/base/namespace.yaml
+++ b/k8s/base/namespace.yaml
@@ -0,0 +1,35 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: spywatcher
+  labels:
+    name: spywatcher
+    environment: production
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: spywatcher-quota
+  namespace: spywatcher
+spec:
+  hard:
+    requests.cpu: "20"
+    requests.memory: 40Gi
+    limits.cpu: "40"
+    limits.memory: 80Gi
+    persistentvolumeclaims: "10"
+---
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: spywatcher-limit-range
+  namespace: spywatcher
+spec:
+  limits:
+  - default:
+      cpu: "1"
+      memory: 1Gi
+    defaultRequest:
+      cpu: "500m"
+      memory: 512Mi
+    type: Container
--- a/k8s/base/pdb.yaml
+++ b/k8s/base/pdb.yaml
@@ -0,0 +1,29 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: spywatcher-backend-pdb
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: backend
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: backend
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: spywatcher-frontend-pdb
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: frontend
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: frontend
--- a/k8s/base/postgres-statefulset.yaml
+++ b/k8s/base/postgres-statefulset.yaml
@@ -0,0 +1,113 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: postgres
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: database
+spec:
+  serviceName: postgres
+  replicas: 1
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: database
+  template:
+    metadata:
+      labels:
+        app: spywatcher
+        tier: database
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:15-alpine
+        ports:
+        - name: postgres
+          containerPort: 5432
+          protocol: TCP
+        env:
+        - name: POSTGRES_DB
+          value: spywatcher
+        - name: POSTGRES_USER
+          value: spywatcher
+        - name: POSTGRES_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: postgres-secret
+              key: password
+        - name: PGDATA
+          value: /var/lib/postgresql/data/pgdata
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "500m"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+        volumeMounts:
+        - name: postgres-data
+          mountPath: /var/lib/postgresql/data
+        livenessProbe:
+          exec:
+            command:
+            - /bin/sh
+            - -c
+            - pg_isready -U spywatcher
+          initialDelaySeconds: 30
+          periodSeconds: 10
+          timeoutSeconds: 5
+          failureThreshold: 3
+        readinessProbe:
+          exec:
+            command:
+            - /bin/sh
+            - -c
+            - pg_isready -U spywatcher
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+          failureThreshold: 3
+  volumeClaimTemplates:
+  - metadata:
+      name: postgres-data
+      labels:
+        app: spywatcher
+        tier: database
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: database
+spec:
+  type: ClusterIP
+  clusterIP: None
+  selector:
+    app: spywatcher
+    tier: database
+  ports:
+  - name: postgres
+    port: 5432
+    targetPort: postgres
+    protocol: TCP
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: postgres-secret
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: database
+type: Opaque
+stringData:
+  password: "CHANGE_ME_IN_PRODUCTION"
--- a/k8s/base/redis-statefulset.yaml
+++ b/k8s/base/redis-statefulset.yaml
@@ -0,0 +1,117 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: redis
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: cache
+spec:
+  serviceName: redis
+  replicas: 1
+  selector:
+    matchLabels:
+      app: spywatcher
+      tier: cache
+  template:
+    metadata:
+      labels:
+        app: spywatcher
+        tier: cache
+    spec:
+      containers:
+      - name: redis
+        image: redis:7-alpine
+        command:
+        - redis-server
+        - --appendonly
+        - "yes"
+        - --requirepass
+        - $(REDIS_PASSWORD)
+        ports:
+        - name: redis
+          containerPort: 6379
+          protocol: TCP
+        env:
+        - name: REDIS_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: redis-secret
+              key: password
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "250m"
+          limits:
+            memory: "512Mi"
+            cpu: "500m"
+        volumeMounts:
+        - name: redis-data
+          mountPath: /data
+        livenessProbe:
+          exec:
+            command:
+            - redis-cli
+            - --no-auth-warning
+            - -a
+            - $(REDIS_PASSWORD)
+            - ping
+          initialDelaySeconds: 30
+          periodSeconds: 10
+          timeoutSeconds: 5
+          failureThreshold: 3
+        readinessProbe:
+          exec:
+            command:
+            - redis-cli
+            - --no-auth-warning
+            - -a
+            - $(REDIS_PASSWORD)
+            - ping
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+          failureThreshold: 3
+  volumeClaimTemplates:
+  - metadata:
+      name: redis-data
+      labels:
+        app: spywatcher
+        tier: cache
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 10Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: redis
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: cache
+spec:
+  type: ClusterIP
+  clusterIP: None
+  selector:
+    app: spywatcher
+    tier: cache
+  ports:
+  - name: redis
+    port: 6379
+    targetPort: redis
+    protocol: TCP
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: redis-secret
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+    tier: cache
+type: Opaque
+stringData:
+  password: "CHANGE_ME_IN_PRODUCTION"
--- a/k8s/base/secrets.yaml
+++ b/k8s/base/secrets.yaml
@@ -0,0 +1,32 @@
+# This is a template file for secrets
+# In production, use sealed-secrets, external-secrets, or your cloud provider's secret management
+# DO NOT commit actual secrets to git
+apiVersion: v1
+kind: Secret
+metadata:
+  name: spywatcher-secrets
+  namespace: spywatcher
+  labels:
+    app: spywatcher
+type: Opaque
+stringData:
+  # Database connection
+  database-url: "postgresql://user:password@postgres-service:5432/spywatcher"
+  
+  # Redis connection
+  redis-url: "redis://redis-service:6379"
+  
+  # JWT secrets
+  jwt-secret: "CHANGE_ME_IN_PRODUCTION"
+  jwt-refresh-secret: "CHANGE_ME_IN_PRODUCTION"
+  
+  # Discord credentials
+  discord-bot-token: "CHANGE_ME_IN_PRODUCTION"
+  discord-client-id: "CHANGE_ME_IN_PRODUCTION"
+  discord-client-secret: "CHANGE_ME_IN_PRODUCTION"
+  discord-guild-id: "CHANGE_ME_IN_PRODUCTION"
+  discord-redirect-uri: "https://spywatcher.example.com/auth/callback"
+  
+  # Admin settings
+  admin-discord-ids: ""
+  bot-guild-ids: ""
--- a/k8s/overlays/production/kustomization.yaml
+++ b/k8s/overlays/production/kustomization.yaml
@@ -0,0 +1,29 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: spywatcher
+
+bases:
+  - ../../base
+
+namePrefix: prod-
+
+commonLabels:
+  environment: production
+
+patchesStrategicMerge:
+  - replicas-patch.yaml
+  - resources-patch.yaml
+
+configMapGenerator:
+  - name: spywatcher-config
+    behavior: merge
+    literals:
+      - NODE_ENV=production
+      - LOG_LEVEL=info
+
+images:
+  - name: ghcr.io/subculture-collective/spywatcher-backend
+    newTag: latest
+  - name: ghcr.io/subculture-collective/spywatcher-frontend
+    newTag: latest
--- a/k8s/overlays/production/replicas-patch.yaml
+++ b/k8s/overlays/production/replicas-patch.yaml
@@ -0,0 +1,15 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-backend
+  namespace: spywatcher
+spec:
+  replicas: 3
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-frontend
+  namespace: spywatcher
+spec:
+  replicas: 2
--- a/k8s/overlays/production/resources-patch.yaml
+++ b/k8s/overlays/production/resources-patch.yaml
@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-backend
+  namespace: spywatcher
+spec:
+  template:
+    spec:
+      containers:
+      - name: backend
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "500m"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-frontend
+  namespace: spywatcher
+spec:
+  template:
+    spec:
+      containers:
+      - name: frontend
+        resources:
+          requests:
+            memory: "128Mi"
+            cpu: "100m"
+          limits:
+            memory: "256Mi"
+            cpu: "500m"
--- a/k8s/overlays/staging/kustomization.yaml
+++ b/k8s/overlays/staging/kustomization.yaml
@@ -0,0 +1,28 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: spywatcher-staging
+
+bases:
+  - ../../base
+
+namePrefix: staging-
+
+commonLabels:
+  environment: staging
+
+patchesStrategicMerge:
+  - replicas-patch.yaml
+
+configMapGenerator:
+  - name: spywatcher-config
+    behavior: merge
+    literals:
+      - NODE_ENV=staging
+      - LOG_LEVEL=debug
+
+images:
+  - name: ghcr.io/subculture-collective/spywatcher-backend
+    newTag: staging
+  - name: ghcr.io/subculture-collective/spywatcher-frontend
+    newTag: staging
--- a/k8s/overlays/staging/replicas-patch.yaml
+++ b/k8s/overlays/staging/replicas-patch.yaml
@@ -0,0 +1,15 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-backend
+  namespace: spywatcher-staging
+spec:
+  replicas: 1
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: spywatcher-frontend
+  namespace: spywatcher-staging
+spec:
+  replicas: 1
--- a/scripts/deployment/blue-green-deploy.sh
+++ b/scripts/deployment/blue-green-deploy.sh
@@ -0,0 +1,198 @@
+#!/bin/bash
+set -e
+
+# Blue-Green Deployment Script for Spywatcher
+# This script performs zero-downtime deployments by maintaining two identical environments
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Configuration
+NAMESPACE="${NAMESPACE:-spywatcher}"
+APP_NAME="${APP_NAME:-spywatcher-backend}"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+HEALTH_CHECK_PATH="${HEALTH_CHECK_PATH:-/health/ready}"
+HEALTH_CHECK_RETRIES="${HEALTH_CHECK_RETRIES:-10}"
+HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-10}"
+
+# Function to print colored output
+print_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Function to check if a deployment exists
+deployment_exists() {
+    kubectl get deployment "$1" -n "$NAMESPACE" &> /dev/null
+}
+
+# Function to get current active environment
+get_active_environment() {
+    local service_selector=$(kubectl get service "$APP_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.selector.version}')
+    echo "$service_selector"
+}
+
+# Function to perform health check
+health_check() {
+    local deployment=$1
+    local retries=$HEALTH_CHECK_RETRIES
+    
+    print_info "Performing health check on $deployment..."
+    
+    while [ $retries -gt 0 ]; do
+        # Get pod name
+        local pod=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=$deployment -o jsonpath='{.items[0].metadata.name}')
+        
+        if [ -z "$pod" ]; then
+            print_warning "No pod found for $deployment, retrying..."
+            sleep $HEALTH_CHECK_INTERVAL
+            retries=$((retries - 1))
+            continue
+        fi
+        
+        # Check if pod is running
+        local pod_status=$(kubectl get pod "$pod" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
+        if [ "$pod_status" != "Running" ]; then
+            print_warning "Pod $pod is not running (status: $pod_status), retrying..."
+            sleep $HEALTH_CHECK_INTERVAL
+            retries=$((retries - 1))
+            continue
+        fi
+        
+        # Perform HTTP health check
+        if kubectl exec "$pod" -n "$NAMESPACE" -- wget -q -O- "http://localhost:3001$HEALTH_CHECK_PATH" &> /dev/null; then
+            print_info "Health check passed for $deployment"
+            return 0
+        else
+            print_warning "Health check failed for $deployment, retrying..."
+            sleep $HEALTH_CHECK_INTERVAL
+            retries=$((retries - 1))
+        fi
+    done
+    
+    print_error "Health check failed after $HEALTH_CHECK_RETRIES retries"
+    return 1
+}
+
+# Main deployment logic
+main() {
+    print_info "Starting Blue-Green deployment for $APP_NAME"
+    print_info "Namespace: $NAMESPACE"
+    print_info "Image Tag: $IMAGE_TAG"
+    
+    # Determine current active environment
+    local current_env=$(get_active_environment)
+    
+    if [ -z "$current_env" ]; then
+        # No active environment, default to blue
+        current_env="blue"
+        new_env="green"
+        print_info "No active environment found, will deploy to green"
+    elif [ "$current_env" = "blue" ]; then
+        new_env="green"
+    else
+        new_env="blue"
+    fi
+    
+    print_info "Current active environment: $current_env"
+    print_info "Deploying to: $new_env"
+    
+    # Create or update new environment deployment
+    local new_deployment="$APP_NAME-$new_env"
+    
+    # Apply deployment
+    kubectl set image "deployment/$new_deployment" \
+        backend="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
+        -n "$NAMESPACE" 2>/dev/null || \
+    kubectl create deployment "$new_deployment" \
+        --image="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
+        -n "$NAMESPACE"
+    
+    # Label the deployment
+    kubectl label deployment "$new_deployment" app=spywatcher version=$new_env -n "$NAMESPACE" --overwrite
+    
+    # Wait for deployment to be ready
+    print_info "Waiting for deployment $new_deployment to be ready..."
+    kubectl rollout status "deployment/$new_deployment" -n "$NAMESPACE" --timeout=5m
+    
+    # Perform health checks
+    if ! health_check "$new_env"; then
+        print_error "Health check failed for $new_env environment"
+        print_error "Keeping traffic on $current_env environment"
+        exit 1
+    fi
+    
+    # Update service selector to point to new environment
+    print_info "Switching traffic to $new_env environment..."
+    kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
+        -p "{\"spec\":{\"selector\":{\"version\":\"$new_env\"}}}"
+    
+    print_info "Traffic successfully switched to $new_env"
+    
+    # Wait a bit before considering old environment for removal
+    print_info "Waiting 60 seconds before cleaning up old environment..."
+    sleep 60
+    
+    # Optional: Scale down old environment instead of deleting
+    if deployment_exists "$APP_NAME-$current_env"; then
+        print_info "Scaling down old environment: $current_env"
+        kubectl scale deployment "$APP_NAME-$current_env" --replicas=0 -n "$NAMESPACE"
+        print_info "Old environment scaled to 0 replicas (can be used for quick rollback)"
+    fi
+    
+    print_info "Blue-Green deployment completed successfully!"
+    print_info "Active environment: $new_env"
+}
+
+# Rollback function
+rollback() {
+    print_warning "Rolling back deployment..."
+    
+    local current_env=$(get_active_environment)
+    local previous_env
+    
+    if [ "$current_env" = "blue" ]; then
+        previous_env="green"
+    else
+        previous_env="blue"
+    fi
+    
+    # Check if previous environment exists
+    if ! deployment_exists "$APP_NAME-$previous_env"; then
+        print_error "Previous environment $previous_env does not exist, cannot rollback"
+        exit 1
+    fi
+    
+    # Scale up previous environment if it's scaled down
+    local replicas=$(kubectl get deployment "$APP_NAME-$previous_env" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
+    if [ "$replicas" -eq 0 ]; then
+        print_info "Scaling up previous environment: $previous_env"
+        kubectl scale deployment "$APP_NAME-$previous_env" --replicas=3 -n "$NAMESPACE"
+        kubectl rollout status "deployment/$APP_NAME-$previous_env" -n "$NAMESPACE" --timeout=5m
+    fi
+    
+    # Switch traffic back
+    print_info "Switching traffic back to $previous_env"
+    kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
+        -p "{\"spec\":{\"selector\":{\"version\":\"$previous_env\"}}}"
+    
+    print_info "Rollback completed successfully!"
+    print_info "Active environment: $previous_env"
+}
+
+# Check if rollback flag is set
+if [ "$1" = "--rollback" ]; then
+    rollback
+else
+    main
+fi
--- a/scripts/deployment/canary-deploy.sh
+++ b/scripts/deployment/canary-deploy.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+set -e
+
+# Canary Deployment Script for Spywatcher
+# This script gradually shifts traffic to a new version while monitoring for errors
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Configuration
+NAMESPACE="${NAMESPACE:-spywatcher}"
+APP_NAME="${APP_NAME:-spywatcher-backend}"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+HEALTH_CHECK_PATH="${HEALTH_CHECK_PATH:-/health/ready}"
+
+# Canary rollout percentages
+CANARY_STEPS="${CANARY_STEPS:-5 25 50 75 100}"
+CANARY_WAIT="${CANARY_WAIT:-60}"  # Wait time between steps in seconds
+
+# Error thresholds
+ERROR_THRESHOLD="${ERROR_THRESHOLD:-5}"  # Max error percentage before rollback
+
+# Function to print colored output
+print_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Function to check deployment health
+check_health() {
+    local deployment=$1
+    local replicas=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}')
+    local desired=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
+    
+    if [ "$replicas" = "$desired" ] && [ "$replicas" -gt 0 ]; then
+        return 0
+    else
+        return 1
+    fi
+}
+
+# Function to check error rate (simplified - you should integrate with your monitoring system)
+check_error_rate() {
+    local deployment=$1
+    
+    # Get pod logs and check for errors
+    local pods=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=canary -o jsonpath='{.items[*].metadata.name}')
+    
+    if [ -z "$pods" ]; then
+        print_warning "No canary pods found"
+        return 0
+    fi
+    
+    # Simple error check - count ERROR log entries
+    local error_count=0
+    for pod in $pods; do
+        local pod_errors=$(kubectl logs "$pod" -n "$NAMESPACE" --tail=100 | grep -c "ERROR" || true)
+        error_count=$((error_count + pod_errors))
+    done
+    
+    print_info "Detected $error_count errors in canary pods"
+    
+    if [ "$error_count" -gt "$ERROR_THRESHOLD" ]; then
+        return 1
+    else
+        return 0
+    fi
+}
+
+# Function to update traffic weights
+# NOTE: This implementation uses replica counts to approximate traffic splitting,
+# which is not precise. For accurate percentage-based traffic splitting,
+# consider using a service mesh (Istio, Linkerd) or an ingress controller
+# that supports weighted traffic splitting (like NGINX Ingress with canary annotations).
+update_traffic_weight() {
+    local canary_weight=$1
+    local stable_weight=$((100 - canary_weight))
+    
+    print_info "Adjusting traffic: Canary $canary_weight%, Stable $stable_weight%"
+    print_warning "Note: Replica-based traffic splitting is approximate. Actual traffic may not match percentages exactly."
+    
+    # Calculate replica counts based on percentages
+    local total_replicas=3
+    local canary_replicas=$(( (total_replicas * canary_weight + 50) / 100 ))
+    local stable_replicas=$((total_replicas - canary_replicas))
+    
+    # Ensure at least 1 replica
+    [ "$canary_replicas" -eq 0 ] && canary_replicas=1
+    [ "$stable_replicas" -eq 0 ] && stable_replicas=1
+    
+    # Scale deployments
+    kubectl scale deployment "$APP_NAME-canary" --replicas=$canary_replicas -n "$NAMESPACE"
+    kubectl scale deployment "$APP_NAME-stable" --replicas=$stable_replicas -n "$NAMESPACE"
+    
+    # Wait for scaling to complete
+    kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=2m
+    kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=2m
+}
+
+# Function to promote canary to stable
+promote_canary() {
+    print_info "Promoting canary to stable..."
+    
+    # Update stable deployment with canary image
+    local canary_image=$(kubectl get deployment "$APP_NAME-canary" -n "$NAMESPACE" -o jsonpath='{.spec.template.spec.containers[0].image}')
+    
+    kubectl set image "deployment/$APP_NAME-stable" backend="$canary_image" -n "$NAMESPACE"
+    kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=5m
+    
+    # Scale stable back to full capacity
+    kubectl scale deployment "$APP_NAME-stable" --replicas=3 -n "$NAMESPACE"
+    
+    # Remove canary deployment
+    kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true
+    
+    print_info "Canary promoted to stable successfully!"
+}
+
+# Function to rollback canary
+rollback_canary() {
+    print_error "Rolling back canary deployment..."
+    
+    # Delete canary deployment
+    kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true
+    
+    # Ensure stable is at full capacity
+    kubectl scale deployment "$APP_NAME-stable" --replicas=3 -n "$NAMESPACE"
+    
+    print_info "Canary deployment rolled back"
+}
+
+# Main deployment logic
+main() {
+    print_info "Starting Canary deployment for $APP_NAME"
+    print_info "Namespace: $NAMESPACE"
+    print_info "Image Tag: $IMAGE_TAG"
+    print_info "Canary steps: $CANARY_STEPS"
+    
+    # Ensure stable deployment exists
+    if ! kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" &> /dev/null; then
+        # If stable doesn't exist, copy from existing deployment
+        if kubectl get deployment "$APP_NAME" -n "$NAMESPACE" &> /dev/null; then
+            print_info "Creating stable deployment from existing deployment"
+            kubectl get deployment "$APP_NAME" -n "$NAMESPACE" -o yaml | \
+                sed "s/name: $APP_NAME$/name: $APP_NAME-stable/" | \
+                kubectl apply -f -
+            # Update the original deployment name if needed
+            kubectl label deployment "$APP_NAME-stable" version=stable -n "$NAMESPACE" --overwrite
+        else
+            print_error "No existing deployment found"
+            exit 1
+        fi
+    fi
+    
+    # Create canary deployment
+    print_info "Creating canary deployment..."
+    kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" -o yaml | \
+        sed "s/$APP_NAME-stable/$APP_NAME-canary/g" | \
+        sed "s/version: stable/version: canary/g" | \
+        kubectl apply -f -
+    
+    # Update canary image
+    kubectl set image "deployment/$APP_NAME-canary" \
+        backend="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
+        -n "$NAMESPACE"
+    
+    kubectl label deployment "$APP_NAME-canary" version=canary -n "$NAMESPACE" --overwrite
+    
+    # Start with minimal canary traffic
+    kubectl scale deployment "$APP_NAME-canary" --replicas=1 -n "$NAMESPACE"
+    kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=5m
+    
+    # Gradually shift traffic
+    for step in $CANARY_STEPS; do
+        print_info "Canary rollout: ${step}%"
+        
+        # Update traffic weights
+        update_traffic_weight "$step"
+        
+        # Wait for the step duration
+        print_info "Waiting ${CANARY_WAIT}s before next step..."
+        sleep "$CANARY_WAIT"
+        
+        # Check health
+        if ! check_health "$APP_NAME-canary"; then
+            print_error "Canary health check failed"
+            rollback_canary
+            exit 1
+        fi
+        
+        # Check error rate
+        if ! check_error_rate "$APP_NAME-canary"; then
+            print_error "Canary error rate exceeded threshold"
+            rollback_canary
+            exit 1
+        fi
+        
+        print_info "Step ${step}% completed successfully"
+    done
+    
+    # Promote canary to stable
+    promote_canary
+    
+    print_info "Canary deployment completed successfully!"
+}
+
+# Run main function
+main
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -0,0 +1,336 @@
+# Spywatcher Infrastructure as Code
+
+This directory contains Terraform configurations for deploying Spywatcher infrastructure on AWS.
+
+## Prerequisites
+
+- AWS CLI configured with appropriate credentials
+- Terraform >= 1.5.0
+- kubectl
+- Helm (optional)
+
+## Infrastructure Components
+
+### Modules
+
+- **VPC**: Virtual Private Cloud with public, private, and database subnets
+- **EKS**: Elastic Kubernetes Service cluster
+- **RDS**: PostgreSQL database
+- **Redis**: ElastiCache Redis cluster
+- **ALB**: Application Load Balancer with WAF
+
+### Directory Structure
+
+```
+terraform/
+├── main.tf                 # Root module configuration
+├── variables.tf            # Root module variables
+├── outputs.tf              # Root module outputs
+├── modules/                # Reusable modules
+│   ├── vpc/
+│   ├── eks/
+│   ├── rds/
+│   ├── redis/
+│   └── alb/
+└── environments/           # Environment-specific configurations
+    ├── production/
+    │   └── terraform.tfvars
+    └── staging/
+        └── terraform.tfvars
+```
+
+## Quick Start
+
+### 1. Configure Backend
+
+First, create an S3 bucket and DynamoDB table for state management:
+
+```bash
+# Create S3 bucket for state
+aws s3api create-bucket \
+  --bucket spywatcher-terraform-state \
+  --region us-east-1
+
+# Enable versioning
+aws s3api put-bucket-versioning \
+  --bucket spywatcher-terraform-state \
+  --versioning-configuration Status=Enabled
+
+# Create DynamoDB table for state locking
+aws dynamodb create-table \
+  --table-name terraform-state-lock \
+  --attribute-definitions AttributeName=LockID,AttributeType=S \
+  --key-schema AttributeName=LockID,KeyType=HASH \
+  --billing-mode PAY_PER_REQUEST \
+  --region us-east-1
+```
+
+### 2. Initialize Terraform
+
+```bash
+cd terraform
+terraform init
+```
+
+### 3. Review and Customize
+
+Edit the appropriate `terraform.tfvars` file:
+
+```bash
+# For production
+vim environments/production/terraform.tfvars
+
+# For staging
+vim environments/staging/terraform.tfvars
+```
+
+Key configurations to update:
+- `certificate_arn`: SSL certificate ARN from AWS Certificate Manager
+- VPC CIDR blocks (if needed)
+- Instance types and sizes
+- Database credentials (use environment variables or AWS Secrets Manager)
+
+### 4. Plan Infrastructure
+
+```bash
+# Production
+terraform plan -var-file="environments/production/terraform.tfvars"
+
+# Staging
+terraform plan -var-file="environments/staging/terraform.tfvars"
+```
+
+### 5. Apply Infrastructure
+
+```bash
+# Production
+terraform apply -var-file="environments/production/terraform.tfvars"
+
+# Staging
+terraform apply -var-file="environments/staging/terraform.tfvars"
+```
+
+This will create:
+- VPC with NAT gateways
+- EKS cluster with node groups
+- RDS PostgreSQL instance
+- ElastiCache Redis cluster
+- Application Load Balancer
+- Security groups and IAM roles
+
+### 6. Configure kubectl
+
+After infrastructure is created:
+
+```bash
+# Get the cluster name from outputs
+terraform output eks_cluster_name
+
+# Configure kubectl
+aws eks update-kubeconfig \
+  --name $(terraform output -raw eks_cluster_name) \
+  --region us-east-1
+
+# Verify connection
+kubectl cluster-info
+kubectl get nodes
+```
+
+## Outputs
+
+After applying, Terraform will output important values:
+
+```bash
+# View all outputs
+terraform output
+
+# View specific output
+terraform output rds_endpoint
+terraform output eks_cluster_endpoint
+```
+
+## Secrets Management
+
+### Database Password
+
+The RDS password is auto-generated and stored in AWS Secrets Manager:
+
+```bash
+# Retrieve database password
+aws secretsmanager get-secret-value \
+  --secret-id spywatcher-production-db-password \
+  --query SecretString \
+  --output text
+```
+
+### Redis Auth Token
+
+Redis authentication token is also in Secrets Manager:
+
+```bash
+# Retrieve Redis auth token
+aws secretsmanager get-secret-value \
+  --secret-id spywatcher-production-auth-token \
+  --query SecretString \
+  --output text
+```
+
+## Updating Infrastructure
+
+```bash
+# Make changes to .tf files or terraform.tfvars
+
+# Plan changes
+terraform plan -var-file="environments/production/terraform.tfvars"
+
+# Apply changes
+terraform apply -var-file="environments/production/terraform.tfvars"
+```
+
+## Destroying Infrastructure
+
+⚠️ **WARNING**: This will destroy all resources. Make sure you have backups!
+
+```bash
+# Destroy infrastructure
+terraform destroy -var-file="environments/production/terraform.tfvars"
+```
+
+## Module Documentation
+
+### VPC Module
+
+Creates a VPC with:
+- 3 availability zones
+- Public, private, and database subnets
+- NAT gateways for private subnet internet access
+- VPC Flow Logs
+
+### EKS Module
+
+Creates an EKS cluster with:
+- Managed node groups
+- OIDC provider for IRSA
+- Essential add-ons (VPC CNI, CoreDNS, kube-proxy)
+- Security groups
+
+### RDS Module
+
+Creates a PostgreSQL database with:
+- Encryption at rest
+- Automated backups
+- Multi-AZ deployment (production)
+- Performance Insights
+- CloudWatch alarms
+
+### Redis Module
+
+Creates an ElastiCache Redis cluster with:
+- Encryption in transit and at rest
+- Authentication token
+- Automatic failover (if multi-node)
+- CloudWatch alarms
+
+### ALB Module
+
+Creates an Application Load Balancer with:
+- HTTPS termination
+- HTTP to HTTPS redirect
+- WAF with rate limiting
+- AWS Managed Rules
+
+## Cost Optimization
+
+### Development/Testing
+
+For cost savings in non-production:
+- Use smaller instance types
+- Single-AZ deployments
+- Spot instances for EKS nodes
+- Reduce backup retention periods
+
+### Production
+
+- Use Reserved Instances for steady-state workload
+- Enable auto-scaling
+- Right-size instances based on metrics
+- Use S3 lifecycle policies for backups
+
+## Monitoring
+
+### CloudWatch Alarms
+
+The modules create CloudWatch alarms for:
+- RDS CPU utilization
+- RDS storage space
+- Redis CPU utilization
+- Redis memory usage
+
+Configure SNS topics for notifications:
+
+```bash
+# Create SNS topic
+aws sns create-topic --name spywatcher-alerts
+
+# Subscribe to topic
+aws sns subscribe \
+  --topic-arn arn:aws:sns:us-east-1:123456789012:spywatcher-alerts \
+  --protocol email \
+  --notification-endpoint your-email@example.com
+```
+
+## Troubleshooting
+
+### State Lock Issues
+
+If you encounter state lock errors:
+
+```bash
+# Force unlock (use carefully)
+terraform force-unlock <LOCK_ID>
+```
+
+### EKS Access Issues
+
+If you can't access the cluster:
+
+```bash
+# Ensure your AWS credentials are correct
+aws sts get-caller-identity
+
+# Update kubeconfig
+aws eks update-kubeconfig --name <cluster-name> --region us-east-1
+
+# Check IAM authentication
+kubectl auth can-i get pods --all-namespaces
+```
+
+### RDS Connection Issues
+
+```bash
+# Check security group rules
+aws ec2 describe-security-groups --group-ids <sg-id>
+
+# Test connection from EKS node
+kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -- \
+  psql -h <rds-endpoint> -U spywatcher -d spywatcher
+```
+
+## Security Best Practices
+
+1. **Never commit secrets**: Use AWS Secrets Manager or environment variables
+2. **Enable MFA**: For AWS account access
+3. **Use IAM roles**: Instead of access keys where possible
+4. **Regular updates**: Keep Terraform and providers up to date
+5. **Review changes**: Always review `terraform plan` output
+6. **Backup state**: S3 versioning is enabled for state files
+7. **Least privilege**: IAM policies follow least privilege principle
+
+## Support
+
+For infrastructure issues:
+- Check Terraform state: `terraform show`
+- Review CloudWatch logs
+- Check AWS CloudTrail for API calls
+- Consult AWS documentation
+- Create issue in repository
--- a/terraform/environments/production/terraform.tfvars
+++ b/terraform/environments/production/terraform.tfvars
@@ -0,0 +1,54 @@
+# Production Environment Configuration
+
+environment = "production"
+aws_region  = "us-east-1"
+project_name = "spywatcher"
+
+# VPC Configuration
+vpc_cidr             = "10.0.0.0/16"
+availability_zones   = ["us-east-1a", "us-east-1b", "us-east-1c"]
+private_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
+public_subnet_cidrs  = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
+database_subnet_cidrs = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]
+
+# EKS Configuration
+eks_cluster_version = "1.28"
+eks_node_groups = {
+  general = {
+    desired_size   = 3
+    min_size       = 2
+    max_size       = 10
+    instance_types = ["t3.large"]
+    capacity_type  = "ON_DEMAND"
+  }
+  spot = {
+    desired_size   = 2
+    min_size       = 0
+    max_size       = 5
+    instance_types = ["t3.large", "t3a.large"]
+    capacity_type  = "SPOT"
+  }
+}
+
+# RDS Configuration
+rds_engine_version    = "15.3"
+rds_instance_class    = "db.t3.large"
+rds_allocated_storage = 100
+database_name         = "spywatcher"
+database_username     = "spywatcher"
+
+# Redis Configuration
+redis_node_type      = "cache.t3.medium"
+redis_num_cache_nodes = 2
+
+# SSL Certificate
+# Replace with actual certificate ARN after creating in ACM
+certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+# Additional Tags
+tags = {
+  Terraform   = "true"
+  Environment = "production"
+  Project     = "spywatcher"
+  CostCenter  = "engineering"
+}
--- a/terraform/environments/staging/terraform.tfvars
+++ b/terraform/environments/staging/terraform.tfvars
@@ -0,0 +1,45 @@
+# Staging Environment Configuration
+
+environment = "staging"
+aws_region  = "us-east-1"
+project_name = "spywatcher"
+
+# VPC Configuration
+vpc_cidr             = "10.1.0.0/16"
+availability_zones   = ["us-east-1a", "us-east-1b"]
+private_subnet_cidrs = ["10.1.1.0/24", "10.1.2.0/24"]
+public_subnet_cidrs  = ["10.1.101.0/24", "10.1.102.0/24"]
+database_subnet_cidrs = ["10.1.201.0/24", "10.1.202.0/24"]
+
+# EKS Configuration
+eks_cluster_version = "1.28"
+eks_node_groups = {
+  general = {
+    desired_size   = 2
+    min_size       = 1
+    max_size       = 4
+    instance_types = ["t3.medium"]
+    capacity_type  = "ON_DEMAND"
+  }
+}
+
+# RDS Configuration
+rds_engine_version    = "15.3"
+rds_instance_class    = "db.t3.medium"
+rds_allocated_storage = 50
+database_name         = "spywatcher"
+database_username     = "spywatcher"
+
+# Redis Configuration
+redis_node_type      = "cache.t3.small"
+redis_num_cache_nodes = 1
+
+# SSL Certificate
+certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+# Additional Tags
+tags = {
+  Terraform   = "true"
+  Environment = "staging"
+  Project     = "spywatcher"
+}
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -0,0 +1,141 @@
+terraform {
+  required_version = ">= 1.5.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.23"
+    }
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.11"
+    }
+  }
+
+  # Backend configuration for state storage
+  # Note: The key should be set dynamically using -backend-config flag:
+  # terraform init -backend-config="key=<environment>/terraform.tfstate"
+  backend "s3" {
+    bucket         = "spywatcher-terraform-state"
+    key            = "terraform.tfstate"  # Override with -backend-config flag
+    region         = "us-east-1"
+    encrypt        = true
+    dynamodb_table = "terraform-state-lock"
+  }
+}
+
+provider "aws" {
+  region = var.aws_region
+
+  default_tags {
+    tags = {
+      Project     = "spywatcher"
+      Environment = var.environment
+      ManagedBy   = "terraform"
+    }
+  }
+}
+
+# VPC Module
+module "vpc" {
+  source = "./modules/vpc"
+
+  environment         = var.environment
+  vpc_cidr            = var.vpc_cidr
+  availability_zones  = var.availability_zones
+  private_subnet_cidrs = var.private_subnet_cidrs
+  public_subnet_cidrs  = var.public_subnet_cidrs
+  database_subnet_cidrs = var.database_subnet_cidrs
+}
+
+# EKS Module
+module "eks" {
+  source = "./modules/eks"
+
+  environment        = var.environment
+  cluster_name       = "${var.project_name}-${var.environment}"
+  cluster_version    = var.eks_cluster_version
+  vpc_id             = module.vpc.vpc_id
+  private_subnet_ids = module.vpc.private_subnet_ids
+  
+  node_groups = var.eks_node_groups
+}
+
+# RDS PostgreSQL Module
+module "rds" {
+  source = "./modules/rds"
+
+  environment          = var.environment
+  identifier           = "${var.project_name}-${var.environment}"
+  engine_version       = var.rds_engine_version
+  instance_class       = var.rds_instance_class
+  allocated_storage    = var.rds_allocated_storage
+  database_name        = var.database_name
+  master_username      = var.database_username
+  vpc_id               = module.vpc.vpc_id
+  database_subnet_ids  = module.vpc.database_subnet_ids
+  allowed_security_group_ids = [module.eks.cluster_security_group_id]
+}
+
+# ElastiCache Redis Module
+module "redis" {
+  source = "./modules/redis"
+
+  environment        = var.environment
+  cluster_id         = "${var.project_name}-${var.environment}"
+  node_type          = var.redis_node_type
+  num_cache_nodes    = var.redis_num_cache_nodes
+  vpc_id             = module.vpc.vpc_id
+  subnet_ids         = module.vpc.private_subnet_ids
+  allowed_security_group_ids = [module.eks.cluster_security_group_id]
+}
+
+# Application Load Balancer Module
+module "alb" {
+  source = "./modules/alb"
+
+  environment     = var.environment
+  vpc_id          = module.vpc.vpc_id
+  public_subnet_ids = module.vpc.public_subnet_ids
+  certificate_arn = var.certificate_arn
+}
+
+# Configure Kubernetes provider
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)
+  
+  exec {
+    api_version = "client.authentication.k8s.io/v1"
+    command     = "aws"
+    args = [
+      "eks",
+      "get-token",
+      "--cluster-name",
+      module.eks.cluster_name
+    ]
+  }
+}
+
+# Configure Helm provider
+provider "helm" {
+  kubernetes {
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)
+    
+    exec {
+      api_version = "client.authentication.k8s.io/v1"
+      command     = "aws"
+      args = [
+        "eks",
+        "get-token",
+        "--cluster-name",
+        module.eks.cluster_name
+      ]
+    }
+  }
+}
--- a/terraform/modules/alb/main.tf
+++ b/terraform/modules/alb/main.tf
@@ -0,0 +1,262 @@
+resource "aws_security_group" "alb" {
+  name        = "${var.environment}-alb-sg"
+  description = "Security group for Application Load Balancer"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port   = 80
+    to_port     = 80
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTP access"
+  }
+
+  ingress {
+    from_port   = 443
+    to_port     = 443
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTPS access"
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name        = "${var.environment}-alb-sg"
+    Environment = var.environment
+  }
+}
+
+resource "aws_lb" "main" {
+  name               = "${var.environment}-alb"
+  internal           = false
+  load_balancer_type = "application"
+  security_groups    = [aws_security_group.alb.id]
+  subnets            = var.public_subnet_ids
+
+  enable_deletion_protection = var.enable_deletion_protection
+  enable_http2              = true
+  enable_cross_zone_load_balancing = true
+
+  drop_invalid_header_fields = true
+
+  tags = {
+    Name        = "${var.environment}-alb"
+    Environment = var.environment
+  }
+}
+
+resource "aws_lb_target_group" "backend" {
+  name     = "${var.environment}-backend-tg"
+  port     = 80
+  protocol = "HTTP"
+  vpc_id   = var.vpc_id
+
+  health_check {
+    enabled             = true
+    healthy_threshold   = 2
+    unhealthy_threshold = 3
+    timeout             = 5
+    interval            = 30
+    path                = "/health/live"
+    matcher             = "200"
+  }
+
+  deregistration_delay = 30
+
+  stickiness {
+    type            = "lb_cookie"
+    cookie_duration = 86400
+    enabled         = true
+  }
+
+  tags = {
+    Name        = "${var.environment}-backend-tg"
+    Environment = var.environment
+  }
+}
+
+resource "aws_lb_target_group" "frontend" {
+  name     = "${var.environment}-frontend-tg"
+  port     = 80
+  protocol = "HTTP"
+  vpc_id   = var.vpc_id
+
+  health_check {
+    enabled             = true
+    healthy_threshold   = 2
+    unhealthy_threshold = 3
+    timeout             = 5
+    interval            = 30
+    path                = "/"
+    matcher             = "200"
+  }
+
+  deregistration_delay = 30
+
+  tags = {
+    Name        = "${var.environment}-frontend-tg"
+    Environment = var.environment
+  }
+}
+
+# HTTP Listener - Redirect to HTTPS
+resource "aws_lb_listener" "http" {
+  load_balancer_arn = aws_lb.main.arn
+  port              = "80"
+  protocol          = "HTTP"
+
+  default_action {
+    type = "redirect"
+
+    redirect {
+      port        = "443"
+      protocol    = "HTTPS"
+      status_code = "HTTP_301"
+    }
+  }
+}
+
+# HTTPS Listener
+resource "aws_lb_listener" "https" {
+  load_balancer_arn = aws_lb.main.arn
+  port              = "443"
+  protocol          = "HTTPS"
+  ssl_policy        = "ELBSecurityPolicy-TLS-1-2-2017-01"
+  certificate_arn   = var.certificate_arn
+
+  default_action {
+    type = "forward"
+    
+    forward {
+      target_group {
+        arn    = aws_lb_target_group.frontend.arn
+        weight = 100
+      }
+    }
+  }
+}
+
+# Listener Rules for API routing
+resource "aws_lb_listener_rule" "api" {
+  listener_arn = aws_lb_listener.https.arn
+  priority     = 100
+
+  action {
+    type = "forward"
+    
+    forward {
+      target_group {
+        arn    = aws_lb_target_group.backend.arn
+        weight = 100
+      }
+    }
+  }
+
+  condition {
+    path_pattern {
+      values = ["/api/*", "/health/*"]
+    }
+  }
+}
+
+# WAF Web ACL (optional but recommended)
+resource "aws_wafv2_web_acl" "main" {
+  name  = "${var.environment}-waf"
+  scope = "REGIONAL"
+
+  default_action {
+    allow {}
+  }
+
+  # Rate limiting rule
+  rule {
+    name     = "RateLimitRule"
+    priority = 1
+
+    action {
+      block {}
+    }
+
+    statement {
+      rate_based_statement {
+        limit              = 2000
+        aggregate_key_type = "IP"
+      }
+    }
+
+    visibility_config {
+      cloudwatch_metrics_enabled = true
+      metric_name                = "RateLimitRule"
+      sampled_requests_enabled   = true
+    }
+  }
+
+  # AWS Managed Rules - Core Rule Set
+  rule {
+    name     = "AWSManagedRulesCommonRuleSet"
+    priority = 2
+
+    override_action {
+      none {}
+    }
+
+    statement {
+      managed_rule_group_statement {
+        name        = "AWSManagedRulesCommonRuleSet"
+        vendor_name = "AWS"
+      }
+    }
+
+    visibility_config {
+      cloudwatch_metrics_enabled = true
+      metric_name                = "AWSManagedRulesCommonRuleSetMetric"
+      sampled_requests_enabled   = true
+    }
+  }
+
+  # Known Bad Inputs Rule Set
+  rule {
+    name     = "AWSManagedRulesKnownBadInputsRuleSet"
+    priority = 3
+
+    override_action {
+      none {}
+    }
+
+    statement {
+      managed_rule_group_statement {
+        name        = "AWSManagedRulesKnownBadInputsRuleSet"
+        vendor_name = "AWS"
+      }
+    }
+
+    visibility_config {
+      cloudwatch_metrics_enabled = true
+      metric_name                = "AWSManagedRulesKnownBadInputsRuleSetMetric"
+      sampled_requests_enabled   = true
+    }
+  }
+
+  visibility_config {
+    cloudwatch_metrics_enabled = true
+    metric_name                = "${var.environment}-waf"
+    sampled_requests_enabled   = true
+  }
+
+  tags = {
+    Name        = "${var.environment}-waf"
+    Environment = var.environment
+  }
+}
+
+# Associate WAF with ALB
+resource "aws_wafv2_web_acl_association" "main" {
+  resource_arn = aws_lb.main.arn
+  web_acl_arn  = aws_wafv2_web_acl.main.arn
+}
--- a/terraform/modules/alb/outputs.tf
+++ b/terraform/modules/alb/outputs.tf
@@ -0,0 +1,29 @@
+output "alb_arn" {
+  description = "ALB ARN"
+  value       = aws_lb.main.arn
+}
+
+output "alb_dns_name" {
+  description = "ALB DNS name"
+  value       = aws_lb.main.dns_name
+}
+
+output "alb_zone_id" {
+  description = "ALB zone ID"
+  value       = aws_lb.main.zone_id
+}
+
+output "backend_target_group_arn" {
+  description = "Backend target group ARN"
+  value       = aws_lb_target_group.backend.arn
+}
+
+output "frontend_target_group_arn" {
+  description = "Frontend target group ARN"
+  value       = aws_lb_target_group.frontend.arn
+}
+
+output "alb_security_group_id" {
+  description = "ALB security group ID"
+  value       = aws_security_group.alb.id
+}
--- a/terraform/modules/alb/variables.tf
+++ b/terraform/modules/alb/variables.tf
@@ -0,0 +1,25 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "vpc_id" {
+  description = "VPC ID"
+  type        = string
+}
+
+variable "public_subnet_ids" {
+  description = "Public subnet IDs for ALB"
+  type        = list(string)
+}
+
+variable "certificate_arn" {
+  description = "ARN of SSL certificate"
+  type        = string
+}
+
+variable "enable_deletion_protection" {
+  description = "Enable deletion protection for ALB"
+  type        = bool
+  default     = true
+}
--- a/terraform/modules/eks/main.tf
+++ b/terraform/modules/eks/main.tf
@@ -0,0 +1,178 @@
+resource "aws_eks_cluster" "main" {
+  name     = var.cluster_name
+  role_arn = aws_iam_role.cluster.arn
+  version  = var.cluster_version
+
+  vpc_config {
+    subnet_ids              = var.private_subnet_ids
+    endpoint_private_access = true
+    endpoint_public_access  = true
+    security_group_ids      = [aws_security_group.cluster.id]
+  }
+
+  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
+
+  depends_on = [
+    aws_iam_role_policy_attachment.cluster_policy,
+    aws_iam_role_policy_attachment.vpc_resource_controller
+  ]
+
+  tags = {
+    Name        = var.cluster_name
+    Environment = var.environment
+  }
+}
+
+# EKS Cluster IAM Role
+resource "aws_iam_role" "cluster" {
+  name = "${var.cluster_name}-cluster-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Action = "sts:AssumeRole"
+      Effect = "Allow"
+      Principal = {
+        Service = "eks.amazonaws.com"
+      }
+    }]
+  })
+
+  tags = {
+    Name        = "${var.cluster_name}-cluster-role"
+    Environment = var.environment
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "cluster_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
+  role       = aws_iam_role.cluster.name
+}
+
+resource "aws_iam_role_policy_attachment" "vpc_resource_controller" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
+  role       = aws_iam_role.cluster.name
+}
+
+# Cluster Security Group
+resource "aws_security_group" "cluster" {
+  name        = "${var.cluster_name}-cluster-sg"
+  description = "Security group for EKS cluster"
+  vpc_id      = var.vpc_id
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name        = "${var.cluster_name}-cluster-sg"
+    Environment = var.environment
+  }
+}
+
+# EKS Node Groups
+resource "aws_eks_node_group" "main" {
+  for_each = var.node_groups
+
+  cluster_name    = aws_eks_cluster.main.name
+  node_group_name = "${var.cluster_name}-${each.key}"
+  node_role_arn   = aws_iam_role.node.arn
+  subnet_ids      = var.private_subnet_ids
+
+  capacity_type  = each.value.capacity_type
+  instance_types = each.value.instance_types
+
+  scaling_config {
+    desired_size = each.value.desired_size
+    max_size     = each.value.max_size
+    min_size     = each.value.min_size
+  }
+
+  update_config {
+    max_unavailable = 1
+  }
+
+  depends_on = [
+    aws_iam_role_policy_attachment.node_policy,
+    aws_iam_role_policy_attachment.cni_policy,
+    aws_iam_role_policy_attachment.container_registry_policy,
+  ]
+
+  tags = {
+    Name        = "${var.cluster_name}-${each.key}"
+    Environment = var.environment
+  }
+}
+
+# Node IAM Role
+resource "aws_iam_role" "node" {
+  name = "${var.cluster_name}-node-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Action = "sts:AssumeRole"
+      Effect = "Allow"
+      Principal = {
+        Service = "ec2.amazonaws.com"
+      }
+    }]
+  })
+
+  tags = {
+    Name        = "${var.cluster_name}-node-role"
+    Environment = var.environment
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "node_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+  role       = aws_iam_role.node.name
+}
+
+resource "aws_iam_role_policy_attachment" "cni_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+  role       = aws_iam_role.node.name
+}
+
+resource "aws_iam_role_policy_attachment" "container_registry_policy" {
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+  role       = aws_iam_role.node.name
+}
+
+# OIDC Provider for IRSA (IAM Roles for Service Accounts)
+data "tls_certificate" "cluster" {
+  url = aws_eks_cluster.main.identity[0].oidc[0].issuer
+}
+
+resource "aws_iam_openid_connect_provider" "cluster" {
+  client_id_list  = ["sts.amazonaws.com"]
+  thumbprint_list = [data.tls_certificate.cluster.certificates[0].sha1_fingerprint]
+  url             = aws_eks_cluster.main.identity[0].oidc[0].issuer
+
+  tags = {
+    Name        = "${var.cluster_name}-oidc"
+    Environment = var.environment
+  }
+}
+
+# EKS Add-ons
+resource "aws_eks_addon" "vpc_cni" {
+  cluster_name = aws_eks_cluster.main.name
+  addon_name   = "vpc-cni"
+}
+
+resource "aws_eks_addon" "coredns" {
+  cluster_name = aws_eks_cluster.main.name
+  addon_name   = "coredns"
+  
+  depends_on = [aws_eks_node_group.main]
+}
+
+resource "aws_eks_addon" "kube_proxy" {
+  cluster_name = aws_eks_cluster.main.name
+  addon_name   = "kube-proxy"
+}
--- a/terraform/modules/eks/outputs.tf
+++ b/terraform/modules/eks/outputs.tf
@@ -0,0 +1,29 @@
+output "cluster_name" {
+  description = "EKS cluster name"
+  value       = aws_eks_cluster.main.name
+}
+
+output "cluster_endpoint" {
+  description = "EKS cluster endpoint"
+  value       = aws_eks_cluster.main.endpoint
+}
+
+output "cluster_ca_certificate" {
+  description = "EKS cluster CA certificate"
+  value       = aws_eks_cluster.main.certificate_authority[0].data
+}
+
+output "cluster_security_group_id" {
+  description = "Security group ID attached to the EKS cluster"
+  value       = aws_security_group.cluster.id
+}
+
+output "cluster_oidc_issuer_url" {
+  description = "OIDC issuer URL"
+  value       = aws_eks_cluster.main.identity[0].oidc[0].issuer
+}
+
+output "node_role_arn" {
+  description = "IAM role ARN for EKS nodes"
+  value       = aws_iam_role.node.arn
+}
--- a/terraform/modules/eks/variables.tf
+++ b/terraform/modules/eks/variables.tf
@@ -0,0 +1,35 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "cluster_name" {
+  description = "EKS cluster name"
+  type        = string
+}
+
+variable "cluster_version" {
+  description = "Kubernetes version"
+  type        = string
+}
+
+variable "vpc_id" {
+  description = "VPC ID"
+  type        = string
+}
+
+variable "private_subnet_ids" {
+  description = "Private subnet IDs for EKS"
+  type        = list(string)
+}
+
+variable "node_groups" {
+  description = "Node groups configuration"
+  type = map(object({
+    desired_size   = number
+    min_size       = number
+    max_size       = number
+    instance_types = list(string)
+    capacity_type  = string
+  }))
+}
--- a/terraform/modules/rds/main.tf
+++ b/terraform/modules/rds/main.tf
@@ -0,0 +1,164 @@
+resource "aws_db_subnet_group" "main" {
+  name       = "${var.identifier}-subnet-group"
+  subnet_ids = var.database_subnet_ids
+
+  tags = {
+    Name        = "${var.identifier}-subnet-group"
+    Environment = var.environment
+  }
+}
+
+resource "aws_security_group" "rds" {
+  name        = "${var.identifier}-rds-sg"
+  description = "Security group for RDS PostgreSQL"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port       = 5432
+    to_port         = 5432
+    protocol        = "tcp"
+    security_groups = var.allowed_security_group_ids
+    description     = "PostgreSQL access from EKS"
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name        = "${var.identifier}-rds-sg"
+    Environment = var.environment
+  }
+}
+
+resource "random_password" "master" {
+  length  = 32
+  special = true
+}
+
+resource "aws_db_instance" "postgres" {
+  identifier     = var.identifier
+  engine         = "postgres"
+  engine_version = var.engine_version
+  instance_class = var.instance_class
+
+  allocated_storage     = var.allocated_storage
+  max_allocated_storage = var.max_allocated_storage
+  storage_type          = "gp3"
+  storage_encrypted     = true
+  kms_key_id            = aws_kms_key.rds.arn
+
+  db_name  = var.database_name
+  username = var.master_username
+  password = random_password.master.result
+
+  db_subnet_group_name   = aws_db_subnet_group.main.name
+  vpc_security_group_ids = [aws_security_group.rds.id]
+  publicly_accessible    = false
+
+  multi_az               = var.multi_az
+  backup_retention_period = var.backup_retention_period
+  backup_window          = var.backup_window
+  maintenance_window     = var.maintenance_window
+
+  enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"]
+  performance_insights_enabled    = true
+  performance_insights_retention_period = 7
+
+  deletion_protection       = var.deletion_protection
+  skip_final_snapshot       = var.skip_final_snapshot
+  final_snapshot_identifier = var.skip_final_snapshot ? null : "${var.identifier}-final-snapshot"
+
+  auto_minor_version_upgrade = true
+  apply_immediately         = false
+
+  tags = {
+    Name        = var.identifier
+    Environment = var.environment
+  }
+}
+
+resource "aws_kms_key" "rds" {
+  description             = "KMS key for RDS encryption"
+  deletion_window_in_days = 10
+  enable_key_rotation     = true
+
+  tags = {
+    Name        = "${var.identifier}-kms"
+    Environment = var.environment
+  }
+}
+
+resource "aws_kms_alias" "rds" {
+  name          = "alias/${var.identifier}-rds"
+  target_key_id = aws_kms_key.rds.key_id
+}
+
+# Store password in Secrets Manager
+resource "aws_secretsmanager_secret" "db_password" {
+  name        = "${var.identifier}-db-password"
+  description = "Database master password"
+
+  tags = {
+    Name        = "${var.identifier}-db-password"
+    Environment = var.environment
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "db_password" {
+  secret_id     = aws_secretsmanager_secret.db_password.id
+  secret_string = jsonencode({
+    username = var.master_username
+    password = random_password.master.result
+    engine   = "postgres"
+    host     = aws_db_instance.postgres.address
+    port     = aws_db_instance.postgres.port
+    dbname   = var.database_name
+  })
+}
+
+# CloudWatch Alarms
+resource "aws_cloudwatch_metric_alarm" "cpu" {
+  alarm_name          = "${var.identifier}-cpu-utilization"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "2"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/RDS"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "80"
+  alarm_description   = "This metric monitors RDS CPU utilization"
+
+  dimensions = {
+    DBInstanceIdentifier = aws_db_instance.postgres.id
+  }
+
+  tags = {
+    Name        = "${var.identifier}-cpu-alarm"
+    Environment = var.environment
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "storage" {
+  alarm_name          = "${var.identifier}-free-storage-space"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "FreeStorageSpace"
+  namespace           = "AWS/RDS"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "10000000000" # 10GB in bytes
+  alarm_description   = "This metric monitors RDS free storage space"
+
+  dimensions = {
+    DBInstanceIdentifier = aws_db_instance.postgres.id
+  }
+
+  tags = {
+    Name        = "${var.identifier}-storage-alarm"
+    Environment = var.environment
+  }
+}
--- a/terraform/modules/rds/outputs.tf
+++ b/terraform/modules/rds/outputs.tf
@@ -0,0 +1,35 @@
+output "db_endpoint" {
+  description = "RDS instance endpoint"
+  value       = aws_db_instance.postgres.endpoint
+}
+
+output "db_address" {
+  description = "RDS instance address"
+  value       = aws_db_instance.postgres.address
+}
+
+output "db_port" {
+  description = "RDS instance port"
+  value       = aws_db_instance.postgres.port
+}
+
+output "db_name" {
+  description = "Database name"
+  value       = aws_db_instance.postgres.db_name
+}
+
+output "db_username" {
+  description = "Master username"
+  value       = aws_db_instance.postgres.username
+  sensitive   = true
+}
+
+output "security_group_id" {
+  description = "RDS security group ID"
+  value       = aws_security_group.rds.id
+}
+
+output "secret_arn" {
+  description = "ARN of the secret containing database credentials"
+  value       = aws_secretsmanager_secret.db_password.arn
+}
--- a/terraform/modules/rds/variables.tf
+++ b/terraform/modules/rds/variables.tf
@@ -0,0 +1,94 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "identifier" {
+  description = "Identifier for RDS instance"
+  type        = string
+}
+
+variable "engine_version" {
+  description = "PostgreSQL engine version"
+  type        = string
+  default     = "15.3"
+}
+
+variable "instance_class" {
+  description = "RDS instance class"
+  type        = string
+  default     = "db.t3.medium"
+}
+
+variable "allocated_storage" {
+  description = "Allocated storage in GB"
+  type        = number
+  default     = 100
+}
+
+variable "max_allocated_storage" {
+  description = "Maximum allocated storage for autoscaling in GB"
+  type        = number
+  default     = 500
+}
+
+variable "database_name" {
+  description = "Name of the database"
+  type        = string
+}
+
+variable "master_username" {
+  description = "Master username"
+  type        = string
+}
+
+variable "vpc_id" {
+  description = "VPC ID"
+  type        = string
+}
+
+variable "database_subnet_ids" {
+  description = "Database subnet IDs"
+  type        = list(string)
+}
+
+variable "allowed_security_group_ids" {
+  description = "Security group IDs allowed to access RDS"
+  type        = list(string)
+}
+
+variable "multi_az" {
+  description = "Enable Multi-AZ deployment"
+  type        = bool
+  default     = true
+}
+
+variable "backup_retention_period" {
+  description = "Backup retention period in days"
+  type        = number
+  default     = 7
+}
+
+variable "backup_window" {
+  description = "Preferred backup window"
+  type        = string
+  default     = "03:00-04:00"
+}
+
+variable "maintenance_window" {
+  description = "Preferred maintenance window"
+  type        = string
+  default     = "mon:04:00-mon:05:00"
+}
+
+variable "deletion_protection" {
+  description = "Enable deletion protection"
+  type        = bool
+  default     = true
+}
+
+variable "skip_final_snapshot" {
+  description = "Skip final snapshot when destroying"
+  type        = bool
+  default     = false
+}
--- a/terraform/modules/redis/main.tf
+++ b/terraform/modules/redis/main.tf
@@ -0,0 +1,174 @@
+resource "aws_elasticache_subnet_group" "main" {
+  name       = "${var.cluster_id}-subnet-group"
+  subnet_ids = var.subnet_ids
+
+  tags = {
+    Name        = "${var.cluster_id}-subnet-group"
+    Environment = var.environment
+  }
+}
+
+resource "aws_security_group" "redis" {
+  name        = "${var.cluster_id}-redis-sg"
+  description = "Security group for ElastiCache Redis"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port       = 6379
+    to_port         = 6379
+    protocol        = "tcp"
+    security_groups = var.allowed_security_group_ids
+    description     = "Redis access from EKS"
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name        = "${var.cluster_id}-redis-sg"
+    Environment = var.environment
+  }
+}
+
+resource "random_password" "auth_token" {
+  length  = 32
+  special = false
+}
+
+resource "aws_elasticache_replication_group" "redis" {
+  replication_group_id       = var.cluster_id
+  replication_group_description = "Redis cluster for ${var.environment}"
+  engine                     = "redis"
+  engine_version             = var.engine_version
+  node_type                  = var.node_type
+  num_cache_clusters         = var.num_cache_nodes
+  port                       = 6379
+
+  subnet_group_name          = aws_elasticache_subnet_group.main.name
+  security_group_ids         = [aws_security_group.redis.id]
+
+  parameter_group_name       = aws_elasticache_parameter_group.main.name
+  
+  at_rest_encryption_enabled = true
+  transit_encryption_enabled = true
+  auth_token                 = random_password.auth_token.result
+  
+  automatic_failover_enabled = var.num_cache_nodes > 1 ? true : false
+  multi_az_enabled          = var.num_cache_nodes > 1 ? true : false
+
+  snapshot_retention_limit   = var.snapshot_retention_limit
+  snapshot_window           = var.snapshot_window
+  maintenance_window        = var.maintenance_window
+
+  auto_minor_version_upgrade = true
+  apply_immediately         = false
+
+  log_delivery_configuration {
+    destination      = aws_cloudwatch_log_group.redis.name
+    destination_type = "cloudwatch-logs"
+    log_format       = "json"
+    log_type         = "slow-log"
+  }
+
+  tags = {
+    Name        = var.cluster_id
+    Environment = var.environment
+  }
+}
+
+resource "aws_elasticache_parameter_group" "main" {
+  name   = "${var.cluster_id}-params"
+  family = "redis7"
+
+  parameter {
+    name  = "maxmemory-policy"
+    value = "allkeys-lru"
+  }
+
+  parameter {
+    name  = "timeout"
+    value = "300"
+  }
+
+  tags = {
+    Name        = "${var.cluster_id}-params"
+    Environment = var.environment
+  }
+}
+
+resource "aws_cloudwatch_log_group" "redis" {
+  name              = "/aws/elasticache/${var.cluster_id}"
+  retention_in_days = 7
+
+  tags = {
+    Name        = "${var.cluster_id}-logs"
+    Environment = var.environment
+  }
+}
+
+# Store auth token in Secrets Manager
+resource "aws_secretsmanager_secret" "redis_auth" {
+  name        = "${var.cluster_id}-auth-token"
+  description = "Redis authentication token"
+
+  tags = {
+    Name        = "${var.cluster_id}-auth-token"
+    Environment = var.environment
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "redis_auth" {
+  secret_id     = aws_secretsmanager_secret.redis_auth.id
+  secret_string = jsonencode({
+    auth_token = random_password.auth_token.result
+    endpoint   = aws_elasticache_replication_group.redis.primary_endpoint_address
+    port       = 6379
+  })
+}
+
+# CloudWatch Alarms
+resource "aws_cloudwatch_metric_alarm" "cpu" {
+  alarm_name          = "${var.cluster_id}-cpu-utilization"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "2"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ElastiCache"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "75"
+  alarm_description   = "This metric monitors Redis CPU utilization"
+
+  dimensions = {
+    ReplicationGroupId = var.cluster_id
+  }
+
+  tags = {
+    Name        = "${var.cluster_id}-cpu-alarm"
+    Environment = var.environment
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "memory" {
+  alarm_name          = "${var.cluster_id}-database-memory-usage"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "DatabaseMemoryUsagePercentage"
+  namespace           = "AWS/ElastiCache"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "90"
+  alarm_description   = "This metric monitors Redis memory usage"
+
+  dimensions = {
+    ReplicationGroupId = var.cluster_id
+  }
+
+  tags = {
+    Name        = "${var.cluster_id}-memory-alarm"
+    Environment = var.environment
+  }
+}
--- a/terraform/modules/redis/outputs.tf
+++ b/terraform/modules/redis/outputs.tf
@@ -0,0 +1,19 @@
+output "redis_endpoint" {
+  description = "Redis primary endpoint address"
+  value       = aws_elasticache_replication_group.redis.primary_endpoint_address
+}
+
+output "redis_port" {
+  description = "Redis port"
+  value       = 6379
+}
+
+output "security_group_id" {
+  description = "Redis security group ID"
+  value       = aws_security_group.redis.id
+}
+
+output "secret_arn" {
+  description = "ARN of the secret containing Redis auth token"
+  value       = aws_secretsmanager_secret.redis_auth.arn
+}
--- a/terraform/modules/redis/variables.tf
+++ b/terraform/modules/redis/variables.tf
@@ -0,0 +1,60 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "cluster_id" {
+  description = "ElastiCache cluster ID"
+  type        = string
+}
+
+variable "engine_version" {
+  description = "Redis engine version"
+  type        = string
+  default     = "7.0"
+}
+
+variable "node_type" {
+  description = "ElastiCache node type"
+  type        = string
+  default     = "cache.t3.medium"
+}
+
+variable "num_cache_nodes" {
+  description = "Number of cache nodes"
+  type        = number
+  default     = 1
+}
+
+variable "vpc_id" {
+  description = "VPC ID"
+  type        = string
+}
+
+variable "subnet_ids" {
+  description = "Subnet IDs for ElastiCache"
+  type        = list(string)
+}
+
+variable "allowed_security_group_ids" {
+  description = "Security group IDs allowed to access Redis"
+  type        = list(string)
+}
+
+variable "snapshot_retention_limit" {
+  description = "Number of days to retain automatic snapshots"
+  type        = number
+  default     = 5
+}
+
+variable "snapshot_window" {
+  description = "Daily time range for snapshots"
+  type        = string
+  default     = "03:00-05:00"
+}
+
+variable "maintenance_window" {
+  description = "Weekly time range for maintenance"
+  type        = string
+  default     = "sun:05:00-sun:07:00"
+}
--- a/terraform/modules/vpc/main.tf
+++ b/terraform/modules/vpc/main.tf
@@ -0,0 +1,213 @@
+resource "aws_vpc" "main" {
+  cidr_block           = var.vpc_cidr
+  enable_dns_hostnames = true
+  enable_dns_support   = true
+
+  tags = {
+    Name        = "${var.environment}-vpc"
+    Environment = var.environment
+    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
+  }
+}
+
+# Internet Gateway
+resource "aws_internet_gateway" "main" {
+  vpc_id = aws_vpc.main.id
+
+  tags = {
+    Name        = "${var.environment}-igw"
+    Environment = var.environment
+  }
+}
+
+# Public Subnets
+resource "aws_subnet" "public" {
+  count             = length(var.public_subnet_cidrs)
+  vpc_id            = aws_vpc.main.id
+  cidr_block        = var.public_subnet_cidrs[count.index]
+  availability_zone = var.availability_zones[count.index]
+
+  map_public_ip_on_launch = true
+
+  tags = {
+    Name        = "${var.environment}-public-subnet-${count.index + 1}"
+    Environment = var.environment
+    Type        = "public"
+    "kubernetes.io/role/elb" = "1"
+    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
+  }
+}
+
+# Private Subnets
+resource "aws_subnet" "private" {
+  count             = length(var.private_subnet_cidrs)
+  vpc_id            = aws_vpc.main.id
+  cidr_block        = var.private_subnet_cidrs[count.index]
+  availability_zone = var.availability_zones[count.index]
+
+  tags = {
+    Name        = "${var.environment}-private-subnet-${count.index + 1}"
+    Environment = var.environment
+    Type        = "private"
+    "kubernetes.io/role/internal-elb" = "1"
+    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
+  }
+}
+
+# Database Subnets
+resource "aws_subnet" "database" {
+  count             = length(var.database_subnet_cidrs)
+  vpc_id            = aws_vpc.main.id
+  cidr_block        = var.database_subnet_cidrs[count.index]
+  availability_zone = var.availability_zones[count.index]
+
+  tags = {
+    Name        = "${var.environment}-database-subnet-${count.index + 1}"
+    Environment = var.environment
+    Type        = "database"
+  }
+}
+
+# Elastic IPs for NAT Gateways
+resource "aws_eip" "nat" {
+  count  = length(var.availability_zones)
+  domain = "vpc"
+
+  tags = {
+    Name        = "${var.environment}-nat-eip-${count.index + 1}"
+    Environment = var.environment
+  }
+}
+
+# NAT Gateways
+resource "aws_nat_gateway" "main" {
+  count         = length(var.availability_zones)
+  allocation_id = aws_eip.nat[count.index].id
+  subnet_id     = aws_subnet.public[count.index].id
+
+  tags = {
+    Name        = "${var.environment}-nat-gateway-${count.index + 1}"
+    Environment = var.environment
+  }
+
+  depends_on = [aws_internet_gateway.main]
+}
+
+# Public Route Table
+resource "aws_route_table" "public" {
+  vpc_id = aws_vpc.main.id
+
+  route {
+    cidr_block = "0.0.0.0/0"
+    gateway_id = aws_internet_gateway.main.id
+  }
+
+  tags = {
+    Name        = "${var.environment}-public-rt"
+    Environment = var.environment
+  }
+}
+
+# Private Route Tables
+resource "aws_route_table" "private" {
+  count  = length(var.availability_zones)
+  vpc_id = aws_vpc.main.id
+
+  route {
+    cidr_block     = "0.0.0.0/0"
+    nat_gateway_id = aws_nat_gateway.main[count.index].id
+  }
+
+  tags = {
+    Name        = "${var.environment}-private-rt-${count.index + 1}"
+    Environment = var.environment
+  }
+}
+
+# Route Table Associations - Public
+resource "aws_route_table_association" "public" {
+  count          = length(var.public_subnet_cidrs)
+  subnet_id      = aws_subnet.public[count.index].id
+  route_table_id = aws_route_table.public.id
+}
+
+# Route Table Associations - Private
+resource "aws_route_table_association" "private" {
+  count          = length(var.private_subnet_cidrs)
+  subnet_id      = aws_subnet.private[count.index].id
+  route_table_id = aws_route_table.private[count.index].id
+}
+
+# Route Table Associations - Database
+resource "aws_route_table_association" "database" {
+  count          = length(var.database_subnet_cidrs)
+  subnet_id      = aws_subnet.database[count.index].id
+  route_table_id = aws_route_table.private[count.index].id
+}
+
+# VPC Flow Logs
+resource "aws_flow_log" "main" {
+  iam_role_arn    = aws_iam_role.flow_log.arn
+  log_destination = aws_cloudwatch_log_group.flow_log.arn
+  traffic_type    = "ALL"
+  vpc_id          = aws_vpc.main.id
+
+  tags = {
+    Name        = "${var.environment}-flow-log"
+    Environment = var.environment
+  }
+}
+
+resource "aws_cloudwatch_log_group" "flow_log" {
+  name              = "/aws/vpc/${var.environment}-flow-log"
+  retention_in_days = 30
+
+  tags = {
+    Name        = "${var.environment}-flow-log"
+    Environment = var.environment
+  }
+}
+
+resource "aws_iam_role" "flow_log" {
+  name = "${var.environment}-vpc-flow-log-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRole"
+        Effect = "Allow"
+        Principal = {
+          Service = "vpc-flow-logs.amazonaws.com"
+        }
+      }
+    ]
+  })
+
+  tags = {
+    Name        = "${var.environment}-flow-log-role"
+    Environment = var.environment
+  }
+}
+
+resource "aws_iam_role_policy" "flow_log" {
+  name = "${var.environment}-vpc-flow-log-policy"
+  role = aws_iam_role.flow_log.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = [
+          "logs:CreateLogGroup",
+          "logs:CreateLogStream",
+          "logs:PutLogEvents",
+          "logs:DescribeLogGroups",
+          "logs:DescribeLogStreams"
+        ]
+        Effect = "Allow"
+        Resource = "${aws_cloudwatch_log_group.flow_log.arn}:*"
+      }
+    ]
+  })
+}
--- a/terraform/modules/vpc/outputs.tf
+++ b/terraform/modules/vpc/outputs.tf
@@ -0,0 +1,29 @@
+output "vpc_id" {
+  description = "VPC ID"
+  value       = aws_vpc.main.id
+}
+
+output "vpc_cidr" {
+  description = "VPC CIDR block"
+  value       = aws_vpc.main.cidr_block
+}
+
+output "public_subnet_ids" {
+  description = "Public subnet IDs"
+  value       = aws_subnet.public[*].id
+}
+
+output "private_subnet_ids" {
+  description = "Private subnet IDs"
+  value       = aws_subnet.private[*].id
+}
+
+output "database_subnet_ids" {
+  description = "Database subnet IDs"
+  value       = aws_subnet.database[*].id
+}
+
+output "nat_gateway_ids" {
+  description = "NAT Gateway IDs"
+  value       = aws_nat_gateway.main[*].id
+}
--- a/terraform/modules/vpc/variables.tf
+++ b/terraform/modules/vpc/variables.tf
@@ -0,0 +1,29 @@
+variable "environment" {
+  description = "Environment name"
+  type        = string
+}
+
+variable "vpc_cidr" {
+  description = "CIDR block for VPC"
+  type        = string
+}
+
+variable "availability_zones" {
+  description = "List of availability zones"
+  type        = list(string)
+}
+
+variable "private_subnet_cidrs" {
+  description = "CIDR blocks for private subnets"
+  type        = list(string)
+}
+
+variable "public_subnet_cidrs" {
+  description = "CIDR blocks for public subnets"
+  type        = list(string)
+}
+
+variable "database_subnet_cidrs" {
+  description = "CIDR blocks for database subnets"
+  type        = list(string)
+}
--- a/terraform/outputs.tf
+++ b/terraform/outputs.tf
@@ -0,0 +1,61 @@
+output "vpc_id" {
+  description = "VPC ID"
+  value       = module.vpc.vpc_id
+}
+
+output "private_subnet_ids" {
+  description = "Private subnet IDs"
+  value       = module.vpc.private_subnet_ids
+}
+
+output "public_subnet_ids" {
+  description = "Public subnet IDs"
+  value       = module.vpc.public_subnet_ids
+}
+
+output "eks_cluster_name" {
+  description = "EKS cluster name"
+  value       = module.eks.cluster_name
+}
+
+output "eks_cluster_endpoint" {
+  description = "EKS cluster endpoint"
+  value       = module.eks.cluster_endpoint
+}
+
+output "eks_cluster_security_group_id" {
+  description = "EKS cluster security group ID"
+  value       = module.eks.cluster_security_group_id
+}
+
+output "rds_endpoint" {
+  description = "RDS database endpoint"
+  value       = module.rds.db_endpoint
+  sensitive   = true
+}
+
+output "rds_database_name" {
+  description = "RDS database name"
+  value       = module.rds.db_name
+}
+
+output "redis_endpoint" {
+  description = "Redis cluster endpoint"
+  value       = module.redis.redis_endpoint
+  sensitive   = true
+}
+
+output "alb_dns_name" {
+  description = "ALB DNS name"
+  value       = module.alb.alb_dns_name
+}
+
+output "alb_zone_id" {
+  description = "ALB zone ID"
+  value       = module.alb.alb_zone_id
+}
+
+output "configure_kubectl" {
+  description = "Command to configure kubectl"
+  value       = "aws eks update-kubeconfig --name ${module.eks.cluster_name} --region ${var.aws_region}"
+}
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -0,0 +1,132 @@
+variable "aws_region" {
+  description = "AWS region to deploy resources"
+  type        = string
+  default     = "us-east-1"
+}
+
+variable "environment" {
+  description = "Environment name (staging, production)"
+  type        = string
+}
+
+variable "project_name" {
+  description = "Project name"
+  type        = string
+  default     = "spywatcher"
+}
+
+# VPC Variables
+variable "vpc_cidr" {
+  description = "CIDR block for VPC"
+  type        = string
+  default     = "10.0.0.0/16"
+}
+
+variable "availability_zones" {
+  description = "List of availability zones"
+  type        = list(string)
+  default     = ["us-east-1a", "us-east-1b", "us-east-1c"]
+}
+
+variable "private_subnet_cidrs" {
+  description = "CIDR blocks for private subnets"
+  type        = list(string)
+  default     = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
+}
+
+variable "public_subnet_cidrs" {
+  description = "CIDR blocks for public subnets"
+  type        = list(string)
+  default     = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
+}
+
+variable "database_subnet_cidrs" {
+  description = "CIDR blocks for database subnets"
+  type        = list(string)
+  default     = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]
+}
+
+# EKS Variables
+variable "eks_cluster_version" {
+  description = "Kubernetes version for EKS cluster"
+  type        = string
+  default     = "1.28"
+}
+
+variable "eks_node_groups" {
+  description = "EKS node groups configuration"
+  type = map(object({
+    desired_size   = number
+    min_size       = number
+    max_size       = number
+    instance_types = list(string)
+    capacity_type  = string
+  }))
+  default = {
+    general = {
+      desired_size   = 3
+      min_size       = 2
+      max_size       = 10
+      instance_types = ["t3.medium"]
+      capacity_type  = "ON_DEMAND"
+    }
+  }
+}
+
+# RDS Variables
+variable "rds_engine_version" {
+  description = "PostgreSQL engine version"
+  type        = string
+  default     = "15.3"
+}
+
+variable "rds_instance_class" {
+  description = "RDS instance class"
+  type        = string
+  default     = "db.t3.medium"
+}
+
+variable "rds_allocated_storage" {
+  description = "Allocated storage in GB"
+  type        = number
+  default     = 100
+}
+
+variable "database_name" {
+  description = "Name of the database"
+  type        = string
+  default     = "spywatcher"
+}
+
+variable "database_username" {
+  description = "Database master username"
+  type        = string
+  default     = "spywatcher"
+}
+
+# Redis Variables
+variable "redis_node_type" {
+  description = "ElastiCache node type"
+  type        = string
+  default     = "cache.t3.medium"
+}
+
+variable "redis_num_cache_nodes" {
+  description = "Number of cache nodes"
+  type        = number
+  default     = 1
+}
+
+# SSL Certificate
+variable "certificate_arn" {
+  description = "ARN of SSL certificate for ALB"
+  type        = string
+  default     = ""
+}
+
+# Tags
+variable "tags" {
+  description = "Additional tags for resources"
+  type        = map(string)
+  default     = {}
+}