Add production deployment infrastructure with Kubernetes, Terraform, and multi-strategy CI/CD (#145)
* Initial plan * Add Kubernetes manifests and Terraform infrastructure modules Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Add Helm charts, deployment scripts, CI/CD workflows, and documentation Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Add infrastructure documentation and update README Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Fix code review issues and security vulnerabilities Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> * Address PR review comments: improve security, fix API versions, and enhance deployment reliability Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: onnwee <211922112+onnwee@users.noreply.github.com>
This commit was merged in pull request #145.
This commit is contained in:
291
.github/workflows/deploy-production.yml
vendored
Normal file
291
.github/workflows/deploy-production.yml
vendored
Normal file
@@ -0,0 +1,291 @@
|
||||
name: Deploy to Production
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
deployment_strategy:
|
||||
description: 'Deployment strategy'
|
||||
required: true
|
||||
default: 'rolling'
|
||||
type: choice
|
||||
options:
|
||||
- rolling
|
||||
- blue-green
|
||||
- canary
|
||||
|
||||
env:
|
||||
AWS_REGION: us-east-1
|
||||
EKS_CLUSTER_NAME: spywatcher-production
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME_BACKEND: ${{ github.repository_owner }}/spywatcher-backend
|
||||
IMAGE_NAME_FRONTEND: ${{ github.repository_owner }}/spywatcher-frontend
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
name: Build and Push Docker Images
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
outputs:
|
||||
backend-tag: ${{ steps.meta-backend.outputs.tags }}
|
||||
frontend-tag: ${{ steps.meta-frontend.outputs.tags }}
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata for backend
|
||||
id: meta-backend
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}
|
||||
tags: |
|
||||
type=sha,prefix={{branch}}-
|
||||
type=ref,event=branch
|
||||
type=semver,pattern={{version}}
|
||||
type=raw,value=latest,enable={{is_default_branch}}
|
||||
|
||||
- name: Build and push backend image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-backend.outputs.tags }}
|
||||
labels: ${{ steps.meta-backend.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Extract metadata for frontend
|
||||
id: meta-frontend
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}
|
||||
tags: |
|
||||
type=sha,prefix={{branch}}-
|
||||
type=ref,event=branch
|
||||
type=semver,pattern={{version}}
|
||||
type=raw,value=latest,enable={{is_default_branch}}
|
||||
|
||||
- name: Build and push frontend image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./frontend
|
||||
file: ./frontend/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-frontend.outputs.tags }}
|
||||
labels: ${{ steps.meta-frontend.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
deploy:
|
||||
name: Deploy to Kubernetes
|
||||
runs-on: ubuntu-latest
|
||||
needs: build-and-push
|
||||
environment: production
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write # Reserved for AWS OIDC authentication (note: this workflow currently uses static access keys — see "Configure AWS credentials" step)
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Update kubeconfig
|
||||
run: |
|
||||
aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Install kubectl
|
||||
uses: azure/setup-kubectl@v4
|
||||
with:
|
||||
version: 'v1.28.0'
|
||||
|
||||
- name: Verify cluster access
|
||||
run: |
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
|
||||
- name: Run database migrations
|
||||
run: |
|
||||
# Create unique migration job name
|
||||
JOB_NAME="db-migration-$(date +%s)"
|
||||
|
||||
# Update the migration job manifest with unique name and latest image
|
||||
kubectl get job spywatcher-db-migration -n spywatcher -o yaml 2>/dev/null | \
|
||||
sed "s/name: spywatcher-db-migration/name: $JOB_NAME/" | \
|
||||
sed "s|image: ghcr.io/subculture-collective/spywatcher-backend:.*|image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest|" | \
|
||||
kubectl apply -f - || \
|
||||
kubectl create job $JOB_NAME --image=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
|
||||
-n spywatcher -- sh -c "npx prisma migrate deploy"
|
||||
|
||||
# Set DATABASE_URL secret for the job if created via kubectl create
|
||||
kubectl set env job/$JOB_NAME -n spywatcher --from=secret/spywatcher-secrets --keys=database-url || true
|
||||
|
||||
# Wait for migration to complete
|
||||
kubectl wait --for=condition=complete --timeout=300s job/$JOB_NAME -n spywatcher
|
||||
|
||||
# Show migration logs
|
||||
kubectl logs job/$JOB_NAME -n spywatcher
|
||||
|
||||
- name: Deploy with Rolling Update
|
||||
if: github.event.inputs.deployment_strategy == 'rolling' || github.event.inputs.deployment_strategy == ''
|
||||
run: |
|
||||
# Update backend deployment
|
||||
kubectl set image deployment/spywatcher-backend \
|
||||
backend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}:latest \
|
||||
-n spywatcher
|
||||
|
||||
# Update frontend deployment
|
||||
kubectl set image deployment/spywatcher-frontend \
|
||||
frontend=${{ env.REGISTRY }}/${{ env.IMAGE_NAME_FRONTEND }}:latest \
|
||||
-n spywatcher
|
||||
|
||||
# Wait for rollout to complete
|
||||
kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=10m
|
||||
kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=10m
|
||||
|
||||
- name: Deploy with Blue-Green
|
||||
if: github.event.inputs.deployment_strategy == 'blue-green'
|
||||
run: |
|
||||
chmod +x ./scripts/deployment/blue-green-deploy.sh
|
||||
IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
|
||||
|
||||
- name: Deploy with Canary
|
||||
if: github.event.inputs.deployment_strategy == 'canary'
|
||||
run: |
|
||||
chmod +x ./scripts/deployment/canary-deploy.sh
|
||||
IMAGE_TAG=latest ./scripts/deployment/canary-deploy.sh
|
||||
|
||||
- name: Run smoke tests
|
||||
run: |
|
||||
# Test via ingress if available, otherwise use port-forward
|
||||
INGRESS_HOST=$(kubectl get ingress spywatcher-ingress -n spywatcher -o jsonpath='{.spec.rules[0].host}' 2>/dev/null || echo "")
|
||||
|
||||
if [ -n "$INGRESS_HOST" ]; then
|
||||
echo "Testing via ingress: $INGRESS_HOST"
|
||||
BACKEND_URL="https://${INGRESS_HOST}"
|
||||
|
||||
# Test health endpoints
|
||||
echo "Testing liveness endpoint..."
|
||||
curl -f "${BACKEND_URL}/health/live" || exit 1
|
||||
|
||||
echo "Testing readiness endpoint..."
|
||||
curl -f "${BACKEND_URL}/health/ready" || exit 1
|
||||
else
|
||||
echo "No ingress found, testing via port-forward"
|
||||
# Port-forward backend service to localhost:8080
|
||||
kubectl port-forward svc/spywatcher-backend 8080:80 -n spywatcher &
|
||||
PORT_FORWARD_PID=$!
|
||||
|
||||
# Wait for port-forward to be ready
|
||||
sleep 5
|
||||
|
||||
# Test health endpoints
|
||||
echo "Testing liveness endpoint..."
|
||||
curl -f "http://localhost:8080/health/live" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
|
||||
|
||||
echo "Testing readiness endpoint..."
|
||||
curl -f "http://localhost:8080/health/ready" || (kill $PORT_FORWARD_PID 2>/dev/null; exit 1)
|
||||
|
||||
# Kill port-forward process
|
||||
kill $PORT_FORWARD_PID 2>/dev/null
|
||||
fi
|
||||
|
||||
echo "Smoke tests passed!"
|
||||
|
||||
- name: Verify deployment
|
||||
run: |
|
||||
echo "=== Deployment Status ==="
|
||||
kubectl get deployments -n spywatcher
|
||||
kubectl get pods -n spywatcher
|
||||
kubectl get services -n spywatcher
|
||||
|
||||
echo "=== Recent Events ==="
|
||||
kubectl get events -n spywatcher --sort-by='.lastTimestamp' | tail -20
|
||||
|
||||
- name: Rollback on failure
|
||||
if: failure()
|
||||
run: |
|
||||
echo "Deployment failed, rolling back..."
|
||||
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
|
||||
kubectl rollout undo deployment/spywatcher-frontend -n spywatcher
|
||||
|
||||
kubectl rollout status deployment/spywatcher-backend -n spywatcher --timeout=5m
|
||||
kubectl rollout status deployment/spywatcher-frontend -n spywatcher --timeout=5m
|
||||
|
||||
- name: Notify on success
|
||||
if: success()
|
||||
run: |
|
||||
echo "✅ Production deployment successful!"
|
||||
echo "Deployed commit: ${{ github.sha }}"
|
||||
echo "Deployment strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}"
|
||||
|
||||
- name: Notify on failure
|
||||
if: failure()
|
||||
uses: 8398a7/action-slack@v3
|
||||
with:
|
||||
status: failure
|
||||
text: |
|
||||
Production deployment failed!
|
||||
Commit: ${{ github.sha }}
|
||||
Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}
|
||||
webhook_url: ${{ secrets.SLACK_WEBHOOK }}
|
||||
continue-on-error: true
|
||||
|
||||
post-deployment:
|
||||
name: Post-Deployment Tasks
|
||||
runs-on: ubuntu-latest
|
||||
needs: deploy
|
||||
if: success()
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Update kubeconfig
|
||||
run: |
|
||||
aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Clean up old resources
|
||||
run: |
|
||||
# Clean up completed (successful) migration jobs — note: no age filter, all successful jobs are deleted
|
||||
kubectl delete jobs -n spywatcher --field-selector status.successful=1 \
|
||||
--ignore-not-found=true || true
|
||||
|
||||
# Clean up old replica sets
|
||||
kubectl delete replicaset -n spywatcher --field-selector status.replicas=0 \
|
||||
--ignore-not-found=true || true
|
||||
|
||||
- name: Update deployment documentation
|
||||
run: |
|
||||
echo "Deployment completed at $(date)" >> deployment-log.txt
|
||||
echo "Commit: ${{ github.sha }}" >> deployment-log.txt
|
||||
echo "Strategy: ${{ github.event.inputs.deployment_strategy || 'rolling' }}" >> deployment-log.txt
|
||||
echo "---" >> deployment-log.txt
|
||||
413
DEPLOYMENT.md
Normal file
413
DEPLOYMENT.md
Normal file
@@ -0,0 +1,413 @@
|
||||
# Deployment Guide
|
||||
|
||||
This document describes the production deployment strategy for Spywatcher, including infrastructure setup, deployment procedures, and rollback strategies.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Overview](#overview)
|
||||
- [Infrastructure Setup](#infrastructure-setup)
|
||||
- [Deployment Strategies](#deployment-strategies)
|
||||
- [Kubernetes Deployment](#kubernetes-deployment)
|
||||
- [Terraform Infrastructure](#terraform-infrastructure)
|
||||
- [Helm Charts](#helm-charts)
|
||||
- [CI/CD Pipeline](#cicd-pipeline)
|
||||
- [Rollback Procedures](#rollback-procedures)
|
||||
- [Monitoring and Alerts](#monitoring-and-alerts)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
|
||||
## Overview
|
||||
|
||||
Spywatcher uses a multi-strategy deployment approach with:
|
||||
|
||||
- **Infrastructure as Code**: Terraform for AWS infrastructure
|
||||
- **Container Orchestration**: Kubernetes (EKS) for application deployment
|
||||
- **Package Management**: Helm charts for simplified deployments
|
||||
- **Deployment Strategies**: Rolling, Blue-Green, and Canary deployments
|
||||
- **CI/CD**: GitHub Actions for automated deployments
|
||||
|
||||
## Infrastructure Setup
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. AWS Account with appropriate permissions
|
||||
2. AWS CLI configured
|
||||
3. kubectl installed
|
||||
4. Terraform installed (>= 1.5.0)
|
||||
5. Helm installed (>= 3.0)
|
||||
|
||||
### Terraform Infrastructure
|
||||
|
||||
The infrastructure is defined in Terraform modules:
|
||||
|
||||
```bash
|
||||
cd terraform
|
||||
|
||||
# Initialize Terraform
|
||||
terraform init
|
||||
|
||||
# Review the plan
|
||||
terraform plan -var-file="environments/production/terraform.tfvars"
|
||||
|
||||
# Apply infrastructure
|
||||
terraform apply -var-file="environments/production/terraform.tfvars"
|
||||
```
|
||||
|
||||
#### Infrastructure Components
|
||||
|
||||
- **VPC**: Isolated network with public, private, and database subnets across 3 AZs
|
||||
- **EKS Cluster**: Kubernetes cluster with managed node groups
|
||||
- **RDS PostgreSQL**: Managed database with encryption and automated backups
|
||||
- **ElastiCache Redis**: In-memory cache with cluster mode
|
||||
- **Application Load Balancer**: With WAF for security
|
||||
- **Security Groups**: Least-privilege network access
|
||||
- **IAM Roles**: Service accounts and node permissions
|
||||
|
||||
### Configure kubectl
|
||||
|
||||
After infrastructure deployment:
|
||||
|
||||
```bash
|
||||
aws eks update-kubeconfig --name spywatcher-production --region us-east-1
|
||||
kubectl cluster-info
|
||||
```
|
||||
|
||||
## Deployment Strategies
|
||||
|
||||
### Rolling Deployment (Default)
|
||||
|
||||
Updates pods gradually, maintaining service availability.
|
||||
|
||||
```bash
|
||||
# Triggered automatically on push to main branch
|
||||
# Or manually via GitHub Actions UI
|
||||
```
|
||||
|
||||
**Advantages:**
|
||||
- Simple and predictable
|
||||
- Zero downtime
|
||||
- Automatic rollback on failure
|
||||
|
||||
**Disadvantages:**
|
||||
- Gradual rollout may take time
|
||||
- Both versions run simultaneously during update
|
||||
|
||||
### Blue-Green Deployment
|
||||
|
||||
Maintains two identical environments, switching traffic instantly.
|
||||
|
||||
```bash
|
||||
# Via GitHub Actions
|
||||
# Select "blue-green" as deployment strategy
|
||||
|
||||
# Or manually
|
||||
IMAGE_TAG=latest ./scripts/deployment/blue-green-deploy.sh
|
||||
|
||||
# Rollback if needed
|
||||
./scripts/deployment/blue-green-deploy.sh --rollback
|
||||
```
|
||||
|
||||
**Advantages:**
|
||||
- Instant traffic switch
|
||||
- Easy rollback
|
||||
- Full environment testing before switch
|
||||
|
||||
**Disadvantages:**
|
||||
- Requires double resources temporarily
|
||||
- Database migrations must be compatible with both versions
|
||||
|
||||
### Canary Deployment
|
||||
|
||||
Gradually shifts traffic to new version while monitoring metrics.
|
||||
|
||||
```bash
|
||||
# Via GitHub Actions
|
||||
# Select "canary" as deployment strategy
|
||||
|
||||
# Or manually
|
||||
IMAGE_TAG=latest CANARY_STEPS="5 25 50 100" ./scripts/deployment/canary-deploy.sh
|
||||
```
|
||||
|
||||
**Advantages:**
|
||||
- Risk mitigation through gradual rollout
|
||||
- Real-world testing with subset of users
|
||||
- Automated rollback on errors
|
||||
|
||||
**Disadvantages:**
|
||||
- Longer deployment time
|
||||
- Requires robust monitoring
|
||||
|
||||
## Kubernetes Deployment
|
||||
|
||||
### Using Kustomize
|
||||
|
||||
Deploy to different environments:
|
||||
|
||||
```bash
|
||||
# Production
|
||||
kubectl apply -k k8s/overlays/production
|
||||
|
||||
# Staging
|
||||
kubectl apply -k k8s/overlays/staging
|
||||
|
||||
# Development (base)
|
||||
kubectl apply -k k8s/base
|
||||
```
|
||||
|
||||
### Manual Deployment
|
||||
|
||||
```bash
|
||||
# Create namespace
|
||||
kubectl apply -f k8s/base/namespace.yaml
|
||||
|
||||
# Apply configurations
|
||||
kubectl apply -f k8s/base/configmap.yaml
|
||||
kubectl apply -f k8s/base/secrets.yaml
|
||||
|
||||
# Deploy databases
|
||||
kubectl apply -f k8s/base/postgres-statefulset.yaml
|
||||
kubectl apply -f k8s/base/redis-statefulset.yaml
|
||||
|
||||
# Deploy applications
|
||||
kubectl apply -f k8s/base/backend-deployment.yaml
|
||||
kubectl apply -f k8s/base/frontend-deployment.yaml
|
||||
|
||||
# Create services
|
||||
kubectl apply -f k8s/base/backend-service.yaml
|
||||
kubectl apply -f k8s/base/frontend-service.yaml
|
||||
|
||||
# Configure ingress
|
||||
kubectl apply -f k8s/base/ingress.yaml
|
||||
```
|
||||
|
||||
### Scaling
|
||||
|
||||
```bash
|
||||
# Manual scaling
|
||||
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
|
||||
|
||||
# Auto-scaling is configured via HPA
|
||||
kubectl get hpa -n spywatcher
|
||||
```
|
||||
|
||||
## Helm Charts
|
||||
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
# Install with default values
|
||||
helm install spywatcher ./helm/spywatcher -n spywatcher --create-namespace
|
||||
|
||||
# Install with custom values
|
||||
helm install spywatcher ./helm/spywatcher \
|
||||
-n spywatcher \
|
||||
--create-namespace \
|
||||
-f helm/spywatcher/values-production.yaml
|
||||
```
|
||||
|
||||
### Upgrade
|
||||
|
||||
```bash
|
||||
helm upgrade spywatcher ./helm/spywatcher -n spywatcher
|
||||
```
|
||||
|
||||
### Rollback
|
||||
|
||||
```bash
|
||||
# List releases
|
||||
helm history spywatcher -n spywatcher
|
||||
|
||||
# Rollback to previous version
|
||||
helm rollback spywatcher -n spywatcher
|
||||
|
||||
# Rollback to specific revision
|
||||
helm rollback spywatcher 2 -n spywatcher
|
||||
```
|
||||
|
||||
## CI/CD Pipeline
|
||||
|
||||
### GitHub Actions Workflow
|
||||
|
||||
The deployment pipeline is triggered by:
|
||||
|
||||
1. Push to `main` branch (automatic)
|
||||
2. Manual workflow dispatch
|
||||
|
||||
#### Pipeline Steps
|
||||
|
||||
1. **Build and Push**
|
||||
- Build Docker images for backend and frontend
|
||||
- Push to GitHub Container Registry
|
||||
- Tag with commit SHA and latest
|
||||
|
||||
2. **Database Migration**
|
||||
- Run Prisma migrations
|
||||
- Verify migration success
|
||||
|
||||
3. **Deploy**
|
||||
- Apply selected deployment strategy
|
||||
- Update Kubernetes deployments
|
||||
- Monitor rollout status
|
||||
|
||||
4. **Smoke Tests**
|
||||
- Health check endpoints
|
||||
- Basic functionality tests
|
||||
|
||||
5. **Rollback on Failure**
|
||||
- Automatic rollback if deployment fails
|
||||
- Notification to team
|
||||
|
||||
### Required Secrets
|
||||
|
||||
Configure in GitHub repository settings:
|
||||
|
||||
```
|
||||
AWS_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY
|
||||
DATABASE_URL
|
||||
REDIS_URL
|
||||
JWT_SECRET
|
||||
JWT_REFRESH_SECRET
|
||||
DISCORD_BOT_TOKEN
|
||||
DISCORD_CLIENT_ID
|
||||
DISCORD_CLIENT_SECRET
|
||||
SLACK_WEBHOOK (optional)
|
||||
```
|
||||
|
||||
## Rollback Procedures
|
||||
|
||||
### Kubernetes Rollback
|
||||
|
||||
```bash
|
||||
# View rollout history
|
||||
kubectl rollout history deployment/spywatcher-backend -n spywatcher
|
||||
|
||||
# Rollback to previous version
|
||||
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
|
||||
|
||||
# Rollback to specific revision
|
||||
kubectl rollout undo deployment/spywatcher-backend --to-revision=2 -n spywatcher
|
||||
|
||||
# Check rollback status
|
||||
kubectl rollout status deployment/spywatcher-backend -n spywatcher
|
||||
```
|
||||
|
||||
### Blue-Green Rollback
|
||||
|
||||
```bash
|
||||
./scripts/deployment/blue-green-deploy.sh --rollback
|
||||
```
|
||||
|
||||
### Database Rollback
|
||||
|
||||
```bash
|
||||
# If migration needs to be rolled back
|
||||
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- npx prisma migrate resolve --rolled-back <migration_name>
|
||||
```
|
||||
|
||||
## Monitoring and Alerts
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Liveness probe
|
||||
curl https://api.spywatcher.example.com/health/live
|
||||
|
||||
# Readiness probe
|
||||
curl https://api.spywatcher.example.com/health/ready
|
||||
```
|
||||
|
||||
### Kubernetes Monitoring
|
||||
|
||||
```bash
|
||||
# Check pod status
|
||||
kubectl get pods -n spywatcher
|
||||
|
||||
# View pod logs
|
||||
kubectl logs -f deployment/spywatcher-backend -n spywatcher
|
||||
|
||||
# Check events
|
||||
kubectl get events -n spywatcher --sort-by='.lastTimestamp'
|
||||
|
||||
# Resource usage
|
||||
kubectl top pods -n spywatcher
|
||||
kubectl top nodes
|
||||
```
|
||||
|
||||
### CloudWatch Metrics
|
||||
|
||||
Monitor via AWS CloudWatch:
|
||||
- EKS cluster metrics
|
||||
- RDS performance metrics
|
||||
- ElastiCache metrics
|
||||
- ALB request metrics
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Pod Not Starting
|
||||
|
||||
```bash
|
||||
# Describe pod to see events
|
||||
kubectl describe pod <pod-name> -n spywatcher
|
||||
|
||||
# Check logs
|
||||
kubectl logs <pod-name> -n spywatcher
|
||||
|
||||
# Check resource constraints
|
||||
kubectl describe node <node-name>
|
||||
```
|
||||
|
||||
### Database Connection Issues
|
||||
|
||||
```bash
|
||||
# Verify database secret
|
||||
kubectl get secret spywatcher-secrets -n spywatcher -o yaml
|
||||
|
||||
# Test database connection
|
||||
kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -n spywatcher -- \
|
||||
psql -h <rds-endpoint> -U spywatcher -d spywatcher
|
||||
```
|
||||
|
||||
### Traffic Not Routing
|
||||
|
||||
```bash
|
||||
# Check service endpoints
|
||||
kubectl get endpoints -n spywatcher
|
||||
|
||||
# Check ingress
|
||||
kubectl describe ingress spywatcher-ingress -n spywatcher
|
||||
|
||||
# Check ALB target groups
|
||||
aws elbv2 describe-target-health --target-group-arn <arn>
|
||||
```
|
||||
|
||||
### High Resource Usage
|
||||
|
||||
```bash
|
||||
# Check HPA status
|
||||
kubectl get hpa -n spywatcher
|
||||
|
||||
# Scale manually if needed
|
||||
kubectl scale deployment spywatcher-backend --replicas=10 -n spywatcher
|
||||
|
||||
# Check resource limits
|
||||
kubectl describe deployment spywatcher-backend -n spywatcher
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always test in staging first**
|
||||
2. **Run database migrations before deploying code**
|
||||
3. **Use feature flags for risky changes**
|
||||
4. **Monitor error rates during deployment**
|
||||
5. **Keep rollback scripts ready**
|
||||
6. **Document all configuration changes**
|
||||
7. **Regular backup testing**
|
||||
8. **Security patches applied promptly**
|
||||
|
||||
## Support
|
||||
|
||||
For deployment issues:
|
||||
- Check GitHub Actions logs
|
||||
- Review CloudWatch logs
|
||||
- Contact DevOps team
|
||||
- Create incident in issue tracker
|
||||
351
INFRASTRUCTURE.md
Normal file
351
INFRASTRUCTURE.md
Normal file
@@ -0,0 +1,351 @@
|
||||
# Infrastructure Overview
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ AWS Cloud │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────┐ │
|
||||
│ │ VPC (10.0.0.0/16) │ │
|
||||
│ │ │ │
|
||||
│ │ ┌────────────────────────────────────────────────────┐ │ │
|
||||
│ │ │ Application Load Balancer (ALB) │ │ │
|
||||
│ │ │ with WAF Protection │ │ │
|
||||
│ │ └──────────────────┬─────────────────────────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌──────────────────┴────────────────────────┐ │ │
|
||||
│ │ │ EKS Cluster (Kubernetes) │ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ ┌────────────────┐ ┌─────────────────┐ │ │ │
|
||||
│ │ │ │ Backend │ │ Frontend │ │ │ │
|
||||
│ │ │ │ Pods (3) │ │ Pods (2) │ │ │ │
|
||||
│ │ │ │ │ │ │ │ │ │
|
||||
│ │ │ │ - Auto-scaling │ │ - Auto-scaling │ │ │ │
|
||||
│ │ │ │ - Health checks│ │ - Health checks │ │ │ │
|
||||
│ │ │ └────────┬───────┘ └────────┬────────┘ │ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ └───────┬───────────┘ │ │ │
|
||||
│ │ │ │ │ │ │
|
||||
│ │ └───────────────────┼──────────────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌───────────────────┼──────────────────────────────┐ │ │
|
||||
│ │ │ Database Subnets │ │ │ │
|
||||
│ │ │ │ │ │ │
|
||||
│ │ │ ┌────────────────▼────────┐ ┌───────────────┐ │ │ │
|
||||
│ │ │ │ RDS PostgreSQL 15 │ │ ElastiCache │ │ │ │
|
||||
│ │ │ │ │ │ Redis │ │ │ │
|
||||
│ │ │ │ - Multi-AZ │ │ │ │ │ │
|
||||
│ │ │ │ - Encrypted │ │ - Encrypted │ │ │ │
|
||||
│ │ │ │ - Automated Backups │ │ - Failover │ │ │ │
|
||||
│ │ │ └─────────────────────────┘ └───────────────┘ │ │ │
|
||||
│ │ └──────────────────────────────────────────────────┘ │ │
|
||||
│ │ │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────┐ ┌──────────────────┐ │
|
||||
│ │ CloudWatch │ │ Secrets Manager │ │
|
||||
│ │ Monitoring │ │ Credentials │ │
|
||||
│ └──────────────────┘ └──────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ GitHub Actions │
|
||||
│ │
|
||||
│ Build → Test → Deploy → Smoke Tests → Monitor │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
### Compute
|
||||
- **EKS Cluster**: Managed Kubernetes cluster (v1.28)
|
||||
- **Node Groups**: Auto-scaling EC2 instances (t3.large)
|
||||
- **Pods**: Containerized applications with health checks
|
||||
|
||||
### Networking
|
||||
- **VPC**: Isolated network (10.0.0.0/16)
|
||||
- **Subnets**: Public, Private, and Database across 3 AZs
|
||||
- **NAT Gateways**: Internet access for private subnets
|
||||
- **ALB**: HTTPS termination and routing
|
||||
|
||||
### Data Storage
|
||||
- **RDS PostgreSQL**: Managed database (15.3)
|
||||
- Multi-AZ for high availability
|
||||
- Automated backups (7 days retention)
|
||||
- Encryption at rest (KMS)
|
||||
|
||||
- **ElastiCache Redis**: In-memory cache (7.0)
|
||||
- Authentication token
|
||||
- Encryption in transit
|
||||
- Automatic failover
|
||||
|
||||
### Security
|
||||
- **WAF**: Web Application Firewall with rate limiting
|
||||
- **Security Groups**: Network-level access control
|
||||
- **IAM Roles**: Fine-grained permissions
|
||||
- **Secrets Manager**: Secure credential storage
|
||||
- **TLS/SSL**: End-to-end encryption
|
||||
|
||||
### Monitoring
|
||||
- **CloudWatch**: Metrics, logs, and alarms
|
||||
- **Health Checks**: Liveness and readiness probes
|
||||
- **Resource Metrics**: CPU, memory, network usage
|
||||
|
||||
## Resource Sizing
|
||||
|
||||
### Production Environment
|
||||
|
||||
| Component | Type | Specs | Replicas | Scaling |
|
||||
|-----------|------|-------|----------|---------|
|
||||
| Backend | Pod | 512Mi RAM, 500m CPU | 3 | 2-10 |
|
||||
| Frontend | Pod | 128Mi RAM, 100m CPU | 2 | 2-5 |
|
||||
| PostgreSQL | RDS | db.t3.large | 1 (Multi-AZ) | Manual |
|
||||
| Redis | ElastiCache | cache.t3.medium | 2 | Manual |
|
||||
| EKS Nodes | EC2 | t3.large | 3 | 2-10 |
|
||||
|
||||
### Staging Environment
|
||||
|
||||
| Component | Type | Specs | Replicas | Scaling |
|
||||
|-----------|------|-------|----------|---------|
|
||||
| Backend | Pod | 256Mi RAM, 250m CPU | 1 | 1-3 |
|
||||
| Frontend | Pod | 128Mi RAM, 100m CPU | 1 | 1-2 |
|
||||
| PostgreSQL | RDS | db.t3.medium | 1 | N/A |
|
||||
| Redis | ElastiCache | cache.t3.small | 1 | N/A |
|
||||
| EKS Nodes | EC2 | t3.medium | 2 | 1-4 |
|
||||
|
||||
## Cost Estimation
|
||||
|
||||
### Monthly Costs (US East 1)
|
||||
|
||||
#### Production
|
||||
- EKS Cluster: $73
|
||||
- EC2 Nodes (3x t3.large): ~$150
|
||||
- RDS PostgreSQL (db.t3.large, Multi-AZ): ~$290
|
||||
- ElastiCache Redis (cache.t3.medium x2): ~$100
|
||||
- ALB: ~$25
|
||||
- Data Transfer: ~$50
|
||||
- Backups & Monitoring: ~$30
|
||||
|
||||
**Total: ~$718/month**
|
||||
|
||||
#### Staging
|
||||
- EKS Cluster: $73
|
||||
- EC2 Nodes (2x t3.medium): ~$60
|
||||
- RDS PostgreSQL (db.t3.medium): ~$70
|
||||
- ElastiCache Redis (cache.t3.small): ~$25
|
||||
- ALB: ~$25
|
||||
- Data Transfer: ~$20
|
||||
|
||||
**Total: ~$273/month**
|
||||
|
||||
*Note: Costs are estimates and may vary based on usage*
|
||||
|
||||
## Deployment Strategies
|
||||
|
||||
### 1. Rolling Update (Default)
|
||||
- **Use Case**: Standard deployments
|
||||
- **Downtime**: Zero
|
||||
- **Risk**: Low
|
||||
- **Duration**: 5-10 minutes
|
||||
|
||||
### 2. Blue-Green
|
||||
- **Use Case**: Major releases, critical changes
|
||||
- **Downtime**: Zero
|
||||
- **Risk**: Very Low (instant rollback)
|
||||
- **Duration**: 10-15 minutes
|
||||
|
||||
### 3. Canary
|
||||
- **Use Case**: High-risk changes, gradual rollout
|
||||
- **Downtime**: Zero
|
||||
- **Risk**: Minimal (gradual exposure)
|
||||
- **Duration**: 30-60 minutes
|
||||
|
||||
## High Availability
|
||||
|
||||
### Application Layer
|
||||
- Multiple replicas across availability zones
|
||||
- Pod anti-affinity rules
|
||||
- Pod disruption budgets (min 1 available)
|
||||
- Health checks with automatic restart
|
||||
|
||||
### Database Layer
|
||||
- Multi-AZ deployment for RDS
|
||||
- Automated failover (< 60 seconds)
|
||||
- Read replicas for scaling (optional)
|
||||
- Point-in-time recovery
|
||||
|
||||
### Network Layer
|
||||
- Multi-AZ load balancing
|
||||
- Health checks on targets
|
||||
- Automatic target deregistration
|
||||
- DDoS protection (AWS Shield)
|
||||
|
||||
## Disaster Recovery
|
||||
|
||||
### RTO (Recovery Time Objective)
|
||||
- Application: < 5 minutes
|
||||
- Database: < 1 minute (automated failover)
|
||||
- Full Infrastructure: < 30 minutes (Terraform redeploy)
|
||||
|
||||
### RPO (Recovery Point Objective)
|
||||
- Database: < 5 minutes (automated backups)
|
||||
- Application: 0 (stateless, recreatable)
|
||||
|
||||
### Backup Strategy
|
||||
- **Database**: Daily automated backups (7 days retention)
|
||||
- **Configuration**: Git repository (versioned)
|
||||
- **Infrastructure**: Terraform state (versioned in S3)
|
||||
|
||||
## Security Measures
|
||||
|
||||
### Network Security
|
||||
- Private subnets for application and database
|
||||
- Security groups with least-privilege rules
|
||||
- Network ACLs
|
||||
- VPC Flow Logs
|
||||
|
||||
### Application Security
|
||||
- Containers run as non-root
|
||||
- Read-only root filesystems where possible
|
||||
- No privilege escalation
|
||||
- Security scanning in CI/CD
|
||||
|
||||
### Data Security
|
||||
- Encryption at rest (KMS)
|
||||
- Encryption in transit (TLS 1.2+)
|
||||
- Secrets stored in AWS Secrets Manager
|
||||
- Database credentials auto-rotated
|
||||
|
||||
### Access Control
|
||||
- IAM roles with least privilege
|
||||
- RBAC in Kubernetes
|
||||
- MFA for admin access
|
||||
- Audit logging enabled
|
||||
|
||||
## Scaling Strategy
|
||||
|
||||
### Horizontal Scaling
|
||||
- **Triggers**:
|
||||
- CPU > 70%
|
||||
- Memory > 80%
|
||||
- Custom metrics (request rate)
|
||||
|
||||
- **Limits**:
|
||||
- Backend: 2-10 pods
|
||||
- Frontend: 2-5 pods
|
||||
- Nodes: 2-10 instances
|
||||
|
||||
### Vertical Scaling
|
||||
- Database: Manual scaling with downtime
|
||||
- Redis: Manual scaling with failover
|
||||
- Pods: Update resource limits and restart
|
||||
|
||||
## Monitoring Strategy
|
||||
|
||||
### Application Metrics
|
||||
- Request rate and latency
|
||||
- Error rate
|
||||
- Active connections
|
||||
- Cache hit rate
|
||||
|
||||
### Infrastructure Metrics
|
||||
- CPU utilization
|
||||
- Memory utilization
|
||||
- Network throughput
|
||||
- Disk I/O
|
||||
|
||||
### Business Metrics
|
||||
- Active users
|
||||
- API usage per tier
|
||||
- Feature usage
|
||||
- User sessions
|
||||
|
||||
### Alerting
|
||||
- Critical: Page immediately
|
||||
- Service down
|
||||
- Database unavailable
|
||||
- High error rate
|
||||
|
||||
- Warning: Notify during business hours
|
||||
- High CPU/memory
|
||||
- Low disk space
|
||||
- Elevated response time
|
||||
|
||||
## Maintenance Windows
|
||||
|
||||
### Planned Maintenance
|
||||
- **Schedule**: Sundays 02:00-04:00 UTC
|
||||
- **Notification**: 7 days advance notice
|
||||
- **Activities**:
|
||||
- OS patches
|
||||
- Database maintenance
|
||||
- Kubernetes upgrades
|
||||
- SSL certificate renewal
|
||||
|
||||
### Emergency Maintenance
|
||||
- Immediate security patches
|
||||
- Critical bug fixes
|
||||
- Infrastructure failures
|
||||
|
||||
## Compliance & Governance
|
||||
|
||||
### Tagging Strategy
|
||||
All resources tagged with:
|
||||
- `Environment`: production/staging
|
||||
- `Project`: spywatcher
|
||||
- `ManagedBy`: terraform
|
||||
- `CostCenter`: engineering
|
||||
|
||||
### Resource Naming
|
||||
- Pattern: `{project}-{environment}-{resource}`
|
||||
- Example: `spywatcher-production-backend`
|
||||
|
||||
### Access Audit
|
||||
- CloudTrail enabled
|
||||
- Quarterly access review
|
||||
- Regular security audits
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### Useful Commands
|
||||
|
||||
```bash
|
||||
# Check cluster status
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
|
||||
# View application status
|
||||
kubectl get all -n spywatcher
|
||||
|
||||
# View logs
|
||||
kubectl logs -f deployment/spywatcher-backend -n spywatcher
|
||||
|
||||
# Scale application
|
||||
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
|
||||
|
||||
# Rollback deployment
|
||||
kubectl rollout undo deployment/spywatcher-backend -n spywatcher
|
||||
|
||||
# Database backup
|
||||
aws rds create-db-snapshot --db-instance-identifier spywatcher-production --db-snapshot-identifier spywatcher-manual-$(date +%Y%m%d-%H%M%S)
|
||||
|
||||
# View CloudWatch alarms
|
||||
aws cloudwatch describe-alarms --state-value ALARM
|
||||
```
|
||||
|
||||
### Important URLs
|
||||
|
||||
- Production: https://spywatcher.example.com
|
||||
- API: https://api.spywatcher.example.com
|
||||
- Staging: https://staging.spywatcher.example.com
|
||||
- Grafana: https://grafana.spywatcher.example.com
|
||||
- AWS Console: https://console.aws.amazon.com
|
||||
|
||||
### Support Contacts
|
||||
|
||||
- On-Call: oncall@spywatcher.example.com
|
||||
- DevOps: devops@spywatcher.example.com
|
||||
- Security: security@spywatcher.example.com
|
||||
63
README.md
63
README.md
@@ -453,6 +453,69 @@ Git hooks are automatically installed when you run `npm install` in the root dir
|
||||
|
||||
See [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines.
|
||||
|
||||
## 🚀 Production Deployment
|
||||
|
||||
Spywatcher includes comprehensive production deployment infrastructure with Kubernetes, Terraform, and CI/CD automation.
|
||||
|
||||
### Deployment Strategies
|
||||
|
||||
- **Rolling Updates**: Zero-downtime gradual deployment (default)
|
||||
- **Blue-Green**: Instant traffic switching with quick rollback
|
||||
- **Canary**: Gradual rollout with automated error detection
|
||||
|
||||
### Infrastructure as Code
|
||||
|
||||
- **Terraform**: Complete AWS infrastructure modules
|
||||
- VPC with multi-AZ setup
|
||||
- EKS Kubernetes cluster
|
||||
- RDS PostgreSQL (Multi-AZ, encrypted)
|
||||
- ElastiCache Redis (encrypted, failover)
|
||||
- Application Load Balancer with WAF
|
||||
- **Kubernetes**: Production-ready manifests
|
||||
- Auto-scaling with HorizontalPodAutoscaler
|
||||
- Health checks and pod disruption budgets
|
||||
- Security contexts and network policies
|
||||
- **Helm Charts**: Simplified deployment and configuration
|
||||
|
||||
### Quick Deployment
|
||||
|
||||
```bash
|
||||
# Deploy with Terraform
|
||||
cd terraform
|
||||
terraform init
|
||||
terraform apply -var-file="environments/production/terraform.tfvars"
|
||||
|
||||
# Deploy with Kubernetes
|
||||
kubectl apply -k k8s/overlays/production
|
||||
|
||||
# Deploy with Helm
|
||||
helm install spywatcher ./helm/spywatcher -n spywatcher
|
||||
|
||||
# Blue-green deployment
|
||||
./scripts/deployment/blue-green-deploy.sh
|
||||
|
||||
# Canary deployment
|
||||
./scripts/deployment/canary-deploy.sh
|
||||
```
|
||||
|
||||
### Documentation
|
||||
|
||||
- **[DEPLOYMENT.md](./DEPLOYMENT.md)** - Complete deployment guide
|
||||
- **[INFRASTRUCTURE.md](./INFRASTRUCTURE.md)** - Architecture overview
|
||||
- **[terraform/README.md](./terraform/README.md)** - Infrastructure as Code guide
|
||||
- **[k8s/README.md](./k8s/README.md)** - Kubernetes manifests guide
|
||||
|
||||
### CI/CD Pipeline
|
||||
|
||||
GitHub Actions workflows for automated deployment:
|
||||
- Docker image building and pushing to GHCR
|
||||
- Database migrations
|
||||
- Multiple deployment strategy support
|
||||
- Automated smoke tests and health checks
|
||||
- Rollback on failure
|
||||
|
||||
See [.github/workflows/deploy-production.yml](./.github/workflows/deploy-production.yml) for the complete pipeline.
|
||||
|
||||
## 👥 Contributions
|
||||
|
||||
See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines on contributing to this project.
|
||||
|
||||
15
helm/spywatcher/Chart.yaml
Normal file
15
helm/spywatcher/Chart.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: v2
|
||||
name: spywatcher
|
||||
description: A Helm chart for Spywatcher Discord surveillance and analytics application
|
||||
type: application
|
||||
version: 1.0.0
|
||||
appVersion: "1.0.0"
|
||||
keywords:
|
||||
- discord
|
||||
- monitoring
|
||||
- analytics
|
||||
maintainers:
|
||||
- name: Spywatcher Team
|
||||
home: https://github.com/subculture-collective/discord-spywatcher
|
||||
sources:
|
||||
- https://github.com/subculture-collective/discord-spywatcher
|
||||
65
helm/spywatcher/templates/_helpers.tpl
Normal file
65
helm/spywatcher/templates/_helpers.tpl
Normal file
@@ -0,0 +1,65 @@
|
||||
{{/*
|
||||
Expand the name of the chart.
|
||||
*/}}
|
||||
{{- define "spywatcher.name" -}}
|
||||
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Create a default fully qualified app name.
|
||||
*/}}
|
||||
{{- define "spywatcher.fullname" -}}
|
||||
{{- if .Values.fullnameOverride }}
|
||||
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
|
||||
{{- else }}
|
||||
{{- $name := default .Chart.Name .Values.nameOverride }}
|
||||
{{- if contains $name .Release.Name }}
|
||||
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
|
||||
{{- else }}
|
||||
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Create chart name and version as used by the chart label.
|
||||
*/}}
|
||||
{{- define "spywatcher.chart" -}}
|
||||
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Common labels
|
||||
*/}}
|
||||
{{- define "spywatcher.labels" -}}
|
||||
helm.sh/chart: {{ include "spywatcher.chart" . }}
|
||||
{{ include "spywatcher.selectorLabels" . }}
|
||||
{{- if .Chart.AppVersion }}
|
||||
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
|
||||
{{- end }}
|
||||
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Selector labels
|
||||
*/}}
|
||||
{{- define "spywatcher.selectorLabels" -}}
|
||||
app.kubernetes.io/name: {{ include "spywatcher.name" . }}
|
||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Backend labels
|
||||
*/}}
|
||||
{{- define "spywatcher.backend.labels" -}}
|
||||
{{ include "spywatcher.labels" . }}
|
||||
app.kubernetes.io/component: backend
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Frontend labels
|
||||
*/}}
|
||||
{{- define "spywatcher.frontend.labels" -}}
|
||||
{{ include "spywatcher.labels" . }}
|
||||
app.kubernetes.io/component: frontend
|
||||
{{- end }}
|
||||
11
helm/spywatcher/templates/configmap.yaml
Normal file
11
helm/spywatcher/templates/configmap.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "spywatcher.fullname" . }}-config
|
||||
namespace: {{ .Values.namespace }}
|
||||
labels:
|
||||
{{- include "spywatcher.labels" . | nindent 4 }}
|
||||
data:
|
||||
{{- range $key, $value := .Values.configMap.data }}
|
||||
{{ $key }}: {{ $value | quote }}
|
||||
{{- end }}
|
||||
6
helm/spywatcher/templates/namespace.yaml
Normal file
6
helm/spywatcher/templates/namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: {{ .Values.namespace }}
|
||||
labels:
|
||||
{{- include "spywatcher.labels" . | nindent 4 }}
|
||||
20
helm/spywatcher/templates/secrets.yaml
Normal file
20
helm/spywatcher/templates/secrets.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ include "spywatcher.fullname" . }}-secrets
|
||||
namespace: {{ .Values.namespace }}
|
||||
labels:
|
||||
{{- include "spywatcher.labels" . | nindent 4 }}
|
||||
type: Opaque
|
||||
stringData:
|
||||
database-url: {{ .Values.secrets.databaseUrl | quote }}
|
||||
redis-url: {{ .Values.secrets.redisUrl | quote }}
|
||||
jwt-secret: {{ .Values.secrets.jwtSecret | quote }}
|
||||
jwt-refresh-secret: {{ .Values.secrets.jwtRefreshSecret | quote }}
|
||||
discord-bot-token: {{ .Values.secrets.discordBotToken | quote }}
|
||||
discord-client-id: {{ .Values.secrets.discordClientId | quote }}
|
||||
discord-client-secret: {{ .Values.secrets.discordClientSecret | quote }}
|
||||
discord-guild-id: {{ .Values.secrets.discordGuildId | quote }}
|
||||
discord-redirect-uri: {{ .Values.secrets.discordRedirectUri | quote }}
|
||||
admin-discord-ids: {{ .Values.secrets.adminDiscordIds | quote }}
|
||||
bot-guild-ids: {{ .Values.secrets.botGuildIds | quote }}
|
||||
115
helm/spywatcher/values-production.yaml
Normal file
115
helm/spywatcher/values-production.yaml
Normal file
@@ -0,0 +1,115 @@
|
||||
# Production Environment Values
|
||||
# Override default values for production deployment
|
||||
|
||||
global:
|
||||
environment: production
|
||||
|
||||
namespace: spywatcher
|
||||
|
||||
image:
|
||||
backend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-backend
|
||||
tag: latest
|
||||
pullPolicy: Always
|
||||
frontend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-frontend
|
||||
tag: latest
|
||||
pullPolicy: Always
|
||||
|
||||
backend:
|
||||
enabled: true
|
||||
replicaCount: 3
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
autoscaling:
|
||||
enabled: true
|
||||
minReplicas: 3
|
||||
maxReplicas: 10
|
||||
targetCPUUtilizationPercentage: 70
|
||||
targetMemoryUtilizationPercentage: 80
|
||||
|
||||
env:
|
||||
NODE_ENV: production
|
||||
PORT: "3001"
|
||||
LOG_LEVEL: info
|
||||
|
||||
frontend:
|
||||
enabled: true
|
||||
replicaCount: 2
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "500m"
|
||||
|
||||
env:
|
||||
VITE_API_URL: "https://api.spywatcher.example.com"
|
||||
|
||||
# Use managed services instead of in-cluster databases
|
||||
postgresql:
|
||||
enabled: false
|
||||
|
||||
redis:
|
||||
enabled: false
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
className: nginx
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
||||
nginx.ingress.kubernetes.io/rate-limit: "100"
|
||||
|
||||
hosts:
|
||||
- host: spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-frontend
|
||||
port: 80
|
||||
- host: api.spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-backend
|
||||
port: 80
|
||||
|
||||
tls:
|
||||
- secretName: spywatcher-tls-cert
|
||||
hosts:
|
||||
- spywatcher.example.com
|
||||
- api.spywatcher.example.com
|
||||
|
||||
podDisruptionBudget:
|
||||
enabled: true
|
||||
minAvailable: 2
|
||||
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
|
||||
# Production-specific node affinity
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: node.kubernetes.io/instance-type
|
||||
operator: In
|
||||
values:
|
||||
- t3.large
|
||||
- t3a.large
|
||||
104
helm/spywatcher/values-staging.yaml
Normal file
104
helm/spywatcher/values-staging.yaml
Normal file
@@ -0,0 +1,104 @@
|
||||
# Staging Environment Values
|
||||
# Override default values for staging deployment
|
||||
|
||||
global:
|
||||
environment: staging
|
||||
|
||||
namespace: spywatcher-staging
|
||||
|
||||
image:
|
||||
backend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-backend
|
||||
tag: staging
|
||||
pullPolicy: Always
|
||||
frontend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-frontend
|
||||
tag: staging
|
||||
pullPolicy: Always
|
||||
|
||||
backend:
|
||||
enabled: true
|
||||
replicaCount: 1
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
|
||||
autoscaling:
|
||||
enabled: true
|
||||
minReplicas: 1
|
||||
maxReplicas: 3
|
||||
targetCPUUtilizationPercentage: 70
|
||||
targetMemoryUtilizationPercentage: 80
|
||||
|
||||
env:
|
||||
NODE_ENV: staging
|
||||
PORT: "3001"
|
||||
LOG_LEVEL: debug
|
||||
|
||||
frontend:
|
||||
enabled: true
|
||||
replicaCount: 1
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "500m"
|
||||
|
||||
env:
|
||||
VITE_API_URL: "https://api-staging.spywatcher.example.com"
|
||||
|
||||
# Use in-cluster databases for staging
|
||||
postgresql:
|
||||
enabled: true
|
||||
primary:
|
||||
persistence:
|
||||
size: 10Gi
|
||||
|
||||
redis:
|
||||
enabled: true
|
||||
master:
|
||||
persistence:
|
||||
size: 5Gi
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
className: nginx
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-staging"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
|
||||
hosts:
|
||||
- host: staging.spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-frontend
|
||||
port: 80
|
||||
- host: api-staging.spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-backend
|
||||
port: 80
|
||||
|
||||
tls:
|
||||
- secretName: spywatcher-staging-tls-cert
|
||||
hosts:
|
||||
- staging.spywatcher.example.com
|
||||
- api-staging.spywatcher.example.com
|
||||
|
||||
podDisruptionBudget:
|
||||
enabled: false
|
||||
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
212
helm/spywatcher/values.yaml
Normal file
212
helm/spywatcher/values.yaml
Normal file
@@ -0,0 +1,212 @@
|
||||
# Default values for spywatcher
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
# Global settings
|
||||
global:
|
||||
environment: production
|
||||
|
||||
# Namespace
|
||||
namespace: spywatcher
|
||||
|
||||
# Image settings
|
||||
image:
|
||||
backend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-backend
|
||||
tag: latest
|
||||
pullPolicy: Always
|
||||
frontend:
|
||||
repository: ghcr.io/subculture-collective/spywatcher-frontend
|
||||
tag: latest
|
||||
pullPolicy: Always
|
||||
|
||||
imagePullSecrets: []
|
||||
|
||||
# Backend configuration
|
||||
backend:
|
||||
enabled: true
|
||||
replicaCount: 3
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
autoscaling:
|
||||
enabled: true
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
targetCPUUtilizationPercentage: 70
|
||||
targetMemoryUtilizationPercentage: 80
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 80
|
||||
targetPort: 3001
|
||||
|
||||
env:
|
||||
NODE_ENV: production
|
||||
PORT: "3001"
|
||||
LOG_LEVEL: info
|
||||
|
||||
# Health check configuration
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/live
|
||||
port: 3001
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: 3001
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
|
||||
# Frontend configuration
|
||||
frontend:
|
||||
enabled: true
|
||||
replicaCount: 2
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "500m"
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 80
|
||||
targetPort: 80
|
||||
|
||||
env:
|
||||
VITE_API_URL: "https://api.spywatcher.example.com"
|
||||
|
||||
# PostgreSQL configuration
|
||||
postgresql:
|
||||
enabled: true
|
||||
image: postgres:15-alpine
|
||||
|
||||
auth:
|
||||
username: spywatcher
|
||||
database: spywatcher
|
||||
# Password should be set via --set or separate values file
|
||||
existingSecret: postgres-secret
|
||||
secretKeys:
|
||||
adminPasswordKey: password
|
||||
|
||||
primary:
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 20Gi
|
||||
storageClass: ""
|
||||
|
||||
# Redis configuration
|
||||
redis:
|
||||
enabled: true
|
||||
image: redis:7-alpine
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
existingSecret: redis-secret
|
||||
existingSecretPasswordKey: password
|
||||
|
||||
master:
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 10Gi
|
||||
storageClass: ""
|
||||
|
||||
# Ingress configuration
|
||||
ingress:
|
||||
enabled: true
|
||||
className: nginx
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
||||
|
||||
hosts:
|
||||
- host: spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-frontend
|
||||
port: 80
|
||||
- host: api.spywatcher.example.com
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
service: spywatcher-backend
|
||||
port: 80
|
||||
|
||||
tls:
|
||||
- secretName: spywatcher-tls-cert
|
||||
hosts:
|
||||
- spywatcher.example.com
|
||||
- api.spywatcher.example.com
|
||||
|
||||
# ConfigMap data
|
||||
configMap:
|
||||
data:
|
||||
NODE_ENV: "production"
|
||||
PORT: "3001"
|
||||
LOG_LEVEL: "info"
|
||||
RATE_LIMIT_WINDOW_MS: "900000"
|
||||
RATE_LIMIT_MAX_REQUESTS: "100"
|
||||
|
||||
# Secrets (should be provided externally)
|
||||
secrets:
|
||||
# Database
|
||||
databaseUrl: ""
|
||||
# Redis
|
||||
redisUrl: ""
|
||||
# JWT
|
||||
jwtSecret: ""
|
||||
jwtRefreshSecret: ""
|
||||
# Discord
|
||||
discordBotToken: ""
|
||||
discordClientId: ""
|
||||
discordClientSecret: ""
|
||||
discordGuildId: ""
|
||||
discordRedirectUri: ""
|
||||
adminDiscordIds: ""
|
||||
botGuildIds: ""
|
||||
|
||||
# Pod Disruption Budget
|
||||
podDisruptionBudget:
|
||||
enabled: true
|
||||
minAvailable: 1
|
||||
|
||||
# Security context
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
|
||||
# Node affinity and tolerations
|
||||
affinity: {}
|
||||
tolerations: []
|
||||
nodeSelector: {}
|
||||
11
k8s/.gitignore
vendored
Normal file
11
k8s/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
# Ignore secret files that contain sensitive data
|
||||
secrets/
|
||||
*.secret.yaml
|
||||
*-secrets.yaml
|
||||
|
||||
# Ignore generated manifests
|
||||
generated/
|
||||
|
||||
# Ignore local development files
|
||||
*.local.yaml
|
||||
local/
|
||||
377
k8s/README.md
Normal file
377
k8s/README.md
Normal file
@@ -0,0 +1,377 @@
|
||||
# Kubernetes Manifests
|
||||
|
||||
This directory contains Kubernetes manifests for deploying Spywatcher.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
k8s/
|
||||
├── base/ # Base manifests
|
||||
│ ├── namespace.yaml # Namespace and resource quotas
|
||||
│ ├── configmap.yaml # Application configuration
|
||||
│ ├── secrets.yaml # Secrets template (DO NOT commit actual secrets)
|
||||
│ ├── migration-job.yaml # Database migration job
|
||||
│ ├── backend-deployment.yaml
|
||||
│ ├── backend-service.yaml
|
||||
│ ├── backend-hpa.yaml # Horizontal Pod Autoscaler
|
||||
│ ├── frontend-deployment.yaml
|
||||
│ ├── frontend-service.yaml
|
||||
│ ├── postgres-statefulset.yaml
|
||||
│ ├── redis-statefulset.yaml
|
||||
│ ├── ingress.yaml
|
||||
│ ├── pdb.yaml # Pod Disruption Budget
|
||||
│ └── kustomization.yaml
|
||||
├── overlays/ # Environment-specific overlays
|
||||
│ ├── production/
|
||||
│ └── staging/
|
||||
└── secrets/ # Actual secrets (gitignored)
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- kubectl configured with cluster access
|
||||
- kustomize (built into kubectl >= 1.14)
|
||||
|
||||
### Deploy to Production
|
||||
|
||||
```bash
|
||||
# Review what will be deployed
|
||||
kubectl kustomize k8s/overlays/production
|
||||
|
||||
# Apply manifests
|
||||
kubectl apply -k k8s/overlays/production
|
||||
|
||||
# Check deployment status
|
||||
kubectl get all -n spywatcher
|
||||
```
|
||||
|
||||
### Deploy to Staging
|
||||
|
||||
```bash
|
||||
kubectl apply -k k8s/overlays/staging
|
||||
kubectl get all -n spywatcher-staging
|
||||
```
|
||||
|
||||
## Configuration Management
|
||||
|
||||
### Secrets
|
||||
|
||||
**IMPORTANT**: Never commit actual secrets to git!
|
||||
|
||||
1. Copy the secrets template:
|
||||
```bash
|
||||
cp k8s/base/secrets.yaml k8s/secrets/secrets.yaml
|
||||
```
|
||||
|
||||
2. Edit with actual values:
|
||||
```bash
|
||||
vim k8s/secrets/secrets.yaml
|
||||
```
|
||||
|
||||
3. Apply separately:
|
||||
```bash
|
||||
kubectl apply -f k8s/secrets/secrets.yaml
|
||||
```
|
||||
|
||||
### ConfigMap
|
||||
|
||||
Application configuration is in `k8s/base/configmap.yaml`. Environment-specific values can be patched in overlays.
|
||||
|
||||
## Deployment Strategies
|
||||
|
||||
### Rolling Update (Default)
|
||||
|
||||
```bash
|
||||
# Update image
|
||||
kubectl set image deployment/spywatcher-backend \
|
||||
backend=ghcr.io/subculture-collective/spywatcher-backend:v2.0.0 \
|
||||
-n spywatcher
|
||||
|
||||
# Watch rollout
|
||||
kubectl rollout status deployment/spywatcher-backend -n spywatcher
|
||||
```
|
||||
|
||||
### Blue-Green Deployment
|
||||
|
||||
Use the provided script:
|
||||
```bash
|
||||
./scripts/deployment/blue-green-deploy.sh
|
||||
```
|
||||
|
||||
### Canary Deployment
|
||||
|
||||
Use the provided script:
|
||||
```bash
|
||||
./scripts/deployment/canary-deploy.sh
|
||||
```
|
||||
|
||||
## Scaling
|
||||
|
||||
### Manual Scaling
|
||||
|
||||
```bash
|
||||
# Scale backend
|
||||
kubectl scale deployment spywatcher-backend --replicas=5 -n spywatcher
|
||||
|
||||
# Scale frontend
|
||||
kubectl scale deployment spywatcher-frontend --replicas=3 -n spywatcher
|
||||
```
|
||||
|
||||
### Auto-scaling
|
||||
|
||||
HorizontalPodAutoscaler is configured to scale based on:
|
||||
- CPU utilization (target: 70%)
|
||||
- Memory utilization (target: 80%)
|
||||
|
||||
```bash
|
||||
# Check HPA status
|
||||
kubectl get hpa -n spywatcher
|
||||
|
||||
# Describe HPA
|
||||
kubectl describe hpa spywatcher-backend-hpa -n spywatcher
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Check Pod Status
|
||||
|
||||
```bash
|
||||
# List all pods
|
||||
kubectl get pods -n spywatcher
|
||||
|
||||
# Describe pod
|
||||
kubectl describe pod <pod-name> -n spywatcher
|
||||
|
||||
# View logs
|
||||
kubectl logs -f <pod-name> -n spywatcher
|
||||
|
||||
# View logs from all replicas
|
||||
kubectl logs -f deployment/spywatcher-backend -n spywatcher
|
||||
```
|
||||
|
||||
### Health Checks
|
||||
|
||||
```bash
|
||||
# Test liveness probe
|
||||
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
|
||||
wget -qO- http://localhost:3001/health/live
|
||||
|
||||
# Test readiness probe
|
||||
kubectl exec -it deployment/spywatcher-backend -n spywatcher -- \
|
||||
wget -qO- http://localhost:3001/health/ready
|
||||
```
|
||||
|
||||
### Resource Usage
|
||||
|
||||
```bash
|
||||
# Pod resource usage
|
||||
kubectl top pods -n spywatcher
|
||||
|
||||
# Node resource usage
|
||||
kubectl top nodes
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Pod Not Starting
|
||||
|
||||
```bash
|
||||
# Check events
|
||||
kubectl get events -n spywatcher --sort-by='.lastTimestamp'
|
||||
|
||||
# Describe pod
|
||||
kubectl describe pod <pod-name> -n spywatcher
|
||||
|
||||
# Check logs
|
||||
kubectl logs <pod-name> -n spywatcher --previous # Previous container
|
||||
```
|
||||
|
||||
### Network Issues
|
||||
|
||||
```bash
|
||||
# Check services
|
||||
kubectl get services -n spywatcher
|
||||
|
||||
# Check endpoints
|
||||
kubectl get endpoints -n spywatcher
|
||||
|
||||
# Test service from within cluster
|
||||
kubectl run -it --rm debug --image=busybox --restart=Never -n spywatcher -- \
|
||||
wget -qO- http://spywatcher-backend/health/live
|
||||
```
|
||||
|
||||
### Database Connection
|
||||
|
||||
```bash
|
||||
# Check database pod
|
||||
kubectl get pods -n spywatcher | grep postgres
|
||||
|
||||
# Test database connection
|
||||
kubectl exec -it postgres-0 -n spywatcher -- \
|
||||
psql -U spywatcher -d spywatcher -c "SELECT version();"
|
||||
|
||||
# Check database logs
|
||||
kubectl logs postgres-0 -n spywatcher
|
||||
```
|
||||
|
||||
### Redis Connection
|
||||
|
||||
```bash
|
||||
# Check Redis pod
|
||||
kubectl get pods -n spywatcher | grep redis
|
||||
|
||||
# Test Redis connection
|
||||
kubectl exec -it redis-0 -n spywatcher -- redis-cli ping
|
||||
|
||||
# Check Redis logs
|
||||
kubectl logs redis-0 -n spywatcher
|
||||
```
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Update Configuration
|
||||
|
||||
```bash
|
||||
# Edit configmap
|
||||
kubectl edit configmap spywatcher-config -n spywatcher
|
||||
|
||||
# Restart pods to pick up changes
|
||||
kubectl rollout restart deployment/spywatcher-backend -n spywatcher
|
||||
```
|
||||
|
||||
### Database Migrations
|
||||
|
||||
Database migrations are run as a separate Kubernetes Job to avoid race conditions.
|
||||
Migrations should be run before deploying new application versions.
|
||||
|
||||
```bash
|
||||
# Create a unique migration job
|
||||
JOB_NAME="db-migration-$(date +%s)"
|
||||
kubectl create job $JOB_NAME --from=job/spywatcher-db-migration -n spywatcher
|
||||
|
||||
# Or apply the migration job directly (it will run once)
|
||||
kubectl apply -f k8s/base/migration-job.yaml
|
||||
|
||||
# Check migration status
|
||||
kubectl get jobs -n spywatcher
|
||||
|
||||
# View migration logs
|
||||
kubectl logs job/$JOB_NAME -n spywatcher
|
||||
|
||||
# Delete completed migration jobs (optional, they auto-delete after 1 hour)
|
||||
kubectl delete job $JOB_NAME -n spywatcher
|
||||
```
|
||||
|
||||
**Important:** The migration job uses `completions: 1` and `parallelism: 1` to ensure
|
||||
only one migration runs at a time, preventing race conditions and deadlocks.
|
||||
|
||||
### Backup
|
||||
|
||||
```bash
|
||||
# Backup PostgreSQL
|
||||
kubectl exec postgres-0 -n spywatcher -- \
|
||||
pg_dump -U spywatcher spywatcher > backup.sql
|
||||
|
||||
# Backup Redis
|
||||
kubectl exec redis-0 -n spywatcher -- \
|
||||
redis-cli BGSAVE
|
||||
```
|
||||
|
||||
## Security
|
||||
|
||||
### Network Policies
|
||||
|
||||
Network policies restrict traffic between pods:
|
||||
- Backend can connect to: PostgreSQL, Redis
|
||||
- Frontend can connect to: Backend
|
||||
- External traffic: Ingress only
|
||||
|
||||
### RBAC
|
||||
|
||||
Service accounts with minimal permissions:
|
||||
- `spywatcher-backend`: Access to secrets, configmaps
|
||||
- `spywatcher-frontend`: Read-only access
|
||||
|
||||
### Secrets
|
||||
|
||||
- Use Sealed Secrets or External Secrets Operator for production
|
||||
- Never commit unencrypted secrets
|
||||
- Rotate secrets regularly
|
||||
|
||||
## Ingress
|
||||
|
||||
### NGINX Ingress Controller
|
||||
|
||||
Install if not already present:
|
||||
```bash
|
||||
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
|
||||
helm install nginx-ingress ingress-nginx/ingress-nginx
|
||||
```
|
||||
|
||||
### Cert-Manager
|
||||
|
||||
Install for automatic SSL certificates:
|
||||
```bash
|
||||
helm repo add jetstack https://charts.jetstack.io
|
||||
helm install cert-manager jetstack/cert-manager \
|
||||
--namespace cert-manager \
|
||||
--create-namespace \
|
||||
--set installCRDs=true
|
||||
```
|
||||
|
||||
Create ClusterIssuer:
|
||||
```yaml
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: ClusterIssuer
|
||||
metadata:
|
||||
name: letsencrypt-prod
|
||||
spec:
|
||||
acme:
|
||||
server: https://acme-v02.api.letsencrypt.org/directory
|
||||
email: your-email@example.com
|
||||
privateKeySecretRef:
|
||||
name: letsencrypt-prod
|
||||
solvers:
|
||||
- http01:
|
||||
ingress:
|
||||
class: nginx
|
||||
```
|
||||
|
||||
## Clean Up
|
||||
|
||||
### Delete Resources
|
||||
|
||||
```bash
|
||||
# Delete all resources in namespace
|
||||
kubectl delete namespace spywatcher
|
||||
|
||||
# Or use kustomize
|
||||
kubectl delete -k k8s/overlays/production
|
||||
```
|
||||
|
||||
### Persistent Data
|
||||
|
||||
⚠️ **WARNING**: Deleting PVCs will delete all data!
|
||||
|
||||
```bash
|
||||
# List PVCs
|
||||
kubectl get pvc -n spywatcher
|
||||
|
||||
# Delete specific PVC
|
||||
kubectl delete pvc postgres-data-postgres-0 -n spywatcher
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use namespaces**: Separate environments with namespaces
|
||||
2. **Resource limits**: Always set requests and limits
|
||||
3. **Health checks**: Configure liveness and readiness probes
|
||||
4. **Security context**: Run containers as non-root
|
||||
5. **Pod disruption budgets**: Ensure high availability
|
||||
6. **Horizontal scaling**: Use HPA for dynamic scaling
|
||||
7. **Rolling updates**: Use for zero-downtime deployments
|
||||
8. **Monitoring**: Integrate with Prometheus/Grafana
|
||||
9. **Logging**: Centralize logs with ELK or Loki
|
||||
10. **Backups**: Regular backups of persistent data
|
||||
193
k8s/base/backend-deployment.yaml
Normal file
193
k8s/base/backend-deployment.yaml
Normal file
@@ -0,0 +1,193 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-backend
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
version: v1
|
||||
spec:
|
||||
replicas: 3
|
||||
revisionHistoryLimit: 10
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 1
|
||||
maxUnavailable: 0
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
version: v1
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "3001"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
# Security context for pod
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
|
||||
# Anti-affinity to spread pods across nodes
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- spywatcher
|
||||
- key: tier
|
||||
operator: In
|
||||
values:
|
||||
- backend
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
||||
containers:
|
||||
- name: backend
|
||||
image: ghcr.io/subculture-collective/spywatcher-backend:latest
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3001
|
||||
protocol: TCP
|
||||
|
||||
# Environment variables from ConfigMap
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: spywatcher-config
|
||||
|
||||
# Environment variables from Secrets
|
||||
env:
|
||||
- name: DATABASE_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: database-url
|
||||
- name: REDIS_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: redis-url
|
||||
- name: JWT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: jwt-secret
|
||||
- name: JWT_REFRESH_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: jwt-refresh-secret
|
||||
- name: DISCORD_BOT_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-bot-token
|
||||
- name: DISCORD_CLIENT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-client-id
|
||||
- name: DISCORD_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-client-secret
|
||||
- name: DISCORD_GUILD_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-guild-id
|
||||
- name: DISCORD_REDIRECT_URI
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-redirect-uri
|
||||
- name: ADMIN_DISCORD_IDS
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: admin-discord-ids
|
||||
- name: BOT_GUILD_IDS
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: bot-guild-ids
|
||||
|
||||
# Resource limits and requests
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
# Liveness probe - checks if container is alive
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/live
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
|
||||
# Readiness probe - checks if container is ready to serve traffic
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
|
||||
# Startup probe - allows slow starting containers more time
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /health/live
|
||||
port: http
|
||||
initialDelaySeconds: 0
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
successThreshold: 1
|
||||
failureThreshold: 30
|
||||
|
||||
# Security context for container
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: false
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
|
||||
# Volume mounts
|
||||
volumeMounts:
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
|
||||
# Volumes
|
||||
volumes:
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
|
||||
# Image pull secrets if using private registry
|
||||
# imagePullSecrets:
|
||||
# - name: ghcr-secret
|
||||
49
k8s/base/backend-hpa.yaml
Normal file
49
k8s/base/backend-hpa.yaml
Normal file
@@ -0,0 +1,49 @@
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: spywatcher-backend-hpa
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: spywatcher-backend
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
behavior:
|
||||
scaleDown:
|
||||
stabilizationWindowSeconds: 300
|
||||
policies:
|
||||
- type: Percent
|
||||
value: 50
|
||||
periodSeconds: 60
|
||||
- type: Pods
|
||||
value: 1
|
||||
periodSeconds: 60
|
||||
selectPolicy: Min
|
||||
scaleUp:
|
||||
stabilizationWindowSeconds: 0
|
||||
policies:
|
||||
- type: Percent
|
||||
value: 100
|
||||
periodSeconds: 30
|
||||
- type: Pods
|
||||
value: 2
|
||||
periodSeconds: 30
|
||||
selectPolicy: Max
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
- type: Resource
|
||||
resource:
|
||||
name: memory
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 80
|
||||
24
k8s/base/backend-service.yaml
Normal file
24
k8s/base/backend-service.yaml
Normal file
@@ -0,0 +1,24 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: spywatcher-backend
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
sessionAffinity: ClientIP
|
||||
sessionAffinityConfig:
|
||||
clientIP:
|
||||
timeoutSeconds: 10800
|
||||
selector:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
29
k8s/base/configmap.yaml
Normal file
29
k8s/base/configmap.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: spywatcher-config
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
data:
|
||||
# Application settings
|
||||
NODE_ENV: "production"
|
||||
PORT: "3001"
|
||||
|
||||
# Logging settings
|
||||
LOG_LEVEL: "info"
|
||||
|
||||
# Rate limiting settings
|
||||
RATE_LIMIT_WINDOW_MS: "900000"
|
||||
RATE_LIMIT_MAX_REQUESTS: "100"
|
||||
|
||||
# Health check settings
|
||||
HEALTH_CHECK_INTERVAL: "30"
|
||||
|
||||
# Database pool settings
|
||||
DB_POOL_MIN: "2"
|
||||
DB_POOL_MAX: "10"
|
||||
|
||||
# Redis settings
|
||||
REDIS_MAX_RETRIES: "3"
|
||||
REDIS_RETRY_DELAY: "1000"
|
||||
124
k8s/base/frontend-deployment.yaml
Normal file
124
k8s/base/frontend-deployment.yaml
Normal file
@@ -0,0 +1,124 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-frontend
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
version: v1
|
||||
spec:
|
||||
replicas: 2
|
||||
revisionHistoryLimit: 10
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 1
|
||||
maxUnavailable: 0
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
version: v1
|
||||
spec:
|
||||
# Security context for pod
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
|
||||
# Anti-affinity to spread pods across nodes
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- spywatcher
|
||||
- key: tier
|
||||
operator: In
|
||||
values:
|
||||
- frontend
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
||||
containers:
|
||||
- name: frontend
|
||||
image: ghcr.io/subculture-collective/spywatcher-frontend:latest
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 80
|
||||
protocol: TCP
|
||||
|
||||
env:
|
||||
- name: VITE_API_URL
|
||||
value: "https://api.spywatcher.example.com"
|
||||
- name: VITE_DISCORD_CLIENT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: discord-client-id
|
||||
|
||||
# Resource limits and requests
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "500m"
|
||||
|
||||
# Liveness probe
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
|
||||
# Readiness probe
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
|
||||
# Security context for container
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
|
||||
# Volume mounts for nginx cache and temp files
|
||||
volumeMounts:
|
||||
- name: cache
|
||||
mountPath: /var/cache/nginx
|
||||
- name: run
|
||||
mountPath: /var/run
|
||||
|
||||
# Volumes
|
||||
volumes:
|
||||
- name: cache
|
||||
emptyDir: {}
|
||||
- name: run
|
||||
emptyDir: {}
|
||||
18
k8s/base/frontend-service.yaml
Normal file
18
k8s/base/frontend-service.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: spywatcher-frontend
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
76
k8s/base/ingress.yaml
Normal file
76
k8s/base/ingress.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: spywatcher-ingress
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
annotations:
|
||||
# SSL/TLS configuration
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
|
||||
# AWS ALB annotations (if using AWS)
|
||||
alb.ingress.kubernetes.io/scheme: internet-facing
|
||||
alb.ingress.kubernetes.io/target-type: ip
|
||||
alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=60
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /health/live
|
||||
alb.ingress.kubernetes.io/healthcheck-interval-seconds: "30"
|
||||
alb.ingress.kubernetes.io/healthcheck-timeout-seconds: "5"
|
||||
alb.ingress.kubernetes.io/healthy-threshold-count: "2"
|
||||
alb.ingress.kubernetes.io/unhealthy-threshold-count: "3"
|
||||
|
||||
# NGINX Ingress annotations (if using NGINX)
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
||||
nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
|
||||
|
||||
# WebSocket support
|
||||
nginx.ingress.kubernetes.io/websocket-services: spywatcher-backend
|
||||
nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
|
||||
nginx.ingress.kubernetes.io/configuration-snippet: |
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
|
||||
# Security headers
|
||||
nginx.ingress.kubernetes.io/server-snippet: |
|
||||
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
|
||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-XSS-Protection "1; mode=block" always;
|
||||
|
||||
# Rate limiting
|
||||
nginx.ingress.kubernetes.io/limit-rps: "100"
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
tls:
|
||||
- hosts:
|
||||
- spywatcher.example.com
|
||||
- api.spywatcher.example.com
|
||||
secretName: spywatcher-tls-cert
|
||||
rules:
|
||||
# Frontend
|
||||
- host: spywatcher.example.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: spywatcher-frontend
|
||||
port:
|
||||
number: 80
|
||||
|
||||
# Backend API
|
||||
- host: api.spywatcher.example.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: spywatcher-backend
|
||||
port:
|
||||
number: 80
|
||||
23
k8s/base/kustomization.yaml
Normal file
23
k8s/base/kustomization.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: spywatcher
|
||||
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- configmap.yaml
|
||||
- secrets.yaml
|
||||
- postgres-statefulset.yaml
|
||||
- redis-statefulset.yaml
|
||||
- migration-job.yaml
|
||||
- backend-deployment.yaml
|
||||
- backend-service.yaml
|
||||
- backend-hpa.yaml
|
||||
- frontend-deployment.yaml
|
||||
- frontend-service.yaml
|
||||
- ingress.yaml
|
||||
- pdb.yaml
|
||||
|
||||
commonLabels:
|
||||
app.kubernetes.io/name: spywatcher
|
||||
app.kubernetes.io/managed-by: kustomize
|
||||
65
k8s/base/migration-job.yaml
Normal file
65
k8s/base/migration-job.yaml
Normal file
@@ -0,0 +1,65 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: spywatcher-db-migration
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
component: migration
|
||||
spec:
|
||||
# Only keep successful jobs for 1 hour
|
||||
ttlSecondsAfterFinished: 3600
|
||||
# Prevent concurrent migrations
|
||||
completions: 1
|
||||
parallelism: 1
|
||||
backoffLimit: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: spywatcher
|
||||
component: migration
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
# Security context for pod
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
fsGroup: 1001
|
||||
|
||||
containers:
|
||||
- name: migrate
|
||||
image: ghcr.io/subculture-collective/spywatcher-backend:latest
|
||||
imagePullPolicy: Always
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
echo "Starting database migration..."
|
||||
npx prisma migrate deploy
|
||||
echo "Migration completed successfully"
|
||||
|
||||
env:
|
||||
- name: DATABASE_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: spywatcher-secrets
|
||||
key: database-url
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
|
||||
# Security context for container
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: false
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
35
k8s/base/namespace.yaml
Normal file
35
k8s/base/namespace.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: spywatcher
|
||||
labels:
|
||||
name: spywatcher
|
||||
environment: production
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: spywatcher-quota
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
hard:
|
||||
requests.cpu: "20"
|
||||
requests.memory: 40Gi
|
||||
limits.cpu: "40"
|
||||
limits.memory: 80Gi
|
||||
persistentvolumeclaims: "10"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: LimitRange
|
||||
metadata:
|
||||
name: spywatcher-limit-range
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
limits:
|
||||
- default:
|
||||
cpu: "1"
|
||||
memory: 1Gi
|
||||
defaultRequest:
|
||||
cpu: "500m"
|
||||
memory: 512Mi
|
||||
type: Container
|
||||
29
k8s/base/pdb.yaml
Normal file
29
k8s/base/pdb.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: spywatcher-backend-pdb
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
spec:
|
||||
minAvailable: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: backend
|
||||
---
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: spywatcher-frontend-pdb
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
spec:
|
||||
minAvailable: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: frontend
|
||||
113
k8s/base/postgres-statefulset.yaml
Normal file
113
k8s/base/postgres-statefulset.yaml
Normal file
@@ -0,0 +1,113 @@
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: postgres
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
spec:
|
||||
serviceName: postgres
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
spec:
|
||||
containers:
|
||||
- name: postgres
|
||||
image: postgres:15-alpine
|
||||
ports:
|
||||
- name: postgres
|
||||
containerPort: 5432
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: POSTGRES_DB
|
||||
value: spywatcher
|
||||
- name: POSTGRES_USER
|
||||
value: spywatcher
|
||||
- name: POSTGRES_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: postgres-secret
|
||||
key: password
|
||||
- name: PGDATA
|
||||
value: /var/lib/postgresql/data/pgdata
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
volumeMounts:
|
||||
- name: postgres-data
|
||||
mountPath: /var/lib/postgresql/data
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- pg_isready -U spywatcher
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- pg_isready -U spywatcher
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: postgres-data
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
spec:
|
||||
accessModes: [ "ReadWriteOnce" ]
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: postgres
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
spec:
|
||||
type: ClusterIP
|
||||
clusterIP: None
|
||||
selector:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
ports:
|
||||
- name: postgres
|
||||
port: 5432
|
||||
targetPort: postgres
|
||||
protocol: TCP
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: postgres-secret
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: database
|
||||
type: Opaque
|
||||
stringData:
|
||||
password: "CHANGE_ME_IN_PRODUCTION"
|
||||
117
k8s/base/redis-statefulset.yaml
Normal file
117
k8s/base/redis-statefulset.yaml
Normal file
@@ -0,0 +1,117 @@
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: redis
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
spec:
|
||||
serviceName: redis
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
spec:
|
||||
containers:
|
||||
- name: redis
|
||||
image: redis:7-alpine
|
||||
command:
|
||||
- redis-server
|
||||
- --appendonly
|
||||
- "yes"
|
||||
- --requirepass
|
||||
- $(REDIS_PASSWORD)
|
||||
ports:
|
||||
- name: redis
|
||||
containerPort: 6379
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: redis-secret
|
||||
key: password
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
volumeMounts:
|
||||
- name: redis-data
|
||||
mountPath: /data
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- --no-auth-warning
|
||||
- -a
|
||||
- $(REDIS_PASSWORD)
|
||||
- ping
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- --no-auth-warning
|
||||
- -a
|
||||
- $(REDIS_PASSWORD)
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: redis-data
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
spec:
|
||||
accessModes: [ "ReadWriteOnce" ]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: redis
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
spec:
|
||||
type: ClusterIP
|
||||
clusterIP: None
|
||||
selector:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
ports:
|
||||
- name: redis
|
||||
port: 6379
|
||||
targetPort: redis
|
||||
protocol: TCP
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: redis-secret
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
tier: cache
|
||||
type: Opaque
|
||||
stringData:
|
||||
password: "CHANGE_ME_IN_PRODUCTION"
|
||||
32
k8s/base/secrets.yaml
Normal file
32
k8s/base/secrets.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
# This is a template file for secrets
|
||||
# In production, use sealed-secrets, external-secrets, or your cloud provider's secret management
|
||||
# DO NOT commit actual secrets to git
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: spywatcher-secrets
|
||||
namespace: spywatcher
|
||||
labels:
|
||||
app: spywatcher
|
||||
type: Opaque
|
||||
stringData:
|
||||
# Database connection
|
||||
database-url: "postgresql://user:password@postgres-service:5432/spywatcher"
|
||||
|
||||
# Redis connection
|
||||
redis-url: "redis://redis-service:6379"
|
||||
|
||||
# JWT secrets
|
||||
jwt-secret: "CHANGE_ME_IN_PRODUCTION"
|
||||
jwt-refresh-secret: "CHANGE_ME_IN_PRODUCTION"
|
||||
|
||||
# Discord credentials
|
||||
discord-bot-token: "CHANGE_ME_IN_PRODUCTION"
|
||||
discord-client-id: "CHANGE_ME_IN_PRODUCTION"
|
||||
discord-client-secret: "CHANGE_ME_IN_PRODUCTION"
|
||||
discord-guild-id: "CHANGE_ME_IN_PRODUCTION"
|
||||
discord-redirect-uri: "https://spywatcher.example.com/auth/callback"
|
||||
|
||||
# Admin settings
|
||||
admin-discord-ids: ""
|
||||
bot-guild-ids: ""
|
||||
29
k8s/overlays/production/kustomization.yaml
Normal file
29
k8s/overlays/production/kustomization.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: spywatcher
|
||||
|
||||
bases:
|
||||
- ../../base
|
||||
|
||||
namePrefix: prod-
|
||||
|
||||
commonLabels:
|
||||
environment: production
|
||||
|
||||
patchesStrategicMerge:
|
||||
- replicas-patch.yaml
|
||||
- resources-patch.yaml
|
||||
|
||||
configMapGenerator:
|
||||
- name: spywatcher-config
|
||||
behavior: merge
|
||||
literals:
|
||||
- NODE_ENV=production
|
||||
- LOG_LEVEL=info
|
||||
|
||||
images:
|
||||
- name: ghcr.io/subculture-collective/spywatcher-backend
|
||||
newTag: latest
|
||||
- name: ghcr.io/subculture-collective/spywatcher-frontend
|
||||
newTag: latest
|
||||
15
k8s/overlays/production/replicas-patch.yaml
Normal file
15
k8s/overlays/production/replicas-patch.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-backend
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
replicas: 3
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-frontend
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
replicas: 2
|
||||
35
k8s/overlays/production/resources-patch.yaml
Normal file
35
k8s/overlays/production/resources-patch.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-backend
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: backend
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-frontend
|
||||
namespace: spywatcher
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: frontend
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "500m"
|
||||
28
k8s/overlays/staging/kustomization.yaml
Normal file
28
k8s/overlays/staging/kustomization.yaml
Normal file
@@ -0,0 +1,28 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: spywatcher-staging
|
||||
|
||||
bases:
|
||||
- ../../base
|
||||
|
||||
namePrefix: staging-
|
||||
|
||||
commonLabels:
|
||||
environment: staging
|
||||
|
||||
patchesStrategicMerge:
|
||||
- replicas-patch.yaml
|
||||
|
||||
configMapGenerator:
|
||||
- name: spywatcher-config
|
||||
behavior: merge
|
||||
literals:
|
||||
- NODE_ENV=staging
|
||||
- LOG_LEVEL=debug
|
||||
|
||||
images:
|
||||
- name: ghcr.io/subculture-collective/spywatcher-backend
|
||||
newTag: staging
|
||||
- name: ghcr.io/subculture-collective/spywatcher-frontend
|
||||
newTag: staging
|
||||
15
k8s/overlays/staging/replicas-patch.yaml
Normal file
15
k8s/overlays/staging/replicas-patch.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-backend
|
||||
namespace: spywatcher-staging
|
||||
spec:
|
||||
replicas: 1
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: spywatcher-frontend
|
||||
namespace: spywatcher-staging
|
||||
spec:
|
||||
replicas: 1
|
||||
198
scripts/deployment/blue-green-deploy.sh
Executable file
198
scripts/deployment/blue-green-deploy.sh
Executable file
@@ -0,0 +1,198 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Blue-Green Deployment Script for Spywatcher
|
||||
# This script performs zero-downtime deployments by maintaining two identical environments
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Configuration
|
||||
NAMESPACE="${NAMESPACE:-spywatcher}"
|
||||
APP_NAME="${APP_NAME:-spywatcher-backend}"
|
||||
IMAGE_TAG="${IMAGE_TAG:-latest}"
|
||||
HEALTH_CHECK_PATH="${HEALTH_CHECK_PATH:-/health/ready}"
|
||||
HEALTH_CHECK_RETRIES="${HEALTH_CHECK_RETRIES:-10}"
|
||||
HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-10}"
|
||||
|
||||
# Function to print colored output
|
||||
print_info() {
|
||||
echo -e "${GREEN}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# Function to check if a deployment exists
|
||||
deployment_exists() {
|
||||
kubectl get deployment "$1" -n "$NAMESPACE" &> /dev/null
|
||||
}
|
||||
|
||||
# Function to get current active environment
|
||||
get_active_environment() {
|
||||
local service_selector=$(kubectl get service "$APP_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.selector.version}')
|
||||
echo "$service_selector"
|
||||
}
|
||||
|
||||
# Function to perform health check
|
||||
health_check() {
|
||||
local deployment=$1
|
||||
local retries=$HEALTH_CHECK_RETRIES
|
||||
|
||||
print_info "Performing health check on $deployment..."
|
||||
|
||||
while [ $retries -gt 0 ]; do
|
||||
# Get pod name
|
||||
local pod=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=$deployment -o jsonpath='{.items[0].metadata.name}')
|
||||
|
||||
if [ -z "$pod" ]; then
|
||||
print_warning "No pod found for $deployment, retrying..."
|
||||
sleep $HEALTH_CHECK_INTERVAL
|
||||
retries=$((retries - 1))
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if pod is running
|
||||
local pod_status=$(kubectl get pod "$pod" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
|
||||
if [ "$pod_status" != "Running" ]; then
|
||||
print_warning "Pod $pod is not running (status: $pod_status), retrying..."
|
||||
sleep $HEALTH_CHECK_INTERVAL
|
||||
retries=$((retries - 1))
|
||||
continue
|
||||
fi
|
||||
|
||||
# Perform HTTP health check
|
||||
if kubectl exec "$pod" -n "$NAMESPACE" -- wget -q -O- "http://localhost:3001$HEALTH_CHECK_PATH" &> /dev/null; then
|
||||
print_info "Health check passed for $deployment"
|
||||
return 0
|
||||
else
|
||||
print_warning "Health check failed for $deployment, retrying..."
|
||||
sleep $HEALTH_CHECK_INTERVAL
|
||||
retries=$((retries - 1))
|
||||
fi
|
||||
done
|
||||
|
||||
print_error "Health check failed after $HEALTH_CHECK_RETRIES retries"
|
||||
return 1
|
||||
}
|
||||
|
||||
# Main deployment logic
|
||||
main() {
|
||||
print_info "Starting Blue-Green deployment for $APP_NAME"
|
||||
print_info "Namespace: $NAMESPACE"
|
||||
print_info "Image Tag: $IMAGE_TAG"
|
||||
|
||||
# Determine current active environment
|
||||
local current_env=$(get_active_environment)
|
||||
|
||||
if [ -z "$current_env" ]; then
|
||||
# No active environment, default to blue
|
||||
current_env="blue"
|
||||
new_env="green"
|
||||
print_info "No active environment found, will deploy to green"
|
||||
elif [ "$current_env" = "blue" ]; then
|
||||
new_env="green"
|
||||
else
|
||||
new_env="blue"
|
||||
fi
|
||||
|
||||
print_info "Current active environment: $current_env"
|
||||
print_info "Deploying to: $new_env"
|
||||
|
||||
# Create or update new environment deployment
|
||||
local new_deployment="$APP_NAME-$new_env"
|
||||
|
||||
# Apply deployment
|
||||
kubectl set image "deployment/$new_deployment" \
|
||||
backend="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
|
||||
-n "$NAMESPACE" 2>/dev/null || \
|
||||
kubectl create deployment "$new_deployment" \
|
||||
--image="ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG" \
|
||||
-n "$NAMESPACE"
|
||||
|
||||
# Label the deployment
|
||||
kubectl label deployment "$new_deployment" app=spywatcher version=$new_env -n "$NAMESPACE" --overwrite
|
||||
|
||||
# Wait for deployment to be ready
|
||||
print_info "Waiting for deployment $new_deployment to be ready..."
|
||||
kubectl rollout status "deployment/$new_deployment" -n "$NAMESPACE" --timeout=5m
|
||||
|
||||
# Perform health checks
|
||||
if ! health_check "$new_env"; then
|
||||
print_error "Health check failed for $new_env environment"
|
||||
print_error "Keeping traffic on $current_env environment"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Update service selector to point to new environment
|
||||
print_info "Switching traffic to $new_env environment..."
|
||||
kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
|
||||
-p "{\"spec\":{\"selector\":{\"version\":\"$new_env\"}}}"
|
||||
|
||||
print_info "Traffic successfully switched to $new_env"
|
||||
|
||||
# Wait a bit before considering old environment for removal
|
||||
print_info "Waiting 60 seconds before cleaning up old environment..."
|
||||
sleep 60
|
||||
|
||||
# Optional: Scale down old environment instead of deleting
|
||||
if deployment_exists "$APP_NAME-$current_env"; then
|
||||
print_info "Scaling down old environment: $current_env"
|
||||
kubectl scale deployment "$APP_NAME-$current_env" --replicas=0 -n "$NAMESPACE"
|
||||
print_info "Old environment scaled to 0 replicas (can be used for quick rollback)"
|
||||
fi
|
||||
|
||||
print_info "Blue-Green deployment completed successfully!"
|
||||
print_info "Active environment: $new_env"
|
||||
}
|
||||
|
||||
# Rollback function
|
||||
rollback() {
|
||||
print_warning "Rolling back deployment..."
|
||||
|
||||
local current_env=$(get_active_environment)
|
||||
local previous_env
|
||||
|
||||
if [ "$current_env" = "blue" ]; then
|
||||
previous_env="green"
|
||||
else
|
||||
previous_env="blue"
|
||||
fi
|
||||
|
||||
# Check if previous environment exists
|
||||
if ! deployment_exists "$APP_NAME-$previous_env"; then
|
||||
print_error "Previous environment $previous_env does not exist, cannot rollback"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Scale up previous environment if it's scaled down
|
||||
local replicas=$(kubectl get deployment "$APP_NAME-$previous_env" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
|
||||
if [ "$replicas" -eq 0 ]; then
|
||||
print_info "Scaling up previous environment: $previous_env"
|
||||
kubectl scale deployment "$APP_NAME-$previous_env" --replicas=3 -n "$NAMESPACE"
|
||||
kubectl rollout status "deployment/$APP_NAME-$previous_env" -n "$NAMESPACE" --timeout=5m
|
||||
fi
|
||||
|
||||
# Switch traffic back
|
||||
print_info "Switching traffic back to $previous_env"
|
||||
kubectl patch service "$APP_NAME" -n "$NAMESPACE" \
|
||||
-p "{\"spec\":{\"selector\":{\"version\":\"$previous_env\"}}}"
|
||||
|
||||
print_info "Rollback completed successfully!"
|
||||
print_info "Active environment: $previous_env"
|
||||
}
|
||||
|
||||
# Check if rollback flag is set
|
||||
if [ "$1" = "--rollback" ]; then
|
||||
rollback
|
||||
else
|
||||
main
|
||||
fi
|
||||
218
scripts/deployment/canary-deploy.sh
Executable file
218
scripts/deployment/canary-deploy.sh
Executable file
@@ -0,0 +1,218 @@
|
||||
#!/bin/bash
set -e

# Canary Deployment Script for Spywatcher
# Gradually shifts traffic to a new version while monitoring for errors.

# ANSI color codes for log output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (every value overridable via environment variables)
: "${NAMESPACE:=spywatcher}"
: "${APP_NAME:=spywatcher-backend}"
: "${IMAGE_TAG:=latest}"
: "${HEALTH_CHECK_PATH:=/health/ready}"

# Canary rollout percentages and pacing
: "${CANARY_STEPS:=5 25 50 75 100}"
: "${CANARY_WAIT:=60}" # Wait time between steps in seconds

# Max error count tolerated before an automatic rollback
: "${ERROR_THRESHOLD:=5}"
# Colored log helpers: each prints a level-tagged message to stdout.
_log() {
    # $1 = color code, $2 = level tag, $3 = message
    echo -e "${1}[${2}]${NC} $3"
}

print_info() {
    _log "$GREEN" "INFO" "$1"
}

print_warning() {
    _log "$YELLOW" "WARNING" "$1"
}

print_error() {
    _log "$RED" "ERROR" "$1"
}
|
||||
# Function to check deployment health
|
||||
check_health() {
|
||||
local deployment=$1
|
||||
local replicas=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}')
|
||||
local desired=$(kubectl get deployment "$deployment" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}')
|
||||
|
||||
if [ "$replicas" = "$desired" ] && [ "$replicas" -gt 0 ]; then
|
||||
return 0
|
||||
else
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to check error rate (simplified - you should integrate with your
# monitoring system). Inspects recent canary pod logs and returns 1 when the
# number of ERROR entries exceeds ERROR_THRESHOLD, 0 otherwise.
# NOTE(review): the $1 deployment argument is accepted for symmetry with
# check_health but is not used — pods are selected by the fixed labels
# app=spywatcher,version=canary; confirm those labels match the deployment.
check_error_rate() {
    local deployment=$1

    local pods
    pods=$(kubectl get pods -n "$NAMESPACE" -l app=spywatcher,version=canary -o jsonpath='{.items[*].metadata.name}')

    if [ -z "$pods" ]; then
        print_warning "No canary pods found"
        return 0
    fi

    # Count ERROR entries across the last 100 log lines of every canary pod.
    # `|| true` keeps `set -e` happy when grep finds no matches (it still
    # prints 0 in that case).
    local total=0 pod hits
    for pod in $pods; do
        hits=$(kubectl logs "$pod" -n "$NAMESPACE" --tail=100 | grep -c "ERROR" || true)
        total=$((total + hits))
    done

    print_info "Detected $total errors in canary pods"

    if [ "$total" -gt "$ERROR_THRESHOLD" ]; then
        return 1
    fi
    return 0
}
|
||||
# Function to update traffic weights.
# NOTE: This implementation uses replica counts to approximate traffic
# splitting, which is not precise. For accurate percentage-based traffic
# splitting, use a service mesh (Istio, Linkerd) or an ingress controller
# that supports weighted traffic splitting (e.g. NGINX Ingress canary
# annotations).
update_traffic_weight() {
    local canary_weight=$1
    local stable_weight=$((100 - canary_weight))

    print_info "Adjusting traffic: Canary $canary_weight%, Stable $stable_weight%"
    print_warning "Note: Replica-based traffic splitting is approximate. Actual traffic may not match percentages exactly."

    # Total capacity is now configurable; defaults to the previous
    # hard-coded value of 3.
    local total_replicas="${TOTAL_REPLICAS:-3}"
    # Round to the nearest replica (+50 before integer division).
    local canary_replicas=$(( (total_replicas * canary_weight + 50) / 100 ))
    local stable_replicas=$((total_replicas - canary_replicas))

    # Keep at least 1 replica on each side so neither pool goes dark
    # mid-rollout (stable is only removed after a successful promotion).
    [ "$canary_replicas" -eq 0 ] && canary_replicas=1
    [ "$stable_replicas" -eq 0 ] && stable_replicas=1

    # Apply the new replica counts
    kubectl scale deployment "$APP_NAME-canary" --replicas=$canary_replicas -n "$NAMESPACE"
    kubectl scale deployment "$APP_NAME-stable" --replicas=$stable_replicas -n "$NAMESPACE"

    # Block until both deployments settle
    kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=2m
    kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=2m
}
|
||||
# Function to promote canary to stable: copies the canary image into the
# stable deployment, restores stable to full capacity, then removes the
# canary deployment.
promote_canary() {
    print_info "Promoting canary to stable..."

    # Image currently running in the canary's first container
    local canary_image
    canary_image=$(kubectl get deployment "$APP_NAME-canary" -n "$NAMESPACE" -o jsonpath='{.spec.template.spec.containers[0].image}')

    # CONTAINER_NAME lets callers override the container to update
    # (default: backend, the previous hard-coded name).
    kubectl set image "deployment/$APP_NAME-stable" "${CONTAINER_NAME:-backend}=$canary_image" -n "$NAMESPACE"
    kubectl rollout status "deployment/$APP_NAME-stable" -n "$NAMESPACE" --timeout=5m

    # Scale stable back to full capacity (TOTAL_REPLICAS, default 3)
    kubectl scale deployment "$APP_NAME-stable" --replicas="${TOTAL_REPLICAS:-3}" -n "$NAMESPACE"

    # Remove canary deployment now that stable runs the new image
    kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true

    print_info "Canary promoted to stable successfully!"
}
|
||||
# Function to rollback canary: removes the failing canary deployment and
# restores the stable deployment to full capacity.
rollback_canary() {
    print_error "Rolling back canary deployment..."

    # Delete canary deployment (tolerate it already being gone)
    kubectl delete deployment "$APP_NAME-canary" -n "$NAMESPACE" --ignore-not-found=true

    # Ensure stable is at full capacity (TOTAL_REPLICAS, default 3 —
    # consistent with update_traffic_weight and promote_canary)
    kubectl scale deployment "$APP_NAME-stable" --replicas="${TOTAL_REPLICAS:-3}" -n "$NAMESPACE"

    print_info "Canary deployment rolled back"
}
|
||||
# Main deployment logic: bootstraps a stable deployment if needed, creates a
# canary copy running the new image, shifts traffic through CANARY_STEPS,
# and promotes or rolls back based on health and error-rate checks.
main() {
    print_info "Starting Canary deployment for $APP_NAME"
    print_info "Namespace: $NAMESPACE"
    print_info "Image Tag: $IMAGE_TAG"
    print_info "Canary steps: $CANARY_STEPS"

    # Ensure stable deployment exists, bootstrapping it from the plain
    # deployment on first run.
    if ! kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" &> /dev/null; then
        if kubectl get deployment "$APP_NAME" -n "$NAMESPACE" &> /dev/null; then
            print_info "Creating stable deployment from existing deployment"
            # NOTE(review): exporting live YAML carries server-set metadata
            # (resourceVersion, uid, status); kubectl apply tolerates this,
            # but a cleaner clone would strip those fields first.
            kubectl get deployment "$APP_NAME" -n "$NAMESPACE" -o yaml | \
                sed "s/name: $APP_NAME$/name: $APP_NAME-stable/" | \
                kubectl apply -f -
            kubectl label deployment "$APP_NAME-stable" version=stable -n "$NAMESPACE" --overwrite
        else
            print_error "No existing deployment found"
            exit 1
        fi
    fi

    # Create canary deployment as a copy of stable
    print_info "Creating canary deployment..."
    kubectl get deployment "$APP_NAME-stable" -n "$NAMESPACE" -o yaml | \
        sed "s/$APP_NAME-stable/$APP_NAME-canary/g" | \
        sed "s/version: stable/version: canary/g" | \
        kubectl apply -f -

    # Point the canary at the new image. CANARY_IMAGE and CONTAINER_NAME are
    # overridable; the defaults reproduce the previous hard-coded values.
    kubectl set image "deployment/$APP_NAME-canary" \
        "${CONTAINER_NAME:-backend}=${CANARY_IMAGE:-ghcr.io/subculture-collective/spywatcher-backend:$IMAGE_TAG}" \
        -n "$NAMESPACE"

    kubectl label deployment "$APP_NAME-canary" version=canary -n "$NAMESPACE" --overwrite

    # Start with minimal canary traffic
    kubectl scale deployment "$APP_NAME-canary" --replicas=1 -n "$NAMESPACE"
    kubectl rollout status "deployment/$APP_NAME-canary" -n "$NAMESPACE" --timeout=5m

    # Gradually shift traffic, verifying health and error rate at each step
    for step in $CANARY_STEPS; do
        print_info "Canary rollout: ${step}%"

        # Update traffic weights
        update_traffic_weight "$step"

        # Wait for the step duration
        print_info "Waiting ${CANARY_WAIT}s before next step..."
        sleep "$CANARY_WAIT"

        # Check health
        if ! check_health "$APP_NAME-canary"; then
            print_error "Canary health check failed"
            rollback_canary
            exit 1
        fi

        # Check error rate
        if ! check_error_rate "$APP_NAME-canary"; then
            print_error "Canary error rate exceeded threshold"
            rollback_canary
            exit 1
        fi

        print_info "Step ${step}% completed successfully"
    done

    # Promote canary to stable
    promote_canary

    print_info "Canary deployment completed successfully!"
}

# Run main function
main
|
||||
336
terraform/README.md
Normal file
336
terraform/README.md
Normal file
@@ -0,0 +1,336 @@
|
||||
# Spywatcher Infrastructure as Code
|
||||
|
||||
This directory contains Terraform configurations for deploying Spywatcher infrastructure on AWS.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- AWS CLI configured with appropriate credentials
|
||||
- Terraform >= 1.5.0
|
||||
- kubectl
|
||||
- Helm (optional)
|
||||
|
||||
## Infrastructure Components
|
||||
|
||||
### Modules
|
||||
|
||||
- **VPC**: Virtual Private Cloud with public, private, and database subnets
|
||||
- **EKS**: Elastic Kubernetes Service cluster
|
||||
- **RDS**: PostgreSQL database
|
||||
- **Redis**: ElastiCache Redis cluster
|
||||
- **ALB**: Application Load Balancer with WAF
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
terraform/
|
||||
├── main.tf # Root module configuration
|
||||
├── variables.tf # Root module variables
|
||||
├── outputs.tf # Root module outputs
|
||||
├── modules/ # Reusable modules
|
||||
│ ├── vpc/
|
||||
│ ├── eks/
|
||||
│ ├── rds/
|
||||
│ ├── redis/
|
||||
│ └── alb/
|
||||
└── environments/ # Environment-specific configurations
|
||||
├── production/
|
||||
│ └── terraform.tfvars
|
||||
└── staging/
|
||||
└── terraform.tfvars
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Configure Backend
|
||||
|
||||
First, create an S3 bucket and DynamoDB table for state management:
|
||||
|
||||
```bash
|
||||
# Create S3 bucket for state
# (in regions other than us-east-1 you must additionally pass
#  --create-bucket-configuration LocationConstraint=<region>)
aws s3api create-bucket \
  --bucket spywatcher-terraform-state \
  --region us-east-1
|
||||
|
||||
# Enable versioning
|
||||
aws s3api put-bucket-versioning \
|
||||
--bucket spywatcher-terraform-state \
|
||||
--versioning-configuration Status=Enabled
|
||||
|
||||
# Create DynamoDB table for state locking
|
||||
aws dynamodb create-table \
|
||||
--table-name terraform-state-lock \
|
||||
--attribute-definitions AttributeName=LockID,AttributeType=S \
|
||||
--key-schema AttributeName=LockID,KeyType=HASH \
|
||||
--billing-mode PAY_PER_REQUEST \
|
||||
--region us-east-1
|
||||
```
|
||||
|
||||
### 2. Initialize Terraform
|
||||
|
||||
```bash
|
||||
cd terraform
|
||||
terraform init
|
||||
```
|
||||
|
||||
### 3. Review and Customize
|
||||
|
||||
Edit the appropriate `terraform.tfvars` file:
|
||||
|
||||
```bash
|
||||
# For production
|
||||
vim environments/production/terraform.tfvars
|
||||
|
||||
# For staging
|
||||
vim environments/staging/terraform.tfvars
|
||||
```
|
||||
|
||||
Key configurations to update:
|
||||
- `certificate_arn`: SSL certificate ARN from AWS Certificate Manager
|
||||
- VPC CIDR blocks (if needed)
|
||||
- Instance types and sizes
|
||||
- Database credentials (use environment variables or AWS Secrets Manager)
|
||||
|
||||
### 4. Plan Infrastructure
|
||||
|
||||
```bash
|
||||
# Production
|
||||
terraform plan -var-file="environments/production/terraform.tfvars"
|
||||
|
||||
# Staging
|
||||
terraform plan -var-file="environments/staging/terraform.tfvars"
|
||||
```
|
||||
|
||||
### 5. Apply Infrastructure
|
||||
|
||||
```bash
|
||||
# Production
|
||||
terraform apply -var-file="environments/production/terraform.tfvars"
|
||||
|
||||
# Staging
|
||||
terraform apply -var-file="environments/staging/terraform.tfvars"
|
||||
```
|
||||
|
||||
This will create:
|
||||
- VPC with NAT gateways
|
||||
- EKS cluster with node groups
|
||||
- RDS PostgreSQL instance
|
||||
- ElastiCache Redis cluster
|
||||
- Application Load Balancer
|
||||
- Security groups and IAM roles
|
||||
|
||||
### 6. Configure kubectl
|
||||
|
||||
After infrastructure is created:
|
||||
|
||||
```bash
|
||||
# Get the cluster name from outputs
|
||||
terraform output eks_cluster_name
|
||||
|
||||
# Configure kubectl
|
||||
aws eks update-kubeconfig \
|
||||
--name $(terraform output -raw eks_cluster_name) \
|
||||
--region us-east-1
|
||||
|
||||
# Verify connection
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
## Outputs
|
||||
|
||||
After applying, Terraform will output important values:
|
||||
|
||||
```bash
|
||||
# View all outputs
|
||||
terraform output
|
||||
|
||||
# View specific output
|
||||
terraform output rds_endpoint
|
||||
terraform output eks_cluster_endpoint
|
||||
```
|
||||
|
||||
## Secrets Management
|
||||
|
||||
### Database Password
|
||||
|
||||
The RDS password is auto-generated and stored in AWS Secrets Manager:
|
||||
|
||||
```bash
|
||||
# Retrieve database password
|
||||
aws secretsmanager get-secret-value \
|
||||
--secret-id spywatcher-production-db-password \
|
||||
--query SecretString \
|
||||
--output text
|
||||
```
|
||||
|
||||
### Redis Auth Token
|
||||
|
||||
Redis authentication token is also in Secrets Manager:
|
||||
|
||||
```bash
|
||||
# Retrieve Redis auth token
|
||||
aws secretsmanager get-secret-value \
|
||||
--secret-id spywatcher-production-auth-token \
|
||||
--query SecretString \
|
||||
--output text
|
||||
```
|
||||
|
||||
## Updating Infrastructure
|
||||
|
||||
```bash
|
||||
# Make changes to .tf files or terraform.tfvars
|
||||
|
||||
# Plan changes
|
||||
terraform plan -var-file="environments/production/terraform.tfvars"
|
||||
|
||||
# Apply changes
|
||||
terraform apply -var-file="environments/production/terraform.tfvars"
|
||||
```
|
||||
|
||||
## Destroying Infrastructure
|
||||
|
||||
⚠️ **WARNING**: This will destroy all resources. Make sure you have backups!
|
||||
|
||||
```bash
|
||||
# Destroy infrastructure
|
||||
terraform destroy -var-file="environments/production/terraform.tfvars"
|
||||
```
|
||||
|
||||
## Module Documentation
|
||||
|
||||
### VPC Module
|
||||
|
||||
Creates a VPC with:
|
||||
- 3 availability zones
|
||||
- Public, private, and database subnets
|
||||
- NAT gateways for private subnet internet access
|
||||
- VPC Flow Logs
|
||||
|
||||
### EKS Module
|
||||
|
||||
Creates an EKS cluster with:
|
||||
- Managed node groups
|
||||
- OIDC provider for IRSA
|
||||
- Essential add-ons (VPC CNI, CoreDNS, kube-proxy)
|
||||
- Security groups
|
||||
|
||||
### RDS Module
|
||||
|
||||
Creates a PostgreSQL database with:
|
||||
- Encryption at rest
|
||||
- Automated backups
|
||||
- Multi-AZ deployment (production)
|
||||
- Performance Insights
|
||||
- CloudWatch alarms
|
||||
|
||||
### Redis Module
|
||||
|
||||
Creates an ElastiCache Redis cluster with:
|
||||
- Encryption in transit and at rest
|
||||
- Authentication token
|
||||
- Automatic failover (if multi-node)
|
||||
- CloudWatch alarms
|
||||
|
||||
### ALB Module
|
||||
|
||||
Creates an Application Load Balancer with:
|
||||
- HTTPS termination
|
||||
- HTTP to HTTPS redirect
|
||||
- WAF with rate limiting
|
||||
- AWS Managed Rules
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
### Development/Testing
|
||||
|
||||
For cost savings in non-production:
|
||||
- Use smaller instance types
|
||||
- Single-AZ deployments
|
||||
- Spot instances for EKS nodes
|
||||
- Reduce backup retention periods
|
||||
|
||||
### Production
|
||||
|
||||
- Use Reserved Instances for steady-state workload
|
||||
- Enable auto-scaling
|
||||
- Right-size instances based on metrics
|
||||
- Use S3 lifecycle policies for backups
|
||||
|
||||
## Monitoring
|
||||
|
||||
### CloudWatch Alarms
|
||||
|
||||
The modules create CloudWatch alarms for:
|
||||
- RDS CPU utilization
|
||||
- RDS storage space
|
||||
- Redis CPU utilization
|
||||
- Redis memory usage
|
||||
|
||||
Configure SNS topics for notifications:
|
||||
|
||||
```bash
|
||||
# Create SNS topic
|
||||
aws sns create-topic --name spywatcher-alerts
|
||||
|
||||
# Subscribe to topic
|
||||
aws sns subscribe \
|
||||
--topic-arn arn:aws:sns:us-east-1:123456789012:spywatcher-alerts \
|
||||
--protocol email \
|
||||
--notification-endpoint your-email@example.com
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### State Lock Issues
|
||||
|
||||
If you encounter state lock errors:
|
||||
|
||||
```bash
|
||||
# Force unlock (use carefully)
|
||||
terraform force-unlock <LOCK_ID>
|
||||
```
|
||||
|
||||
### EKS Access Issues
|
||||
|
||||
If you can't access the cluster:
|
||||
|
||||
```bash
|
||||
# Ensure your AWS credentials are correct
|
||||
aws sts get-caller-identity
|
||||
|
||||
# Update kubeconfig
|
||||
aws eks update-kubeconfig --name <cluster-name> --region us-east-1
|
||||
|
||||
# Check IAM authentication
|
||||
kubectl auth can-i get pods --all-namespaces
|
||||
```
|
||||
|
||||
### RDS Connection Issues
|
||||
|
||||
```bash
|
||||
# Check security group rules
|
||||
aws ec2 describe-security-groups --group-ids <sg-id>
|
||||
|
||||
# Test connection from EKS node
|
||||
kubectl run -it --rm debug --image=postgres:15-alpine --restart=Never -- \
|
||||
psql -h <rds-endpoint> -U spywatcher -d spywatcher
|
||||
```
|
||||
|
||||
## Security Best Practices
|
||||
|
||||
1. **Never commit secrets**: Use AWS Secrets Manager or environment variables
|
||||
2. **Enable MFA**: For AWS account access
|
||||
3. **Use IAM roles**: Instead of access keys where possible
|
||||
4. **Regular updates**: Keep Terraform and providers up to date
|
||||
5. **Review changes**: Always review `terraform plan` output
|
||||
6. **Backup state**: S3 versioning is enabled for state files
|
||||
7. **Least privilege**: IAM policies follow least privilege principle
|
||||
|
||||
## Support
|
||||
|
||||
For infrastructure issues:
|
||||
- Check Terraform state: `terraform show`
|
||||
- Review CloudWatch logs
|
||||
- Check AWS CloudTrail for API calls
|
||||
- Consult AWS documentation
|
||||
- Create issue in repository
|
||||
54
terraform/environments/production/terraform.tfvars
Normal file
54
terraform/environments/production/terraform.tfvars
Normal file
@@ -0,0 +1,54 @@
|
||||
# Production environment configuration.
# Apply with: terraform apply -var-file="environments/production/terraform.tfvars"

environment  = "production"
aws_region   = "us-east-1"
project_name = "spywatcher"

# --- Networking ---
vpc_cidr              = "10.0.0.0/16"
availability_zones    = ["us-east-1a", "us-east-1b", "us-east-1c"]
private_subnet_cidrs  = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
public_subnet_cidrs   = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
database_subnet_cidrs = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]

# --- EKS ---
eks_cluster_version = "1.28"
eks_node_groups = {
  # Baseline on-demand capacity
  general = {
    desired_size   = 3
    min_size       = 2
    max_size       = 10
    instance_types = ["t3.large"]
    capacity_type  = "ON_DEMAND"
  }
  # Burst capacity on spot instances
  spot = {
    desired_size   = 2
    min_size       = 0
    max_size       = 5
    instance_types = ["t3.large", "t3a.large"]
    capacity_type  = "SPOT"
  }
}

# --- RDS (PostgreSQL) ---
rds_engine_version    = "15.3"
rds_instance_class    = "db.t3.large"
rds_allocated_storage = 100
database_name         = "spywatcher"
database_username     = "spywatcher"

# --- ElastiCache (Redis) ---
redis_node_type       = "cache.t3.medium"
redis_num_cache_nodes = 2

# --- TLS ---
# Replace with the actual certificate ARN after creating it in ACM
certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

# --- Additional tags applied to resources ---
tags = {
  Terraform   = "true"
  Environment = "production"
  Project     = "spywatcher"
  CostCenter  = "engineering"
}
|
||||
45
terraform/environments/staging/terraform.tfvars
Normal file
45
terraform/environments/staging/terraform.tfvars
Normal file
@@ -0,0 +1,45 @@
|
||||
# Staging environment configuration.
# Apply with: terraform apply -var-file="environments/staging/terraform.tfvars"

environment  = "staging"
aws_region   = "us-east-1"
project_name = "spywatcher"

# --- Networking (10.1.0.0/16 — disjoint from production's 10.0.0.0/16) ---
vpc_cidr              = "10.1.0.0/16"
availability_zones    = ["us-east-1a", "us-east-1b"]
private_subnet_cidrs  = ["10.1.1.0/24", "10.1.2.0/24"]
public_subnet_cidrs   = ["10.1.101.0/24", "10.1.102.0/24"]
database_subnet_cidrs = ["10.1.201.0/24", "10.1.202.0/24"]

# --- EKS ---
eks_cluster_version = "1.28"
eks_node_groups = {
  general = {
    desired_size   = 2
    min_size       = 1
    max_size       = 4
    instance_types = ["t3.medium"]
    capacity_type  = "ON_DEMAND"
  }
}

# --- RDS (PostgreSQL) ---
rds_engine_version    = "15.3"
rds_instance_class    = "db.t3.medium"
rds_allocated_storage = 50
database_name         = "spywatcher"
database_username     = "spywatcher"

# --- ElastiCache (Redis) ---
redis_node_type       = "cache.t3.small"
redis_num_cache_nodes = 1

# --- TLS ---
# Replace with the actual certificate ARN after creating it in ACM
certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

# --- Additional tags applied to resources ---
tags = {
  Terraform   = "true"
  Environment = "staging"
  Project     = "spywatcher"
}
|
||||
141
terraform/main.tf
Normal file
141
terraform/main.tf
Normal file
@@ -0,0 +1,141 @@
|
||||
# Root module: wires the VPC, EKS, RDS, Redis and ALB modules together and
# configures the AWS, Kubernetes and Helm providers.

terraform {
  required_version = ">= 1.5.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.11"
    }
  }

  # Remote state in S3 with DynamoDB locking.
  # The key should be set per environment at init time:
  #   terraform init -backend-config="key=<environment>/terraform.tfstate"
  backend "s3" {
    bucket         = "spywatcher-terraform-state"
    key            = "terraform.tfstate" # Override with -backend-config flag
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

provider "aws" {
  region = var.aws_region

  # Applied to every AWS resource created by this configuration
  default_tags {
    tags = {
      Project     = "spywatcher"
      Environment = var.environment
      ManagedBy   = "terraform"
    }
  }
}

# Networking: VPC with public, private and database subnets
module "vpc" {
  source = "./modules/vpc"

  environment           = var.environment
  vpc_cidr              = var.vpc_cidr
  availability_zones    = var.availability_zones
  private_subnet_cidrs  = var.private_subnet_cidrs
  public_subnet_cidrs   = var.public_subnet_cidrs
  database_subnet_cidrs = var.database_subnet_cidrs
}

# Kubernetes: EKS cluster on the private subnets
module "eks" {
  source = "./modules/eks"

  environment        = var.environment
  cluster_name       = "${var.project_name}-${var.environment}"
  cluster_version    = var.eks_cluster_version
  vpc_id             = module.vpc.vpc_id
  private_subnet_ids = module.vpc.private_subnet_ids

  node_groups = var.eks_node_groups
}

# Database: RDS PostgreSQL, reachable only from the EKS cluster SG
module "rds" {
  source = "./modules/rds"

  environment                = var.environment
  identifier                 = "${var.project_name}-${var.environment}"
  engine_version             = var.rds_engine_version
  instance_class             = var.rds_instance_class
  allocated_storage          = var.rds_allocated_storage
  database_name              = var.database_name
  master_username            = var.database_username
  vpc_id                     = module.vpc.vpc_id
  database_subnet_ids        = module.vpc.database_subnet_ids
  allowed_security_group_ids = [module.eks.cluster_security_group_id]
}

# Cache: ElastiCache Redis, reachable only from the EKS cluster SG
module "redis" {
  source = "./modules/redis"

  environment                = var.environment
  cluster_id                 = "${var.project_name}-${var.environment}"
  node_type                  = var.redis_node_type
  num_cache_nodes            = var.redis_num_cache_nodes
  vpc_id                     = module.vpc.vpc_id
  subnet_ids                 = module.vpc.private_subnet_ids
  allowed_security_group_ids = [module.eks.cluster_security_group_id]
}

# Ingress: public Application Load Balancer with TLS termination
module "alb" {
  source = "./modules/alb"

  environment       = var.environment
  vpc_id            = module.vpc.vpc_id
  public_subnet_ids = module.vpc.public_subnet_ids
  certificate_arn   = var.certificate_arn
}

# Kubernetes provider authenticates to EKS via `aws eks get-token`.
# NOTE(review): exec api_version v1 requires a recent AWS CLI; older CLIs
# emit v1beta1 tokens — confirm the CLI version in CI.
provider "kubernetes" {
  host                   = module.eks.cluster_endpoint
  cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)

  exec {
    api_version = "client.authentication.k8s.io/v1"
    command     = "aws"
    args = [
      "eks",
      "get-token",
      "--cluster-name",
      module.eks.cluster_name
    ]
  }
}

# Helm provider uses the same EKS authentication as the Kubernetes provider
provider "helm" {
  kubernetes {
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_ca_certificate)

    exec {
      api_version = "client.authentication.k8s.io/v1"
      command     = "aws"
      args = [
        "eks",
        "get-token",
        "--cluster-name",
        module.eks.cluster_name
      ]
    }
  }
}
|
||||
262
terraform/modules/alb/main.tf
Normal file
262
terraform/modules/alb/main.tf
Normal file
@@ -0,0 +1,262 @@
|
||||
# Security group for the public ALB: HTTP/HTTPS in from anywhere, all out.
resource "aws_security_group" "alb" {
  name        = "${var.environment}-alb-sg"
  description = "Security group for Application Load Balancer"
  vpc_id      = var.vpc_id

  ingress {
    from_port   = 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "HTTP access"
  }

  ingress {
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "HTTPS access"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound traffic"
  }

  tags = {
    Name        = "${var.environment}-alb-sg"
    Environment = var.environment
  }
}

# Internet-facing Application Load Balancer
resource "aws_lb" "main" {
  name               = "${var.environment}-alb"
  internal           = false
  load_balancer_type = "application"
  security_groups    = [aws_security_group.alb.id]
  subnets            = var.public_subnet_ids

  enable_deletion_protection       = var.enable_deletion_protection
  enable_http2                     = true
  enable_cross_zone_load_balancing = true

  # Drop headers with invalid characters before forwarding (security hardening)
  drop_invalid_header_fields = true

  tags = {
    Name        = "${var.environment}-alb"
    Environment = var.environment
  }
}

# Target group for the backend API pods
resource "aws_lb_target_group" "backend" {
  name     = "${var.environment}-backend-tg"
  port     = 80
  protocol = "HTTP"
  vpc_id   = var.vpc_id

  health_check {
    enabled             = true
    healthy_threshold   = 2
    unhealthy_threshold = 3
    timeout             = 5
    interval            = 30
    path                = "/health/live"
    matcher             = "200"
  }

  deregistration_delay = 30

  # NOTE(review): cookie stickiness on an API target group pins clients to
  # one backend for 24h — confirm the backend actually needs session
  # affinity; stateless APIs usually should not enable this.
  stickiness {
    type            = "lb_cookie"
    cookie_duration = 86400
    enabled         = true
  }

  tags = {
    Name        = "${var.environment}-backend-tg"
    Environment = var.environment
  }
}

# Target group for the frontend (static assets / SPA) pods
resource "aws_lb_target_group" "frontend" {
  name     = "${var.environment}-frontend-tg"
  port     = 80
  protocol = "HTTP"
  vpc_id   = var.vpc_id

  health_check {
    enabled             = true
    healthy_threshold   = 2
    unhealthy_threshold = 3
    timeout             = 5
    interval            = 30
    path                = "/"
    matcher             = "200"
  }

  deregistration_delay = 30

  tags = {
    Name        = "${var.environment}-frontend-tg"
    Environment = var.environment
  }
}

# HTTP Listener - permanent redirect to HTTPS
resource "aws_lb_listener" "http" {
  load_balancer_arn = aws_lb.main.arn
  port              = "80"
  protocol          = "HTTP"

  default_action {
    type = "redirect"

    redirect {
      port        = "443"
      protocol    = "HTTPS"
      status_code = "HTTP_301"
    }
  }
}

# HTTPS Listener - terminates TLS and forwards to the frontend by default
resource "aws_lb_listener" "https" {
  load_balancer_arn = aws_lb.main.arn
  port              = "443"
  protocol          = "HTTPS"
  # Modern policy supporting TLS 1.3 with a TLS 1.2 floor; replaces the
  # deprecated 2017 TLS 1.2 policy.
  ssl_policy      = "ELBSecurityPolicy-TLS13-1-2-2021-06"
  certificate_arn = var.certificate_arn

  default_action {
    type = "forward"

    forward {
      target_group {
        arn    = aws_lb_target_group.frontend.arn
        weight = 100
      }
    }
  }
}

# Listener rule: route API and health-check paths to the backend
resource "aws_lb_listener_rule" "api" {
  listener_arn = aws_lb_listener.https.arn
  priority     = 100

  action {
    type = "forward"

    forward {
      target_group {
        arn    = aws_lb_target_group.backend.arn
        weight = 100
      }
    }
  }

  condition {
    path_pattern {
      values = ["/api/*", "/health/*"]
    }
  }
}

# WAF Web ACL: rate limiting plus AWS managed rule sets
resource "aws_wafv2_web_acl" "main" {
  name  = "${var.environment}-waf"
  scope = "REGIONAL"

  default_action {
    allow {}
  }

  # Block any single IP exceeding 2000 requests per 5-minute window
  rule {
    name     = "RateLimitRule"
    priority = 1

    action {
      block {}
    }

    statement {
      rate_based_statement {
        limit              = 2000
        aggregate_key_type = "IP"
      }
    }

    visibility_config {
      cloudwatch_metrics_enabled = true
      metric_name                = "RateLimitRule"
      sampled_requests_enabled   = true
    }
  }

  # AWS Managed Rules - Core Rule Set (OWASP-style generic protections)
  rule {
    name     = "AWSManagedRulesCommonRuleSet"
    priority = 2

    override_action {
      none {}
    }

    statement {
      managed_rule_group_statement {
        name        = "AWSManagedRulesCommonRuleSet"
        vendor_name = "AWS"
      }
    }

    visibility_config {
      cloudwatch_metrics_enabled = true
      metric_name                = "AWSManagedRulesCommonRuleSetMetric"
      sampled_requests_enabled   = true
    }
  }

  # AWS Managed Rules - known malicious request patterns
  rule {
    name     = "AWSManagedRulesKnownBadInputsRuleSet"
    priority = 3

    override_action {
      none {}
    }

    statement {
      managed_rule_group_statement {
        name        = "AWSManagedRulesKnownBadInputsRuleSet"
        vendor_name = "AWS"
      }
    }

    visibility_config {
      cloudwatch_metrics_enabled = true
      metric_name                = "AWSManagedRulesKnownBadInputsRuleSetMetric"
      sampled_requests_enabled   = true
    }
  }

  visibility_config {
    cloudwatch_metrics_enabled = true
    metric_name                = "${var.environment}-waf"
    sampled_requests_enabled   = true
  }

  tags = {
    Name        = "${var.environment}-waf"
    Environment = var.environment
  }
}

# Associate WAF with ALB
resource "aws_wafv2_web_acl_association" "main" {
  resource_arn = aws_lb.main.arn
  web_acl_arn  = aws_wafv2_web_acl.main.arn
}
|
||||
29
terraform/modules/alb/outputs.tf
Normal file
29
terraform/modules/alb/outputs.tf
Normal file
@@ -0,0 +1,29 @@
|
||||
# Exposed values of the ALB module: load balancer identifiers, the two
# target group ARNs, and the ALB security group.

output "alb_arn" {
  description = "ALB ARN"
  value       = aws_lb.main.arn
}

output "alb_dns_name" {
  description = "ALB DNS name"
  value       = aws_lb.main.dns_name
}

output "alb_zone_id" {
  description = "ALB zone ID"
  value       = aws_lb.main.zone_id
}

output "backend_target_group_arn" {
  description = "Backend target group ARN"
  value       = aws_lb_target_group.backend.arn
}

output "frontend_target_group_arn" {
  description = "Frontend target group ARN"
  value       = aws_lb_target_group.frontend.arn
}

output "alb_security_group_id" {
  description = "ALB security group ID"
  value       = aws_security_group.alb.id
}
|
||||
25
terraform/modules/alb/variables.tf
Normal file
25
terraform/modules/alb/variables.tf
Normal file
@@ -0,0 +1,25 @@
|
||||
# Input variables for the ALB module. The ALB is internet-facing (public
# subnets) and terminates TLS with the supplied ACM certificate.
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "public_subnet_ids" {
  description = "Public subnet IDs for ALB"
  type        = list(string)
}

variable "certificate_arn" {
  description = "ARN of SSL certificate"
  type        = string
}

variable "enable_deletion_protection" {
  description = "Enable deletion protection for ALB"
  type        = bool
  default     = true
}
|
||||
178
terraform/modules/eks/main.tf
Normal file
178
terraform/modules/eks/main.tf
Normal file
@@ -0,0 +1,178 @@
|
||||
# EKS control plane. Worker traffic stays on private subnets; the API
# endpoint is reachable both privately and publicly.
# NOTE(review): public endpoint access is enabled without a
# `public_access_cidrs` restriction, so the API endpoint is reachable from
# 0.0.0.0/0 (authentication still required) — confirm this is intentional.
resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  role_arn = aws_iam_role.cluster.arn
  version  = var.cluster_version

  vpc_config {
    subnet_ids              = var.private_subnet_ids
    endpoint_private_access = true
    endpoint_public_access  = true
    security_group_ids      = [aws_security_group.cluster.id]
  }

  # Ship every control-plane log type to CloudWatch.
  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

  # IAM permissions must exist before the cluster is created and must not be
  # detached before it is destroyed.
  depends_on = [
    aws_iam_role_policy_attachment.cluster_policy,
    aws_iam_role_policy_attachment.vpc_resource_controller
  ]

  tags = {
    Name        = var.cluster_name
    Environment = var.environment
  }
}
|
||||
|
||||
# EKS Cluster IAM Role
# Service role assumed by the EKS control plane (eks.amazonaws.com).
resource "aws_iam_role" "cluster" {
  name = "${var.cluster_name}-cluster-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "eks.amazonaws.com"
      }
    }]
  })

  tags = {
    Name        = "${var.cluster_name}-cluster-role"
    Environment = var.environment
  }
}

# Baseline managed policy required by every EKS control plane.
resource "aws_iam_role_policy_attachment" "cluster_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
  role       = aws_iam_role.cluster.name
}

# Lets the control plane manage ENIs/security groups for pods (VPC resources).
resource "aws_iam_role_policy_attachment" "vpc_resource_controller" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
  role       = aws_iam_role.cluster.name
}

# Cluster Security Group
# Only egress is declared here; EKS manages the required ingress rules on the
# cluster/node security groups it creates and references.
resource "aws_security_group" "cluster" {
  name        = "${var.cluster_name}-cluster-sg"
  description = "Security group for EKS cluster"
  vpc_id      = var.vpc_id

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name        = "${var.cluster_name}-cluster-sg"
    Environment = var.environment
  }
}
|
||||
|
||||
# EKS Node Groups
# One managed node group per entry in var.node_groups; each entry selects
# capacity type (ON_DEMAND/SPOT), instance types, and scaling bounds.
resource "aws_eks_node_group" "main" {
  for_each = var.node_groups

  cluster_name    = aws_eks_cluster.main.name
  node_group_name = "${var.cluster_name}-${each.key}"
  node_role_arn   = aws_iam_role.node.arn
  subnet_ids      = var.private_subnet_ids

  capacity_type  = each.value.capacity_type
  instance_types = each.value.instance_types

  scaling_config {
    desired_size = each.value.desired_size
    max_size     = each.value.max_size
    min_size     = each.value.min_size
  }

  # Roll nodes one at a time during version/AMI updates.
  update_config {
    max_unavailable = 1
  }

  # Node IAM permissions must exist before nodes join and outlive them on
  # destroy, otherwise teardown can strand ENIs/instances.
  depends_on = [
    aws_iam_role_policy_attachment.node_policy,
    aws_iam_role_policy_attachment.cni_policy,
    aws_iam_role_policy_attachment.container_registry_policy,
  ]

  tags = {
    Name        = "${var.cluster_name}-${each.key}"
    Environment = var.environment
  }
}
|
||||
|
||||
# Node IAM Role
# Instance role assumed by the worker EC2 instances (ec2.amazonaws.com).
resource "aws_iam_role" "node" {
  name = "${var.cluster_name}-node-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ec2.amazonaws.com"
      }
    }]
  })

  tags = {
    Name        = "${var.cluster_name}-node-role"
    Environment = var.environment
  }
}

# Core worker-node permissions (join cluster, describe resources).
resource "aws_iam_role_policy_attachment" "node_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
  role       = aws_iam_role.node.name
}

# Permissions for the VPC CNI plugin to manage pod ENIs/IPs.
resource "aws_iam_role_policy_attachment" "cni_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
  role       = aws_iam_role.node.name
}

# Read-only ECR access so nodes can pull container images.
resource "aws_iam_role_policy_attachment" "container_registry_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
  role       = aws_iam_role.node.name
}
|
||||
|
||||
# OIDC Provider for IRSA (IAM Roles for Service Accounts)
# Registers the cluster's OIDC issuer with IAM so Kubernetes service accounts
# can assume IAM roles via web-identity federation.
data "tls_certificate" "cluster" {
  url = aws_eks_cluster.main.identity[0].oidc[0].issuer
}

resource "aws_iam_openid_connect_provider" "cluster" {
  client_id_list = ["sts.amazonaws.com"]
  # Pin trust to the issuer's current root CA fingerprint.
  thumbprint_list = [data.tls_certificate.cluster.certificates[0].sha1_fingerprint]
  url             = aws_eks_cluster.main.identity[0].oidc[0].issuer

  tags = {
    Name        = "${var.cluster_name}-oidc"
    Environment = var.environment
  }
}
|
||||
|
||||
# EKS Add-ons
# Managed add-ons pinned to the AWS-selected default version (no
# `addon_version` set, so AWS picks one compatible with the cluster version).
resource "aws_eks_addon" "vpc_cni" {
  cluster_name = aws_eks_cluster.main.name
  addon_name   = "vpc-cni"
}

resource "aws_eks_addon" "coredns" {
  cluster_name = aws_eks_cluster.main.name
  addon_name   = "coredns"

  # CoreDNS pods need schedulable nodes; install after node groups exist.
  depends_on = [aws_eks_node_group.main]
}

resource "aws_eks_addon" "kube_proxy" {
  cluster_name = aws_eks_cluster.main.name
  addon_name   = "kube-proxy"
}
|
||||
29
terraform/modules/eks/outputs.tf
Normal file
29
terraform/modules/eks/outputs.tf
Normal file
@@ -0,0 +1,29 @@
|
||||
# Outputs exposed by the EKS module: connection details for kubectl/providers,
# plus the OIDC issuer and node role needed to wire up IRSA elsewhere.
output "cluster_name" {
  description = "EKS cluster name"
  value       = aws_eks_cluster.main.name
}

output "cluster_endpoint" {
  description = "EKS cluster endpoint"
  value       = aws_eks_cluster.main.endpoint
}

output "cluster_ca_certificate" {
  description = "EKS cluster CA certificate"
  value       = aws_eks_cluster.main.certificate_authority[0].data
}

output "cluster_security_group_id" {
  description = "Security group ID attached to the EKS cluster"
  value       = aws_security_group.cluster.id
}

output "cluster_oidc_issuer_url" {
  description = "OIDC issuer URL"
  value       = aws_eks_cluster.main.identity[0].oidc[0].issuer
}

output "node_role_arn" {
  description = "IAM role ARN for EKS nodes"
  value       = aws_iam_role.node.arn
}
|
||||
35
terraform/modules/eks/variables.tf
Normal file
35
terraform/modules/eks/variables.tf
Normal file
@@ -0,0 +1,35 @@
|
||||
# Input variables for the EKS module. `node_groups` is a map keyed by node
# group name; each value drives one aws_eks_node_group via for_each.
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "cluster_name" {
  description = "EKS cluster name"
  type        = string
}

variable "cluster_version" {
  description = "Kubernetes version"
  type        = string
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "private_subnet_ids" {
  description = "Private subnet IDs for EKS"
  type        = list(string)
}

variable "node_groups" {
  description = "Node groups configuration"
  type = map(object({
    desired_size   = number
    min_size       = number
    max_size       = number
    instance_types = list(string)
    capacity_type  = string
  }))
}
|
||||
164
terraform/modules/rds/main.tf
Normal file
164
terraform/modules/rds/main.tf
Normal file
@@ -0,0 +1,164 @@
|
||||
# Subnet group placing the RDS instance in the dedicated database subnets.
resource "aws_db_subnet_group" "main" {
  name       = "${var.identifier}-subnet-group"
  subnet_ids = var.database_subnet_ids

  tags = {
    Name        = "${var.identifier}-subnet-group"
    Environment = var.environment
  }
}

# Security group for the database: PostgreSQL (5432) ingress only from the
# caller-supplied security groups (the EKS nodes); unrestricted egress.
resource "aws_security_group" "rds" {
  name        = "${var.identifier}-rds-sg"
  description = "Security group for RDS PostgreSQL"
  vpc_id      = var.vpc_id

  ingress {
    from_port       = 5432
    to_port         = 5432
    protocol        = "tcp"
    security_groups = var.allowed_security_group_ids
    description     = "PostgreSQL access from EKS"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name        = "${var.identifier}-rds-sg"
    Environment = var.environment
  }
}
|
||||
|
||||
# Master password for the RDS instance.
# RDS rejects '/', '@', '"', and space in MasterUserPassword, and
# random_password's default special-character set includes them, so
# `terraform apply` could fail nondeterministically whenever one of those
# characters was drawn. Restrict the special set to characters RDS accepts;
# length and the use of specials are unchanged.
resource "random_password" "master" {
  length           = 32
  special          = true
  override_special = "!#$%&*()-_=+[]{}<>:?"
}
|
||||
|
||||
# PostgreSQL instance: encrypted (customer-managed KMS key), private only,
# gp3 storage with autoscaling up to max_allocated_storage, optional Multi-AZ,
# automated backups, Performance Insights, and log export to CloudWatch.
resource "aws_db_instance" "postgres" {
  identifier     = var.identifier
  engine         = "postgres"
  engine_version = var.engine_version
  instance_class = var.instance_class

  allocated_storage     = var.allocated_storage
  max_allocated_storage = var.max_allocated_storage
  storage_type          = "gp3"
  storage_encrypted     = true
  kms_key_id            = aws_kms_key.rds.arn

  db_name  = var.database_name
  username = var.master_username
  password = random_password.master.result

  db_subnet_group_name   = aws_db_subnet_group.main.name
  vpc_security_group_ids = [aws_security_group.rds.id]
  publicly_accessible    = false

  multi_az                = var.multi_az
  backup_retention_period = var.backup_retention_period
  backup_window           = var.backup_window
  maintenance_window      = var.maintenance_window

  enabled_cloudwatch_logs_exports       = ["postgresql", "upgrade"]
  performance_insights_enabled          = true
  # 7 days is the free Performance Insights retention tier.
  performance_insights_retention_period = 7

  deletion_protection       = var.deletion_protection
  skip_final_snapshot       = var.skip_final_snapshot
  final_snapshot_identifier = var.skip_final_snapshot ? null : "${var.identifier}-final-snapshot"

  # Defer risky changes to the maintenance window rather than applying live.
  auto_minor_version_upgrade = true
  apply_immediately          = false

  tags = {
    Name        = var.identifier
    Environment = var.environment
  }
}
|
||||
|
||||
# Customer-managed KMS key for RDS storage encryption, with automatic annual
# key rotation enabled and a 10-day deletion window.
resource "aws_kms_key" "rds" {
  description             = "KMS key for RDS encryption"
  deletion_window_in_days = 10
  enable_key_rotation     = true

  tags = {
    Name        = "${var.identifier}-kms"
    Environment = var.environment
  }
}

# Friendly alias for the key.
resource "aws_kms_alias" "rds" {
  name          = "alias/${var.identifier}-rds"
  target_key_id = aws_kms_key.rds.key_id
}
|
||||
|
||||
# Store password in Secrets Manager
# Full connection bundle (credentials + endpoint) so applications can read a
# single secret instead of assembling a DSN from Terraform outputs.
resource "aws_secretsmanager_secret" "db_password" {
  name        = "${var.identifier}-db-password"
  description = "Database master password"

  tags = {
    Name        = "${var.identifier}-db-password"
    Environment = var.environment
  }
}

resource "aws_secretsmanager_secret_version" "db_password" {
  secret_id = aws_secretsmanager_secret.db_password.id
  secret_string = jsonencode({
    username = var.master_username
    password = random_password.master.result
    engine   = "postgres"
    host     = aws_db_instance.postgres.address
    port     = aws_db_instance.postgres.port
    dbname   = var.database_name
  })
}
|
||||
|
||||
# CloudWatch Alarms
# NOTE(review): neither alarm sets `alarm_actions`, so they change state but
# notify no one — confirm whether an SNS topic should be wired in.
# Alarm when average CPU exceeds 80% for two consecutive 5-minute periods.
resource "aws_cloudwatch_metric_alarm" "cpu" {
  alarm_name          = "${var.identifier}-cpu-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "CPUUtilization"
  namespace           = "AWS/RDS"
  period              = "300"
  statistic           = "Average"
  threshold           = "80"
  alarm_description   = "This metric monitors RDS CPU utilization"

  dimensions = {
    DBInstanceIdentifier = aws_db_instance.postgres.id
  }

  tags = {
    Name        = "${var.identifier}-cpu-alarm"
    Environment = var.environment
  }
}

# Alarm when free storage drops below 10 GB (single 5-minute period).
resource "aws_cloudwatch_metric_alarm" "storage" {
  alarm_name          = "${var.identifier}-free-storage-space"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "FreeStorageSpace"
  namespace           = "AWS/RDS"
  period              = "300"
  statistic           = "Average"
  threshold           = "10000000000" # 10GB in bytes
  alarm_description   = "This metric monitors RDS free storage space"

  dimensions = {
    DBInstanceIdentifier = aws_db_instance.postgres.id
  }

  tags = {
    Name        = "${var.identifier}-storage-alarm"
    Environment = var.environment
  }
}
|
||||
35
terraform/modules/rds/outputs.tf
Normal file
35
terraform/modules/rds/outputs.tf
Normal file
@@ -0,0 +1,35 @@
|
||||
# Outputs exposed by the RDS module: connection coordinates plus the ARN of
# the Secrets Manager secret carrying the full credential bundle.
output "db_endpoint" {
  description = "RDS instance endpoint"
  value       = aws_db_instance.postgres.endpoint
}

output "db_address" {
  description = "RDS instance address"
  value       = aws_db_instance.postgres.address
}

output "db_port" {
  description = "RDS instance port"
  value       = aws_db_instance.postgres.port
}

output "db_name" {
  description = "Database name"
  value       = aws_db_instance.postgres.db_name
}

output "db_username" {
  description = "Master username"
  value       = aws_db_instance.postgres.username
  sensitive   = true
}

output "security_group_id" {
  description = "RDS security group ID"
  value       = aws_security_group.rds.id
}

output "secret_arn" {
  description = "ARN of the secret containing database credentials"
  value       = aws_secretsmanager_secret.db_password.arn
}
|
||||
94
terraform/modules/rds/variables.tf
Normal file
94
terraform/modules/rds/variables.tf
Normal file
@@ -0,0 +1,94 @@
|
||||
# Input variables for the RDS module. Production-leaning defaults: Multi-AZ
# on, deletion protection on, final snapshot taken on destroy.
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "identifier" {
  description = "Identifier for RDS instance"
  type        = string
}

variable "engine_version" {
  description = "PostgreSQL engine version"
  type        = string
  default     = "15.3"
}

variable "instance_class" {
  description = "RDS instance class"
  type        = string
  default     = "db.t3.medium"
}

variable "allocated_storage" {
  description = "Allocated storage in GB"
  type        = number
  default     = 100
}

variable "max_allocated_storage" {
  description = "Maximum allocated storage for autoscaling in GB"
  type        = number
  default     = 500
}

variable "database_name" {
  description = "Name of the database"
  type        = string
}

variable "master_username" {
  description = "Master username"
  type        = string
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "database_subnet_ids" {
  description = "Database subnet IDs"
  type        = list(string)
}

variable "allowed_security_group_ids" {
  description = "Security group IDs allowed to access RDS"
  type        = list(string)
}

variable "multi_az" {
  description = "Enable Multi-AZ deployment"
  type        = bool
  default     = true
}

variable "backup_retention_period" {
  description = "Backup retention period in days"
  type        = number
  default     = 7
}

variable "backup_window" {
  description = "Preferred backup window"
  type        = string
  default     = "03:00-04:00"
}

variable "maintenance_window" {
  description = "Preferred maintenance window"
  type        = string
  default     = "mon:04:00-mon:05:00"
}

variable "deletion_protection" {
  description = "Enable deletion protection"
  type        = bool
  default     = true
}

variable "skip_final_snapshot" {
  description = "Skip final snapshot when destroying"
  type        = bool
  default     = false
}
|
||||
174
terraform/modules/redis/main.tf
Normal file
174
terraform/modules/redis/main.tf
Normal file
@@ -0,0 +1,174 @@
|
||||
# Subnet group placing the Redis nodes in the caller-supplied subnets.
resource "aws_elasticache_subnet_group" "main" {
  name       = "${var.cluster_id}-subnet-group"
  subnet_ids = var.subnet_ids

  tags = {
    Name        = "${var.cluster_id}-subnet-group"
    Environment = var.environment
  }
}

# Security group for Redis: port 6379 ingress only from the caller-supplied
# security groups (the EKS nodes); unrestricted egress.
resource "aws_security_group" "redis" {
  name        = "${var.cluster_id}-redis-sg"
  description = "Security group for ElastiCache Redis"
  vpc_id      = var.vpc_id

  ingress {
    from_port       = 6379
    to_port         = 6379
    protocol        = "tcp"
    security_groups = var.allowed_security_group_ids
    description     = "Redis access from EKS"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name        = "${var.cluster_id}-redis-sg"
    Environment = var.environment
  }
}
|
||||
|
||||
# AUTH token for Redis transit encryption. `special = false` keeps the token
# alphanumeric, which avoids the characters ElastiCache forbids in auth
# tokens ('/', '"', '@', space).
resource "random_password" "auth_token" {
  length  = 32
  special = false
}
|
||||
|
||||
# Redis replication group: encrypted at rest and in transit (AUTH token),
# automatic failover and Multi-AZ whenever more than one node is requested,
# snapshots retained per variable, slow-log shipped to CloudWatch as JSON.
resource "aws_elasticache_replication_group" "redis" {
  replication_group_id = var.cluster_id
  # `replication_group_description` was deprecated in AWS provider v4 and
  # removed in v5; `description` is the supported argument. The resource
  # already uses the v4+ `num_cache_clusters` naming, so the old argument
  # name would fail to validate on the provider versions this targets.
  description    = "Redis cluster for ${var.environment}"
  engine         = "redis"
  engine_version = var.engine_version
  node_type      = var.node_type
  num_cache_clusters = var.num_cache_nodes
  port           = 6379

  subnet_group_name  = aws_elasticache_subnet_group.main.name
  security_group_ids = [aws_security_group.redis.id]

  parameter_group_name = aws_elasticache_parameter_group.main.name

  at_rest_encryption_enabled = true
  transit_encryption_enabled = true
  auth_token                 = random_password.auth_token.result

  # Failover/Multi-AZ only make sense with at least one replica.
  automatic_failover_enabled = var.num_cache_nodes > 1 ? true : false
  multi_az_enabled           = var.num_cache_nodes > 1 ? true : false

  snapshot_retention_limit = var.snapshot_retention_limit
  snapshot_window          = var.snapshot_window
  maintenance_window       = var.maintenance_window

  # Defer risky changes to the maintenance window rather than applying live.
  auto_minor_version_upgrade = true
  apply_immediately          = false

  log_delivery_configuration {
    destination      = aws_cloudwatch_log_group.redis.name
    destination_type = "cloudwatch-logs"
    log_format       = "json"
    log_type         = "slow-log"
  }

  tags = {
    Name        = var.cluster_id
    Environment = var.environment
  }
}
|
||||
|
||||
# Redis 7 parameter group: evict least-recently-used keys under memory
# pressure (cache semantics) and close idle client connections after 300s.
resource "aws_elasticache_parameter_group" "main" {
  name   = "${var.cluster_id}-params"
  family = "redis7"

  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru"
  }

  parameter {
    name  = "timeout"
    value = "300"
  }

  tags = {
    Name        = "${var.cluster_id}-params"
    Environment = var.environment
  }
}
|
||||
|
||||
# Destination for the replication group's slow-log delivery; 7-day retention.
resource "aws_cloudwatch_log_group" "redis" {
  name              = "/aws/elasticache/${var.cluster_id}"
  retention_in_days = 7

  tags = {
    Name        = "${var.cluster_id}-logs"
    Environment = var.environment
  }
}

# Store auth token in Secrets Manager
# Token plus endpoint/port bundled so applications read one secret.
resource "aws_secretsmanager_secret" "redis_auth" {
  name        = "${var.cluster_id}-auth-token"
  description = "Redis authentication token"

  tags = {
    Name        = "${var.cluster_id}-auth-token"
    Environment = var.environment
  }
}

resource "aws_secretsmanager_secret_version" "redis_auth" {
  secret_id = aws_secretsmanager_secret.redis_auth.id
  secret_string = jsonencode({
    auth_token = random_password.auth_token.result
    endpoint   = aws_elasticache_replication_group.redis.primary_endpoint_address
    port       = 6379
  })
}
|
||||
|
||||
# CloudWatch Alarms
# NOTE(review): neither alarm sets `alarm_actions`, so they change state but
# notify no one — confirm whether an SNS topic should be wired in.
# Alarm when average CPU exceeds 75% for two consecutive 5-minute periods.
resource "aws_cloudwatch_metric_alarm" "cpu" {
  alarm_name          = "${var.cluster_id}-cpu-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "CPUUtilization"
  namespace           = "AWS/ElastiCache"
  period              = "300"
  statistic           = "Average"
  threshold           = "75"
  alarm_description   = "This metric monitors Redis CPU utilization"

  dimensions = {
    ReplicationGroupId = var.cluster_id
  }

  tags = {
    Name        = "${var.cluster_id}-cpu-alarm"
    Environment = var.environment
  }
}

# Alarm when dataset memory usage exceeds 90% (single 5-minute period).
resource "aws_cloudwatch_metric_alarm" "memory" {
  alarm_name          = "${var.cluster_id}-database-memory-usage"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "DatabaseMemoryUsagePercentage"
  namespace           = "AWS/ElastiCache"
  period              = "300"
  statistic           = "Average"
  threshold           = "90"
  alarm_description   = "This metric monitors Redis memory usage"

  dimensions = {
    ReplicationGroupId = var.cluster_id
  }

  tags = {
    Name        = "${var.cluster_id}-memory-alarm"
    Environment = var.environment
  }
}
|
||||
19
terraform/modules/redis/outputs.tf
Normal file
19
terraform/modules/redis/outputs.tf
Normal file
@@ -0,0 +1,19 @@
|
||||
# Outputs exposed by the Redis module: connection coordinates, the security
# group ID, and the ARN of the secret carrying the auth token.
output "redis_endpoint" {
  description = "Redis primary endpoint address"
  value       = aws_elasticache_replication_group.redis.primary_endpoint_address
}

output "redis_port" {
  description = "Redis port"
  value       = 6379
}

output "security_group_id" {
  description = "Redis security group ID"
  value       = aws_security_group.redis.id
}

output "secret_arn" {
  description = "ARN of the secret containing Redis auth token"
  value       = aws_secretsmanager_secret.redis_auth.arn
}
|
||||
60
terraform/modules/redis/variables.tf
Normal file
60
terraform/modules/redis/variables.tf
Normal file
@@ -0,0 +1,60 @@
|
||||
# Input variables for the Redis module. With the default num_cache_nodes = 1
# the replication group runs without failover/Multi-AZ (see main.tf).
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "cluster_id" {
  description = "ElastiCache cluster ID"
  type        = string
}

variable "engine_version" {
  description = "Redis engine version"
  type        = string
  default     = "7.0"
}

variable "node_type" {
  description = "ElastiCache node type"
  type        = string
  default     = "cache.t3.medium"
}

variable "num_cache_nodes" {
  description = "Number of cache nodes"
  type        = number
  default     = 1
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "subnet_ids" {
  description = "Subnet IDs for ElastiCache"
  type        = list(string)
}

variable "allowed_security_group_ids" {
  description = "Security group IDs allowed to access Redis"
  type        = list(string)
}

variable "snapshot_retention_limit" {
  description = "Number of days to retain automatic snapshots"
  type        = number
  default     = 5
}

variable "snapshot_window" {
  description = "Daily time range for snapshots"
  type        = string
  default     = "03:00-05:00"
}

variable "maintenance_window" {
  description = "Weekly time range for maintenance"
  type        = string
  default     = "sun:05:00-sun:07:00"
}
|
||||
213
terraform/modules/vpc/main.tf
Normal file
213
terraform/modules/vpc/main.tf
Normal file
@@ -0,0 +1,213 @@
|
||||
# VPC with three subnet tiers per AZ (public / private / database). The
# kubernetes.io/* tags let the AWS load-balancer controller discover subnets
# for the "${var.environment}-cluster" EKS cluster.
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name                                               = "${var.environment}-vpc"
    Environment                                        = var.environment
    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
  }
}

# Internet Gateway
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = {
    Name        = "${var.environment}-igw"
    Environment = var.environment
  }
}

# Public Subnets
# Host the ALB and NAT gateways; kubernetes.io/role/elb marks them for
# internet-facing load balancers.
resource "aws_subnet" "public" {
  count             = length(var.public_subnet_cidrs)
  vpc_id            = aws_vpc.main.id
  cidr_block        = var.public_subnet_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  map_public_ip_on_launch = true

  tags = {
    Name                                               = "${var.environment}-public-subnet-${count.index + 1}"
    Environment                                        = var.environment
    Type                                               = "public"
    "kubernetes.io/role/elb"                           = "1"
    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
  }
}

# Private Subnets
# Host the EKS nodes; kubernetes.io/role/internal-elb marks them for
# internal load balancers.
resource "aws_subnet" "private" {
  count             = length(var.private_subnet_cidrs)
  vpc_id            = aws_vpc.main.id
  cidr_block        = var.private_subnet_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = {
    Name                                               = "${var.environment}-private-subnet-${count.index + 1}"
    Environment                                        = var.environment
    Type                                               = "private"
    "kubernetes.io/role/internal-elb"                  = "1"
    "kubernetes.io/cluster/${var.environment}-cluster" = "shared"
  }
}

# Database Subnets
# Host RDS and ElastiCache.
resource "aws_subnet" "database" {
  count             = length(var.database_subnet_cidrs)
  vpc_id            = aws_vpc.main.id
  cidr_block        = var.database_subnet_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = {
    Name        = "${var.environment}-database-subnet-${count.index + 1}"
    Environment = var.environment
    Type        = "database"
  }
}
|
||||
|
||||
# Elastic IPs for NAT Gateways
# One EIP per AZ, consumed by the matching NAT gateway below.
resource "aws_eip" "nat" {
  count  = length(var.availability_zones)
  domain = "vpc"

  tags = {
    Name        = "${var.environment}-nat-eip-${count.index + 1}"
    Environment = var.environment
  }
}

# NAT Gateways
# One per AZ (placed in that AZ's public subnet) so private-subnet egress
# survives a single-AZ outage.
resource "aws_nat_gateway" "main" {
  count         = length(var.availability_zones)
  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = {
    Name        = "${var.environment}-nat-gateway-${count.index + 1}"
    Environment = var.environment
  }

  # The IGW must exist before a NAT gateway can route out.
  depends_on = [aws_internet_gateway.main]
}
|
||||
|
||||
# Public Route Table
# Single table for all public subnets: default route via the IGW.
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }

  tags = {
    Name        = "${var.environment}-public-rt"
    Environment = var.environment
  }
}

# Private Route Tables
# One table per AZ, each defaulting through that AZ's NAT gateway.
resource "aws_route_table" "private" {
  count  = length(var.availability_zones)
  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }

  tags = {
    Name        = "${var.environment}-private-rt-${count.index + 1}"
    Environment = var.environment
  }
}

# Route Table Associations - Public
resource "aws_route_table_association" "public" {
  count          = length(var.public_subnet_cidrs)
  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

# Route Table Associations - Private
resource "aws_route_table_association" "private" {
  count          = length(var.private_subnet_cidrs)
  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}

# Route Table Associations - Database
# NOTE(review): database subnets reuse the private route tables, which gives
# RDS/Redis outbound internet access via NAT — confirm this is intended
# rather than fully isolated database subnets.
resource "aws_route_table_association" "database" {
  count          = length(var.database_subnet_cidrs)
  subnet_id      = aws_subnet.database[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}
|
||||
|
||||
# VPC Flow Logs
# Capture ALL traffic (accepted + rejected) for the VPC into CloudWatch.
resource "aws_flow_log" "main" {
  iam_role_arn    = aws_iam_role.flow_log.arn
  log_destination = aws_cloudwatch_log_group.flow_log.arn
  traffic_type    = "ALL"
  vpc_id          = aws_vpc.main.id

  tags = {
    Name        = "${var.environment}-flow-log"
    Environment = var.environment
  }
}

# Destination log group; 30-day retention.
resource "aws_cloudwatch_log_group" "flow_log" {
  name              = "/aws/vpc/${var.environment}-flow-log"
  retention_in_days = 30

  tags = {
    Name        = "${var.environment}-flow-log"
    Environment = var.environment
  }
}

# Role assumed by the flow-logs service to write into CloudWatch.
resource "aws_iam_role" "flow_log" {
  name = "${var.environment}-vpc-flow-log-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "vpc-flow-logs.amazonaws.com"
        }
      }
    ]
  })

  tags = {
    Name        = "${var.environment}-flow-log-role"
    Environment = var.environment
  }
}

# Write permissions scoped to the flow-log log group's streams.
resource "aws_iam_role_policy" "flow_log" {
  name = "${var.environment}-vpc-flow-log-policy"
  role = aws_iam_role.flow_log.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
          "logs:DescribeLogGroups",
          "logs:DescribeLogStreams"
        ]
        Effect   = "Allow"
        Resource = "${aws_cloudwatch_log_group.flow_log.arn}:*"
      }
    ]
  })
}
|
||||
29
terraform/modules/vpc/outputs.tf
Normal file
29
terraform/modules/vpc/outputs.tf
Normal file
@@ -0,0 +1,29 @@
|
||||
# Outputs exposed by the VPC module: IDs of the VPC, all three subnet tiers,
# and the NAT gateways, for wiring into the EKS/RDS/Redis/ALB modules.
output "vpc_id" {
  description = "VPC ID"
  value       = aws_vpc.main.id
}

output "vpc_cidr" {
  description = "VPC CIDR block"
  value       = aws_vpc.main.cidr_block
}

output "public_subnet_ids" {
  description = "Public subnet IDs"
  value       = aws_subnet.public[*].id
}

output "private_subnet_ids" {
  description = "Private subnet IDs"
  value       = aws_subnet.private[*].id
}

output "database_subnet_ids" {
  description = "Database subnet IDs"
  value       = aws_subnet.database[*].id
}

output "nat_gateway_ids" {
  description = "NAT Gateway IDs"
  value       = aws_nat_gateway.main[*].id
}
|
||||
29
terraform/modules/vpc/variables.tf
Normal file
29
terraform/modules/vpc/variables.tf
Normal file
@@ -0,0 +1,29 @@
|
||||
# Input variables for the VPC module. The three subnet-CIDR lists are indexed
# against availability_zones, so each should have one entry per AZ.
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
}

variable "availability_zones" {
  description = "List of availability zones"
  type        = list(string)
}

variable "private_subnet_cidrs" {
  description = "CIDR blocks for private subnets"
  type        = list(string)
}

variable "public_subnet_cidrs" {
  description = "CIDR blocks for public subnets"
  type        = list(string)
}

variable "database_subnet_cidrs" {
  description = "CIDR blocks for database subnets"
  type        = list(string)
}
|
||||
61
terraform/outputs.tf
Normal file
61
terraform/outputs.tf
Normal file
@@ -0,0 +1,61 @@
|
||||
# terraform/outputs.tf
# Root-level outputs: re-export the key attributes of each child module so
# operators and downstream automation (CI/CD, DNS) can read them from the
# state. Endpoints for the datastores are marked sensitive so they are
# redacted from plan/apply console output.

# --- Networking -------------------------------------------------------------

output "vpc_id" {
  description = "VPC ID"
  value       = module.vpc.vpc_id
}

output "private_subnet_ids" {
  description = "Private subnet IDs"
  value       = module.vpc.private_subnet_ids
}

output "public_subnet_ids" {
  description = "Public subnet IDs"
  value       = module.vpc.public_subnet_ids
}

# --- EKS --------------------------------------------------------------------

output "eks_cluster_name" {
  description = "EKS cluster name"
  value       = module.eks.cluster_name
}

output "eks_cluster_endpoint" {
  description = "EKS cluster endpoint"
  value       = module.eks.cluster_endpoint
}

output "eks_cluster_security_group_id" {
  description = "EKS cluster security group ID"
  value       = module.eks.cluster_security_group_id
}

# --- Datastores (sensitive: hidden from console output) ---------------------

output "rds_endpoint" {
  description = "RDS database endpoint"
  value       = module.rds.db_endpoint
  sensitive   = true
}

output "rds_database_name" {
  description = "RDS database name"
  value       = module.rds.db_name
}

output "redis_endpoint" {
  description = "Redis cluster endpoint"
  value       = module.redis.redis_endpoint
  sensitive   = true
}

# --- Load balancer ----------------------------------------------------------

output "alb_dns_name" {
  description = "ALB DNS name"
  value       = module.alb.alb_dns_name
}

output "alb_zone_id" {
  description = "ALB zone ID"
  value       = module.alb.alb_zone_id
}

# --- Operator convenience ---------------------------------------------------

output "configure_kubectl" {
  description = "Command to configure kubectl"
  value       = "aws eks update-kubeconfig --name ${module.eks.cluster_name} --region ${var.aws_region}"
}
|
||||
132
terraform/variables.tf
Normal file
132
terraform/variables.tf
Normal file
@@ -0,0 +1,132 @@
|
||||
# terraform/variables.tf
# Root input variables for the spywatcher infrastructure. `environment` is
# the only required input; everything else carries a sensible production-ish
# default that environments override via tfvars.

variable "aws_region" {
  description = "AWS region to deploy resources"
  type        = string
  default     = "us-east-1"
}

variable "environment" {
  description = "Environment name (staging, production)"
  type        = string

  # Fix: the description promised "staging, production" but nothing enforced
  # it — a typo'd value (e.g. "prod") would silently provision a mis-named
  # stack. Enforce the documented contract at plan time.
  validation {
    condition     = contains(["staging", "production"], var.environment)
    error_message = "environment must be one of: staging, production."
  }
}

variable "project_name" {
  description = "Project name"
  type        = string
  default     = "spywatcher"
}

# VPC Variables
# NOTE(review): the three subnet CIDR lists are assumed to be index-aligned
# with availability_zones — confirm against the VPC module's count logic.
variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
  default     = "10.0.0.0/16"
}

variable "availability_zones" {
  description = "List of availability zones"
  type        = list(string)
  default     = ["us-east-1a", "us-east-1b", "us-east-1c"]
}

variable "private_subnet_cidrs" {
  description = "CIDR blocks for private subnets"
  type        = list(string)
  default     = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}

variable "public_subnet_cidrs" {
  description = "CIDR blocks for public subnets"
  type        = list(string)
  default     = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}

variable "database_subnet_cidrs" {
  description = "CIDR blocks for database subnets"
  type        = list(string)
  default     = ["10.0.201.0/24", "10.0.202.0/24", "10.0.203.0/24"]
}

# EKS Variables
variable "eks_cluster_version" {
  description = "Kubernetes version for EKS cluster"
  type        = string
  default     = "1.28"
}

variable "eks_node_groups" {
  description = "EKS node groups configuration"
  type = map(object({
    desired_size   = number
    min_size       = number
    max_size       = number
    instance_types = list(string)
    capacity_type  = string # "ON_DEMAND" or "SPOT"
  }))
  default = {
    general = {
      desired_size   = 3
      min_size       = 2
      max_size       = 10
      instance_types = ["t3.medium"]
      capacity_type  = "ON_DEMAND"
    }
  }
}

# RDS Variables
variable "rds_engine_version" {
  description = "PostgreSQL engine version"
  type        = string
  default     = "15.3"
}

variable "rds_instance_class" {
  description = "RDS instance class"
  type        = string
  default     = "db.t3.medium"
}

variable "rds_allocated_storage" {
  description = "Allocated storage in GB"
  type        = number
  default     = 100
}

variable "database_name" {
  description = "Name of the database"
  type        = string
  default     = "spywatcher"
}

variable "database_username" {
  description = "Database master username"
  type        = string
  default     = "spywatcher"
}

# Redis Variables
variable "redis_node_type" {
  description = "ElastiCache node type"
  type        = string
  default     = "cache.t3.medium"
}

variable "redis_num_cache_nodes" {
  description = "Number of cache nodes"
  type        = number
  default     = 1
}

# SSL Certificate
# Empty default means "no cert"; the ALB module is presumably expected to
# skip the HTTPS listener in that case — verify against the module.
variable "certificate_arn" {
  description = "ARN of SSL certificate for ALB"
  type        = string
  default     = ""
}

# Tags
variable "tags" {
  description = "Additional tags for resources"
  type        = map(string)
  default     = {}
}
|
||||
Reference in New Issue
Block a user