feat(release): add staging deploy workflow and smoke checks

Implements roadmap issue #29 foundation with a manual staging workflow, environment profile matrix, smoke automation script, and rollback/runbook documentation updates.

Also adds docs/phase5-6-implementation-plan.md to capture the dependency-ordered plan for phases 5 and 6 from roadmap #35.
This commit is contained in:
2026-02-27 07:47:43 -06:00
parent 116808b09a
commit 4aea88b4f8
7 changed files with 452 additions and 3 deletions

189
.github/workflows/staging-deploy.yml vendored Normal file
View File

@@ -0,0 +1,189 @@
---
name: Staging Deploy

on:
  workflow_dispatch:
    inputs:
      ref:
        description: Git ref (branch, tag, or SHA) to deploy
        required: false
        default: main
      api_host_port:
        description: Host port for API container mapping
        required: false
        default: '3001'
      openrouter_api_key:
        description: OpenRouter key for live profile runs (leave empty for mock profile)
        required: false
        default: ''

permissions:
  contents: read

# Only one staging deploy at a time; queued runs wait rather than cancel.
concurrency:
  group: staging-deploy
  cancel-in-progress: false

jobs:
  deploy-staging:
    name: Deploy staging (${{ matrix.profile }})
    runs-on: ubuntu-latest
    # Guard against hung compose builds or health waits eating runner minutes.
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        include:
          - profile: mock
            llm_mock: 'true'
          - profile: live
            llm_mock: 'false'
    env:
      API_HOST_PORT: ${{ github.event.inputs.api_host_port }}
      DEPLOY_PROFILE: ${{ matrix.profile }}
      LLM_MOCK: ${{ matrix.llm_mock }}
      OPENROUTER_API_KEY_INPUT: ${{ github.event.inputs.openrouter_api_key }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.ref }}

      - name: Emit deploy start status
        run: |
          # workflow_dispatch inputs are NOT masked automatically; register the
          # key with the log masker before any later step could echo it.
          if [[ -n "${OPENROUTER_API_KEY_INPUT}" ]]; then
            echo "::add-mask::${OPENROUTER_API_KEY_INPUT}"
          fi
          echo "::notice title=deploy_start::profile=${DEPLOY_PROFILE} sha=${GITHUB_SHA}"
          echo "deploy_started_at=$(date -u +%FT%TZ)" >> "$GITHUB_ENV"

      - name: Validate staging credential contract
        run: |
          # Live profile calls OpenRouter for real; fail fast when no key given.
          if [[ "${DEPLOY_PROFILE}" == "live" && -z "${OPENROUTER_API_KEY_INPUT}" ]]; then
            echo "openrouter_api_key input is required for live staging profile."
            exit 1
          fi

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: npm

      - name: Install dependencies
        run: npm ci

      - name: Prepare staging environment file
        run: |
          # Only the live profile receives the real key; mock runs get "".
          openrouter_key=""
          if [[ "${DEPLOY_PROFILE}" == "live" ]]; then
            openrouter_key="${OPENROUTER_API_KEY_INPUT}"
          fi
          cp .env.example .env
          {
            echo "API_HOST_PORT=${API_HOST_PORT}"
            echo "LLM_MOCK=${LLM_MOCK}"
            echo "OPENROUTER_API_KEY=${openrouter_key}"
            echo "LOG_LEVEL=info"
            echo "TTS_PROVIDER=noop"
            echo "BROADCAST_PROVIDER=noop"
          } >> .env

      - name: Deploy compose stack
        run: docker compose up -d --build

      - name: Wait for API health
        run: |
          rm -f smoke-health.json
          # Up to 30 attempts x 2s = 60s for the stack to become healthy.
          for _attempt in {1..30}; do
            if curl -fsS "http://localhost:${API_HOST_PORT}/api/health" > smoke-health.json; then
              cat smoke-health.json
              exit 0
            fi
            sleep 2
          done
          # A failed curl still truncates the redirect target; remove the file
          # so the artifact does not contain a misleading empty health payload.
          rm -f smoke-health.json
          echo "API health check did not pass within timeout."
          exit 1

      - name: Run staging smoke checks
        run: |
          chmod +x scripts/staging-smoke.sh
          API_BASE_URL="http://localhost:${API_HOST_PORT}" \
          SMOKE_OUTPUT_PATH="smoke-results.json" \
          scripts/staging-smoke.sh

      - name: Capture deploy metadata
        if: ${{ success() }}
        run: |
          deploy_finished_at="$(date -u +%FT%TZ)"
          cat <<JSON > deploy-metadata.json
          {
            "workflow": "${GITHUB_WORKFLOW}",
            "runId": "${GITHUB_RUN_ID}",
            "runAttempt": "${GITHUB_RUN_ATTEMPT}",
            "profile": "${DEPLOY_PROFILE}",
            "ref": "${GITHUB_REF}",
            "sha": "${GITHUB_SHA}",
            "deployedAt": "${deploy_started_at:-unknown}",
            "finishedAt": "${deploy_finished_at}",
            "status": "success"
          }
          JSON
          cat deploy-metadata.json

      - name: Ensure deploy metadata exists on failure
        if: ${{ failure() }}
        run: |
          # deploy_started_at is unset if the very first step failed, so give
          # it an explicit fallback rather than emitting an empty string.
          if [[ ! -f deploy-metadata.json ]]; then
            deploy_finished_at="$(date -u +%FT%TZ)"
            cat <<JSON > deploy-metadata.json
          {
            "workflow": "${GITHUB_WORKFLOW}",
            "runId": "${GITHUB_RUN_ID}",
            "runAttempt": "${GITHUB_RUN_ATTEMPT}",
            "profile": "${DEPLOY_PROFILE}",
            "ref": "${GITHUB_REF}",
            "sha": "${GITHUB_SHA}",
            "deployedAt": "${deploy_started_at:-unknown}",
            "finishedAt": "${deploy_finished_at}",
            "status": "failure"
          }
          JSON
          fi

      - name: Emit deploy end status (success)
        if: ${{ success() }}
        run: |
          echo "::notice title=deploy_end::profile=${DEPLOY_PROFILE} status=success sha=${GITHUB_SHA}"

      - name: Emit deploy end status (failure)
        if: ${{ failure() }}
        run: |
          echo "::error title=deploy_end::profile=${DEPLOY_PROFILE} status=failure sha=${GITHUB_SHA}"

      - name: Capture compose logs
        if: ${{ always() }}
        run: docker compose logs --no-color > docker-compose.log || true

      - name: Tear down compose stack
        if: ${{ always() }}
        run: docker compose down -v || true

      - name: Upload deployment artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: staging-${{ matrix.profile }}-${{ github.run_id }}
          path: |
            smoke-health.json
            smoke-results.json
            deploy-metadata.json
            docker-compose.log

      - name: Write job summary
        if: ${{ always() }}
        run: |
          echo "### Staging deploy (${DEPLOY_PROFILE})" >> "$GITHUB_STEP_SUMMARY"
          echo "- Ref: \`${GITHUB_REF}\`" >> "$GITHUB_STEP_SUMMARY"
          echo "- SHA: \`${GITHUB_SHA}\`" >> "$GITHUB_STEP_SUMMARY"
          # The smoke script writes its result artifact only after all checks
          # pass, so file existence is the pass/fail proxy here.
          if [[ -f smoke-results.json ]]; then
            echo "- Smoke checks: ✅ passed" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "- Smoke checks: ❌ failed" >> "$GITHUB_STEP_SUMMARY"
          fi

View File

@@ -10,7 +10,7 @@ SHELL := /usr/bin/env bash
NPM ?= npm
DOCKER_COMPOSE ?= docker compose
.PHONY: help install dev dev-dashboard lint build build-dashboard test test-spec ci start migrate migrate-dist docker-up docker-down docker-restart clean status
.PHONY: help install dev dev-dashboard lint build build-dashboard test test-spec ci smoke-staging start migrate migrate-dist docker-up docker-down docker-restart clean status
help: ## Show available commands
@awk 'BEGIN {FS = ":.*##"; printf "\nImprov Court Make targets:\n\n"} /^[a-zA-Z0-9_.-]+:.*##/ { printf " %-18s %s\n", $$1, $$2 } END { printf "\n" }' $(MAKEFILE_LIST)
@@ -44,6 +44,9 @@ ci: ## Run local CI parity checks (lint + build + test)
$(MAKE) build
$(MAKE) test
smoke-staging: ## Run staging smoke checks against a running API instance
$(NPM) run smoke:staging
start: ## Run compiled app from dist/
$(NPM) run start

View File

@@ -15,6 +15,7 @@ It does **not** depend on `subcult-corp` at runtime.
| [docs/operator-runbook.md](docs/operator-runbook.md) | Setup, configuration, deployment, and monitoring |
| [docs/moderation-playbook.md](docs/moderation-playbook.md) | Content moderation system and incident procedures |
| [docs/event-taxonomy.md](docs/event-taxonomy.md) | Canonical event taxonomy, payload schemas, and logging guidelines |
| [docs/phase5-6-implementation-plan.md](docs/phase5-6-implementation-plan.md) | Dependency-ordered implementation plan for roadmap phases 5 and 6 |
## What is implemented
@@ -125,7 +126,9 @@ If you need host access to Postgres, add a `ports` mapping to the `db` service i
## Operations runbook (staging)
See `docs/ops-runbook.md` for the repeatable staging deploy path, core SLI dashboard definitions, alert thresholds, and incident drill/recovery steps.
See `docs/ops-runbook.md` for the repeatable staging deploy path, GitHub Actions
workflow (`Staging Deploy`), core SLI dashboard definitions, alert thresholds,
and incident drill/recovery steps.
## API

View File

@@ -16,13 +16,42 @@ Run from the project root directory:
4. Optional migration-only verification:
- `docker compose exec api npm run migrate:dist`
Rollback (staging):
### 1.1 GitHub Actions staging workflow
Use workflow **`Staging Deploy`** (`.github/workflows/staging-deploy.yml`) to
run repeatable staging deploy + smoke verification with an environment matrix:
- `mock` profile (`LLM_MOCK=true`, no OpenRouter key required)
- `live` profile (`LLM_MOCK=false`, requires `openrouter_api_key` workflow input)
Workflow smoke checks:
1. `GET /api/health`
2. `POST /api/court/sessions`
3. `GET /api/court/sessions/:id`
Artifacts captured per run:
- `smoke-health.json`
- `smoke-results.json`
- `deploy-metadata.json`
- `docker-compose.log`
### 1.2 Rollback (staging)
1. Stop current stack: `npm run docker:down`
2. Checkout previous known-good commit/tag.
3. Start previous version: `npm run docker:up`
4. Re-run health check curl above.
Rollback trial checklist (verify once per release candidate):
- [ ] Deploy a known good revision via `Staging Deploy`.
- [ ] Deploy a deliberately broken revision (or force failed smoke input).
- [ ] Roll back to previous good revision.
- [ ] Confirm smoke checks pass and artifact logs show healthy recovery.
- [ ] Record run ID, operator, and timestamp in incident notes.
## 2) Core SLI dashboard definitions
Use these as dashboard panels (SQL via Postgres + synthetic HTTP check):

View File

@@ -0,0 +1,178 @@
# Phase 5 & 6 Implementation Plan (Roadmap #35)
Date: 2026-02-27
## Objective
Deliver all open roadmap work in **Phase 5 (Release & Operations)** and **Phase 6 (Post-launch Polish)** in dependency order, with production-grade quality gates and operational readiness.
## Dependency order
```text
#29 -> #30 -> #31 -> (#33 + #34) -> #32
```
- `#29` blocks `#30`, `#31`, and `#32`
- `#30` blocks `#31` and `#33`
- `#31` blocks `#34` and `#32`
- `#33` blocks `#32`
## Phase 5 — Release & operations
### #29 Release: Staging deployment workflow + env matrix + smoke checks
#### Deliverables
- GitHub Actions workflow for staging deployment (manual trigger).
- Environment matrix (`mock`, `live`) with credential contract enforcement.
- Post-deploy smoke checks for:
- `GET /api/health`
- `POST /api/court/sessions` (bootstrap path)
- Deployment metadata capture (start/end status + revision context).
- Rollback guidance + trial checklist in ops docs.
#### File targets
- `.github/workflows/staging-deploy.yml`
- `scripts/staging-smoke.sh`
- `docs/ops-runbook.md`
- `README.md`
- `package.json`
#### Verification
- Workflow logs contain smoke check output.
- Artifact contains deploy metadata and compose logs.
- Rollback trial checklist completed in docs.
---
### #30 Runtime dashboards and alerts for session health/moderation
#### Deliverables
- Dashboard definitions for core SLIs/SLO proxies:
- session completion
- vote latency
- moderation actions
- stream/API health
- Alert threshold configurations with runbook links.
- Synthetic alert validation instructions/tests.
#### File targets
- `ops/dashboards/*`
- `ops/alerts/*`
- `docs/ops-runbook.md`
- `README.md`
#### Verification
- Simulated failure conditions trigger expected alert payloads.
- Dashboard queries align with event taxonomy (`docs/event-taxonomy.md`).
---
### #31 Operator runbook: live controls + mistrial + incident response
#### Deliverables
- Expanded runbook covering startup, live operation, and shutdown.
- Incident section with at least 5 common failure scenarios.
- Mistrial fallback, emergency recap, and witness-swap procedures.
- Dashboard/alert panel references embedded into procedures.
#### File targets
- `docs/operator-runbook.md`
- `README.md`
#### Verification
- Tabletop drill notes captured and missing steps patched.
## Phase 6 — Post-launch polish
### #33 Token budget and summary cadence controls
#### Deliverables
- Runtime knobs for per-role token caps and recap cadence controls.
- Safe defaults balancing quality and cost.
- Session-level cost-estimate telemetry.
- New telemetry events:
- `token_budget_applied`
- session token estimate event
#### File targets
- `src/court/orchestrator.ts`
- `src/court/witness-caps.ts` (or dedicated budget module)
- `src/types.ts`
- `src/events.ts`
- `dashboard/src/components/Analytics.tsx`
- `docs/api.md`
- `docs/event-taxonomy.md`
#### Verification
- Unit tests for budget enforcement.
- Integration test: phase completion remains intact under stricter caps.
---
### #34 Onboarding/catch-up panel for new viewers
#### Deliverables
- Compact viewer-facing catch-up panel:
- “case so far” summary
- current phase/jury step status
- Refresh on phase transitions.
- Toggle without layout breakage.
- Aggregate-only telemetry for toggle visibility usage.
#### File targets
- `public/index.html`
- `public/app.js`
- `docs/operator-runbook.md`
#### Verification
- Component/behavior tests for panel rendering and toggle.
- Integration test for phase-change refresh behavior.
---
### #32 Post-launch retrospective template + technical debt queue
#### Deliverables
- Reusable retrospective template.
- Structured debt intake queue format with triage rubric (P0-P3 + effort).
- First filled example draft from mock incident.
#### File targets
- `docs/templates/retrospective-template.md`
- `docs/templates/technical-debt-queue.md`
- `README.md`
#### Verification
- Trial retrospective run confirms template usability.
## PR slicing strategy
1. PR-A: `#29` staging workflow + smoke + rollback docs
2. PR-B: `#30` dashboards + alerts + alert simulation checks
3. PR-C: `#31` operator runbook expansion + drill checklist
4. PR-D: `#33` token budget/cadence controls + telemetry + tests
5. PR-E: `#34` onboarding/catch-up panel + telemetry + tests
6. PR-F: `#32` retrospective/debt templates + example draft
## Current execution status
- Plan documented ✅
- Implementation started with `#29`

View File

@@ -10,6 +10,7 @@
"build": "tsc -p tsconfig.json && vite build",
"build:dashboard": "vite build",
"test": "node --import tsx --test src/*.test.ts src/**/*.test.ts",
"smoke:staging": "bash ./scripts/staging-smoke.sh",
"start": "node dist/server.js",
"migrate": "tsx src/scripts/migrate.ts",
"migrate:dist": "node dist/scripts/migrate.js",

46
scripts/staging-smoke.sh Normal file
View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Staging smoke checks: verify the health endpoint, session bootstrap, and
# session lookup against a running API instance, then write a JSON result
# artifact describing the run.
#
# Environment (all optional):
#   API_BASE_URL       API base URL (default: http://localhost:${API_HOST_PORT:-3001})
#   SMOKE_TOPIC        case topic for the bootstrap request
#   SMOKE_CASE_TYPE    case type for the bootstrap request (default: criminal)
#   SMOKE_OUTPUT_PATH  path for the JSON result artifact (default: smoke-results.json)
set -euo pipefail

API_BASE_URL="${API_BASE_URL:-http://localhost:${API_HOST_PORT:-3001}}"
SMOKE_TOPIC="${SMOKE_TOPIC:-Did the defendant weaponize office glitter in the break room?}"
SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE:-criminal}"
SMOKE_OUTPUT_PATH="${SMOKE_OUTPUT_PATH:-smoke-results.json}"

echo "[smoke] starting checks for ${API_BASE_URL}"

# Check 1: health endpoint responds.
health_response="$(curl -fsS "${API_BASE_URL}/api/health")"
echo "[smoke] /api/health response: ${health_response}"

# Build the bootstrap body with node so topic/caseType are JSON-escaped.
request_body="$(
  SMOKE_TOPIC="${SMOKE_TOPIC}" SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE}" node -e "const topic = process.env.SMOKE_TOPIC; const caseType = process.env.SMOKE_CASE_TYPE || 'criminal'; process.stdout.write(JSON.stringify({ topic, caseType }));"
)"

# Check 2: session bootstrap.
session_response="$(
  curl -fsS -X POST "${API_BASE_URL}/api/court/sessions" \
    -H 'Content-Type: application/json' \
    -d "${request_body}"
)"
echo "[smoke] POST /api/court/sessions response: ${session_response}"

# Parse the session id from the bootstrap response. Under `set -e`, a failing
# command substitution in a plain assignment aborts the script before any
# follow-up error message can print, so guard the assignment in the condition.
if ! session_id="$(
  printf '%s' "${session_response}" | node -e "let input=''; process.stdin.on('data', chunk => input += chunk); process.stdin.on('end', () => { try { const parsed = JSON.parse(input); const id = parsed?.session?.id; if (!id) process.exit(1); process.stdout.write(id); } catch { process.exit(1); } });"
)" || [[ -z "${session_id}" ]]; then
  echo "[smoke] failed to parse session id from bootstrap response"
  exit 1
fi

# Check 3: the freshly created session can be fetched back.
session_lookup="$(curl -fsS "${API_BASE_URL}/api/court/sessions/${session_id}")"
echo "[smoke] GET /api/court/sessions/${session_id} response: ${session_lookup}"

# Write the result artifact via node (env vars avoid shell-quoting hazards and
# guarantee valid JSON even when responses contain quotes or newlines).
SMOKE_OUTPUT_PATH="${SMOKE_OUTPUT_PATH}" \
SMOKE_TOPIC="${SMOKE_TOPIC}" \
SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE}" \
API_BASE_URL="${API_BASE_URL}" \
SESSION_ID="${session_id}" \
HEALTH_RESPONSE="${health_response}" \
node -e "const fs = require('node:fs'); const outputPath = process.env.SMOKE_OUTPUT_PATH; const payload = { checkedAt: new Date().toISOString(), apiBaseUrl: process.env.API_BASE_URL, topic: process.env.SMOKE_TOPIC, caseType: process.env.SMOKE_CASE_TYPE, sessionId: process.env.SESSION_ID, health: process.env.HEALTH_RESPONSE }; fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));"

echo "[smoke] checks passed (session=${session_id})"
echo "[smoke] result artifact written to ${SMOKE_OUTPUT_PATH}"