feat(release): add staging deploy workflow and smoke checks

Implements roadmap issue #29 foundation with a manual staging workflow, environment profile matrix, smoke automation script, and rollback/runbook documentation updates.

Also adds docs/phase5-6-implementation-plan.md to capture the dependency-ordered plan for phases 5 and 6 from roadmap #35.
This commit is contained in:
2026-02-27 07:47:43 -06:00
parent 116808b09a
commit 4aea88b4f8
7 changed files with 452 additions and 3 deletions

189
.github/workflows/staging-deploy.yml vendored Normal file
View File

@@ -0,0 +1,189 @@
---
name: Staging Deploy

on:
  workflow_dispatch:
    inputs:
      ref:
        description: Git ref (branch, tag, or SHA) to deploy
        required: false
        default: main
      api_host_port:
        description: Host port for API container mapping
        required: false
        default: '3001'
      openrouter_api_key:
        description: OpenRouter key for live profile runs (leave empty for mock profile)
        required: false
        default: ''

permissions:
  contents: read

# Only one staging deploy at a time; queued runs wait rather than cancel.
concurrency:
  group: staging-deploy
  cancel-in-progress: false

jobs:
  deploy-staging:
    name: Deploy staging (${{ matrix.profile }})
    runs-on: ubuntu-latest
    # Guard against hung compose builds or health waits eating runner minutes.
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        include:
          - profile: mock
            llm_mock: 'true'
          - profile: live
            llm_mock: 'false'
    env:
      API_HOST_PORT: ${{ github.event.inputs.api_host_port }}
      DEPLOY_PROFILE: ${{ matrix.profile }}
      LLM_MOCK: ${{ matrix.llm_mock }}
      OPENROUTER_API_KEY_INPUT: ${{ github.event.inputs.openrouter_api_key }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.ref }}

      - name: Emit deploy start status
        run: |
          # workflow_dispatch inputs are NOT masked automatically; register the
          # key with the log masker before any later step could echo it.
          if [[ -n "${OPENROUTER_API_KEY_INPUT}" ]]; then
            echo "::add-mask::${OPENROUTER_API_KEY_INPUT}"
          fi
          echo "::notice title=deploy_start::profile=${DEPLOY_PROFILE} sha=${GITHUB_SHA}"
          echo "deploy_started_at=$(date -u +%FT%TZ)" >> "$GITHUB_ENV"

      - name: Validate staging credential contract
        run: |
          # Live profile calls OpenRouter for real; fail fast when no key given.
          if [[ "${DEPLOY_PROFILE}" == "live" && -z "${OPENROUTER_API_KEY_INPUT}" ]]; then
            echo "openrouter_api_key input is required for live staging profile."
            exit 1
          fi

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: npm

      - name: Install dependencies
        run: npm ci

      - name: Prepare staging environment file
        run: |
          # Only the live profile receives the real key; mock runs get "".
          openrouter_key=""
          if [[ "${DEPLOY_PROFILE}" == "live" ]]; then
            openrouter_key="${OPENROUTER_API_KEY_INPUT}"
          fi
          cp .env.example .env
          {
            echo "API_HOST_PORT=${API_HOST_PORT}"
            echo "LLM_MOCK=${LLM_MOCK}"
            echo "OPENROUTER_API_KEY=${openrouter_key}"
            echo "LOG_LEVEL=info"
            echo "TTS_PROVIDER=noop"
            echo "BROADCAST_PROVIDER=noop"
          } >> .env

      - name: Deploy compose stack
        run: docker compose up -d --build

      - name: Wait for API health
        run: |
          rm -f smoke-health.json
          # Up to 30 attempts x 2s = 60s for the stack to become healthy.
          for _attempt in {1..30}; do
            if curl -fsS "http://localhost:${API_HOST_PORT}/api/health" > smoke-health.json; then
              cat smoke-health.json
              exit 0
            fi
            sleep 2
          done
          # A failed curl still truncates the redirect target; remove the file
          # so the artifact does not contain a misleading empty health payload.
          rm -f smoke-health.json
          echo "API health check did not pass within timeout."
          exit 1

      - name: Run staging smoke checks
        run: |
          chmod +x scripts/staging-smoke.sh
          API_BASE_URL="http://localhost:${API_HOST_PORT}" \
          SMOKE_OUTPUT_PATH="smoke-results.json" \
          scripts/staging-smoke.sh

      - name: Capture deploy metadata
        if: ${{ success() }}
        run: |
          deploy_finished_at="$(date -u +%FT%TZ)"
          cat <<JSON > deploy-metadata.json
          {
            "workflow": "${GITHUB_WORKFLOW}",
            "runId": "${GITHUB_RUN_ID}",
            "runAttempt": "${GITHUB_RUN_ATTEMPT}",
            "profile": "${DEPLOY_PROFILE}",
            "ref": "${GITHUB_REF}",
            "sha": "${GITHUB_SHA}",
            "deployedAt": "${deploy_started_at:-unknown}",
            "finishedAt": "${deploy_finished_at}",
            "status": "success"
          }
          JSON
          cat deploy-metadata.json

      - name: Ensure deploy metadata exists on failure
        if: ${{ failure() }}
        run: |
          # deploy_started_at is unset if the very first step failed, so give
          # it an explicit fallback rather than emitting an empty string.
          if [[ ! -f deploy-metadata.json ]]; then
            deploy_finished_at="$(date -u +%FT%TZ)"
            cat <<JSON > deploy-metadata.json
          {
            "workflow": "${GITHUB_WORKFLOW}",
            "runId": "${GITHUB_RUN_ID}",
            "runAttempt": "${GITHUB_RUN_ATTEMPT}",
            "profile": "${DEPLOY_PROFILE}",
            "ref": "${GITHUB_REF}",
            "sha": "${GITHUB_SHA}",
            "deployedAt": "${deploy_started_at:-unknown}",
            "finishedAt": "${deploy_finished_at}",
            "status": "failure"
          }
          JSON
          fi

      - name: Emit deploy end status (success)
        if: ${{ success() }}
        run: |
          echo "::notice title=deploy_end::profile=${DEPLOY_PROFILE} status=success sha=${GITHUB_SHA}"

      - name: Emit deploy end status (failure)
        if: ${{ failure() }}
        run: |
          echo "::error title=deploy_end::profile=${DEPLOY_PROFILE} status=failure sha=${GITHUB_SHA}"

      - name: Capture compose logs
        if: ${{ always() }}
        run: docker compose logs --no-color > docker-compose.log || true

      - name: Tear down compose stack
        if: ${{ always() }}
        run: docker compose down -v || true

      - name: Upload deployment artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: staging-${{ matrix.profile }}-${{ github.run_id }}
          path: |
            smoke-health.json
            smoke-results.json
            deploy-metadata.json
            docker-compose.log

      - name: Write job summary
        if: ${{ always() }}
        run: |
          echo "### Staging deploy (${DEPLOY_PROFILE})" >> "$GITHUB_STEP_SUMMARY"
          echo "- Ref: \`${GITHUB_REF}\`" >> "$GITHUB_STEP_SUMMARY"
          echo "- SHA: \`${GITHUB_SHA}\`" >> "$GITHUB_STEP_SUMMARY"
          # The smoke script writes its result artifact only after all checks
          # pass, so file existence is the pass/fail proxy here.
          if [[ -f smoke-results.json ]]; then
            echo "- Smoke checks: ✅ passed" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "- Smoke checks: ❌ failed" >> "$GITHUB_STEP_SUMMARY"
          fi

View File

@@ -10,7 +10,7 @@ SHELL := /usr/bin/env bash
NPM ?= npm
DOCKER_COMPOSE ?= docker compose
.PHONY: help install dev dev-dashboard lint build build-dashboard test test-spec ci start migrate migrate-dist docker-up docker-down docker-restart clean status
.PHONY: help install dev dev-dashboard lint build build-dashboard test test-spec ci smoke-staging start migrate migrate-dist docker-up docker-down docker-restart clean status
help: ## Show available commands
@awk 'BEGIN {FS = ":.*##"; printf "\nImprov Court Make targets:\n\n"} /^[a-zA-Z0-9_.-]+:.*##/ { printf " %-18s %s\n", $$1, $$2 } END { printf "\n" }' $(MAKEFILE_LIST)
@@ -44,6 +44,9 @@ ci: ## Run local CI parity checks (lint + build + test)
$(MAKE) build
$(MAKE) test
smoke-staging: ## Run staging smoke checks against a running API instance
$(NPM) run smoke:staging
start: ## Run compiled app from dist/
$(NPM) run start

View File

@@ -15,6 +15,7 @@ It does **not** depend on `subcult-corp` at runtime.
| [docs/operator-runbook.md](docs/operator-runbook.md) | Setup, configuration, deployment, and monitoring |
| [docs/moderation-playbook.md](docs/moderation-playbook.md) | Content moderation system and incident procedures |
| [docs/event-taxonomy.md](docs/event-taxonomy.md) | Canonical event taxonomy, payload schemas, and logging guidelines |
| [docs/phase5-6-implementation-plan.md](docs/phase5-6-implementation-plan.md) | Dependency-ordered implementation plan for roadmap phases 5 and 6 |
## What is implemented
@@ -125,7 +126,9 @@ If you need host access to Postgres, add a `ports` mapping to the `db` service i
## Operations runbook (staging)
See `docs/ops-runbook.md` for the repeatable staging deploy path, core SLI dashboard definitions, alert thresholds, and incident drill/recovery steps.
See `docs/ops-runbook.md` for the repeatable staging deploy path, GitHub Actions
workflow (`Staging Deploy`), core SLI dashboard definitions, alert thresholds,
and incident drill/recovery steps.
## API

View File

@@ -16,13 +16,42 @@ Run from the project root directory:
4. Optional migration-only verification:
- `docker compose exec api npm run migrate:dist`
Rollback (staging):
### 1.1 GitHub Actions staging workflow
Use workflow **`Staging Deploy`** (`.github/workflows/staging-deploy.yml`) to
run repeatable staging deploy + smoke verification with an environment matrix:
- `mock` profile (`LLM_MOCK=true`, no OpenRouter key required)
- `live` profile (`LLM_MOCK=false`, requires `openrouter_api_key` workflow input)
Workflow smoke checks:
1. `GET /api/health`
2. `POST /api/court/sessions`
3. `GET /api/court/sessions/:id`
Artifacts captured per run:
- `smoke-health.json`
- `smoke-results.json`
- `deploy-metadata.json`
- `docker-compose.log`
### 1.2 Rollback (staging)
1. Stop current stack: `npm run docker:down`
2. Checkout previous known-good commit/tag.
3. Start previous version: `npm run docker:up`
4. Re-run health check curl above.
Rollback trial checklist (verify once per release candidate):
- [ ] Deploy a known good revision via `Staging Deploy`.
- [ ] Deploy a deliberately broken revision (or force failed smoke input).
- [ ] Roll back to previous good revision.
- [ ] Confirm smoke checks pass and artifact logs show healthy recovery.
- [ ] Record run ID, operator, and timestamp in incident notes.
## 2) Core SLI dashboard definitions
Use these as dashboard panels (SQL via Postgres + synthetic HTTP check):

View File

@@ -0,0 +1,178 @@
# Phase 5 & 6 Implementation Plan (Roadmap #35)
Date: 2026-02-27
## Objective
Deliver all open roadmap work in **Phase 5 (Release & Operations)** and **Phase 6 (Post-launch Polish)** in dependency order, with production-grade quality gates and operational readiness.
## Dependency order
```text
#29 -> #30 -> #31 -> (#33 + #34) -> #32
```
- `#29` blocks `#30`, `#31`, and `#32`
- `#30` blocks `#31` and `#33`
- `#31` blocks `#34` and `#32`
- `#33` blocks `#32`
## Phase 5 — Release & operations
### #29 Release: Staging deployment workflow + env matrix + smoke checks
#### Deliverables
- GitHub Actions workflow for staging deployment (manual trigger).
- Environment matrix (`mock`, `live`) with credential contract enforcement.
- Post-deploy smoke checks for:
- `GET /api/health`
- `POST /api/court/sessions` (bootstrap path)
- Deployment metadata capture (start/end status + revision context).
- Rollback guidance + trial checklist in ops docs.
#### File targets
- `.github/workflows/staging-deploy.yml`
- `scripts/staging-smoke.sh`
- `docs/ops-runbook.md`
- `README.md`
- `package.json`
#### Verification
- Workflow logs contain smoke check output.
- Artifact contains deploy metadata and compose logs.
- Rollback trial checklist completed in docs.
---
### #30 Runtime dashboards and alerts for session health/moderation
#### Deliverables
- Dashboard definitions for core SLIs/SLO proxies:
- session completion
- vote latency
- moderation actions
- stream/API health
- Alert threshold configurations with runbook links.
- Synthetic alert validation instructions/tests.
#### File targets
- `ops/dashboards/*`
- `ops/alerts/*`
- `docs/ops-runbook.md`
- `README.md`
#### Verification
- Simulated failure conditions trigger expected alert payloads.
- Dashboard queries align with event taxonomy (`docs/event-taxonomy.md`).
---
### #31 Operator runbook: live controls + mistrial + incident response
#### Deliverables
- Expanded runbook covering startup, live operation, and shutdown.
- Incident section with at least 5 common failure scenarios.
- Mistrial fallback, emergency recap, and witness-swap procedures.
- Dashboard/alert panel references embedded into procedures.
#### File targets
- `docs/operator-runbook.md`
- `README.md`
#### Verification
- Tabletop drill notes captured and missing steps patched.
## Phase 6 — Post-launch polish
### #33 Token budget and summary cadence controls
#### Deliverables
- Runtime knobs for per-role token caps and recap cadence controls.
- Safe defaults balancing quality and cost.
- Session-level cost-estimate telemetry.
- New telemetry events:
- `token_budget_applied`
- session token estimate event
#### File targets
- `src/court/orchestrator.ts`
- `src/court/witness-caps.ts` (or dedicated budget module)
- `src/types.ts`
- `src/events.ts`
- `dashboard/src/components/Analytics.tsx`
- `docs/api.md`
- `docs/event-taxonomy.md`
#### Verification
- Unit tests for budget enforcement.
- Integration test: phase completion remains intact under stricter caps.
---
### #34 Onboarding/catch-up panel for new viewers
#### Deliverables
- Compact viewer-facing catch-up panel:
- “case so far” summary
- current phase/jury step status
- Refresh on phase transitions.
- Toggle without layout breakage.
- Aggregate-only telemetry for toggle visibility usage.
#### File targets
- `public/index.html`
- `public/app.js`
- `docs/operator-runbook.md`
#### Verification
- Component/behavior tests for panel rendering and toggle.
- Integration test for phase-change refresh behavior.
---
### #32 Post-launch retrospective template + technical debt queue
#### Deliverables
- Reusable retrospective template.
- Structured debt intake queue format with triage rubric (P0-P3 + effort).
- First filled example draft from mock incident.
#### File targets
- `docs/templates/retrospective-template.md`
- `docs/templates/technical-debt-queue.md`
- `README.md`
#### Verification
- Trial retrospective run confirms template usability.
## PR slicing strategy
1. PR-A: `#29` staging workflow + smoke + rollback docs
2. PR-B: `#30` dashboards + alerts + alert simulation checks
3. PR-C: `#31` operator runbook expansion + drill checklist
4. PR-D: `#33` token budget/cadence controls + telemetry + tests
5. PR-E: `#34` onboarding/catch-up panel + telemetry + tests
6. PR-F: `#32` retrospective/debt templates + example draft
## Current execution status
- Plan documented ✅
- Implementation started with `#29`

View File

@@ -10,6 +10,7 @@
"build": "tsc -p tsconfig.json && vite build",
"build:dashboard": "vite build",
"test": "node --import tsx --test src/*.test.ts src/**/*.test.ts",
"smoke:staging": "bash ./scripts/staging-smoke.sh",
"start": "node dist/server.js",
"migrate": "tsx src/scripts/migrate.ts",
"migrate:dist": "node dist/scripts/migrate.js",

46
scripts/staging-smoke.sh Normal file
View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Staging smoke checks: verify the health endpoint, session bootstrap, and
# session lookup against a running API instance, then write a JSON result
# artifact describing the run.
#
# Environment (all optional):
#   API_BASE_URL       API base URL (default: http://localhost:${API_HOST_PORT:-3001})
#   SMOKE_TOPIC        case topic for the bootstrap request
#   SMOKE_CASE_TYPE    case type for the bootstrap request (default: criminal)
#   SMOKE_OUTPUT_PATH  path for the JSON result artifact (default: smoke-results.json)
set -euo pipefail

API_BASE_URL="${API_BASE_URL:-http://localhost:${API_HOST_PORT:-3001}}"
SMOKE_TOPIC="${SMOKE_TOPIC:-Did the defendant weaponize office glitter in the break room?}"
SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE:-criminal}"
SMOKE_OUTPUT_PATH="${SMOKE_OUTPUT_PATH:-smoke-results.json}"

echo "[smoke] starting checks for ${API_BASE_URL}"

# Check 1: health endpoint responds.
health_response="$(curl -fsS "${API_BASE_URL}/api/health")"
echo "[smoke] /api/health response: ${health_response}"

# Build the bootstrap body with node so topic/caseType are JSON-escaped.
request_body="$(
  SMOKE_TOPIC="${SMOKE_TOPIC}" SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE}" node -e "const topic = process.env.SMOKE_TOPIC; const caseType = process.env.SMOKE_CASE_TYPE || 'criminal'; process.stdout.write(JSON.stringify({ topic, caseType }));"
)"

# Check 2: session bootstrap.
session_response="$(
  curl -fsS -X POST "${API_BASE_URL}/api/court/sessions" \
    -H 'Content-Type: application/json' \
    -d "${request_body}"
)"
echo "[smoke] POST /api/court/sessions response: ${session_response}"

# Parse the session id from the bootstrap response. Under `set -e`, a failing
# command substitution in a plain assignment aborts the script before any
# follow-up error message can print, so guard the assignment in the condition.
if ! session_id="$(
  printf '%s' "${session_response}" | node -e "let input=''; process.stdin.on('data', chunk => input += chunk); process.stdin.on('end', () => { try { const parsed = JSON.parse(input); const id = parsed?.session?.id; if (!id) process.exit(1); process.stdout.write(id); } catch { process.exit(1); } });"
)" || [[ -z "${session_id}" ]]; then
  echo "[smoke] failed to parse session id from bootstrap response"
  exit 1
fi

# Check 3: the freshly created session can be fetched back.
session_lookup="$(curl -fsS "${API_BASE_URL}/api/court/sessions/${session_id}")"
echo "[smoke] GET /api/court/sessions/${session_id} response: ${session_lookup}"

# Write the result artifact via node (env vars avoid shell-quoting hazards and
# guarantee valid JSON even when responses contain quotes or newlines).
SMOKE_OUTPUT_PATH="${SMOKE_OUTPUT_PATH}" \
SMOKE_TOPIC="${SMOKE_TOPIC}" \
SMOKE_CASE_TYPE="${SMOKE_CASE_TYPE}" \
API_BASE_URL="${API_BASE_URL}" \
SESSION_ID="${session_id}" \
HEALTH_RESPONSE="${health_response}" \
node -e "const fs = require('node:fs'); const outputPath = process.env.SMOKE_OUTPUT_PATH; const payload = { checkedAt: new Date().toISOString(), apiBaseUrl: process.env.API_BASE_URL, topic: process.env.SMOKE_TOPIC, caseType: process.env.SMOKE_CASE_TYPE, sessionId: process.env.SESSION_ID, health: process.env.HEALTH_RESPONSE }; fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));"

echo "[smoke] checks passed (session=${session_id})"
echo "[smoke] result artifact written to ${SMOKE_OUTPUT_PATH}"