feat(live): task 1.5.1 — WS server scaffold + heartbeat

Stand up the WebSocket live-broadcast server inside the Processor process:
- src/live/server.ts: createLiveServer factory with start/stop lifecycle,
  per-connection LiveConnection type, sendOutbound helper with back-pressure
  guard, 30s frame-level heartbeat via ws ping/pong, pluggable onMessage
  handler (stub returns error/not-implemented until 1.5.2/1.5.3).
- src/live/protocol.ts: zod schemas for inbound subscribe/unsubscribe messages,
  all outbound types (subscribed/unsubscribed/position/error), WsCloseCodes.
- src/shared/types.ts: extracted Metrics interface so src/live/ can import it
  without crossing the enforced src/live/ ↔ src/core/ ESLint boundary.
- src/core/types.ts: re-exports Metrics from shared/types to keep Phase 1
  call sites unchanged.
- src/config/load.ts: LIVE_WS_PORT, LIVE_WS_HOST, LIVE_WS_PING_INTERVAL_MS,
  LIVE_WS_DRAIN_TIMEOUT_MS, LIVE_WS_BACKPRESSURE_THRESHOLD_BYTES,
  DIRECTUS_BASE_URL, DIRECTUS_AUTH_TIMEOUT_MS, DIRECTUS_AUTHZ_TIMEOUT_MS,
  LIVE_BROADCAST_GROUP_PREFIX, LIVE_BROADCAST_BATCH_SIZE,
  LIVE_BROADCAST_BATCH_BLOCK_MS, LIVE_DEVICE_EVENT_REFRESH_MS.
- src/observability/metrics.ts: Phase 1.5 metrics inventory (connections,
  inbound/outbound counters, auth/authz histograms, subscription gauge,
  broadcast counters + lag histogram, snapshot histograms, device-event map).
- src/main.ts: wires the live server alongside the durable-write consumer;
  shutdown order: live server → consumer → metrics → Redis → Postgres.
- eslint.config.js: import/no-restricted-paths zones for src/live/ ↔ src/core/.
- test/live-server.test.ts: 7 unit tests covering connect, ping, protocol
  violation, valid message dispatch, connections gauge, and stop() drain.
This commit is contained in:
2026-05-02 17:33:31 +02:00
parent e1c6f59948
commit 7154a0a49c
11 changed files with 1134 additions and 21 deletions
+45 -6
View File
@@ -16,6 +16,9 @@ import { connectRedis, createConsumer } from './core/consumer.js';
import type { ConsumedRecord } from './core/consumer.js';
import { createDeviceStateStore } from './core/state.js';
import { createWriter } from './core/writer.js';
import { createLiveServer, sendOutbound } from './live/server.js';
import type { LiveServer, LiveConnection } from './live/server.js';
import type { InboundMessage } from './live/protocol.js';
// -------------------------------------------------------------------------
// Startup: validate config (fail fast on bad env), build logger
@@ -128,17 +131,41 @@ async function main(): Promise<void> {
return ackIds;
};
// 10. Build and start the consumer
// 10. Build the live WebSocket server (task 1.5.1).
// The stub message handler replies with `error/not-implemented` until
// tasks 1.5.2 and 1.5.3 wire in the real auth + registry handler.
const stubMessageHandler = async (
  conn: LiveConnection,
  _message: InboundMessage,
): Promise<void> => {
  // The inbound message is deliberately ignored: whatever the client sent,
  // the only reply is a fixed `error/not-implemented` frame on the same
  // connection.
  const backpressureLimitBytes = config.LIVE_WS_BACKPRESSURE_THRESHOLD_BYTES;
  sendOutbound(conn, { type: 'error', code: 'not-implemented' }, metrics, backpressureLimitBytes);
};
const liveServer: LiveServer = createLiveServer(
config,
logger,
metrics,
stubMessageHandler,
);
await liveServer.start();
// 11. Build and start the durable-write consumer
const consumer = createConsumer(redis, config, logger, metrics, sink);
await consumer.start();
// 11. Install graceful shutdown.
// Full Phase 3 hardening: explicit consumer-group commit on SIGTERM,
// uncaught-exception handler, multi-instance drain mode.
// 12. Install graceful shutdown.
// Shutdown order: live server first (no new connections), then
// broadcast consumer (task 1.5.4 adds this), then durable-write consumer.
installGracefulShutdown({
redis,
pool,
consumer,
liveServer,
metricsServer,
pgHealth,
lagSampler,
@@ -151,6 +178,7 @@ async function main(): Promise<void> {
group: config.REDIS_CONSUMER_GROUP,
consumer: config.REDIS_CONSUMER_NAME,
metricsPort: config.METRICS_PORT,
wsPort: config.LIVE_WS_PORT,
},
'processor ready',
);
@@ -164,6 +192,7 @@ type ShutdownDeps = {
readonly redis: Redis;
readonly pool: pg.Pool;
readonly consumer: { stop: () => Promise<void> };
readonly liveServer: LiveServer;
readonly metricsServer: http.Server;
readonly pgHealth: { stop: () => void };
readonly lagSampler: { stop: () => void };
@@ -171,7 +200,7 @@ type ShutdownDeps = {
};
function installGracefulShutdown(deps: ShutdownDeps): void {
const { redis, pool, consumer, metricsServer, pgHealth, lagSampler, logger: log } = deps;
const { redis, pool, consumer, liveServer, metricsServer, pgHealth, lagSampler, logger: log } = deps;
let shuttingDown = false;
@@ -187,8 +216,18 @@ function installGracefulShutdown(deps: ShutdownDeps): void {
lagSampler.stop();
pgHealth.stop();
consumer
// Shutdown order:
// 1. Live server — stop accepting new connections and drain existing ones
// first, so clients know the server is going away before the consumer
// stops processing.
// 2. Durable-write consumer — lets the in-flight batch finish.
// 3. Metrics server, Redis, Postgres.
liveServer
.stop()
.then(() => {
log.info('live server stopped');
return consumer.stop();
})
.then(() => {
log.info('consumer stopped');
return new Promise<void>((resolve, reject) =>