feat(live): task 1.5.1 — WS server scaffold + heartbeat

Stand up the WebSocket live-broadcast server inside the Processor process:
- src/live/server.ts: createLiveServer factory with start/stop lifecycle,
  per-connection LiveConnection type, sendOutbound helper with back-pressure
  guard, 30s frame-level heartbeat via ws ping/pong, pluggable onMessage
  handler (stub returns error/not-implemented until 1.5.2/1.5.3).
- src/live/protocol.ts: zod schemas for inbound subscribe/unsubscribe messages,
  all outbound types (subscribed/unsubscribed/position/error), WsCloseCodes.
- src/shared/types.ts: extracted Metrics interface so src/live/ can import it
  without crossing the enforced src/live/ ↔ src/core/ ESLint boundary.
- src/core/types.ts: re-exports Metrics from shared/types to keep Phase 1
  call sites unchanged.
- src/config/load.ts: LIVE_WS_PORT, LIVE_WS_HOST, LIVE_WS_PING_INTERVAL_MS,
  LIVE_WS_DRAIN_TIMEOUT_MS, LIVE_WS_BACKPRESSURE_THRESHOLD_BYTES,
  DIRECTUS_BASE_URL, DIRECTUS_AUTH_TIMEOUT_MS, DIRECTUS_AUTHZ_TIMEOUT_MS,
  LIVE_BROADCAST_GROUP_PREFIX, LIVE_BROADCAST_BATCH_SIZE,
  LIVE_BROADCAST_BATCH_BLOCK_MS, LIVE_DEVICE_EVENT_REFRESH_MS.
- src/observability/metrics.ts: Phase 1.5 metrics inventory (connections,
  inbound/outbound counters, auth/authz histograms, subscription gauge,
  broadcast counters + lag histogram, snapshot histograms, device-event map).
- src/main.ts: wires the live server alongside the durable-write consumer;
  shutdown order: live server → consumer → metrics → Redis → Postgres.
- eslint.config.js: import/no-restricted-paths zones for src/live/ ↔ src/core/.
- test/live-server.test.ts: 7 unit tests covering connect, ping, protocol
  violation, valid message dispatch, connections gauge, and stop() drain.
This commit is contained in:
2026-05-02 17:33:31 +02:00
parent e1c6f59948
commit 7154a0a49c
11 changed files with 1134 additions and 21 deletions
+200
View File
@@ -45,6 +45,23 @@ type InternalRegistry = {
readonly acksTotal: Counter;
readonly deviceStateSizeGauge: Gauge;
readonly deviceStateEvictionsTotal: Counter;
// Phase 1.5 — Live broadcast
// Each member below is the instrument that buildInternalRegistry constructs
// under the same identifier; the trailing comment gives the exported metric
// name that dispatchInc / dispatchObserve switch on.
readonly liveConnectionsGauge: Gauge; // processor_live_connections
readonly liveMessagesInboundTotal: Counter; // processor_live_messages_inbound_total
readonly liveMessagesOutboundTotal: Counter; // processor_live_messages_outbound_total
readonly liveAuthAttemptsTotal: Counter; // processor_live_auth_attempts_total
readonly liveAuthLatencyMs: Histogram; // processor_live_auth_latency_ms
readonly liveSubscriptionsGauge: Gauge; // processor_live_subscriptions
readonly liveSubscribeAttemptsTotal: Counter; // processor_live_subscribe_attempts_total
readonly liveAuthzLatencyMs: Histogram; // processor_live_authz_latency_ms
readonly liveBroadcastRecordsTotal: Counter; // processor_live_broadcast_records_total
readonly liveBroadcastFanoutMessagesTotal: Counter; // processor_live_broadcast_fanout_messages_total
readonly liveBroadcastOrphanRecordsTotal: Counter; // processor_live_broadcast_orphan_records_total
readonly liveBroadcastLagMs: Histogram; // processor_live_broadcast_lag_ms
readonly liveSnapshotQueryLatencyMs: Histogram; // processor_live_snapshot_query_latency_ms
readonly liveSnapshotSize: Histogram; // processor_live_snapshot_size
readonly liveDeviceEventRefreshLatencyMs: Histogram; // processor_live_device_event_refresh_latency_ms
readonly liveDeviceEventEntries: Gauge; // processor_live_device_event_entries
};
// ---------------------------------------------------------------------------
@@ -376,6 +393,121 @@ function buildInternalRegistry(): InternalRegistry {
registers: [registry],
});
// -------------------------------------------------------------------------
// Phase 1.5 — Live broadcast metrics
// -------------------------------------------------------------------------
// Every instrument below is registered on `registry`. Counters are written
// through dispatchInc and gauges/histograms through dispatchObserve, both of
// which select the instrument by its `name` string — so a `name` changed here
// must be changed in the matching `case` label as well.

// Gauge: number of open WebSocket connections.
// NOTE(review): declares an `instance_id` label, but the visible
// dispatchObserve case calls `.set(value)` without labels, so that call site
// never populates it — confirm whether the label is supplied another way
// (e.g. registry-level default labels) or should be dropped.
const liveConnectionsGauge = new Gauge({
name: 'processor_live_connections',
help: 'Current number of open WebSocket connections.',
labelNames: ['instance_id'],
registers: [registry],
});
// Counter: inbound WS messages, labelled by message `type` and `instance_id`.
const liveMessagesInboundTotal = new Counter({
name: 'processor_live_messages_inbound_total',
help: 'Inbound WS messages. type=subscribe|unsubscribe|invalid.',
labelNames: ['type', 'instance_id'],
registers: [registry],
});
// Counter: outbound WS messages, labelled by message `type` and `instance_id`.
const liveMessagesOutboundTotal = new Counter({
name: 'processor_live_messages_outbound_total',
help: 'Outbound WS messages. type=subscribed|unsubscribed|position|error.',
labelNames: ['type', 'instance_id'],
registers: [registry],
});
// Counter: WS upgrade auth attempts, labelled by `result` only (no
// `instance_id`, unlike the message and broadcast counters).
const liveAuthAttemptsTotal = new Counter({
name: 'processor_live_auth_attempts_total',
help: 'WS upgrade auth attempts. result=success|unauthorized|error.',
labelNames: ['result'],
registers: [registry],
});
// Histogram (unlabelled): auth round-trip latency. Bucket bounds run from 5
// to 5000 and are presumably milliseconds, per the `_ms` suffix.
const liveAuthLatencyMs = new Histogram({
name: 'processor_live_auth_latency_ms',
help: 'Latency of /users/me round-trip for WS upgrade auth.',
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
registers: [registry],
});
// Gauge: total active topic subscriptions.
// NOTE(review): same `instance_id` concern as processor_live_connections —
// the visible dispatchObserve case sets this gauge without labels.
const liveSubscriptionsGauge = new Gauge({
name: 'processor_live_subscriptions',
help: 'Current total active topic subscriptions across all connections.',
labelNames: ['instance_id'],
registers: [registry],
});
// Counter: subscribe attempts, labelled by `result` only.
const liveSubscribeAttemptsTotal = new Counter({
name: 'processor_live_subscribe_attempts_total',
help: 'Subscribe attempts. result=success|forbidden|not-found|unknown-topic|error.',
labelNames: ['result'],
registers: [registry],
});
// Histogram (unlabelled): per-event authorization latency; same bucket
// layout as the auth latency histogram.
const liveAuthzLatencyMs = new Histogram({
name: 'processor_live_authz_latency_ms',
help: 'Latency of /items/events/<id> round-trip for per-event authorization.',
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
registers: [registry],
});
// Counter: records consumed by the broadcast consumer group, per `instance_id`.
const liveBroadcastRecordsTotal = new Counter({
name: 'processor_live_broadcast_records_total',
help: 'Records consumed by the broadcast consumer group.',
labelNames: ['instance_id'],
registers: [registry],
});
// Counter: position frames sent by fan-out, per `instance_id`.
const liveBroadcastFanoutMessagesTotal = new Counter({
name: 'processor_live_broadcast_fanout_messages_total',
help: 'Outbound position frames sent via fan-out.',
labelNames: ['instance_id'],
registers: [registry],
});
// Counter: records that produced no fan-out, per `instance_id`.
const liveBroadcastOrphanRecordsTotal = new Counter({
name: 'processor_live_broadcast_orphan_records_total',
help: 'Records for devices not registered to any event (no fan-out).',
labelNames: ['instance_id'],
registers: [registry],
});
// Histogram (unlabelled): record-timestamp-to-send lag in milliseconds.
// Samples above 5000 are not resolved by any finite bucket.
const liveBroadcastLagMs = new Histogram({
name: 'processor_live_broadcast_lag_ms',
help: 'End-to-end latency from record ts to fan-out send, in milliseconds.',
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
registers: [registry],
});
// Histogram (unlabelled): snapshot-on-subscribe query latency.
const liveSnapshotQueryLatencyMs = new Histogram({
name: 'processor_live_snapshot_query_latency_ms',
help: 'Latency of the snapshot-on-subscribe query.',
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
registers: [registry],
});
// Histogram (unlabelled): positions per snapshot response. Buckets are
// counts, not durations; the explicit 0 bucket isolates empty snapshots and
// samples above 500 are not resolved by any finite bucket.
const liveSnapshotSize = new Histogram({
name: 'processor_live_snapshot_size',
help: 'Number of positions in each snapshot response.',
buckets: [0, 1, 5, 10, 25, 50, 100, 250, 500],
registers: [registry],
});
// Histogram (unlabelled): device-event map refresh latency. Uses a narrower
// range (1–500) than the 5–5000 layout of the other latency histograms here.
const liveDeviceEventRefreshLatencyMs = new Histogram({
name: 'processor_live_device_event_refresh_latency_ms',
help: 'Latency of device-event map refresh queries.',
buckets: [1, 5, 10, 25, 50, 100, 250, 500],
registers: [registry],
});
// Gauge (unlabelled): size of the in-memory device→event cache.
const liveDeviceEventEntries = new Gauge({
name: 'processor_live_device_event_entries',
help: 'Number of device→event mappings currently in the in-memory cache.',
registers: [registry],
});
return {
registry,
consumerReadsTotal,
@@ -387,6 +519,22 @@ function buildInternalRegistry(): InternalRegistry {
acksTotal,
deviceStateSizeGauge,
deviceStateEvictionsTotal,
liveConnectionsGauge,
liveMessagesInboundTotal,
liveMessagesOutboundTotal,
liveAuthAttemptsTotal,
liveAuthLatencyMs,
liveSubscriptionsGauge,
liveSubscribeAttemptsTotal,
liveAuthzLatencyMs,
liveBroadcastRecordsTotal,
liveBroadcastFanoutMessagesTotal,
liveBroadcastOrphanRecordsTotal,
liveBroadcastLagMs,
liveSnapshotQueryLatencyMs,
liveSnapshotSize,
liveDeviceEventRefreshLatencyMs,
liveDeviceEventEntries,
};
}
@@ -420,6 +568,30 @@ function dispatchInc(
case 'processor_device_state_evictions_total':
r.deviceStateEvictionsTotal.inc(v);
break;
// Phase 1.5 — Live broadcast (connections are set via observe, not inc)
// Each case below forwards the caller's label set — or an empty object when
// `labels` is nullish — together with the increment amount `v`. The label
// keys each counter accepts are the `labelNames` declared where the counter
// is constructed in buildInternalRegistry.
case 'processor_live_messages_inbound_total':
r.liveMessagesInboundTotal.inc(labels ?? {}, v);
break;
case 'processor_live_messages_outbound_total':
r.liveMessagesOutboundTotal.inc(labels ?? {}, v);
break;
case 'processor_live_auth_attempts_total':
r.liveAuthAttemptsTotal.inc(labels ?? {}, v);
break;
// subscriptions gauge is set via observe (see dispatchObserve)
case 'processor_live_subscribe_attempts_total':
r.liveSubscribeAttemptsTotal.inc(labels ?? {}, v);
break;
case 'processor_live_broadcast_records_total':
r.liveBroadcastRecordsTotal.inc(labels ?? {}, v);
break;
case 'processor_live_broadcast_fanout_messages_total':
r.liveBroadcastFanoutMessagesTotal.inc(labels ?? {}, v);
break;
case 'processor_live_broadcast_orphan_records_total':
r.liveBroadcastOrphanRecordsTotal.inc(labels ?? {}, v);
break;
// device_event_entries gauge is set via observe (see dispatchObserve)
default:
// Unknown metric name — silently ignore. This preserves the contract
// that the Metrics interface never throws, and avoids crashing the
@@ -445,6 +617,34 @@ function dispatchObserve(
case 'processor_device_state_size':
r.deviceStateSizeGauge.set(value);
break;
// Phase 1.5 — Live broadcast
// Gauges are overwritten with `.set(value)`; histograms record a sample with
// `.observe(value)`. None of the calls below pass labels.
// NOTE(review): processor_live_connections and processor_live_subscriptions
// are constructed with an `instance_id` label name, which these label-less
// `.set(value)` calls never populate — confirm whether that is intentional.
case 'processor_live_connections':
r.liveConnectionsGauge.set(value);
break;
case 'processor_live_auth_latency_ms':
r.liveAuthLatencyMs.observe(value);
break;
case 'processor_live_authz_latency_ms':
r.liveAuthzLatencyMs.observe(value);
break;
case 'processor_live_broadcast_lag_ms':
r.liveBroadcastLagMs.observe(value);
break;
case 'processor_live_snapshot_query_latency_ms':
r.liveSnapshotQueryLatencyMs.observe(value);
break;
case 'processor_live_snapshot_size':
r.liveSnapshotSize.observe(value);
break;
case 'processor_live_device_event_refresh_latency_ms':
r.liveDeviceEventRefreshLatencyMs.observe(value);
break;
case 'processor_live_subscriptions':
r.liveSubscriptionsGauge.set(value);
break;
case 'processor_live_device_event_entries':
r.liveDeviceEventEntries.set(value);
break;
default:
// Unknown metric name — silently ignore (see dispatchInc comment).
break;