feat(live): task 1.5.1 — WS server scaffold + heartbeat
Stand up the WebSocket live-broadcast server inside the Processor process:

- src/live/server.ts: createLiveServer factory with start/stop lifecycle, per-connection LiveConnection type, sendOutbound helper with back-pressure guard, 30s frame-level heartbeat via ws ping/pong, pluggable onMessage handler (stub returns error/not-implemented until 1.5.2/1.5.3).
- src/live/protocol.ts: zod schemas for inbound subscribe/unsubscribe messages, all outbound types (subscribed/unsubscribed/position/error), WsCloseCodes.
- src/shared/types.ts: extracted Metrics interface so src/live/ can import it without crossing the enforced src/live/ ↔ src/core/ ESLint boundary.
- src/core/types.ts: re-exports Metrics from shared/types to keep Phase 1 call sites unchanged.
- src/config/load.ts: LIVE_WS_PORT, LIVE_WS_HOST, LIVE_WS_PING_INTERVAL_MS, LIVE_WS_DRAIN_TIMEOUT_MS, LIVE_WS_BACKPRESSURE_THRESHOLD_BYTES, DIRECTUS_BASE_URL, DIRECTUS_AUTH_TIMEOUT_MS, DIRECTUS_AUTHZ_TIMEOUT_MS, LIVE_BROADCAST_GROUP_PREFIX, LIVE_BROADCAST_BATCH_SIZE, LIVE_BROADCAST_BATCH_BLOCK_MS, LIVE_DEVICE_EVENT_REFRESH_MS.
- src/observability/metrics.ts: Phase 1.5 metrics inventory (connections, inbound/outbound counters, auth/authz histograms, subscription gauge, broadcast counters + lag histogram, snapshot histograms, device-event map).
- src/main.ts: wires the live server alongside the durable-write consumer; shutdown order: live server → consumer → metrics → Redis → Postgres.
- eslint.config.js: import/no-restricted-paths zones for src/live/ ↔ src/core/.
- test/live-server.test.ts: 7 unit tests covering connect, ping, protocol violation, valid message dispatch, connections gauge, and stop() drain.
This commit is contained in:
@@ -45,6 +45,23 @@ type InternalRegistry = {
|
||||
readonly acksTotal: Counter;
|
||||
readonly deviceStateSizeGauge: Gauge;
|
||||
readonly deviceStateEvictionsTotal: Counter;
|
||||
// Phase 1.5 — Live broadcast
|
||||
readonly liveConnectionsGauge: Gauge;
|
||||
readonly liveMessagesInboundTotal: Counter;
|
||||
readonly liveMessagesOutboundTotal: Counter;
|
||||
readonly liveAuthAttemptsTotal: Counter;
|
||||
readonly liveAuthLatencyMs: Histogram;
|
||||
readonly liveSubscriptionsGauge: Gauge;
|
||||
readonly liveSubscribeAttemptsTotal: Counter;
|
||||
readonly liveAuthzLatencyMs: Histogram;
|
||||
readonly liveBroadcastRecordsTotal: Counter;
|
||||
readonly liveBroadcastFanoutMessagesTotal: Counter;
|
||||
readonly liveBroadcastOrphanRecordsTotal: Counter;
|
||||
readonly liveBroadcastLagMs: Histogram;
|
||||
readonly liveSnapshotQueryLatencyMs: Histogram;
|
||||
readonly liveSnapshotSize: Histogram;
|
||||
readonly liveDeviceEventRefreshLatencyMs: Histogram;
|
||||
readonly liveDeviceEventEntries: Gauge;
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -376,6 +393,121 @@ function buildInternalRegistry(): InternalRegistry {
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Phase 1.5 — Live broadcast metrics
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
const liveConnectionsGauge = new Gauge({
|
||||
name: 'processor_live_connections',
|
||||
help: 'Current number of open WebSocket connections.',
|
||||
labelNames: ['instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveMessagesInboundTotal = new Counter({
|
||||
name: 'processor_live_messages_inbound_total',
|
||||
help: 'Inbound WS messages. type=subscribe|unsubscribe|invalid.',
|
||||
labelNames: ['type', 'instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveMessagesOutboundTotal = new Counter({
|
||||
name: 'processor_live_messages_outbound_total',
|
||||
help: 'Outbound WS messages. type=subscribed|unsubscribed|position|error.',
|
||||
labelNames: ['type', 'instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveAuthAttemptsTotal = new Counter({
|
||||
name: 'processor_live_auth_attempts_total',
|
||||
help: 'WS upgrade auth attempts. result=success|unauthorized|error.',
|
||||
labelNames: ['result'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveAuthLatencyMs = new Histogram({
|
||||
name: 'processor_live_auth_latency_ms',
|
||||
help: 'Latency of /users/me round-trip for WS upgrade auth.',
|
||||
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveSubscriptionsGauge = new Gauge({
|
||||
name: 'processor_live_subscriptions',
|
||||
help: 'Current total active topic subscriptions across all connections.',
|
||||
labelNames: ['instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveSubscribeAttemptsTotal = new Counter({
|
||||
name: 'processor_live_subscribe_attempts_total',
|
||||
help: 'Subscribe attempts. result=success|forbidden|not-found|unknown-topic|error.',
|
||||
labelNames: ['result'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveAuthzLatencyMs = new Histogram({
|
||||
name: 'processor_live_authz_latency_ms',
|
||||
help: 'Latency of /items/events/<id> round-trip for per-event authorization.',
|
||||
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveBroadcastRecordsTotal = new Counter({
|
||||
name: 'processor_live_broadcast_records_total',
|
||||
help: 'Records consumed by the broadcast consumer group.',
|
||||
labelNames: ['instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveBroadcastFanoutMessagesTotal = new Counter({
|
||||
name: 'processor_live_broadcast_fanout_messages_total',
|
||||
help: 'Outbound position frames sent via fan-out.',
|
||||
labelNames: ['instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveBroadcastOrphanRecordsTotal = new Counter({
|
||||
name: 'processor_live_broadcast_orphan_records_total',
|
||||
help: 'Records for devices not registered to any event (no fan-out).',
|
||||
labelNames: ['instance_id'],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveBroadcastLagMs = new Histogram({
|
||||
name: 'processor_live_broadcast_lag_ms',
|
||||
help: 'End-to-end latency from record ts to fan-out send, in milliseconds.',
|
||||
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveSnapshotQueryLatencyMs = new Histogram({
|
||||
name: 'processor_live_snapshot_query_latency_ms',
|
||||
help: 'Latency of the snapshot-on-subscribe query.',
|
||||
buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 5000],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveSnapshotSize = new Histogram({
|
||||
name: 'processor_live_snapshot_size',
|
||||
help: 'Number of positions in each snapshot response.',
|
||||
buckets: [0, 1, 5, 10, 25, 50, 100, 250, 500],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveDeviceEventRefreshLatencyMs = new Histogram({
|
||||
name: 'processor_live_device_event_refresh_latency_ms',
|
||||
help: 'Latency of device-event map refresh queries.',
|
||||
buckets: [1, 5, 10, 25, 50, 100, 250, 500],
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
const liveDeviceEventEntries = new Gauge({
|
||||
name: 'processor_live_device_event_entries',
|
||||
help: 'Number of device→event mappings currently in the in-memory cache.',
|
||||
registers: [registry],
|
||||
});
|
||||
|
||||
return {
|
||||
registry,
|
||||
consumerReadsTotal,
|
||||
@@ -387,6 +519,22 @@ function buildInternalRegistry(): InternalRegistry {
|
||||
acksTotal,
|
||||
deviceStateSizeGauge,
|
||||
deviceStateEvictionsTotal,
|
||||
liveConnectionsGauge,
|
||||
liveMessagesInboundTotal,
|
||||
liveMessagesOutboundTotal,
|
||||
liveAuthAttemptsTotal,
|
||||
liveAuthLatencyMs,
|
||||
liveSubscriptionsGauge,
|
||||
liveSubscribeAttemptsTotal,
|
||||
liveAuthzLatencyMs,
|
||||
liveBroadcastRecordsTotal,
|
||||
liveBroadcastFanoutMessagesTotal,
|
||||
liveBroadcastOrphanRecordsTotal,
|
||||
liveBroadcastLagMs,
|
||||
liveSnapshotQueryLatencyMs,
|
||||
liveSnapshotSize,
|
||||
liveDeviceEventRefreshLatencyMs,
|
||||
liveDeviceEventEntries,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -420,6 +568,30 @@ function dispatchInc(
|
||||
case 'processor_device_state_evictions_total':
|
||||
r.deviceStateEvictionsTotal.inc(v);
|
||||
break;
|
||||
// Phase 1.5 — Live broadcast (connections are set via observe, not inc)
|
||||
case 'processor_live_messages_inbound_total':
|
||||
r.liveMessagesInboundTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_live_messages_outbound_total':
|
||||
r.liveMessagesOutboundTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_live_auth_attempts_total':
|
||||
r.liveAuthAttemptsTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
// subscriptions gauge is set via observe (see dispatchObserve)
|
||||
case 'processor_live_subscribe_attempts_total':
|
||||
r.liveSubscribeAttemptsTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_live_broadcast_records_total':
|
||||
r.liveBroadcastRecordsTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_live_broadcast_fanout_messages_total':
|
||||
r.liveBroadcastFanoutMessagesTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_live_broadcast_orphan_records_total':
|
||||
r.liveBroadcastOrphanRecordsTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
// device_event_entries gauge is set via observe (see dispatchObserve)
|
||||
default:
|
||||
// Unknown metric name — silently ignore. This preserves the contract
|
||||
// that the Metrics interface never throws, and avoids crashing the
|
||||
@@ -445,6 +617,34 @@ function dispatchObserve(
|
||||
case 'processor_device_state_size':
|
||||
r.deviceStateSizeGauge.set(value);
|
||||
break;
|
||||
// Phase 1.5 — Live broadcast
|
||||
case 'processor_live_connections':
|
||||
r.liveConnectionsGauge.set(value);
|
||||
break;
|
||||
case 'processor_live_auth_latency_ms':
|
||||
r.liveAuthLatencyMs.observe(value);
|
||||
break;
|
||||
case 'processor_live_authz_latency_ms':
|
||||
r.liveAuthzLatencyMs.observe(value);
|
||||
break;
|
||||
case 'processor_live_broadcast_lag_ms':
|
||||
r.liveBroadcastLagMs.observe(value);
|
||||
break;
|
||||
case 'processor_live_snapshot_query_latency_ms':
|
||||
r.liveSnapshotQueryLatencyMs.observe(value);
|
||||
break;
|
||||
case 'processor_live_snapshot_size':
|
||||
r.liveSnapshotSize.observe(value);
|
||||
break;
|
||||
case 'processor_live_device_event_refresh_latency_ms':
|
||||
r.liveDeviceEventRefreshLatencyMs.observe(value);
|
||||
break;
|
||||
case 'processor_live_subscriptions':
|
||||
r.liveSubscriptionsGauge.set(value);
|
||||
break;
|
||||
case 'processor_live_device_event_entries':
|
||||
r.liveDeviceEventEntries.set(value);
|
||||
break;
|
||||
default:
|
||||
// Unknown metric name — silently ignore (see dispatchInc comment).
|
||||
break;
|
||||
|
||||
Reference in New Issue
Block a user