import type * as http from 'node:http';
import type { Redis } from 'ioredis';
import type pg from 'pg';

import { loadConfig } from './config/load.js';
import type { Config } from './config/load.js';
import { createLogger } from './observability/logger.js';
import {
  createMetrics,
  startMetricsServer,
  createPostgresHealthCheck,
  createConsumerLagSampler,
} from './observability/metrics.js';
import { createPool, connectWithRetry } from './db/pool.js';
import { runMigrations } from './db/migrate.js';
import { connectRedis, createConsumer } from './core/consumer.js';
import type { ConsumedRecord } from './core/consumer.js';
import { createDeviceStateStore } from './core/state.js';
import { createWriter } from './core/writer.js';

// -------------------------------------------------------------------------
// Startup: validate config (fail fast on bad env), build logger
// -------------------------------------------------------------------------

let config: Config;
try {
  config = loadConfig();
} catch (err) {
  // Config validation failures print a human-readable message and exit 1.
  // Logger is not available yet — process.stderr is the only output channel.
  process.stderr.write(`${err instanceof Error ? err.message : String(err)}\n`);
  process.exit(1);
}

const logger = createLogger({
  level: config.LOG_LEVEL,
  nodeEnv: config.NODE_ENV,
  instanceId: config.INSTANCE_ID,
});

logger.info('processor starting');

// -------------------------------------------------------------------------
// Wire up the pipeline
// -------------------------------------------------------------------------

async function main(): Promise<void> {
  // 1. Build real prom-client metrics (replaces the trace-log shim from
  // pre-1.9 main.ts). Metrics are wired before any I/O so that counters
  // start at zero from the moment the process starts.
  const metrics = createMetrics();

  // 2. Connect Postgres with exponential-backoff retry
  const pool = createPool(config.POSTGRES_URL);
  await connectWithRetry(pool, logger);

  // 3. Run migrations before any consumer activity.
  // Phase 1 limitation: multiple instances starting simultaneously both try
  // to migrate. Postgres advisory locks would solve this — deferred to Phase 3
  // (production hardening), which is acceptable for the Phase 1 single-instance
  // pilot.
  await runMigrations(pool, logger);
  logger.info('migrations applied');
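  // Phase 3 sketch (not wired up yet; illustrative only): serialize concurrent
  // migrators with a Postgres advisory lock so that only one instance runs DDL
  // at a time. MIGRATION_LOCK_KEY is a hypothetical constant; any agreed-upon
  // bigint key would do, as long as the lock and unlock use the same client.
  //
  //   const client = await pool.connect();
  //   try {
  //     await client.query('SELECT pg_advisory_lock($1)', [MIGRATION_LOCK_KEY]);
  //     await runMigrations(pool, logger);
  //   } finally {
  //     await client.query('SELECT pg_advisory_unlock($1)', [MIGRATION_LOCK_KEY]);
  //     client.release();
  //   }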
  // 4. Connect Redis with exponential-backoff retry
  const redis: Redis = await connectRedis(config.REDIS_URL, logger);

  // 5. Build pipeline components
  const state = createDeviceStateStore(config, logger, metrics);
  const writer = createWriter(pool, config, logger, metrics);

  // 6. Postgres health check — background cached SELECT 1 for /readyz.
  // The check starts probing immediately so /readyz is accurate from the
  // first request after the metrics server starts listening.
  const pgHealth = createPostgresHealthCheck(pool);

  // 7. Start metrics HTTP server.
  // Bound before the consumer starts so /healthz responds even during the
  // brief window between metrics-server start and first stream read.
  const metricsServer: http.Server = startMetricsServer(
    config.METRICS_PORT,
    () => metrics.serializeMetrics(),
    {
      isRedisReady: () => redis.status === 'ready',
      isPostgresReady: pgHealth.isReady,
    },
  );
  logger.info({ port: config.METRICS_PORT }, 'metrics server listening');

  // 8. Start consumer lag sampler (background interval, every 10 s).
  const lagSampler = createConsumerLagSampler(
    redis,
    config.REDIS_TELEMETRY_STREAM,
    config.REDIS_CONSUMER_GROUP,
    metrics,
    (msg) => logger.debug(msg),
  );

  // 9. Define the sink: central decision point for state update and Postgres write.
  // State is updated BEFORE the write so that in-memory state is consistent with
  // what has been seen, even if the Postgres write subsequently fails. If the write
  // fails the record stays pending (not ACKed) and will be re-delivered — applying
  // the same position twice to state is idempotent for last_position and last_seen;
  // only position_count_session is double-counted, which is a session counter that
  // resets on restart and is not a correctness concern.
  const sink = async (records: ConsumedRecord[]): Promise<string[]> => {
    // 9a. Update in-memory state for every record (cheap, synchronous-like, cannot
    // fail meaningfully — Map operations do not throw).
    for (const record of records) {
      state.update(record.position);
    }

    // 9b. Emit device-state gauges (sampled per-batch; cheap).
    metrics.observe('processor_device_state_size', state.size());

    // 9c. Write to Postgres
    const results = await writer.write(records);

    // 9d. ACK only the IDs that succeeded or were already present.
    // 'failed' records are deliberately left pending for retry.
    const ackIds = results
      .filter((r) => r.status === 'inserted' || r.status === 'duplicate')
      .map((r) => r.id);
    if (ackIds.length > 0) {
      metrics.inc('processor_acks_total', undefined, ackIds.length);
    }
    return ackIds;
  };

  // 10. Build and start the consumer
  const consumer = createConsumer(redis, config, logger, metrics, sink);
  await consumer.start();

  // 11. Install graceful shutdown.
  // Phase 3 adds the full hardening: an explicit consumer-group commit on SIGTERM,
  // an uncaught-exception handler, and a multi-instance drain mode.
  installGracefulShutdown({
    redis,
    pool,
    consumer,
    metricsServer,
    pgHealth,
    lagSampler,
    logger,
  });

  logger.info(
    {
      stream: config.REDIS_TELEMETRY_STREAM,
      group: config.REDIS_CONSUMER_GROUP,
      consumer: config.REDIS_CONSUMER_NAME,
      metricsPort: config.METRICS_PORT,
    },
    'processor ready',
  );
}

// -------------------------------------------------------------------------
// Graceful shutdown — Phase 3 finalizes this
// -------------------------------------------------------------------------

type ShutdownDeps = {
  readonly redis: Redis;
  readonly pool: pg.Pool;
  readonly consumer: { stop: () => Promise<void> };
  readonly metricsServer: http.Server;
  readonly pgHealth: { stop: () => void };
  readonly lagSampler: { stop: () => void };
  readonly logger: ReturnType<typeof createLogger>;
};

function installGracefulShutdown(deps: ShutdownDeps): void {
  const { redis, pool, consumer, metricsServer, pgHealth, lagSampler, logger: log } = deps;
  let shuttingDown = false;

  function shutdown(signal: string): void {
    if (shuttingDown) return;
    shuttingDown = true;
    log.info({ signal }, 'shutdown signal received');

    // Cancel background intervals first — they hold no resources that need
    // draining, and stopping them early prevents spurious log noise during
    // the shutdown sequence.
    lagSampler.stop();
    pgHealth.stop();

    consumer
      .stop()
      .then(() => {
        log.info('consumer stopped');
        return new Promise<void>((resolve, reject) =>
          metricsServer.close((err) => (err ? reject(err) : resolve())),
        );
      })
      .then(() => {
        log.info('metrics server closed');
        return redis.quit();
      })
      .then(() => {
        log.info('Redis disconnected');
        return pool.end();
      })
      .then(() => {
        log.info('graceful shutdown complete');
        process.exit(0);
      })
      .catch((err: unknown) => {
        log.error({ err }, 'error during shutdown');
        process.exit(1);
      });

    // Force exit after 15s if the graceful path stalls (e.g. a hung Postgres write).
    setTimeout(() => {
      log.warn('forced exit after shutdown timeout');
      process.exit(1);
    }, 15_000).unref();
  }

  process.on('SIGTERM', () => shutdown('SIGTERM'));
  process.on('SIGINT', () => shutdown('SIGINT'));
}

// -------------------------------------------------------------------------
// Entry point
// -------------------------------------------------------------------------

main().catch((err: unknown) => {
  process.stderr.write(
    `Fatal startup error: ${err instanceof Error ? err.message : String(err)}\n`,
  );
  process.exit(1);
});
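// Example local invocation (a sketch; the exact variable names, defaults, and
// validation rules are defined by loadConfig() in config/load.ts, and the
// values below are illustrative, not real defaults):
//
//   POSTGRES_URL=postgres://user:pass@localhost:5432/telemetry \
//   REDIS_URL=redis://localhost:6379 \
//   REDIS_TELEMETRY_STREAM=telemetry \
//   REDIS_CONSUMER_GROUP=processor \
//   REDIS_CONSUMER_NAME=processor-1 \
//   METRICS_PORT=9090 \
//   node dist/main.js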