processor/src/main.ts
julian d758c211ae Fix metric wiring gaps audited against live processor output
Several Phase 1 metrics were registered in observability/metrics.ts but
were either left unwired at the call sites or wired with the wrong counts.
Production output showed 11 records ingested according to the logs but
only 4 in the metrics. The fixes below align metric values with actual
hot-path activity.

Wiring gaps closed (consumer.ts):
- processor_consumer_reads_total{result=ok|empty|error} — was registered
  but never inc'd. Now fires on each XREADGROUP outcome.
- processor_consumer_records_total — was registered but never inc'd.
  Now fires once per XREADGROUP, with the entry count.
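
A minimal sketch of how the reads/records wiring can look (the read-loop
shape, reply handling, and the stream/group/consumer literals below are
assumptions; consumer.ts itself is not shown in this view):

    import type { Redis } from 'ioredis';

    interface Metrics {
      inc(name: string, labels?: Record<string, string>, value?: number): void;
    }

    // One XREADGROUP iteration: classify the outcome, then count the entries.
    export async function readOnce(redis: Redis, metrics: Metrics): Promise<void> {
      try {
        const reply = (await redis.xreadgroup(
          'GROUP', 'processor-group', 'processor-1', // placeholder group/consumer
          'COUNT', 100, 'BLOCK', 5000,
          'STREAMS', 'telemetry:positions', '>', // placeholder stream key
        )) as unknown as [string, [string, string[]][]][] | null;
        if (!reply || reply.length === 0) {
          metrics.inc('processor_consumer_reads_total', { result: 'empty' });
          return;
        }
        const entries = reply[0][1]; // [[id, fields], ...]
        metrics.inc('processor_consumer_reads_total', { result: 'ok' });
        metrics.inc('processor_consumer_records_total', undefined, entries.length);
      } catch (err) {
        metrics.inc('processor_consumer_reads_total', { result: 'error' });
        throw err;
      }
    }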

Counts corrected (writer.ts):
- processor_position_writes_total{status} — was inc'd unconditionally
  by 1 per chunk for each of inserted/duplicate. Now inc'd by the
  actual per-status count, and only when count > 0.
- processor_position_writes_total{status='failed'} — was inc'd by 1
  per failed chunk. Now inc'd by chunk.length so every failed record
  is counted.
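
A sketch of the corrected accounting, assuming the writer reports a
per-record status for each chunk (the helper names and the result shape are
illustrative, not the actual writer.ts code):

    interface Metrics {
      inc(name: string, labels?: Record<string, string>, value?: number): void;
    }

    type WriteResult = { id: string; status: 'inserted' | 'duplicate' | 'failed' };

    // Success path: count each status by its actual occurrence, skipping zeros.
    export function countChunkResults(metrics: Metrics, results: readonly WriteResult[]): void {
      for (const status of ['inserted', 'duplicate'] as const) {
        const count = results.filter((r) => r.status === status).length;
        if (count > 0) {
          metrics.inc('processor_position_writes_total', { status }, count);
        }
      }
    }

    // Failure path: a chunk-level error marks every record in the chunk as failed.
    export function countChunkFailure(metrics: Metrics, chunk: readonly unknown[]): void {
      metrics.inc('processor_position_writes_total', { status: 'failed' }, chunk.length);
    }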

Counts corrected (main.ts):
- processor_acks_total — was inc'd by 1 per non-empty batch. Now
  inc'd by ackIds.length so every ACK'd ID is counted.

Wiring gap closed (state.ts):
- processor_device_state_evictions_total — internal `evicted` counter
  existed but was never published to metrics. createDeviceStateStore
  now accepts a Metrics injection and inc's on each eviction.
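
A sketch of the injection point. The max-size eviction policy and the store
shape below are placeholders (the real factory takes config, logger, and
metrics, as seen in main.ts); only the metrics parameter and the increment
on eviction reflect what this commit adds:

    interface Metrics {
      inc(name: string, labels?: Record<string, string>, value?: number): void;
    }

    export function createDeviceStateStore(maxDevices: number, metrics: Metrics) {
      const devices = new Map<string, { lastSeen: number }>();
      let evicted = 0; // pre-existing internal counter, now mirrored to metrics

      return {
        update(deviceId: string): void {
          if (!devices.has(deviceId) && devices.size >= maxDevices) {
            // Evict the oldest entry (Map preserves insertion order).
            const oldest = devices.keys().next().value;
            if (oldest !== undefined) {
              devices.delete(oldest);
              evicted += 1;
              metrics.inc('processor_device_state_evictions_total'); // the new wiring
            }
          }
          devices.set(deviceId, { lastSeen: Date.now() });
        },
        size: () => devices.size,
      };
    }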

Metrics interface extended (types.ts, metrics.ts):
- Metrics.inc gained an optional third `value` parameter (defaults to 1)
  for batched increments. dispatchInc passes it through to prom-client's
  Counter.inc(labels, value).
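
A sketch of the extended signature and the dispatch (counter registration
and lookup are simplified here; the point is the optional value argument
and its pass-through):

    import { Counter } from 'prom-client';

    // types.ts: the interface now carries the optional value argument.
    export interface Metrics {
      inc(name: string, labels?: Record<string, string>, value?: number): void;
    }

    // metrics.ts: dispatchInc forwards the value to prom-client.
    const counters = new Map<string, Counter<string>>();

    export function dispatchInc(
      name: string,
      labels?: Record<string, string>,
      value = 1, // new optional third arg, defaults to 1 as before
    ): void {
      const counter = counters.get(name);
      if (!counter) return;
      if (labels) {
        counter.inc(labels, value); // Counter.inc(labels, value)
      } else {
        counter.inc(value);
      }
    }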

Tests updated to reflect the new third arg and the state.ts factory's
new metrics parameter. All 134 unit tests pass (no count change:
existing tests were adjusted, no new ones added; the real verification
happens on stage, where the metrics are now meaningful again).
2026-05-01 11:43:06 +02:00

236 lines
8.1 KiB
TypeScript

import type * as http from 'node:http';
import type { Redis } from 'ioredis';
import type pg from 'pg';
import { loadConfig } from './config/load.js';
import type { Config } from './config/load.js';
import { createLogger } from './observability/logger.js';
import {
  createMetrics,
  startMetricsServer,
  createPostgresHealthCheck,
  createConsumerLagSampler,
} from './observability/metrics.js';
import { createPool, connectWithRetry } from './db/pool.js';
import { runMigrations } from './db/migrate.js';
import { connectRedis, createConsumer } from './core/consumer.js';
import type { ConsumedRecord } from './core/consumer.js';
import { createDeviceStateStore } from './core/state.js';
import { createWriter } from './core/writer.js';
// -------------------------------------------------------------------------
// Startup: validate config (fail fast on bad env), build logger
// -------------------------------------------------------------------------
let config: Config;
try {
  config = loadConfig();
} catch (err) {
  // Config validation failures print a human-readable message and exit 1.
  // Logger is not available yet — process.stderr is the only output channel.
  process.stderr.write(`${err instanceof Error ? err.message : String(err)}\n`);
  process.exit(1);
}

const logger = createLogger({
  level: config.LOG_LEVEL,
  nodeEnv: config.NODE_ENV,
  instanceId: config.INSTANCE_ID,
});
logger.info('processor starting');
// -------------------------------------------------------------------------
// Wire up the pipeline
// -------------------------------------------------------------------------
async function main(): Promise<void> {
  // 1. Build real prom-client metrics (replaces the trace-log shim from
  // pre-1.9 main.ts). Metrics are wired before any I/O so that counters
  // start at zero from the moment the process starts.
  const metrics = createMetrics();

  // 2. Connect Postgres with exponential-backoff retry
  const pool = createPool(config.POSTGRES_URL);
  await connectWithRetry(pool, logger);

  // 3. Run migrations before any consumer activity.
  // Phase 1 limitation: multiple instances starting simultaneously both try
  // to migrate. Postgres advisory locks would solve this — deferred to Phase 3
  // (production hardening), which is acceptable for the Phase 1 single-instance
  // pilot.
  await runMigrations(pool, logger);
  logger.info('migrations applied');

  // 4. Connect Redis with exponential-backoff retry
  const redis: Redis = await connectRedis(config.REDIS_URL, logger);

  // 5. Build pipeline components
  const state = createDeviceStateStore(config, logger, metrics);
  const writer = createWriter(pool, config, logger, metrics);

  // 6. Postgres health check — background cached SELECT 1 for /readyz.
  // The check starts probing immediately so /readyz is accurate from the
  // first request after the metrics server starts listening.
  const pgHealth = createPostgresHealthCheck(pool);

  // 7. Start metrics HTTP server.
  // Bound before the consumer starts so /healthz responds even during the
  // brief window between metrics-server start and first stream read.
  const metricsServer: http.Server = startMetricsServer(
    config.METRICS_PORT,
    () => metrics.serializeMetrics(),
    {
      isRedisReady: () => redis.status === 'ready',
      isPostgresReady: pgHealth.isReady,
    },
  );
  logger.info({ port: config.METRICS_PORT }, 'metrics server listening');

  // 8. Start consumer lag sampler (background interval, every 10 s).
  const lagSampler = createConsumerLagSampler(
    redis,
    config.REDIS_TELEMETRY_STREAM,
    config.REDIS_CONSUMER_GROUP,
    metrics,
    (msg) => logger.debug(msg),
  );

  // 9. Define the sink: central decision point for state update and Postgres write.
  // State is updated BEFORE the write so that in-memory state is consistent with
  // what has been seen, even if the Postgres write subsequently fails. If the write
  // fails the record stays pending (not ACKed) and will be re-delivered — applying
  // the same position twice to state is idempotent for last_position and last_seen;
  // only position_count_session is double-counted, which is a session counter that
  // resets on restart and is not a correctness concern.
  const sink = async (records: ConsumedRecord[]): Promise<string[]> => {
    // 9a. Update in-memory state for every record (cheap, synchronous-like, cannot
    // fail meaningfully — Map operations do not throw).
    for (const record of records) {
      state.update(record.position);
    }

    // 9b. Emit device-state gauges (sampled per-batch; cheap).
    metrics.observe('processor_device_state_size', state.size());

    // 9c. Write to Postgres
    const results = await writer.write(records);

    // 9d. ACK only the IDs that succeeded or were already present.
    // 'failed' records are deliberately left pending for retry.
    const ackIds = results
      .filter((r) => r.status === 'inserted' || r.status === 'duplicate')
      .map((r) => r.id);
    if (ackIds.length > 0) {
      metrics.inc('processor_acks_total', undefined, ackIds.length);
    }
    return ackIds;
  };

  // 10. Build and start the consumer
  const consumer = createConsumer(redis, config, logger, metrics, sink);
  await consumer.start();

  // 11. Install graceful shutdown.
  // Full Phase 3 hardening (explicit consumer-group commit on SIGTERM,
  // uncaught-exception handler, multi-instance drain mode) comes later.
  installGracefulShutdown({
    redis,
    pool,
    consumer,
    metricsServer,
    pgHealth,
    lagSampler,
    logger,
  });

  logger.info(
    {
      stream: config.REDIS_TELEMETRY_STREAM,
      group: config.REDIS_CONSUMER_GROUP,
      consumer: config.REDIS_CONSUMER_NAME,
      metricsPort: config.METRICS_PORT,
    },
    'processor ready',
  );
}
// -------------------------------------------------------------------------
// Graceful shutdown — Phase 3 finalizes this
// -------------------------------------------------------------------------
type ShutdownDeps = {
  readonly redis: Redis;
  readonly pool: pg.Pool;
  readonly consumer: { stop: () => Promise<void> };
  readonly metricsServer: http.Server;
  readonly pgHealth: { stop: () => void };
  readonly lagSampler: { stop: () => void };
  readonly logger: ReturnType<typeof createLogger>;
};
function installGracefulShutdown(deps: ShutdownDeps): void {
  const { redis, pool, consumer, metricsServer, pgHealth, lagSampler, logger: log } = deps;
  let shuttingDown = false;

  function shutdown(signal: string): void {
    if (shuttingDown) return;
    shuttingDown = true;
    log.info({ signal }, 'shutdown signal received');

    // Cancel background intervals first — they hold no resources that need
    // draining, and stopping them early prevents spurious log noise during
    // the shutdown sequence.
    lagSampler.stop();
    pgHealth.stop();

    consumer
      .stop()
      .then(() => {
        log.info('consumer stopped');
        return new Promise<void>((resolve, reject) =>
          metricsServer.close((err) => (err ? reject(err) : resolve())),
        );
      })
      .then(() => {
        log.info('metrics server closed');
        return redis.quit();
      })
      .then(() => {
        log.info('Redis disconnected');
        return pool.end();
      })
      .then(() => {
        log.info('graceful shutdown complete');
        process.exit(0);
      })
      .catch((err: unknown) => {
        log.error({ err }, 'error during shutdown');
        process.exit(1);
      });

    // Force exit after 15s if the graceful path stalls (e.g. a hung Postgres write).
    setTimeout(() => {
      log.warn('forced exit after shutdown timeout');
      process.exit(1);
    }, 15_000).unref();
  }

  process.on('SIGTERM', () => shutdown('SIGTERM'));
  process.on('SIGINT', () => shutdown('SIGINT'));
}
// -------------------------------------------------------------------------
// Entry point
// -------------------------------------------------------------------------
main().catch((err: unknown) => {
  process.stderr.write(
    `Fatal startup error: ${err instanceof Error ? err.message : String(err)}\n`,
  );
  process.exit(1);
});