Fix metric wiring gaps audited against live processor output
Several Phase 1 metrics were registered in observability/metrics.ts but
either unwired at the call sites or wired with wrong counts. Production
output showed 11 records ingested according to the logs but only 4 in the metrics. The
fixes below align metric values with actual hot-path activity.
Wiring gaps closed (consumer.ts):
- processor_consumer_reads_total{result=ok|empty|error} — was registered
but never inc'd. Now fires on each XREADGROUP outcome.
- processor_consumer_records_total — was registered but never inc'd.
Now fires once per non-empty XREADGROUP result, with the entry count.
Counts corrected (writer.ts):
- processor_position_writes_total{status} — was inc'd unconditionally
by 1 per chunk for each of inserted/duplicate. Now inc'd by the
actual per-status count, and only when count > 0.
- processor_position_writes_total{status='failed'} — was inc'd by 1
per failed chunk. Now inc'd by chunk.length so every failed record
is counted.
Counts corrected (main.ts):
- processor_acks_total — was inc'd by 1 per non-empty batch. Now
inc'd by ackIds.length so every ACK'd ID is counted.
Wiring gap closed (state.ts):
- processor_device_state_evictions_total — internal `evicted` counter
existed but was never published to metrics. createDeviceStateStore
now accepts a Metrics injection and inc's on each eviction.
Metrics interface extended (types.ts, metrics.ts):
- Metrics.inc gained an optional third `value` parameter (defaults to 1)
for batched increments. dispatchInc passes it through to prom-client's
Counter.inc(labels, value).
Tests updated to reflect the new third arg and the state.ts factory's
new metrics parameter. Total 134 unit tests passing (no count change —
existing tests adjusted, no new tests added; the real verification is
on stage where the metrics are now meaningful again).
This commit is contained in:
+12
-2
@@ -275,18 +275,28 @@ export function createConsumer(
|
||||
)) as [string, [string, string[]][]][] | null;
|
||||
} catch (err) {
|
||||
if (stopping) break;
|
||||
metrics.inc('processor_consumer_reads_total', { result: 'error' });
|
||||
logger.error({ err }, 'XREADGROUP failed; backing off');
|
||||
await sleep(1_000);
|
||||
continue;
|
||||
}
|
||||
|
||||
// BLOCK timeout — no new entries; loop again to check stopping flag.
|
||||
if (rawResult === null) continue;
|
||||
if (rawResult === null) {
|
||||
metrics.inc('processor_consumer_reads_total', { result: 'empty' });
|
||||
continue;
|
||||
}
|
||||
|
||||
// rawResult is [[streamName, [[id, fields], ...]]]
|
||||
// We only subscribed to one stream so we take the first element.
|
||||
const streamEntries = rawResult[0]?.[1] ?? [];
|
||||
if (streamEntries.length === 0) continue;
|
||||
if (streamEntries.length === 0) {
|
||||
metrics.inc('processor_consumer_reads_total', { result: 'empty' });
|
||||
continue;
|
||||
}
|
||||
|
||||
metrics.inc('processor_consumer_reads_total', { result: 'ok' });
|
||||
metrics.inc('processor_consumer_records_total', undefined, streamEntries.length);
|
||||
|
||||
logger.debug({ stream, n: streamEntries.length }, 'batch consumed');
|
||||
|
||||
|
||||
+7
-2
@@ -15,7 +15,7 @@
|
||||
|
||||
import type { Logger } from 'pino';
|
||||
import type { Config } from '../config/load.js';
|
||||
import type { Position, DeviceState } from './types.js';
|
||||
import type { Position, DeviceState, Metrics } from './types.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public interface
|
||||
@@ -47,7 +47,11 @@ export type DeviceStateStore = {
|
||||
// Factory
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function createDeviceStateStore(config: Config, logger: Logger): DeviceStateStore {
|
||||
export function createDeviceStateStore(
|
||||
config: Config,
|
||||
logger: Logger,
|
||||
metrics: Metrics,
|
||||
): DeviceStateStore {
|
||||
const cap = config.DEVICE_STATE_LRU_CAP;
|
||||
const store = new Map<string, DeviceState>();
|
||||
let evicted = 0;
|
||||
@@ -88,6 +92,7 @@ export function createDeviceStateStore(config: Config, logger: Logger): DeviceSt
|
||||
if (oldestKey !== undefined) {
|
||||
store.delete(oldestKey);
|
||||
evicted++;
|
||||
metrics.inc('processor_device_state_evictions_total');
|
||||
logger.debug(
|
||||
{ evictedDevice: oldestKey, storeSize: store.size, cap },
|
||||
'device state evicted (LRU)',
|
||||
|
||||
+10
-4
@@ -84,11 +84,17 @@ export type DeviceState = {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Minimal metrics interface exposed to pipeline components. Concrete
|
||||
* implementation (prom-client) lands in task 1.9; this keeps types stable
|
||||
* through tasks 1.2–1.8.
|
||||
* Minimal metrics interface exposed to pipeline components.
|
||||
*
|
||||
* `inc` accepts an optional `value` for batched increments — counters that
|
||||
* naturally arrive in groups (records consumed, rows inserted, IDs ACKed)
|
||||
* should pass the count rather than calling `inc` N times. Defaults to 1.
|
||||
*/
|
||||
export type Metrics = {
|
||||
readonly inc: (name: string, labels?: Record<string, string>) => void;
|
||||
readonly inc: (
|
||||
name: string,
|
||||
labels?: Record<string, string>,
|
||||
value?: number,
|
||||
) => void;
|
||||
readonly observe: (name: string, value: number, labels?: Record<string, string>) => void;
|
||||
};
|
||||
|
||||
+9
-3
@@ -190,7 +190,8 @@ export function createWriter(
|
||||
const error = err instanceof Error ? err : new Error(String(err));
|
||||
logger.error({ err, chunkSize: chunk.length }, 'position write failed');
|
||||
|
||||
metrics.inc('processor_position_writes_total', { status: 'failed' });
|
||||
// Every record in the failed chunk gets `status: failed` — count them all.
|
||||
metrics.inc('processor_position_writes_total', { status: 'failed' }, chunk.length);
|
||||
|
||||
return chunk.map((record) => ({ id: record.id, status: 'failed' as const, error }));
|
||||
}
|
||||
@@ -200,8 +201,13 @@ export function createWriter(
|
||||
const insertedCount = results.filter((r) => r.status === 'inserted').length;
|
||||
const duplicateCount = results.filter((r) => r.status === 'duplicate').length;
|
||||
|
||||
metrics.inc('processor_position_writes_total', { status: 'inserted' });
|
||||
metrics.inc('processor_position_writes_total', { status: 'duplicate' });
|
||||
// Counts must match per-record outcomes, not be incremented once per chunk.
|
||||
if (insertedCount > 0) {
|
||||
metrics.inc('processor_position_writes_total', { status: 'inserted' }, insertedCount);
|
||||
}
|
||||
if (duplicateCount > 0) {
|
||||
metrics.inc('processor_position_writes_total', { status: 'duplicate' }, duplicateCount);
|
||||
}
|
||||
metrics.observe('processor_position_write_duration_seconds', (Date.now() - startMs) / 1_000);
|
||||
|
||||
logger.debug(
|
||||
|
||||
+2
-2
@@ -65,7 +65,7 @@ async function main(): Promise<void> {
|
||||
const redis: Redis = await connectRedis(config.REDIS_URL, logger);
|
||||
|
||||
// 5. Build pipeline components
|
||||
const state = createDeviceStateStore(config, logger);
|
||||
const state = createDeviceStateStore(config, logger, metrics);
|
||||
const writer = createWriter(pool, config, logger, metrics);
|
||||
|
||||
// 6. Postgres health check — background cached SELECT 1 for /readyz.
|
||||
@@ -122,7 +122,7 @@ async function main(): Promise<void> {
|
||||
.map((r) => r.id);
|
||||
|
||||
if (ackIds.length > 0) {
|
||||
metrics.inc('processor_acks_total');
|
||||
metrics.inc('processor_acks_total', undefined, ackIds.length);
|
||||
}
|
||||
|
||||
return ackIds;
|
||||
|
||||
@@ -71,8 +71,8 @@ export function createMetrics(): Metrics & {
|
||||
collectDefaultMetrics({ register: internal.registry });
|
||||
|
||||
const metricsImpl: Metrics & { serializeMetrics: () => Promise<string> } = {
|
||||
inc(name: string, labels?: Record<string, string>): void {
|
||||
dispatchInc(internal, name, labels);
|
||||
inc(name: string, labels?: Record<string, string>, value?: number): void {
|
||||
dispatchInc(internal, name, labels, value);
|
||||
},
|
||||
|
||||
observe(name: string, value: number, labels?: Record<string, string>): void {
|
||||
@@ -398,25 +398,27 @@ function dispatchInc(
|
||||
r: InternalRegistry,
|
||||
name: string,
|
||||
labels?: Record<string, string>,
|
||||
value?: number,
|
||||
): void {
|
||||
const v = value ?? 1;
|
||||
switch (name) {
|
||||
case 'processor_consumer_reads_total':
|
||||
r.consumerReadsTotal.inc(labels ?? {});
|
||||
r.consumerReadsTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_consumer_records_total':
|
||||
r.consumerRecordsTotal.inc();
|
||||
r.consumerRecordsTotal.inc(v);
|
||||
break;
|
||||
case 'processor_decode_errors_total':
|
||||
r.decodeErrorsTotal.inc();
|
||||
r.decodeErrorsTotal.inc(v);
|
||||
break;
|
||||
case 'processor_position_writes_total':
|
||||
r.positionWritesTotal.inc(labels ?? {});
|
||||
r.positionWritesTotal.inc(labels ?? {}, v);
|
||||
break;
|
||||
case 'processor_acks_total':
|
||||
r.acksTotal.inc();
|
||||
r.acksTotal.inc(v);
|
||||
break;
|
||||
case 'processor_device_state_evictions_total':
|
||||
r.deviceStateEvictionsTotal.inc();
|
||||
r.deviceStateEvictionsTotal.inc(v);
|
||||
break;
|
||||
default:
|
||||
// Unknown metric name — silently ignore. This preserves the contract
|
||||
|
||||
Reference in New Issue
Block a user