Implement Phase 1 tasks 1.9-1.11 (observability + integration test + Dockerfile/CI)
src/observability/metrics.ts — full prom-client implementation. All 10
Phase 1 metrics registered (processor_consumer_reads_total,
_records_total, _lag, _decode_errors_total, processor_position_writes_total
{status}, _write_duration_seconds, processor_acks_total,
processor_device_state_{size,evictions_total}) plus nodejs_* defaults.
node:http server with /metrics, /healthz, /readyz. /readyz checks
redis.status === 'ready' AND a 5s-cached SELECT 1 Postgres probe.
processor_consumer_lag sampled every 10s via XINFO GROUPS, falling back
to a no-op when the consumer group hasn't been created yet.
src/main.ts — replaces the trace-logging shim with createMetrics() and
startMetricsServer(); shutdown closes the metrics server before
redis.quit() and pool.end().
test/metrics.test.ts — 22 unit tests: exposition format, every metric
type behaviour, all four HTTP endpoint paths including /readyz 503 cases.
test/pipeline.integration.test.ts — testcontainers Redis 7 +
TimescaleDB latest-pg16. Four scenarios: happy path with bigint+Buffer
attribute round-trip, idempotency on (device_id, ts), malformed payload
stays in PEL (decode_errors_total increments), writer failure → retry
(weaker variant per spec: stop Postgres before publish, restart, verify
row appears). Skip-on-no-Docker pattern verified — exits 0 without
Docker.
Dockerfile — multi-stage matching tcp-ingestion. EXPOSE 9090 only,
HEALTHCHECK on /readyz, image-source label points at processor repo.
.gitea/workflows/build.yml — single-job workflow mirroring
tcp-ingestion. Path filters cover src/, test/, build config, Dockerfile.
Portainer webhook step uncommented for :main auto-deploy.
compose.dev.yaml — local-build variant with Redis + TimescaleDB +
processor-dev for verifying Dockerfile changes without the registry
round-trip.
README.md — fleshed out from stub: quick-start, Docker build, deployment
note, env vars, tests (unit vs. integration), CI behavior. Flags the
deploy-side change needed: deploy/compose.yaml needs a TimescaleDB
service and a processor service entry added.
Verification: typecheck, lint clean; 134 unit tests passing across 8
files (+22 from this batch). pnpm test:integration runs cleanly under
the no-Docker skip pattern.
Phase 1 is now complete. Service is pilot-ready.
This commit is contained in:
+88
-35
@@ -1,15 +1,21 @@
|
||||
import type * as http from 'node:http';
|
||||
import type { Redis } from 'ioredis';
|
||||
import type pg from 'pg';
|
||||
import { loadConfig } from './config/load.js';
|
||||
import type { Config } from './config/load.js';
|
||||
import { createLogger } from './observability/logger.js';
|
||||
import {
|
||||
createMetrics,
|
||||
startMetricsServer,
|
||||
createPostgresHealthCheck,
|
||||
createConsumerLagSampler,
|
||||
} from './observability/metrics.js';
|
||||
import { createPool, connectWithRetry } from './db/pool.js';
|
||||
import { runMigrations } from './db/migrate.js';
|
||||
import { connectRedis, createConsumer } from './core/consumer.js';
|
||||
import type { ConsumedRecord } from './core/consumer.js';
|
||||
import { createDeviceStateStore } from './core/state.js';
|
||||
import { createWriter } from './core/writer.js';
|
||||
import type { Metrics } from './core/types.js';
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Startup: validate config (fail fast on bad env), build logger
|
||||
@@ -33,33 +39,21 @@ const logger = createLogger({
|
||||
|
||||
logger.info('processor starting');
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Metrics placeholder shim (task 1.9 replaces this with prom-client)
|
||||
//
|
||||
// Uses trace-level logging so the calls are observable in development but
|
||||
// are silent in production builds where the log level is info or higher.
|
||||
// This mirrors tcp-ingestion's approach before task 1.10 landed there.
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
const metrics: Metrics = {
|
||||
inc: (name: string, labels?: Record<string, string>) => {
|
||||
logger.trace({ metric: name, labels }, 'metrics.inc');
|
||||
},
|
||||
observe: (name: string, value: number, labels?: Record<string, string>) => {
|
||||
logger.trace({ metric: name, value, labels }, 'metrics.observe');
|
||||
},
|
||||
};
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Wire up the pipeline
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
async function main(): Promise<void> {
|
||||
// 1. Connect Postgres with exponential-backoff retry
|
||||
// 1. Build real prom-client metrics (replaces the trace-log shim from
|
||||
// pre-1.9 main.ts). Metrics are wired before any I/O so that counters
|
||||
// start at zero from the moment the process starts.
|
||||
const metrics = createMetrics();
|
||||
|
||||
// 2. Connect Postgres with exponential-backoff retry
|
||||
const pool = createPool(config.POSTGRES_URL);
|
||||
await connectWithRetry(pool, logger);
|
||||
|
||||
// 2. Run migrations before any consumer activity.
|
||||
// 3. Run migrations before any consumer activity.
|
||||
// Phase 1 limitation: multiple instances starting simultaneously both try
|
||||
// to migrate. Postgres advisory locks would solve this — deferred to Phase 3
|
||||
// (production hardening), which is acceptable for the Phase 1 single-instance
|
||||
@@ -67,14 +61,41 @@ async function main(): Promise<void> {
|
||||
await runMigrations(pool, logger);
|
||||
logger.info('migrations applied');
|
||||
|
||||
// 3. Connect Redis with exponential-backoff retry
|
||||
// 4. Connect Redis with exponential-backoff retry
|
||||
const redis: Redis = await connectRedis(config.REDIS_URL, logger);
|
||||
|
||||
// 4. Build pipeline components
|
||||
// 5. Build pipeline components
|
||||
const state = createDeviceStateStore(config, logger);
|
||||
const writer = createWriter(pool, config, logger, metrics);
|
||||
|
||||
// 5. Define the sink: central decision point for state update and Postgres write.
|
||||
// 6. Postgres health check — background cached SELECT 1 for /readyz.
|
||||
// The check starts probing immediately so /readyz is accurate from the
|
||||
// first request after the metrics server starts listening.
|
||||
const pgHealth = createPostgresHealthCheck(pool);
|
||||
|
||||
// 7. Start metrics HTTP server.
|
||||
// Bound before the consumer starts so /healthz responds even during the
|
||||
// brief window between metrics-server start and first stream read.
|
||||
const metricsServer: http.Server = startMetricsServer(
|
||||
config.METRICS_PORT,
|
||||
() => metrics.serializeMetrics(),
|
||||
{
|
||||
isRedisReady: () => redis.status === 'ready',
|
||||
isPostgresReady: pgHealth.isReady,
|
||||
},
|
||||
);
|
||||
logger.info({ port: config.METRICS_PORT }, 'metrics server listening');
|
||||
|
||||
// 8. Start consumer lag sampler (background interval, every 10 s).
|
||||
const lagSampler = createConsumerLagSampler(
|
||||
redis,
|
||||
config.REDIS_TELEMETRY_STREAM,
|
||||
config.REDIS_CONSUMER_GROUP,
|
||||
metrics,
|
||||
(msg) => logger.debug(msg),
|
||||
);
|
||||
|
||||
// 9. Define the sink: central decision point for state update and Postgres write.
|
||||
// State is updated BEFORE the write so that in-memory state is consistent with
|
||||
// what has been seen, even if the Postgres write subsequently fails. If the write
|
||||
// fails the record stays pending (not ACKed) and will be re-delivered — applying
|
||||
@@ -82,54 +103,75 @@ async function main(): Promise<void> {
|
||||
// only position_count_session is double-counted, which is a session counter that
|
||||
// resets on restart and is not a correctness concern.
|
||||
const sink = async (records: ConsumedRecord[]): Promise<string[]> => {
|
||||
// 5a. Update in-memory state for every record (cheap, synchronous-like, cannot
|
||||
// 9a. Update in-memory state for every record (cheap, synchronous-like, cannot
|
||||
// fail meaningfully — Map operations do not throw).
|
||||
for (const record of records) {
|
||||
state.update(record.position);
|
||||
}
|
||||
|
||||
// 5b. Write to Postgres
|
||||
// 9b. Emit device-state gauges (sampled per-batch; cheap).
|
||||
metrics.observe('processor_device_state_size', state.size());
|
||||
|
||||
// 9c. Write to Postgres
|
||||
const results = await writer.write(records);
|
||||
|
||||
// 5c. ACK only the IDs that succeeded or were already present.
|
||||
// 9d. ACK only the IDs that succeeded or were already present.
|
||||
// 'failed' records are deliberately left pending for retry.
|
||||
return results
|
||||
const ackIds = results
|
||||
.filter((r) => r.status === 'inserted' || r.status === 'duplicate')
|
||||
.map((r) => r.id);
|
||||
|
||||
if (ackIds.length > 0) {
|
||||
metrics.inc('processor_acks_total');
|
||||
}
|
||||
|
||||
return ackIds;
|
||||
};
|
||||
|
||||
// 6. Build and start the consumer
|
||||
// 10. Build and start the consumer
|
||||
const consumer = createConsumer(redis, config, logger, metrics, sink);
|
||||
await consumer.start();
|
||||
|
||||
// 7. Install graceful shutdown stub.
|
||||
// Full Phase 3 hardening: explicit consumer-group commit on SIGTERM,
|
||||
// uncaught-exception handler, multi-instance drain mode.
|
||||
installGracefulShutdown({ redis, pool, consumer, logger });
|
||||
// 11. Install graceful shutdown.
|
||||
// Full Phase 3 hardening: explicit consumer-group commit on SIGTERM,
|
||||
// uncaught-exception handler, multi-instance drain mode.
|
||||
installGracefulShutdown({
|
||||
redis,
|
||||
pool,
|
||||
consumer,
|
||||
metricsServer,
|
||||
pgHealth,
|
||||
lagSampler,
|
||||
logger,
|
||||
});
|
||||
|
||||
logger.info(
|
||||
{
|
||||
stream: config.REDIS_TELEMETRY_STREAM,
|
||||
group: config.REDIS_CONSUMER_GROUP,
|
||||
consumer: config.REDIS_CONSUMER_NAME,
|
||||
metricsPort: config.METRICS_PORT,
|
||||
},
|
||||
'processor ready',
|
||||
);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Graceful shutdown stub — Phase 3 finalizes this
|
||||
// Graceful shutdown — Phase 3 finalizes this
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
type ShutdownDeps = {
|
||||
readonly redis: Redis;
|
||||
readonly pool: pg.Pool;
|
||||
readonly consumer: { stop: () => Promise<void> };
|
||||
readonly metricsServer: http.Server;
|
||||
readonly pgHealth: { stop: () => void };
|
||||
readonly lagSampler: { stop: () => void };
|
||||
readonly logger: ReturnType<typeof createLogger>;
|
||||
};
|
||||
|
||||
function installGracefulShutdown(deps: ShutdownDeps): void {
|
||||
const { redis, pool, consumer, logger: log } = deps;
|
||||
const { redis, pool, consumer, metricsServer, pgHealth, lagSampler, logger: log } = deps;
|
||||
|
||||
let shuttingDown = false;
|
||||
|
||||
@@ -139,11 +181,22 @@ function installGracefulShutdown(deps: ShutdownDeps): void {
|
||||
|
||||
log.info({ signal }, 'shutdown signal received');
|
||||
|
||||
// Stop consumer loop — exits after the current batch finishes.
|
||||
// Cancel background intervals first — they hold no resources that need
|
||||
// draining, and stopping them early prevents spurious log noise during
|
||||
// the shutdown sequence.
|
||||
lagSampler.stop();
|
||||
pgHealth.stop();
|
||||
|
||||
consumer
|
||||
.stop()
|
||||
.then(() => {
|
||||
log.info('consumer stopped');
|
||||
return new Promise<void>((resolve, reject) =>
|
||||
metricsServer.close((err) => (err ? reject(err) : resolve())),
|
||||
);
|
||||
})
|
||||
.then(() => {
|
||||
log.info('metrics server closed');
|
||||
return redis.quit();
|
||||
})
|
||||
.then(() => {
|
||||
|
||||
Reference in New Issue
Block a user