diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index b25ff8b..675265e 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -41,7 +41,7 @@ These rules govern every task. Any deviation must be discussed and documented as ### Phase 1 โ€” Inbound telemetry (Codec 8, 8E, 16) -**Status:** ๐ŸŸจ In progress (core implementation done; observability + hardening + device authority paused for pilot test) +**Status:** ๐ŸŸจ In progress (observability landed; hardening + device authority paused for pilot test) **Outcome:** A production-ready Node.js TCP server ingesting Teltonika telemetry from any FMB/FMC/FMM/FMU device, publishing normalized `Position` records to Redis Streams, with full observability and CI/CD via Gitea. [**See `phase-1-telemetry/README.md`**](./phase-1-telemetry/README.md) @@ -57,18 +57,17 @@ These rules govern every task. Any deviation must be discussed and documented as | 1.7 | [Codec 16 parser (incl. Generation Type)](./phase-1-telemetry/07-codec-16.md) | ๐ŸŸฉ | `381287b` | | 1.8 | [Redis Streams publisher & main wiring](./phase-1-telemetry/08-redis-publisher.md) | ๐ŸŸฉ | `af06973` | | 1.9 | [Fixture suite & testing strategy](./phase-1-telemetry/09-fixture-suite.md) | ๐ŸŸฉ | `381287b` | -| 1.10 | [Observability (Prometheus metrics)](./phase-1-telemetry/10-observability.md) | โธ | *deferred โ€” see below* | +| 1.10 | [Observability (Prometheus metrics)](./phase-1-telemetry/10-observability.md) | ๐ŸŸฉ | *(pending commit SHA)* | | 1.11 | [Dockerfile & Gitea workflow](./phase-1-telemetry/11-dockerfile-and-ci.md) | ๐ŸŸฉ | `88b742d` (slim pilot variant) | | 1.12 | [Production hardening](./phase-1-telemetry/12-production-hardening.md) | โธ | *deferred โ€” see below* | | 1.13 | [Device authority (Redis allow-list refresher)](./phase-1-telemetry/13-device-authority.md) | โธ | *deferred โ€” see below* | #### Deferred (resume after the real-device pilot test) -These three tasks are paused so we can get the service onto real hardware as fast as possible. They are paused, not cancelled โ€” each must be completed before the service is considered production-ready. +These two tasks are paused so we can get the service onto real hardware as fast as possible. They are paused, not cancelled โ€” each must be completed before the service is considered production-ready. -- **1.10 Observability (Prometheus metrics).** Tracking via the placeholder `Metrics` interface for now. **Resume trigger:** as soon as the pilot is generating real traffic and we want to measure it, *or* before any second instance is deployed (without metrics, "consumer lag" and "unknown codec" alerts cannot fire). - **1.12 Production hardening.** Graceful shutdown is a stub today; uncaught-exception handlers are minimal. **Resume trigger:** before the pilot graduates to "always-on" or before any deployment that does rolling restarts. Acceptable for a manual pilot where we can stop/start the process by hand. -- **1.13 Device authority (Redis allow-list refresher).** Default `AllowAllAuthority` accepts every IMEI; observability of `known | unknown` is moot until 1.10 lands. **Resume trigger:** when Directus has a `devices` collection publishing the allow-list to Redis, or when the operational picture demands rejecting unknown IMEIs (`STRICT_DEVICE_AUTH=true`). +- **1.13 Device authority (Redis allow-list refresher).** Default `AllowAllAuthority` accepts every IMEI. **Resume trigger:** when Directus has a `devices` collection publishing the allow-list to Redis, or when the operational picture demands rejecting unknown IMEIs (`STRICT_DEVICE_AUTH=true`). When resuming any of these, change the status from โธ back to โฌœ or ๐ŸŸจ here and in the task file's status badge, and clear the deferral note in the task file. diff --git a/.planning/phase-1-telemetry/10-observability.md b/.planning/phase-1-telemetry/10-observability.md index 063b8c0..4453674 100644 --- a/.planning/phase-1-telemetry/10-observability.md +++ b/.planning/phase-1-telemetry/10-observability.md @@ -1,7 +1,7 @@ # Task 1.10 โ€” Observability (Prometheus metrics) **Phase:** 1 โ€” Inbound telemetry -**Status:** โธ Paused โ€” deferred until after the real-device pilot test. See ROADMAP.md "Deferred" section for resume triggers. The placeholder `Metrics` interface in `src/core/types.ts` is what code currently uses; this task replaces it with `prom-client` and adds the `/metrics`, `/healthz`, `/readyz` HTTP endpoints. +**Status:** ๐ŸŸฉ **Depends on:** 1.2, 1.3 **Wiki refs:** `docs/wiki/sources/teltonika-ingestion-architecture.md` ยง 7. Observability, `docs/wiki/sources/gps-tracking-architecture.md` ยง 7.4 @@ -81,4 +81,4 @@ Use Node's `node:http` directly โ€” no Express/Fastify dependency for two endpoi ## Done -(Fill in once complete.) +Implemented `src/observability/metrics.ts` with `createMetrics()`, `startMetricsServer()`, and `ReadyzDeps`. Replaced the placeholder shim in `src/main.ts`, wired metrics server into boot and graceful shutdown, downgraded `frame ingested` log to debug, and re-enabled the Dockerfile `HEALTHCHECK`. Landed in: *(pending commit SHA)* diff --git a/Dockerfile b/Dockerfile index 5069b07..beaf6ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,9 +24,8 @@ COPY --from=build --chown=app:app /app/node_modules ./node_modules COPY --from=build --chown=app:app /app/dist ./dist COPY --from=build --chown=app:app /app/package.json ./package.json USER app -# Only the TCP port is exposed. METRICS_PORT (9090) is in the config schema but -# no HTTP server runs today โ€” task 1.10 (observability) adds that server. EXPOSE 5027 -# HEALTHCHECK deferred โ€” re-add `wget -qO- http://localhost:${METRICS_PORT}/readyz` -# when task 1.10 (observability) ships and the HTTP server is running. +EXPOSE 9090 +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD wget -qO- http://localhost:9090/readyz || exit 1 CMD ["node", "dist/main.js"] diff --git a/README.md b/README.md index f20a0ca..f2302cc 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,6 @@ The workflow uses `secrets.REGISTRY_USERNAME` and `secrets.REGISTRY_PASSWORD` fo This service is running in pilot form. The following tasks are **paused** โ€” they are not missing by accident, they are deferred by design to get onto real Teltonika hardware first: -- **Observability (task 1.10):** No `/metrics`, `/healthz`, or `/readyz` HTTP endpoints exist yet. `METRICS_PORT` is in the config schema but nothing listens on it. The Docker `HEALTHCHECK` is also absent for this reason. - **Production hardening (task 1.12):** Graceful shutdown is a functional stub; uncaught-exception handling is minimal. - **Device authority (task 1.13):** `AllowAllAuthority` is active โ€” every IMEI is accepted. `STRICT_DEVICE_AUTH=true` is wired but the Redis allow-list refresher is not yet implemented. diff --git a/src/adapters/teltonika/index.ts b/src/adapters/teltonika/index.ts index faf3dc7..1b0669a 100644 --- a/src/adapters/teltonika/index.ts +++ b/src/adapters/teltonika/index.ts @@ -217,9 +217,9 @@ export function createTeltonikaAdapter(options: TeltonikaAdapterOptions): Adapte result: 'ok', }); - // Pilot-stage visibility into per-frame ingest. Downgrade to debug - // once task 1.10 wires up prom-client and frames_total is scrapeable. - sessionLogger.info( + // teltonika_frames_total{result="ok"} and teltonika_records_published_total + // now carry this signal in Prometheus; keep the log at debug to avoid noise. + sessionLogger.debug( { codec: codecLabel, records: result.recordCount }, 'frame ingested', ); diff --git a/src/main.ts b/src/main.ts index fba738f..2efe28a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1,13 +1,14 @@ import type { Redis } from 'ioredis'; +import type * as http from 'node:http'; import type * as net from 'node:net'; import { loadConfig } from './config/load.js'; import type { Config } from './config/load.js'; import { createLogger } from './observability/logger.js'; +import { createMetrics, startMetricsServer } from './observability/metrics.js'; import { createPublisher, connectRedis } from './core/publish.js'; import { startServer } from './core/server.js'; import { createTeltonikaAdapter } from './adapters/teltonika/index.js'; import { AllowAllAuthority } from './adapters/teltonika/device-authority.js'; -import type { Metrics } from './core/types.js'; // ------------------------------------------------------------------------- // Startup: validate config (fail fast on bad env), build logger, boot server @@ -31,12 +32,8 @@ const logger = createLogger({ logger.info('tcp-ingestion starting'); -// Placeholder metrics implementation โ€” replaced in task 1.10. -// Using the Metrics interface from types.ts (no prom-client yet). -const metrics: Metrics = { - inc: (name, labels) => logger.trace({ metric: name, labels }, 'metric inc'), - observe: (name, value, labels) => logger.trace({ metric: name, value, labels }, 'metric observe'), -}; +// Real prom-client metrics implementation (task 1.10). +const metrics = createMetrics(); // ------------------------------------------------------------------------- // Wire up the pipeline @@ -65,10 +62,18 @@ async function main(): Promise { metrics, }); - // 5. Install graceful shutdown (stub โ€” full hardening in task 1.12) - installGracefulShutdown({ server, redis, publisher, logger }); + // 5. Start metrics HTTP server (task 1.10). + // readyzDeps use ioredis's synchronous `.status` field and net.Server's + // `.listening` boolean โ€” no I/O, so these closures are always cheap. + const metricsServer = startMetricsServer(config.METRICS_PORT, metrics.serializeMetrics, { + isRedisReady: () => redis.status === 'ready', + isTcpListening: () => server.listening, + }); - logger.info({ port: config.TELTONIKA_PORT }, 'tcp-ingestion ready'); + // 6. Install graceful shutdown (stub โ€” full hardening in task 1.12) + installGracefulShutdown({ server, metricsServer, redis, publisher, logger }); + + logger.info({ port: config.TELTONIKA_PORT, metricsPort: config.METRICS_PORT }, 'tcp-ingestion ready'); } // ------------------------------------------------------------------------- @@ -77,13 +82,14 @@ async function main(): Promise { type ShutdownDeps = { readonly server: net.Server; + readonly metricsServer: http.Server; readonly redis: Redis; readonly publisher: { drain(timeoutMs: number): Promise }; readonly logger: ReturnType; }; function installGracefulShutdown(deps: ShutdownDeps): void { - const { server, redis, publisher, logger: log } = deps; + const { server, metricsServer, redis, publisher, logger: log } = deps; let shuttingDown = false; @@ -93,11 +99,15 @@ function installGracefulShutdown(deps: ShutdownDeps): void { log.info({ signal }, 'shutdown signal received'); - // Stop accepting new connections + // Stop accepting new TCP connections server.close(() => { log.info('TCP server closed'); }); + // Close the metrics HTTP server before quitting Redis so /readyz reports + // not-ready during the drain window (task 1.12 will tighten this further). + metricsServer.close(); + // Drain publisher queue then disconnect Redis publisher .drain(10_000) diff --git a/src/observability/metrics.ts b/src/observability/metrics.ts new file mode 100644 index 0000000..5dccf09 --- /dev/null +++ b/src/observability/metrics.ts @@ -0,0 +1,326 @@ +import * as http from 'node:http'; +import { + Registry, + Counter, + Gauge, + Histogram, + collectDefaultMetrics, +} from 'prom-client'; +import type { Metrics } from '../core/types.js'; + +// --------------------------------------------------------------------------- +// Readiness probe dependencies โ€” injected to keep this module free of +// adapters/ and core/ imports that would violate the layering rule. +// --------------------------------------------------------------------------- + +export type ReadyzDeps = { + /** + * Returns `true` when the Redis connection is ready for commands. + * Typically: `() => redis.status === 'ready'` + */ + readonly isRedisReady: () => boolean; + /** + * Returns `true` when the TCP listener is bound. + * Typically: `() => tcpServer.listening` + */ + readonly isTcpListening: () => boolean; +}; + +// --------------------------------------------------------------------------- +// Internal metric registry type โ€” one typed field per metric in the inventory. +// All mutation goes through the Metrics interface; the internal fields are +// only needed to call prom-client's own APIs (inc/set/observe). +// --------------------------------------------------------------------------- + +type InternalRegistry = { + readonly registry: Registry; + readonly connectionsActive: Gauge; + readonly handshakeTotal: Counter; + readonly deviceAuthorityFailuresTotal: Counter; + readonly framesTotal: Counter; + readonly recordsPublishedTotal: Counter; + readonly parseDurationSeconds: Histogram; + readonly unknownCodecTotal: Counter; + readonly publishQueueDepth: Gauge; + readonly publishOverflowTotal: Counter; + readonly publishDurationSeconds: Histogram; +}; + +// --------------------------------------------------------------------------- +// createMetrics โ€” builds the full prom-client registry and returns a Metrics +// wrapper that satisfies the existing call-site interface. +// --------------------------------------------------------------------------- + +/** + * Builds a fresh prom-client `Registry`, registers every metric in the Phase 1 + * inventory, and returns: + * - a `Metrics` object (satisfies `src/core/types.ts:Metrics`) for injection + * into adapters and the publisher + * - a `serializeMetrics()` function for exposition format + * + * `collectDefaultMetrics` is called once to enable Node.js process metrics + * (GC, event loop lag, heap stats, etc.) under the same registry. + */ +export function createMetrics(): Metrics & { + serializeMetrics: () => Promise; +} { + const internal = buildInternalRegistry(); + + // Expose default Node.js process metrics (nodejs_*) on the same registry. + collectDefaultMetrics({ register: internal.registry }); + + const metricsImpl: Metrics & { serializeMetrics: () => Promise } = { + inc(name: string, labels?: Record): void { + dispatchInc(internal, name, labels); + }, + + observe(name: string, value: number, labels?: Record): void { + dispatchObserve(internal, name, value, labels); + }, + + serializeMetrics(): Promise { + return internal.registry.metrics(); + }, + }; + + return metricsImpl; +} + +// --------------------------------------------------------------------------- +// serializeMetrics โ€” standalone helper for use outside the Metrics wrapper. +// Exported so startMetricsServer can call it without holding a reference to +// the internal registry. +// --------------------------------------------------------------------------- + +// Note: this is accessed via the metricsImpl.serializeMetrics closure above. +// A standalone export is not required; the server takes the bound method. + +// --------------------------------------------------------------------------- +// startMetricsServer โ€” minimal node:http server for /metrics, /healthz, /readyz +// --------------------------------------------------------------------------- + +/** + * Starts the Prometheus metrics HTTP server on the given port. + * + * Endpoints: + * GET /metrics โ€” Prometheus exposition format (text/plain; version=0.0.4) + * GET /healthz โ€” 200 if the process is alive (liveness probe) + * GET /readyz โ€” 200 if Redis is connected AND TCP server is listening; + * 503 otherwise (readiness probe) + * + * @param port Port to bind; 0 lets the OS pick (useful in tests). + * @param serializeMetrics Function that returns the Prometheus text format. + * @param readyzDeps Sync accessors for Redis and TCP readiness state. + */ +export function startMetricsServer( + port: number, + serializeMetrics: () => Promise, + readyzDeps: ReadyzDeps, +): http.Server { + const server = http.createServer((req, res) => { + const url = req.url ?? '/'; + const method = req.method ?? 'GET'; + + // Reject non-GET requests for all endpoints. + if (method !== 'GET') { + res.writeHead(405, { 'Content-Type': 'text/plain' }); + res.end('Method Not Allowed'); + return; + } + + if (url === '/metrics') { + serializeMetrics() + .then((text) => { + res.writeHead(200, { 'Content-Type': 'text/plain; version=0.0.4; charset=utf-8' }); + res.end(text); + }) + .catch((err: unknown) => { + res.writeHead(500, { 'Content-Type': 'text/plain' }); + res.end(`Internal Server Error: ${err instanceof Error ? err.message : String(err)}`); + }); + return; + } + + if (url === '/healthz') { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ status: 'ok' })); + return; + } + + if (url === '/readyz') { + const redisOk = readyzDeps.isRedisReady(); + const tcpOk = readyzDeps.isTcpListening(); + + if (redisOk && tcpOk) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ status: 'ok' })); + } else { + res.writeHead(503, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ status: 'not ready', redis: redisOk, tcp: tcpOk })); + } + return; + } + + res.writeHead(404, { 'Content-Type': 'text/plain' }); + res.end('Not Found'); + }); + + server.listen(port); + return server; +} + +// --------------------------------------------------------------------------- +// Private: registry construction +// --------------------------------------------------------------------------- + +function buildInternalRegistry(): InternalRegistry { + const registry = new Registry(); + + const connectionsActive = new Gauge({ + name: 'teltonika_connections_active', + help: 'Currently open device sessions.', + registers: [registry], + }); + + const handshakeTotal = new Counter({ + name: 'teltonika_handshake_total', + help: 'IMEI handshake outcomes.', + labelNames: ['result', 'known'], + registers: [registry], + }); + + const deviceAuthorityFailuresTotal = new Counter({ + name: 'teltonika_device_authority_failures_total', + help: 'Times a DeviceAuthority.check call threw or timed out.', + registers: [registry], + }); + + const framesTotal = new Counter({ + name: 'teltonika_frames_total', + help: 'Frame-level outcomes.', + labelNames: ['codec', 'result'], + registers: [registry], + }); + + const recordsPublishedTotal = new Counter({ + name: 'teltonika_records_published_total', + help: 'AVL records emitted to Redis.', + labelNames: ['codec'], + registers: [registry], + }); + + const parseDurationSeconds = new Histogram({ + name: 'teltonika_parse_duration_seconds', + help: 'Per-frame parse time.', + labelNames: ['codec'], + buckets: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], + registers: [registry], + }); + + const unknownCodecTotal = new Counter({ + name: 'teltonika_unknown_codec_total', + help: 'Canary for codec coverage drift.', + labelNames: ['codec_id'], + registers: [registry], + }); + + const publishQueueDepth = new Gauge({ + name: 'teltonika_publish_queue_depth', + help: 'Current bounded-queue depth.', + registers: [registry], + }); + + const publishOverflowTotal = new Counter({ + name: 'teltonika_publish_overflow_total', + help: 'Records dropped because the queue was full.', + registers: [registry], + }); + + const publishDurationSeconds = new Histogram({ + name: 'teltonika_publish_duration_seconds', + help: 'XADD latency.', + buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0], + registers: [registry], + }); + + return { + registry, + connectionsActive, + handshakeTotal, + deviceAuthorityFailuresTotal, + framesTotal, + recordsPublishedTotal, + parseDurationSeconds, + unknownCodecTotal, + publishQueueDepth, + publishOverflowTotal, + publishDurationSeconds, + }; +} + +// --------------------------------------------------------------------------- +// Private: dispatch helpers โ€” map string metric names to typed prom-client calls +// --------------------------------------------------------------------------- + +function dispatchInc( + r: InternalRegistry, + name: string, + labels?: Record, +): void { + switch (name) { + case 'teltonika_handshake_total': + r.handshakeTotal.inc(labels ?? {}); + break; + case 'teltonika_device_authority_failures_total': + r.deviceAuthorityFailuresTotal.inc(); + break; + case 'teltonika_frames_total': + r.framesTotal.inc(labels ?? {}); + break; + case 'teltonika_records_published_total': + r.recordsPublishedTotal.inc(labels ?? {}); + break; + case 'teltonika_unknown_codec_total': + r.unknownCodecTotal.inc(labels ?? {}); + break; + case 'teltonika_publish_overflow_total': + r.publishOverflowTotal.inc(labels ?? {}); + break; + case 'teltonika_connections_active': + // inc() on a gauge means +1 (connection opened) + r.connectionsActive.inc(); + break; + default: + // Unknown metric name โ€” silently ignore. This preserves the contract + // that the Metrics interface never throws, and avoids crashing the + // process when a call site references a metric not yet in the registry + // (e.g. during staged rollouts or future tasks). + break; + } +} + +function dispatchObserve( + r: InternalRegistry, + name: string, + value: number, + labels?: Record, +): void { + switch (name) { + case 'teltonika_parse_duration_seconds': + r.parseDurationSeconds.observe(labels ?? {}, value); + break; + case 'teltonika_publish_duration_seconds': + r.publishDurationSeconds.observe(value); + break; + case 'teltonika_publish_queue_depth': + r.publishQueueDepth.set(value); + break; + case 'teltonika_connections_active': + // observe() on a gauge means set(value) โ€” caller controls the value + r.connectionsActive.set(value); + break; + default: + // Unknown metric name โ€” silently ignore (see dispatchInc comment). + break; + } +} diff --git a/test/metrics.test.ts b/test/metrics.test.ts new file mode 100644 index 0000000..b521c0f --- /dev/null +++ b/test/metrics.test.ts @@ -0,0 +1,360 @@ +/** + * Unit tests for src/observability/metrics.ts + * + * Covers: + * - serializeMetrics() returns valid Prometheus exposition format with the + * full metric inventory present (some at zero) + * - counter increments are reflected in the serialized output + * - teltonika_unknown_codec_total{codec_id="0xff"} appears after an unknown-codec event + * - startMetricsServer responds correctly to /metrics, /healthz, and /readyz + * - /readyz returns 503 when Redis is not ready or TCP is not listening + */ + +import { describe, it, expect, afterEach } from 'vitest'; +import * as http from 'node:http'; +import { createMetrics, startMetricsServer } from '../src/observability/metrics.js'; +import type { ReadyzDeps } from '../src/observability/metrics.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** + * Makes an HTTP GET request and resolves with { statusCode, body }. + * Uses the raw node:http module to match the server's implementation. + */ +function get(port: number, path: string): Promise<{ statusCode: number; body: string }> { + return new Promise((resolve, reject) => { + const req = http.get(`http://localhost:${port}${path}`, (res) => { + let body = ''; + res.on('data', (chunk: Buffer) => { + body += chunk.toString(); + }); + res.on('end', () => { + resolve({ statusCode: res.statusCode ?? 0, body }); + }); + }); + req.on('error', reject); + }); +} + +/** + * Waits for a server to be listening, then runs `fn`, then closes the server. + * Returns after the server is fully closed. + */ +function withServer( + server: http.Server, + fn: (port: number) => Promise, +): Promise { + return new Promise((resolve, reject) => { + server.once('listening', () => { + const addr = server.address(); + const port = typeof addr === 'object' && addr !== null ? addr.port : 0; + + fn(port) + .then(() => { + server.close((err) => { + if (err) reject(err); + else resolve(); + }); + }) + .catch((err: unknown) => { + server.close(() => reject(err)); + }); + }); + server.once('error', reject); + }); +} + +// --------------------------------------------------------------------------- +// 1. serializeMetrics โ€” Prometheus exposition format +// --------------------------------------------------------------------------- + +describe('createMetrics โ€” serializeMetrics', () => { + it('returns a non-empty string', async () => { + const metrics = createMetrics(); + const text = await metrics.serializeMetrics(); + expect(typeof text).toBe('string'); + expect(text.length).toBeGreaterThan(0); + }); + + it('contains every teltonika_* metric in the Phase 1 inventory', async () => { + const metrics = createMetrics(); + const text = await metrics.serializeMetrics(); + + const expectedMetrics = [ + 'teltonika_connections_active', + 'teltonika_handshake_total', + 'teltonika_device_authority_failures_total', + 'teltonika_frames_total', + 'teltonika_records_published_total', + 'teltonika_parse_duration_seconds', + 'teltonika_unknown_codec_total', + 'teltonika_publish_queue_depth', + 'teltonika_publish_overflow_total', + 'teltonika_publish_duration_seconds', + ]; + + for (const name of expectedMetrics) { + expect(text, `expected "${name}" in metrics output`).toContain(name); + } + }); + + it('contains nodejs_* default process metrics', async () => { + const metrics = createMetrics(); + const text = await metrics.serializeMetrics(); + // prom-client collectDefaultMetrics registers nodejs_version_info at minimum + expect(text).toContain('nodejs_'); + }); + + it('exposition format contains HELP and TYPE lines', async () => { + const metrics = createMetrics(); + const text = await metrics.serializeMetrics(); + expect(text).toContain('# HELP teltonika_handshake_total'); + expect(text).toContain('# TYPE teltonika_handshake_total counter'); + expect(text).toContain('# HELP teltonika_frames_total'); + expect(text).toContain('# TYPE teltonika_frames_total counter'); + }); +}); + +// --------------------------------------------------------------------------- +// 2. Counter increments +// --------------------------------------------------------------------------- + +describe('createMetrics โ€” counter increments', () => { + it('increments teltonika_frames_total and teltonika_records_published_total', async () => { + const metrics = createMetrics(); + + // Simulate a successful Codec 8E frame + metrics.inc('teltonika_frames_total', { codec: '8E', result: 'ok' }); + metrics.inc('teltonika_records_published_total', { codec: '8E' }); + + const text = await metrics.serializeMetrics(); + + // The exposition format for a counter with labels looks like: + // teltonika_frames_total{codec="8E",result="ok"} 1 + expect(text).toMatch(/teltonika_frames_total\{[^}]*codec="8E"[^}]*result="ok"[^}]*\}\s+1/); + expect(text).toMatch(/teltonika_records_published_total\{[^}]*codec="8E"[^}]*\}\s+1/); + }); + + it('accumulates multiple increments on the same label set', async () => { + const metrics = createMetrics(); + + metrics.inc('teltonika_frames_total', { codec: '8', result: 'ok' }); + metrics.inc('teltonika_frames_total', { codec: '8', result: 'ok' }); + metrics.inc('teltonika_frames_total', { codec: '8', result: 'ok' }); + + const text = await metrics.serializeMetrics(); + expect(text).toMatch(/teltonika_frames_total\{[^}]*codec="8"[^}]*result="ok"[^}]*\}\s+3/); + }); + + it('tracks crc_fail result separately from ok', async () => { + const metrics = createMetrics(); + + metrics.inc('teltonika_frames_total', { codec: '8', result: 'ok' }); + metrics.inc('teltonika_frames_total', { codec: '8', result: 'crc_fail' }); + + const text = await metrics.serializeMetrics(); + expect(text).toMatch(/teltonika_frames_total\{[^}]*codec="8"[^}]*result="ok"[^}]*\}\s+1/); + expect(text).toMatch(/teltonika_frames_total\{[^}]*codec="8"[^}]*result="crc_fail"[^}]*\}\s+1/); + }); +}); + +// --------------------------------------------------------------------------- +// 3. Unknown codec canary +// --------------------------------------------------------------------------- + +describe('createMetrics โ€” unknown codec canary', () => { + it('records teltonika_unknown_codec_total{codec_id="0xff"} after an unknown-codec event', async () => { + const metrics = createMetrics(); + + metrics.inc('teltonika_unknown_codec_total', { codec_id: '0xff' }); + + const text = await metrics.serializeMetrics(); + expect(text).toMatch(/teltonika_unknown_codec_total\{[^}]*codec_id="0xff"[^}]*\}\s+1/); + }); + + it('distinguishes different codec_id values', async () => { + const metrics = createMetrics(); + + metrics.inc('teltonika_unknown_codec_total', { codec_id: '0x0f' }); + metrics.inc('teltonika_unknown_codec_total', { codec_id: '0x0f' }); + metrics.inc('teltonika_unknown_codec_total', { codec_id: '0x20' }); + + const text = await metrics.serializeMetrics(); + expect(text).toMatch(/teltonika_unknown_codec_total\{[^}]*codec_id="0x0f"[^}]*\}\s+2/); + expect(text).toMatch(/teltonika_unknown_codec_total\{[^}]*codec_id="0x20"[^}]*\}\s+1/); + }); +}); + +// --------------------------------------------------------------------------- +// 4. observe() โ€” gauge and histogram +// --------------------------------------------------------------------------- + +describe('createMetrics โ€” observe', () => { + it('sets teltonika_publish_queue_depth gauge', async () => { + const metrics = createMetrics(); + + metrics.observe('teltonika_publish_queue_depth', 42); + + const text = await metrics.serializeMetrics(); + expect(text).toMatch(/teltonika_publish_queue_depth\s+42/); + }); + + it('records teltonika_parse_duration_seconds histogram observation', async () => { + const metrics = createMetrics(); + + metrics.observe('teltonika_parse_duration_seconds', 0.0003, { codec: '8E' }); + + const text = await metrics.serializeMetrics(); + // The histogram sum should contain the observed value + expect(text).toMatch(/teltonika_parse_duration_seconds_sum\{[^}]*codec="8E"[^}]*\}\s+0\.0003/); + }); + + it('ignores unknown metric names without throwing', () => { + const metrics = createMetrics(); + // Must not throw โ€” the Metrics interface contract is never-throw + expect(() => metrics.inc('teltonika_nonexistent_metric')).not.toThrow(); + expect(() => metrics.observe('teltonika_nonexistent_metric', 1.0)).not.toThrow(); + }); +}); + +// --------------------------------------------------------------------------- +// 5. startMetricsServer โ€” HTTP endpoint behaviour +// --------------------------------------------------------------------------- + +describe('startMetricsServer', () => { + // Track servers for cleanup in case of test failure + const openServers: http.Server[] = []; + + afterEach(() => { + for (const s of openServers) { + if (s.listening) s.close(); + } + openServers.length = 0; + }); + + it('GET /metrics returns 200 with Prometheus text', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => true, + isTcpListening: () => true, + }; + + // port=0 โ†’ OS picks a free port + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode, body } = await get(port, '/metrics'); + expect(statusCode).toBe(200); + expect(body).toContain('teltonika_'); + expect(body).toContain('nodejs_'); + }); + }); + + it('GET /healthz returns 200 regardless of Redis/TCP state', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => false, // deliberately unhealthy + isTcpListening: () => false, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode, body } = await get(port, '/healthz'); + expect(statusCode).toBe(200); + expect(JSON.parse(body)).toEqual({ status: 'ok' }); + }); + }); + + it('GET /readyz returns 200 when Redis is ready and TCP is listening', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => true, + isTcpListening: () => true, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode, body } = await get(port, '/readyz'); + expect(statusCode).toBe(200); + expect(JSON.parse(body)).toEqual({ status: 'ok' }); + }); + }); + + it('GET /readyz returns 503 when Redis is not ready', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => false, + isTcpListening: () => true, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode, body } = await get(port, '/readyz'); + expect(statusCode).toBe(503); + const parsed = JSON.parse(body) as { status: string; redis: boolean; tcp: boolean }; + expect(parsed.status).toBe('not ready'); + expect(parsed.redis).toBe(false); + expect(parsed.tcp).toBe(true); + }); + }); + + it('GET /readyz returns 503 when TCP server is not listening', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => true, + isTcpListening: () => false, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode } = await get(port, '/readyz'); + expect(statusCode).toBe(503); + }); + }); + + it('GET /readyz returns 503 when both Redis and TCP are down', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => false, + isTcpListening: () => false, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode, body } = await get(port, '/readyz'); + expect(statusCode).toBe(503); + const parsed = JSON.parse(body) as { status: string; redis: boolean; tcp: boolean }; + expect(parsed.redis).toBe(false); + expect(parsed.tcp).toBe(false); + }); + }); + + it('GET /unknown-path returns 404', async () => { + const metrics = createMetrics(); + const readyzDeps: ReadyzDeps = { + isRedisReady: () => true, + isTcpListening: () => true, + }; + + const server = startMetricsServer(0, metrics.serializeMetrics, readyzDeps); + openServers.push(server); + + await withServer(server, async (port) => { + const { statusCode } = await get(port, '/not-a-real-endpoint'); + expect(statusCode).toBe(404); + }); + }); +});