Understanding API Observability
API monitoring and logging are essential for maintaining system health, debugging issues, understanding user behavior, and ensuring SLAs are met. Observability encompasses logs, metrics, and traces that provide visibility into your API's behavior in production environments.
The Three Pillars of Observability:
- Logs: Detailed records of discrete events (requests, errors, warnings)
- Metrics: Numerical measurements over time (response times, request rates, error rates)
- Traces: End-to-end request flow across distributed services
Request Logging
Comprehensive request logging helps you understand API usage patterns, identify errors, and debug production issues.
Structured Logging with Winston
// winston-logger.js
const winston = require('winston');
// Create the shared Winston logger: JSON entries with timestamps and
// stack traces, written to size-rotated files; console output is added
// everywhere except production.
const FILE_ROTATION = { maxsize: 5242880 /* 5MB */, maxFiles: 5 };

const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(
    winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
    winston.format.errors({ stack: true }), // serialize Error stacks
    winston.format.splat(),                 // printf-style interpolation
    winston.format.json()
  ),
  defaultMeta: { service: 'api-service' },
  transports: [
    // error-level entries only
    new winston.transports.File({
      filename: 'logs/error.log',
      level: 'error',
      ...FILE_ROTATION
    }),
    // everything at or above the configured level
    new winston.transports.File({
      filename: 'logs/combined.log',
      ...FILE_ROTATION
    })
  ]
});

// Human-readable, colorized console output outside production.
if (process.env.NODE_ENV !== 'production') {
  logger.add(new winston.transports.Console({
    format: winston.format.combine(
      winston.format.colorize(),
      winston.format.simple()
    )
  }));
}

module.exports = logger;
Request Logging Middleware
const express = require('express');
const logger = require('./winston-logger');
const { v4: uuidv4 } = require('uuid');
const app = express();
// Request logging middleware.
// Tags every request with a UUID and start time, logs the incoming
// request, and logs completion from the response 'finish' event.
// Using 'finish' — instead of monkey-patching res.send, which misses
// res.end()/streamed responses and can fire once per send() call —
// guarantees exactly one completion entry per response.
app.use((req, res, next) => {
  // Generate unique request ID for cross-log correlation
  req.id = uuidv4();
  req.startTime = Date.now();

  logger.info('Incoming request', {
    requestId: req.id,
    method: req.method,
    path: req.path,
    query: req.query,
    ip: req.ip,
    userAgent: req.get('user-agent')
  });

  // 'finish' fires once, after the response has been fully handed off.
  res.on('finish', () => {
    const duration = Date.now() - req.startTime;

    logger.info('Request completed', {
      requestId: req.id,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: `${duration}ms`,
      contentLength: res.get('content-length')
    });

    // Surface slow requests (>1s) at warn level for easier alerting.
    if (duration > 1000) {
      logger.warn('Slow request detected', {
        requestId: req.id,
        method: req.method,
        path: req.path,
        duration: `${duration}ms`
      });
    }
  });

  next();
});
// Error logging middleware (must be registered after the routes).
// Logs the error with full context, then returns a JSON body carrying
// the request ID so clients can quote it in support requests.
app.use((err, req, res, next) => {
  // Resolve the status once so the log and the response agree; reading
  // res.statusCode here would report the pre-error default (usually 200).
  const statusCode = err.statusCode || 500;

  logger.error('Request error', {
    requestId: req.id,
    method: req.method,
    path: req.path,
    error: {
      message: err.message,
      stack: err.stack,
      code: err.code
    },
    statusCode
  });

  // If headers are already on the wire we cannot send a JSON body;
  // delegate to Express's default handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  res.status(statusCode).json({
    error: {
      message: err.message,
      requestId: req.id
    }
  });
});
Structured Logging Benefits: JSON-formatted logs are easily parseable by log aggregation tools (ELK Stack, Splunk, Datadog). Include request IDs to trace requests across services. Always log timestamps for time-based analysis.
Advanced Logging with Morgan
const morgan = require('morgan');
const logger = require('./winston-logger');
// Expose the request ID (set by the logging middleware) as a Morgan token.
morgan.token('id', (req) => req.id);

// Millisecond latency relative to the start time stamped on the request.
// (Morgan also ships a built-in :response-time token.)
morgan.token('response-time-ms', (req) => `${Date.now() - req.startTime}ms`);

// Bridge Morgan's output into Winston at the 'http' level so all log
// lines flow through a single transport configuration.
const stream = {
  write: (message) => logger.http(message.trim())
};

// Line format combining the custom tokens with Morgan's built-ins.
const format = ':id :method :url :status :response-time-ms - :res[content-length]';
app.use(morgan(format, { stream }));
// Detailed Morgan formatter: emits one structured JSON object per request,
// suitable for ingestion by log aggregation tools.
const detailedFormat = (tokens, req, res) => {
  const entry = {
    requestId: tokens.id(req, res),
    method: tokens.method(req, res),
    url: tokens.url(req, res),
    status: tokens.status(req, res),
    responseTime: tokens['response-time'](req, res),
    contentLength: tokens.res(req, res, 'content-length'),
    userAgent: tokens['user-agent'](req, res),
    ip: tokens['remote-addr'](req, res)
  };
  return JSON.stringify(entry);
};
// NOTE(review): this registers a SECOND Morgan instance on top of the one
// above, so every request is logged twice — keep only one format in practice.
app.use(morgan(detailedFormat, { stream }));
Performance Metrics
Collect and monitor key performance metrics to identify bottlenecks and track system health over time.
Prometheus Metrics Integration
const express = require('express');
const prometheus = require('prom-client');
const app = express();
// Registry that owns every metric this service exposes on /metrics.
const register = new prometheus.Registry();

// Built-in process metrics (CPU, memory, event loop lag, ...).
prometheus.collectDefaultMetrics({ register });

// Latency distribution per method/route/status; bucket bounds in seconds.
const httpRequestDuration = new prometheus.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.1, 0.5, 1, 2, 5, 10]
});

// Monotonic request counter, labelled like the histogram above.
const httpRequestTotal = new prometheus.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// In-flight request gauge (incremented on arrival, decremented on finish).
const activeConnections = new prometheus.Gauge({
  name: 'http_active_connections',
  help: 'Number of active HTTP connections'
});

// Database latency distribution; finer buckets since queries are faster.
const databaseQueryDuration = new prometheus.Histogram({
  name: 'database_query_duration_seconds',
  help: 'Duration of database queries in seconds',
  labelNames: ['query_type', 'table'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2]
});

// Attach every custom metric to the registry in one pass.
const customMetrics = [
  httpRequestDuration,
  httpRequestTotal,
  activeConnections,
  databaseQueryDuration
];
for (const metric of customMetrics) {
  register.registerMetric(metric);
}
// Per-request metrics: bump the in-flight gauge immediately, then record
// duration and request count when the response finishes.
app.use((req, res, next) => {
  activeConnections.inc();
  const startedAt = Date.now();

  res.on('finish', () => {
    const seconds = (Date.now() - startedAt) / 1000;
    // Prefer the matched route pattern (e.g. /api/posts/:id) to keep
    // label cardinality bounded; fall back to the raw path.
    const route = req.route ? req.route.path : req.path;

    httpRequestDuration
      .labels(req.method, route, res.statusCode)
      .observe(seconds);
    httpRequestTotal
      .labels(req.method, route, res.statusCode)
      .inc();
    activeConnections.dec();
  });

  next();
});
// Prometheus scrape endpoint: renders the registry in the text
// exposition format under the registry's own content type.
app.get('/metrics', async (req, res) => {
  const body = await register.metrics();
  res.set('Content-Type', register.contentType);
  res.end(body);
});
/**
 * Run a database query and record its latency in the
 * database_query_duration_seconds histogram.
 *
 * The original duplicated the timing code in both the success and error
 * paths; try/finally records it exactly once either way. `return await`
 * (rather than a bare `return`) keeps the query's settlement inside the
 * try, so the finally block times the full round trip.
 *
 * @param {string} queryType - Label value, e.g. 'SELECT' or 'INSERT'.
 * @param {string} table - Label value for the table being queried.
 * @param {string} query - SQL text passed straight to db.query.
 * @param {Array} [params] - Bound parameters for the query.
 * @returns {Promise<*>} whatever db.query resolves with.
 * @throws rethrows any db.query error after recording the duration.
 */
async function executeQuery(queryType, table, query, params) {
  const start = Date.now();
  try {
    return await db.query(query, params);
  } finally {
    const duration = (Date.now() - start) / 1000; // seconds
    databaseQueryDuration
      .labels(queryType, table)
      .observe(duration);
  }
}
// Usage: the latency of this SELECT lands in the query histogram.
app.get('/api/posts', async (req, res) => {
  const rows = await executeQuery('SELECT', 'posts', 'SELECT * FROM posts LIMIT 10');
  res.json(rows);
});
Key Metrics to Track:
- Request rate: Requests per second (RPS)
- Error rate: Percentage of failed requests (4xx, 5xx)
- Response time: P50, P95, P99 percentiles
- Throughput: Data transferred per second
- Active connections: Number of concurrent connections
- Database performance: Query duration, connection pool usage
Health Checks
Implement health check endpoints to monitor API availability and dependencies.
Basic Health Check
// Minimal health check: no dependency probes, just confirms the
// process is up and the event loop is responsive.
app.get('/health', (req, res) => {
  const body = {
    status: 'ok',
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };
  res.status(200).json(body);
});
Comprehensive Health Check
const Redis = require('ioredis');
const mysql = require('mysql2/promise');
// Shared clients probed by the health endpoints below.
const redis = new Redis(); // connects with ioredis defaults unless configured
const dbPool = mysql.createPool({ /* config */ });
// Detailed health check with dependency probes.
// Responds 200/'ok' when all checks pass, 503/'degraded' when any
// dependency is unhealthy. Failures never throw out of the handler —
// each probe degrades the report instead.
app.get('/health', async (req, res) => {
  const health = {
    status: 'ok',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    checks: {}
  };

  // Database probe — measure real round-trip latency instead of the
  // hard-coded '<50ms' the original reported without timing anything.
  try {
    const start = Date.now();
    await dbPool.query('SELECT 1');
    health.checks.database = {
      status: 'healthy',
      latency: `${Date.now() - start}ms`
    };
  } catch (error) {
    health.status = 'degraded';
    health.checks.database = {
      status: 'unhealthy',
      error: error.message
    };
  }

  // Redis probe, timed the same way for consistency.
  try {
    const start = Date.now();
    await redis.ping();
    health.checks.redis = {
      status: 'healthy',
      latency: `${Date.now() - start}ms`
    };
  } catch (error) {
    health.status = 'degraded';
    health.checks.redis = {
      status: 'unhealthy',
      error: error.message
    };
  }

  // Process memory snapshot — informational only, never flips status.
  const memUsage = process.memoryUsage();
  health.checks.memory = {
    rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`,
    heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
    heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
  };

  // Return 503 if any dependency is unhealthy.
  const statusCode = health.status === 'ok' ? 200 : 503;
  res.status(statusCode).json(health);
});
// Readiness check (can service accept traffic?) — used by e.g.
// Kubernetes readiness probes. The two dependency checks are
// independent, so run them in parallel; Promise.all rejects on the
// first failure, which is exactly the fail-fast behavior we want.
app.get('/ready', async (req, res) => {
  try {
    await Promise.all([
      dbPool.query('SELECT 1'),
      redis.ping()
    ]);
    res.status(200).json({ ready: true });
  } catch (error) {
    res.status(503).json({
      ready: false,
      error: error.message
    });
  }
});
// Liveness check (is the process alive?) — answers whenever the event
// loop is responsive; no dependency probes by design.
app.get('/live', (req, res) => {
  res.status(200).json({ alive: true });
});
Health Check Best Practices: Implement separate /health (detailed status), /ready (can accept traffic), and /live (is process alive) endpoints. Health checks should be lightweight (<100ms). Don't expose sensitive information in health check responses.
Error Tracking and Alerting
Integrate error tracking services to capture, aggregate, and alert on production errors.
Sentry Integration
const Sentry = require('@sentry/node');
const express = require('express');
const app = express();
// Initialize Sentry before any routes so every request is instrumented.
Sentry.init({
  dsn: process.env.SENTRY_DSN, // project ingest URL; unset disables reporting
  environment: process.env.NODE_ENV,
  tracesSampleRate: 0.1, // Sample 10% of transactions
  integrations: [
    // Enable HTTP tracking
    new Sentry.Integrations.Http({ tracing: true }),
    // Enable Express.js middleware tracking
    new Sentry.Integrations.Express({ app })
  ]
  // NOTE(review): Sentry.Integrations / Sentry.Handlers is the classic
  // @sentry/node v6/v7 API; v8+ replaced these with httpIntegration()
  // and setupExpressErrorHandler() — confirm the installed SDK version.
});
// Request handler must be first middleware
app.use(Sentry.Handlers.requestHandler());
// Tracing handler for performance monitoring
app.use(Sentry.Handlers.tracingHandler());
// Example route: on failure, report to Sentry with tags and extra
// context, then return a generic 500 to the client.
app.get('/api/posts', async (req, res) => {
  try {
    const posts = await fetchPosts();
    res.json(posts);
  } catch (error) {
    // Manually capture with searchable tags and free-form extras.
    Sentry.captureException(error, {
      tags: { endpoint: '/api/posts', method: 'GET' },
      extra: { userId: req.user?.id, query: req.query }
    });
    res.status(500).json({ error: 'Failed to fetch posts' });
  }
});
// Error handler must be after routes
app.use(Sentry.Handlers.errorHandler());
// Custom error handler
// NOTE: `next` must remain in the signature — Express identifies error
// middleware by its 4-argument arity — even though it is unused here.
app.use((err, req, res, next) => {
  res.status(err.statusCode || 500).json({
    error: err.message,
    requestId: req.id
  });
});
Custom Alerting System
const nodemailer = require('nodemailer');
const logger = require('./winston-logger');
/**
 * Threshold-based email alerting.
 *
 * Counts occurrences of each distinct error (keyed by type + message)
 * inside a sliding 5-minute window and emails an alert once a key
 * reaches 10 occurrences, then resets that key's count.
 */
class AlertingService {
  constructor() {
    this.errorCounts = new Map(); // key -> array of occurrence timestamps (ms)
    this.alertThreshold = 10; // Alert after 10 errors in 5 minutes
    this.timeWindow = 5 * 60 * 1000; // 5 minutes
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      port: process.env.SMTP_PORT,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });
  }

  /**
   * Record one occurrence of an error; send an alert (and reset the
   * key's count) once the threshold is reached inside the window.
   */
  async trackError(errorType, error, context = {}) {
    const key = `${errorType}:${error.message}`;
    const now = Date.now();
    const cutoff = now - this.timeWindow;

    const seen = this.errorCounts.get(key) ?? [];
    seen.push(now);

    // Keep only occurrences inside the sliding window.
    const inWindow = seen.filter((ts) => ts > cutoff);
    this.errorCounts.set(key, inWindow);

    if (inWindow.length >= this.alertThreshold) {
      await this.sendAlert(errorType, error, inWindow.length, context);
      this.errorCounts.delete(key); // start counting from zero after alerting
    }
  }

  /** Email the alert; send failures are logged, never thrown. */
  async sendAlert(errorType, error, count, context) {
    const subject = `[ALERT] ${errorType}: ${error.message}`;
    const body = `
Error Type: ${errorType}
Error Message: ${error.message}
Occurrences: ${count} in last 5 minutes
Stack Trace: ${error.stack}
Context:
${JSON.stringify(context, null, 2)}
Time: ${new Date().toISOString()}
`;
    try {
      await this.transporter.sendMail({
        from: process.env.ALERT_FROM_EMAIL,
        to: process.env.ALERT_TO_EMAIL,
        subject,
        text: body
      });
      logger.info('Alert sent', { errorType, count });
    } catch (err) {
      logger.error('Failed to send alert', { error: err.message });
    }
  }

  /** Drop window entries older than the time window; called periodically. */
  cleanup() {
    const cutoff = Date.now() - this.timeWindow;
    for (const [key, timestamps] of this.errorCounts.entries()) {
      const inWindow = timestamps.filter((ts) => ts > cutoff);
      if (inWindow.length === 0) {
        this.errorCounts.delete(key);
      } else {
        this.errorCounts.set(key, inWindow);
      }
    }
  }
}
const alerting = new AlertingService();
// Cleanup every minute
// NOTE(review): consider .unref() on this timer so it does not keep the
// process alive during shutdown — confirm against the app's exit handling.
setInterval(() => alerting.cleanup(), 60000);
// Usage in error handling: log, feed the alerting service, respond.
app.use(async (err, req, res, next) => {
  // Resolve once so the log, the alert context, and the response agree;
  // res.statusCode at this point would still be the pre-error default.
  const statusCode = err.statusCode || 500;

  logger.error('API Error', {
    error: err.message,
    stack: err.stack,
    path: req.path
  });

  // Alerting must never prevent the client from getting a response —
  // an unguarded await here would leave the request hanging on failure.
  try {
    await alerting.trackError('API_ERROR', err, {
      method: req.method,
      path: req.path,
      statusCode
    });
  } catch (alertError) {
    logger.error('Failed to track error for alerting', { error: alertError.message });
  }

  res.status(statusCode).json({
    error: err.message
  });
});
Application Performance Monitoring (APM)
APM tools provide deep insights into application performance, database queries, external API calls, and distributed tracing.
New Relic Integration
// newrelic.js (must be first file required)
'use strict';
// New Relic agent configuration; picked up automatically when
// require('newrelic') is the very first require in the process.
exports.config = {
  app_name: ['My API'],
  license_key: process.env.NEW_RELIC_LICENSE_KEY,
  logging: {
    level: 'info'
  },
  transaction_tracer: {
    enabled: true,
    // 'apdex_f': only trace transactions slow enough to frustrate users
    transaction_threshold: 'apdex_f',
    // record SQL with literal values obfuscated rather than raw text
    record_sql: 'obfuscated'
  },
  error_collector: {
    enabled: true,
    ignore_status_codes: [404] // 404s are noise, not application errors
  },
  distributed_tracing: {
    enabled: true
  }
};
// server.js
require('newrelic'); // Must be first!
const express = require('express');
const newrelic = require('newrelic');
const app = express();
// Custom metrics: example route instrumented with New Relic.
app.get('/api/posts', async (req, res) => {
  // Count every invocation under a custom metric name.
  newrelic.recordMetric('Custom/Posts/Fetched', 1);

  // Link this transaction into any incoming distributed trace.
  const transaction = newrelic.getTransaction();
  transaction.acceptDistributedTraceHeaders('HTTP', req.headers);

  try {
    const posts = await fetchPosts();
    // Attach searchable attributes to the current transaction.
    newrelic.addCustomAttributes({
      userId: req.user?.id,
      postCount: posts.length,
      cached: posts.cached || false
    });
    res.json(posts);
  } catch (error) {
    // Errors are collected automatically; noticeError adds context.
    newrelic.noticeError(error, {
      userId: req.user?.id,
      endpoint: '/api/posts'
    });
    res.status(500).json({ error: 'Failed to fetch posts' });
  }
});
// Background job monitoring: wrap queue work in a New Relic background
// transaction so non-HTTP work appears in APM with trace linkage.
async function processJob(job) {
  return newrelic.startBackgroundTransaction('processEmailJob', async () => {
    const transaction = newrelic.getTransaction();
    transaction.acceptDistributedTraceHeaders('Other', job.headers);
    try {
      await sendEmail(job.data);
      newrelic.recordMetric('Custom/Email/Sent', 1);
    } catch (error) {
      newrelic.noticeError(error);
      throw error; // rethrow so the queue can retry / dead-letter the job
    }
  });
}
Popular APM Tools:
- New Relic: Comprehensive APM with distributed tracing
- Datadog: Full-stack observability platform
- Elastic APM: Open-source APM with ELK Stack integration
- AppDynamics: Enterprise-grade APM with business metrics
Log Aggregation and Analysis
Centralize logs from multiple services for efficient searching, analysis, and troubleshooting.
ELK Stack (Elasticsearch, Logstash, Kibana) Integration
// winston-elasticsearch.js
const winston = require('winston');
const { ElasticsearchTransport } = require('winston-elasticsearch');
// Transport options: ship 'info'+ logs to Elasticsearch, one index per day.
const esTransportOpts = {
  level: 'info',
  clientOpts: {
    node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200',
    auth: {
      username: process.env.ES_USERNAME,
      password: process.env.ES_PASSWORD
    }
  },
  index: 'api-logs',
  indexPrefix: 'api',
  indexSuffixPattern: 'YYYY-MM-DD' // daily indices, e.g. api-2024-01-31
};

// JSON logs with timestamps go to both Elasticsearch and the console.
const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  ),
  transports: [
    new ElasticsearchTransport(esTransportOpts),
    new winston.transports.Console()
  ]
});

module.exports = logger;
Structured Logging for Better Analysis
const logger = require('./winston-elasticsearch');
// Log with a consistent structure: every entry for this route carries
// the same base context so related lines can be correlated downstream
// by requestId/postId.
app.get('/api/posts/:id', async (req, res) => {
  const logContext = {
    requestId: req.id,
    userId: req.user?.id,
    postId: req.params.id,
    ip: req.ip,
    userAgent: req.get('user-agent')
  };

  logger.info('Fetching post', logContext);

  try {
    const post = await fetchPost(req.params.id);
    logger.info('Post fetched successfully', {
      ...logContext,
      cached: post.cached || false,
      responseTime: Date.now() - req.startTime
    });
    res.json(post);
  } catch (error) {
    logger.error('Failed to fetch post', {
      ...logContext,
      error: {
        message: error.message,
        stack: error.stack,
        code: error.code
      }
    });
    res.status(500).json({ error: 'Failed to fetch post' });
  }
});
Log Levels Best Practices:
- ERROR: Application errors that need immediate attention
- WARN: Warning conditions that might lead to errors
- INFO: Important business events (user login, order placed)
- DEBUG: Detailed diagnostic information for debugging
- TRACE: Very detailed information (usually disabled in production)
Real-time Monitoring Dashboard
// Real-time metrics with Socket.IO
const socketIO = require('socket.io');
// `server` is assumed to be the http.Server the Express app listens on
// (created elsewhere) — TODO confirm against the bootstrap code.
const io = socketIO(server);

// Rolling one-second snapshot pushed to every connected dashboard client.
const metrics = {
  requestsPerSecond: 0,
  activeUsers: 0,
  averageResponseTime: 0,
  errorRate: 0
};

let requestCount = 0;
let errorCount = 0;
let responseTimes = [];

// Once a second: derive the snapshot, broadcast it, reset the counters.
setInterval(() => {
  metrics.requestsPerSecond = requestCount;
  metrics.averageResponseTime = responseTimes.length > 0
    ? responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length
    : 0;
  // Percentage of responses in this window with a 4xx/5xx status.
  // (The original declared errorRate but never updated it.)
  metrics.errorRate = requestCount > 0 ? (errorCount / requestCount) * 100 : 0;
  // Connected Socket.IO clients as a proxy for active dashboard users
  // (also never updated in the original).
  metrics.activeUsers = io.engine ? io.engine.clientsCount : 0;

  // Broadcast to all connected clients
  io.emit('metrics', metrics);

  // Reset counters for the next window
  requestCount = 0;
  errorCount = 0;
  responseTimes = [];
}, 1000); // Every second

// Count every request; record latency and error status on completion.
app.use((req, res, next) => {
  requestCount++;
  const start = Date.now();
  res.on('finish', () => {
    responseTimes.push(Date.now() - start);
    if (res.statusCode >= 400) {
      errorCount++;
    }
  });
  next();
});
Exercise: Implement a complete monitoring solution for a REST API with the following requirements:
Part 1: Logging
- Set up Winston logger with JSON format and daily rotating file transport
- Create request logging middleware that logs:
- Request ID (UUID)
- Method, path, query parameters
- Response status code and duration
- IP address and user agent
- Log slow requests (>1 second) as warnings
- Log errors with full stack traces
Part 2: Metrics
- Integrate Prometheus client
- Track the following metrics:
- HTTP request duration (histogram with P50, P95, P99)
- Total HTTP requests (counter by method, route, status)
- Active connections (gauge)
- Database query duration (histogram by query type)
- Expose /metrics endpoint for Prometheus scraping
Part 3: Health Checks
- Implement /health endpoint with database and Redis checks
- Implement /ready endpoint (for Kubernetes readiness probes)
- Implement /live endpoint (for Kubernetes liveness probes)
- Return proper status codes (200 healthy, 503 unhealthy)
Part 4: Alerting
- Create alerting service that tracks error rates
- Send email alert when error rate exceeds 10 errors in 5 minutes
- Include error details, stack trace, and context in alert
Bonus: Create a simple dashboard that displays real-time metrics (requests/second, average response time, error rate) using Socket.IO.