Add System Health Monitor: RAM/CPU/locks tracking with emergency kill controls + Visual Builder plan
This commit is contained in:
235
src/lib/shim/health.ts
Normal file
235
src/lib/shim/health.ts
Normal file
@@ -0,0 +1,235 @@
|
||||
// System Health Monitoring for 100k Scale
|
||||
// Tracks RAM, CPU, Database Locks, and Connection Pressure
|
||||
|
||||
import { pool } from '@/lib/db';
|
||||
import pidusage from 'pidusage';
|
||||
|
||||
export interface SystemHealth {
|
||||
process: {
|
||||
memory: {
|
||||
usage: number; // MB
|
||||
percentage: number; // % of 16GB
|
||||
limit: number; // 16GB in MB
|
||||
};
|
||||
cpu: number; // % utilization
|
||||
uptime: number; // seconds
|
||||
};
|
||||
database: {
|
||||
activeConnections: number;
|
||||
stuckLocks: number;
|
||||
longRunningQueries: number;
|
||||
oldestQueryAge: number | null; // seconds
|
||||
};
|
||||
status: 'healthy' | 'warning' | 'critical';
|
||||
alerts: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get complete system health metrics
|
||||
* Combines process stats (pidusage) with database stats (pg_stat)
|
||||
*/
|
||||
export async function getSystemHealth(): Promise<SystemHealth> {
|
||||
// 1. Get Process Metrics (RAM/CPU)
|
||||
const processStats = await pidusage(process.pid);
|
||||
|
||||
const memoryUsageMB = processStats.memory / 1024 / 1024;
|
||||
const memoryLimitMB = 16384; // 16GB
|
||||
const memoryPercentage = (memoryUsageMB / memoryLimitMB) * 100;
|
||||
|
||||
// 2. Get Database Metrics (Active Connections & Stuck Locks)
|
||||
const { rows: dbRows } = await pool.query<{
|
||||
active_conns: string;
|
||||
waiting_locks: string;
|
||||
long_queries: string;
|
||||
oldest_query_seconds: string | null;
|
||||
}>(`
|
||||
SELECT
|
||||
(SELECT count(*) FROM pg_stat_activity WHERE state = 'active') as active_conns,
|
||||
(SELECT count(*) FROM pg_locks WHERE NOT granted) as waiting_locks,
|
||||
(SELECT count(*) FROM pg_stat_activity
|
||||
WHERE state = 'active'
|
||||
AND query_start < NOW() - INTERVAL '30 seconds'
|
||||
AND query NOT LIKE '%pg_stat_activity%'
|
||||
) as long_queries,
|
||||
(SELECT EXTRACT(EPOCH FROM (NOW() - query_start))::integer
|
||||
FROM pg_stat_activity
|
||||
WHERE state = 'active'
|
||||
AND query NOT LIKE '%pg_stat_activity%'
|
||||
ORDER BY query_start ASC
|
||||
LIMIT 1
|
||||
) as oldest_query_seconds
|
||||
`);
|
||||
|
||||
const dbStats = dbRows[0];
|
||||
|
||||
// 3. Determine Health Status
|
||||
const alerts: string[] = [];
|
||||
let status: 'healthy' | 'warning' | 'critical' = 'healthy';
|
||||
|
||||
// Memory alerts
|
||||
if (memoryPercentage > 90) {
|
||||
status = 'critical';
|
||||
alerts.push(`🚨 CRITICAL: Memory at ${memoryPercentage.toFixed(1)}%. Risk of OOM!`);
|
||||
} else if (memoryPercentage > 75) {
|
||||
status = status === 'critical' ? 'critical' : 'warning';
|
||||
alerts.push(`⚠️ WARNING: Memory at ${memoryPercentage.toFixed(1)}%. Monitor closely.`);
|
||||
}
|
||||
|
||||
// CPU alerts
|
||||
if (processStats.cpu > 90) {
|
||||
status = 'critical';
|
||||
alerts.push(`🚨 CRITICAL: CPU at ${processStats.cpu.toFixed(1)}%. Severe load!`);
|
||||
} else if (processStats.cpu > 70) {
|
||||
status = status === 'critical' ? 'critical' : 'warning';
|
||||
alerts.push(`⚠️ WARNING: CPU at ${processStats.cpu.toFixed(1)}%.`);
|
||||
}
|
||||
|
||||
// Lock alerts
|
||||
const waitingLocks = parseInt(dbStats.waiting_locks) || 0;
|
||||
if (waitingLocks > 10) {
|
||||
status = 'critical';
|
||||
alerts.push(`🚨 CRITICAL: ${waitingLocks} queries waiting on locks!`);
|
||||
} else if (waitingLocks > 0) {
|
||||
status = status === 'critical' ? 'critical' : 'warning';
|
||||
alerts.push(`⚠️ WARNING: ${waitingLocks} stuck locks detected.`);
|
||||
}
|
||||
|
||||
// Long-running query alerts
|
||||
const longQueries = parseInt(dbStats.long_queries) || 0;
|
||||
if (longQueries > 5) {
|
||||
status = status === 'critical' ? 'critical' : 'warning';
|
||||
alerts.push(`⚠️ ${longQueries} queries running >30s.`);
|
||||
}
|
||||
|
||||
return {
|
||||
process: {
|
||||
memory: {
|
||||
usage: Math.round(memoryUsageMB),
|
||||
percentage: Math.round(memoryPercentage * 10) / 10,
|
||||
limit: memoryLimitMB
|
||||
},
|
||||
cpu: Math.round(processStats.cpu * 10) / 10,
|
||||
uptime: Math.round(process.uptime())
|
||||
},
|
||||
database: {
|
||||
activeConnections: parseInt(dbStats.active_conns) || 0,
|
||||
stuckLocks: waitingLocks,
|
||||
longRunningQueries: longQueries,
|
||||
oldestQueryAge: dbStats.oldest_query_seconds ? parseInt(dbStats.oldest_query_seconds) : null
|
||||
},
|
||||
status,
|
||||
alerts
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Kill all waiting locks (EMERGENCY USE ONLY)
|
||||
* Terminates queries that are blocking other queries
|
||||
*/
|
||||
export async function killStuckLocks(): Promise<number> {
|
||||
console.warn('[EMERGENCY] Killing stuck locks...');
|
||||
|
||||
const { rows } = await pool.query<{ pid: number }>(
|
||||
`SELECT pg_terminate_backend(pid) as pid
|
||||
FROM pg_stat_activity
|
||||
WHERE pid IN (
|
||||
SELECT DISTINCT blocking.pid
|
||||
FROM pg_locks blocked
|
||||
JOIN pg_stat_activity blocking ON blocking.pid = blocked.pid
|
||||
WHERE NOT blocked.granted
|
||||
)
|
||||
AND pid != pg_backend_pid()`
|
||||
);
|
||||
|
||||
const killedCount = rows.length;
|
||||
console.warn(`[EMERGENCY] Killed ${killedCount} blocking queries`);
|
||||
|
||||
return killedCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of long-running queries for debugging
|
||||
*/
|
||||
export async function getLongRunningQueries(): Promise<Array<{
|
||||
pid: number;
|
||||
duration: number;
|
||||
query: string;
|
||||
state: string;
|
||||
}>> {
|
||||
const { rows } = await pool.query<{
|
||||
pid: number;
|
||||
duration_seconds: string;
|
||||
query: string;
|
||||
state: string;
|
||||
}>(
|
||||
`SELECT
|
||||
pid,
|
||||
EXTRACT(EPOCH FROM (NOW() - query_start))::integer as duration_seconds,
|
||||
query,
|
||||
state
|
||||
FROM pg_stat_activity
|
||||
WHERE state = 'active'
|
||||
AND query NOT LIKE '%pg_stat_activity%'
|
||||
AND query_start < NOW() - INTERVAL '10 seconds'
|
||||
ORDER BY query_start ASC
|
||||
LIMIT 20`
|
||||
);
|
||||
|
||||
return rows.map(row => ({
|
||||
pid: row.pid,
|
||||
duration: parseInt(row.duration_seconds),
|
||||
query: row.query.slice(0, 200), // Truncate for display
|
||||
state: row.state
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get blocking/blocked query relationships
|
||||
*/
|
||||
export async function getBlockingQueries(): Promise<Array<{
|
||||
blockedPid: number;
|
||||
blockingPid: number;
|
||||
blockedQuery: string;
|
||||
blockingQuery: string;
|
||||
waitTime: number;
|
||||
}>> {
|
||||
const { rows } = await pool.query<{
|
||||
blocked_pid: number;
|
||||
blocking_pid: number;
|
||||
blocked_query: string;
|
||||
blocking_query: string;
|
||||
wait_time_seconds: string;
|
||||
}>(
|
||||
`SELECT
|
||||
blocked_locks.pid AS blocked_pid,
|
||||
blocking_locks.pid AS blocking_pid,
|
||||
blocked_activity.query AS blocked_query,
|
||||
blocking_activity.query AS blocking_query,
|
||||
EXTRACT(EPOCH FROM (NOW() - blocked_activity.query_start))::integer as wait_time_seconds
|
||||
FROM pg_locks blocked_locks
|
||||
JOIN pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
|
||||
JOIN pg_locks blocking_locks
|
||||
ON blocking_locks.locktype = blocked_locks.locktype
|
||||
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
|
||||
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
|
||||
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
|
||||
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
|
||||
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
|
||||
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
|
||||
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
|
||||
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
|
||||
AND blocking_locks.pid != blocked_locks.pid
|
||||
JOIN pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
|
||||
WHERE NOT blocked_locks.granted
|
||||
ORDER BY wait_time_seconds DESC
|
||||
LIMIT 10`
|
||||
);
|
||||
|
||||
return rows.map(row => ({
|
||||
blockedPid: row.blocked_pid,
|
||||
blockingPid: row.blocking_pid,
|
||||
blockedQuery: row.blocked_query.slice(0, 100),
|
||||
blockingQuery: row.blocking_query.slice(0, 100),
|
||||
waitTime: parseInt(row.wait_time_seconds)
|
||||
}));
|
||||
}
|
||||
Reference in New Issue
Block a user