Add System Health Monitor: RAM/CPU/locks tracking with emergency kill controls + Visual Builder plan

This commit is contained in:
cawcenter
2025-12-16 11:06:40 -05:00
parent 11af92b0d0
commit 0f4330b7e1
6 changed files with 967 additions and 3 deletions

View File

@@ -0,0 +1,309 @@
// System Health Dashboard Component
// Real-time RAM/CPU/DB monitoring with emergency controls
import React, { useState } from 'react';
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
import { LineChart, Line, YAxis, XAxis, Tooltip, ResponsiveContainer } from 'recharts';
import { AlertTriangle, Zap, Database, Lock, TrendingUp } from 'lucide-react';
import { Button } from '@/components/ui/button';
/** Severity levels used for both the overall and the system-level status. */
type HealthStatus = 'healthy' | 'warning' | 'critical';

/** Process-level figures shown on the RAM card and in the system info row. */
type ProcessMetrics = {
  /** `usage` and `limit` are rendered as MB, `percentage` as a percent value. */
  memory: { usage: number; percentage: number; limit: number };
  /** Rendered as a percent value. */
  cpu: number;
  /** Rendered as hours/minutes by dividing by 3600, i.e. expressed in seconds. */
  uptime: number;
};

/** Database figures shown on the connection and stuck-lock cards. */
type DatabaseMetrics = {
  activeConnections: number;
  stuckLocks: number;
  longRunningQueries: number;
  /** Rendered with an "s" suffix; `null` when there is nothing to report. */
  oldestQueryAge: number | null;
};

/** Shape of the JSON payload this dashboard reads from `/api/shim/health`. */
interface HealthData {
  /** Parsed with `new Date(...)` for the "Last Check" tile. */
  timestamp: string;
  status: HealthStatus;
  system: {
    process: ProcessMetrics;
    database: DatabaseMetrics;
    status: HealthStatus;
    alerts: string[];
  };
  /** Messages listed in the alert banner. */
  alerts: string[];
}
/** Samples kept for the trend charts: 20 samples at a 2s poll interval = 40s. */
const HISTORY_POINTS = 20;

/** One sample plotted on the memory/CPU trend charts. */
type HistoryPoint = { time: string; memory: number; cpu: number };

/**
 * Tailwind classes for the RAM card, keyed by severity.
 * Full class names are spelled out (never assembled from fragments) so that
 * Tailwind's class scanner can still find them.
 */
const MEMORY_TONES = {
  critical: { card: 'border-red-500 bg-red-900/10', text: 'text-red-400', bar: 'bg-red-500' },
  warning: { card: 'border-yellow-500 bg-yellow-900/10', text: 'text-yellow-400', bar: 'bg-yellow-500' },
  ok: { card: 'border-green-500 bg-green-900/10', text: 'text-green-400', bar: 'bg-green-500' },
} as const;

/** Maps a memory percentage to its severity bucket (>90 critical, >75 warning). */
function memoryTone(percentage: number): keyof typeof MEMORY_TONES {
  if (percentage > 90) return 'critical';
  if (percentage > 75) return 'warning';
  return 'ok';
}

/**
 * Authorization header sent with every request made by this component.
 *
 * NOTE(review): the token is read from a `PUBLIC_`-prefixed env var, so it is
 * part of the client bundle, and the kill-locks route compares against
 * `GOD_MODE_TOKEN` — confirm both are intended before relying on this in
 * production.
 */
function godModeHeaders(): { Authorization: string } {
  return {
    Authorization: `Bearer ${import.meta.env.PUBLIC_GOD_MODE_TOKEN || 'local-dev-token'}`,
  };
}

/** Single-series line chart (0-100 Y axis) used for the memory and CPU trends. */
function TrendChart(props: {
  title: string;
  data: HistoryPoint[];
  dataKey: 'memory' | 'cpu';
  stroke: string;
}) {
  const { title, data, dataKey, stroke } = props;
  return (
    <div className="bg-slate-800 border border-slate-700 rounded-lg p-4">
      <h3 className="text-white font-semibold mb-4 flex items-center gap-2">
        <TrendingUp className="w-4 h-4" />
        {title}
      </h3>
      <ResponsiveContainer width="100%" height={150}>
        <LineChart data={data}>
          <XAxis dataKey="time" tick={{ fontSize: 10, fill: '#64748b' }} />
          <YAxis domain={[0, 100]} tick={{ fontSize: 10, fill: '#64748b' }} />
          <Tooltip
            contentStyle={{ backgroundColor: '#1e293b', border: '1px solid #475569' }}
            labelStyle={{ color: '#94a3b8' }}
          />
          <Line type="monotone" dataKey={dataKey} stroke={stroke} strokeWidth={2} dot={false} />
        </LineChart>
      </ResponsiveContainer>
    </div>
  );
}

/**
 * System health dashboard.
 *
 * Polls `/api/shim/health` every 2 seconds and renders:
 *  - an alert banner when the payload carries alerts,
 *  - RAM / DB connection / stuck-lock cards,
 *  - memory and CPU trend charts once more than 5 samples were collected,
 *  - a row of summary tiles (CPU, uptime, status, last check).
 *
 * When stuck locks are reported, a confirm-guarded button POSTs to
 * `/api/shim/emergency/kill-locks` and then invalidates the health query.
 */
export default function HealthDash() {
  const queryClient = useQueryClient();
  const [history, setHistory] = useState<HistoryPoint[]>([]);

  const { data, isLoading } = useQuery<HealthData>({
    queryKey: ['system-health'],
    queryFn: async () => {
      const response = await fetch('/api/shim/health', { headers: godModeHeaders() });
      if (!response.ok) throw new Error('Health check failed');
      const payload: HealthData = await response.json();

      // Record a chart sample for every successful poll.
      const sample: HistoryPoint = {
        time: new Date().toLocaleTimeString(),
        memory: payload.system.process.memory.percentage,
        cpu: payload.system.process.cpu,
      };
      // Keep HISTORY_POINTS entries *including* the new one, so the window
      // matches the "Last 40s" chart titles.
      setHistory(prev => [...prev.slice(-(HISTORY_POINTS - 1)), sample]);

      return payload;
    },
    refetchInterval: 2000, // Poll every 2 seconds
    staleTime: 1000,
  });

  const killLocksMutation = useMutation({
    mutationFn: async () => {
      const response = await fetch('/api/shim/emergency/kill-locks', {
        method: 'POST',
        headers: godModeHeaders(),
      });
      if (!response.ok) throw new Error('Failed to kill locks');
      return response.json();
    },
    onSuccess: (result) => {
      alert(`✅ Killed ${result.killedCount} stuck locks`);
      queryClient.invalidateQueries({ queryKey: ['system-health'] });
    },
    onError: (error) => {
      alert(`❌ Failed to kill locks: ${error.message}`);
    },
  });

  if (isLoading || !data) {
    return (
      <div className="p-8 text-center text-slate-400">
        Loading system health...
      </div>
    );
  }

  const system = data.system;
  const memory = system.process.memory;
  const db = system.database;
  const isCritical = data.status === 'critical';
  const tone = MEMORY_TONES[memoryTone(memory.percentage)];
  // A CSS width outside 0-100% would overflow or collapse the progress bar.
  const memoryBarWidth = Math.min(100, Math.max(0, memory.percentage));
  const manyConnections = db.activeConnections > 100;
  const hasStuckLocks = db.stuckLocks > 0;

  return (
    <div className="space-y-6">
      {/* Alert Banner */}
      {data.alerts.length > 0 && (
        <div
          className={`p-4 rounded-lg border ${
            isCritical ? 'bg-red-900/20 border-red-700' : 'bg-yellow-900/20 border-yellow-700'
          }`}
        >
          <div className="flex items-start gap-3">
            <AlertTriangle
              className={`w-6 h-6 mt-1 ${isCritical ? 'text-red-400' : 'text-yellow-400'}`}
            />
            <div className="flex-1">
              <h3 className={`font-semibold ${isCritical ? 'text-red-400' : 'text-yellow-400'}`}>
                {isCritical ? '🚨 CRITICAL ALERTS' : '⚠️ WARNINGS'}
              </h3>
              <ul className="mt-2 space-y-1">
                {/* `message`, not `alert`: avoids shadowing window.alert */}
                {data.alerts.map((message, i) => (
                  <li
                    key={i}
                    className={`text-sm ${isCritical ? 'text-red-300' : 'text-yellow-300'}`}
                  >
                    {message}
                  </li>
                ))}
              </ul>
            </div>
          </div>
        </div>
      )}

      {/* Metrics Grid */}
      <div className="grid grid-cols-1 md:grid-cols-3 gap-4">
        {/* RAM Usage */}
        <div className={`border p-6 rounded-lg ${tone.card}`}>
          <div className="flex items-center justify-between mb-2">
            <h3 className="text-sm font-medium text-slate-400 flex items-center gap-2">
              <Zap className="w-4 h-4" />
              RAM USAGE
            </h3>
            <span className={`text-3xl font-bold ${tone.text}`}>
              {memory.percentage}%
            </span>
          </div>
          <div className="h-2 bg-slate-900 rounded-full overflow-hidden mt-3">
            <div
              className={`h-full transition-all ${tone.bar}`}
              style={{ width: `${memoryBarWidth}%` }}
            />
          </div>
          <p className="mt-2 text-xs text-slate-500">
            {memory.usage} MB / {memory.limit} MB
          </p>
        </div>

        {/* DB Connections */}
        <div
          className={`border p-6 rounded-lg ${
            manyConnections ? 'border-yellow-500 bg-yellow-900/10' : 'border-blue-500 bg-blue-900/10'
          }`}
        >
          <div className="flex items-center justify-between mb-2">
            <h3 className="text-sm font-medium text-slate-400 flex items-center gap-2">
              <Database className="w-4 h-4" />
              DB CONNECTIONS
            </h3>
            <span
              className={`text-3xl font-bold ${manyConnections ? 'text-yellow-400' : 'text-blue-400'}`}
            >
              {db.activeConnections}
            </span>
          </div>
          <p className="text-xs text-slate-500 mt-3">
            Limit: 10,000 · {db.longRunningQueries} long queries
          </p>
          {/* Explicit null check: `0 && <p/>` would render a literal "0". */}
          {db.oldestQueryAge != null && (
            <p className="text-xs text-yellow-400 mt-1">
              Oldest: {db.oldestQueryAge}s
            </p>
          )}
        </div>

        {/* Stuck Locks */}
        <div
          className={`border p-6 rounded-lg ${
            hasStuckLocks ? 'border-red-500 bg-red-900/10' : 'border-gray-500 bg-gray-900/10'
          }`}
        >
          <div className="flex items-center justify-between mb-2">
            <h3 className="text-sm font-medium text-slate-400 flex items-center gap-2">
              <Lock className="w-4 h-4" />
              STUCK LOCKS
            </h3>
            <span className={`text-3xl font-bold ${hasStuckLocks ? 'text-red-400' : 'text-gray-400'}`}>
              {db.stuckLocks}
            </span>
          </div>
          {hasStuckLocks && (
            <Button
              onClick={() => {
                if (confirm(`⚠️ EMERGENCY: Kill ${db.stuckLocks} stuck locks?\n\nThis will terminate blocking queries. Continue?`)) {
                  killLocksMutation.mutate();
                }
              }}
              disabled={killLocksMutation.isPending}
              className="w-full mt-3 bg-red-600 hover:bg-red-500 text-white"
            >
              {killLocksMutation.isPending ? 'Killing...' : '🚨 KILL ALL'}
            </Button>
          )}
          {db.stuckLocks === 0 && (
            <p className="text-xs text-gray-500 mt-3">No blocking queries</p>
          )}
        </div>
      </div>

      {/* Charts (hidden until there are enough samples to draw a trend) */}
      {history.length > 5 && (
        <div className="grid grid-cols-1 md:grid-cols-2 gap-6">
          <TrendChart title="Memory Trend (Last 40s)" data={history} dataKey="memory" stroke="#f59e0b" />
          <TrendChart title="CPU Trend (Last 40s)" data={history} dataKey="cpu" stroke="#3b82f6" />
        </div>
      )}

      {/* System Info */}
      <div className="grid grid-cols-2 md:grid-cols-4 gap-3 text-sm">
        <div className="p-3 bg-slate-800 rounded border border-slate-700">
          <div className="text-slate-500 text-xs">CPU Load</div>
          <div className="text-white font-semibold text-lg">{system.process.cpu}%</div>
        </div>
        <div className="p-3 bg-slate-800 rounded border border-slate-700">
          <div className="text-slate-500 text-xs">Uptime</div>
          <div className="text-white font-semibold text-lg">
            {Math.floor(system.process.uptime / 3600)}h {Math.floor((system.process.uptime % 3600) / 60)}m
          </div>
        </div>
        <div className="p-3 bg-slate-800 rounded border border-slate-700">
          <div className="text-slate-500 text-xs">Status</div>
          <div
            className={`font-semibold text-lg ${
              data.status === 'healthy'
                ? 'text-green-400'
                : data.status === 'warning'
                  ? 'text-yellow-400'
                  : 'text-red-400'
            }`}
          >
            {data.status.toUpperCase()}
          </div>
        </div>
        <div className="p-3 bg-slate-800 rounded border border-slate-700">
          <div className="text-slate-500 text-xs">Last Check</div>
          <div className="text-white font-semibold text-lg">
            {new Date(data.timestamp).toLocaleTimeString()}
          </div>
        </div>
      </div>
    </div>
  );
}

235
src/lib/shim/health.ts Normal file
View File

@@ -0,0 +1,235 @@
// System Health Monitoring for 100k Scale
// Tracks RAM, CPU, Database Locks, and Connection Pressure
import { pool } from '@/lib/db';
import pidusage from 'pidusage';
/** Severity ladder shared by the status field and the alert logic. */
type HealthStatus = 'healthy' | 'warning' | 'critical';

/** Memory figures for the current process. */
type MemoryMetrics = {
  /** MB */
  usage: number;
  /** % of 16GB */
  percentage: number;
  /** 16GB in MB */
  limit: number;
};

/** Database activity counters. */
type DatabaseMetrics = {
  activeConnections: number;
  stuckLocks: number;
  longRunningQueries: number;
  /** seconds; `null` when no value is available */
  oldestQueryAge: number | null;
};

/** Snapshot of process and database health, plus the derived status and alerts. */
export interface SystemHealth {
  process: {
    memory: MemoryMetrics;
    /** % utilization */
    cpu: number;
    /** seconds */
    uptime: number;
  };
  database: DatabaseMetrics;
  status: HealthStatus;
  alerts: string[];
}
/**
 * Converts a count-like database value to an integer.
 * Accepts strings and numbers because the driver may hand back either,
 * depending on the SQL type; anything unparsable becomes 0.
 */
function toCount(value: string | number | null | undefined): number {
  const parsed = Number.parseInt(String(value ?? ''), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/**
 * Get complete system health metrics.
 * Combines process stats (pidusage) with database stats (pg_stat_activity / pg_locks).
 *
 * Thresholds:
 *  - memory  > 90% of the 16GB limit -> critical, > 75% -> warning
 *  - CPU     > 90%                   -> critical, > 70% -> warning
 *  - ungranted locks > 10            -> critical, > 0   -> warning
 *  - more than 5 queries active for over 30s -> warning
 *
 * @returns the metrics, the most severe status reached, and one alert per breached threshold
 * @throws if the process probe or the database query fails, or the query yields no row
 */
export async function getSystemHealth(): Promise<SystemHealth> {
  const memoryLimitMB = 16384; // 16GB

  // The process probe and the database probe are independent: run them concurrently.
  const [processStats, dbResult] = await Promise.all([
    pidusage(process.pid),
    pool.query<{
      active_conns: string | number;
      waiting_locks: string | number;
      long_queries: string | number;
      oldest_query_seconds: string | number | null;
    }>(`
      SELECT
        (SELECT count(*) FROM pg_stat_activity WHERE state = 'active') as active_conns,
        (SELECT count(*) FROM pg_locks WHERE NOT granted) as waiting_locks,
        (SELECT count(*) FROM pg_stat_activity
          WHERE state = 'active'
          AND query_start < NOW() - INTERVAL '30 seconds'
          AND query NOT LIKE '%pg_stat_activity%'
        ) as long_queries,
        (SELECT EXTRACT(EPOCH FROM (NOW() - query_start))::integer
          FROM pg_stat_activity
          WHERE state = 'active'
          AND query NOT LIKE '%pg_stat_activity%'
          ORDER BY query_start ASC
          LIMIT 1
        ) as oldest_query_seconds
    `),
  ]);

  const dbStats = dbResult.rows[0];
  if (!dbStats) {
    throw new Error('Health query returned no rows');
  }

  // 1. Process metrics (pidusage reports memory in bytes)
  const memoryUsageMB = processStats.memory / 1024 / 1024;
  const memoryPercentage = (memoryUsageMB / memoryLimitMB) * 100;

  // 2. Database metrics
  const waitingLocks = toCount(dbStats.waiting_locks);
  const longQueries = toCount(dbStats.long_queries);

  // 3. Health status: alerts accumulate, status only ever escalates.
  const alerts: string[] = [];
  let status: SystemHealth['status'] = 'healthy';
  const raise = (level: 'warning' | 'critical', message: string): void => {
    if (level === 'critical' || status === 'healthy') {
      status = level;
    }
    alerts.push(message);
  };

  // Memory alerts
  if (memoryPercentage > 90) {
    raise('critical', `🚨 CRITICAL: Memory at ${memoryPercentage.toFixed(1)}%. Risk of OOM!`);
  } else if (memoryPercentage > 75) {
    raise('warning', `⚠️ WARNING: Memory at ${memoryPercentage.toFixed(1)}%. Monitor closely.`);
  }

  // CPU alerts
  if (processStats.cpu > 90) {
    raise('critical', `🚨 CRITICAL: CPU at ${processStats.cpu.toFixed(1)}%. Severe load!`);
  } else if (processStats.cpu > 70) {
    raise('warning', `⚠️ WARNING: CPU at ${processStats.cpu.toFixed(1)}%.`);
  }

  // Lock alerts
  if (waitingLocks > 10) {
    raise('critical', `🚨 CRITICAL: ${waitingLocks} queries waiting on locks!`);
  } else if (waitingLocks > 0) {
    raise('warning', `⚠️ WARNING: ${waitingLocks} stuck locks detected.`);
  }

  // Long-running query alerts
  if (longQueries > 5) {
    raise('warning', `⚠️ ${longQueries} queries running >30s.`);
  }

  return {
    process: {
      memory: {
        usage: Math.round(memoryUsageMB),
        percentage: Math.round(memoryPercentage * 10) / 10,
        limit: memoryLimitMB
      },
      cpu: Math.round(processStats.cpu * 10) / 10,
      uptime: Math.round(process.uptime())
    },
    database: {
      activeConnections: toCount(dbStats.active_conns),
      stuckLocks: waitingLocks,
      longRunningQueries: longQueries,
      // Explicit null check: an age of 0 seconds is a real value, not "no query".
      oldestQueryAge:
        dbStats.oldest_query_seconds != null ? toCount(dbStats.oldest_query_seconds) : null
    },
    status,
    alerts
  };
}
/**
 * Terminate the backends that are blocking other queries (EMERGENCY USE ONLY).
 *
 * For every ungranted entry in pg_locks, `pg_blocking_pids()` is asked which
 * backends the waiter is queued behind; those backends are terminated. The
 * waiters themselves are left alone so they can proceed once the lock frees up.
 *
 * @returns the number of backends for which `pg_terminate_backend` reported success
 */
export async function killStuckLocks(): Promise<number> {
  console.warn('[EMERGENCY] Killing stuck locks...');
  const { rows } = await pool.query<{ terminated: boolean }>(
    `SELECT pg_terminate_backend(blockers.pid) AS terminated
     FROM (
       SELECT DISTINCT blocker.pid
       FROM pg_locks waiting
       CROSS JOIN LATERAL unnest(pg_blocking_pids(waiting.pid)) AS blocker(pid)
       WHERE NOT waiting.granted
     ) AS blockers
     WHERE blockers.pid > 0
       AND blockers.pid <> pg_backend_pid()`
  );
  // pid 0 (filtered above) stands for a prepared transaction, which has no
  // backend to terminate. pg_terminate_backend returns false when the signal
  // could not be sent, so only successful terminations are counted.
  const killedCount = rows.filter(row => row.terminated).length;
  console.warn(`[EMERGENCY] Killed ${killedCount} blocking queries`);
  return killedCount;
}
/**
 * List active queries that have been running for more than 10 seconds,
 * oldest first (at most 20), for debugging.
 *
 * @returns one entry per query: backend pid, runtime in seconds, the query
 *          text cut to 200 characters, and the backend state
 */
export async function getLongRunningQueries(): Promise<Array<{
  pid: number;
  duration: number;
  query: string;
  state: string;
}>> {
  type ActivityRow = {
    pid: number;
    duration_seconds: string;
    query: string;
    state: string;
  };

  const result = await pool.query<ActivityRow>(
    `SELECT
pid,
EXTRACT(EPOCH FROM (NOW() - query_start))::integer as duration_seconds,
query,
state
FROM pg_stat_activity
WHERE state = 'active'
AND query NOT LIKE '%pg_stat_activity%'
AND query_start < NOW() - INTERVAL '10 seconds'
ORDER BY query_start ASC
LIMIT 20`
  );

  const summaries: Array<{ pid: number; duration: number; query: string; state: string }> = [];
  for (const entry of result.rows) {
    const { pid, state } = entry;
    summaries.push({
      pid,
      duration: parseInt(entry.duration_seconds),
      // Truncate for display
      query: entry.query.slice(0, 200),
      state
    });
  }
  return summaries;
}
/**
 * Get blocking/blocked query relationships.
 *
 * Pairs every ungranted lock request with lock entries held or requested by
 * other backends on the same lockable object, and returns the ten pairs whose
 * blocked query has been running longest.
 *
 * @returns blocked/blocking pids, both query texts (cut to 100 characters) and
 *          the seconds elapsed since the blocked query started
 */
export async function getBlockingQueries(): Promise<Array<{
  blockedPid: number;
  blockingPid: number;
  blockedQuery: string;
  blockingQuery: string;
  waitTime: number;
}>> {
  const { rows } = await pool.query<{
    blocked_pid: number;
    blocking_pid: number;
    blocked_query: string;
    blocking_query: string;
    wait_time_seconds: string | number;
  }>(
    // The `database` comparison matters: relation OIDs are only unique within
    // one database, and pg_locks covers the whole cluster, so without it locks
    // on same-numbered relations in different databases would be paired up.
    `SELECT
       blocked_locks.pid AS blocked_pid,
       blocking_locks.pid AS blocking_pid,
       blocked_activity.query AS blocked_query,
       blocking_activity.query AS blocking_query,
       EXTRACT(EPOCH FROM (NOW() - blocked_activity.query_start))::integer as wait_time_seconds
     FROM pg_locks blocked_locks
     JOIN pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
     JOIN pg_locks blocking_locks
       ON blocking_locks.locktype = blocked_locks.locktype
       AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
       AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
       AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
       AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
       AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
       AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
       AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
       AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
       AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
       AND blocking_locks.pid != blocked_locks.pid
     JOIN pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
     WHERE NOT blocked_locks.granted
     ORDER BY wait_time_seconds DESC
     LIMIT 10`
  );
  return rows.map(row => ({
    blockedPid: row.blocked_pid,
    blockingPid: row.blocking_pid,
    blockedQuery: row.blocked_query.slice(0, 100),
    blockingQuery: row.blocking_query.slice(0, 100),
    // String() + radix: the driver may return the ::integer column as a number or a string.
    waitTime: Number.parseInt(String(row.wait_time_seconds), 10)
  }));
}

View File

@@ -0,0 +1,49 @@
// EMERGENCY API: Kill stuck database locks
// USE WITH CAUTION - Terminates blocking queries
import type { APIRoute } from 'astro';
import { killStuckLocks, getBlockingQueries } from '@/lib/shim/health';
/** Builds a JSON response with the given body and HTTP status. */
function jsonResponse(body: unknown, status: number): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { 'Content-Type': 'application/json' }
  });
}

/**
 * POST handler: terminates the queries behind stuck database locks.
 *
 * Responses:
 *  - 401 `{ error }` when `GOD_MODE_TOKEN` is unset or the bearer token differs
 *  - 200 `{ success, killedCount, blockedQueries, message }` after the kill ran
 *  - 500 `{ error, message }` when listing or killing throws
 */
export const POST: APIRoute = async ({ request }) => {
  try {
    // STRICT token validation - this is destructive.
    // An unset GOD_MODE_TOKEN rejects every request rather than allowing all.
    // NOTE(review): `!==` is not a constant-time comparison; consider
    // crypto.timingSafeEqual if this route is reachable from untrusted networks.
    const authHeader = request.headers.get('Authorization');
    const token = authHeader?.replace('Bearer ', '');
    const godToken = import.meta.env.GOD_MODE_TOKEN;
    if (!godToken || token !== godToken) {
      return jsonResponse({ error: 'Unauthorized' }, 401);
    }

    // Snapshot what is blocked before killing, so the log shows what was affected
    const blocking = await getBlockingQueries();

    // Execute kill
    const killedCount = await killStuckLocks();
    console.warn(`[EMERGENCY] Killed ${killedCount} stuck locks`, { blocking });

    return jsonResponse({
      success: true,
      killedCount,
      blockedQueries: blocking.length,
      message: `Terminated ${killedCount} blocking queries`
    }, 200);
  } catch (error: unknown) {
    console.error('[EMERGENCY] Kill locks failed:', error);
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    return jsonResponse({ error: 'Kill locks failed', message }, 500);
  }
};

View File

@@ -1,8 +1,9 @@
// API Route: GET /api/shim/health
// Returns connection pool stats and database health
// Returns connection pool stats, database health, and system metrics (RAM/CPU/locks)
import type { APIRoute } from 'astro';
import { getPoolStats, getDatabaseStats, getVacuumCandidates } from '@/lib/shim/pool';
import { getSystemHealth } from '@/lib/shim/health';
export const GET: APIRoute = async ({ request }) => {
try {
@@ -18,21 +19,35 @@ export const GET: APIRoute = async ({ request }) => {
});
}
// Get health stats
// Get all health stats
const poolStats = getPoolStats();
const dbStats = await getDatabaseStats();
const vacuumCandidates = await getVacuumCandidates();
const systemHealth = await getSystemHealth();
const needsVacuum = vacuumCandidates.length > 0 && vacuumCandidates[0].deadPercent > 20;
// Overall status (most critical wins)
const overallStatus =
systemHealth.status === 'critical' || poolStats.status === 'critical'
? 'critical'
: systemHealth.status === 'warning' || poolStats.status === 'warning'
? 'warning'
: 'healthy';
return new Response(JSON.stringify({
timestamp: new Date().toISOString(),
status: overallStatus,
system: systemHealth,
pool: poolStats,
database: dbStats,
vacuum: {
recommended: needsVacuum,
candidates: vacuumCandidates
},
status: poolStats.status
alerts: [
...systemHealth.alerts,
...(poolStats.status !== 'healthy' ? [poolStats.message] : [])
]
}), {
status: 200,
headers: { 'Content-Type': 'application/json' }

View File

@@ -6,6 +6,7 @@ import { getPoolStats, getDatabaseStats, getVacuumCandidates } from '@/lib/shim/
import { getArticlesCountByStatus } from '@/lib/shim/articles';
import { getSitesCountByStatus } from '@/lib/shim/sites';
import ShimMonitor from '@/components/shim/ShimMonitor';
import HealthDash from '@/components/shim/HealthDash';
// Server-side stats (instant load)
const poolStats = getPoolStats();
@@ -182,6 +183,17 @@ const totalSites = Object.values(siteCounts).reduce((a, b) => a + b, 0);
</div>
)}
<!-- System Health Monitor (RAM/CPU/Locks) -->
<div class="bg-slate-800 rounded-lg border border-slate-700">
<div class="p-4 border-b border-slate-700">
<h2 class="text-white font-semibold text-lg">🔋 System Health Monitor</h2>
<p class="text-slate-400 text-sm mt-1">Real-time RAM, CPU, and database lock monitoring (2s refresh)</p>
</div>
<div class="p-6">
<HealthDash client:load />
</div>
</div>
<!-- Two-Column Layout -->
<div class="grid grid-cols-1 lg:grid-cols-2 gap-6">
@@ -282,6 +294,7 @@ const totalSites = Object.values(siteCounts).reduce((a, b) => a + b, 0);
<li>✅ <strong>Zod Validation</strong> - All data validated before SQL execution</li>
<li>✅ <strong>SEO Enforcement</strong> - Cannot publish without metadata</li>
<li>✅ <strong>Connection Monitoring</strong> - Real-time pool health tracking</li>
<li>✅ <strong>System Health Monitor</strong> - RAM/CPU/locks with emergency controls</li>
<li>✅ <strong>Auto VACUUM Detection</strong> - Prevents performance degradation</li>
</ul>
</div>