From 236089f5c89804007b3a9dfd9879d07a7d961d0e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 2 Mar 2026 17:43:41 +0000 Subject: [PATCH 01/18] Add long-running agents anomaly monitoring end-to-end Co-authored-by: leor --- docs/agents_monitoring_handoff.md | 128 +++ electron-ui/index.html | 93 +++ electron-ui/main.js | 316 ++++++- electron-ui/preload.js | 29 + electron-ui/renderer.js | 379 +++++++++ electron-ui/styles.css | 275 +++++++ scripts/anomaly_monitor.py | 912 +++++++++++++++++++++ scripts/anomaly_rules.py | 217 +++++ scripts/fixtures/anomaly_replay_cases.json | 32 + scripts/graph_api.py | 11 +- scripts/ignition_api_client.py | 69 +- scripts/neo4j_ontology.py | 148 +++- 12 files changed, 2565 insertions(+), 44 deletions(-) create mode 100644 docs/agents_monitoring_handoff.md create mode 100644 scripts/anomaly_monitor.py create mode 100644 scripts/anomaly_rules.py create mode 100644 scripts/fixtures/anomaly_replay_cases.json diff --git a/docs/agents_monitoring_handoff.md b/docs/agents_monitoring_handoff.md new file mode 100644 index 0000000..a5368fb --- /dev/null +++ b/docs/agents_monitoring_handoff.md @@ -0,0 +1,128 @@ +# Long-Running Agents Monitoring Handoff + +## Summary + +This handoff documents the implemented V1 monitoring capability: + +- New **Agents** tab in Electron UI for starting/stopping long-running monitoring. +- Continuous Python worker (`anomaly_monitor.py`) with: + - deterministic historical-deviation scoring, + - quality/staleness gates, + - optional LLM triage, + - Neo4j persistence for `AgentRun` and `AnomalyEvent`, + - event dedup and retention cleanup. +- IPC surface and stream channels from Electron main to renderer: + - `agents:start`, `agents:status`, `agents:stop`, + - `agents:list-events`, `agents:get-event`, `agents:ack-event`, `agents:cleanup`, + - channels: `agent-status`, `agent-event`, `agent-error`, `agent-complete`. +- Graph drill-down integration with anomaly node support. 
+ +## Files Changed + +### Electron + +- `electron-ui/index.html` + - Added **Agents** nav button. + - Added `tab-agents` page shell with controls, filters, feed, and detail panel. + - Added graph filter option for anomaly layer. + +- `electron-ui/styles.css` + - Added Agents tab styles (`agents-*`, `status-chip`, feed cards, detail panel). + +- `electron-ui/preload.js` + - Added `agents*` API bridge methods. + - Added event listeners for `agent-status/event/error/complete`. + +- `electron-ui/main.js` + - Added background agent runtime management (`activeAgentRun`). + - Added stream parser for monitor stdout markers (`[AGENT_STATUS]`, etc.). + - Added full `agents:*` IPC handlers. + - Added graceful stop handling on app shutdown. + +- `electron-ui/renderer.js` + - Added Agents tab state management. + - Added start/stop/refresh/cleanup/ack handlers. + - Added realtime feed updates from agent channels. + - Added event detail rendering and graph drill-down action. + +### Python backend + +- `scripts/anomaly_rules.py` (new) + - Deterministic scoring logic (`z`, `MAD`, rate, drift trend, flatline). + - Quality/staleness helpers and dedup key generator. + +- `scripts/anomaly_monitor.py` (new) + - Long-running monitoring worker with CLI subcommands: + - `run`, `status`, `list-events`, `get-event`, `ack-event`, `cleanup`, `replay-fixtures`. + - Neo4j persistence + dedup + retention cleanup. + - Optional LLM triage with structured JSON fallback. + +- `scripts/ignition_api_client.py` + - Added `query_tag_history(...)` and local-time-to-UTC conversion helper. + +- `scripts/neo4j_ontology.py` + - Added monitoring schema constraints/indexes for `AgentRun` / `AnomalyEvent`. + - Added helper methods: list/get/cleanup anomaly events. + - Added CLI commands: + - `init-agent-schema` + - `list-anomaly-events` + - `get-anomaly-event` + - `cleanup-anomaly-events` + +- `scripts/graph_api.py` + - Added node groups/colors for `AgentRun` and `AnomalyEvent`. 
+ - Extended neighbor center-node lookup to support `event_id` and `run_id`. + +### Fixtures + +- `scripts/fixtures/anomaly_replay_cases.json` (new) + - Deterministic replay cases: + - normal baseline, + - sudden spike, + - slow drift, + - flatline/stuck. + +## Runtime Commands + +### Deterministic replay validation + +```bash +python3 scripts/anomaly_monitor.py replay-fixtures --fixture-file scripts/fixtures/anomaly_replay_cases.json +``` + +### Monitor worker manual run + +```bash +python3 scripts/anomaly_monitor.py run --run-id demo-run --config-json '{"pollIntervalMs":15000}' +``` + +### Event operations + +```bash +python3 scripts/anomaly_monitor.py list-events --limit 50 +python3 scripts/anomaly_monitor.py get-event --event-id +python3 scripts/anomaly_monitor.py ack-event --event-id --note "Reviewed by operator" +python3 scripts/anomaly_monitor.py cleanup --retention-days 14 +``` + +## Known Environment Requirements + +The Python environment must include packages from `requirements.txt`: + +- `neo4j` +- `anthropic` (for LLM triage; deterministic fallback works without API key) +- `python-dotenv` +- `requests` + +If `ANTHROPIC_API_KEY` is absent, triage automatically falls back to deterministic explanations. + +## Validation Status + +- Syntax checks passed: + - Python (`py_compile`) for all modified scripts. + - JS syntax checks (`node --check`) for Electron files. +- Fixture replay passed: + - `4/4` deterministic scenarios. + +Live end-to-end validation against actual Ignition + Neo4j + Anthropic requires connected runtime services. + diff --git a/electron-ui/index.html b/electron-ui/index.html index 03b808e..7e5e8a7 100644 --- a/electron-ui/index.html +++ b/electron-ui/index.html @@ -36,6 +36,13 @@ Assist + + + + + +
+ Idle + No active run +
+ + +
+ + + + + + + + + + + + + + +
+ +
+
Cycle (ms)0
+
Candidates0
+
Triaged0
+
Emitted0
+
Last heartbeatn/a
+
+ +
+ + +
+
+

Event Details

+
+ + +
+
+
+

Select an anomaly event from the feed.

+
+
+
+ +
@@ -630,6 +722,7 @@

Ontology Graph

+ diff --git a/electron-ui/main.js b/electron-ui/main.js index b5cdb4d..e215fb4 100644 --- a/electron-ui/main.js +++ b/electron-ui/main.js @@ -4,6 +4,7 @@ const fs = require('fs'); const { spawn } = require('child_process'); let mainWindow; +let activeAgentRun = null; // --------------------------------------------------------------------------- // Python backend configuration (works in both dev and packaged modes) @@ -103,6 +104,16 @@ app.on('window-all-closed', () => { } }); +app.on('before-quit', () => { + if (activeAgentRun && activeAgentRun.process && !activeAgentRun.process.killed) { + try { + activeAgentRun.process.kill('SIGTERM'); + } catch (err) { + // Ignore termination errors during shutdown. + } + } +}); + app.on('activate', () => { if (BrowserWindow.getAllWindows().length === 0) { createWindow(); @@ -185,6 +196,132 @@ function runPythonScript(scriptName, args = [], options = {}) { }); } +function normalizeAgentConfig(config = {}) { + const thresholds = (config && typeof config.thresholds === 'object' && config.thresholds) || {}; + const scope = (config && typeof config.scope === 'object' && config.scope) || {}; + return { + pollIntervalMs: Math.max(5000, Number(config.pollIntervalMs || 15000)), + historyWindowMinutes: Math.max(10, Number(config.historyWindowMinutes || 360)), + minHistoryPoints: Math.max(10, Number(config.minHistoryPoints || 30)), + maxMonitoredTags: Math.max(10, Number(config.maxMonitoredTags || 200)), + maxCandidatesPerCycle: Math.max(1, Number(config.maxCandidatesPerCycle || 25)), + maxLlmTriagesPerCycle: Math.max(0, Number(config.maxLlmTriagesPerCycle || 5)), + dedupCooldownMinutes: Math.max(1, Number(config.dedupCooldownMinutes || 10)), + retentionDays: Math.max(1, Number(config.retentionDays || 14)), + cleanupEveryCycles: Math.max(1, Number(config.cleanupEveryCycles || 40)), + thresholds: { + z: Number(thresholds.z ?? 3.0), + mad: Number(thresholds.mad ?? 3.5), + rate: Number(thresholds.rate ?? 
0.0), + stalenessSec: Number(thresholds.stalenessSec ?? 120), + flatline_std_epsilon: Number(thresholds.flatline_std_epsilon ?? 1e-6), + stuck_window_size: Number(thresholds.stuck_window_size ?? 20), + }, + scope: { + project: scope.project || null, + equipmentTags: Array.isArray(scope.equipmentTags) ? scope.equipmentTags : [], + tagRegex: scope.tagRegex || null, + }, + }; +} + +function routeAgentMessage(channel, payload) { + if (mainWindow) { + mainWindow.webContents.send(channel, payload); + } +} + +function parseAgentLine(line) { + const trimmed = (line || '').trim(); + if (!trimmed) return null; + const prefixes = [ + { key: '[AGENT_STATUS]', channel: 'agent-status' }, + { key: '[AGENT_EVENT]', channel: 'agent-event' }, + { key: '[AGENT_ERROR]', channel: 'agent-error' }, + { key: '[AGENT_COMPLETE]', channel: 'agent-complete' }, + ]; + for (const prefix of prefixes) { + if (!trimmed.startsWith(prefix.key)) continue; + const jsonText = trimmed.slice(prefix.key.length).trim(); + try { + const payload = JSON.parse(jsonText); + return { channel: prefix.channel, payload }; + } catch (err) { + return { + channel: 'agent-error', + payload: { + runId: activeAgentRun ? 
activeAgentRun.runId : null, + code: 'invalid_agent_json', + message: `Failed to parse agent stream line: ${trimmed.slice(0, 200)}`, + recoverable: true, + timestamp: new Date().toISOString(), + }, + }; + } + } + return null; +} + +function handleAgentStdoutChunk(text) { + if (!activeAgentRun) return; + activeAgentRun.stdoutBuffer += text; + const lines = activeAgentRun.stdoutBuffer.split(/\r?\n/); + activeAgentRun.stdoutBuffer = lines.pop() || ''; + for (const line of lines) { + const parsed = parseAgentLine(line); + if (!parsed) continue; + if (parsed.channel === 'agent-status' && parsed.payload) { + activeAgentRun.status = parsed.payload.state || activeAgentRun.status; + activeAgentRun.metrics = { + cycleMs: parsed.payload.cycleMs || 0, + candidates: parsed.payload.candidates || 0, + triaged: parsed.payload.triaged || 0, + emitted: parsed.payload.emitted || 0, + timestamp: parsed.payload.timestamp || new Date().toISOString(), + }; + } + routeAgentMessage(parsed.channel, parsed.payload); + } +} + +async function stopActiveAgent(reason = 'stopped_by_user') { + if (!activeAgentRun || !activeAgentRun.process || activeAgentRun.process.killed) { + return { success: false, error: 'No active agent run' }; + } + const runId = activeAgentRun.runId; + activeAgentRun.status = 'stopping'; + + return new Promise((resolve) => { + const proc = activeAgentRun.process; + let settled = false; + const done = (result) => { + if (settled) return; + settled = true; + resolve(result); + }; + + proc.once('close', () => { + done({ success: true, runId, stoppedAt: new Date().toISOString(), reason }); + }); + + try { + proc.kill('SIGTERM'); + } catch (err) { + done({ success: false, error: err.message }); + return; + } + + setTimeout(() => { + if (proc.killed) return; + try { + proc.kill('SIGKILL'); + } catch (err) { + // Ignore forced termination errors. 
+ } + }, 5000); + }); +} + // IPC Handlers // Select file dialog @@ -1304,7 +1441,9 @@ function readDbCredentials() { if (!fs.existsSync(credPath)) return {}; try { return JSON.parse(fs.readFileSync(credPath, 'utf-8')); - } catch { return {}; } + } catch { + return {}; + } } // Get database connections from Neo4j + credential status from db_credentials.json @@ -1314,10 +1453,8 @@ ipcMain.handle('get-db-connections', async () => { const proc = spawnPythonProcess('neo4j_ontology.py', ['db-connections', '--json']); let stdout = ''; - let stderr = ''; proc.stdout.on('data', (data) => { stdout += data.toString(); }); - proc.stderr.on('data', (data) => { stderr += data.toString(); }); proc.on('close', (code) => { if (code !== 0) { @@ -1335,7 +1472,7 @@ ipcMain.handle('get-db-connections', async () => { })); resolve({ success: true, connections: enriched }); - } catch (e) { + } catch { resolve({ success: true, connections: [] }); } }); @@ -1349,7 +1486,7 @@ ipcMain.handle('get-db-connections', async () => { ipcMain.handle('save-db-credentials', async (event, credentials) => { try { const credPath = getDbCredentialsPath(); - let existing = readDbCredentials(); + const existing = readDbCredentials(); for (const [name, cred] of Object.entries(credentials)) { existing[name] = { @@ -1392,4 +1529,173 @@ ipcMain.handle('test-db-connection', async (event, connectionName) => { } catch (error) { return { success: false, error: error.message }; } +}); + +// ============================================ +// Long-running Agent Monitoring IPC Handlers +// ============================================ + +ipcMain.handle('agents:start', async (event, rawConfig = {}) => { + if (activeAgentRun && activeAgentRun.process && !activeAgentRun.process.killed) { + return { success: false, error: `Agent run already active: ${activeAgentRun.runId}`, runId: activeAgentRun.runId }; + } + + const runId = `agent-${Date.now()}`; + const config = normalizeAgentConfig(rawConfig); + + try { + const proc = 
spawnPythonProcess('anomaly_monitor.py', [ + 'run', + '--run-id', + runId, + '--config-json', + JSON.stringify(config), + ]); + + activeAgentRun = { + runId, + process: proc, + status: 'starting', + startedAt: new Date().toISOString(), + metrics: { + cycleMs: 0, + candidates: 0, + triaged: 0, + emitted: 0, + timestamp: new Date().toISOString(), + }, + stdoutBuffer: '', + config, + }; + + proc.stdout.on('data', (data) => { + handleAgentStdoutChunk(data.toString()); + }); + + proc.stderr.on('data', (data) => { + const text = data.toString().trim(); + if (!text) return; + routeAgentMessage('agent-error', { + runId, + code: 'worker_stderr', + message: text, + recoverable: true, + timestamp: new Date().toISOString(), + }); + }); + + proc.on('close', (code) => { + const hadActive = activeAgentRun && activeAgentRun.runId === runId; + if (hadActive) { + routeAgentMessage('agent-complete', { + runId, + success: code === 0, + reason: code === 0 ? 'completed' : 'worker_exit_error', + stoppedAt: new Date().toISOString(), + }); + activeAgentRun = null; + } + }); + + proc.on('error', (err) => { + routeAgentMessage('agent-error', { + runId, + code: 'worker_spawn_error', + message: err.message, + recoverable: false, + timestamp: new Date().toISOString(), + }); + activeAgentRun = null; + }); + + return { success: true, runId, startedAt: activeAgentRun.startedAt, config }; + } catch (error) { + activeAgentRun = null; + return { success: false, error: error.message, runId }; + } +}); + +ipcMain.handle('agents:status', async (event, runId) => { + if (activeAgentRun && (!runId || runId === activeAgentRun.runId)) { + return { + success: true, + runId: activeAgentRun.runId, + status: activeAgentRun.status, + metrics: activeAgentRun.metrics, + lastHeartbeatAt: activeAgentRun.metrics.timestamp, + startedAt: activeAgentRun.startedAt, + config: activeAgentRun.config, + active: true, + }; + } + + if (!runId) { + return { success: true, active: false, status: 'idle' }; + } + + try { + const 
output = await runPythonScript('anomaly_monitor.py', ['status', '--run-id', runId]); + const parsed = JSON.parse(output || '{}'); + return parsed; + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:stop', async (event, runId = null) => { + if (!activeAgentRun) { + return { success: false, error: 'No active agent run' }; + } + if (runId && runId !== activeAgentRun.runId) { + return { success: false, error: `Requested run ${runId} does not match active run ${activeAgentRun.runId}` }; + } + return stopActiveAgent('stopped_by_user'); +}); + +ipcMain.handle('agents:list-events', async (event, filters = {}) => { + const args = ['list-events']; + if (filters.limit) args.push('--limit', String(filters.limit)); + if (filters.state) args.push('--state', String(filters.state)); + if (filters.severity) args.push('--severity', String(filters.severity)); + if (filters.runId) args.push('--run-id', String(filters.runId)); + + try { + const output = await runPythonScript('anomaly_monitor.py', args); + return JSON.parse(output || '{"success":true,"events":[]}'); + } catch (error) { + return { success: false, error: error.message, events: [] }; + } +}); + +ipcMain.handle('agents:get-event', async (event, eventId) => { + try { + const output = await runPythonScript('anomaly_monitor.py', ['get-event', '--event-id', String(eventId)]); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:ack-event', async (event, eventId, note = '') => { + try { + const args = ['ack-event', '--event-id', String(eventId)]; + if (note) args.push('--note', String(note)); + const output = await runPythonScript('anomaly_monitor.py', args); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('agents:cleanup', async (event, retentionDays = 14) => { + try { + const output = await 
runPythonScript('anomaly_monitor.py', [ + 'cleanup', + '--retention-days', + String(retentionDays), + ]); + return JSON.parse(output || '{}'); + } catch (error) { + return { success: false, error: error.message }; + } }); \ No newline at end of file diff --git a/electron-ui/preload.js b/electron-ui/preload.js index d3c8171..1e0930c 100644 --- a/electron-ui/preload.js +++ b/electron-ui/preload.js @@ -70,6 +70,15 @@ contextBridge.exposeInMainWorld('api', { getSettings: () => ipcRenderer.invoke('get-settings'), saveSettings: (settings) => ipcRenderer.invoke('save-settings', settings), testIgnitionConnection: (options) => ipcRenderer.invoke('test-ignition-connection', options), + + // Long-running agents monitoring + agentsStart: (config) => ipcRenderer.invoke('agents:start', config), + agentsStatus: (runId) => ipcRenderer.invoke('agents:status', runId), + agentsStop: (runId) => ipcRenderer.invoke('agents:stop', runId), + agentsListEvents: (filters) => ipcRenderer.invoke('agents:list-events', filters), + agentsGetEvent: (eventId) => ipcRenderer.invoke('agents:get-event', eventId), + agentsAckEvent: (eventId, note) => ipcRenderer.invoke('agents:ack-event', eventId, note), + agentsCleanup: (retentionDays) => ipcRenderer.invoke('agents:cleanup', retentionDays), // Database connections getDbConnections: () => ipcRenderer.invoke('get-db-connections'), @@ -91,6 +100,26 @@ contextBridge.exposeInMainWorld('api', { const handler = (event, data) => callback(data); ipcRenderer.on('stream-complete', handler); return () => ipcRenderer.removeListener('stream-complete', handler); + }, + onAgentStatus: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-status', handler); + return () => ipcRenderer.removeListener('agent-status', handler); + }, + onAgentEvent: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-event', handler); + return () => ipcRenderer.removeListener('agent-event', handler); + }, + 
onAgentError: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-error', handler); + return () => ipcRenderer.removeListener('agent-error', handler); + }, + onAgentComplete: (callback) => { + const handler = (event, data) => callback(data); + ipcRenderer.on('agent-complete', handler); + return () => ipcRenderer.removeListener('agent-complete', handler); } }); diff --git a/electron-ui/renderer.js b/electron-ui/renderer.js index 53974f5..cab7e8b 100644 --- a/electron-ui/renderer.js +++ b/electron-ui/renderer.js @@ -3536,6 +3536,381 @@ btnSaveDbCreds?.addEventListener('click', async () => { btnSaveDbCreds.disabled = false; } }); +// Agents Tab - Long-running monitoring +// ============================================ + +const agentsState = { + runId: null, + status: 'idle', + events: [], + selectedEventId: null, + listenersReady: false, +}; + +function getAgentsElements() { + return { + btnStart: document.getElementById('btn-agents-start'), + btnStop: document.getElementById('btn-agents-stop'), + btnRefresh: document.getElementById('btn-agents-refresh'), + btnCleanup: document.getElementById('btn-agents-cleanup'), + btnOpenGraph: document.getElementById('btn-agents-open-graph'), + btnAck: document.getElementById('btn-agents-ack'), + statusChip: document.getElementById('agents-status-chip'), + statusText: document.getElementById('agents-status-text'), + list: document.getElementById('agents-event-list'), + detail: document.getElementById('agents-event-detail'), + filterState: document.getElementById('agents-filter-state'), + filterSeverity: document.getElementById('agents-filter-severity'), + filterSearch: document.getElementById('agents-filter-search'), + metricCycle: document.getElementById('agents-metric-cycle'), + metricCandidates: document.getElementById('agents-metric-candidates'), + metricTriaged: document.getElementById('agents-metric-triaged'), + metricEmitted: document.getElementById('agents-metric-emitted'), + 
metricHeartbeat: document.getElementById('agents-metric-heartbeat'), + cfgPoll: document.getElementById('agents-config-poll-ms'), + cfgHist: document.getElementById('agents-config-history-min'), + cfgPoints: document.getElementById('agents-config-min-points'), + cfgMaxLlm: document.getElementById('agents-config-max-llm'), + cfgZ: document.getElementById('agents-config-threshold-z'), + cfgMad: document.getElementById('agents-config-threshold-mad'), + cfgStale: document.getElementById('agents-config-staleness-sec'), + }; +} + +function getAgentsConfigFromUI() { + const el = getAgentsElements(); + return { + pollIntervalMs: Number(el.cfgPoll?.value || 15000), + historyWindowMinutes: Number(el.cfgHist?.value || 360), + minHistoryPoints: Number(el.cfgPoints?.value || 30), + maxLlmTriagesPerCycle: Number(el.cfgMaxLlm?.value || 5), + thresholds: { + z: Number(el.cfgZ?.value || 3), + mad: Number(el.cfgMad?.value || 3.5), + stalenessSec: Number(el.cfgStale?.value || 120), + }, + }; +} + +function formatAgentTime(ts) { + if (!ts) return 'n/a'; + const d = new Date(ts); + if (Number.isNaN(d.getTime())) return String(ts); + return d.toLocaleString(); +} + +function updateAgentStatusUi(status, text) { + const el = getAgentsElements(); + if (!el.statusChip || !el.statusText) return; + el.statusChip.className = 'status-chip'; + const normalized = (status || 'idle').toLowerCase(); + if (normalized === 'running') el.statusChip.classList.add('running'); + if (normalized === 'failed' || normalized === 'error') el.statusChip.classList.add('error'); + el.statusChip.textContent = normalized; + el.statusText.textContent = text || normalized; + if (el.btnStart) el.btnStart.disabled = normalized === 'running' || normalized === 'starting'; + if (el.btnStop) el.btnStop.disabled = !(normalized === 'running' || normalized === 'starting' || normalized === 'stopping'); +} + +function updateAgentMetrics(metrics = {}, heartbeatTs = null) { + const el = getAgentsElements(); + if (el.metricCycle) 
el.metricCycle.textContent = String(metrics.cycleMs ?? metrics.lastCycleMs ?? 0); + if (el.metricCandidates) el.metricCandidates.textContent = String(metrics.candidates ?? metrics.lastCandidates ?? 0); + if (el.metricTriaged) el.metricTriaged.textContent = String(metrics.triaged ?? metrics.lastTriaged ?? 0); + if (el.metricEmitted) el.metricEmitted.textContent = String(metrics.emitted ?? metrics.lastEmitted ?? 0); + if (el.metricHeartbeat) el.metricHeartbeat.textContent = formatAgentTime(heartbeatTs || metrics.timestamp); +} + +function getFilteredAgentEvents() { + const el = getAgentsElements(); + const state = (el.filterState?.value || '').toLowerCase(); + const severity = (el.filterSeverity?.value || '').toLowerCase(); + const search = (el.filterSearch?.value || '').trim().toLowerCase(); + return agentsState.events.filter((event) => { + if (state && String(event.state || '').toLowerCase() !== state) return false; + if (severity && String(event.severity || '').toLowerCase() !== severity) return false; + if (search) { + const haystack = [ + event.summary, + event.source_tag, + event.tag_name, + ...(event.equipment || []), + ...(event.tags || []), + ] + .filter(Boolean) + .join(' ') + .toLowerCase(); + if (!haystack.includes(search)) return false; + } + return true; + }); +} + +function renderAgentEventList() { + const el = getAgentsElements(); + if (!el.list) return; + const events = getFilteredAgentEvents(); + if (!events.length) { + el.list.innerHTML = '
No anomaly events match the current filters.
'; + return; + } + el.list.innerHTML = events + .map((event) => { + const active = event.event_id === agentsState.selectedEventId ? ' active' : ''; + const sev = String(event.severity || 'low').toLowerCase(); + const equipment = (event.equipment || []).slice(0, 2).join(', '); + return ` +
+
+ ${escapeHtml(sev)} + ${escapeHtml(formatAgentTime(event.created_at))} +
+
${escapeHtml(event.summary || 'Untitled anomaly')}
+
${escapeHtml(event.tag_name || event.source_tag || '')}${equipment ? ` • ${escapeHtml(equipment)}` : ''}
+
+ `; + }) + .join(''); + + el.list.querySelectorAll('.agents-event-card').forEach((card) => { + card.addEventListener('click', () => { + const eventId = card.getAttribute('data-event-id'); + if (!eventId) return; + selectAgentEvent(eventId); + }); + }); +} + +function resolveAgentGraphTarget(event) { + const equipment = (event.equipment || []).find(Boolean); + if (equipment) return { name: equipment, type: 'Equipment' }; + const tagName = event.tag_name || (event.tags || []).find(Boolean) || event.source_tag; + if (tagName) return { name: tagName, type: 'ScadaTag' }; + return null; +} + +function renderAgentEventDetails(event) { + const el = getAgentsElements(); + if (!el.detail) return; + if (!event) { + el.detail.innerHTML = '

Select an anomaly event from the feed.

'; + if (el.btnOpenGraph) el.btnOpenGraph.disabled = true; + if (el.btnAck) el.btnAck.disabled = true; + return; + } + + let checks = []; + let causes = []; + let safety = []; + try { checks = JSON.parse(event.recommended_checks_json || '[]'); } catch (e) {} + try { causes = JSON.parse(event.probable_causes_json || '[]'); } catch (e) {} + try { safety = JSON.parse(event.safety_notes_json || '[]'); } catch (e) {} + + el.detail.innerHTML = ` +
+
Event ID${escapeHtml(event.event_id || '')}
+
State${escapeHtml(event.state || '')}
+
Severity${escapeHtml(event.severity || '')}
+
Confidence${escapeHtml(String(event.confidence ?? ''))}
+
Category${escapeHtml(event.category || '')}
+
Timestamp${escapeHtml(formatAgentTime(event.created_at))}
+
Source Tag${escapeHtml(event.source_tag || '')}
+
Tag Name${escapeHtml(event.tag_name || '')}
+
z-score${escapeHtml(String(event.z_score ?? '0'))}
+
MAD score${escapeHtml(String(event.mad_score ?? '0'))}
+
+
+
Summary
+
${escapeHtml(event.summary || '')}
+
+
+
Explanation
+
${escapeHtml(event.explanation || '')}
+
+
+
Probable Causes
+
    ${(causes || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+
+
Verification Checks
+
    ${(checks || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+
+
Safety Notes
+
    ${(safety || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
+
+ `; + + if (el.btnOpenGraph) el.btnOpenGraph.disabled = !resolveAgentGraphTarget(event); + if (el.btnAck) el.btnAck.disabled = event.state === 'acknowledged'; +} + +async function selectAgentEvent(eventId) { + agentsState.selectedEventId = eventId; + const existing = agentsState.events.find((e) => e.event_id === eventId); + if (existing && existing.explanation && existing.recommended_checks_json) { + renderAgentEventList(); + renderAgentEventDetails(existing); + return; + } + const detailResult = await window.api.agentsGetEvent(eventId); + if (detailResult.success && detailResult.event) { + const idx = agentsState.events.findIndex((e) => e.event_id === eventId); + if (idx >= 0) { + agentsState.events[idx] = { ...agentsState.events[idx], ...detailResult.event }; + } else { + agentsState.events.unshift(detailResult.event); + } + renderAgentEventList(); + renderAgentEventDetails(detailResult.event); + } +} + +async function loadAgentEvents() { + const el = getAgentsElements(); + const result = await window.api.agentsListEvents({ + limit: 200, + state: el.filterState?.value || undefined, + severity: el.filterSeverity?.value || undefined, + runId: agentsState.runId || undefined, + }); + if (!result.success) return; + agentsState.events = Array.isArray(result.events) ? 
result.events : []; + renderAgentEventList(); + + if (agentsState.selectedEventId) { + const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + renderAgentEventDetails(selected || null); + } +} + +async function refreshAgentStatus() { + const status = await window.api.agentsStatus(agentsState.runId || undefined); + if (!status.success) { + updateAgentStatusUi('error', status.error || 'Failed to fetch status'); + return; + } + if (status.active) { + agentsState.runId = status.runId || agentsState.runId; + agentsState.status = status.status || 'running'; + updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId}`); + updateAgentMetrics(status.metrics || {}, status.lastHeartbeatAt); + } else { + agentsState.status = 'idle'; + updateAgentStatusUi('idle', 'No active run'); + } +} + +async function startAgentsMonitoring() { + const config = getAgentsConfigFromUI(); + const result = await window.api.agentsStart(config); + if (!result.success) { + updateAgentStatusUi('error', result.error || 'Failed to start monitoring'); + return; + } + agentsState.runId = result.runId; + agentsState.status = 'running'; + updateAgentStatusUi('running', `Run ${result.runId}`); + await loadAgentEvents(); +} + +async function stopAgentsMonitoring() { + const result = await window.api.agentsStop(agentsState.runId || undefined); + if (!result.success) { + updateAgentStatusUi('error', result.error || 'Failed to stop monitoring'); + return; + } + agentsState.status = 'stopped'; + updateAgentStatusUi('stopped', 'Monitoring stopped'); +} + +async function acknowledgeSelectedAgentEvent() { + if (!agentsState.selectedEventId) return; + const result = await window.api.agentsAckEvent(agentsState.selectedEventId, ''); + if (!result.success) return; + await loadAgentEvents(); + const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + renderAgentEventDetails(selected || null); +} + +function 
upsertRealtimeAgentEvent(payload) { + if (!payload || !payload.eventId) return; + const idx = agentsState.events.findIndex((e) => e.event_id === payload.eventId); + const next = { + event_id: payload.eventId, + severity: payload.severity || 'medium', + summary: payload.summary || 'Anomaly detected', + category: payload.category || 'deviation', + created_at: payload.createdAt || new Date().toISOString(), + source_tag: payload.entityRefs?.sourceTag || payload.entityRefs?.tag || '', + tag_name: payload.entityRefs?.tag || '', + state: 'open', + }; + if (idx >= 0) { + agentsState.events[idx] = { ...agentsState.events[idx], ...next }; + } else { + agentsState.events.unshift(next); + } + renderAgentEventList(); +} + +function ensureAgentListeners() { + if (agentsState.listenersReady) return; + agentsState.listenersReady = true; + + window.api.onAgentStatus((payload) => { + if (!payload) return; + if (payload.runId) agentsState.runId = payload.runId; + agentsState.status = payload.state || agentsState.status; + updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId || 'n/a'}`); + updateAgentMetrics(payload, payload.timestamp); + }); + + window.api.onAgentEvent((payload) => { + upsertRealtimeAgentEvent(payload); + }); + + window.api.onAgentError((payload) => { + if (!payload) return; + updateAgentStatusUi('error', payload.message || 'Agent runtime error'); + }); + + window.api.onAgentComplete((payload) => { + if (!payload) return; + agentsState.status = payload.success ? 
'stopped' : 'failed'; + updateAgentStatusUi(agentsState.status, payload.reason || 'Run complete'); + refreshAgentStatus(); + }); +} + +function initAgentsTab() { + ensureAgentListeners(); + const el = getAgentsElements(); + if (!el.btnStart) return; + if (!el.btnStart.dataset.bound) { + el.btnStart.dataset.bound = '1'; + el.btnStart.addEventListener('click', startAgentsMonitoring); + el.btnStop?.addEventListener('click', stopAgentsMonitoring); + el.btnRefresh?.addEventListener('click', loadAgentEvents); + el.btnCleanup?.addEventListener('click', async () => { + await window.api.agentsCleanup(14); + await loadAgentEvents(); + }); + el.btnAck?.addEventListener('click', acknowledgeSelectedAgentEvent); + el.btnOpenGraph?.addEventListener('click', () => { + const event = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); + if (!event) return; + const target = resolveAgentGraphTarget(event); + if (!target) return; + openGraphModal(target.name, target.type, event.summary || target.name); + }); + el.filterState?.addEventListener('change', loadAgentEvents); + el.filterSeverity?.addEventListener('change', loadAgentEvents); + el.filterSearch?.addEventListener('input', renderAgentEventList); + } + refreshAgentStatus(); + loadAgentEvents(); +} // Initialize graph tab when it's first shown navButtons.forEach(btn => { @@ -3558,6 +3933,9 @@ navButtons.forEach(btn => { loadSettings(); loadDbConnections(); } + if (btn.dataset.tab === 'agents') { + setTimeout(initAgentsTab, 100); + } }); }); @@ -3569,5 +3947,6 @@ setTimeout(() => { loadTiaProjects(); loadSettings(); loadDbConnections(); + ensureAgentListeners(); }, 500); diff --git a/electron-ui/styles.css b/electron-ui/styles.css index 5ba9186..f1e066e 100644 --- a/electron-ui/styles.css +++ b/electron-ui/styles.css @@ -2979,3 +2979,278 @@ select.input, .connection-status .status-dot { flex-shrink: 0; } + +/* ============================================ + AGENTS TAB + 
============================================ */ + +.agents-topbar { + display: flex; + justify-content: space-between; + align-items: center; + gap: var(--space-4); + margin-bottom: var(--space-3); + flex-wrap: wrap; +} + +.agents-run-controls { + display: flex; + gap: var(--space-2); + flex-wrap: wrap; +} + +.agents-run-status { + display: flex; + align-items: center; + gap: var(--space-2); + color: var(--color-text-secondary); + font-size: var(--text-sm); +} + +.status-chip { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: 999px; + border: 1px solid var(--color-border); + background: var(--color-bg-panel-2); + color: var(--color-text-secondary); + font-size: var(--text-xs); + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.4px; +} + +.status-chip.running { + color: var(--color-success); + border-color: rgba(34, 197, 94, 0.35); + background: rgba(34, 197, 94, 0.12); +} + +.status-chip.error { + color: var(--color-danger); + border-color: rgba(239, 68, 68, 0.35); + background: rgba(239, 68, 68, 0.12); +} + +.agents-config-row { + display: grid; + grid-template-columns: repeat(14, minmax(0, 1fr)); + gap: var(--space-2); + margin-bottom: var(--space-4); + align-items: center; +} + +.agents-config-row label { + font-size: var(--text-xs); + color: var(--color-text-secondary); + text-transform: uppercase; + letter-spacing: 0.35px; +} + +.agents-config-row .input { + min-width: 0; +} + +.agents-metrics-row { + display: grid; + grid-template-columns: repeat(5, minmax(0, 1fr)); + gap: var(--space-2); + margin-bottom: var(--space-4); +} + +.metric-card { + border: 1px solid var(--color-border); + background: var(--color-bg-panel); + border-radius: var(--radius-md); + padding: var(--space-2) var(--space-3); + display: flex; + flex-direction: column; + gap: 2px; +} + +.metric-label { + font-size: var(--text-xs); + color: var(--color-text-muted); +} + +.metric-value { + font-family: 
var(--font-mono); + font-size: var(--text-sm); + color: var(--color-text); +} + +.agents-main { + display: grid; + grid-template-columns: minmax(300px, 38%) minmax(0, 1fr); + gap: var(--space-3); + min-height: 480px; +} + +.agents-feed-panel, +.agents-detail-panel { + border: 1px solid var(--color-border); + background: var(--color-bg-panel); + border-radius: var(--radius-lg); + overflow: hidden; + display: flex; + flex-direction: column; +} + +.agents-feed-header, +.agents-detail-header { + padding: var(--space-3); + border-bottom: 1px solid var(--color-border-subtle); + display: flex; + justify-content: space-between; + align-items: center; + gap: var(--space-2); +} + +.agents-feed-header h3, +.agents-detail-header h3 { + font-size: var(--text-md); + font-weight: 600; +} + +.agents-feed-filters { + display: flex; + gap: var(--space-2); + flex-wrap: wrap; +} + +.agents-feed-filters .input { + min-width: 120px; +} + +.agents-event-list { + overflow-y: auto; + padding: var(--space-2); + display: flex; + flex-direction: column; + gap: var(--space-2); + flex: 1; +} + +.agents-empty { + color: var(--color-text-muted); + font-size: var(--text-sm); + padding: var(--space-4); + text-align: center; +} + +.agents-event-card { + border: 1px solid var(--color-border); + background: var(--color-bg-panel-2); + border-radius: var(--radius-md); + padding: var(--space-2) var(--space-3); + cursor: pointer; + transition: border-color var(--transition-fast), transform var(--transition-fast); +} + +.agents-event-card:hover { + border-color: var(--color-border-active); + transform: translateY(-1px); +} + +.agents-event-card.active { + border-color: var(--color-accent); + box-shadow: 0 0 0 1px rgba(34, 211, 238, 0.35) inset; +} + +.agents-event-line-top { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 4px; + gap: var(--space-2); +} + +.agents-severity { + font-size: var(--text-xs); + text-transform: uppercase; + letter-spacing: 0.4px; + 
padding: 2px 6px; + border-radius: 999px; + border: 1px solid transparent; +} + +.agents-severity.sev-critical { + color: #fecaca; + background: rgba(239, 68, 68, 0.2); + border-color: rgba(239, 68, 68, 0.4); +} + +.agents-severity.sev-high { + color: #fdba74; + background: rgba(249, 115, 22, 0.18); + border-color: rgba(249, 115, 22, 0.35); +} + +.agents-severity.sev-medium { + color: #fde68a; + background: rgba(245, 158, 11, 0.15); + border-color: rgba(245, 158, 11, 0.35); +} + +.agents-severity.sev-low { + color: #bfdbfe; + background: rgba(59, 130, 246, 0.15); + border-color: rgba(59, 130, 246, 0.35); +} + +.agents-event-time { + font-size: var(--text-xs); + color: var(--color-text-muted); + font-family: var(--font-mono); +} + +.agents-event-summary { + font-size: var(--text-sm); + color: var(--color-text); + margin-bottom: 4px; +} + +.agents-event-meta { + font-size: var(--text-xs); + color: var(--color-text-muted); +} + +.agents-detail-content { + padding: var(--space-3); + overflow-y: auto; + font-size: var(--text-sm); + display: flex; + flex-direction: column; + gap: var(--space-3); +} + +.agents-detail-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: var(--space-2) var(--space-3); +} + +.agents-detail-item { + display: flex; + flex-direction: column; + gap: 2px; +} + +.agents-detail-label { + font-size: var(--text-xs); + color: var(--color-text-muted); + text-transform: uppercase; + letter-spacing: 0.3px; +} + +.agents-detail-value { + font-family: var(--font-mono); + color: var(--color-text); +} + +.agents-list { + margin-left: var(--space-4); + color: var(--color-text-secondary); +} diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py new file mode 100644 index 0000000..70a0f4b --- /dev/null +++ b/scripts/anomaly_monitor.py @@ -0,0 +1,912 @@ +#!/usr/bin/env python3 +""" +Long-running anomaly monitor worker. 
+ +Modes: + - run: start continuous monitoring loop + - status: get run status + - list-events: list persisted anomaly events + - get-event: fetch one anomaly event + - ack-event: mark event as acknowledged + - cleanup: delete old events by retention policy + - replay-fixtures: run deterministic fixture validation +""" + +from __future__ import annotations + +import argparse +import json +import os +import signal +import sys +import time +import uuid +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional + +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback for minimal environments + def load_dotenv(*_args, **_kwargs): + return False + +from anomaly_rules import ( + compute_deviation_scores, + dedup_key, + is_quality_good, + is_stale, + parse_timestamp, + safe_float, +) + + +load_dotenv() + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def emit(prefix: str, payload: Dict[str, Any]) -> None: + """Emit machine-parseable messages for Electron main process.""" + print(f"[{prefix}] {json.dumps(payload, default=str)}", flush=True) + + +def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + raw = dict(config or {}) + thresholds = raw.get("thresholds", {}) if isinstance(raw.get("thresholds"), dict) else {} + defaults = { + "pollIntervalMs": 15000, + "historyWindowMinutes": 360, + "minHistoryPoints": 30, + "maxMonitoredTags": 200, + "maxCandidatesPerCycle": 25, + "maxLlmTriagesPerCycle": 5, + "dedupCooldownMinutes": 10, + "retentionDays": 14, + "cleanupEveryCycles": 40, + "runMode": "live", + "scope": { + "project": None, + "equipmentTags": [], + "tagRegex": None, + }, + "thresholds": { + "z": 3.0, + "mad": 3.5, + "rate": 0.0, + "stalenessSec": 120, + "flatline_std_epsilon": 1e-6, + "stuck_window_size": 20, + }, + } + cfg = defaults + cfg.update({k: v for k, v in raw.items() if k in defaults and k != 
"thresholds"}) + cfg["thresholds"].update({k: v for k, v in thresholds.items() if v is not None}) + if isinstance(raw.get("scope"), dict): + cfg["scope"].update(raw["scope"]) + return cfg + + +class AnomalyMonitor: + def __init__(self, config: Dict[str, Any], run_id: Optional[str] = None): + self.config = merge_defaults(config) + self.run_id = run_id or f"agent-run-{uuid.uuid4()}" + from ignition_api_client import IgnitionApiClient + from neo4j_ontology import get_ontology_graph + + self.graph = get_ontology_graph() + + self.api = IgnitionApiClient( + base_url=self.config.get("ignitionApiUrl") or os.getenv("IGNITION_API_URL"), + api_token=self.config.get("ignitionApiToken") or os.getenv("IGNITION_API_TOKEN"), + timeout=15.0, + ) + + self.llm = None + self._llm_enabled = bool(os.getenv("ANTHROPIC_API_KEY")) + if self._llm_enabled: + try: + from claude_client import ClaudeClient + + self.llm = ClaudeClient( + enable_tools=False, + ignition_api_url=self.config.get("ignitionApiUrl"), + ignition_api_token=self.config.get("ignitionApiToken"), + ) + except Exception as exc: + self._llm_enabled = False + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "llm_init_failed", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + self._running = True + self._cycle_count = 0 + self._prev_values: Dict[str, float] = {} + + # ----------------------------- + # Schema / run lifecycle + # ----------------------------- + def init_schema(self) -> None: + self.graph.init_agent_monitoring_schema() + + def upsert_run(self, status: str, reason: Optional[str] = None) -> None: + with self.graph.session() as session: + session.run( + """ + MERGE (r:AgentRun {run_id: $run_id}) + SET r.status = $status, + r.updated_at = datetime(), + r.last_heartbeat_at = datetime(), + r.config_json = $config_json, + r.cycle_count = $cycle_count, + r.started_at = coalesce(r.started_at, datetime()), + r.stopped_at = CASE WHEN $status IN ['stopped', 'failed'] THEN datetime() 
ELSE r.stopped_at END, + r.stop_reason = CASE WHEN $reason IS NULL THEN r.stop_reason ELSE $reason END + """, + run_id=self.run_id, + status=status, + config_json=json.dumps(self.config, default=str), + cycle_count=self._cycle_count, + reason=reason, + ) + + def heartbeat(self, metrics: Dict[str, Any]) -> None: + with self.graph.session() as session: + session.run( + """ + MATCH (r:AgentRun {run_id: $run_id}) + SET r.last_heartbeat_at = datetime(), + r.cycle_count = $cycle_count, + r.last_cycle_ms = $cycle_ms, + r.last_candidates = $candidates, + r.last_triaged = $triaged, + r.last_emitted = $emitted + """, + run_id=self.run_id, + cycle_count=self._cycle_count, + cycle_ms=metrics.get("cycleMs", 0), + candidates=metrics.get("candidates", 0), + triaged=metrics.get("triaged", 0), + emitted=metrics.get("emitted", 0), + ) + + # ----------------------------- + # Tag and context collection + # ----------------------------- + def get_monitored_tags(self) -> List[Dict[str, str]]: + max_tags = int(self.config.get("maxMonitoredTags", 200)) + scope = self.config.get("scope", {}) + tag_regex = scope.get("tagRegex") + equipment_tags = set(scope.get("equipmentTags") or []) + + with self.graph.session() as session: + result = session.run( + """ + MATCH (t:ScadaTag) + WHERE coalesce(t.opc_item_path, t.name) IS NOT NULL + AND coalesce(t.opc_item_path, t.name) <> '' + RETURN DISTINCT coalesce(t.opc_item_path, t.name) AS tag_path, + coalesce(t.name, t.opc_item_path) AS tag_name + LIMIT $limit + """, + limit=max_tags * 3, + ) + tags = [{"path": r["tag_path"], "name": r["tag_name"]} for r in result if r["tag_path"]] + + if tag_regex: + import re + try: + pattern = re.compile(tag_regex, re.IGNORECASE) + tags = [t for t in tags if pattern.search(t["path"]) or pattern.search(t["name"])] + except re.error: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "invalid_tag_regex", + "message": f"Invalid regex: {tag_regex}", + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + if 
equipment_tags: + tags = [t for t in tags if t["name"] in equipment_tags or t["path"] in equipment_tags] + + return tags[:max_tags] + + def _extract_history_values(self, history_data: Any, tag_path: str) -> List[float]: + """Normalize multiple gateway response shapes to numeric values list.""" + values: List[float] = [] + if history_data is None: + return values + if isinstance(history_data, dict) and history_data.get("error"): + return values + + rows: List[Any] = [] + if isinstance(history_data, list): + rows = history_data + elif isinstance(history_data, dict): + for key in ("rows", "data", "results", "values", "history"): + chunk = history_data.get(key) + if isinstance(chunk, list): + rows = chunk + break + if not rows and "tagHistory" in history_data and isinstance(history_data["tagHistory"], list): + rows = history_data["tagHistory"] + + for row in rows: + if isinstance(row, (int, float, str)): + val = safe_float(row) + if val is not None: + values.append(val) + continue + if not isinstance(row, dict): + continue + candidate = None + if "value" in row: + candidate = row.get("value") + elif tag_path in row: + candidate = row.get(tag_path) + else: + # Wide format often has timestamp + one tag column. 
+ for k, v in row.items(): + if k.lower() in {"timestamp", "ts", "t", "time"}: + continue + candidate = v + break + val = safe_float(candidate) + if val is not None: + values.append(val) + return values + + def fetch_history_values(self, tag_path: str) -> List[float]: + minutes = int(self.config.get("historyWindowMinutes", 360)) + end_dt = datetime.now(timezone.utc) + start_dt = end_dt - timedelta(minutes=minutes) + data = self.api.query_tag_history( + [tag_path], + start_dt.isoformat(), + end_dt.isoformat(), + return_size=max(100, int(self.config.get("minHistoryPoints", 30)) * 4), + aggregation_mode="Average", + return_format="Wide", + ) + return self._extract_history_values(data, tag_path) + + def get_context(self, tag_path: str) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (t:ScadaTag) + WHERE t.name = $tag OR t.opc_item_path = $tag + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) + OPTIONAL MATCH (eq)-[:HAS_SYMPTOM]->(s:FaultSymptom) + OPTIONAL MATCH (s)-[:CAUSED_BY]->(c:FaultCause) + OPTIONAL MATCH (eq)-[:HAS_PATTERN]->(p:ControlPattern) + OPTIONAL MATCH (eq)-[:SAFETY_CRITICAL]->(se:SafetyElement) + RETURN t, + collect(DISTINCT eq.name) AS equipment, + collect(DISTINCT s.symptom) AS symptoms, + collect(DISTINCT c.cause) AS causes, + collect(DISTINCT p.pattern_name) AS patterns, + collect(DISTINCT se.name) AS safety + LIMIT 1 + """, + tag=tag_path, + ) + record = result.single() + if not record: + return { + "tag_path": tag_path, + "equipment": [], + "symptoms": [], + "causes": [], + "patterns": [], + "safety": [], + } + node = record["t"] + return { + "tag_path": tag_path, + "tag_name": node.get("name") if node else tag_path, + "equipment": [x for x in record["equipment"] if x], + "symptoms": [x for x in record["symptoms"] if x], + "causes": [x for x in record["causes"] if x], + "patterns": [x for x in record["patterns"] if x], + "safety": [x for x in record["safety"] if x], + } + + # ----------------------------- 
+    # Triage and persistence
+    # -----------------------------
+    def run_llm_triage(
+        self,
+        context: Dict[str, Any],
+        deterministic: Dict[str, Any],
+        live_sample: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Produce a structured triage verdict for one anomaly candidate.
+
+        Args:
+            context: Graph context for the tag (equipment, symptoms, causes,
+                patterns, safety elements) from get_context().
+            deterministic: Scores/flags produced by compute_deviation_scores.
+            live_sample: Latest live read (path/value/quality/timestamp).
+
+        Returns:
+            Dict with keys summary, category, severity, confidence,
+            probable_causes, verification_checks, safety_notes, rationale,
+            related_entities. When no LLM client is configured, the
+            deterministic fallback below is returned unchanged.
+        """
+        # Deterministic fallback verdict; also serves as the base dict that
+        # any LLM JSON output is merged onto, so every key is always present.
+        fallback = {
+            "summary": f"Deterministic anomaly on {context.get('tag_name', context['tag_path'])}",
+            "category": deterministic.get("category", "deviation"),
+            "severity": "medium",
+            "confidence": 0.55,
+            "probable_causes": ["Signal deviates from historical baseline."],
+            "verification_checks": [
+                f"Check live quality/timestamp for {context.get('tag_path')}",
+                "Inspect upstream interlocks and communication health.",
+            ],
+            "safety_notes": context.get("safety", []),
+            "rationale": "LLM triage unavailable; using deterministic fallback.",
+            "related_entities": [
+                {"label": "Equipment", "name": e} for e in context.get("equipment", [])[:3]
+            ],
+        }
+        if not self.llm:
+            return fallback
+
+        # Prompt constrains the model to a strict JSON schema so the caller
+        # can merge the response onto `fallback` without key checks.
+        system_prompt = (
+            "You are an industrial anomaly triage assistant. "
+            "Return ONLY valid JSON with keys: summary, category, severity, confidence, "
+            "probable_causes, verification_checks, safety_notes, rationale, related_entities. "
+            "Severity must be one of critical/high/medium/low. "
+            "Category must be one of spike/drift/stuck/state-conflict/quality-issue/deviation. "
+            "related_entities is a list of objects: {label,name}."
+ ) + user_prompt = json.dumps( + { + "context": context, + "deterministic": deterministic, + "live_sample": live_sample, + }, + default=str, + ) + try: + result = self.llm.query_json( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=900, + use_tools=False, + ) + data = result.get("data") + if not isinstance(data, dict): + return fallback + merged = dict(fallback) + merged.update({k: v for k, v in data.items() if v is not None}) + return merged + except Exception as exc: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "llm_triage_failed", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + return fallback + + def _severity_from_scores(self, deterministic: Dict[str, Any], llm_out: Dict[str, Any]) -> str: + sev = str(llm_out.get("severity", "")).lower() + if sev in {"critical", "high", "medium", "low"}: + return sev + z = abs(float(deterministic.get("z_score", 0.0))) + if z >= 8: + return "critical" + if z >= 5: + return "high" + if z >= 3: + return "medium" + return "low" + + def is_duplicate_recent(self, dedup_sig: str) -> bool: + cooldown = max(1, int(self.config.get("dedupCooldownMinutes", 10))) + with self.graph.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {dedup_key: $dedup_key}) + WHERE e.created_at IS NOT NULL + AND datetime(e.created_at) > datetime() - duration({minutes: $minutes}) + RETURN count(e) AS cnt + """, + dedup_key=dedup_sig, + minutes=cooldown, + ) + row = result.single() + return bool(row and row["cnt"] > 0) + + def persist_event( + self, + context: Dict[str, Any], + deterministic: Dict[str, Any], + live_sample: Dict[str, Any], + triage: Dict[str, Any], + ) -> Optional[Dict[str, Any]]: + category = triage.get("category") or deterministic.get("category", "deviation") + dedup_sig = dedup_key(context["tag_path"], category, int(self.config.get("dedupCooldownMinutes", 10))) + if self.is_duplicate_recent(dedup_sig): + return None + + event_id = 
f"ae-{uuid.uuid4()}" + severity = self._severity_from_scores(deterministic, triage) + confidence = float(max(0.0, min(1.0, triage.get("confidence", 0.5)))) + event_data = { + "event_id": event_id, + "run_id": self.run_id, + "event_schema_version": 1, + "state": "open", + "severity": severity, + "confidence": confidence, + "category": category, + "summary": triage.get("summary", f"Anomaly on {context['tag_path']}"), + "explanation": triage.get("rationale", ""), + "recommended_checks_json": json.dumps(triage.get("verification_checks", []), default=str), + "probable_causes_json": json.dumps(triage.get("probable_causes", []), default=str), + "safety_notes_json": json.dumps(triage.get("safety_notes", []), default=str), + "deterministic_reasons_json": json.dumps(deterministic.get("reasons", []), default=str), + "z_score": float(deterministic.get("z_score", 0.0)), + "mad_score": float(deterministic.get("mad_score", 0.0)), + "delta_rate": float(deterministic.get("delta_rate", 0.0)), + "window_volatility": float(deterministic.get("window_volatility", 0.0)), + "source_tag": context["tag_path"], + "tag_name": context.get("tag_name") or context["tag_path"], + "live_quality": live_sample.get("quality"), + "live_timestamp": live_sample.get("timestamp"), + "live_value": str(live_sample.get("value")), + "dedup_key": dedup_sig, + "created_at": utc_now_iso(), + "updated_at": utc_now_iso(), + } + + with self.graph.session() as session: + session.run( + """ + MATCH (r:AgentRun {run_id: $run_id}) + CREATE (e:AnomalyEvent $props) + MERGE (r)-[:EMITTED]->(e) + """, + run_id=self.run_id, + props=event_data, + ) + + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH (t:ScadaTag) + WHERE t.name = $tag OR t.opc_item_path = $tag + MERGE (e)-[:OBSERVED_ON]->(t) + """, + event_id=event_id, + tag=context["tag_path"], + ) + + for equipment_name in context.get("equipment", [])[:5]: + session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + MATCH (eq:Equipment 
{name: $name}) + MERGE (e)-[:AFFECTS]->(eq) + """, + event_id=event_id, + name=equipment_name, + ) + + related_inputs: List[Dict[str, str]] = [] + for item in triage.get("related_entities", []) or []: + if isinstance(item, dict) and item.get("label") and item.get("name"): + related_inputs.append({"label": str(item["label"]), "name": str(item["name"])}) + for name in context.get("symptoms", [])[:3]: + related_inputs.append({"label": "FaultSymptom", "name": name}) + for name in context.get("causes", [])[:3]: + related_inputs.append({"label": "FaultCause", "name": name}) + + for rel in related_inputs[:8]: + label = rel["label"] + if label not in {"FaultSymptom", "FaultCause", "ControlPattern", "SafetyElement", "Equipment", "ScadaTag"}: + continue + session.run( + f""" + MATCH (e:AnomalyEvent {{event_id: $event_id}}) + MATCH (n:{label}) + WHERE n.name = $name OR n.symptom = $name OR n.cause = $name + MERGE (e)-[:RELATED_TO]->(n) + """, + event_id=event_id, + name=rel["name"], + ) + + return event_data + + # ----------------------------- + # Monitoring loop + # ----------------------------- + def run_cycle(self) -> Dict[str, Any]: + cycle_start = time.time() + metrics = {"candidates": 0, "triaged": 0, "emitted": 0, "cycleMs": 0} + thresholds = self.config.get("thresholds", {}) + min_history = int(self.config.get("minHistoryPoints", 30)) + + if not self.api.is_configured: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "ignition_not_configured", + "message": "Ignition API URL/token not configured.", + "recoverable": True, + "timestamp": utc_now_iso(), + }) + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + + tags = self.get_monitored_tags() + if not tags: + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "no_tags_found", + "message": "No ScadaTag nodes with readable tag paths found.", + "recoverable": True, + "timestamp": utc_now_iso(), + }) + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + 
+ tag_paths = [t["path"] for t in tags] + live_values = self.api.read_tags(tag_paths) + candidates: List[Dict[str, Any]] = [] + now = datetime.now(timezone.utc) + + for tv in live_values: + if tv.error: + continue + if not is_quality_good(tv.quality): + # quality gate: only emit quality anomalies if this persists via triage. + continue + if is_stale(tv.timestamp, int(thresholds.get("stalenessSec", 120)), now=now): + continue + + history = self.fetch_history_values(tv.path) + if len(history) < min_history: + continue + + prev_val = self._prev_values.get(tv.path) + deterministic = compute_deviation_scores( + current_value=tv.value, + history_values=history, + prev_value=prev_val, + thresholds=thresholds, + ) + curr_num = safe_float(tv.value) + if curr_num is not None: + self._prev_values[tv.path] = curr_num + + if deterministic.get("candidate"): + context = self.get_context(tv.path) + candidates.append( + { + "context": context, + "deterministic": deterministic, + "live_sample": { + "path": tv.path, + "value": tv.value, + "quality": tv.quality, + "timestamp": tv.timestamp, + "data_type": tv.data_type, + }, + } + ) + + metrics["candidates"] = len(candidates) + max_candidates = int(self.config.get("maxCandidatesPerCycle", 25)) + max_triage = int(self.config.get("maxLlmTriagesPerCycle", 5)) + shortlisted = candidates[:max_candidates] + + for idx, candidate in enumerate(shortlisted): + use_llm = idx < max_triage + triage = ( + self.run_llm_triage( + candidate["context"], + candidate["deterministic"], + candidate["live_sample"], + ) + if use_llm + else { + "summary": f"Deviation on {candidate['context'].get('tag_name', candidate['context']['tag_path'])}", + "category": candidate["deterministic"].get("category", "deviation"), + "severity": "medium", + "confidence": 0.5, + "verification_checks": [], + "probable_causes": [], + "safety_notes": [], + "rationale": "Triaged in deterministic-only mode due per-cycle LLM cap.", + "related_entities": [], + } + ) + metrics["triaged"] 
+= 1 + persisted = self.persist_event( + candidate["context"], + candidate["deterministic"], + candidate["live_sample"], + triage, + ) + if persisted: + metrics["emitted"] += 1 + emit("AGENT_EVENT", { + "runId": self.run_id, + "eventId": persisted["event_id"], + "severity": persisted["severity"], + "summary": persisted["summary"], + "category": persisted.get("category"), + "entityRefs": { + "tag": persisted.get("tag_name") or persisted.get("source_tag"), + "sourceTag": persisted.get("source_tag"), + }, + "createdAt": persisted.get("created_at"), + }) + + metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) + return metrics + + def cleanup_retention(self) -> int: + retention_days = int(self.config.get("retentionDays", 14)) + return self.graph.cleanup_anomaly_events(retention_days=retention_days) + + def run_forever(self) -> int: + self.init_schema() + self.upsert_run("running") + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": 0, + "candidates": 0, + "triaged": 0, + "emitted": 0, + "timestamp": utc_now_iso(), + }) + + poll_ms = int(self.config.get("pollIntervalMs", 15000)) + cleanup_every = max(1, int(self.config.get("cleanupEveryCycles", 40))) + exit_code = 0 + reason = "stopped" + + while self._running: + self._cycle_count += 1 + cycle_started = time.time() + try: + metrics = self.run_cycle() + self.heartbeat(metrics) + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "running", + "cycleMs": metrics["cycleMs"], + "candidates": metrics["candidates"], + "triaged": metrics["triaged"], + "emitted": metrics["emitted"], + "timestamp": utc_now_iso(), + }) + if self._cycle_count % cleanup_every == 0: + deleted = self.cleanup_retention() + if deleted > 0: + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": "retention_cleanup", + "cycleMs": 0, + "candidates": 0, + "triaged": 0, + "emitted": deleted, + "timestamp": utc_now_iso(), + }) + except Exception as exc: + reason = "failed" + exit_code = 1 + 
emit("AGENT_ERROR", { + "runId": self.run_id, + "code": "cycle_error", + "message": str(exc), + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + elapsed_ms = int((time.time() - cycle_started) * 1000) + remaining = max(0, poll_ms - elapsed_ms) / 1000.0 + if remaining > 0: + time.sleep(remaining) + + self.upsert_run("stopped" if reason != "failed" else "failed", reason=reason) + emit("AGENT_COMPLETE", { + "runId": self.run_id, + "success": exit_code == 0, + "reason": reason, + "stoppedAt": utc_now_iso(), + }) + return exit_code + + # ----------------------------- + # Single-operation helpers + # ----------------------------- + def list_events(self, limit: int, state: Optional[str], severity: Optional[str], run_id: Optional[str]) -> Dict[str, Any]: + events = self.graph.list_anomaly_events(limit=limit, state=state, severity=severity, run_id=run_id) + return {"success": True, "events": events} + + def get_event(self, event_id: str) -> Dict[str, Any]: + event = self.graph.get_anomaly_event(event_id) + if not event: + return {"success": False, "error": f"Event not found: {event_id}"} + return {"success": True, "event": event} + + def ack_event(self, event_id: str, note: Optional[str]) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + SET e.state = 'acknowledged', + e.acknowledged_at = datetime(), + e.ack_note = $note, + e.updated_at = datetime() + RETURN count(e) AS cnt + """, + event_id=event_id, + note=note or "", + ) + record = result.single() + if not record or record["cnt"] == 0: + return {"success": False, "error": f"Event not found: {event_id}"} + return {"success": True, "eventId": event_id} + + def get_status(self, run_id: str) -> Dict[str, Any]: + with self.graph.session() as session: + result = session.run( + """ + MATCH (r:AgentRun {run_id: $run_id}) + RETURN r + LIMIT 1 + """, + run_id=run_id, + ) + row = result.single() + if not row: + return {"success": 
False, "error": f"Run not found: {run_id}"} + props = dict(row["r"]) + return { + "success": True, + "status": props.get("status"), + "metrics": { + "cycleCount": props.get("cycle_count", 0), + "lastCycleMs": props.get("last_cycle_ms", 0), + "lastCandidates": props.get("last_candidates", 0), + "lastTriaged": props.get("last_triaged", 0), + "lastEmitted": props.get("last_emitted", 0), + }, + "lastHeartbeatAt": props.get("last_heartbeat_at"), + "run": props, + } + + +def _load_fixture_cases(path: Path) -> List[Dict[str, Any]]: + data = json.loads(path.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data.get("cases", []) + if isinstance(data, list): + return data + return [] + + +def replay_fixtures(config_json: Optional[str], fixture_path: str) -> Dict[str, Any]: + config = merge_defaults(json.loads(config_json) if config_json else {}) + path = Path(fixture_path) + cases = _load_fixture_cases(path) + thresholds = config.get("thresholds", {}) + passed = 0 + failures: List[Dict[str, Any]] = [] + + for case in cases: + result = compute_deviation_scores( + current_value=case.get("current_value"), + history_values=case.get("history_values", []), + prev_value=case.get("prev_value"), + thresholds=thresholds, + ) + expected = bool(case.get("expected_candidate", False)) + if result.get("candidate") == expected: + passed += 1 + else: + failures.append( + { + "id": case.get("id"), + "expected_candidate": expected, + "actual_candidate": result.get("candidate"), + "category": result.get("category"), + "reasons": result.get("reasons", []), + } + ) + + return { + "success": len(failures) == 0, + "total": len(cases), + "passed": passed, + "failed": len(failures), + "failures": failures, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="Anomaly monitor worker") + sub = parser.add_subparsers(dest="command", required=True) + + p_run = sub.add_parser("run", help="Run continuous anomaly monitoring") + p_run.add_argument("--run-id", 
help="Optional run id") + p_run.add_argument("--config-json", default="{}", help="JSON config string") + + p_status = sub.add_parser("status", help="Get status for one run") + p_status.add_argument("--run-id", required=True) + + p_list = sub.add_parser("list-events", help="List anomaly events") + p_list.add_argument("--limit", type=int, default=100) + p_list.add_argument("--state") + p_list.add_argument("--severity") + p_list.add_argument("--run-id") + + p_get = sub.add_parser("get-event", help="Get one anomaly event") + p_get.add_argument("--event-id", required=True) + + p_ack = sub.add_parser("ack-event", help="Acknowledge one anomaly event") + p_ack.add_argument("--event-id", required=True) + p_ack.add_argument("--note") + + p_cleanup = sub.add_parser("cleanup", help="Delete old anomaly events") + p_cleanup.add_argument("--retention-days", type=int, default=14) + + p_replay = sub.add_parser("replay-fixtures", help="Validate deterministic scoring against fixtures") + p_replay.add_argument("--fixture-file", required=True) + p_replay.add_argument("--config-json", default="{}") + + args = parser.parse_args() + + if args.command == "replay-fixtures": + result = replay_fixtures(args.config_json, args.fixture_file) + print(json.dumps(result)) + return 0 if result["success"] else 1 + + try: + monitor = AnomalyMonitor( + config=json.loads(getattr(args, "config_json", "{}") or "{}"), + run_id=getattr(args, "run_id", None), + ) + except Exception as exc: + print(json.dumps({"success": False, "error": str(exc)})) + return 1 + + if args.command == "run": + def _signal_handler(_signum, _frame): + monitor._running = False + + signal.signal(signal.SIGTERM, _signal_handler) + if hasattr(signal, "SIGINT"): + signal.signal(signal.SIGINT, _signal_handler) + return monitor.run_forever() + + if args.command == "status": + print(json.dumps(monitor.get_status(args.run_id), default=str)) + return 0 + + if args.command == "list-events": + print(json.dumps(monitor.list_events(args.limit, 
args.state, args.severity, args.run_id), default=str)) + return 0 + + if args.command == "get-event": + print(json.dumps(monitor.get_event(args.event_id), default=str)) + return 0 + + if args.command == "ack-event": + print(json.dumps(monitor.ack_event(args.event_id, args.note), default=str)) + return 0 + + if args.command == "cleanup": + deleted = monitor.graph.cleanup_anomaly_events(args.retention_days) + print(json.dumps({"success": True, "deleted": deleted})) + return 0 + + return 1 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/scripts/anomaly_rules.py b/scripts/anomaly_rules.py new file mode 100644 index 0000000..2aa274d --- /dev/null +++ b/scripts/anomaly_rules.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Deterministic anomaly scoring primitives for monitoring agents. + +This module intentionally avoids external dependencies so it can run in +packaged/offline environments. +""" + +from __future__ import annotations + +import hashlib +import math +from datetime import datetime, timezone +from statistics import mean, median, pstdev +from typing import Any, Dict, List, Optional + + +def safe_float(value: Any) -> Optional[float]: + """Best-effort conversion to float.""" + if value is None: + return None + if isinstance(value, bool): + return float(value) + if isinstance(value, (int, float)): + if math.isnan(value) or math.isinf(value): + return None + return float(value) + text = str(value).strip() + if not text: + return None + try: + result = float(text) + except ValueError: + return None + if math.isnan(result) or math.isinf(result): + return None + return result + + +def parse_timestamp(ts: Optional[str]) -> Optional[datetime]: + """Parse an ISO-like timestamp to UTC-aware datetime.""" + if not ts: + return None + text = str(ts).strip() + if not text: + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(text) + except ValueError: + return None + if dt.tzinfo is None: + dt = 
dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + + +def is_quality_good(quality: Optional[str]) -> bool: + """Conservative quality gate.""" + if quality is None: + return False + q = str(quality).strip().lower() + if not q: + return False + if "good" in q or "ok" in q or q in {"192"}: + return True + return False + + +def is_stale(timestamp: Optional[str], staleness_sec: int, now: Optional[datetime] = None) -> bool: + """Return True if sample timestamp is stale or invalid.""" + if staleness_sec <= 0: + return False + parsed = parse_timestamp(timestamp) + if parsed is None: + return True + baseline = now or datetime.now(timezone.utc) + age = (baseline - parsed).total_seconds() + return age > staleness_sec + + +def _mad(values: List[float]) -> float: + """Median absolute deviation.""" + if not values: + return 0.0 + med = median(values) + abs_dev = [abs(v - med) for v in values] + return median(abs_dev) if abs_dev else 0.0 + + +def _percentile_rank(values: List[float], current: float) -> float: + """Approximate percentile rank of current within values.""" + if not values: + return 0.0 + less_or_equal = sum(1 for v in values if v <= current) + return less_or_equal / len(values) + + +def compute_deviation_scores( + current_value: Any, + history_values: List[Any], + prev_value: Any = None, + thresholds: Optional[Dict[str, float]] = None, +) -> Dict[str, Any]: + """ + Compute deterministic anomaly scores and candidate flags. + + Threshold defaults are intentionally conservative and should be configured + per process during rollout. 
+ """ + cfg = { + "z": 3.0, + "mad": 3.5, + "rate": 0.0, + "flatline_std_epsilon": 1e-6, + "stuck_window_size": 20, + } + if thresholds: + cfg.update({k: v for k, v in thresholds.items() if v is not None}) + + current = safe_float(current_value) + hist = [v for v in (safe_float(x) for x in history_values) if v is not None] + previous = safe_float(prev_value) + + result: Dict[str, Any] = { + "candidate": False, + "reasons": [], + "category": "normal", + "z_score": 0.0, + "mad_score": 0.0, + "delta_rate": 0.0, + "window_volatility": 0.0, + "percentile_rank": 0.0, + "drift_score": 0.0, + "history_points": len(hist), + } + + if current is None: + result["category"] = "invalid_value" + result["reasons"].append("current_value_not_numeric") + return result + if not hist: + result["category"] = "insufficient_history" + result["reasons"].append("history_empty") + return result + + mu = mean(hist) + sigma = pstdev(hist) if len(hist) > 1 else 0.0 + sigma = max(sigma, 1e-9) + z_score = (current - mu) / sigma + result["z_score"] = z_score + result["window_volatility"] = sigma + result["percentile_rank"] = _percentile_rank(hist, current) + + mad = _mad(hist) + mad_denom = max(mad * 1.4826, 1e-9) + mad_score = abs(current - median(hist)) / mad_denom + result["mad_score"] = mad_score + + if previous is not None: + result["delta_rate"] = abs(current - previous) + + if abs(z_score) >= float(cfg["z"]): + result["candidate"] = True + result["reasons"].append("z_score_threshold") + if mad_score >= float(cfg["mad"]): + result["candidate"] = True + result["reasons"].append("mad_score_threshold") + if float(cfg["rate"]) > 0 and result["delta_rate"] >= float(cfg["rate"]): + result["candidate"] = True + result["reasons"].append("delta_rate_threshold") + + if len(hist) >= 20: + midpoint = len(hist) // 2 + first_half = hist[:midpoint] + second_half = hist[midpoint:] + trend_delta = abs(mean(second_half) - mean(first_half)) + trend_score = trend_delta / sigma + result["drift_score"] = 
trend_score + if trend_score >= 1.25 and (result["percentile_rank"] >= 0.85 or result["percentile_rank"] <= 0.15): + result["candidate"] = True + result["reasons"].append("drift_trend") + + recent = hist[-int(max(3, cfg["stuck_window_size"])) :] + recent_std = pstdev(recent) if len(recent) > 1 else 0.0 + if recent_std <= float(cfg["flatline_std_epsilon"]): + if previous is not None and abs(current - previous) <= float(cfg["flatline_std_epsilon"]): + result["candidate"] = True + result["reasons"].append("flatline_detected") + result["category"] = "stuck" + + if result["category"] == "normal" and result["candidate"]: + if "flatline_detected" in result["reasons"]: + result["category"] = "stuck" + elif result["delta_rate"] > 0 and "delta_rate_threshold" in result["reasons"]: + result["category"] = "spike" + elif "drift_trend" in result["reasons"]: + result["category"] = "drift" + elif abs(z_score) > 0 and len(hist) >= 20: + # Drift-like heuristic for sustained tail position with moderate rate + if result["percentile_rank"] >= 0.95 or result["percentile_rank"] <= 0.05: + result["category"] = "drift" + else: + result["category"] = "spike" + else: + result["category"] = "deviation" + + return result + + +def dedup_key(tag_path: str, category: str, bucket_minutes: int = 10) -> str: + """Create a deterministic dedup signature for event cooldown windows.""" + now = datetime.now(timezone.utc) + bucket = int(now.timestamp() // max(1, bucket_minutes * 60)) + raw = f"{tag_path}|{category}|{bucket}" + return hashlib.sha1(raw.encode("utf-8")).hexdigest() + diff --git a/scripts/fixtures/anomaly_replay_cases.json b/scripts/fixtures/anomaly_replay_cases.json new file mode 100644 index 0000000..544cd3f --- /dev/null +++ b/scripts/fixtures/anomaly_replay_cases.json @@ -0,0 +1,32 @@ +{ + "cases": [ + { + "id": "normal-baseline", + "current_value": 50.3, + "prev_value": 50.1, + "history_values": [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2, 50.1, 50.0, 49.9, 50.2, 50.1, 
50.0, 50.2, 49.8, 50.0, 50.1, 50.0, 49.9, 50.1, 50.2, 50.0, 50.1, 49.9, 50.0, 50.1, 50.0], + "expected_candidate": false + }, + { + "id": "sudden-spike", + "current_value": 91.0, + "prev_value": 49.8, + "history_values": [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2, 50.1, 50.0, 49.9, 50.2, 50.1, 50.0, 50.2, 49.8, 50.0, 50.1, 50.0, 49.9, 50.1, 50.2, 50.0, 50.1, 49.9, 50.0, 50.1, 50.0], + "expected_candidate": true + }, + { + "id": "slow-drift-tail", + "current_value": 61.5, + "prev_value": 61.0, + "history_values": [50.0, 50.2, 50.3, 50.5, 50.7, 50.9, 51.1, 51.4, 51.8, 52.1, 52.6, 53.0, 53.5, 54.0, 54.5, 55.1, 55.6, 56.0, 56.6, 57.0, 57.5, 58.0, 58.4, 58.9, 59.4, 59.9, 60.2, 60.6, 60.9, 61.2], + "expected_candidate": true + }, + { + "id": "flatline-stuck", + "current_value": 72.0, + "prev_value": 72.0, + "history_values": [72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0, 72.0], + "expected_candidate": true + } + ] +} diff --git a/scripts/graph_api.py b/scripts/graph_api.py index 8e36e7c..e3bff45 100644 --- a/scripts/graph_api.py +++ b/scripts/graph_api.py @@ -77,6 +77,8 @@ class GraphAPI: "processdeviation": "mes", "functionallocation": "mes", "vendor": "mes", + "agentrun": "anomaly", + "anomalyevent": "anomaly", } # Color palette for node types @@ -91,6 +93,7 @@ class GraphAPI: "flows": "#E91E63", "overview": "#607D8B", "mes": "#00897B", + "anomaly": "#F44336", "other": "#9E9E9E", } @@ -252,9 +255,11 @@ def get_neighbors( WHERE center.name = $node_id OR center.name ENDS WITH $node_id OR center.name CONTAINS $node_id + OR center.event_id = $node_id + OR center.run_id = $node_id RETURN elementId(center) as id, labels(center)[0] as type, - center.name as label, + coalesce(center.name, center.event_id, center.run_id, center.symptom, center.phrase, 'unknown') as label, properties(center) as props LIMIT 1 """ @@ -264,9 +269,11 @@ def 
get_neighbors( WHERE center.name = $node_id OR center.name ENDS WITH $node_id OR center.name CONTAINS $node_id + OR center.event_id = $node_id + OR center.run_id = $node_id RETURN elementId(center) as id, labels(center)[0] as type, - center.name as label, + coalesce(center.name, center.event_id, center.run_id, center.symptom, center.phrase, 'unknown') as label, properties(center) as props LIMIT 1 """ diff --git a/scripts/ignition_api_client.py b/scripts/ignition_api_client.py index d0d7e41..e8fbccf 100644 --- a/scripts/ignition_api_client.py +++ b/scripts/ignition_api_client.py @@ -22,7 +22,11 @@ from urllib.parse import urljoin, quote import requests -from dotenv import load_dotenv +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback for minimal envs + def load_dotenv(*_args, **_kwargs): + return False load_dotenv() @@ -243,60 +247,55 @@ def read_tags(self, paths: List[str]) -> List[TagValue]: @staticmethod def _local_iso_to_utc(dt_str: str) -> str: - """Convert a bare ISO datetime string (assumed local) to UTC. + """ + Convert a bare ISO datetime string (assumed local time) to UTC. - If the string already has a timezone indicator (Z, +, -) - or looks like epoch milliseconds, it is returned unchanged. + If the input already contains timezone info or appears to be epoch + milliseconds, it is returned unchanged. """ from datetime import datetime, timezone - s = str(dt_str).strip() + text = str(dt_str).strip() + if not text: + return text - # Epoch ms – pass through - if s.isdigit(): - return s + # Epoch millis (or seconds) should pass through unchanged. + if text.isdigit(): + return text - # Already has TZ info – pass through - if s.endswith("Z") or "+" in s[10:] or s[10:].count("-") > 0: - return s + # Already timezone-aware. 
+ if text.endswith("Z") or "+" in text[10:] or text[10:].count("-") > 0: + return text try: - naive = datetime.fromisoformat(s) - local_dt = naive.astimezone() # attach local TZ + naive = datetime.fromisoformat(text) + local_dt = naive.astimezone() utc_dt = local_dt.astimezone(timezone.utc) return utc_dt.strftime("%Y-%m-%dT%H:%M:%S") except (ValueError, TypeError): - return s + return text def query_tag_history( self, tag_paths: List[str], start_date: str, end_date: str, - return_size: int = 100, + return_size: int = 200, aggregation_mode: str = "Average", return_format: str = "Wide", interval_minutes: Optional[int] = None, include_bounding_values: bool = False, ) -> Optional[Any]: - """Query historical tag values via the WebDev queryTagHistory endpoint. - - Bare ISO datetime strings (no timezone suffix) are assumed to be in - the server's local timezone and are converted to UTC before sending - to the gateway (which interprets all times as UTC). - - Args: - tag_paths: Tag paths with provider prefix, e.g. ['[default]Folder/Tag']. - start_date: ISO datetime string (local) or epoch ms. - end_date: ISO datetime string (local) or epoch ms. - return_size: Max rows to return (default 100). - aggregation_mode: Average, MinMax, LastValue, Sum, Minimum, Maximum. - return_format: Wide or Tall. - interval_minutes: Aggregation interval in minutes. - include_bounding_values: Include values at boundaries. """ - normalised = [self._ensure_provider_prefix(p) for p in tag_paths] + Query historical tag values from the WebDev queryTagHistory endpoint. + + Dates may be passed as local ISO strings; they are converted to UTC + to match Ignition endpoint expectations. 
+ """ + if not tag_paths: + return {"error": "No tag paths provided", "tagPaths": []} + normalised = [self._ensure_provider_prefix(p) for p in tag_paths] utc_start = self._local_iso_to_utc(start_date) utc_end = self._local_iso_to_utc(end_date) @@ -304,19 +303,17 @@ def query_tag_history( "tagPaths": ",".join(normalised), "startDate": utc_start, "endDate": utc_end, - "returnSize": return_size, + "returnSize": int(return_size), "aggregationMode": aggregation_mode, "returnFormat": return_format, - "includeBoundingValues": str(include_bounding_values).lower(), + "includeBoundingValues": str(bool(include_bounding_values)).lower(), } if interval_minutes is not None: - params["intervalMinutes"] = interval_minutes + params["intervalMinutes"] = int(interval_minutes) data = self._get("system/webdev/Axilon/queryTagHistory", params=params) - if data is None: return {"error": "API request failed or not configured", "tagPaths": normalised} - return data # --------------------------------------------------------------------- # diff --git a/scripts/neo4j_ontology.py b/scripts/neo4j_ontology.py index 110719f..380e3cb 100644 --- a/scripts/neo4j_ontology.py +++ b/scripts/neo4j_ontology.py @@ -9,7 +9,11 @@ from typing import Dict, List, Optional, Any, Union from dataclasses import dataclass, field from contextlib import contextmanager -from dotenv import load_dotenv +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback for minimal envs + def load_dotenv(*_args, **_kwargs): + return False from neo4j import GraphDatabase, Driver, Session @@ -147,6 +151,8 @@ def create_indexes(self) -> None: "CREATE CONSTRAINT project_name IF NOT EXISTS FOR (p:Project) REQUIRE p.name IS UNIQUE", "CREATE CONSTRAINT script_name IF NOT EXISTS FOR (s:Script) REQUIRE s.name IS UNIQUE", "CREATE CONSTRAINT namedquery_name IF NOT EXISTS FOR (q:NamedQuery) REQUIRE q.name IS UNIQUE", + "CREATE CONSTRAINT agentrun_id IF NOT EXISTS FOR (r:AgentRun) REQUIRE r.run_id IS 
UNIQUE", + "CREATE CONSTRAINT anomalyevent_id IF NOT EXISTS FOR (e:AnomalyEvent) REQUIRE e.event_id IS UNIQUE", ] # Regular indexes @@ -186,6 +192,11 @@ def create_indexes(self) -> None: "CREATE INDEX hmitextlist_name IF NOT EXISTS FOR (htl:HMITextList) ON (htl.name)", "CREATE INDEX plctagtable_name IF NOT EXISTS FOR (pt:PLCTagTable) ON (pt.name)", "CREATE INDEX plctag_name IF NOT EXISTS FOR (ptg:PLCTag) ON (ptg.name)", + # Agent monitoring indexes + "CREATE INDEX anomalyevent_created IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.created_at)", + "CREATE INDEX anomalyevent_state IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.state)", + "CREATE INDEX anomalyevent_severity IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.severity)", + "CREATE INDEX anomalyevent_dedup_key IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.dedup_key)", ] for constraint in constraints: @@ -202,6 +213,95 @@ def create_indexes(self) -> None: if "already exists" not in str(e).lower(): print(f"[WARNING] Index error: {e}") + def init_agent_monitoring_schema(self) -> None: + """Ensure agent monitoring labels and indexes exist.""" + self.create_indexes() + + def list_anomaly_events( + self, + limit: int = 100, + state: Optional[str] = None, + severity: Optional[str] = None, + run_id: Optional[str] = None, + ) -> List[Dict[str, Any]]: + """List persisted anomaly events for UI feeds.""" + with self.session() as session: + clauses = [] + params: Dict[str, Any] = {"limit": max(1, min(limit, 500))} + if state: + clauses.append("e.state = $state") + params["state"] = state + if severity: + clauses.append("e.severity = $severity") + params["severity"] = severity + if run_id: + clauses.append("e.run_id = $run_id") + params["run_id"] = run_id + where = f"WHERE {' AND '.join(clauses)}" if clauses else "" + query = f""" + MATCH (e:AnomalyEvent) + {where} + OPTIONAL MATCH (e)-[:OBSERVED_ON]->(t:ScadaTag) + OPTIONAL MATCH (e)-[:AFFECTS]->(eq:Equipment) + RETURN e, collect(DISTINCT t.name) AS tags, collect(DISTINCT eq.name) AS equipment 
+ ORDER BY e.created_at DESC + LIMIT $limit + """ + result = session.run(query, **params) + events: List[Dict[str, Any]] = [] + for record in result: + node = record["e"] + props = dict(node) + props["tags"] = [x for x in record["tags"] if x] + props["equipment"] = [x for x in record["equipment"] if x] + events.append(props) + return events + + def get_anomaly_event(self, event_id: str) -> Optional[Dict[str, Any]]: + """Get one anomaly event with linked context labels.""" + with self.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent {event_id: $event_id}) + OPTIONAL MATCH (e)-[:OBSERVED_ON]->(t:ScadaTag) + OPTIONAL MATCH (e)-[:AFFECTS]->(eq:Equipment) + OPTIONAL MATCH (e)-[r:RELATED_TO]->(n) + RETURN e, + collect(DISTINCT t.name) AS tags, + collect(DISTINCT eq.name) AS equipment, + collect(DISTINCT {type: type(r), label: labels(n)[0], name: coalesce(n.name, n.symptom, n.phrase)}) AS related + LIMIT 1 + """, + event_id=event_id, + ) + record = result.single() + if not record: + return None + data = dict(record["e"]) + data["tags"] = [x for x in record["tags"] if x] + data["equipment"] = [x for x in record["equipment"] if x] + data["related"] = [ + x for x in record["related"] if x and x.get("name") + ] + return data + + def cleanup_anomaly_events(self, retention_days: int = 14) -> int: + """Delete old anomaly events outside retention window.""" + with self.session() as session: + result = session.run( + """ + MATCH (e:AnomalyEvent) + WHERE e.created_at IS NOT NULL + AND datetime(e.created_at) < datetime() - duration({days: $days}) + WITH collect(e) AS old_events + FOREACH (n IN old_events | DETACH DELETE n) + RETURN size(old_events) AS deleted + """, + days=max(1, retention_days), + ) + record = result.single() + return int(record["deleted"]) if record else 0 + def clear_all(self) -> None: """Clear all nodes and relationships. 
USE WITH CAUTION.""" with self.session() as session: @@ -4192,12 +4292,22 @@ def main(): "tia-projects", "tia-project-resources", "db-connections", + "init-agent-schema", + "list-anomaly-events", + "get-anomaly-event", + "cleanup-anomaly-events", ], help="Command to execute", ) parser.add_argument("--file", "-f", help="JSON file for import/export") parser.add_argument("--query", "-q", help="Query string for search") parser.add_argument("--project", "-p", help="Project name for project-resources") + parser.add_argument("--event-id", help="Event ID for get-anomaly-event") + parser.add_argument("--state", help="Filter anomaly events by state") + parser.add_argument("--severity", help="Filter anomaly events by severity") + parser.add_argument("--run-id", help="Filter anomaly events by run_id") + parser.add_argument("--limit", type=int, default=100, help="Limit results for list commands") + parser.add_argument("--retention-days", type=int, default=14, help="Retention window in days") parser.add_argument("--json", action="store_true", help="Output in JSON format") parser.add_argument( "--enrichment-status", @@ -4437,7 +4547,43 @@ def main(): f" {c['name']} ({c['database_type']}) " f"- {c['url']} [{enabled}]" ) + elif args.command == "init-agent-schema": + graph.init_agent_monitoring_schema() + print("[OK] Initialized agent monitoring schema") + + elif args.command == "list-anomaly-events": + events = graph.list_anomaly_events( + limit=args.limit, + state=args.state, + severity=args.severity, + run_id=args.run_id, + ) + if args.json: + print(json_module.dumps(events)) + else: + print(f"Anomaly events: {len(events)}") + for event in events: + print( + f"- {event.get('event_id')} {event.get('severity')} " + f"{event.get('summary', '')[:80]}" + ) + + elif args.command == "get-anomaly-event": + if not args.event_id: + print("[ERROR] --event-id required for get-anomaly-event") + return + event = graph.get_anomaly_event(args.event_id) + if args.json: + 
print(json_module.dumps(event or {})) + else: + if not event: + print(f"[ERROR] Event not found: {args.event_id}") + return + print(json_module.dumps(event, indent=2)) + elif args.command == "cleanup-anomaly-events": + deleted = graph.cleanup_anomaly_events(args.retention_days) + print(f"[OK] Deleted {deleted} anomaly events older than {args.retention_days} days") if __name__ == "__main__": main() From 1f1f6b4dc1f7005d8d144d21a10ff2dd77cce070 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 2 Mar 2026 18:06:40 +0000 Subject: [PATCH 02/18] Emit provider failures as anomaly feed events Co-authored-by: leor --- scripts/anomaly_monitor.py | 183 ++++++++++++++++++++++++++++++++----- 1 file changed, 161 insertions(+), 22 deletions(-) diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py index 70a0f4b..9049108 100644 --- a/scripts/anomaly_monitor.py +++ b/scripts/anomaly_monitor.py @@ -263,7 +263,7 @@ def _extract_history_values(self, history_data: Any, tag_path: str) -> List[floa values.append(val) return values - def fetch_history_values(self, tag_path: str) -> List[float]: + def fetch_history_values(self, tag_path: str) -> tuple[List[float], Optional[str]]: minutes = int(self.config.get("historyWindowMinutes", 360)) end_dt = datetime.now(timezone.utc) start_dt = end_dt - timedelta(minutes=minutes) @@ -275,7 +275,9 @@ def fetch_history_values(self, tag_path: str) -> List[float]: aggregation_mode="Average", return_format="Wide", ) - return self._extract_history_values(data, tag_path) + if isinstance(data, dict) and data.get("error"): + return [], str(data.get("error")) + return self._extract_history_values(data, tag_path), None def get_context(self, tag_path: str) -> Dict[str, Any]: with self.graph.session() as session: @@ -517,6 +519,97 @@ def persist_event( return event_data + def _emit_persisted_event(self, persisted: Dict[str, Any]) -> None: + """Emit normalized AGENT_EVENT payload for UI stream.""" + emit("AGENT_EVENT", { + "runId": self.run_id, 
+ "eventId": persisted["event_id"], + "severity": persisted["severity"], + "summary": persisted["summary"], + "category": persisted.get("category"), + "entityRefs": { + "tag": persisted.get("tag_name") or persisted.get("source_tag"), + "sourceTag": persisted.get("source_tag"), + }, + "createdAt": persisted.get("created_at"), + }) + + def emit_provider_failure_event( + self, + code: str, + message: str, + *, + severity: str = "high", + category: str = "quality-issue", + source_tag: Optional[str] = None, + details: Optional[Dict[str, Any]] = None, + ) -> bool: + """ + Persist and stream provider-health anomalies so failures appear in feed. + + Returns: + True if a new event was persisted (false if deduped). + """ + emit("AGENT_ERROR", { + "runId": self.run_id, + "code": code, + "message": message, + "recoverable": True, + "timestamp": utc_now_iso(), + }) + + tag = source_tag or f"provider://{code}" + detail_blob = json.dumps(details or {}, default=str) + context = { + "tag_path": tag, + "tag_name": source_tag or "ProviderHealth", + "equipment": [], + "symptoms": [], + "causes": [], + "patterns": [], + "safety": [], + } + deterministic = { + "candidate": True, + "reasons": [code], + "category": category, + "z_score": 0.0, + "mad_score": 0.0, + "delta_rate": 0.0, + "window_volatility": 0.0, + "history_points": 0, + } + triage = { + "summary": message, + "category": category, + "severity": severity, + "confidence": 0.9, + "probable_causes": [message], + "verification_checks": [ + "Check Ignition gateway connectivity and credentials.", + "Validate tag provider availability and endpoint health.", + ], + "safety_notes": [], + "rationale": f"Provider health event ({code}). 
Details: {detail_blob}", + "related_entities": [], + } + persisted = self.persist_event( + context=context, + deterministic=deterministic, + live_sample={ + "path": tag, + "value": "", + "quality": "Bad", + "timestamp": utc_now_iso(), + "data_type": "provider_health", + }, + triage=triage, + ) + if persisted: + self._emit_persisted_event(persisted) + return True + return False + # ----------------------------- # Monitoring loop # ----------------------------- @@ -527,13 +620,14 @@ def run_cycle(self) -> Dict[str, Any]: min_history = int(self.config.get("minHistoryPoints", 30)) if not self.api.is_configured: - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": "ignition_not_configured", - "message": "Ignition API URL/token not configured.", - "recoverable": True, - "timestamp": utc_now_iso(), - }) + emitted = self.emit_provider_failure_event( + "ignition_not_configured", + "Ignition API URL/token not configured.", + severity="critical", + category="state-conflict", + ) + if emitted: + metrics["emitted"] += 1 metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) return metrics @@ -553,17 +647,31 @@ def run_cycle(self) -> Dict[str, Any]: live_values = self.api.read_tags(tag_paths) candidates: List[Dict[str, Any]] = [] now = datetime.now(timezone.utc) + live_error_count = 0 + live_error_samples: List[str] = [] + history_error_count = 0 + history_error_samples: List[str] = [] + valid_live_count = 0 for tv in live_values: if tv.error: + live_error_count += 1 + if len(live_error_samples) < 5: + live_error_samples.append(f"{tv.path}: {tv.error}") continue + valid_live_count += 1 if not is_quality_good(tv.quality): # quality gate: only emit quality anomalies if this persists via triage. 
continue if is_stale(tv.timestamp, int(thresholds.get("stalenessSec", 120)), now=now): continue - history = self.fetch_history_values(tv.path) + history, history_error = self.fetch_history_values(tv.path) + if history_error: + history_error_count += 1 + if len(history_error_samples) < 5: + history_error_samples.append(f"{tv.path}: {history_error}") + continue if len(history) < min_history: continue @@ -594,6 +702,48 @@ def run_cycle(self) -> Dict[str, Any]: } ) + if live_values and live_error_count == len(live_values): + emitted = self.emit_provider_failure_event( + "live_tag_provider_failed", + f"Live tag provider failed for all reads ({live_error_count}/{len(live_values)}).", + severity="high", + category="quality-issue", + details={"samples": live_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + elif live_error_count > 0: + emitted = self.emit_provider_failure_event( + "live_tag_provider_partial_failure", + f"Live tag provider partially failed ({live_error_count}/{len(live_values)} reads).", + severity="medium", + category="quality-issue", + details={"samples": live_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + + if valid_live_count > 0 and history_error_count >= max(1, int(valid_live_count * 0.8)): + emitted = self.emit_provider_failure_event( + "history_provider_failed", + f"History provider failed for most queries ({history_error_count}/{valid_live_count}).", + severity="high", + category="quality-issue", + details={"samples": history_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + elif history_error_count > 0: + emitted = self.emit_provider_failure_event( + "history_provider_partial_failure", + f"History provider partially failed ({history_error_count}/{valid_live_count}).", + severity="medium", + category="quality-issue", + details={"samples": history_error_samples}, + ) + if emitted: + metrics["emitted"] += 1 + metrics["candidates"] = len(candidates) max_candidates = int(self.config.get("maxCandidatesPerCycle", 
25)) max_triage = int(self.config.get("maxLlmTriagesPerCycle", 5)) @@ -629,18 +779,7 @@ def run_cycle(self) -> Dict[str, Any]: ) if persisted: metrics["emitted"] += 1 - emit("AGENT_EVENT", { - "runId": self.run_id, - "eventId": persisted["event_id"], - "severity": persisted["severity"], - "summary": persisted["summary"], - "category": persisted.get("category"), - "entityRefs": { - "tag": persisted.get("tag_name") or persisted.get("source_tag"), - "sourceTag": persisted.get("source_tag"), - }, - "createdAt": persisted.get("created_at"), - }) + self._emit_persisted_event(persisted) metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) return metrics From 1a17e651432ab28c7109ef46bc5fc05987e4be53 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 2 Mar 2026 18:19:17 +0000 Subject: [PATCH 03/18] Migrate tests to pytest with ingest coverage Co-authored-by: leor --- .gitignore | 3 +- pytest.ini | 4 + requirements-dev.txt | 1 + tests/README.md | 46 +++++ tests/conftest.py | 35 ++++ .../integration/simulated_ignition_server.py | 170 ++++++++++++++++++ .../integration/test_live_value_sim_server.py | 75 ++++++++ tests/unit/test_anomaly_rules.py | 64 +++++++ tests/unit/test_ingest_siemens_parser.py | 72 ++++++++ tests/unit/test_ingest_workbench_parser.py | 119 ++++++++++++ 10 files changed, 587 insertions(+), 2 deletions(-) create mode 100644 pytest.ini create mode 100644 requirements-dev.txt create mode 100644 tests/README.md create mode 100644 tests/conftest.py create mode 100644 tests/integration/simulated_ignition_server.py create mode 100644 tests/integration/test_live_value_sim_server.py create mode 100644 tests/unit/test_anomaly_rules.py create mode 100644 tests/unit/test_ingest_siemens_parser.py create mode 100644 tests/unit/test_ingest_workbench_parser.py diff --git a/.gitignore b/.gitignore index 085a6d7..28f5878 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,7 @@ venv/ ENV/ .venv -# Test files and outputs -tests/ +# Test outputs *_updated*.xml 
*_applied*.xml *_diffs/ diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..3b2c446 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +python_files = test_*.py +addopts = -q diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..350a8d4 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,46 @@ +# Test Flow: Agents Monitoring + Ingest + +This repository now includes a lightweight test scaffold using `pytest`. + +## Layout + +- `tests/unit/` + - `test_anomaly_rules.py` + Unit tests for deterministic anomaly scoring and quality/staleness gates. + - `test_ingest_workbench_parser.py` + Unit tests for workbench ingest parsing. + - `test_ingest_siemens_parser.py` + Unit tests for Siemens `.st` ingest parsing. + +- `tests/integration/` + - `simulated_ignition_server.py` + Local simulated live/history webserver implementing: + - `/system/webdev/Axilon/getTags` + - `/system/webdev/Axilon/queryTagHistory` + - `test_live_value_sim_server.py` + Integration tests for `IgnitionApiClient` + anomaly scoring with simulated live values. + +## Run all tests + +```bash +python3 -m pytest +``` + +## Run only unit tests + +```bash +python3 -m pytest tests/unit +``` + +## Run only integration tests + +```bash +python3 -m pytest tests/integration +``` + +## Notes + +- Integration tests are fully local and do **not** require a real Ignition gateway. +- LLM services are not required for these tests. +- Neo4j is not required for this test suite. 
+ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5b51088 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPTS_DIR = REPO_ROOT / "scripts" +INTEGRATION_DIR = REPO_ROOT / "tests" / "integration" + +for path in (SCRIPTS_DIR, INTEGRATION_DIR): + path_str = str(path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + + +@pytest.fixture +def sim_ignition(): + from simulated_ignition_server import ( + start_simulated_ignition_server, + stop_simulated_ignition_server, + ) + + server, thread, state, base_url = start_simulated_ignition_server() + try: + yield { + "server": server, + "thread": thread, + "state": state, + "base_url": base_url, + } + finally: + stop_simulated_ignition_server(server, thread) diff --git a/tests/integration/simulated_ignition_server.py b/tests/integration/simulated_ignition_server.py new file mode 100644 index 0000000..607f316 --- /dev/null +++ b/tests/integration/simulated_ignition_server.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Simulated Ignition WebDev endpoints for local integration tests. 
+""" + +from __future__ import annotations + +import json +import threading +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from http.server import BaseHTTPRequestHandler, HTTPServer +from typing import Dict, List, Tuple +from urllib.parse import parse_qs, urlparse + + +def _utc_iso(offset_minutes: int = 0) -> str: + return (datetime.now(timezone.utc) + timedelta(minutes=offset_minutes)).isoformat() + + +@dataclass +class SimulatedIgnitionState: + fail_live_reads: bool = False + fail_history_reads: bool = False + live_tags: Dict[str, Dict] = field(default_factory=dict) + tag_history: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict) + + def __post_init__(self) -> None: + if not self.live_tags: + self.live_tags = { + "[default]Line/Throughput": { + "value": 95.0, + "quality": "Good", + "timestamp": _utc_iso(), + "dataType": "Float8", + }, + "[default]Line/Temperature": { + "value": 42.0, + "quality": "Good", + "timestamp": _utc_iso(), + "dataType": "Float8", + }, + } + if not self.tag_history: + base = [49.9, 50.1, 50.0, 50.2, 50.1, 49.8, 50.3, 50.0, 49.9, 50.2] + self.tag_history = { + "[default]Line/Throughput": [ + (_utc_iso(offset_minutes=-(len(base) - i)), value) + for i, value in enumerate(base) + ], + "[default]Line/Temperature": [ + (_utc_iso(offset_minutes=-(len(base) - i)), 41.5 + (i * 0.1)) + for i in range(len(base)) + ], + } + + +class _IgnitionHandler(BaseHTTPRequestHandler): + state: SimulatedIgnitionState + + def _send_json(self, payload, status: int = 200) -> None: + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_GET(self): # noqa: N802 - BaseHTTPRequestHandler naming + parsed = urlparse(self.path) + path = parsed.path + query = parse_qs(parsed.query) + + if path == "/system/webdev/Axilon/getTags": + 
if self.state.fail_live_reads: + self._send_json({"error": "simulated live provider failure"}, status=503) + return + + raw = query.get("tagPaths", [""])[0] + tag_paths = [p.strip() for p in raw.split(",") if p.strip()] + tags = [] + for tag_path in tag_paths: + data = self.state.live_tags.get(tag_path) + if not data: + tags.append( + { + "tagPath": tag_path, + "value": None, + "quality": "Bad", + "isGood": False, + "timestamp": _utc_iso(), + "dataType": "Unknown", + } + ) + continue + tags.append( + { + "tagPath": tag_path, + "value": data.get("value"), + "quality": data.get("quality", "Good"), + "isGood": str(data.get("quality", "Good")).lower() == "good", + "timestamp": data.get("timestamp", _utc_iso()), + "dataType": data.get("dataType", "Unknown"), + } + ) + self._send_json({"success": True, "count": len(tags), "tags": tags}) + return + + if path == "/system/webdev/Axilon/queryTagHistory": + if self.state.fail_history_reads: + self._send_json({"error": "simulated history provider failure"}, status=503) + return + + raw = query.get("tagPaths", [""])[0] + tag_paths = [p.strip() for p in raw.split(",") if p.strip()] + rows = [] + + primary_path = tag_paths[0] if tag_paths else "[default]Line/Throughput" + primary_hist = self.state.tag_history.get(primary_path, []) + for ts, _ in primary_hist: + row = {"timestamp": ts} + for tag_path in tag_paths: + values = self.state.tag_history.get(tag_path, []) + match_val = None + for hist_ts, hist_val in values: + if hist_ts == ts: + match_val = hist_val + break + if match_val is None and values: + match_val = values[-1][1] + row[tag_path] = match_val + rows.append(row) + + self._send_json( + { + "success": True, + "rows": rows, + "tagPaths": tag_paths, + "returnFormat": "Wide", + } + ) + return + + self._send_json({"error": f"unsupported endpoint: {path}"}, status=404) + + def log_message(self, format, *args): # noqa: A003 - stdlib signature + # Silence default HTTP request logs during tests. 
+ return + + +def start_simulated_ignition_server() -> tuple[HTTPServer, threading.Thread, SimulatedIgnitionState, str]: + state = SimulatedIgnitionState() + handler_cls = type( + "IgnitionHandlerWithState", + (_IgnitionHandler,), + {"state": state}, + ) + server = HTTPServer(("127.0.0.1", 0), handler_cls) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + host, port = server.server_address + base_url = f"http://{host}:{port}" + return server, thread, state, base_url + + +def stop_simulated_ignition_server(server: HTTPServer, thread: threading.Thread) -> None: + server.shutdown() + server.server_close() + thread.join(timeout=3) + diff --git a/tests/integration/test_live_value_sim_server.py b/tests/integration/test_live_value_sim_server.py new file mode 100644 index 0000000..d6feeea --- /dev/null +++ b/tests/integration/test_live_value_sim_server.py @@ -0,0 +1,75 @@ +from datetime import datetime, timedelta, timezone + +from anomaly_rules import compute_deviation_scores +from ignition_api_client import IgnitionApiClient + +def test_read_tags_history_and_detect_spike(sim_ignition): + state = sim_ignition["state"] + state.fail_live_reads = False + state.fail_history_reads = False + + client = IgnitionApiClient(base_url=sim_ignition["base_url"], api_token="token") + try: + tag_path = "[default]Line/Throughput" + tv = client.read_tag(tag_path) + assert tv.error is None + assert tv.quality == "Good" + assert float(tv.value) == 95.0 + + start = (datetime.now(timezone.utc) - timedelta(hours=1)).replace(microsecond=0).isoformat() + end = datetime.now(timezone.utc).replace(microsecond=0).isoformat() + history = client.query_tag_history([tag_path], start, end, return_size=100) + assert isinstance(history, dict) + assert "rows" in history + + history_values = [ + row[tag_path] + for row in history["rows"] + if isinstance(row, dict) and tag_path in row and row[tag_path] is not None + ] + assert len(history_values) > 5 + + score = 
compute_deviation_scores( + current_value=tv.value, + history_values=history_values, + prev_value=55.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + assert score["candidate"] + assert score["category"] in {"spike", "deviation", "drift"} + finally: + client.close() + + +def test_live_provider_failure_surfaces_as_read_error(sim_ignition): + state = sim_ignition["state"] + state.fail_live_reads = True + + client = IgnitionApiClient(base_url=sim_ignition["base_url"], api_token="token") + try: + tv = client.read_tag("[default]Line/Throughput") + assert tv.error is not None + assert "failed" in tv.error.lower() + finally: + client.close() + + +def test_history_provider_failure_surfaces_error_payload(sim_ignition): + state = sim_ignition["state"] + state.fail_history_reads = True + + client = IgnitionApiClient(base_url=sim_ignition["base_url"], api_token="token") + try: + start = (datetime.now(timezone.utc) - timedelta(hours=1)).replace(microsecond=0).isoformat() + end = datetime.now(timezone.utc).replace(microsecond=0).isoformat() + history = client.query_tag_history( + ["[default]Line/Throughput"], + start, + end, + return_size=100, + ) + assert isinstance(history, dict) + assert "error" in history + finally: + client.close() + diff --git a/tests/unit/test_anomaly_rules.py b/tests/unit/test_anomaly_rules.py new file mode 100644 index 0000000..e5f2af1 --- /dev/null +++ b/tests/unit/test_anomaly_rules.py @@ -0,0 +1,64 @@ +from datetime import datetime, timedelta, timezone + +import pytest + +from anomaly_rules import compute_deviation_scores, is_quality_good, is_stale + + +def test_detects_sharp_rise_and_sharp_drop(): + baseline = [50.0, 49.9, 50.1, 50.2, 49.8, 50.0, 50.1, 49.9, 50.0, 50.2] * 3 + + rise = compute_deviation_scores( + current_value=95.0, + history_values=baseline, + prev_value=52.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + drop = compute_deviation_scores( + current_value=12.0, + history_values=baseline, + prev_value=49.0, + 
thresholds={"z": 3.0, "mad": 3.5, "rate": 10.0}, + ) + + assert rise["candidate"] + assert drop["candidate"] + + +def test_detects_flatline_stuck_pattern(): + flat = [72.0] * 30 + result = compute_deviation_scores( + current_value=72.0, + history_values=flat, + prev_value=72.0, + thresholds={"z": 3.0, "mad": 3.5, "rate": 1.0, "stuck_window_size": 20}, + ) + assert result["candidate"] + assert "flatline_detected" in result["reasons"] + assert result["category"] == "stuck" + + +@pytest.mark.parametrize( + "quality,expected", + [("Good", True), ("OK", True), ("Bad", False), (None, False)], +) +def test_quality_helper(quality, expected): + assert is_quality_good(quality) is expected + + +def test_staleness_helper(): + recent_ts = datetime.now(timezone.utc).isoformat() + old_ts = (datetime.now(timezone.utc) - timedelta(minutes=15)).isoformat() + assert not is_stale(recent_ts, staleness_sec=300) + assert is_stale(old_ts, staleness_sec=300) + + +def test_non_numeric_current_value_is_rejected(): + result = compute_deviation_scores( + current_value="not-a-number", + history_values=[1, 2, 3, 4, 5], + prev_value=3, + ) + assert not result["candidate"] + assert result["category"] == "invalid_value" + diff --git a/tests/unit/test_ingest_siemens_parser.py b/tests/unit/test_ingest_siemens_parser.py new file mode 100644 index 0000000..935bf71 --- /dev/null +++ b/tests/unit/test_ingest_siemens_parser.py @@ -0,0 +1,72 @@ +from pathlib import Path + +from siemens_parser import SiemensSTParser + + +SAMPLE_ST = """ +NAMESPACE Plant.Process + +TYPE MotorData : STRUCT + Speed : REAL; +END_STRUCT +END_TYPE + +CLASS MotorFB +VAR_INPUT + StartCmd : BOOL; // start command +END_VAR +VAR_OUTPUT + Running : BOOL; +END_VAR +METHOD PUBLIC Execute : BOOL +VAR + tempVar : INT := 1; +END_VAR +Running := StartCmd; +END_METHOD +END_CLASS + +PROGRAM MainProgram +VAR + Counter : INT := 0; +END_VAR +Counter := Counter + 1; +END_PROGRAM + +CONFIGURATION Config1 +TASK MainTask(INTERVAL := T#100MS, PRIORITY 
:= 1); +PROGRAM PLC_PRG WITH MainTask: MainProgram; +END_CONFIGURATION + +END_NAMESPACE +""" + + +def test_parse_structured_text_blocks(tmp_path): + st_path = Path(tmp_path) / "sample.st" + st_path.write_text(SAMPLE_ST, encoding="utf-8") + + parser = SiemensSTParser() + blocks = parser.parse_file(str(st_path)) + assert len(blocks) >= 4 + + by_name = {b.name: b for b in blocks} + assert "MotorData" in by_name + assert by_name["MotorData"].type == "UDT" + assert by_name["MotorData"].local_tags[0].name == "Speed" + + assert "MotorFB" in by_name + fb = by_name["MotorFB"] + assert fb.type == "FB" + assert any(t.name == "StartCmd" and t.direction == "INPUT" for t in fb.input_tags) + assert any(t.name == "Running" and t.direction == "OUTPUT" for t in fb.output_tags) + assert any(r["name"] == "Execute" for r in fb.routines) + + assert "MainProgram" in by_name + assert by_name["MainProgram"].type == "PROGRAM" + assert "Counter := Counter + 1" in by_name["MainProgram"].raw_implementation + + assert "Config1" in by_name + assert by_name["Config1"].type == "CONFIGURATION" + assert "MainTask" in by_name["Config1"].description + diff --git a/tests/unit/test_ingest_workbench_parser.py b/tests/unit/test_ingest_workbench_parser.py new file mode 100644 index 0000000..7609490 --- /dev/null +++ b/tests/unit/test_ingest_workbench_parser.py @@ -0,0 +1,119 @@ +import json +from pathlib import Path + +from workbench_parser import WorkbenchParser + + +def test_parse_workbench_project_json_with_inline_resources(tmp_path): + root = Path(tmp_path) + + # Script file expected by WorkbenchParser._read_script_file + script_file = root / "scripts" / "PlantA" / "utility" / "tags" / "code.py" + script_file.parent.mkdir(parents=True, exist_ok=True) + script_file.write_text("def read_tag():\n return 42\n", encoding="utf-8") + + data = { + "__typeName": "WorkbenchState", + "version": "1.2.3", + "root": { + "windows": [ + { + "projectName": "PlantA", + "title": "MainView", + "path": "main/view", + 
"windowType": "perspective", + "rootContainer": { + "meta": {"name": "Root"}, + "type": "ia.container", + "propConfig": { + "props.value": { + "binding": { + "type": "tag", + "config": { + "tagPath": "[default]Line/Speed", + "bidirectional": True, + }, + } + } + }, + "children": [], + }, + } + ], + "namedQueries": [ + { + "projectName": "PlantA", + "queryName": "GetBatches", + "folderPath": "Prod\\Ops", + "query": "SELECT * FROM batches", + } + ], + "scripts": [ + { + "projectName": "PlantA", + "path": ["utility", "tags"], + "scope": "A", + } + ], + "tags": [ + { + "name": "LineSpeed", + "type": "Opc", + "dataType": "Float8", + "opcItemPath": "[default]Line/Speed", + }, + { + "name": "BatchCount", + "type": "Memory", + "dataType": "Int4", + "value": 7, + }, + ], + "udtDefinitions": [ + { + "name": "MotorUDT", + "id": "MotorUDT", + "parameters": { + "area": {"dataType": "String", "value": "A1"} + }, + "members": [ + { + "name": "Run", + "type": "opc", + "dataType": "Boolean", + "opcItemPath": "[default]Motor/Run", + "serverName": {"binding": "default"}, + } + ], + } + ], + }, + } + + project_json = root / "project.json" + project_json.write_text(json.dumps(data), encoding="utf-8") + + parser = WorkbenchParser() + backup = parser.parse_file(str(project_json)) + + assert "PlantA" in backup.projects + assert len(backup.windows) == 1 + assert backup.windows[0].name == "MainView" + assert backup.windows[0].components[0].bindings[0].target == "[default]Line/Speed" + + assert len(backup.named_queries) == 1 + assert backup.named_queries[0].id == "Prod/Ops/GetBatches" + assert "SELECT" in backup.named_queries[0].query_text + + assert len(backup.scripts) == 1 + assert "return 42" in backup.scripts[0].script_text + + tag_types = {t.name: t.tag_type for t in backup.tags} + assert tag_types["LineSpeed"] == "opc" + assert tag_types["BatchCount"] == "memory" + + assert len(backup.udt_definitions) == 1 + udt = backup.udt_definitions[0] + assert "area" in udt.parameters + assert 
udt.members[0].server_name == "default" + From e9ca37d4bbd907fd258af0ebca7e3122dcaa1d1c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 2 Mar 2026 22:16:19 +0000 Subject: [PATCH 04/18] Improve anomaly visibility and clear acknowledged events Co-authored-by: leor --- electron-ui/index.html | 1 + electron-ui/main.js | 11 ++++ electron-ui/preload.js | 1 + electron-ui/renderer.js | 27 ++++++++-- scripts/anomaly_monitor.py | 101 ++++++++++++++++++++++++++++++++++++- 5 files changed, 136 insertions(+), 5 deletions(-) diff --git a/electron-ui/index.html b/electron-ui/index.html index 7e5e8a7..99ba9a1 100644 --- a/electron-ui/index.html +++ b/electron-ui/index.html @@ -593,6 +593,7 @@

Anomaly Feed

+ + diff --git a/electron-ui/main.js b/electron-ui/main.js index 43eb3fb..5b6a081 100644 --- a/electron-ui/main.js +++ b/electron-ui/main.js @@ -209,7 +209,7 @@ function normalizeAgentConfig(config = {}) { const thresholds = (config && typeof config.thresholds === 'object' && config.thresholds) || {}; const scope = (config && typeof config.scope === 'object' && config.scope) || {}; return { - pollIntervalMs: Math.max(5000, Number(config.pollIntervalMs || 15000)), + pollIntervalMs: Math.max(1000, Number(config.pollIntervalMs || 1000)), historyWindowMinutes: Math.max(10, Number(config.historyWindowMinutes || 360)), minHistoryPoints: Math.max(10, Number(config.minHistoryPoints || 30)), maxMonitoredTags: Math.max(10, Number(config.maxMonitoredTags || 200)), diff --git a/electron-ui/renderer.js b/electron-ui/renderer.js index 9b5da03..8479580 100644 --- a/electron-ui/renderer.js +++ b/electron-ui/renderer.js @@ -3580,7 +3580,7 @@ function getAgentsElements() { function getAgentsConfigFromUI() { const el = getAgentsElements(); return { - pollIntervalMs: Number(el.cfgPoll?.value || 15000), + pollIntervalMs: Number(el.cfgPoll?.value || 1000), historyWindowMinutes: Number(el.cfgHist?.value || 360), minHistoryPoints: Number(el.cfgPoints?.value || 30), maxCandidatesPerSubsystem: 8, diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py index 2fa7f73..ff033dc 100644 --- a/scripts/anomaly_monitor.py +++ b/scripts/anomaly_monitor.py @@ -238,7 +238,7 @@ def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: raw = dict(config or {}) thresholds = raw.get("thresholds", {}) if isinstance(raw.get("thresholds"), dict) else {} defaults = { - "pollIntervalMs": 15000, + "pollIntervalMs": 1000, "historyWindowMinutes": 360, "minHistoryPoints": 30, "maxMonitoredTags": 200, @@ -1607,7 +1607,7 @@ def run_forever(self) -> int: "timestamp": utc_now_iso(), }) - poll_ms = int(self.config.get("pollIntervalMs", 15000)) + poll_ms = int(self.config.get("pollIntervalMs", 
1000)) cleanup_every = max(1, int(self.config.get("cleanupEveryCycles", 40))) exit_code = 0 reason = "stopped" From 6dcfd15f81912b3ce743e6e5eab1507c2f6f093e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 2 Mar 2026 23:18:37 +0000 Subject: [PATCH 15/18] Emit in-cycle agent status progress updates Co-authored-by: leor --- scripts/anomaly_monitor.py | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py index ff033dc..54065c8 100644 --- a/scripts/anomaly_monitor.py +++ b/scripts/anomaly_monitor.py @@ -1024,6 +1024,8 @@ def run_cycle(self) -> Dict[str, Any]: cycle_start = time.time() thresholds = self.config.get("thresholds", {}) stale_threshold_sec = int(thresholds.get("stalenessSec", 120)) + progress_emit_interval_tags = max(5, int(self.config.get("progressEveryTags", 10))) + progress_emit_interval_sec = max(1, int(self.config.get("progressEverySec", 2))) metrics = { "candidates": 0, "triaged": 0, @@ -1148,6 +1150,42 @@ def run_cycle(self) -> Dict[str, Any]: near_shift_unlinked = 0 stale_samples: List[Dict[str, Any]] = [] subsystem_shift_signals: Dict[str, Dict[str, Any]] = {} + processed_live_count = 0 + total_live_count = len(live_values) + last_progress_emit = 0.0 + + def emit_cycle_progress(reason: str, current_tag: str = "") -> None: + nonlocal last_progress_emit + diag = make_default_diagnostics( + staleness_threshold_sec=stale_threshold_sec, + phase="cycle_in_progress", + reason=reason, + ) + diag.update({ + "processedLiveCount": processed_live_count, + "totalLiveCount": total_live_count, + "currentTag": current_tag, + "candidatesSoFar": len(candidates), + "liveErrorCount": live_error_count, + "qualityFilteredCount": quality_filtered_count, + "staleFilteredCount": stale_filtered_count, + "historyErrorCount": history_error_count, + "linkedTags": linked_tag_count, + "unlinkedTags": unlinked_tag_count, + }) + emit("AGENT_STATUS", { + "runId": self.run_id, + "state": 
"running", + "cycleMs": int((time.time() - cycle_start) * 1000), + "candidates": len(candidates), + "triaged": 0, + "emitted": metrics.get("emitted", 0), + "diagnostics": diag, + "timestamp": utc_now_iso(), + }) + last_progress_emit = time.time() + + emit_cycle_progress("cycle_started") def _update_subsystem_signal( subsystem_ref: Dict[str, str], deterministic: Dict[str, Any], tag_path: str @@ -1180,10 +1218,19 @@ def _update_subsystem_signal( bucket["sampleTag"] = tag_path for tv in live_values: + processed_live_count += 1 tag_meta = tag_lookup.get(tv.path, {"path": tv.path, "name": tv.path}) subsystem = tag_meta.get("primary_subsystem") or _subsystem_ref("global", "all") is_linked = bool(tag_meta.get("views") or tag_meta.get("equipment")) + now_progress = time.time() + if ( + processed_live_count == 1 + or processed_live_count % progress_emit_interval_tags == 0 + or (now_progress - last_progress_emit) >= progress_emit_interval_sec + ): + emit_cycle_progress("processing_live_tags", current_tag=tv.path) + if tv.error: live_error_count += 1 if is_linked: From 5157857bb4ff28e2d4b42f16027b5c7de52f3115 Mon Sep 17 00:00:00 2001 From: Leor Barak Fishman Date: Mon, 2 Mar 2026 17:27:50 -0800 Subject: [PATCH 16/18] more agentics fixing --- electron-ui/index.html | 20 +- electron-ui/main.js | 13 ++ electron-ui/preload.js | 1 + electron-ui/renderer.js | 353 +++++++++++++++++++++++++++++++--- electron-ui/styles.css | 385 ++++++++++++++++++++++++++++++++++++- scripts/anomaly_monitor.py | 316 ++++++++++++++++++++++++++---- tests/quick_import_test.py | 76 ++++++++ 7 files changed, 1100 insertions(+), 64 deletions(-) create mode 100644 tests/quick_import_test.py diff --git a/electron-ui/index.html b/electron-ui/index.html index a4a7a9b..08adc5d 100644 --- a/electron-ui/index.html +++ b/electron-ui/index.html @@ -566,8 +566,11 @@

Long-Running Agents

- - + + + @@ -584,6 +587,18 @@

Long-Running Agents

Last heartbeatn/a
+
+
+

Subsystem Health

+
+ +
+
+
+
Start monitoring to see subsystem health.
+
+
+
@@ -1537,8 +1504,6 @@

Graph: Node

- - diff --git a/electron-ui/main.js b/electron-ui/main.js index f3034f1..ae5c557 100644 --- a/electron-ui/main.js +++ b/electron-ui/main.js @@ -215,8 +215,8 @@ function normalizeAgentConfig(config = {}) { maxMonitoredTags: Math.max(10, Number(config.maxMonitoredTags || 200)), maxCandidatesPerCycle: Math.max(1, Number(config.maxCandidatesPerCycle || 25)), maxCandidatesPerSubsystem: Math.max(1, Number(config.maxCandidatesPerSubsystem || 8)), - maxLlmTriagesPerCycle: Math.max(0, Number(config.maxLlmTriagesPerCycle || 5)), - maxLlmTriagesPerSubsystem: Math.max(0, Number(config.maxLlmTriagesPerSubsystem || 2)), + maxLlmTriagesPerCycle: Math.max(0, Number(config.maxLlmTriagesPerCycle ?? 5)), + maxLlmTriagesPerSubsystem: Math.max(0, Number(config.maxLlmTriagesPerSubsystem ?? 2)), dedupCooldownMinutes: Math.max(1, Number(config.dedupCooldownMinutes || 10)), retentionDays: Math.max(1, Number(config.retentionDays || 14)), cleanupEveryCycles: Math.max(1, Number(config.cleanupEveryCycles || 40)), @@ -1749,17 +1749,18 @@ ipcMain.handle('agents:clear-event', async (event, eventId, note = '') => { } }); -ipcMain.handle('agents:deep-analyze', async (event, eventId) => { - try { - const output = await runPythonScript('anomaly_monitor.py', [ - 'deep-analyze', - '--event-id', - String(eventId), - ]); - return JSON.parse(output || '{}'); - } catch (error) { - return { success: false, error: error.message }; +ipcMain.handle('agents:deep-analyze', async (event, eventId, eventData) => { + if (!activeAgentRun || !activeAgentRun.process || activeAgentRun.process.killed) { + return { success: false, error: 'No active agent run — deep analyze requires a running agent' }; + } + if (!eventData || !eventData.event_id) { + return { success: false, error: 'Missing event data' }; + } + const sent = sendAgentCommand({ cmd: 'deep-analyze', event: eventData }); + if (!sent) { + return { success: false, error: 'Failed to send command to agent process' }; } + return { success: true, pending: true, 
eventId: eventData.event_id }; }); ipcMain.handle('agents:cleanup', async (event, retentionDays = 14) => { @@ -1773,4 +1774,24 @@ ipcMain.handle('agents:cleanup', async (event, retentionDays = 14) => { } catch (error) { return { success: false, error: error.message }; } +}); + +function sendAgentCommand(cmd) { + if (activeAgentRun && activeAgentRun.process && activeAgentRun.process.stdin && activeAgentRun.process.stdin.writable) { + activeAgentRun.process.stdin.write(JSON.stringify(cmd) + '\n'); + return true; + } + return false; +} + +ipcMain.handle('agents:start-subsystem', async (event, subsystemId) => { + if (!activeAgentRun) return { success: false, error: 'No active agent run' }; + const sent = sendAgentCommand({ cmd: 'start-agent', subsystemId }); + return { success: sent, subsystemId }; +}); + +ipcMain.handle('agents:stop-subsystem', async (event, subsystemId) => { + if (!activeAgentRun) return { success: false, error: 'No active agent run' }; + const sent = sendAgentCommand({ cmd: 'stop-agent', subsystemId }); + return { success: sent, subsystemId }; }); \ No newline at end of file diff --git a/electron-ui/preload.js b/electron-ui/preload.js index cf1d75c..7615063 100644 --- a/electron-ui/preload.js +++ b/electron-ui/preload.js @@ -79,8 +79,10 @@ contextBridge.exposeInMainWorld('api', { agentsGetEvent: (eventId) => ipcRenderer.invoke('agents:get-event', eventId), agentsAckEvent: (eventId, note) => ipcRenderer.invoke('agents:ack-event', eventId, note), agentsClearEvent: (eventId, note) => ipcRenderer.invoke('agents:clear-event', eventId, note), - agentsDeepAnalyze: (eventId) => ipcRenderer.invoke('agents:deep-analyze', eventId), + agentsDeepAnalyze: (eventId, eventData) => ipcRenderer.invoke('agents:deep-analyze', eventId, eventData), agentsCleanup: (retentionDays) => ipcRenderer.invoke('agents:cleanup', retentionDays), + agentsStartSubsystem: (subId) => ipcRenderer.invoke('agents:start-subsystem', subId), + agentsStopSubsystem: (subId) => 
ipcRenderer.invoke('agents:stop-subsystem', subId), // Database connections getDbConnections: () => ipcRenderer.invoke('get-db-connections'), diff --git a/electron-ui/renderer.js b/electron-ui/renderer.js index bba8767..93bf9b4 100644 --- a/electron-ui/renderer.js +++ b/electron-ui/renderer.js @@ -3536,7 +3536,7 @@ btnSaveDbCreds?.addEventListener('click', async () => { btnSaveDbCreds.disabled = false; } }); -// Agents Tab - Long-running monitoring +// Agents Tab — Per-subsystem agent monitoring // ============================================ const HEALTH_TREND_MAX_CYCLES = 20; @@ -3550,6 +3550,8 @@ const agentsState = { listenersReady: false, subsystemHealth: {}, subsystemHistory: {}, + agentStates: {}, + pendingDeepAnalyze: new Set(), }; function getAgentsElements() { @@ -3558,21 +3560,11 @@ function getAgentsElements() { btnStop: document.getElementById('btn-agents-stop'), btnRefresh: document.getElementById('btn-agents-refresh'), btnCleanup: document.getElementById('btn-agents-cleanup'), - btnDeepAnalyze: document.getElementById('btn-agents-deep-analyze'), - btnOpenGraph: document.getElementById('btn-agents-open-graph'), - btnAck: document.getElementById('btn-agents-ack'), statusChip: document.getElementById('agents-status-chip'), statusText: document.getElementById('agents-status-text'), - list: document.getElementById('agents-event-list'), - detail: document.getElementById('agents-event-detail'), filterState: document.getElementById('agents-filter-state'), filterSeverity: document.getElementById('agents-filter-severity'), filterSearch: document.getElementById('agents-filter-search'), - metricCycle: document.getElementById('agents-metric-cycle'), - metricCandidates: document.getElementById('agents-metric-candidates'), - metricTriaged: document.getElementById('agents-metric-triaged'), - metricEmitted: document.getElementById('agents-metric-emitted'), - metricHeartbeat: document.getElementById('agents-metric-heartbeat'), cfgPoll: 
document.getElementById('agents-config-poll-ms'), cfgHist: document.getElementById('agents-config-history-min'), cfgPoints: document.getElementById('agents-config-min-points'), @@ -3610,16 +3602,14 @@ function formatAgentTime(ts) { if (!ts) return 'n/a'; const d = new Date(ts); if (Number.isNaN(d.getTime())) return String(ts); - return d.toLocaleString(); + return d.toLocaleTimeString(); } function computeHealthLevel(signal) { - const avgAbsZ = parseFloat(signal.avgAbsZ || 0); - const candidateRatio = parseFloat(signal.candidateRatio || 0); const maxAbsZ = parseFloat(signal.maxAbsZ || 0); - if (candidateRatio >= 0.25 || maxAbsZ >= 5) return 'critical'; - if (candidateRatio >= 0.10 || avgAbsZ >= 2.5) return 'warning'; - if (signal.shiftRatio > 0.1 || avgAbsZ >= 1.5) return 'elevated'; + if (maxAbsZ >= 5) return 'critical'; + if (maxAbsZ >= 3) return 'warning'; + if (maxAbsZ >= 1.5) return 'elevated'; return 'healthy'; } @@ -3627,68 +3617,153 @@ function healthLevelToScore(level) { return { healthy: 0.1, elevated: 0.4, warning: 0.7, critical: 1.0 }[level] || 0.1; } -function updateSubsystemHealthFromDiagnostics(diagnostics) { - const tagMap = diagnostics?.subsystemTagMap; - if (tagMap && typeof tagMap === 'object') { - for (const [subId, info] of Object.entries(tagMap)) { - if (!agentsState.subsystemHealth[subId]) { - agentsState.subsystemHealth[subId] = { - subsystemId: subId, +function getSubsystemIdForEvent(event) { + return event.subsystem_id + || `${(event.subsystem_type || 'global')}:${(event.subsystem_name || 'all').toLowerCase()}`; +} + +function getFilteredEventsForSubsystem(subId) { + const el = getAgentsElements(); + const stateFilter = (el.filterState?.value || '').toLowerCase(); + const sevFilter = (el.filterSeverity?.value || '').toLowerCase(); + const search = (el.filterSearch?.value || '').trim().toLowerCase(); + return agentsState.events.filter((event) => { + if (getSubsystemIdForEvent(event) !== subId) return false; + if (stateFilter && 
String(event.state || '').toLowerCase() !== stateFilter) return false; + if (sevFilter && String(event.severity || '').toLowerCase() !== sevFilter) return false; + if (search) { + const haystack = [event.summary, event.source_tag, event.tag_name] + .filter(Boolean).join(' ').toLowerCase(); + if (!haystack.includes(search)) return false; + } + return true; + }); +} + +function updateSubsystemHealthFromStatus(payload) { + const diagnostics = payload.diagnostics || {}; + const phase = diagnostics.phase || ''; + const subId = payload.subsystemId || diagnostics.subsystemId; + + if (diagnostics.subsystemTagMap && typeof diagnostics.subsystemTagMap === 'object') { + for (const [sid, info] of Object.entries(diagnostics.subsystemTagMap)) { + if (!agentsState.subsystemHealth[sid]) { + agentsState.subsystemHealth[sid] = { + subsystemId: sid, subsystemType: info.type || 'global', - subsystemName: info.name || subId, + subsystemName: info.name || sid, evaluated: (info.tags || []).length, - candidate: 0, - nearShift: 0, - maxAbsZ: 0, - avgAbsZ: 0, + candidate: 0, nearShift: 0, maxAbsZ: 0, avgAbsZ: 0, healthLevel: 'healthy', tagSignals: (info.tags || []).map((t) => ({ - path: t.path, - name: t.name || t.path, - z: 0, - mad: 0, - value: null, + path: t.path, name: t.name || t.path, z: 0, mad: 0, value: null, })), }; + agentsState.agentStates[sid] = { + state: 'running', cycleCount: 0, avgCycleMs: 0, totalCandidates: 0, totalTriaged: 0, + }; } } } - const signals = diagnostics?.subsystemShiftSignals; - if (Array.isArray(signals) && signals.length) { - for (const sig of signals) { - const subId = sig.subsystemId || sig.subsystemName || 'global:all'; - const healthLevel = computeHealthLevel(sig); - agentsState.subsystemHealth[subId] = { ...sig, healthLevel }; - - if (!agentsState.subsystemHistory[subId]) { - agentsState.subsystemHistory[subId] = []; - } - const history = agentsState.subsystemHistory[subId]; - history.push({ - healthLevel, - avgAbsZ: parseFloat(sig.avgAbsZ || 0), - 
candidateRatio: parseFloat(sig.candidateRatio || 0), - candidates: parseInt(sig.candidate || 0, 10), - evaluated: parseInt(sig.evaluated || 0, 10), - ts: Date.now(), - }); - if (history.length > HEALTH_TREND_MAX_CYCLES) { - history.splice(0, history.length - HEALTH_TREND_MAX_CYCLES); + if (subId && phase === 'cycle_complete') { + const signals = diagnostics.subsystemShiftSignals; + if (Array.isArray(signals) && signals.length) { + for (const sig of signals) { + const sid = sig.subsystemId || subId; + const healthLevel = computeHealthLevel(sig); + agentsState.subsystemHealth[sid] = { ...sig, healthLevel }; + if (!agentsState.subsystemHistory[sid]) agentsState.subsystemHistory[sid] = []; + const history = agentsState.subsystemHistory[sid]; + history.push({ + healthLevel, + avgAbsZ: parseFloat(sig.avgAbsZ || 0), + candidateRatio: parseFloat(sig.candidateRatio || 0), + candidates: parseInt(sig.candidate || 0, 10), + evaluated: parseInt(sig.evaluated || 0, 10), + ts: Date.now(), + }); + if (history.length > HEALTH_TREND_MAX_CYCLES) history.splice(0, history.length - HEALTH_TREND_MAX_CYCLES); } } + agentsState.agentStates[subId] = { + state: payload.state || 'running', + cycleCount: diagnostics.cycleCount || 0, + avgCycleMs: diagnostics.avgCycleMs || 0, + totalCandidates: diagnostics.totalCandidates || 0, + totalTriaged: diagnostics.totalTriaged || 0, + }; + + // Replace events for this subsystem with current live events + const liveEvents = payload.liveEvents || []; + agentsState.events = agentsState.events.filter((e) => e.subsystem_id !== subId); + for (const evt of liveEvents) { + agentsState.events.unshift(evt); + } + } + + if (subId && phase === 'cycle_progress') { + if (!agentsState.agentStates[subId]) { + agentsState.agentStates[subId] = { state: 'running', cycleCount: 0, avgCycleMs: 0, totalCandidates: 0, totalTriaged: 0 }; + } + const step = diagnostics.step || ''; + const stepLabels = { + reading_tags: 'Reading tags', + fetching_history: 'Fetching history', + 
scoring: 'Scoring', + triaging: 'Triaging', + waiting: 'Idle', + }; + agentsState.agentStates[subId].currentStep = step; + agentsState.agentStates[subId].stepLabel = stepLabels[step] || step; + agentsState.agentStates[subId].stepDetail = diagnostics.detail || ''; + agentsState.agentStates[subId].lastStepAt = Date.now(); + updateAgentCardPhase(subId); + return; + } + + if (subId && (phase === 'agent_paused' || phase === 'agent_stopped')) { + if (agentsState.agentStates[subId]) { + agentsState.agentStates[subId].state = 'paused'; + agentsState.agentStates[subId].currentStep = 'paused'; + agentsState.agentStates[subId].stepLabel = 'Paused'; + } + } + if (subId && (phase === 'agent_resumed' || phase === 'agent_started')) { + if (agentsState.agentStates[subId]) { + agentsState.agentStates[subId].state = 'running'; + agentsState.agentStates[subId].currentStep = ''; + agentsState.agentStates[subId].stepLabel = ''; + } } renderSubsystemHealthGrid(); } +function updateAgentCardPhase(subId) { + const card = document.querySelector(`.agents-health-card[data-subsystem-id="${CSS.escape(subId)}"]`); + if (!card) return; + const phaseEl = card.querySelector('.health-agent-phase'); + if (!phaseEl) return; + const agState = agentsState.agentStates[subId] || {}; + const step = agState.currentStep || ''; + const isActive = step && step !== 'waiting' && step !== 'paused'; + phaseEl.textContent = agState.stepLabel || ''; + phaseEl.className = 'health-agent-phase' + (isActive ? ' phase-active' : ''); + if (isActive) { + card.classList.add('agent-cycling'); + } else { + card.classList.remove('agent-cycling'); + } +} + function renderSubsystemHealthGrid() { const container = document.getElementById('agents-health-grid'); if (!container) return; const entries = Object.entries(agentsState.subsystemHealth); if (!entries.length) { - container.innerHTML = '
Start monitoring to see subsystem health.
'; + container.innerHTML = '
Start monitoring to see subsystem agents.
'; return; } @@ -3713,11 +3788,34 @@ function renderSubsystemHealthGrid() { const anomalyClass = candidates > 0 ? (level === 'critical' ? ' has-critical' : ' has-anomalies') : ''; const history = agentsState.subsystemHistory[subId] || []; + const agState = agentsState.agentStates[subId] || {}; + const isPaused = agState.state === 'paused'; + const agentIcon = isPaused ? '▶' : '▮▮'; + const agentTitle = isPaused ? 'Resume agent' : 'Pause agent'; + + const currentStep = agState.currentStep || ''; + const isActive = currentStep && currentStep !== 'waiting' && currentStep !== 'paused'; + const phaseLabel = agState.stepLabel || ''; + const cyclingClass = isActive ? ' agent-cycling' : ''; + + const metricsRow = ` +
+ ${escapeHtml(phaseLabel)} + #${agState.cycleCount || 0} + ${agState.avgCycleMs || 0}ms + cand: ${agState.totalCandidates || 0} + tri: ${agState.totalTriaged || 0} +
+ `; + let expandedBody = ''; if (isExpanded) { const bigTrend = renderTrendBars(history, 48); const tagRows = renderTagSignalRows(sig.tagSignals || []); const tagCount = (sig.tagSignals || []).length; + const subEvents = getFilteredEventsForSubsystem(subId); + const eventRows = renderSubsystemEventRows(subEvents); + const eventCount = subEvents.length; expandedBody = `
${bigTrend}
@@ -3729,6 +3827,11 @@ function renderSubsystemHealthGrid() { NameTrendz-scoreAvgCurrent
${tagRows}
+
+

Events

+ ${eventCount} events +
+
${eventRows}
`; } else { @@ -3736,14 +3839,18 @@ function renderSubsystemHealthGrid() { } return ` -
+
${escapeHtml(name)}
- ${escapeHtml(type)} +
+ + ${escapeHtml(type)} +
+ ${metricsRow}
Tags @@ -3765,13 +3872,124 @@ function renderSubsystemHealthGrid() { }) .join(''); + container.querySelectorAll('.btn-agent-toggle').forEach((btn) => { + btn.addEventListener('click', (e) => { + e.stopPropagation(); + const subId = btn.getAttribute('data-subsystem-id'); + if (!subId) return; + const agState = agentsState.agentStates[subId] || {}; + if (agState.state === 'paused') window.api.agentsStartSubsystem(subId); + else window.api.agentsStopSubsystem(subId); + }); + }); + container.querySelectorAll('.agents-health-card').forEach((card) => { card.addEventListener('click', (e) => { - if (e.target.closest('.health-tag-list')) return; + if (e.target.closest('.health-tag-list') || e.target.closest('.health-event-list') || e.target.closest('.btn-agent-toggle')) return; const subId = card.getAttribute('data-subsystem-id'); selectSubsystem(subId); }); }); + + container.querySelectorAll('.health-event-row').forEach((row) => { + row.addEventListener('click', (e) => { + e.stopPropagation(); + const eventId = row.getAttribute('data-event-id'); + if (eventId) selectAgentEvent(eventId); + }); + }); + + container.querySelectorAll('.health-event-detail-actions .btn-deep-analyze').forEach((btn) => { + btn.addEventListener('click', (e) => { + e.stopPropagation(); + const eventId = btn.getAttribute('data-event-id'); + if (eventId) deepAnalyzeEvent(eventId, btn); + }); + }); + + container.querySelectorAll('.health-event-detail-actions .btn-ack-event').forEach((btn) => { + btn.addEventListener('click', (e) => { + e.stopPropagation(); + const eventId = btn.getAttribute('data-event-id'); + if (eventId) acknowledgeEvent(eventId); + }); + }); + + container.querySelectorAll('.health-event-detail-actions .btn-open-graph').forEach((btn) => { + btn.addEventListener('click', (e) => { + e.stopPropagation(); + const eventId = btn.getAttribute('data-event-id'); + if (!eventId) return; + const event = agentsState.events.find((ev) => ev.event_id === eventId); + if (!event) return; + const 
target = resolveAgentGraphTarget(event); + if (target) openGraphModal(target.name, target.type, event.summary || target.name); + }); + }); +} + +function renderSubsystemEventRows(events) { + if (!events.length) return '
No events for this subsystem.
'; + return events.slice(0, 50).map((event) => { + const sev = String(event.severity || 'low').toLowerCase(); + const isSelected = event.event_id === agentsState.selectedEventId; + const selectedClass = isSelected ? ' selected' : ''; + const tagLabel = event.tag_name || event.source_tag || ''; + const timeLabel = formatAgentTime(event.created_at); + + let detailHtml = ''; + if (isSelected) { + detailHtml = renderInlineEventDetail(event); + } + + return ` +
+
+ ${escapeHtml(sev)} + ${escapeHtml(event.summary || 'Anomaly')} + ${escapeHtml(tagLabel)} + ${escapeHtml(timeLabel)} +
+ ${detailHtml} +
+ `; + }).join(''); +} + +function renderInlineEventDetail(event) { + let checks = [], causes = [], safety = []; + try { checks = JSON.parse(event.recommended_checks_json || '[]'); } catch (e) {} + try { causes = JSON.parse(event.probable_causes_json || '[]'); } catch (e) {} + try { safety = JSON.parse(event.safety_notes_json || '[]'); } catch (e) {} + + const st = String(event.state || '').toLowerCase(); + const ackLabel = st === 'acknowledged' ? 'Clear' : (st === 'cleared' ? 'Cleared' : 'Acknowledge'); + const ackDisabled = st === 'cleared' ? ' disabled' : ''; + const isPending = agentsState.pendingDeepAnalyze.has(event.event_id); + const analyzeLabel = isPending ? 'Analyzing…' : (event.deep_analyzed ? 'Re-Analyze' : 'Deep Analyze'); + const analyzeDisabled = isPending ? ' disabled' : ''; + + return ` +
+
+ State${escapeHtml(event.state || 'open')} + z-score${escapeHtml(String(event.z_score ?? '0'))} + MAD${escapeHtml(String(event.mad_score ?? '0'))} + Confidence${escapeHtml(String(event.confidence ?? ''))} + Category${escapeHtml(event.category || '')} + Source Tag${escapeHtml(event.source_tag || '')} +
+ ${event.explanation ? `
Explanation
${escapeHtml(event.explanation)}
` : ''} + ${causes.length ? `
Probable Causes
    ${causes.map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('')}
` : ''} + ${checks.length ? `
Checks
    ${checks.map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('')}
` : ''} + ${safety.length ? `
Safety
    ${safety.map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('')}
` : ''} +
+ + + +
+
+ `; } function renderTrendBars(history, maxHeight) { @@ -3820,7 +4038,6 @@ function renderTagSignalRows(tagSignals) { if (!tagSignals || !tagSignals.length) { return '
No tag data available yet.
'; } - return tagSignals .map((tag) => { const absZ = Math.abs(tag.z || 0); @@ -3848,13 +4065,23 @@ function selectSubsystem(subId) { const clearBtn = document.getElementById('btn-agents-clear-subsystem'); if (agentsState.selectedSubsystemId === subId) { agentsState.selectedSubsystemId = null; + agentsState.selectedEventId = null; if (clearBtn) clearBtn.style.display = 'none'; } else { agentsState.selectedSubsystemId = subId; + agentsState.selectedEventId = null; if (clearBtn) clearBtn.style.display = ''; } renderSubsystemHealthGrid(); - renderAgentEventList(); +} + +function selectAgentEvent(eventId) { + if (agentsState.selectedEventId === eventId) { + agentsState.selectedEventId = null; + } else { + agentsState.selectedEventId = eventId; + } + renderSubsystemHealthGrid(); } function updateAgentStatusUi(status, text) { @@ -3870,95 +4097,6 @@ function updateAgentStatusUi(status, text) { if (el.btnStop) el.btnStop.disabled = !(normalized === 'running' || normalized === 'starting' || normalized === 'stopping'); } -function updateAgentMetrics(metrics = {}, heartbeatTs = null) { - const el = getAgentsElements(); - if (el.metricCycle) el.metricCycle.textContent = String(metrics.cycleMs ?? metrics.lastCycleMs ?? 0); - if (el.metricCandidates) el.metricCandidates.textContent = String(metrics.candidates ?? metrics.lastCandidates ?? 0); - if (el.metricTriaged) el.metricTriaged.textContent = String(metrics.triaged ?? metrics.lastTriaged ?? 0); - if (el.metricEmitted) el.metricEmitted.textContent = String(metrics.emitted ?? metrics.lastEmitted ?? 
0); - if (el.metricHeartbeat) el.metricHeartbeat.textContent = formatAgentTime(heartbeatTs || metrics.timestamp); -} - -function getFilteredAgentEvents() { - const el = getAgentsElements(); - const state = (el.filterState?.value || '').toLowerCase(); - const severity = (el.filterSeverity?.value || '').toLowerCase(); - const search = (el.filterSearch?.value || '').trim().toLowerCase(); - const subFilter = agentsState.selectedSubsystemId || ''; - return agentsState.events.filter((event) => { - if (state && String(event.state || '').toLowerCase() !== state) return false; - if (severity && String(event.severity || '').toLowerCase() !== severity) return false; - if (subFilter) { - const eventSubId = event.subsystem_id - || `${(event.subsystem_type || 'global')}:${(event.subsystem_name || 'all').toLowerCase()}`; - if (eventSubId !== subFilter) return false; - } - if (search) { - const haystack = [ - event.summary, - event.source_tag, - event.tag_name, - event.subsystem_name, - event.subsystem_type, - ...(event.equipment || []), - ...(event.tags || []), - ] - .filter(Boolean) - .join(' ') - .toLowerCase(); - if (!haystack.includes(search)) return false; - } - return true; - }); -} - -function renderAgentEventList() { - const el = getAgentsElements(); - if (!el.list) return; - const events = getFilteredAgentEvents(); - if (!events.length) { - const subName = agentsState.selectedSubsystemId - ? (agentsState.subsystemHealth[agentsState.selectedSubsystemId]?.subsystemName || agentsState.selectedSubsystemId) - : ''; - const msg = subName - ? `No anomaly events for "${subName}".` - : 'No anomaly events match the current filters.'; - el.list.innerHTML = `
${escapeHtml(msg)}
`; - return; - } - el.list.innerHTML = events - .map((event) => { - const active = event.event_id === agentsState.selectedEventId ? ' active' : ''; - const sev = String(event.severity || 'low').toLowerCase(); - const equipment = (event.equipment || []).slice(0, 2).join(', '); - const subsystemLabel = event.subsystem_name - ? `${event.subsystem_type || 'subsystem'}: ${event.subsystem_name}` - : ''; - const baseMeta = [event.tag_name || event.source_tag || '', equipment, subsystemLabel] - .filter(Boolean) - .join(' • '); - return ` -
-
- ${escapeHtml(sev)} - ${escapeHtml(formatAgentTime(event.created_at))} -
-
${escapeHtml(event.summary || 'Untitled anomaly')}
-
${escapeHtml(baseMeta)}
-
- `; - }) - .join(''); - - el.list.querySelectorAll('.agents-event-card').forEach((card) => { - card.addEventListener('click', () => { - const eventId = card.getAttribute('data-event-id'); - if (!eventId) return; - selectAgentEvent(eventId); - }); - }); -} - function resolveAgentGraphTarget(event) { if (String(event.subsystem_type || '').toLowerCase() === 'view' && event.subsystem_name) { return { name: event.subsystem_name, type: 'View' }; @@ -3973,102 +4111,6 @@ function resolveAgentGraphTarget(event) { return null; } -function renderAgentEventDetails(event) { - const el = getAgentsElements(); - if (!el.detail) return; - if (!event) { - el.detail.innerHTML = '

Select an anomaly event from the feed.

'; - if (el.btnDeepAnalyze) el.btnDeepAnalyze.disabled = true; - if (el.btnOpenGraph) el.btnOpenGraph.disabled = true; - if (el.btnAck) el.btnAck.disabled = true; - return; - } - - let checks = []; - let causes = []; - let safety = []; - try { checks = JSON.parse(event.recommended_checks_json || '[]'); } catch (e) {} - try { causes = JSON.parse(event.probable_causes_json || '[]'); } catch (e) {} - try { safety = JSON.parse(event.safety_notes_json || '[]'); } catch (e) {} - - el.detail.innerHTML = ` -
-
Event ID${escapeHtml(event.event_id || '')}
-
State${escapeHtml(event.state || '')}
-
Severity${escapeHtml(event.severity || '')}
-
Confidence${escapeHtml(String(event.confidence ?? ''))}
-
Category${escapeHtml(event.category || '')}
-
Timestamp${escapeHtml(formatAgentTime(event.created_at))}
-
Subsystem Type${escapeHtml(event.subsystem_type || 'global')}
-
Subsystem${escapeHtml(event.subsystem_name || 'all')}
-
Source Tag${escapeHtml(event.source_tag || '')}
-
Tag Name${escapeHtml(event.tag_name || '')}
-
z-score${escapeHtml(String(event.z_score ?? '0'))}
-
MAD score${escapeHtml(String(event.mad_score ?? '0'))}
-
-
-
Summary
-
${escapeHtml(event.summary || '')}
-
-
-
Explanation
-
${escapeHtml(event.explanation || '')}
-
-
-
Probable Causes
-
    ${(causes || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
-
-
-
Verification Checks
-
    ${(checks || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
-
-
-
Safety Notes
-
    ${(safety || []).map((x) => `
  • ${escapeHtml(String(x))}
  • `).join('') || '
  • n/a
  • '}
-
- `; - - if (el.btnDeepAnalyze) { - el.btnDeepAnalyze.disabled = false; - el.btnDeepAnalyze.textContent = event.llm_triaged ? 'Re-Analyze' : 'Deep Analyze'; - } - if (el.btnOpenGraph) el.btnOpenGraph.disabled = !resolveAgentGraphTarget(event); - if (el.btnAck) { - const state = String(event.state || '').toLowerCase(); - if (state === 'acknowledged') { - el.btnAck.textContent = 'Clear'; - el.btnAck.disabled = false; - } else if (state === 'cleared') { - el.btnAck.textContent = 'Cleared'; - el.btnAck.disabled = true; - } else { - el.btnAck.textContent = 'Acknowledge'; - el.btnAck.disabled = false; - } - } -} - -async function selectAgentEvent(eventId) { - agentsState.selectedEventId = eventId; - const existing = agentsState.events.find((e) => e.event_id === eventId); - if (existing && existing.explanation && existing.recommended_checks_json) { - renderAgentEventList(); - renderAgentEventDetails(existing); - return; - } - const detailResult = await window.api.agentsGetEvent(eventId); - if (detailResult.success && detailResult.event) { - const idx = agentsState.events.findIndex((e) => e.event_id === eventId); - if (idx >= 0) { - agentsState.events[idx] = { ...agentsState.events[idx], ...detailResult.event }; - } else { - agentsState.events.unshift(detailResult.event); - } - renderAgentEventList(); - renderAgentEventDetails(detailResult.event); - } -} - async function loadAgentEvents() { const el = getAgentsElements(); const result = await window.api.agentsListEvents({ @@ -4079,12 +4121,7 @@ async function loadAgentEvents() { }); if (!result.success) return; agentsState.events = Array.isArray(result.events) ? 
result.events : []; - renderAgentEventList(); - - if (agentsState.selectedEventId) { - const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); - renderAgentEventDetails(selected || null); - } + renderSubsystemHealthGrid(); } async function refreshAgentStatus() { @@ -4097,7 +4134,6 @@ async function refreshAgentStatus() { agentsState.runId = status.runId || agentsState.runId; agentsState.status = status.status || 'running'; updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId}`); - updateAgentMetrics(status.metrics || {}, status.lastHeartbeatAt); } else { agentsState.status = 'idle'; updateAgentStatusUi('idle', 'No active run'); @@ -4108,7 +4144,10 @@ async function startAgentsMonitoring() { const config = getAgentsConfigFromUI(); agentsState.subsystemHealth = {}; agentsState.subsystemHistory = {}; + agentsState.agentStates = {}; agentsState.selectedSubsystemId = null; + agentsState.selectedEventId = null; + agentsState.events = []; renderSubsystemHealthGrid(); const clearSubBtn = document.getElementById('btn-agents-clear-subsystem'); if (clearSubBtn) clearSubBtn.style.display = 'none'; @@ -4135,70 +4174,59 @@ async function stopAgentsMonitoring() { updateAgentStatusUi('stopped', 'Monitoring stopped'); } -async function deepAnalyzeSelectedEvent() { - if (!agentsState.selectedEventId) return; - const el = getAgentsElements(); - if (el.btnDeepAnalyze) { - el.btnDeepAnalyze.disabled = true; - el.btnDeepAnalyze.textContent = 'Analyzing…'; +async function deepAnalyzeEvent(eventId, btnEl) { + const event = agentsState.events.find((e) => e.event_id === eventId); + if (!event) { + console.error('[Agents] deep-analyze: event not found in local state', eventId); + if (btnEl) { btnEl.textContent = 'Not Found'; btnEl.disabled = false; } + return; } + agentsState.pendingDeepAnalyze.add(eventId); + if (btnEl) { btnEl.disabled = true; btnEl.textContent = 'Analyzing…'; } try { - const result = await 
window.api.agentsDeepAnalyze(agentsState.selectedEventId); - if (result.success && result.event) { - const idx = agentsState.events.findIndex((e) => e.event_id === agentsState.selectedEventId); - if (idx >= 0) agentsState.events[idx] = { ...agentsState.events[idx], ...result.event }; - renderAgentEventList(); - renderAgentEventDetails(result.event); - } else { + const result = await window.api.agentsDeepAnalyze(eventId, event); + if (!result.success) { console.error('[Agents] deep-analyze failed:', result.error); - if (el.btnDeepAnalyze) { - el.btnDeepAnalyze.textContent = 'Failed — Retry'; - el.btnDeepAnalyze.disabled = false; - } + agentsState.pendingDeepAnalyze.delete(eventId); + if (btnEl) { btnEl.textContent = 'Failed — Retry'; btnEl.disabled = false; } } + // Button stays disabled — result arrives async via AGENT_EVENT with deepAnalyze=true } catch (err) { console.error('[Agents] deep-analyze error:', err); - if (el.btnDeepAnalyze) { - el.btnDeepAnalyze.textContent = 'Failed — Retry'; - el.btnDeepAnalyze.disabled = false; - } + agentsState.pendingDeepAnalyze.delete(eventId); + if (btnEl) { btnEl.textContent = 'Failed — Retry'; btnEl.disabled = false; } } } -async function acknowledgeSelectedAgentEvent() { - if (!agentsState.selectedEventId) return; - const selected = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); - const state = String(selected?.state || '').toLowerCase(); - const result = state === 'acknowledged' - ? await window.api.agentsClearEvent(agentsState.selectedEventId, '') - : await window.api.agentsAckEvent(agentsState.selectedEventId, ''); +async function acknowledgeEvent(eventId) { + const event = agentsState.events.find((e) => e.event_id === eventId); + const st = String(event?.state || '').toLowerCase(); + const result = st === 'acknowledged' + ? 
await window.api.agentsClearEvent(eventId, '') + : await window.api.agentsAckEvent(eventId, ''); if (!result.success) return; - await loadAgentEvents(); - const refreshed = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); - renderAgentEventDetails(refreshed || null); + const idx = agentsState.events.findIndex((e) => e.event_id === eventId); + if (idx >= 0) { + agentsState.events[idx].state = st === 'acknowledged' ? 'cleared' : 'acknowledged'; + } + renderSubsystemHealthGrid(); } function upsertRealtimeAgentEvent(payload) { - if (!payload || !payload.eventId) return; - const idx = agentsState.events.findIndex((e) => e.event_id === payload.eventId); - const next = { - event_id: payload.eventId, - severity: payload.severity || 'medium', - summary: payload.summary || 'Anomaly detected', - category: payload.category || 'deviation', - created_at: payload.createdAt || new Date().toISOString(), - source_tag: payload.entityRefs?.sourceTag || payload.entityRefs?.tag || '', - tag_name: payload.entityRefs?.tag || '', - subsystem_type: payload.entityRefs?.subsystemType || '', - subsystem_name: payload.entityRefs?.subsystemName || '', - state: 'open', - }; - if (idx >= 0) { - agentsState.events[idx] = { ...agentsState.events[idx], ...next }; - } else { - agentsState.events.unshift(next); + const evt = payload?.event; + if (!evt || !evt.event_id) return; + if (payload.deepAnalyze) { + agentsState.pendingDeepAnalyze.delete(evt.event_id); + if (evt.deep_analyze_error) { + console.error('[Agents] Deep analyze failed:', evt.deep_analyze_error); + } else { + console.log('[Agents] Deep analyze complete for', evt.event_id); + } } - renderAgentEventList(); + const idx = agentsState.events.findIndex((e) => e.event_id === evt.event_id); + if (idx >= 0) agentsState.events[idx] = { ...agentsState.events[idx], ...evt }; + else agentsState.events.unshift(evt); + renderSubsystemHealthGrid(); } function ensureAgentListeners() { @@ -4210,29 +4238,28 @@ function 
ensureAgentListeners() { if (payload.runId) agentsState.runId = payload.runId; agentsState.status = payload.state || agentsState.status; updateAgentStatusUi(agentsState.status, `Run ${agentsState.runId || 'n/a'}`); - updateAgentMetrics(payload, payload.timestamp); + const diagnostics = payload.diagnostics || {}; const phase = diagnostics.phase || '?'; - console.log(`[Agents] phase=${phase} tags=${diagnostics.monitoredTags ?? '?'}`); - - if (phase === 'cycle_complete') { - const signals = diagnostics.subsystemShiftSignals; - const subCount = Array.isArray(signals) ? signals.length : 0; - const evaluated = (diagnostics.evaluatedLinked || 0) + (diagnostics.evaluatedUnlinked || 0); - console.log(`[Agents] cycle_complete: ${subCount} subsystems, ${evaluated} evaluated, ${diagnostics.candidateLinked || 0} candidates`); + const subId = payload.subsystemId || diagnostics.subsystemId || ''; + if (phase === 'triage_slow_candidate') { + console.warn(`[Agent ${subId}] SLOW TRIAGE: ${diagnostics.tag} use_llm=${diagnostics.use_llm} llm=${diagnostics.llm_ms}ms persist=${diagnostics.persist_ms}ms total=${diagnostics.total_ms}ms`); + } else if (phase === 'cycle_complete' && subId) { + const t = diagnostics.timingMs || {}; + console.log(`[Agent ${subId}] cycle #${diagnostics.cycleCount || '?'} ${payload.cycleMs || diagnostics.avgCycleMs || 0}ms (read=${t.read || '?'}ms hist=${t.history || '?'}ms score=${t.score || '?'}ms triage=${t.triage || '?'}ms) ${payload.candidates || 0} cand`); + } else if (phase === 'agents_started' || phase === 'rediscovery_complete') { + console.log(`[Agents] ${phase}: ${diagnostics.agentCount || 0} agents`); } - updateSubsystemHealthFromDiagnostics(diagnostics); + updateSubsystemHealthFromStatus(payload); }); - window.api.onAgentEvent((payload) => { - upsertRealtimeAgentEvent(payload); - }); + window.api.onAgentEvent((payload) => upsertRealtimeAgentEvent(payload)); window.api.onAgentError((payload) => { if (!payload) return; - console.error('[Agents 
error]', payload); - updateAgentStatusUi('error', payload.message || 'Agent runtime error'); + console.error('[Agents error]', payload.code, payload.message); + if (!payload.recoverable) updateAgentStatusUi('error', payload.message || 'Agent runtime error'); }); window.api.onAgentComplete((payload) => { @@ -4240,7 +4267,6 @@ function ensureAgentListeners() { console.log('[Agents] run complete, success=' + payload.success); agentsState.status = payload.success ? 'stopped' : 'failed'; updateAgentStatusUi(agentsState.status, payload.reason || 'Run complete'); - refreshAgentStatus(); }); } @@ -4257,25 +4283,15 @@ function initAgentsTab() { await window.api.agentsCleanup(14); await loadAgentEvents(); }); - el.btnDeepAnalyze?.addEventListener('click', deepAnalyzeSelectedEvent); - el.btnAck?.addEventListener('click', acknowledgeSelectedAgentEvent); - el.btnOpenGraph?.addEventListener('click', () => { - const event = agentsState.events.find((e) => e.event_id === agentsState.selectedEventId); - if (!event) return; - const target = resolveAgentGraphTarget(event); - if (!target) return; - openGraphModal(target.name, target.type, event.summary || target.name); - }); - el.filterState?.addEventListener('change', loadAgentEvents); - el.filterSeverity?.addEventListener('change', loadAgentEvents); - el.filterSearch?.addEventListener('input', renderAgentEventList); - + el.filterState?.addEventListener('change', () => renderSubsystemHealthGrid()); + el.filterSeverity?.addEventListener('change', () => renderSubsystemHealthGrid()); + el.filterSearch?.addEventListener('input', () => renderSubsystemHealthGrid()); const clearSubBtn = document.getElementById('btn-agents-clear-subsystem'); clearSubBtn?.addEventListener('click', () => { agentsState.selectedSubsystemId = null; + agentsState.selectedEventId = null; clearSubBtn.style.display = 'none'; renderSubsystemHealthGrid(); - renderAgentEventList(); }); } refreshAgentStatus(); diff --git a/electron-ui/styles.css b/electron-ui/styles.css 
index c967b08..35e7ffc 100644 --- a/electron-ui/styles.css +++ b/electron-ui/styles.css @@ -3227,6 +3227,39 @@ select.input, text-overflow: ellipsis; } +.health-card-controls { + display: flex; + align-items: center; + gap: var(--space-1); + flex-shrink: 0; +} + +.btn-agent-toggle { + background: transparent; + border: 1px solid var(--color-border); + border-radius: var(--radius-sm); + color: var(--color-text-muted); + cursor: pointer; + font-size: 10px; + padding: 2px 6px; + line-height: 1; + transition: color var(--transition-fast), border-color var(--transition-fast); +} + +.btn-agent-toggle:hover { + color: var(--color-text); + border-color: var(--color-border-active); +} + +.agents-health-card.agent-paused { + opacity: 0.6; +} + +.agents-health-card.agent-paused .health-indicator { + opacity: 0.4; + animation: none; +} + .health-card-type { font-size: 10px; text-transform: uppercase; @@ -3240,6 +3273,49 @@ select.input, white-space: nowrap; } +.health-agent-metrics { + display: flex; + gap: var(--space-2); + margin-bottom: var(--space-2); + font-size: 10px; + font-family: var(--font-mono); + color: var(--color-text-muted); + padding: 2px var(--space-1); + border-bottom: 1px solid var(--color-border-subtle); + align-items: center; +} + +.health-agent-metrics span { + white-space: nowrap; +} + +.health-agent-phase { + font-family: var(--font-sans); + font-weight: 500; + color: var(--color-text-muted); + min-width: 0; + overflow: hidden; + text-overflow: ellipsis; +} + +.health-agent-phase.phase-active { + color: var(--color-accent); +} + +.agents-health-card.agent-cycling { + border-color: rgba(34, 211, 238, 0.3); + box-shadow: 0 0 0 1px rgba(34, 211, 238, 0.08) inset; +} + +@keyframes agent-cycle-pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.5; } +} + +.agents-health-card.agent-cycling .health-indicator { + animation: agent-cycle-pulse 1s ease-in-out infinite; +} + .health-card-stats { display: grid; grid-template-columns: 1fr 1fr 1fr; @@ -3464,38 
+3540,7 @@ select.input, text-align: center; } -.agents-main { - display: grid; - grid-template-columns: minmax(300px, 38%) minmax(0, 1fr); - gap: var(--space-3); - min-height: 480px; -} - -.agents-feed-panel, -.agents-detail-panel { - border: 1px solid var(--color-border); - background: var(--color-bg-panel); - border-radius: var(--radius-lg); - overflow: hidden; - display: flex; - flex-direction: column; -} - -.agents-feed-header, -.agents-detail-header { - padding: var(--space-3); - border-bottom: 1px solid var(--color-border-subtle); - display: flex; - justify-content: space-between; - align-items: center; - gap: var(--space-2); -} - -.agents-feed-header h3, -.agents-detail-header h3 { - font-size: var(--text-md); - font-weight: 600; -} +/* ---- Inline event list/detail within subsystem cards ---- */ .agents-feed-filters { display: flex; @@ -3504,50 +3549,7 @@ select.input, } .agents-feed-filters .input { - min-width: 120px; -} - -.agents-event-list { - overflow-y: auto; - padding: var(--space-2); - display: flex; - flex-direction: column; - gap: var(--space-2); - flex: 1; -} - -.agents-empty { - color: var(--color-text-muted); - font-size: var(--text-sm); - padding: var(--space-4); - text-align: center; -} - -.agents-event-card { - border: 1px solid var(--color-border); - background: var(--color-bg-panel-2); - border-radius: var(--radius-md); - padding: var(--space-2) var(--space-3); - cursor: pointer; - transition: border-color var(--transition-fast), transform var(--transition-fast); -} - -.agents-event-card:hover { - border-color: var(--color-border-active); - transform: translateY(-1px); -} - -.agents-event-card.active { - border-color: var(--color-accent); - box-shadow: 0 0 0 1px rgba(34, 211, 238, 0.35) inset; -} - -.agents-event-line-top { - display: flex; - justify-content: space-between; - align-items: center; - margin-bottom: 4px; - gap: var(--space-2); + min-width: 100px; } .agents-severity { @@ -3557,6 +3559,7 @@ select.input, padding: 2px 6px; 
border-radius: 999px; border: 1px solid transparent; + flex-shrink: 0; } .agents-severity.sev-critical { @@ -3583,57 +3586,100 @@ select.input, border-color: rgba(59, 130, 246, 0.35); } -.agents-event-time { +.health-event-list { + display: flex; + flex-direction: column; + gap: var(--space-1); + max-height: 400px; + overflow-y: auto; +} + +.health-event-row { + border: 1px solid var(--color-border-subtle); + border-radius: var(--radius-sm); + background: var(--color-bg-panel); + cursor: pointer; + transition: border-color var(--transition-fast); +} + +.health-event-row:hover { + border-color: var(--color-border-active); +} + +.health-event-row.selected { + border-color: var(--color-accent); + background: var(--color-bg-elevated); +} + +.health-event-row-header { + display: grid; + grid-template-columns: auto 1fr auto auto; + gap: var(--space-2); + align-items: center; + padding: 5px var(--space-2); font-size: var(--text-xs); - color: var(--color-text-muted); - font-family: var(--font-mono); } -.agents-event-summary { - font-size: var(--text-sm); +.health-event-summary { color: var(--color-text); - margin-bottom: 4px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; } -.agents-event-meta { - font-size: var(--text-xs); +.health-event-tag { + font-family: var(--font-mono); color: var(--color-text-muted); + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + max-width: 140px; } -.agents-detail-content { - padding: var(--space-3); - overflow-y: auto; - font-size: var(--text-sm); +.health-event-time { + font-family: var(--font-mono); + color: var(--color-text-muted); + white-space: nowrap; +} + +.health-event-detail { + padding: var(--space-2) var(--space-3); + border-top: 1px solid var(--color-border-subtle); + font-size: var(--text-xs); display: flex; flex-direction: column; - gap: var(--space-3); + gap: var(--space-2); } -.agents-detail-grid { +.health-event-detail-grid { display: grid; - grid-template-columns: 1fr 1fr; - gap: 
var(--space-2) var(--space-3); -} - -.agents-detail-item { - display: flex; - flex-direction: column; - gap: 2px; + grid-template-columns: auto 1fr auto 1fr auto 1fr; + gap: 3px var(--space-2); + align-items: baseline; } -.agents-detail-label { - font-size: var(--text-xs); +.health-event-detail .detail-label { + font-size: 10px; color: var(--color-text-muted); text-transform: uppercase; letter-spacing: 0.3px; } -.agents-detail-value { - font-family: var(--font-mono); - color: var(--color-text); +.health-event-detail .detail-section { + display: flex; + flex-direction: column; + gap: 2px; +} + +.health-event-detail-actions { + display: flex; + gap: var(--space-2); + padding-top: var(--space-1); } .agents-list { - margin-left: var(--space-4); + margin: 0; + padding-left: var(--space-4); color: var(--color-text-secondary); + font-size: var(--text-xs); } diff --git a/scripts/anomaly_monitor.py b/scripts/anomaly_monitor.py index 0d0d91a..db9612c 100644 --- a/scripts/anomaly_monitor.py +++ b/scripts/anomaly_monitor.py @@ -1,15 +1,29 @@ #!/usr/bin/env python3 """ -Long-running anomaly monitor worker. - -Modes: - - run: start continuous monitoring loop - - status: get run status - - list-events: list persisted anomaly events - - get-event: fetch one anomaly event - - ack-event: mark event as acknowledged - - cleanup: delete old events by retention policy - - replay-fixtures: run deterministic fixture validation +Per-subsystem anomaly monitoring with coordinator + worker threads. 
+ +Architecture: + AgentCoordinator (main thread) + - Discovers subsystems from Neo4j ontology + - Spawns/manages SubsystemAgent threads + - Reads stdin for commands (start/stop individual agents) + - Shared: Neo4j driver, IgnitionApiClient, thread-safe emit() + + SubsystemAgent (one thread per subsystem) + - Own cycle loop, history cache, prev_values, ClaudeClient + - Monitors only its assigned tags + - Emits per-subsystem status/events via thread-safe emit() + +CLI modes: + run Start coordinator with per-subsystem agents + list-events List persisted anomaly events + get-event Fetch one anomaly event + ack-event Acknowledge an event + clear-event Clear an acknowledged event + deep-analyze Run LLM triage on an existing event + cleanup Delete old events + status Get run status + replay-fixtures Validate scoring against fixtures """ from __future__ import annotations @@ -17,17 +31,19 @@ import argparse import json import os +import queue import signal import sys +import threading import time import uuid from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Callable, Dict, List, Optional, Set, Tuple try: from dotenv import load_dotenv -except ImportError: # pragma: no cover - optional fallback for minimal environments +except ImportError: def load_dotenv(*_args, **_kwargs): return False @@ -40,82 +56,44 @@ def load_dotenv(*_args, **_kwargs): safe_float, ) - load_dotenv() +_api_semaphore = threading.Semaphore(2) # max 2 concurrent Ignition API calls + +_emit_queue: queue.Queue = queue.Queue() + + +def _emit_writer() -> None: + """Dedicated thread that drains the emit queue to stdout.""" + while True: + line = _emit_queue.get() + if line is None: + break + try: + sys.stdout.write(line) + sys.stdout.flush() + except Exception: + pass + + +_emit_thread = threading.Thread(target=_emit_writer, daemon=True, name="emit-writer") +_emit_thread.start() + def utc_now_iso() -> 
str: return datetime.now(timezone.utc).isoformat() def emit(prefix: str, payload: Dict[str, Any]) -> None: - """Emit machine-parseable messages for Electron main process.""" - print(f"[{prefix}] {json.dumps(payload, default=str)}", flush=True) + _emit_queue.put(f"[{prefix}] {json.dumps(payload, default=str)}\n") DEFAULT_SUBSYSTEM_PRIORITY = ["view", "equipment", "group", "global"] -def _preview_value(value: Any, max_len: int = 120) -> Any: - if value is None or isinstance(value, (bool, int, float)): - return value - text = str(value) - if len(text) <= max_len: - return text - return text[: max_len - 3] + "..." - - -def make_default_diagnostics( - *, - staleness_threshold_sec: int = 120, - phase: str = "initializing", - reason: str = "", -) -> Dict[str, Any]: - return { - "phase": phase, - "reason": reason, - "monitoredTags": 0, - "linkedTags": 0, - "unlinkedTags": 0, - "validLiveCount": 0, - "missingTimestampCount": 0, - "inferredTimestampCount": 0, - "liveErrorCount": 0, - "liveErrorLinked": 0, - "liveErrorUnlinked": 0, - "qualityFilteredCount": 0, - "qualityFilteredLinked": 0, - "qualityFilteredUnlinked": 0, - "staleFilteredCount": 0, - "staleFilteredLinked": 0, - "staleFilteredUnlinked": 0, - "historyErrorCount": 0, - "historyErrorLinked": 0, - "historyErrorUnlinked": 0, - "insufficientHistoryCount": 0, - "lowHistoryCandidateCount": 0, - "evaluatedLinked": 0, - "evaluatedUnlinked": 0, - "candidateLinked": 0, - "candidateUnlinked": 0, - "nearShiftCount": 0, - "nearShiftLinked": 0, - "nearShiftUnlinked": 0, - "stalenessThresholdSec": staleness_threshold_sec, - "staleSamples": [], - "timestampParseNote": "Naive timestamps are treated as local time by parse_timestamp().", - "detectedSubsystemCount": 0, - "detectedSubsystems": [], - "candidateSubsystemCount": 0, - "candidateBySubsystem": {}, - "subsystemShiftSignals": [], - "maxCandidatesPerSubsystem": 0, - "maxLlmTriagesPerSubsystem": 0, - "llmTriagedCount": 0, - "dedupSuppressedCount": 0, - "toolCalls": [], - } - 
+# --------------------------------------------------------------------------- +# Subsystem helpers +# --------------------------------------------------------------------------- def _canonical_subsystem_type(kind: Any) -> str: value = str(kind or "").strip().lower() @@ -149,7 +127,6 @@ def infer_tag_group(tag_path: Optional[str], folder_name: Optional[str] = None) head = folder.split("/", 1)[0].strip() if head: return head - raw = str(tag_path or "").strip() if not raw: return None @@ -159,30 +136,26 @@ def infer_tag_group(tag_path: Optional[str], folder_name: Optional[str] = None) if not raw: return None parts = [p.strip() for p in raw.split("/") if p.strip()] - # Ignore flat tags and only infer a group when there is at least one folder segment. if len(parts) < 2: return None return parts[0] -def _last_segment_from_tag_path(tag_path: Optional[str]) -> str: +def _last_segment(tag_path: Optional[str]) -> str: raw = str(tag_path or "").strip() if not raw: return "" if raw.startswith("[") and "]" in raw: raw = raw.split("]", 1)[1] raw = raw.strip("/") - if not raw: - return "" parts = [p.strip() for p in raw.split("/") if p.strip()] return parts[-1] if parts else raw -def _looks_like_live_tag_path(value: Optional[str]) -> bool: +def _looks_like_tag_path(value: Optional[str]) -> bool: path = str(value or "").strip() if not path: return False - # Typical Ignition path shape: [provider]Folder/Tag or Folder/Tag if path.startswith("[") and "]" in path: return True if "/" in path and not any(ch in path for ch in "{}()"): @@ -197,35 +170,32 @@ def derive_subsystems_for_tag( ) -> Tuple[List[Dict[str, str]], Dict[str, str]]: mode = str(subsystem_mode or "auto").strip().lower() if mode in {"global", "off", "disabled"}: - global_ref = _subsystem_ref("global", "all") - return [global_ref], global_ref + ref = _subsystem_ref("global", "all") + return [ref], ref refs: List[Dict[str, str]] = [] seen: Set[str] = set() - def add_ref(kind: str, name: Optional[str]) -> None: + def 
add(kind: str, name: Optional[str]) -> None: if not name: return ref = _subsystem_ref(kind, name) - if ref["id"] in seen: - return - seen.add(ref["id"]) - refs.append(ref) + if ref["id"] not in seen: + seen.add(ref["id"]) + refs.append(ref) - for view_name in tag_meta.get("views") or []: - add_ref("view", str(view_name)) - for equipment_name in tag_meta.get("equipment") or []: - add_ref("equipment", str(equipment_name)) - add_ref("group", infer_tag_group(tag_meta.get("path"), tag_meta.get("folder_name"))) + for v in tag_meta.get("views") or []: + add("view", str(v)) + for e in tag_meta.get("equipment") or []: + add("equipment", str(e)) + add("group", infer_tag_group(tag_meta.get("path"), tag_meta.get("folder_name"))) if not refs: refs = [_subsystem_ref("global", "all")] - ordered_priority = [ - _canonical_subsystem_type(x) for x in (priority or DEFAULT_SUBSYSTEM_PRIORITY) - ] + ordered = [_canonical_subsystem_type(x) for x in (priority or DEFAULT_SUBSYSTEM_PRIORITY)] primary = refs[0] - for kind in ordered_priority: + for kind in ordered: found = next((s for s in refs if s.get("type") == kind), None) if found: primary = found @@ -234,11 +204,22 @@ def add_ref(kind: str, name: Optional[str]) -> None: return refs, primary +def _preview_value(value: Any, max_len: int = 120) -> Any: + if value is None or isinstance(value, (bool, int, float)): + return value + text = str(value) + return text if len(text) <= max_len else text[:max_len - 3] + "..." 
+ + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- + def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: raw = dict(config or {}) thresholds = raw.get("thresholds", {}) if isinstance(raw.get("thresholds"), dict) else {} defaults = { - "pollIntervalMs": 1000, + "pollIntervalMs": 5000, "historyWindowMinutes": 360, "minHistoryPoints": 30, "maxMonitoredTags": 200, @@ -249,17 +230,16 @@ def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: "dedupCooldownMinutes": 10, "retentionDays": 14, "cleanupEveryCycles": 40, - "historyCacheTtlSec": 30, + "historyCacheTtlSec": 60, "tagCacheTtlSec": 60, - "runMode": "live", + "rediscoveryIntervalSec": 60, "scope": { - "project": None, - "equipmentTags": [], - "tagRegex": None, "subsystemMode": "auto", "subsystemPriority": list(DEFAULT_SUBSYSTEM_PRIORITY), "subsystemInclude": [], "includeUnlinkedTags": False, + "tagRegex": None, + "equipmentTags": [], }, "thresholds": { "z": 3.0, @@ -277,320 +257,205 @@ def merge_defaults(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: cfg["thresholds"].update({k: v for k, v in thresholds.items() if v is not None}) if isinstance(raw.get("scope"), dict): cfg["scope"].update(raw["scope"]) - scope_cfg = cfg["scope"] - mode = str(scope_cfg.get("subsystemMode") or "auto").strip().lower() - if mode not in {"auto", "global", "off", "disabled"}: - mode = "auto" - scope_cfg["subsystemMode"] = mode - if not isinstance(scope_cfg.get("subsystemPriority"), list) or not scope_cfg.get("subsystemPriority"): - scope_cfg["subsystemPriority"] = list(DEFAULT_SUBSYSTEM_PRIORITY) - scope_cfg["subsystemPriority"] = [ - str(x).strip() - for x in scope_cfg.get("subsystemPriority", []) - if str(x).strip() + scope = cfg["scope"] + mode = str(scope.get("subsystemMode") or "auto").strip().lower() + scope["subsystemMode"] = mode if mode in {"auto", "global", "off", 
"disabled"} else "auto" + if not isinstance(scope.get("subsystemPriority"), list) or not scope["subsystemPriority"]: + scope["subsystemPriority"] = list(DEFAULT_SUBSYSTEM_PRIORITY) + scope["subsystemPriority"] = [ + str(x).strip() for x in scope["subsystemPriority"] if str(x).strip() ] or list(DEFAULT_SUBSYSTEM_PRIORITY) - if not isinstance(scope_cfg.get("subsystemInclude"), list): - scope_cfg["subsystemInclude"] = [] - scope_cfg["subsystemInclude"] = [ - str(x).strip().lower() - for x in scope_cfg.get("subsystemInclude", []) - if str(x).strip() - ] - scope_cfg["includeUnlinkedTags"] = bool(scope_cfg.get("includeUnlinkedTags", False)) + if not isinstance(scope.get("subsystemInclude"), list): + scope["subsystemInclude"] = [] + scope["subsystemInclude"] = [str(x).strip().lower() for x in scope["subsystemInclude"] if str(x).strip()] + scope["includeUnlinkedTags"] = bool(scope.get("includeUnlinkedTags", False)) return cfg -class AnomalyMonitor: - def __init__(self, config: Dict[str, Any], run_id: Optional[str] = None): - self.config = merge_defaults(config) - self.run_id = run_id or f"agent-run-{uuid.uuid4()}" - from ignition_api_client import IgnitionApiClient - from neo4j_ontology import get_ontology_graph - - self.graph = get_ontology_graph() - - self.api = IgnitionApiClient( - base_url=self.config.get("ignitionApiUrl") or os.getenv("IGNITION_API_URL"), - api_token=self.config.get("ignitionApiToken") or os.getenv("IGNITION_API_TOKEN"), - timeout=15.0, - ) +# ═══════════════════════════════════════════════════════════════════════════ +# SubsystemAgent — one per subsystem, runs in its own thread +# ═══════════════════════════════════════════════════════════════════════════ - self.llm = None - self._llm_enabled = bool(os.getenv("ANTHROPIC_API_KEY")) - if self._llm_enabled: - try: - from claude_client import ClaudeClient +class SubsystemAgent(threading.Thread): + """Monitors a single subsystem's tags in its own thread.""" - self.llm = ClaudeClient( - enable_tools=False, 
- ignition_api_url=self.config.get("ignitionApiUrl"), - ignition_api_token=self.config.get("ignitionApiToken"), - ) - except Exception as exc: - self._llm_enabled = False - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": "llm_init_failed", - "message": str(exc), - "recoverable": True, - "timestamp": utc_now_iso(), - }) + def __init__( + self, + *, + subsystem_id: str, + subsystem_type: str, + subsystem_name: str, + tag_metas: List[Dict[str, Any]], + graph: Any, + api: Any, + config: Dict[str, Any], + run_id: str, + stagger_delay: float = 0.0, + ): + super().__init__(daemon=True, name=f"agent-{subsystem_id}") + self.subsystem_id = subsystem_id + self.subsystem_type = subsystem_type + self.subsystem_name = subsystem_name + self.tag_metas = list(tag_metas) + self.graph = graph + self.api = api + self.config = config + self.run_id = run_id + self._stagger_delay = stagger_delay self._running = True + self._paused = False self._cycle_count = 0 + self._total_candidates = 0 + self._total_triaged = 0 + self._total_emitted = 0 + self._cycle_times: List[int] = [] self._prev_values: Dict[str, float] = {} self._history_cache: Dict[str, Dict[str, Any]] = {} - self._tag_cache: Optional[Dict[str, Any]] = None - self._tag_cache_at: float = 0.0 + self._context_cache: Dict[str, Dict[str, Any]] = {} + self._context_cache_ts: Dict[str, float] = {} - # ----------------------------- - # Schema / run lifecycle - # ----------------------------- - def init_schema(self) -> None: - self.graph.init_agent_monitoring_schema() - - def upsert_run(self, status: str, reason: Optional[str] = None) -> None: - with self.graph.session() as session: - session.run( - """ - MERGE (r:AgentRun {run_id: $run_id}) - SET r.status = $status, - r.updated_at = datetime(), - r.last_heartbeat_at = datetime(), - r.config_json = $config_json, - r.cycle_count = $cycle_count, - r.started_at = coalesce(r.started_at, datetime()), - r.stopped_at = CASE WHEN $status IN ['stopped', 'failed'] THEN datetime() ELSE 
r.stopped_at END, - r.stop_reason = CASE WHEN $reason IS NULL THEN r.stop_reason ELSE $reason END - """, - run_id=self.run_id, - status=status, - config_json=json.dumps(self.config, default=str), - cycle_count=self._cycle_count, - reason=reason, - ) - - def heartbeat(self, metrics: Dict[str, Any]) -> None: - with self.graph.session() as session: - session.run( - """ - MATCH (r:AgentRun {run_id: $run_id}) - SET r.last_heartbeat_at = datetime(), - r.cycle_count = $cycle_count, - r.last_cycle_ms = $cycle_ms, - r.last_candidates = $candidates, - r.last_triaged = $triaged, - r.last_emitted = $emitted - """, - run_id=self.run_id, - cycle_count=self._cycle_count, - cycle_ms=metrics.get("cycleMs", 0), - candidates=metrics.get("candidates", 0), - triaged=metrics.get("triaged", 0), - emitted=metrics.get("emitted", 0), - ) - - # ----------------------------- - # Tag and context collection - # ----------------------------- - def get_monitored_tags(self) -> List[Dict[str, Any]]: - ttl = float(self.config.get("tagCacheTtlSec", 60)) - now = time.time() - if self._tag_cache is not None and ttl > 0 and (now - self._tag_cache_at) < ttl: - return self._tag_cache - - result = self._fetch_monitored_tags() - self._tag_cache = result - self._tag_cache_at = time.time() - return result - - def _fetch_monitored_tags(self) -> List[Dict[str, Any]]: - max_tags = int(self.config.get("maxMonitoredTags", 200)) - scope = self.config.get("scope", {}) - tag_regex = scope.get("tagRegex") - equipment_tags = { - str(x).strip().lower() - for x in (scope.get("equipmentTags") or []) - if str(x).strip() - } - subsystem_mode = str(scope.get("subsystemMode") or "auto").strip().lower() - subsystem_priority = scope.get("subsystemPriority") or list(DEFAULT_SUBSYSTEM_PRIORITY) - subsystem_include = { - str(x).strip().lower() - for x in (scope.get("subsystemInclude") or []) - if str(x).strip() - } - include_unlinked = bool(scope.get("includeUnlinkedTags", False)) - tag_map: Dict[str, Dict[str, Any]] = {} - - def 
upsert_tag( - *, - tag_path: str, - tag_name: str, - folder_name: str = "", - views: Optional[List[str]] = None, - equipment: Optional[List[str]] = None, - source: str = "unknown", - ) -> None: - path = str(tag_path or "").strip() - if not path: - return - entry = tag_map.setdefault( - path, - { - "path": path, - "name": str(tag_name or _last_segment_from_tag_path(path) or path), - "folder_name": str(folder_name or ""), - "views": [], - "equipment": [], - "source": source, - "bound_to_view": False, - }, - ) - if source == "view_binding": - entry["bound_to_view"] = True - entry["source"] = source - if folder_name and not entry.get("folder_name"): - entry["folder_name"] = str(folder_name) - if tag_name and ( - not entry.get("name") - or entry.get("name") == entry.get("path") - or entry.get("name") == _last_segment_from_tag_path(entry.get("path")) - ): - entry["name"] = str(tag_name) - for view_name in views or []: - v = str(view_name or "").strip() - if v and v not in entry["views"]: - entry["views"].append(v) - for eq_name in equipment or []: - eq = str(eq_name or "").strip() - if eq and eq not in entry["equipment"]: - entry["equipment"].append(eq) - - with self.graph.session() as session: - bound_result = session.run( - """ - MATCH (v:View)-[:HAS_COMPONENT]->(c:ViewComponent)-[r:BINDS_TO]->(n) - WHERE r.tag_path IS NOT NULL - AND trim(r.tag_path) <> '' - AND toLower(coalesce(r.binding_type, 'tag')) = 'tag' - OPTIONAL MATCH (eq:Equipment)-[*1..2]-(n) - RETURN DISTINCT trim(r.tag_path) AS tag_path, - coalesce(n.name, '') AS tag_name, - collect(DISTINCT v.name) AS views, - collect(DISTINCT eq.name) AS equipment - LIMIT $limit - """, - limit=max_tags * 4, - ) - for r in bound_result: - path = str(r["tag_path"] or "").strip() - if not _looks_like_live_tag_path(path): - continue - upsert_tag( - tag_path=path, - tag_name=str(r["tag_name"] or _last_segment_from_tag_path(path)), - folder_name=infer_tag_group(path) or "", - views=[x for x in (r["views"] or []) if x], - 
equipment=[x for x in (r["equipment"] or []) if x], - source="view_binding", - ) - - scada_result = session.run( - """ - MATCH (t:ScadaTag) - WHERE t.opc_item_path IS NOT NULL - AND trim(t.opc_item_path) <> '' - OPTIONAL MATCH (c:ViewComponent)-[:BINDS_TO]->(t) - OPTIONAL MATCH (v:View)-[:HAS_COMPONENT]->(c) - OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) - RETURN DISTINCT trim(t.opc_item_path) AS tag_path, - coalesce(t.name, t.opc_item_path) AS tag_name, - coalesce(t.folder_name, '') AS folder_name, - collect(DISTINCT v.name) AS views, - collect(DISTINCT eq.name) AS equipment - LIMIT $limit - """, - limit=max_tags * 6, - ) - for r in scada_result: - path = str(r["tag_path"] or "").strip() - if not _looks_like_live_tag_path(path): - continue - upsert_tag( - tag_path=path, - tag_name=str(r["tag_name"] or _last_segment_from_tag_path(path)), - folder_name=str(r["folder_name"] or ""), - views=[x for x in (r["views"] or []) if x], - equipment=[x for x in (r["equipment"] or []) if x], - source="scada_tag", + self.llm = None + if bool(os.getenv("ANTHROPIC_API_KEY")): + try: + from claude_client import ClaudeClient + self.llm = ClaudeClient( + enable_tools=False, + ignition_api_url=config.get("ignitionApiUrl"), + ignition_api_token=config.get("ignitionApiToken"), ) + except Exception: + pass + + @property + def agent_state(self) -> str: + if not self._running: + return "stopped" + if self._paused: + return "paused" + return "running" + + @property + def avg_cycle_ms(self) -> int: + if not self._cycle_times: + return 0 + return int(sum(self._cycle_times) / len(self._cycle_times)) + + def update_tags(self, tag_metas: List[Dict[str, Any]]) -> None: + self.tag_metas = list(tag_metas) + + def pause(self) -> None: + self._paused = True + + def resume(self) -> None: + self._paused = False + + def stop(self) -> None: + self._running = False + + # ------------------------------------------------------------------- + # Thread entry point + # 
------------------------------------------------------------------- + def run(self) -> None: + poll_ms = int(self.config.get("pollIntervalMs", 1000)) + self._emit_status("agent_started", "cycle_start") - tags = list(tag_map.values()) + if self._stagger_delay > 0: + self._emit_progress("staggering", f"{int(self._stagger_delay * 1000)}ms") + time.sleep(self._stagger_delay) - if not include_unlinked: - linked = [t for t in tags if (t.get("views") or t.get("equipment") or t.get("bound_to_view"))] - if linked: - tags = linked + while self._running: + if self._paused: + time.sleep(0.5) + continue - if tag_regex: - import re + self._cycle_count += 1 + t0 = time.time() try: - pattern = re.compile(tag_regex, re.IGNORECASE) - tags = [t for t in tags if pattern.search(t["path"]) or pattern.search(t["name"])] - except re.error: + metrics = self._run_cycle() + cycle_ms = int((time.time() - t0) * 1000) + self._cycle_times.append(cycle_ms) + if len(self._cycle_times) > 20: + self._cycle_times = self._cycle_times[-20:] + + self._emit_status( + "cycle_complete", + "ok", + cycle_ms=cycle_ms, + diagnostics=metrics.get("diagnostics", {}), + candidates=metrics.get("candidates", 0), + triaged=metrics.get("triaged", 0), + emitted=metrics.get("emitted", 0), + live_events=metrics.get("liveEvents", []), + ) + except Exception as exc: + cycle_ms = int((time.time() - t0) * 1000) emit("AGENT_ERROR", { "runId": self.run_id, - "code": "invalid_tag_regex", - "message": f"Invalid regex: {tag_regex}", + "subsystemId": self.subsystem_id, + "code": "cycle_error", + "message": str(exc), "recoverable": True, "timestamp": utc_now_iso(), }) + self._emit_status("cycle_error", str(exc), cycle_ms=cycle_ms) - if equipment_tags: - tags = [ - t for t in tags - if t["name"].lower() in equipment_tags - or t["path"].lower() in equipment_tags - or any(str(eq).strip().lower() in equipment_tags for eq in t.get("equipment", [])) - ] - - tags.sort( - key=lambda t: ( - 0 if t.get("bound_to_view") else 1, - 0 if 
(t.get("views") or t.get("equipment")) else 1, - str(t.get("path", "")), - ) - ) - - for tag in tags: - subsystems, primary = derive_subsystems_for_tag( - tag_meta=tag, - subsystem_mode=subsystem_mode, - priority=subsystem_priority, - ) - tag["subsystems"] = subsystems - tag["primary_subsystem"] = primary + elapsed = time.time() - t0 + remaining = max(0, poll_ms / 1000.0 - elapsed) + if remaining > 0 and self._running: + self._emit_progress("waiting", f"{int(remaining * 1000)}ms") + time.sleep(remaining) - if subsystem_include: - tags = [ - t - for t in tags - if any( - s.get("id", "").lower() in subsystem_include - or s.get("name", "").lower() in subsystem_include - for s in (t.get("subsystems") or []) - ) - ] + self._emit_status("agent_stopped", "stopped") - return tags[:max_tags] + # ------------------------------------------------------------------- + # Status emission + # ------------------------------------------------------------------- + def _emit_status( + self, + phase: str, + reason: str, + cycle_ms: int = 0, + diagnostics: Optional[Dict[str, Any]] = None, + candidates: int = 0, + triaged: int = 0, + emitted: int = 0, + live_events: Optional[List[Dict[str, Any]]] = None, + ) -> None: + payload: Dict[str, Any] = { + "runId": self.run_id, + "subsystemId": self.subsystem_id, + "state": self.agent_state, + "cycleMs": cycle_ms, + "candidates": candidates, + "triaged": triaged, + "emitted": emitted, + "diagnostics": { + "phase": phase, + "reason": reason, + "subsystemId": self.subsystem_id, + "subsystemType": self.subsystem_type, + "subsystemName": self.subsystem_name, + "cycleCount": self._cycle_count, + "avgCycleMs": self.avg_cycle_ms, + "totalCandidates": self._total_candidates, + "totalTriaged": self._total_triaged, + "totalEmitted": self._total_emitted, + "tagCount": len(self.tag_metas), + **(diagnostics or {}), + }, + "timestamp": utc_now_iso(), + } + if live_events is not None: + payload["liveEvents"] = live_events + emit("AGENT_STATUS", payload) + # 
------------------------------------------------------------------- + # History fetching (per-agent cache) + # ------------------------------------------------------------------- def _extract_history_values(self, history_data: Any, tag_path: str) -> List[float]: - """Normalize multiple gateway response shapes to numeric values list.""" values: List[float] = [] - if history_data is None: - return values - if isinstance(history_data, dict) and history_data.get("error"): + if history_data is None or (isinstance(history_data, dict) and history_data.get("error")): return values rows: List[Any] = [] @@ -602,10 +467,10 @@ def _extract_history_values(self, history_data: Any, tag_path: str) -> List[floa if isinstance(chunk, list): rows = chunk break - if not rows and "tagHistory" in history_data and isinstance(history_data["tagHistory"], list): + if not rows and isinstance(history_data.get("tagHistory"), list): rows = history_data["tagHistory"] - prefixed = self.api._ensure_provider_prefix(tag_path) if hasattr(self, "api") else tag_path + prefixed = self.api._ensure_provider_prefix(tag_path) if hasattr(self.api, "_ensure_provider_prefix") else tag_path stripped = tag_path if stripped.startswith("[") and "]" in stripped: stripped = stripped[stripped.index("]") + 1:] @@ -621,11 +486,11 @@ def _extract_history_values(self, history_data: Any, tag_path: str) -> List[floa continue candidate = None if "value" in row: - candidate = row.get("value") + candidate = row["value"] else: - matched_key = next((k for k in path_variants if k in row), None) - if matched_key: - candidate = row.get(matched_key) + matched = next((k for k in path_variants if k in row), None) + if matched: + candidate = row[matched] elif len(row) <= 2: for k, v in row.items(): if k.lower() in {"timestamp", "ts", "t", "time"}: @@ -637,34 +502,7 @@ def _extract_history_values(self, history_data: Any, tag_path: str) -> List[floa values.append(val) return values - def fetch_history_values(self, tag_path: str) -> 
tuple[List[float], Optional[str]]: - ttl = float(self.config.get("historyCacheTtlSec", 30)) - now = time.time() - cached = self._history_cache.get(tag_path) - if cached and ttl > 0 and (now - cached["fetched_at"]) < ttl: - return list(cached["values"]), cached.get("error") - - minutes = int(self.config.get("historyWindowMinutes", 360)) - end_dt = datetime.now(timezone.utc) - start_dt = end_dt - timedelta(minutes=minutes) - data = self.api.query_tag_history( - [tag_path], - start_dt.isoformat(), - end_dt.isoformat(), - return_size=max(100, int(self.config.get("minHistoryPoints", 30)) * 4), - aggregation_mode="Average", - return_format="Wide", - ) - if isinstance(data, dict) and data.get("error"): - err = str(data.get("error")) - self._history_cache[tag_path] = {"values": [], "error": err, "fetched_at": now} - return [], err - values = self._extract_history_values(data, tag_path) - self._history_cache[tag_path] = {"values": values, "error": None, "fetched_at": now} - return values, None - - def fetch_history_batch(self, tag_paths: List[str]) -> Dict[str, Tuple[List[float], Optional[str]]]: - """Fetch history for many tags, using cache and batched API calls.""" + def _fetch_history_batch(self, tag_paths: List[str]) -> Dict[str, Tuple[List[float], Optional[str]]]: ttl = float(self.config.get("historyCacheTtlSec", 30)) now = time.time() results: Dict[str, Tuple[List[float], Optional[str]]] = {} @@ -684,35 +522,46 @@ def fetch_history_batch(self, tag_paths: List[str]) -> Dict[str, Tuple[List[floa end_dt = datetime.now(timezone.utc) start_dt = end_dt - timedelta(minutes=minutes) return_size = max(100, int(self.config.get("minHistoryPoints", 30)) * 4) - batch_size = 20 - - for i in range(0, len(uncached), batch_size): - batch = uncached[i : i + batch_size] - data = self.api.query_tag_history( - batch, - start_dt.isoformat(), - end_dt.isoformat(), - return_size=return_size, - aggregation_mode="Average", - return_format="Wide", - ) - fetch_ts = time.time() + + for i in 
range(0, len(uncached), 20): + batch = uncached[i:i + 20] + with _api_semaphore: + data = self.api.query_tag_history( + batch, start_dt.isoformat(), end_dt.isoformat(), + return_size=return_size, aggregation_mode="Average", return_format="Wide", + ) + ts = time.time() if isinstance(data, dict) and data.get("error"): - err = str(data.get("error")) - for path in batch: - results[path] = ([], err) - self._history_cache[path] = {"values": [], "error": err, "fetched_at": fetch_ts} + err = str(data["error"]) + for p in batch: + results[p] = ([], err) + self._history_cache[p] = {"values": [], "error": err, "fetched_at": ts} continue - for path in batch: - values = self._extract_history_values(data, path) - results[path] = (values, None) - self._history_cache[path] = {"values": values, "error": None, "fetched_at": fetch_ts} + for p in batch: + vals = self._extract_history_values(data, p) + results[p] = (vals, None) + self._history_cache[p] = {"values": vals, "error": None, "fetched_at": ts} return results - def get_context(self, tag_path: str) -> Dict[str, Any]: + # ------------------------------------------------------------------- + # Context & triage + # ------------------------------------------------------------------- + def _get_context(self, tag_path: str) -> Dict[str, Any]: + ttl = 120.0 + now = time.time() + cached_ts = self._context_cache_ts.get(tag_path, 0) + if tag_path in self._context_cache and (now - cached_ts) < ttl: + return dict(self._context_cache[tag_path]) + + ctx = self._fetch_context_from_graph(tag_path) + self._context_cache[tag_path] = ctx + self._context_cache_ts[tag_path] = now + return dict(ctx) + + def _fetch_context_from_graph(self, tag_path: str) -> Dict[str, Any]: with self.graph.session() as session: result = session.run( """ @@ -723,1252 +572,746 @@ def get_context(self, tag_path: str) -> Dict[str, Any]: OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) OPTIONAL MATCH (eq)-[:HAS_SYMPTOM]->(s:FaultSymptom) OPTIONAL MATCH 
(s)-[:CAUSED_BY]->(fc:FaultCause) - OPTIONAL MATCH (eq)-[:HAS_PATTERN]->(p:ControlPattern) - OPTIONAL MATCH (eq)-[:SAFETY_CRITICAL]->(se:SafetyElement) RETURN t, collect(DISTINCT v.name) AS views, collect(DISTINCT eq.name) AS equipment, collect(DISTINCT s.symptom) AS symptoms, - collect(DISTINCT fc.cause) AS causes, - collect(DISTINCT p.name) AS patterns, - collect(DISTINCT se.name) AS safety + collect(DISTINCT fc.cause) AS causes LIMIT 1 """, tag=tag_path, ) record = result.single() - fallback_views: List[str] = [] - fallback_equipment: List[str] = [] - fallback_result = session.run( + fallback = session.run( """ MATCH (v:View)-[:HAS_COMPONENT]->(vc:ViewComponent)-[r:BINDS_TO]->(n) WHERE r.tag_path = $tag OPTIONAL MATCH (eq:Equipment)-[*1..2]-(n) - RETURN collect(DISTINCT v.name) AS views, - collect(DISTINCT eq.name) AS equipment + RETURN collect(DISTINCT v.name) AS views, collect(DISTINCT eq.name) AS equipment LIMIT 1 """, tag=tag_path, ).single() - if fallback_result: - fallback_views = [x for x in (fallback_result["views"] or []) if x] - fallback_equipment = [x for x in (fallback_result["equipment"] or []) if x] + fb_views = [x for x in (fallback["views"] or []) if x] if fallback else [] + fb_equip = [x for x in (fallback["equipment"] or []) if x] if fallback else [] if not record: return { "tag_path": tag_path, - "tag_name": _last_segment_from_tag_path(tag_path) or tag_path, - "views": fallback_views, - "equipment": fallback_equipment, + "tag_name": _last_segment(tag_path) or tag_path, + "views": fb_views, "equipment": fb_equip, "group": infer_tag_group(tag_path), - "symptoms": [], - "causes": [], - "patterns": [], - "safety": [], + "symptoms": [], "causes": [], } node = record["t"] return { "tag_path": tag_path, - "tag_name": node.get("name") if node else (_last_segment_from_tag_path(tag_path) or tag_path), - "views": sorted(set([x for x in record["views"] if x] + fallback_views)), - "equipment": sorted(set([x for x in record["equipment"] if x] + 
fallback_equipment)), + "tag_name": node.get("name") if node else (_last_segment(tag_path) or tag_path), + "views": sorted(set([x for x in record["views"] if x] + fb_views)), + "equipment": sorted(set([x for x in record["equipment"] if x] + fb_equip)), "group": infer_tag_group(tag_path, node.get("folder_name") if node else None), "symptoms": [x for x in record["symptoms"] if x], "causes": [x for x in record["causes"] if x], - "patterns": [x for x in record["patterns"] if x], - "safety": [x for x in record["safety"] if x], } - # ----------------------------- - # Triage and persistence - # ----------------------------- - def run_llm_triage( - self, - context: Dict[str, Any], - deterministic: Dict[str, Any], - live_sample: Dict[str, Any], - ) -> Dict[str, Any]: + def _run_llm_triage(self, context: Dict, deterministic: Dict, live_sample: Dict) -> Dict[str, Any]: fallback = { - "summary": f"Deterministic anomaly on {context.get('tag_name', context['tag_path'])}", + "summary": f"Deviation on {context.get('tag_name', context['tag_path'])} in {self.subsystem_name}", "category": deterministic.get("category", "deviation"), "severity": "medium", - "confidence": 0.55, + "confidence": 0.5, "probable_causes": ["Signal deviates from historical baseline."], - "verification_checks": [ - f"Check live quality/timestamp for {context.get('tag_path')}", - "Inspect upstream interlocks and communication health.", - ], - "safety_notes": context.get("safety", []), - "rationale": "LLM triage unavailable; using deterministic fallback.", + "verification_checks": [f"Check {context.get('tag_path')}"], + "safety_notes": [], + "rationale": "Deterministic-only triage.", "related_entities": [ {"label": "Equipment", "name": e} for e in context.get("equipment", [])[:3] - ] + [{"label": "View", "name": v} for v in context.get("views", [])[:2]], + ], } if not self.llm: return fallback - - system_prompt = ( - "You are an industrial anomaly triage assistant. 
" - "Return ONLY valid JSON with keys: summary, category, severity, confidence, " - "probable_causes, verification_checks, safety_notes, rationale, related_entities. " - "Severity must be one of critical/high/medium/low. " - "Category must be one of spike/drift/stuck/state-conflict/quality-issue/deviation. " - "related_entities is a list of objects: {label,name}." - ) - user_prompt = json.dumps( - { - "context": context, - "deterministic": deterministic, - "live_sample": live_sample, - }, - default=str, - ) try: result = self.llm.query_json( - system_prompt=system_prompt, - user_prompt=user_prompt, + system_prompt=( + "You are an industrial anomaly triage assistant. " + "Return ONLY valid JSON with keys: summary, category, severity, confidence, " + "probable_causes, verification_checks, safety_notes, rationale, related_entities." + ), + user_prompt=json.dumps({"context": context, "deterministic": deterministic, "live_sample": live_sample}, default=str), max_tokens=900, use_tools=False, ) data = result.get("data") - if not isinstance(data, dict): - return fallback - merged = dict(fallback) - merged.update({k: v for k, v in data.items() if v is not None}) - return merged - except Exception as exc: - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": "llm_triage_failed", - "message": str(exc), - "recoverable": True, - "timestamp": utc_now_iso(), - }) - return fallback + if isinstance(data, dict): + merged = dict(fallback) + merged.update({k: v for k, v in data.items() if v is not None}) + return merged + except Exception: + pass + return fallback + + # ------------------------------------------------------------------- + # Main cycle + # ------------------------------------------------------------------- + def _emit_progress(self, step: str, detail: str = "") -> None: + emit("AGENT_STATUS", { + "runId": self.run_id, + "subsystemId": self.subsystem_id, + "state": self.agent_state, + "diagnostics": { + "phase": "cycle_progress", + "step": step, + "detail": detail, 
+ "subsystemId": self.subsystem_id, + "cycleCount": self._cycle_count, + }, + "timestamp": utc_now_iso(), + }) - def _severity_from_scores(self, deterministic: Dict[str, Any], llm_out: Dict[str, Any]) -> str: - sev = str(llm_out.get("severity", "")).lower() - if sev in {"critical", "high", "medium", "low"}: - return sev - z = abs(float(deterministic.get("z_score", 0.0))) - if z >= 8: - return "critical" - if z >= 5: - return "high" - if z >= 3: - return "medium" - return "low" - - def is_duplicate_recent(self, dedup_sig: str) -> bool: - cooldown = max(1, int(self.config.get("dedupCooldownMinutes", 10))) - with self.graph.session() as session: - result = session.run( - """ - MATCH (e:AnomalyEvent {dedup_key: $dedup_key}) - WHERE e.created_at IS NOT NULL - AND datetime(e.created_at) > datetime() - duration({minutes: $minutes}) - RETURN count(e) AS cnt - """, - dedup_key=dedup_sig, - minutes=cooldown, - ) - row = result.single() - return bool(row and row["cnt"] > 0) + def _run_cycle(self) -> Dict[str, Any]: + thresholds = self.config.get("thresholds", {}) + stale_sec = int(thresholds.get("stalenessSec", 120)) + min_history = int(self.config.get("minHistoryPoints", 30)) + max_candidates = int(self.config.get("maxCandidatesPerSubsystem", 8)) + max_llm = int(self.config.get("maxLlmTriagesPerSubsystem", 0)) + + tag_paths = [t["path"] for t in self.tag_metas] + if not tag_paths: + return {"candidates": 0, "triaged": 0, "emitted": 0, "diagnostics": {"phase": "cycle_complete", "reason": "no_tags"}} + + self._emit_progress("reading_tags", f"{len(tag_paths)} tags") + t_read = time.time() + with _api_semaphore: + live_values = self.api.read_tags(tag_paths) + read_ms = int((time.time() - t_read) * 1000) + now = datetime.now(timezone.utc) - def persist_event( - self, - context: Dict[str, Any], - deterministic: Dict[str, Any], - live_sample: Dict[str, Any], - triage: Dict[str, Any], - subsystem: Optional[Dict[str, str]] = None, - ) -> Optional[Dict[str, Any]]: - category = 
triage.get("category") or deterministic.get("category", "deviation") - subsystem_ref = subsystem or _subsystem_ref("global", "all") - dedup_source = f"{context['tag_path']}::{subsystem_ref.get('id', 'global:all')}" - dedup_sig = dedup_key(dedup_source, category, int(self.config.get("dedupCooldownMinutes", 10))) - if self.is_duplicate_recent(dedup_sig): - return None - - event_id = f"ae-{uuid.uuid4()}" - severity = self._severity_from_scores(deterministic, triage) - confidence = float(max(0.0, min(1.0, triage.get("confidence", 0.5)))) - event_data = { - "event_id": event_id, - "run_id": self.run_id, - "event_schema_version": 1, - "state": "open", - "severity": severity, - "confidence": confidence, - "category": category, - "summary": triage.get("summary", f"Anomaly on {context['tag_path']}"), - "explanation": triage.get("rationale", ""), - "recommended_checks_json": json.dumps(triage.get("verification_checks", []), default=str), - "probable_causes_json": json.dumps(triage.get("probable_causes", []), default=str), - "safety_notes_json": json.dumps(triage.get("safety_notes", []), default=str), - "deterministic_reasons_json": json.dumps(deterministic.get("reasons", []), default=str), - "z_score": float(deterministic.get("z_score", 0.0)), - "mad_score": float(deterministic.get("mad_score", 0.0)), - "delta_rate": float(deterministic.get("delta_rate", 0.0)), - "window_volatility": float(deterministic.get("window_volatility", 0.0)), - "source_tag": context["tag_path"], - "tag_name": context.get("tag_name") or context["tag_path"], - "subsystem_type": subsystem_ref.get("type"), - "subsystem_name": subsystem_ref.get("name"), - "subsystem_id": subsystem_ref.get("id"), - "live_quality": live_sample.get("quality"), - "live_timestamp": live_sample.get("timestamp"), - "live_value": str(live_sample.get("value")), - "dedup_key": dedup_sig, - "created_at": utc_now_iso(), - "updated_at": utc_now_iso(), + tags_for_history: List[Tuple[Any, Dict[str, Any]]] = [] + live_error_count = 0 + 
quality_filtered = 0 + stale_filtered = 0 + + for idx, tv in enumerate(live_values): + tag_meta = self.tag_metas[idx] if idx < len(self.tag_metas) else {"path": tv.path, "name": tv.path} + if tv.error: + live_error_count += 1 + continue + if not is_quality_good(tv.quality): + quality_filtered += 1 + continue + if is_stale(tv.timestamp, stale_sec, now=now): + stale_filtered += 1 + continue + tags_for_history.append((tv, tag_meta)) + + self._emit_progress("fetching_history", f"{len(tags_for_history)} tags") + history_paths = [tv.path for tv, _ in tags_for_history] + t_hist = time.time() + history_results = self._fetch_history_batch(history_paths) if history_paths else {} + hist_ms = int((time.time() - t_hist) * 1000) + + self._emit_progress("scoring", f"{len(tags_for_history)} tags (read={read_ms}ms hist={hist_ms}ms)") + t_score = time.time() + shift_signal = { + "subsystemId": self.subsystem_id, + "subsystemType": self.subsystem_type, + "subsystemName": self.subsystem_name, + "evaluated": 0, "candidate": 0, "nearShift": 0, + "sumAbsZ": 0.0, "maxAbsZ": 0.0, + "_tagEntries": [], } + candidates: List[Dict] = [] + history_errors = 0 + insufficient_history = 0 + + for tv, tag_meta in tags_for_history: + history, hist_err = history_results.get(tv.path, ([], "No history")) + if hist_err: + history_errors += 1 + continue + if len(history) < min_history and len(history) < 5: + insufficient_history += 1 + continue - with self.graph.session() as session: - session.run( - """ - MATCH (r:AgentRun {run_id: $run_id}) - CREATE (e:AnomalyEvent $props) - MERGE (r)-[:EMITTED]->(e) - """, - run_id=self.run_id, - props=event_data, - ) + prev_val = self._prev_values.get(tv.path) + det = compute_deviation_scores(tv.value, history, prev_value=prev_val, thresholds=thresholds) + curr = safe_float(tv.value) + if curr is not None: + self._prev_values[tv.path] = curr + + abs_z = abs(float(det.get("z_score", 0.0))) + z = float(det.get("z_score", 0.0)) + shift_signal["evaluated"] += 1 + 
shift_signal["sumAbsZ"] += abs_z + if abs_z > shift_signal["maxAbsZ"]: + shift_signal["maxAbsZ"] = abs_z + if abs_z >= 1.5: + shift_signal["nearShift"] += 1 + + tag_name = tv.path.rsplit("/", 1)[-1] if "/" in str(tv.path) else str(tv.path) + cached_hist = self._history_cache.get(tv.path) + sparkline = None + avg_val = None + if cached_hist and cached_hist.get("values"): + vals = cached_hist["values"] + avg_val = round(sum(vals) / len(vals), 2) + if len(vals) <= 20: + sparkline = [round(v, 2) for v in vals] + else: + step = len(vals) / 20 + sparkline = [round(vals[int(i * step)], 2) for i in range(20)] - session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - MATCH (t:ScadaTag) - WHERE t.name = $tag OR t.opc_item_path = $tag - MERGE (e)-[:OBSERVED_ON]->(t) - """, - event_id=event_id, - tag=context["tag_path"], - ) + shift_signal["_tagEntries"].append({ + "path": str(tv.path), "name": tag_name, + "z": round(z, 3), "mad": round(float(det.get("mad_score", 0)), 3), + "value": tv.value, "avg": avg_val, "sparkline": sparkline, + }) - for equipment_name in context.get("equipment", [])[:5]: - session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - MATCH (eq:Equipment {name: $name}) - MERGE (e)-[:AFFECTS]->(eq) - """, - event_id=event_id, - name=equipment_name, - ) + cat = det.get("category", "normal") + if det.get("candidate") and cat != "stuck" and len(candidates) < max_candidates: + shift_signal["candidate"] += 1 + context = self._get_context(tv.path) + context["subsystem"] = _subsystem_ref(self.subsystem_type, self.subsystem_name) + candidates.append({ + "context": context, "deterministic": det, + "live_sample": {"path": tv.path, "value": tv.value, "quality": tv.quality, "timestamp": tv.timestamp}, + }) - if subsystem_ref.get("type") == "view": - session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - MATCH (v:View {name: $name}) - MERGE (e)-[:SCOPED_TO]->(v) - """, - event_id=event_id, - name=subsystem_ref.get("name"), - ) - elif 
subsystem_ref.get("type") == "equipment": - session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - MATCH (eq:Equipment {name: $name}) - MERGE (e)-[:SCOPED_TO]->(eq) - """, - event_id=event_id, - name=subsystem_ref.get("name"), - ) + score_ms = int((time.time() - t_score) * 1000) + + t_triage = time.time() + live_events: List[Dict[str, Any]] = [] + now_iso = utc_now_iso() + for cand in candidates: + det = cand["deterministic"] + ctx = cand["context"] + ls = cand["live_sample"] + severity = "low" + abs_z = abs(float(det.get("z_score", 0))) + if abs_z >= 8: + severity = "critical" + elif abs_z >= 5: + severity = "high" + elif abs_z >= 3: + severity = "medium" + live_events.append({ + "event_id": f"live-{self.subsystem_id}-{ls.get('path', '')}", + "source_tag": ls.get("path", ""), + "tag_name": ctx.get("tag_name") or ls.get("path", ""), + "subsystem_id": self.subsystem_id, + "subsystem_type": self.subsystem_type, + "subsystem_name": self.subsystem_name, + "state": "open", + "severity": severity, + "category": det.get("category", "deviation"), + "summary": f"{det.get('category', 'Deviation')} on {ctx.get('tag_name', '?')} (z={det.get('z_score', 0):.1f})", + "z_score": float(det.get("z_score", 0)), + "mad_score": float(det.get("mad_score", 0)), + "delta_rate": float(det.get("delta_rate", 0)), + "confidence": 0.5, + "deterministic_reasons_json": json.dumps(det.get("reasons", []), default=str), + "live_value": str(ls.get("value")), + "live_quality": ls.get("quality"), + "live_timestamp": ls.get("timestamp"), + "created_at": now_iso, + }) + triage_ms = int((time.time() - t_triage) * 1000) + + self._total_candidates += len(candidates) + self._total_emitted += len(live_events) + + evaluated = max(1, shift_signal["evaluated"]) + tag_entries = shift_signal.pop("_tagEntries", []) + shift_signal["avgAbsZ"] = round(shift_signal["sumAbsZ"] / evaluated, 3) + shift_signal["shiftRatio"] = round(shift_signal["nearShift"] / evaluated, 3) + shift_signal["candidateRatio"] = 
round(shift_signal["candidate"] / evaluated, 3) + shift_signal.pop("sumAbsZ", None) + sorted_tags = sorted(tag_entries, key=lambda t: abs(t.get("z", 0)), reverse=True) + shift_signal["tagSignals"] = sorted_tags + + return { + "candidates": len(candidates), + "triaged": len(live_events), + "emitted": len(live_events), + "liveEvents": live_events, + "diagnostics": { + "phase": "cycle_complete", + "reason": "ok", + "monitoredTags": len(tag_paths), + "liveErrorCount": live_error_count, + "qualityFilteredCount": quality_filtered, + "staleFilteredCount": stale_filtered, + "historyErrorCount": history_errors, + "insufficientHistoryCount": insufficient_history, + "evaluatedCount": shift_signal["evaluated"], + "candidateCount": len(candidates), + "subsystemShiftSignals": [shift_signal], + "timingMs": { + "read": read_ms, + "history": hist_ms, + "score": score_ms, + "triage": triage_ms, + }, + }, + } - related_inputs: List[Dict[str, str]] = [] - for item in triage.get("related_entities", []) or []: - if isinstance(item, dict) and item.get("label") and item.get("name"): - related_inputs.append({"label": str(item["label"]), "name": str(item["name"])}) - for name in context.get("symptoms", [])[:3]: - related_inputs.append({"label": "FaultSymptom", "name": name}) - for name in context.get("causes", [])[:3]: - related_inputs.append({"label": "FaultCause", "name": name}) - - for rel in related_inputs[:8]: - label = rel["label"] - if label not in {"FaultSymptom", "FaultCause", "ControlPattern", "SafetyElement", "Equipment", "ScadaTag", "View"}: - continue - session.run( - f""" - MATCH (e:AnomalyEvent {{event_id: $event_id}}) - MATCH (n:{label}) - WHERE n.name = $name OR n.symptom = $name OR n.cause = $name - MERGE (e)-[:RELATED_TO]->(n) - """, - event_id=event_id, - name=rel["name"], - ) - return event_data +# ═══════════════════════════════════════════════════════════════════════════ +# AgentCoordinator — manages subsystem agents +# 
═══════════════════════════════════════════════════════════════════════════ - def _emit_persisted_event(self, persisted: Dict[str, Any]) -> None: - """Emit normalized AGENT_EVENT payload for UI stream.""" - emit("AGENT_EVENT", { - "runId": self.run_id, - "eventId": persisted["event_id"], - "severity": persisted["severity"], - "summary": persisted["summary"], - "category": persisted.get("category"), - "entityRefs": { - "tag": persisted.get("tag_name") or persisted.get("source_tag"), - "sourceTag": persisted.get("source_tag"), - "subsystemType": persisted.get("subsystem_type"), - "subsystemName": persisted.get("subsystem_name"), - }, - "createdAt": persisted.get("created_at"), - }) +class AgentCoordinator: + """Discovers subsystems, spawns/manages SubsystemAgent threads.""" - def emit_provider_failure_event( - self, - code: str, - message: str, - *, - severity: str = "high", - category: str = "quality-issue", - source_tag: Optional[str] = None, - details: Optional[Dict[str, Any]] = None, - subsystem: Optional[Dict[str, str]] = None, - ) -> bool: - """ - Persist and stream provider-health anomalies so failures appear in feed. - - Returns: - True if a new event was persisted (false if deduped). 
- """ - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": code, - "message": message, - "recoverable": True, - "timestamp": utc_now_iso(), - }) + def __init__(self, config: Dict[str, Any], run_id: Optional[str] = None): + self.config = merge_defaults(config) + self.run_id = run_id or f"agent-{int(time.time() * 1000)}" + from ignition_api_client import IgnitionApiClient + from neo4j_ontology import get_ontology_graph - tag = source_tag or f"provider://{code}" - detail_blob = json.dumps(details or {}, default=str) - context = { - "tag_path": tag, - "tag_name": source_tag or "ProviderHealth", - "equipment": [], - "symptoms": [], - "causes": [], - "patterns": [], - "safety": [], - } - deterministic = { - "candidate": True, - "reasons": [code], - "category": category, - "z_score": 0.0, - "mad_score": 0.0, - "delta_rate": 0.0, - "window_volatility": 0.0, - "history_points": 0, - } - triage = { - "summary": message, - "category": category, - "severity": severity, - "confidence": 0.9, - "probable_causes": [message], - "verification_checks": [ - "Check Ignition gateway connectivity and credentials.", - "Validate tag provider availability and endpoint health.", - ], - "safety_notes": [], - "rationale": f"Provider health event ({code}). 
Details: {detail_blob}", - "related_entities": [], - } - persisted = self.persist_event( - context=context, - deterministic=deterministic, - live_sample={ - "path": tag, - "value": "", - "quality": "Bad", - "timestamp": utc_now_iso(), - "data_type": "provider_health", - }, - triage=triage, - subsystem=subsystem, + self.graph = get_ontology_graph() + self.api = IgnitionApiClient( + base_url=self.config.get("ignitionApiUrl") or os.getenv("IGNITION_API_URL"), + api_token=self.config.get("ignitionApiToken") or os.getenv("IGNITION_API_TOKEN"), + timeout=15.0, ) - if persisted: - self._emit_persisted_event(persisted) - return True - return False + self._running = True + self.agents: Dict[str, SubsystemAgent] = {} - # ----------------------------- - # Monitoring loop - # ----------------------------- - def run_cycle(self) -> Dict[str, Any]: - cycle_start = time.time() - thresholds = self.config.get("thresholds", {}) - stale_threshold_sec = int(thresholds.get("stalenessSec", 120)) - metrics = { - "candidates": 0, - "triaged": 0, - "emitted": 0, - "cycleMs": 0, - "diagnostics": make_default_diagnostics( - staleness_threshold_sec=stale_threshold_sec, - phase="cycle_start", - reason="cycle_initialized", - ), - } - min_history = int(self.config.get("minHistoryPoints", 30)) - max_candidates_total = max(1, int(self.config.get("maxCandidatesPerCycle", 25))) - max_candidates_per_subsystem = max(1, int(self.config.get("maxCandidatesPerSubsystem", 8))) - max_triage_total = max(0, int(self.config.get("maxLlmTriagesPerCycle", 5))) - max_triage_per_subsystem = max(0, int(self.config.get("maxLlmTriagesPerSubsystem", 2))) - - if not self.api.is_configured: - emitted = self.emit_provider_failure_event( - "ignition_not_configured", - "Ignition API URL/token not configured.", - severity="critical", - category="state-conflict", + # ------------------------------------------------------------------- + # Schema / lifecycle + # ------------------------------------------------------------------- 
+ def _init_schema(self) -> None: + self.graph.init_agent_monitoring_schema() + + def _upsert_run(self, status: str, reason: Optional[str] = None) -> None: + with self.graph.session() as session: + session.run( + """ + MERGE (r:AgentRun {run_id: $run_id}) + SET r.status = $status, r.updated_at = datetime(), + r.last_heartbeat_at = datetime(), + r.config_json = $cfg, + r.started_at = coalesce(r.started_at, datetime()), + r.stopped_at = CASE WHEN $status IN ['stopped','failed'] THEN datetime() ELSE r.stopped_at END, + r.stop_reason = CASE WHEN $reason IS NULL THEN r.stop_reason ELSE $reason END + """, + run_id=self.run_id, status=status, + cfg=json.dumps(self.config, default=str), reason=reason, ) - if emitted: - metrics["emitted"] += 1 - metrics["diagnostics"]["phase"] = "cycle_early_exit" - metrics["diagnostics"]["reason"] = "ignition_not_configured" - metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) - return metrics - - tags = self.get_monitored_tags() - if not tags: - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": "no_tags_found", - "message": "No ScadaTag nodes with readable tag paths found.", - "recoverable": True, - "timestamp": utc_now_iso(), + + # ------------------------------------------------------------------- + # Tag discovery + # ------------------------------------------------------------------- + def _fetch_tags(self) -> List[Dict[str, Any]]: + max_tags = int(self.config.get("maxMonitoredTags", 200)) + scope = self.config.get("scope", {}) + subsystem_mode = str(scope.get("subsystemMode") or "auto") + subsystem_priority = scope.get("subsystemPriority") or list(DEFAULT_SUBSYSTEM_PRIORITY) + include_unlinked = bool(scope.get("includeUnlinkedTags", False)) + tag_map: Dict[str, Dict[str, Any]] = {} + + def upsert(*, path: str, name: str, folder: str = "", views: List[str] = None, equipment: List[str] = None, source: str = "unknown"): + path = path.strip() + if not path: + return + entry = tag_map.setdefault(path, { + "path": path, 
"name": name or _last_segment(path) or path, + "folder_name": folder, "views": [], "equipment": [], + "source": source, "bound_to_view": False, }) - metrics["diagnostics"]["phase"] = "cycle_early_exit" - metrics["diagnostics"]["reason"] = "no_tags_found" - metrics["cycleMs"] = int((time.time() - cycle_start) * 1000) - return metrics - - tag_paths = [t["path"] for t in tags] - tag_lookup = {t["path"]: t for t in tags} - linked_tag_count = sum( - 1 for t in tags if (t.get("views") or t.get("equipment")) - ) - unlinked_tag_count = max(0, len(tags) - linked_tag_count) - detected_subsystems = sorted( - { - (t.get("primary_subsystem") or _subsystem_ref("global", "all")).get("id", "global:all") - for t in tags - } - ) + if source == "view_binding": + entry["bound_to_view"] = True + entry["source"] = source + if folder and not entry.get("folder_name"): + entry["folder_name"] = folder + if name and (not entry["name"] or entry["name"] == entry["path"]): + entry["name"] = name + for v in (views or []): + if v and v not in entry["views"]: + entry["views"].append(v) + for e in (equipment or []): + if e and e not in entry["equipment"]: + entry["equipment"].append(e) + + with self.graph.session() as session: + for r in session.run( + """ + MATCH (v:View)-[:HAS_COMPONENT]->(c:ViewComponent)-[r:BINDS_TO]->(n) + WHERE r.tag_path IS NOT NULL AND trim(r.tag_path) <> '' + AND toLower(coalesce(r.binding_type, 'tag')) = 'tag' + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(n) + RETURN DISTINCT trim(r.tag_path) AS tag_path, coalesce(n.name,'') AS tag_name, + collect(DISTINCT v.name) AS views, collect(DISTINCT eq.name) AS equipment + LIMIT $lim + """, lim=max_tags * 4, + ): + p = str(r["tag_path"] or "").strip() + if _looks_like_tag_path(p): + upsert(path=p, name=str(r["tag_name"] or _last_segment(p)), + folder=infer_tag_group(p) or "", + views=[x for x in (r["views"] or []) if x], + equipment=[x for x in (r["equipment"] or []) if x], + source="view_binding") + + for r in session.run( + """ + 
MATCH (t:ScadaTag) WHERE t.opc_item_path IS NOT NULL AND trim(t.opc_item_path) <> '' + OPTIONAL MATCH (c:ViewComponent)-[:BINDS_TO]->(t) + OPTIONAL MATCH (v:View)-[:HAS_COMPONENT]->(c) + OPTIONAL MATCH (eq:Equipment)-[*1..2]-(t) + RETURN DISTINCT trim(t.opc_item_path) AS tag_path, coalesce(t.name,t.opc_item_path) AS tag_name, + coalesce(t.folder_name,'') AS folder_name, + collect(DISTINCT v.name) AS views, collect(DISTINCT eq.name) AS equipment + LIMIT $lim + """, lim=max_tags * 6, + ): + p = str(r["tag_path"] or "").strip() + if _looks_like_tag_path(p): + upsert(path=p, name=str(r["tag_name"] or _last_segment(p)), + folder=str(r["folder_name"] or ""), + views=[x for x in (r["views"] or []) if x], + equipment=[x for x in (r["equipment"] or []) if x], + source="scada_tag") + + tags = list(tag_map.values()) + if not include_unlinked: + linked = [t for t in tags if t.get("views") or t.get("equipment") or t.get("bound_to_view")] + if linked: + tags = linked + + for tag in tags: + subs, primary = derive_subsystems_for_tag(tag, subsystem_mode=subsystem_mode, priority=subsystem_priority) + tag["subsystems"] = subs + tag["primary_subsystem"] = primary + + return tags[:max_tags] - subsystem_tag_map: Dict[str, Dict[str, Any]] = {} + def _discover_subsystems(self) -> Dict[str, Dict[str, Any]]: + tags = self._fetch_tags() + subsystems: Dict[str, Dict[str, Any]] = {} for t in tags: sub = t.get("primary_subsystem") or _subsystem_ref("global", "all") sub_id = sub.get("id", "global:all") - bucket = subsystem_tag_map.setdefault(sub_id, { + bucket = subsystems.setdefault(sub_id, { "type": sub.get("type", "global"), "name": sub.get("name", "all"), "tags": [], }) - bucket["tags"].append({ - "path": t["path"], - "name": t.get("name", t["path"]), - "views": t.get("views", []), - "equipment": t.get("equipment", []), - "allSubsystems": [s.get("id") for s in (t.get("subsystems") or [])], - }) - - live_values = self.api.read_tags(tag_paths) - tool_calls: List[Dict[str, Any]] = [] - 
tool_calls.append({ - "tool": "read_tags", - "request": { - "count": len(tag_paths), - "samplePaths": tag_paths[:8], - }, - "result": { - "count": len(live_values), - "errorCount": sum(1 for tv in live_values if tv.error), - "qualityGoodCount": sum(1 for tv in live_values if is_quality_good(tv.quality)), - "timestampMissingCount": sum(1 for tv in live_values if not tv.timestamp), - "timestampInferredCount": sum( - 1 - for tv in live_values - if isinstance(tv.config, dict) and bool(tv.config.get("timestamp_inferred")) - ), - "sample": [ - { - "path": tv.path, - "value": _preview_value(tv.value), - "quality": tv.quality, - "timestamp": tv.timestamp, - "timestampInferred": bool(tv.config.get("timestamp_inferred")) - if isinstance(tv.config, dict) - else False, - "configKeys": sorted(list(tv.config.keys()))[:8] - if isinstance(tv.config, dict) - else [], - "error": tv.error, - } - for tv in live_values[:5] - ], - }, - }) - candidates: List[Dict[str, Any]] = [] - now = datetime.now(timezone.utc) - live_error_count = 0 - live_error_samples: List[str] = [] - history_error_count = 0 - history_error_samples: List[str] = [] - valid_live_count = 0 - missing_timestamp_count = 0 - inferred_timestamp_count = 0 - quality_filtered_count = 0 - stale_filtered_count = 0 - insufficient_history_count = 0 - low_history_candidate_count = 0 - candidate_subsystem_counts: Dict[str, int] = {} - live_error_linked = 0 - live_error_unlinked = 0 - history_error_linked = 0 - history_error_unlinked = 0 - quality_filtered_linked = 0 - quality_filtered_unlinked = 0 - stale_filtered_linked = 0 - stale_filtered_unlinked = 0 - evaluated_linked = 0 - evaluated_unlinked = 0 - candidate_linked = 0 - candidate_unlinked = 0 - near_shift_count = 0 - near_shift_linked = 0 - near_shift_unlinked = 0 - stale_samples: List[Dict[str, Any]] = [] - subsystem_shift_signals: Dict[str, Dict[str, Any]] = {} - processed_live_count = 0 - total_live_count = len(live_values) - last_progress_emit = 0.0 - - def 
emit_cycle_progress(reason: str, current_tag: str = "", include_tag_map: bool = False) -> None: - nonlocal last_progress_emit - diag = make_default_diagnostics( - staleness_threshold_sec=stale_threshold_sec, - phase="cycle_in_progress", - reason=reason, - ) - diag.update({ - "processedLiveCount": processed_live_count, - "totalLiveCount": total_live_count, - "currentTag": current_tag, - "candidatesSoFar": len(candidates), - "liveErrorCount": live_error_count, - "qualityFilteredCount": quality_filtered_count, - "staleFilteredCount": stale_filtered_count, - "historyErrorCount": history_error_count, - "monitoredTags": len(tags), - "linkedTags": linked_tag_count, - "unlinkedTags": unlinked_tag_count, - "detectedSubsystemCount": len(detected_subsystems), - "detectedSubsystems": detected_subsystems[:10], - }) - if include_tag_map: - diag["subsystemTagMap"] = subsystem_tag_map - emit("AGENT_STATUS", { - "runId": self.run_id, - "state": "running", - "cycleMs": int((time.time() - cycle_start) * 1000), - "candidates": len(candidates), - "triaged": 0, - "emitted": metrics.get("emitted", 0), - "diagnostics": diag, - "timestamp": utc_now_iso(), - }) - last_progress_emit = time.time() - - emit_cycle_progress("cycle_started", include_tag_map=True) - - def _update_subsystem_signal( - subsystem_ref: Dict[str, str], deterministic: Dict[str, Any], - tag_path: str, live_value: Any = None, - ) -> None: - sub_id = subsystem_ref.get("id", "global:all") - abs_z = abs(float(deterministic.get("z_score", 0.0))) - z = float(deterministic.get("z_score", 0.0)) - mad = float(deterministic.get("mad_score", 0.0)) - bucket = subsystem_shift_signals.setdefault( - sub_id, - { - "subsystemId": sub_id, - "subsystemType": subsystem_ref.get("type", "global"), - "subsystemName": subsystem_ref.get("name", "all"), - "evaluated": 0, - "candidate": 0, - "nearShift": 0, - "sumAbsZ": 0.0, - "sumZ": 0.0, - "maxAbsZ": 0.0, - "sampleTag": tag_path, - "_tagEntries": [], - }, - ) - bucket["evaluated"] += 1 - 
bucket["sumAbsZ"] += abs_z - bucket["sumZ"] += z - if abs_z >= 1.5: - bucket["nearShift"] += 1 - if abs_z > bucket["maxAbsZ"]: - bucket["maxAbsZ"] = abs_z - bucket["sampleTag"] = tag_path - tag_name = tag_path.rsplit("/", 1)[-1] if "/" in str(tag_path) else str(tag_path) - bucket["_tagEntries"].append({ - "path": str(tag_path), - "name": tag_name, - "z": round(z, 3), - "absZ": round(abs_z, 3), - "mad": round(mad, 3), - "value": live_value, - }) - - # ---- Phase 1: Filter live values (no I/O) ---- - TagEntry = Tuple[Any, Dict[str, Any], Dict[str, str], bool] # (tv, tag_meta, subsystem, is_linked) - tags_for_history: List[TagEntry] = [] - - for idx, tv in enumerate(live_values): - processed_live_count += 1 - tag_meta = ( - tags[idx] if idx < len(tags) - else tag_lookup.get(tv.path, {"path": tv.path, "name": tv.path}) - ) - subsystem = tag_meta.get("primary_subsystem") or _subsystem_ref("global", "all") - is_linked = bool(tag_meta.get("views") or tag_meta.get("equipment")) - - if tv.error: - live_error_count += 1 - if is_linked: - live_error_linked += 1 - else: - live_error_unlinked += 1 - if len(live_error_samples) < 5: - live_error_samples.append(f"{tv.path}: {tv.error}") - continue - valid_live_count += 1 - if not tv.timestamp: - missing_timestamp_count += 1 - if isinstance(tv.config, dict) and bool(tv.config.get("timestamp_inferred")): - inferred_timestamp_count += 1 - if not is_quality_good(tv.quality): - quality_filtered_count += 1 - if is_linked: - quality_filtered_linked += 1 - else: - quality_filtered_unlinked += 1 - continue - parsed_ts = parse_timestamp(tv.timestamp) - age_sec = (now - parsed_ts).total_seconds() if parsed_ts is not None else None - if is_stale(tv.timestamp, stale_threshold_sec, now=now): - stale_filtered_count += 1 - if is_linked: - stale_filtered_linked += 1 - else: - stale_filtered_unlinked += 1 - if len(stale_samples) < 8: - stale_samples.append({ - "path": tv.path, - "timestampRaw": tv.timestamp, - "timestampParsedUtc": 
parsed_ts.isoformat() if parsed_ts else None, - "ageSec": round(age_sec, 3) if age_sec is not None else None, - "thresholdSec": stale_threshold_sec, - "reason": "timestamp_parse_failed" if parsed_ts is None else "age_exceeds_threshold", - }) - continue - - tags_for_history.append((tv, tag_meta, subsystem, is_linked)) - - emit_cycle_progress( - "filtering_complete", - current_tag=f"{len(tags_for_history)} tags passed filters", + bucket["tags"].append(t) + return subsystems + + # ------------------------------------------------------------------- + # Agent management + # ------------------------------------------------------------------- + def _spawn_agent(self, sub_id: str, info: Dict[str, Any], stagger_delay: float = 0.0) -> SubsystemAgent: + agent = SubsystemAgent( + subsystem_id=sub_id, + subsystem_type=info["type"], + subsystem_name=info["name"], + tag_metas=info["tags"], + graph=self.graph, + api=self.api, + config=self.config, + run_id=self.run_id, + stagger_delay=stagger_delay, ) + agent.start() + self.agents[sub_id] = agent + return agent + + def _stop_agent(self, sub_id: str) -> None: + agent = self.agents.pop(sub_id, None) + if agent: + agent.stop() + + def _stop_all(self) -> None: + for agent in self.agents.values(): + agent.stop() + for agent in list(self.agents.values()): + agent.join(timeout=5) + self.agents.clear() + + # ------------------------------------------------------------------- + # Stdin command reader + # ------------------------------------------------------------------- + def _stdin_reader(self) -> None: + while self._running: + try: + line = sys.stdin.readline() + if not line: + break + line = line.strip() + if not line: + continue + cmd = json.loads(line) + self._handle_command(cmd) + except (json.JSONDecodeError, Exception): + continue - # ---- Phase 2: Batched history fetch ---- - history_fetch_start = time.time() - history_paths = [tv.path for tv, _, _, _ in tags_for_history] - history_results = 
self.fetch_history_batch(history_paths) if history_paths else {} - history_fetch_elapsed = time.time() - history_fetch_start - emit_cycle_progress( - "history_complete", - current_tag=f"{len(history_results)} in {round(history_fetch_elapsed, 1)}s", - ) + def _handle_command(self, cmd: Dict[str, Any]) -> None: + action = cmd.get("cmd", "") + sub_id = cmd.get("subsystemId", "") - # ---- Phase 3: Score and build candidates using pre-fetched history ---- - for tv, tag_meta, subsystem, is_linked in tags_for_history: - history, history_error = history_results.get(tv.path, ([], "No history result")) - - if len(tool_calls) < 18: - tool_calls.append({ - "tool": "query_tag_history", - "request": { - "tagPath": tv.path, - "historyWindowMinutes": int(self.config.get("historyWindowMinutes", 360)), - }, - "result": { - "historyPoints": len(history), - "error": history_error, - }, + if action == "stop-all": + self._running = False + elif action == "stop-agent" and sub_id: + agent = self.agents.get(sub_id) + if agent: + agent.pause() + emit("AGENT_STATUS", { + "runId": self.run_id, "subsystemId": sub_id, + "state": "paused", "diagnostics": {"phase": "agent_paused", "reason": "user_request"}, + "timestamp": utc_now_iso(), }) - if history_error: - history_error_count += 1 - if is_linked: - history_error_linked += 1 - else: - history_error_unlinked += 1 - if len(history_error_samples) < 5: - history_error_samples.append(f"{tv.path}: {history_error}") - continue - if len(history) < min_history: - insufficient_history_count += 1 - if len(history) >= 5: - prev_val = self._prev_values.get(tv.path) - deterministic = compute_deviation_scores( - current_value=tv.value, - history_values=history, - prev_value=prev_val, - thresholds=thresholds, + elif action == "start-agent" and sub_id: + agent = self.agents.get(sub_id) + if agent: + agent.resume() + emit("AGENT_STATUS", { + "runId": self.run_id, "subsystemId": sub_id, + "state": "running", "diagnostics": {"phase": "agent_resumed", "reason": 
"user_request"}, + "timestamp": utc_now_iso(), + }) + elif action == "deep-analyze": + event_data = cmd.get("event", {}) + threading.Thread( + target=self._deep_analyze_inline, + args=(event_data,), + daemon=True, + name="deep-analyze", + ).start() + + # ------------------------------------------------------------------- + # Deep analyze (inline, runs in background thread) + # ------------------------------------------------------------------- + def _deep_analyze_inline(self, event_data: Dict[str, Any]) -> None: + event_id = event_data.get("event_id", "?") + tag_path = event_data.get("source_tag") or event_data.get("tag_name", "") + sub_id = event_data.get("subsystem_id", "") + if not tag_path: + emit("AGENT_EVENT", {"runId": self.run_id, "deepAnalyze": True, + "event": {**event_data, "deep_analyze_error": "No source_tag"}}) + return + agent = self.agents.get(sub_id) if sub_id else None + llm = None + if agent and agent.llm: + llm = agent.llm + else: + if bool(os.getenv("ANTHROPIC_API_KEY")): + try: + from claude_client import ClaudeClient + llm = ClaudeClient( + enable_tools=False, + ignition_api_url=self.config.get("ignitionApiUrl"), + ignition_api_token=self.config.get("ignitionApiToken"), ) - curr_num = safe_float(tv.value) - if curr_num is not None: - self._prev_values[tv.path] = curr_num - - _update_subsystem_signal(subsystem, deterministic, tv.path, live_value=tv.value) - if is_linked: - evaluated_linked += 1 - else: - evaluated_unlinked += 1 - if abs(float(deterministic.get("z_score", 0.0))) >= 1.5: - near_shift_count += 1 - if is_linked: - near_shift_linked += 1 - else: - near_shift_unlinked += 1 - - if deterministic.get("candidate"): - sub_bucket = subsystem_shift_signals.setdefault( - subsystem.get("id", "global:all"), - { - "subsystemId": subsystem.get("id", "global:all"), - "subsystemType": subsystem.get("type", "global"), - "subsystemName": subsystem.get("name", "all"), - "evaluated": 0, - "candidate": 0, - "nearShift": 0, - "sumAbsZ": 0.0, - "sumZ": 
0.0, - "maxAbsZ": 0.0, - "sampleTag": tv.path, - "_tagEntries": [], - }, - ) - sub_bucket["candidate"] += 1 - if is_linked: - candidate_linked += 1 - else: - candidate_unlinked += 1 - deterministic["reasons"] = list(deterministic.get("reasons", [])) + ["low_history_override"] - deterministic["history_quality"] = "low" - context = self.get_context(tv.path) - context["subsystem"] = subsystem - context["subsystems"] = tag_meta.get("subsystems") or [subsystem] - candidates.append( - { - "context": context, - "deterministic": deterministic, - "live_sample": { - "path": tv.path, - "value": tv.value, - "quality": tv.quality, - "timestamp": tv.timestamp, - "data_type": tv.data_type, - }, - "subsystem": subsystem, - } - ) - sub_id = subsystem.get("id", "global:all") - candidate_subsystem_counts[sub_id] = candidate_subsystem_counts.get(sub_id, 0) + 1 - low_history_candidate_count += 1 - continue + except Exception: + pass + if not llm: + emit("AGENT_EVENT", {"runId": self.run_id, "deepAnalyze": True, + "event": {**event_data, "deep_analyze_error": "No LLM available (check ANTHROPIC_API_KEY)"}}) + return - prev_val = self._prev_values.get(tv.path) - deterministic = compute_deviation_scores( - current_value=tv.value, - history_values=history, - prev_value=prev_val, - thresholds=thresholds, - ) - curr_num = safe_float(tv.value) - if curr_num is not None: - self._prev_values[tv.path] = curr_num + det = { + "z_score": event_data.get("z_score", 0), + "mad_score": event_data.get("mad_score", 0), + "delta_rate": event_data.get("delta_rate", 0), + "category": event_data.get("category", "deviation"), + "reasons": json.loads(event_data.get("deterministic_reasons_json", "[]")), + } + context = {"tag_path": tag_path, "tag_name": event_data.get("tag_name", tag_path), + "equipment": [], "views": [], "group": "", "symptoms": [], "causes": []} + live_sample = {"path": tag_path, "value": event_data.get("live_value"), + "quality": event_data.get("live_quality"), "timestamp": 
event_data.get("live_timestamp")} - _update_subsystem_signal(subsystem, deterministic, tv.path, live_value=tv.value) - if is_linked: - evaluated_linked += 1 - else: - evaluated_unlinked += 1 - if abs(float(deterministic.get("z_score", 0.0))) >= 1.5: - near_shift_count += 1 - if is_linked: - near_shift_linked += 1 - else: - near_shift_unlinked += 1 - - if deterministic.get("candidate"): - sub_bucket = subsystem_shift_signals.setdefault( - subsystem.get("id", "global:all"), - { - "subsystemId": subsystem.get("id", "global:all"), - "subsystemType": subsystem.get("type", "global"), - "subsystemName": subsystem.get("name", "all"), - "evaluated": 0, - "candidate": 0, - "nearShift": 0, - "sumAbsZ": 0.0, - "sumZ": 0.0, - "maxAbsZ": 0.0, - "sampleTag": tv.path, - "_tagEntries": [], - }, - ) - sub_bucket["candidate"] += 1 - if is_linked: - candidate_linked += 1 - else: - candidate_unlinked += 1 - context = self.get_context(tv.path) - context["subsystem"] = subsystem - context["subsystems"] = tag_meta.get("subsystems") or [subsystem] - candidates.append( - { - "context": context, - "deterministic": deterministic, - "live_sample": { - "path": tv.path, - "value": tv.value, - "quality": tv.quality, - "timestamp": tv.timestamp, - "data_type": tv.data_type, - }, - "subsystem": subsystem, - } - ) - sub_id = subsystem.get("id", "global:all") - candidate_subsystem_counts[sub_id] = candidate_subsystem_counts.get(sub_id, 0) + 1 - - emit_cycle_progress("scoring_complete") - - if live_values and live_error_count == len(live_values): - emitted = self.emit_provider_failure_event( - "live_tag_provider_failed", - f"Live tag provider failed for all reads ({live_error_count}/{len(live_values)}).", - severity="high", - category="quality-issue", - details={"samples": live_error_samples}, - ) - if emitted: - metrics["emitted"] += 1 - elif live_error_count > 0: - emitted = self.emit_provider_failure_event( - "live_tag_provider_partial_failure", - f"Live tag provider partially failed 
({live_error_count}/{len(live_values)} reads).", - severity="medium", - category="quality-issue", - details={"samples": live_error_samples}, - ) - if emitted: - metrics["emitted"] += 1 - - if valid_live_count > 0 and history_error_count >= max(1, int(valid_live_count * 0.8)): - emitted = self.emit_provider_failure_event( - "history_provider_failed", - f"History provider failed for most queries ({history_error_count}/{valid_live_count}).", - severity="high", - category="quality-issue", - details={"samples": history_error_samples}, - ) - if emitted: - metrics["emitted"] += 1 - elif history_error_count > 0: - emitted = self.emit_provider_failure_event( - "history_provider_partial_failure", - f"History provider partially failed ({history_error_count}/{valid_live_count}).", - severity="medium", - category="quality-issue", - details={"samples": history_error_samples}, - ) - if emitted: - metrics["emitted"] += 1 - - if valid_live_count > 0 and stale_filtered_count >= max(1, int(valid_live_count * 0.8)): - emitted = self.emit_provider_failure_event( - "live_timestamp_stale", - f"Most live samples were stale ({stale_filtered_count}/{valid_live_count}).", - severity="medium", - category="quality-issue", - details={"staleCount": stale_filtered_count, "validLiveCount": valid_live_count}, - ) - if emitted: - metrics["emitted"] += 1 - - if valid_live_count > 0 and quality_filtered_count >= max(1, int(valid_live_count * 0.8)): - emitted = self.emit_provider_failure_event( - "live_quality_bad", - f"Most live samples had non-good quality ({quality_filtered_count}/{valid_live_count}).", - severity="medium", - category="quality-issue", - details={"qualityFilteredCount": quality_filtered_count, "validLiveCount": valid_live_count}, - ) - if emitted: - metrics["emitted"] += 1 - - metrics["candidates"] = len(candidates) - shortlisted: List[Dict[str, Any]] = [] - selected_per_subsystem: Dict[str, int] = {} - for candidate in candidates: - subsystem = candidate.get("subsystem") or 
_subsystem_ref("global", "all") - sub_id = subsystem.get("id", "global:all") - if selected_per_subsystem.get(sub_id, 0) >= max_candidates_per_subsystem: - continue - shortlisted.append(candidate) - selected_per_subsystem[sub_id] = selected_per_subsystem.get(sub_id, 0) + 1 - if len(shortlisted) >= max_candidates_total: - break - - llm_total = 0 - llm_per_subsystem: Dict[str, int] = {} - dedup_suppressed_count = 0 - - if shortlisted: - emit_cycle_progress( - "triage_started", - current_tag=f"{len(shortlisted)} candidates to process", + try: + result = llm.query_json( + system_prompt=( + "You are an industrial anomaly triage assistant. " + "Return ONLY valid JSON with keys: summary, category, severity, confidence, " + "probable_causes, verification_checks, safety_notes, rationale, related_entities." + ), + user_prompt=json.dumps({"context": context, "deterministic": det, "live_sample": live_sample}, default=str), + max_tokens=900, + use_tools=False, ) + data = result.get("data", {}) if isinstance(result, dict) else {} + updated = dict(event_data) + if isinstance(data, dict): + updated["summary"] = data.get("summary", updated.get("summary", "")) + updated["explanation"] = data.get("rationale", updated.get("explanation", "")) + updated["probable_causes_json"] = json.dumps(data.get("probable_causes", [])) + updated["recommended_checks_json"] = json.dumps(data.get("verification_checks", [])) + updated["safety_notes_json"] = json.dumps(data.get("safety_notes", [])) + updated["severity"] = data.get("severity", updated.get("severity", "medium")) + updated["confidence"] = data.get("confidence", updated.get("confidence", 0.5)) + updated["deep_analyzed"] = True + emit("AGENT_EVENT", {"runId": self.run_id, "deepAnalyze": True, "event": updated}) + except Exception as exc: + emit("AGENT_EVENT", {"runId": self.run_id, "deepAnalyze": True, + "event": {**event_data, "deep_analyze_error": str(exc)}}) - for ci, candidate in enumerate(shortlisted): - subsystem = 
candidate.get("subsystem") or _subsystem_ref("global", "all") - sub_id = subsystem.get("id", "global:all") - tag_name = candidate["context"].get("tag_name", candidate["context"].get("tag_path", "?")) - use_llm = ( - llm_total < max_triage_total - and llm_per_subsystem.get(sub_id, 0) < max_triage_per_subsystem - ) - triage = ( - self.run_llm_triage( - candidate["context"], - candidate["deterministic"], - candidate["live_sample"], - ) - if use_llm - else { - "summary": ( - f"Deviation on {candidate['context'].get('tag_name', candidate['context']['tag_path'])} " - f"in subsystem {subsystem.get('name', 'all')}" - ), - "category": candidate["deterministic"].get("category", "deviation"), - "severity": "medium", - "confidence": 0.5, - "verification_checks": [], - "probable_causes": [], - "safety_notes": [], - "rationale": "Deterministic-only triage (LLM triage disabled or cap reached).", - "related_entities": [], - } - ) - if use_llm: - llm_total += 1 - llm_per_subsystem[sub_id] = llm_per_subsystem.get(sub_id, 0) + 1 - metrics["triaged"] += 1 - persisted = self.persist_event( - candidate["context"], - candidate["deterministic"], - candidate["live_sample"], - triage, - subsystem=subsystem, - ) - if persisted: - metrics["emitted"] += 1 - self._emit_persisted_event(persisted) - else: - dedup_suppressed_count += 1 + # ------------------------------------------------------------------- + # Main loop + # ------------------------------------------------------------------- + def run(self) -> int: + self._init_schema() + self._upsert_run("running") - if (ci + 1) % 5 == 0 or ci == len(shortlisted) - 1: - emit_cycle_progress( - "triaging", - current_tag=f"{ci + 1}/{len(shortlisted)} ({tag_name})", - ) + emit("AGENT_STATUS", { + "runId": self.run_id, "state": "running", + "diagnostics": {"phase": "startup", "reason": "coordinator_started"}, + "timestamp": utc_now_iso(), + }) + + subsystems = self._discover_subsystems() + tag_map: Dict[str, Any] = {} + stagger_sec = 1.5 # seconds 
between each agent's first cycle + for idx, (sub_id, info) in enumerate(subsystems.items()): + tag_map[sub_id] = { + "type": info["type"], "name": info["name"], + "tags": [{"path": t["path"], "name": t.get("name", t["path"])} for t in info["tags"]], + } + self._spawn_agent(sub_id, info, stagger_delay=idx * stagger_sec) - top_candidates_by_subsystem = dict( - sorted(candidate_subsystem_counts.items(), key=lambda item: item[1], reverse=True)[:10] - ) - top_shift_signals = sorted( - subsystem_shift_signals.values(), - key=lambda item: ( - int(item.get("candidate", 0)), - float(item.get("maxAbsZ", 0.0)), - int(item.get("nearShift", 0)), - int(item.get("evaluated", 0)), - ), - reverse=True, - ) - sparkline_size = 20 - for item in top_shift_signals: - evaluated = max(1, int(item.get("evaluated", 0))) - item["avgAbsZ"] = round(float(item.get("sumAbsZ", 0.0)) / evaluated, 3) - item["avgZ"] = round(float(item.get("sumZ", 0.0)) / evaluated, 3) - item["shiftRatio"] = round(float(item.get("nearShift", 0)) / evaluated, 3) - item["candidateRatio"] = round(float(item.get("candidate", 0)) / evaluated, 3) - item.pop("sumAbsZ", None) - item.pop("sumZ", None) - raw_tags = item.pop("_tagEntries", []) - sorted_tags = sorted(raw_tags, key=lambda t: t.get("absZ", 0.0), reverse=True) - tag_signals = [] - for t in sorted_tags: - entry = {k: v for k, v in t.items() if k != "absZ"} - cached_hist = self._history_cache.get(t.get("path", "")) - if cached_hist and cached_hist.get("values"): - vals = cached_hist["values"] - entry["avg"] = round(sum(vals) / len(vals), 2) - if len(vals) <= sparkline_size: - entry["sparkline"] = [round(v, 2) for v in vals] - else: - step = len(vals) / sparkline_size - entry["sparkline"] = [round(vals[int(i * step)], 2) for i in range(sparkline_size)] - tag_signals.append(entry) - item["tagSignals"] = tag_signals - - metrics["diagnostics"] = { - **make_default_diagnostics( - staleness_threshold_sec=int(thresholds.get("stalenessSec", 120)), - phase="cycle_complete", - 
reason="ok", - ), - "monitoredTags": len(tag_paths), - "linkedTags": linked_tag_count, - "unlinkedTags": unlinked_tag_count, - "validLiveCount": valid_live_count, - "missingTimestampCount": missing_timestamp_count, - "inferredTimestampCount": inferred_timestamp_count, - "liveErrorCount": live_error_count, - "liveErrorLinked": live_error_linked, - "liveErrorUnlinked": live_error_unlinked, - "qualityFilteredCount": quality_filtered_count, - "qualityFilteredLinked": quality_filtered_linked, - "qualityFilteredUnlinked": quality_filtered_unlinked, - "staleFilteredCount": stale_filtered_count, - "staleFilteredLinked": stale_filtered_linked, - "staleFilteredUnlinked": stale_filtered_unlinked, - "historyErrorCount": history_error_count, - "historyErrorLinked": history_error_linked, - "historyErrorUnlinked": history_error_unlinked, - "insufficientHistoryCount": insufficient_history_count, - "lowHistoryCandidateCount": low_history_candidate_count, - "evaluatedLinked": evaluated_linked, - "evaluatedUnlinked": evaluated_unlinked, - "candidateLinked": candidate_linked, - "candidateUnlinked": candidate_unlinked, - "nearShiftCount": near_shift_count, - "nearShiftLinked": near_shift_linked, - "nearShiftUnlinked": near_shift_unlinked, - "stalenessThresholdSec": int(thresholds.get("stalenessSec", 120)), - "staleSamples": stale_samples, - "timestampParseNote": "Naive timestamps are treated as local time by parse_timestamp().", - "detectedSubsystemCount": len(detected_subsystems), - "detectedSubsystems": detected_subsystems[:10], - "subsystemTagMap": subsystem_tag_map, - "candidateSubsystemCount": len(candidate_subsystem_counts), - "candidateBySubsystem": top_candidates_by_subsystem, - "subsystemShiftSignals": top_shift_signals, - "maxCandidatesPerSubsystem": max_candidates_per_subsystem, - "maxLlmTriagesPerSubsystem": max_triage_per_subsystem, - "llmTriagedCount": llm_total, - "dedupSuppressedCount": dedup_suppressed_count, - "toolCalls": tool_calls, - } - metrics["cycleMs"] = 
int((time.time() - cycle_start) * 1000) - return metrics - - def cleanup_retention(self) -> int: - retention_days = int(self.config.get("retentionDays", 14)) - return self.graph.cleanup_anomaly_events(retention_days=retention_days) - - def run_forever(self) -> int: - self.init_schema() - self.upsert_run("running") - startup_diag = make_default_diagnostics( - staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), - phase="startup", - reason="worker_started", - ) emit("AGENT_STATUS", { - "runId": self.run_id, - "state": "running", - "cycleMs": 0, - "candidates": 0, - "triaged": 0, - "emitted": 0, - "diagnostics": startup_diag, + "runId": self.run_id, "state": "running", + "diagnostics": { + "phase": "agents_started", + "reason": f"{len(self.agents)} subsystem agents spawned", + "subsystemTagMap": tag_map, + "agentCount": len(self.agents), + "agentIds": list(self.agents.keys()), + }, "timestamp": utc_now_iso(), }) - poll_ms = int(self.config.get("pollIntervalMs", 1000)) + stdin_thread = threading.Thread(target=self._stdin_reader, daemon=True, name="stdin-reader") + stdin_thread.start() + + rediscovery_interval = float(self.config.get("rediscoveryIntervalSec", 60)) cleanup_every = max(1, int(self.config.get("cleanupEveryCycles", 40))) - exit_code = 0 - reason = "stopped" + last_rediscovery = time.time() + watchdog_count = 0 while self._running: - self._cycle_count += 1 - cycle_started = time.time() - try: - metrics = self.run_cycle() - self.heartbeat(metrics) - emit("AGENT_STATUS", { - "runId": self.run_id, - "state": "running", - "cycleMs": metrics["cycleMs"], - "candidates": metrics["candidates"], - "triaged": metrics["triaged"], - "emitted": metrics["emitted"], - "diagnostics": metrics.get("diagnostics", {}), - "timestamp": utc_now_iso(), - }) - if self._cycle_count % cleanup_every == 0: - deleted = self.cleanup_retention() - if deleted > 0: - cleanup_diag = make_default_diagnostics( - 
staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), - phase="retention_cleanup", - reason="cleanup_complete", - ) - cleanup_diag["emittedCleanupCount"] = deleted + time.sleep(2) + watchdog_count += 1 + + if time.time() - last_rediscovery >= rediscovery_interval: + try: + new_subs = self._discover_subsystems() + new_ids = set(new_subs.keys()) + old_ids = set(self.agents.keys()) + + for sub_id in new_ids - old_ids: + info = new_subs[sub_id] + self._spawn_agent(sub_id, info) emit("AGENT_STATUS", { - "runId": self.run_id, - "state": "retention_cleanup", - "cycleMs": 0, - "candidates": 0, - "triaged": 0, - "emitted": deleted, - "diagnostics": cleanup_diag, + "runId": self.run_id, "subsystemId": sub_id, "state": "running", + "diagnostics": {"phase": "agent_discovered", "reason": "new_subsystem"}, "timestamp": utc_now_iso(), }) - except Exception as exc: - reason = "failed" - exit_code = 1 - emit("AGENT_ERROR", { - "runId": self.run_id, - "code": "cycle_error", - "message": str(exc), - "recoverable": True, - "timestamp": utc_now_iso(), - }) - error_diag = make_default_diagnostics( - staleness_threshold_sec=int(self.config.get("thresholds", {}).get("stalenessSec", 120)), - phase="cycle_error", - reason="exception", - ) - error_diag["errorMessage"] = str(exc) - emit("AGENT_STATUS", { - "runId": self.run_id, - "state": "running", - "cycleMs": int((time.time() - cycle_started) * 1000), - "candidates": 0, - "triaged": 0, - "emitted": 0, - "diagnostics": error_diag, - "timestamp": utc_now_iso(), - }) - elapsed_ms = int((time.time() - cycle_started) * 1000) - remaining = max(0, poll_ms - elapsed_ms) / 1000.0 - if remaining > 0: - time.sleep(remaining) + for sub_id in old_ids & new_ids: + agent = self.agents.get(sub_id) + if agent: + agent.update_tags(new_subs[sub_id]["tags"]) + + tag_map = {} + for sub_id, info in new_subs.items(): + tag_map[sub_id] = { + "type": info["type"], "name": info["name"], + "tags": [{"path": t["path"], "name": 
t.get("name", t["path"])} for t in info["tags"]], + } + emit("AGENT_STATUS", { + "runId": self.run_id, "state": "running", + "diagnostics": { + "phase": "rediscovery_complete", + "reason": f"{len(new_subs)} subsystems", + "subsystemTagMap": tag_map, + "agentCount": len(self.agents), + }, + "timestamp": utc_now_iso(), + }) + except Exception as exc: + emit("AGENT_ERROR", { + "runId": self.run_id, "code": "rediscovery_error", + "message": str(exc), "recoverable": True, "timestamp": utc_now_iso(), + }) + last_rediscovery = time.time() - self.upsert_run("stopped" if reason != "failed" else "failed", reason=reason) + if watchdog_count % cleanup_every == 0: + try: + deleted = self.graph.cleanup_anomaly_events(int(self.config.get("retentionDays", 14))) + if deleted > 0: + emit("AGENT_STATUS", { + "runId": self.run_id, "state": "running", + "diagnostics": {"phase": "retention_cleanup", "reason": f"deleted {deleted} old events"}, + "timestamp": utc_now_iso(), + }) + except Exception: + pass + + self._stop_all() + self._upsert_run("stopped", reason="stopped") emit("AGENT_COMPLETE", { - "runId": self.run_id, - "success": exit_code == 0, - "reason": reason, - "stoppedAt": utc_now_iso(), + "runId": self.run_id, "success": True, "reason": "stopped", "stoppedAt": utc_now_iso(), }) - return exit_code + return 0 - # ----------------------------- - # Single-operation helpers - # ----------------------------- - def list_events(self, limit: int, state: Optional[str], severity: Optional[str], run_id: Optional[str]) -> Dict[str, Any]: - events = self.graph.list_anomaly_events(limit=limit, state=state, severity=severity, run_id=run_id) - return {"success": True, "events": events} + # ------------------------------------------------------------------- + # Single-operation helpers (for CLI) + # ------------------------------------------------------------------- + def list_events(self, limit: int, state: Optional[str] = None, severity: Optional[str] = None, run_id: Optional[str] = None) -> 
Dict: + return {"success": True, "events": self.graph.list_anomaly_events(limit=limit, state=state, severity=severity, run_id=run_id)} - def get_event(self, event_id: str) -> Dict[str, Any]: + def get_event(self, event_id: str) -> Dict: event = self.graph.get_anomaly_event(event_id) - if not event: - return {"success": False, "error": f"Event not found: {event_id}"} - return {"success": True, "event": event} + return {"success": True, "event": event} if event else {"success": False, "error": f"Not found: {event_id}"} - def ack_event(self, event_id: str, note: Optional[str]) -> Dict[str, Any]: + def ack_event(self, event_id: str, note: Optional[str] = None) -> Dict: with self.graph.session() as session: - result = session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - SET e.state = 'acknowledged', - e.acknowledged_at = datetime(), - e.ack_note = $note, - e.updated_at = datetime() - RETURN count(e) AS cnt - """, - event_id=event_id, - note=note or "", - ) - record = result.single() - if not record or record["cnt"] == 0: - return {"success": False, "error": f"Event not found: {event_id}"} + row = session.run( + "MATCH (e:AnomalyEvent {event_id: $eid}) SET e.state='acknowledged', e.acknowledged_at=datetime(), e.ack_note=$note, e.updated_at=datetime() RETURN count(e) AS cnt", + eid=event_id, note=note or "", + ).single() + if not row or row["cnt"] == 0: + return {"success": False, "error": f"Not found: {event_id}"} return {"success": True, "eventId": event_id} - def clear_event(self, event_id: str, note: Optional[str]) -> Dict[str, Any]: + def clear_event(self, event_id: str, note: Optional[str] = None) -> Dict: with self.graph.session() as session: - result = session.run( - """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - SET e.state = 'cleared', - e.cleared_at = datetime(), - e.clear_note = $note, - e.updated_at = datetime() - RETURN count(e) AS cnt - """, - event_id=event_id, - note=note or "", - ) - record = result.single() - if not record or 
record["cnt"] == 0: - return {"success": False, "error": f"Event not found: {event_id}"} + row = session.run( + "MATCH (e:AnomalyEvent {event_id: $eid}) SET e.state='cleared', e.cleared_at=datetime(), e.clear_note=$note, e.updated_at=datetime() RETURN count(e) AS cnt", + eid=event_id, note=note or "", + ).single() + if not row or row["cnt"] == 0: + return {"success": False, "error": f"Not found: {event_id}"} return {"success": True, "eventId": event_id} - def deep_analyze(self, event_id: str) -> Dict[str, Any]: - """Run LLM triage on an existing event and update it in-place.""" + def deep_analyze(self, event_id: str) -> Dict: event = self.graph.get_anomaly_event(event_id) if not event: - return {"success": False, "error": f"Event not found: {event_id}"} - + return {"success": False, "error": f"Not found: {event_id}"} tag_path = event.get("source_tag") or event.get("tag_name", "") if not tag_path: return {"success": False, "error": "Event has no source_tag"} - context = self.get_context(tag_path) - context["subsystem"] = { - "id": event.get("subsystem_id", "global:all"), - "type": event.get("subsystem_type", "global"), - "name": event.get("subsystem_name", "all"), - } + temp_agent = SubsystemAgent( + subsystem_id=event.get("subsystem_id", "global:all"), + subsystem_type=event.get("subsystem_type", "global"), + subsystem_name=event.get("subsystem_name", "all"), + tag_metas=[], graph=self.graph, api=self.api, + config=self.config, run_id=self.run_id, + ) + if not temp_agent.llm: + return {"success": False, "error": "LLM client not configured"} - deterministic = { + context = temp_agent._get_context(tag_path) + context["subsystem"] = _subsystem_ref(event.get("subsystem_type", "global"), event.get("subsystem_name", "all")) + det = { "candidate": True, "z_score": float(event.get("z_score", 0)), "mad_score": float(event.get("mad_score", 0)), @@ -1977,134 +1320,81 @@ def deep_analyze(self, event_id: str) -> Dict[str, Any]: "reasons": 
json.loads(event.get("deterministic_reasons_json", "[]")), "category": event.get("category", "deviation"), } + live = {"value": event.get("live_value"), "quality": event.get("live_quality"), "timestamp": event.get("live_timestamp")} + triage = temp_agent._run_llm_triage(context, det, live) + severity = SubsystemAgent._severity_from_scores(det, triage) - live_sample = { - "value": event.get("live_value"), - "quality": event.get("live_quality"), - "timestamp": event.get("live_timestamp"), - } - - if not self.llm: - return {"success": False, "error": "LLM client not configured"} - - triage = self.run_llm_triage(context, deterministic, live_sample) - - severity = self._severity_from_scores(deterministic, triage) with self.graph.session() as session: session.run( """ - MATCH (e:AnomalyEvent {event_id: $event_id}) - SET e.summary = $summary, - e.explanation = $explanation, - e.severity = $severity, - e.confidence = $confidence, - e.recommended_checks_json = $checks, - e.probable_causes_json = $causes, - e.safety_notes_json = $safety, - e.updated_at = $updated_at, - e.llm_triaged = true - RETURN e + MATCH (e:AnomalyEvent {event_id: $eid}) + SET e.summary=$summary, e.explanation=$expl, e.severity=$sev, + e.confidence=$conf, e.recommended_checks_json=$checks, + e.probable_causes_json=$causes, e.safety_notes_json=$safety, + e.updated_at=$ts, e.llm_triaged=true """, - event_id=event_id, - summary=triage.get("summary", ""), - explanation=triage.get("rationale", ""), - severity=severity, - confidence=float(max(0.0, min(1.0, triage.get("confidence", 0.5)))), + eid=event_id, summary=triage.get("summary", ""), + expl=triage.get("rationale", ""), sev=severity, + conf=float(max(0.0, min(1.0, triage.get("confidence", 0.5)))), checks=json.dumps(triage.get("verification_checks", []), default=str), causes=json.dumps(triage.get("probable_causes", []), default=str), safety=json.dumps(triage.get("safety_notes", []), default=str), - updated_at=utc_now_iso(), + ts=utc_now_iso(), ) + return 
{"success": True, "event": self.graph.get_anomaly_event(event_id)} - updated_event = self.graph.get_anomaly_event(event_id) - return {"success": True, "event": updated_event} - - def get_status(self, run_id: str) -> Dict[str, Any]: + def get_status(self, run_id: str) -> Dict: with self.graph.session() as session: - result = session.run( - """ - MATCH (r:AgentRun {run_id: $run_id}) - RETURN r - LIMIT 1 - """, - run_id=run_id, - ) - row = result.single() + row = session.run("MATCH (r:AgentRun {run_id: $rid}) RETURN r LIMIT 1", rid=run_id).single() if not row: return {"success": False, "error": f"Run not found: {run_id}"} props = dict(row["r"]) return { - "success": True, - "status": props.get("status"), + "success": True, "status": props.get("status"), "metrics": { "cycleCount": props.get("cycle_count", 0), "lastCycleMs": props.get("last_cycle_ms", 0), - "lastCandidates": props.get("last_candidates", 0), - "lastTriaged": props.get("last_triaged", 0), - "lastEmitted": props.get("last_emitted", 0), }, - "lastHeartbeatAt": props.get("last_heartbeat_at"), - "run": props, + "lastHeartbeatAt": props.get("last_heartbeat_at"), "run": props, } -def _load_fixture_cases(path: Path) -> List[Dict[str, Any]]: - data = json.loads(path.read_text(encoding="utf-8")) - if isinstance(data, dict): - return data.get("cases", []) - if isinstance(data, list): - return data - return [] - +# ═══════════════════════════════════════════════════════════════════════════ +# Fixture replay (standalone, no agent needed) +# ═══════════════════════════════════════════════════════════════════════════ def replay_fixtures(config_json: Optional[str], fixture_path: str) -> Dict[str, Any]: config = merge_defaults(json.loads(config_json) if config_json else {}) - path = Path(fixture_path) - cases = _load_fixture_cases(path) + cases = json.loads(Path(fixture_path).read_text(encoding="utf-8")) + if isinstance(cases, dict): + cases = cases.get("cases", []) thresholds = config.get("thresholds", {}) passed = 0 - 
failures: List[Dict[str, Any]] = [] - + failures: List[Dict] = [] for case in cases: - result = compute_deviation_scores( - current_value=case.get("current_value"), - history_values=case.get("history_values", []), - prev_value=case.get("prev_value"), - thresholds=thresholds, - ) + result = compute_deviation_scores(case.get("current_value"), case.get("history_values", []), + prev_value=case.get("prev_value"), thresholds=thresholds) expected = bool(case.get("expected_candidate", False)) if result.get("candidate") == expected: passed += 1 else: - failures.append( - { - "id": case.get("id"), - "expected_candidate": expected, - "actual_candidate": result.get("candidate"), - "category": result.get("category"), - "reasons": result.get("reasons", []), - } - ) + failures.append({"id": case.get("id"), "expected": expected, "actual": result.get("candidate"), "reasons": result.get("reasons", [])}) + return {"success": len(failures) == 0, "total": len(cases), "passed": passed, "failed": len(failures), "failures": failures} - return { - "success": len(failures) == 0, - "total": len(cases), - "passed": passed, - "failed": len(failures), - "failures": failures, - } +# ═══════════════════════════════════════════════════════════════════════════ +# CLI entry point +# ═══════════════════════════════════════════════════════════════════════════ def main() -> int: - parser = argparse.ArgumentParser(description="Anomaly monitor worker") + parser = argparse.ArgumentParser(description="Per-subsystem anomaly monitor") sub = parser.add_subparsers(dest="command", required=True) - p_run = sub.add_parser("run", help="Run continuous anomaly monitoring") - p_run.add_argument("--run-id", help="Optional run id") - p_run.add_argument("--config-json", default="{}", help="JSON config string") + p_run = sub.add_parser("run", help="Run coordinator with per-subsystem agents") + p_run.add_argument("--run-id") + p_run.add_argument("--config-json", default="{}") - p_status = sub.add_parser("status", 
help="Get status for one run") - p_status.add_argument("--run-id", required=True) + sub.add_parser("status", help="Get run status").add_argument("--run-id", required=True) p_list = sub.add_parser("list-events", help="List anomaly events") p_list.add_argument("--limit", type=int, default=100) @@ -2112,36 +1402,33 @@ def main() -> int: p_list.add_argument("--severity") p_list.add_argument("--run-id") - p_get = sub.add_parser("get-event", help="Get one anomaly event") - p_get.add_argument("--event-id", required=True) + sub.add_parser("get-event", help="Get one event").add_argument("--event-id", required=True) - p_ack = sub.add_parser("ack-event", help="Acknowledge one anomaly event") + p_ack = sub.add_parser("ack-event", help="Acknowledge event") p_ack.add_argument("--event-id", required=True) p_ack.add_argument("--note") - p_clear = sub.add_parser("clear-event", help="Clear one acknowledged anomaly event") + p_clear = sub.add_parser("clear-event", help="Clear event") p_clear.add_argument("--event-id", required=True) p_clear.add_argument("--note") - p_deep = sub.add_parser("deep-analyze", help="Run LLM triage on an existing event") + p_deep = sub.add_parser("deep-analyze", help="LLM triage on existing event") p_deep.add_argument("--event-id", required=True) - p_cleanup = sub.add_parser("cleanup", help="Delete old anomaly events") - p_cleanup.add_argument("--retention-days", type=int, default=14) + sub.add_parser("cleanup", help="Delete old events").add_argument("--retention-days", type=int, default=14) - p_replay = sub.add_parser("replay-fixtures", help="Validate deterministic scoring against fixtures") + p_replay = sub.add_parser("replay-fixtures", help="Validate scoring") p_replay.add_argument("--fixture-file", required=True) p_replay.add_argument("--config-json", default="{}") args = parser.parse_args() if args.command == "replay-fixtures": - result = replay_fixtures(args.config_json, args.fixture_file) - print(json.dumps(result)) - return 0 if result["success"] 
else 1 + print(json.dumps(replay_fixtures(args.config_json, args.fixture_file))) + return 0 try: - monitor = AnomalyMonitor( + coordinator = AgentCoordinator( config=json.loads(getattr(args, "config_json", "{}") or "{}"), run_id=getattr(args, "run_id", None), ) @@ -2150,46 +1437,28 @@ def main() -> int: return 1 if args.command == "run": - def _signal_handler(_signum, _frame): - monitor._running = False - - signal.signal(signal.SIGTERM, _signal_handler) + signal.signal(signal.SIGTERM, lambda *_: setattr(coordinator, '_running', False)) if hasattr(signal, "SIGINT"): - signal.signal(signal.SIGINT, _signal_handler) - return monitor.run_forever() + signal.signal(signal.SIGINT, lambda *_: setattr(coordinator, '_running', False)) + return coordinator.run() if args.command == "status": - print(json.dumps(monitor.get_status(args.run_id), default=str)) - return 0 - - if args.command == "list-events": - print(json.dumps(monitor.list_events(args.limit, args.state, args.severity, args.run_id), default=str)) - return 0 - - if args.command == "get-event": - print(json.dumps(monitor.get_event(args.event_id), default=str)) - return 0 - - if args.command == "ack-event": - print(json.dumps(monitor.ack_event(args.event_id, args.note), default=str)) - return 0 - - if args.command == "clear-event": - print(json.dumps(monitor.clear_event(args.event_id, args.note), default=str)) - return 0 - - if args.command == "deep-analyze": - print(json.dumps(monitor.deep_analyze(args.event_id), default=str)) - return 0 - - if args.command == "cleanup": - deleted = monitor.graph.cleanup_anomaly_events(args.retention_days) + print(json.dumps(coordinator.get_status(args.run_id), default=str)) + elif args.command == "list-events": + print(json.dumps(coordinator.list_events(args.limit, args.state, args.severity, getattr(args, "run_id", None)), default=str)) + elif args.command == "get-event": + print(json.dumps(coordinator.get_event(args.event_id), default=str)) + elif args.command == "ack-event": + 
print(json.dumps(coordinator.ack_event(args.event_id, args.note), default=str)) + elif args.command == "clear-event": + print(json.dumps(coordinator.clear_event(args.event_id, args.note), default=str)) + elif args.command == "deep-analyze": + print(json.dumps(coordinator.deep_analyze(args.event_id), default=str)) + elif args.command == "cleanup": + deleted = coordinator.graph.cleanup_anomaly_events(args.retention_days) print(json.dumps({"success": True, "deleted": deleted})) - return 0 - - return 1 + return 0 if __name__ == "__main__": sys.exit(main()) - diff --git a/scripts/neo4j_ontology.py b/scripts/neo4j_ontology.py index 380e3cb..92f6258 100644 --- a/scripts/neo4j_ontology.py +++ b/scripts/neo4j_ontology.py @@ -192,6 +192,9 @@ def create_indexes(self) -> None: "CREATE INDEX hmitextlist_name IF NOT EXISTS FOR (htl:HMITextList) ON (htl.name)", "CREATE INDEX plctagtable_name IF NOT EXISTS FOR (pt:PLCTagTable) ON (pt.name)", "CREATE INDEX plctag_name IF NOT EXISTS FOR (ptg:PLCTag) ON (ptg.name)", + # ScadaTag lookup indexes (used by agent persist queries) + "CREATE INDEX scadatag_name IF NOT EXISTS FOR (t:ScadaTag) ON (t.name)", + "CREATE INDEX scadatag_opc_item_path IF NOT EXISTS FOR (t:ScadaTag) ON (t.opc_item_path)", # Agent monitoring indexes "CREATE INDEX anomalyevent_created IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.created_at)", "CREATE INDEX anomalyevent_state IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.state)", From 9db49f2d583671cb16ca5b58d63b51d5543bcbd0 Mon Sep 17 00:00:00 2001 From: Leor Barak Fishman Date: Tue, 10 Mar 2026 13:27:39 -0700 Subject: [PATCH 18/18] p and id intake --- electron-ui/graph-renderer.js | 16 + electron-ui/index.html | 62 ++++ electron-ui/main.js | 48 ++- electron-ui/preload.js | 4 + electron-ui/renderer.js | 133 ++++++++- requirements.txt | 8 +- scripts/artifact_ingest.py | 478 ++++++++++++++++++++++++++++++ scripts/artifact_linker.py | 508 ++++++++++++++++++++++++++++++++ scripts/artifact_models.py | 171 +++++++++++ 
scripts/gpt54_client.py | 378 ++++++++++++++++++++++++ scripts/graph_api.py | 57 +++- scripts/incremental_analyzer.py | 51 ++++ scripts/neo4j_ontology.py | 304 +++++++++++++++++++ scripts/process_semantics.py | 170 +++++++++++ 14 files changed, 2379 insertions(+), 9 deletions(-) create mode 100644 scripts/artifact_ingest.py create mode 100644 scripts/artifact_linker.py create mode 100644 scripts/artifact_models.py create mode 100644 scripts/gpt54_client.py create mode 100644 scripts/process_semantics.py diff --git a/electron-ui/graph-renderer.js b/electron-ui/graph-renderer.js index ee5ec51..6441750 100644 --- a/electron-ui/graph-renderer.js +++ b/electron-ui/graph-renderer.js @@ -838,6 +838,17 @@ class GraphRenderer { 'HMIScreen': 'siemens-hmi', 'hmiscreen': 'siemens-hmi', 'PLCTagTable': 'plc', 'plctagtable': 'plc', 'PLCTag': 'plc', 'plctag': 'plc', + // Process-semantic layer + 'ProcessMedium': 'process', 'processmedium': 'process', + 'UnitOperation': 'process', 'unitoperation': 'process', + 'OperatingEnvelope': 'process', 'operatingenvelope': 'process', + 'PhysicalPrinciple': 'process', 'physicalprinciple': 'process', + 'ChemicalSpecies': 'process', 'chemicalspecies': 'process', + 'Reaction': 'process', 'reaction': 'process', + // Anomaly + safety + 'AgentRun': 'anomaly', 'agentrun': 'anomaly', + 'AnomalyEvent': 'anomaly', 'anomalyevent': 'anomaly', + 'SafetyElement': 'safety', 'safetyelement': 'safety', }; return typeMap[type] || 'other'; } @@ -855,6 +866,11 @@ class GraphRenderer { 'mes': '#00897B', 'siemens': '#0288D1', 'siemens-hmi': '#0097A7', + 'process': '#00ACC1', + 'anomaly': '#F44336', + 'safety': '#D32F2F', + 'patterns': '#795548', + 'flows': '#E91E63', 'other': '#9E9E9E' }; const group = this._getGroupForType(type); diff --git a/electron-ui/index.html b/electron-ui/index.html index 0f6e719..d09dac3 100644 --- a/electron-ui/index.html +++ b/electron-ui/index.html @@ -219,6 +219,33 @@

Workbench Backup

+
+
+ + + + + + + +
+

P&IDs / SOPs / Diagrams

+

Import process documents — GPT-5.4 extracts equipment, media, operations, and links them to the ontology

+
+ +
+
+ + +
+
+ +
+
@@ -708,6 +735,7 @@

Ontology Graph

+
@@ -803,6 +831,24 @@

Ontology Graph

HMI (Alarm / Screen / Script)
+
+
Process Layer
+
+ + Medium / Operation / Envelope +
+
+ + Principle / Species / Reaction +
+
+
+
Anomalies
+
+ + Agent Run / Event +
+
Other
@@ -964,6 +1010,14 @@

Add New Node

+ + + + + + + +
@@ -1478,6 +1532,14 @@

Graph: Node

Siemens HMI
+
+ + Process (Medium/Op/Principle) +
+
+ + Anomalies +
diff --git a/electron-ui/main.js b/electron-ui/main.js index ae5c557..e9aa32d 100644 --- a/electron-ui/main.js +++ b/electron-ui/main.js @@ -376,9 +376,11 @@ async function stopActiveAgent(reason = 'stopped_by_user') { // Select file dialog ipcMain.handle('select-file', async (event, options) => { + const properties = ['openFile']; + if (options && options.multiple) properties.push('multiSelections'); const result = await dialog.showOpenDialog(mainWindow, { - properties: ['openFile'], - filters: options.filters || [ + properties, + filters: (options && options.filters) || [ { name: 'All Supported', extensions: ['json', 'sc', 'L5X', 'st', 'xml'] }, { name: 'Ignition Backup', extensions: ['json'] }, { name: 'Rockwell PLC', extensions: ['sc', 'L5X'] }, @@ -386,6 +388,9 @@ ipcMain.handle('select-file', async (event, options) => { { name: 'TIA Portal XML', extensions: ['xml'] } ] }); + if (options && options.multiple) { + return { filePaths: result.filePaths || [] }; + } return result.filePaths[0] || null; }); @@ -1794,4 +1799,43 @@ ipcMain.handle('agents:stop-subsystem', async (event, subsystemId) => { if (!activeAgentRun) return { success: false, error: 'No active agent run' }; const sent = sendAgentCommand({ cmd: 'stop-agent', subsystemId }); return { success: sent, subsystemId }; +}); + +// ============================================ +// Artifact Ingestion IPC (P&IDs / SOPs / Diagrams via GPT-5.4) +// ============================================ + +ipcMain.handle('ingest-artifact', async (event, filePath, sourceKind = 'pid') => { + try { + sendToRenderer('stream-output', { text: `Ingesting ${path.basename(filePath)} as ${sourceKind}...\n` }); + const output = await runPythonScript('artifact_ingest.py', [ + filePath, + '--source-kind', sourceKind, + '--verbose', + '--json', + ], { streaming: true, streamId: 'artifact-ingest' }); + const result = JSON.parse(output || '{}'); + return { success: true, ...result }; + } catch (error) { + sendToRenderer('stream-output', { 
text: `Ingestion error: ${error.message}\n` }); + return { success: false, error: error.message }; + } +}); + +ipcMain.handle('ingest-artifact-batch', async (event, files) => { + try { + const filePaths = files.map(f => f.path); + const sourceKind = files[0]?.sourceKind || 'pid'; + sendToRenderer('stream-output', { text: `Ingesting ${files.length} artifact(s)...\n` }); + const output = await runPythonScript('artifact_ingest.py', [ + ...filePaths, + '--source-kind', sourceKind, + '--verbose', + '--json', + ], { streaming: true, streamId: 'artifact-ingest' }); + const result = JSON.parse(output || '{}'); + return { success: true, ...result }; + } catch (error) { + return { success: false, error: error.message }; + } }); \ No newline at end of file diff --git a/electron-ui/preload.js b/electron-ui/preload.js index 7615063..3be7255 100644 --- a/electron-ui/preload.js +++ b/electron-ui/preload.js @@ -60,6 +60,10 @@ contextBridge.exposeInMainWorld('api', { graphAiPropose: (description) => ipcRenderer.invoke('graph:ai-propose', description), graphAiExplain: (nodeNames) => ipcRenderer.invoke('graph:ai-explain', nodeNames), + // Artifact Ingestion (P&IDs, SOPs, Engineering Diagrams via GPT-5.4) + ingestArtifact: (filePath, sourceKind) => ipcRenderer.invoke('ingest-artifact', filePath, sourceKind), + ingestArtifactBatch: (files) => ipcRenderer.invoke('ingest-artifact-batch', files), + // DEXPI P&ID Conversion API dexpiConvert: (options) => ipcRenderer.invoke('dexpi:convert', options), dexpiExport: () => ipcRenderer.invoke('dexpi:export'), diff --git a/electron-ui/renderer.js b/electron-ui/renderer.js index 93bf9b4..0523a9e 100644 --- a/electron-ui/renderer.js +++ b/electron-ui/renderer.js @@ -1834,7 +1834,7 @@ async function loadGraphData() { if (loading) loading.classList.add('active'); try { - const result = await window.api.graphLoad({ limit: 500 }); + const result = await window.api.graphLoad({}); if (result.success && graphRenderer) { graphRenderer.loadData(result); @@ 
-2113,12 +2113,48 @@ document.getElementById('graph-search')?.addEventListener('input', (e) => { } }); -document.getElementById('graph-filter')?.addEventListener('change', (e) => { - if (graphRenderer) { - graphRenderer.filterByType(e.target.value); +const GRAPH_FILTER_LABELS = { + plc: ['AOI', 'Tag', 'PLCTagTable', 'PLCTag'], + scada: ['UDT', 'Equipment', 'View', 'ViewComponent', 'ScadaTag', 'Script', 'NamedQuery', 'Project', 'GatewayEvent'], + siemens: ['TiaProject', 'PLCDevice', 'HMIDevice', 'HMIConnection'], + 'siemens-hmi': ['HMIAlarm', 'HMIAlarmClass', 'HMIScript', 'HMIScreen', 'HMITagTable', 'HMITextList'], + mes: ['Material', 'Batch', 'ProductionOrder', 'Operation', 'CriticalControlPoint', 'ProcessDeviation'], + troubleshooting: ['FaultSymptom', 'FaultCause', 'OperatorPhrase', 'CommonPhrase', 'Intent'], + anomaly: ['AgentRun', 'AnomalyEvent'], + flows: ['DataFlow', 'EndToEndFlow'], + process: ['ProcessMedium', 'UnitOperation', 'OperatingEnvelope', 'PhysicalPrinciple', 'ChemicalSpecies', 'Reaction'], +}; + +document.getElementById('graph-filter')?.addEventListener('change', async (e) => { + const value = e.target.value; + if (value === 'all') { + await loadGraphData(); + } else if (GRAPH_FILTER_LABELS[value]) { + await loadGraphDataFiltered(GRAPH_FILTER_LABELS[value]); + } else { + if (graphRenderer) graphRenderer.filterByType(value); } }); +async function loadGraphDataFiltered(nodeTypes) { + const loading = document.getElementById('graph-loading'); + if (loading) loading.classList.add('active'); + + try { + const result = await window.api.graphLoad({ types: nodeTypes }); + if (result.success && graphRenderer) { + graphRenderer.loadData(result); + if (loading) loading.classList.remove('active'); + } else { + console.error('Failed to load filtered graph:', result.error); + if (loading) loading.innerHTML = `

Error: ${result.error}

`; + } + } catch (error) { + console.error('Failed to load filtered graph:', error); + if (loading) loading.innerHTML = `

Error: ${error.message}

`; + } +} + document.getElementById('btn-layout-force')?.addEventListener('click', () => { if (graphRenderer) { graphRenderer.switchLayout('force'); @@ -3382,6 +3418,95 @@ btnSaveSettings?.addEventListener('click', async () => { } }); +// ============================================ +// Artifact Ingestion (P&IDs / SOPs / Diagrams) +// ============================================ + +const btnSelectArtifact = document.getElementById('btn-select-artifact'); +const btnIngestArtifact = document.getElementById('btn-ingest-artifact'); +const artifactSourceKind = document.getElementById('artifact-source-kind'); +const artifactFileList = document.getElementById('artifact-file-list'); +const artifactIngestStatus = document.getElementById('artifact-ingest-status'); + +let selectedArtifactFiles = []; + +btnSelectArtifact?.addEventListener('click', async () => { + const extensions = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif', 'webp', 'gif', 'pdf', 'txt', 'md']; + const result = await api.selectFile({ + filters: [{ name: 'Supported Files', extensions }], + multiple: true, + }); + + if (result && result.filePaths && result.filePaths.length > 0) { + selectedArtifactFiles = result.filePaths; + if (artifactFileList) { + artifactFileList.innerHTML = selectedArtifactFiles + .map(f => `
${f.split(/[\\/]/).pop()}
`) + .join(''); + } + if (btnIngestArtifact) btnIngestArtifact.disabled = false; + } +}); + +btnIngestArtifact?.addEventListener('click', async () => { + if (selectedArtifactFiles.length === 0) return; + + const sourceKind = artifactSourceKind ? artifactSourceKind.value : 'pid'; + btnIngestArtifact.disabled = true; + btnIngestArtifact.textContent = 'Ingesting...'; + + appendOutput(`\n[Artifact Ingest] Processing ${selectedArtifactFiles.length} file(s) as ${sourceKind}...\n`); + + try { + const files = selectedArtifactFiles.map(p => ({ path: p, sourceKind })); + const result = await api.ingestArtifactBatch(files); + + if (result.success) { + if (result.node_details && result.node_details.length > 0) { + appendOutput(`[Artifact Ingest] Node updates (${result.node_details.length}):\n`); + for (const d of result.node_details) { + appendOutput(` + ${d}\n`); + } + } + if (result.concept_details && result.concept_details.length > 0) { + appendOutput(`[Artifact Ingest] Process concepts (${result.concept_details.length}):\n`); + for (const d of result.concept_details) { + appendOutput(` + ${d}\n`); + } + } + if (result.relationship_details && result.relationship_details.length > 0) { + appendOutput(`[Artifact Ingest] Relationships (${result.relationship_details.length}):\n`); + for (const d of result.relationship_details) { + appendOutput(` ~ ${d}\n`); + } + } + appendOutput( + `[Artifact Ingest] Summary: ${result.nodes_updated || 0} node updates, ` + + `${result.concepts_created || 0} process concepts, ` + + `${result.relationships_created || 0} relationships\n` + ); + if (result.errors && result.errors.length > 0) { + appendOutput(`[Artifact Ingest] ${result.errors.length} error(s):\n`); + for (const err of result.errors) { + appendOutput(` - ${typeof err === 'string' ? 
err : JSON.stringify(err)}\n`); + } + } + if (artifactIngestStatus) { + artifactIngestStatus.style.display = 'block'; + artifactIngestStatus.textContent = + `${result.nodes_updated || 0} updates, ${result.concepts_created || 0} concepts, ${result.relationships_created || 0} rels`; + } + } else { + appendOutput(`[Artifact Ingest] Error: ${result.error || 'Unknown error'}\n`); + } + } catch (err) { + appendOutput(`[Artifact Ingest] Error: ${err.message}\n`); + } finally { + btnIngestArtifact.disabled = false; + btnIngestArtifact.textContent = 'Ingest'; + } +}); + // ============================================ // Database Connections Settings // ============================================ diff --git a/requirements.txt b/requirements.txt index 09f2756..20f186c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,10 @@ mysql-connector-python>=8.0.0 psycopg2-binary>=2.9.0 # HTTP client for Ignition gateway API -requests>=2.28.0 \ No newline at end of file +requests>=2.28.0 + +# OpenAI GPT API for artifact extraction +openai>=1.0.0 + +# PDF text extraction (optional, for SOP ingestion) +PyPDF2>=3.0.0 \ No newline at end of file diff --git a/scripts/artifact_ingest.py b/scripts/artifact_ingest.py new file mode 100644 index 0000000..762ee23 --- /dev/null +++ b/scripts/artifact_ingest.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +""" +Artifact ingestion pipeline for P&IDs, SOPs, and engineering diagrams. + +Orchestrates: +1. Source parsing (image or text) +2. GPT-5.4 structured extraction +3. Entity linking and concept normalization +4. 
Provenance-aware Neo4j writes + +Usage: + from artifact_ingest import ArtifactIngester + + ingester = ArtifactIngester(graph) + result = ingester.ingest_file("path/to/pid.png", source_kind="pid") +""" + +import os +import sys +import json +import time +from pathlib import Path +from typing import Dict, List, Optional, Any, Callable +from dataclasses import asdict + +from neo4j_ontology import OntologyGraph, get_ontology_graph +from gpt54_client import GPT54Client +from artifact_linker import ArtifactLinker +from artifact_models import ExtractionResult +from process_semantics import ( + EvidenceItem, + evidence_to_json, + merge_evidence, + PROCESS_NODE_SCHEMAS, + PROCESS_RELATIONSHIPS, +) + + +class ArtifactIngester: + """ + End-to-end ingestion pipeline for process engineering artifacts. + + Supports: + - P&IDs (images): .png, .jpg, .jpeg, .bmp, .tiff, .webp + - SOPs (text): .txt, .md, .pdf (text extraction only) + - Engineering diagrams (images): same as P&IDs + """ + + IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".webp", ".gif"} + TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".tsv"} + + def __init__( + self, + graph: Optional[OntologyGraph] = None, + gpt_client: Optional[GPT54Client] = None, + verbose: bool = False, + on_progress: Optional[Callable[[str], None]] = None, + ): + self._graph = graph or get_ontology_graph() + self._gpt = gpt_client + self._linker = None # initialized lazily after GPT client is ready + self._verbose = verbose + self._on_progress = on_progress or (lambda msg: None) + + def _ensure_gpt(self) -> GPT54Client: + if self._gpt is None: + self._gpt = GPT54Client() + if self._linker is None: + self._linker = ArtifactLinker(self._graph, gpt_client=self._gpt) + return self._gpt + + def _log(self, msg: str) -> None: + if self._verbose: + print(f"[ArtifactIngest] {msg}", file=sys.stderr, flush=True) + self._on_progress(msg) + + # ------------------------------------------------------------------ + # Public API + # 
------------------------------------------------------------------ + + def ingest_file( + self, + file_path: str, + source_kind: str = "pid", + ) -> Dict[str, Any]: + """ + Ingest a single artifact file end-to-end. + + Args: + file_path: Path to the source file. + source_kind: "pid", "sop", or "diagram". + + Returns: + Summary dict with counts and any errors. + """ + path = Path(file_path) + if not path.exists(): + return {"error": f"File not found: {file_path}"} + + self._log(f"Extracting facts from {path.name} ({source_kind})...") + gpt = self._ensure_gpt() + + self._log(f"Loading entity cache for linking...") + entity_hints = self._linker.load_entity_cache() + ext = path.suffix.lower() + + if ext in self.IMAGE_EXTENSIONS: + raw = gpt.extract_from_image( + str(path), + source_kind=source_kind, + existing_entities=entity_hints, + verbose=self._verbose, + ) + elif ext in self.TEXT_EXTENSIONS: + text = path.read_text(encoding="utf-8", errors="replace") + raw = gpt.extract_from_text( + text, + source_file=str(path), + source_kind=source_kind, + existing_entities=entity_hints, + verbose=self._verbose, + ) + elif ext == ".pdf": + text = self._extract_pdf_text(str(path)) + raw = gpt.extract_from_text( + text, + source_file=str(path), + source_kind=source_kind, + existing_entities=entity_hints, + verbose=self._verbose, + ) + else: + return {"error": f"Unsupported file type: {ext}"} + + if "error" in raw: + return {"error": raw["error"], "raw": raw.get("raw", "")} + + eq_count = len(raw.get("equipment_facts", [])) + tag_count = len(raw.get("tag_facts", [])) + media_count = len(raw.get("process_media", [])) + op_count = len(raw.get("unit_operations", [])) + species_count = len(raw.get("chemical_species", [])) + rx_count = len(raw.get("reactions", [])) + rel_count = len(raw.get("relationships", [])) + self._log( + f"GPT extracted: {eq_count} equipment, {tag_count} tags, " + f"{media_count} media, {op_count} operations, " + f"{species_count} species, {rx_count} reactions, 
{rel_count} relationships" + ) + + self._log("Resolving extracted entity names against existing graph nodes...") + extraction = self._linker.normalize_extraction( + raw, source_file=str(path), source_kind=source_kind, + verbose=self._verbose, + ) + + self._log("Writing facts to Neo4j...") + summary = self._write_extraction(extraction) + + self._log( + f"Done: {summary['nodes_updated']} updates, " + f"{summary['concepts_created']} concepts, " + f"{summary['relationships_created']} relationships" + ) + return summary + + def ingest_batch( + self, + files: List[Dict[str, str]], + ) -> Dict[str, Any]: + """ + Ingest multiple artifacts. + + Args: + files: List of dicts with "path" and "source_kind" keys. + + Returns: + Aggregate summary. + """ + totals = { + "files_processed": 0, + "files_failed": 0, + "nodes_updated": 0, + "concepts_created": 0, + "relationships_created": 0, + "node_details": [], + "concept_details": [], + "relationship_details": [], + "errors": [], + } + + for i, f in enumerate(files, 1): + self._log(f"Processing file {i}/{len(files)}: {f['path']}") + result = self.ingest_file(f["path"], f.get("source_kind", "pid")) + if "error" in result: + totals["files_failed"] += 1 + totals["errors"].append({"file": f["path"], "error": result["error"]}) + else: + totals["files_processed"] += 1 + totals["nodes_updated"] += result.get("nodes_updated", 0) + totals["concepts_created"] += result.get("concepts_created", 0) + totals["relationships_created"] += result.get("relationships_created", 0) + totals["node_details"].extend(result.get("node_details", [])) + totals["concept_details"].extend(result.get("concept_details", [])) + totals["relationship_details"].extend(result.get("relationship_details", [])) + + return totals + + # ------------------------------------------------------------------ + # Graph write helpers + # ------------------------------------------------------------------ + + def _write_extraction(self, extraction: ExtractionResult) -> Dict[str, Any]: 
+ """Write a normalized ExtractionResult to Neo4j with provenance.""" + summary = { + "source_file": extraction.source_file, + "source_kind": extraction.source_kind, + "nodes_updated": 0, + "concepts_created": 0, + "relationships_created": 0, + "node_details": [], + "concept_details": [], + "relationship_details": [], + "errors": extraction.errors[:], + } + + with self._graph.session() as session: + for update in extraction.node_updates: + try: + matched, existed = self._write_node_update(session, update) + props_str = ", ".join(f"{k}={v}" for k, v in update.properties.items()) if update.properties else "" + detail = f"{update.node_label}:{update.node_name}" + if props_str: + detail += f" ({props_str})" + if matched: + summary["nodes_updated"] += 1 + summary["node_details"].append(detail) + action = "Updated" if existed else "Created" + self._log(f" {action} {detail}") + else: + self._log(f" Skipped {detail} (not found in graph)") + except Exception as e: + summary["errors"].append(f"Node update {update.node_name}: {e}") + + seen_concepts = set() + for concept in extraction.process_concepts: + dedup_key = f"{concept.label}:{concept.name}" + if dedup_key in seen_concepts: + continue + seen_concepts.add(dedup_key) + try: + self._write_process_concept(session, concept) + summary["concepts_created"] += 1 + props_str = ", ".join(f"{k}={v}" for k, v in concept.properties.items()) if concept.properties else "" + detail = f"{concept.label}:{concept.name}" + if props_str: + detail += f" ({props_str})" + summary["concept_details"].append(detail) + self._log(f" Created {detail}") + except Exception as e: + summary["errors"].append(f"Concept {concept.name}: {e}") + + seen_rels = set() + for rel in extraction.relationships: + dedup_key = f"{rel.source_label}:{rel.source_name}-{rel.rel_type}->{rel.target_label}:{rel.target_name}" + if dedup_key in seen_rels: + continue + seen_rels.add(dedup_key) + try: + detail = f"{rel.source_label}:{rel.source_name} -[{rel.rel_type}]-> 
{rel.target_label}:{rel.target_name}" + linked = self._write_relationship(session, rel) + if linked: + summary["relationships_created"] += 1 + summary["relationship_details"].append(detail) + self._log(f" Linked {detail}") + else: + self._log(f" Skipped {detail} (endpoint not found)") + except Exception as e: + summary["errors"].append(f"Rel {rel.rel_type}: {e}") + + return summary + + EXISTING_LABELS = { + "AOI", "Tag", "UDT", "View", "ViewComponent", + "ScadaTag", "Script", "NamedQuery", "Project", + "FaultSymptom", "FaultCause", "OperatorPhrase", + "ControlPattern", "DataFlow", "SafetyElement", + "Material", "Batch", "ProductionOrder", "Operation", + "CriticalControlPoint", "ProcessDeviation", + "TiaProject", "PLCDevice", "HMIDevice", "HMIConnection", + "HMIAlarm", "HMIAlarmClass", "HMIScript", "HMIScreen", + "PLCTagTable", "PLCTag", "HMITagTable", "HMITextList", + } + + def _write_node_update(self, session, update) -> bool: + """Update an existing node with new properties and evidence. + + For backbone labels (Equipment, ScadaTag, AOI, etc.) uses MATCH so + it only updates nodes that already exist -- never creates new ones. + Returns True if a node was actually matched and updated. 
+ """ + ev_json = evidence_to_json(update.evidence) + + set_clauses = [] + params: Dict[str, Any] = {"name": update.node_name, "ev_json": ev_json} + + for k, v in update.properties.items(): + param_key = f"prop_{k}" + set_clauses.append(f"n.{k} = ${param_key}") + params[param_key] = v + + set_clause = ", ".join(set_clauses) if set_clauses else "" + if set_clause: + set_clause = f"SET {set_clause}, " + else: + set_clause = "SET " + + if update.node_label in self.EXISTING_LABELS: + verb = "MATCH" + else: + verb = "MERGE" + + query = f""" + {verb} (n:{update.node_label} {{name: $name}}) + {set_clause} + n.evidence_items = CASE + WHEN n.evidence_items IS NULL THEN $ev_json + ELSE n.evidence_items + $ev_json + END, + n.last_evidence_at = datetime() + RETURN n.name AS matched, n.created_at IS NOT NULL AS existed + """ + result = session.run(query, params) + record = result.single() + if record is None: + return False, False + return True, bool(record.get("existed", True)) + + def _write_process_concept(self, session, concept) -> None: + """Create or merge a process-semantic node with provenance.""" + ev_json = evidence_to_json(concept.evidence) + + set_clauses = [] + params: Dict[str, Any] = {"name": concept.name, "ev_json": ev_json} + + for k, v in concept.properties.items(): + param_key = f"prop_{k}" + set_clauses.append(f"n.{k} = COALESCE(n.{k}, ${param_key})") + params[param_key] = v + + set_clause = ", ".join(set_clauses) if set_clauses else "" + if set_clause: + set_clause = f"SET {set_clause}, " + else: + set_clause = "SET " + + query = f""" + MERGE (n:{concept.label} {{name: $name}}) + {set_clause} + n.evidence_items = CASE + WHEN n.evidence_items IS NULL THEN $ev_json + ELSE n.evidence_items + $ev_json + END, + n.last_evidence_at = datetime() + """ + session.run(query, params) + + _PATH_KEYED_LABELS = {"ViewComponent"} + + def _match_clause(self, alias: str, label: str, param_name: str) -> str: + """Return a MATCH clause using `path` for ViewComponent, `name` 
otherwise.""" + if label in self._PATH_KEYED_LABELS: + return f"MATCH ({alias}:{label} {{path: ${param_name}}})" + return f"MATCH ({alias}:{label} {{name: ${param_name}}})" + + def _write_relationship(self, session, rel) -> bool: + """Create a relationship with provenance metadata. + + Returns True if both endpoints existed and the relationship was written. + """ + ev_json = evidence_to_json(rel.evidence) + + params: Dict[str, Any] = { + "src_name": rel.source_name, + "tgt_name": rel.target_name, + "ev_json": ev_json, + } + + for k, v in rel.properties.items(): + param_key = f"prop_{k}" + params[param_key] = v + + src_match = self._match_clause("src", rel.source_label, "src_name") + tgt_match = self._match_clause("tgt", rel.target_label, "tgt_name") + + query = f""" + {src_match} + {tgt_match} + MERGE (src)-[r:{rel.rel_type}]->(tgt) + SET r.evidence_items = CASE + WHEN r.evidence_items IS NULL THEN $ev_json + ELSE r.evidence_items + $ev_json + END, + r.last_evidence_at = datetime() + RETURN type(r) AS rel_type + """ + result = session.run(query, params) + return result.single() is not None + + # ------------------------------------------------------------------ + # PDF text extraction + # ------------------------------------------------------------------ + + @staticmethod + def _extract_pdf_text(pdf_path: str) -> str: + """Extract text from a PDF file. 
Falls back gracefully.""" + try: + import PyPDF2 + text_pages = [] + with open(pdf_path, "rb") as f: + reader = PyPDF2.PdfReader(f) + for page in reader.pages: + text_pages.append(page.extract_text() or "") + return "\n\n".join(text_pages) + except ImportError: + try: + import subprocess + result = subprocess.run( + ["pdftotext", pdf_path, "-"], + capture_output=True, text=True, timeout=30, + ) + if result.returncode == 0: + return result.stdout + except Exception: + pass + return f"[Could not extract text from {pdf_path}]" + + +# ============================================================================ +# CLI entry point +# ============================================================================ + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Ingest P&IDs/SOPs into ontology") + parser.add_argument("files", nargs="+", help="Files to ingest") + parser.add_argument( + "--source-kind", default="pid", + choices=["pid", "sop", "diagram"], + help="Source type (default: pid)", + ) + parser.add_argument("--verbose", action="store_true") + parser.add_argument("--json", action="store_true", help="JSON output") + args = parser.parse_args() + + ingester = ArtifactIngester(verbose=args.verbose) + + files = [{"path": f, "source_kind": args.source_kind} for f in args.files] + result = ingester.ingest_batch(files) + + if args.json: + print(json.dumps(result, indent=2)) + else: + print(f"\nIngestion complete:") + print(f" Files processed: {result['files_processed']}") + print(f" Files failed: {result['files_failed']}") + print(f" Nodes updated: {result['nodes_updated']}") + print(f" Concepts created:{result['concepts_created']}") + print(f" Rels created: {result['relationships_created']}") + if result["errors"]: + print(f"\nErrors:") + for err in result["errors"]: + print(f" - {err}") + + +if __name__ == "__main__": + main() diff --git a/scripts/artifact_linker.py b/scripts/artifact_linker.py new file mode 100644 index 0000000..c0d2a7e --- 
/dev/null +++ b/scripts/artifact_linker.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +""" +Entity linker for artifact extraction results. + +Resolves extracted mentions from GPT-5.4 output to existing ontology nodes, +deduplicates process-semantic concepts, and produces a clean set of graph +mutations ready for Neo4j writes. +""" + +import json +from typing import Dict, List, Optional, Any, Tuple + +from neo4j_ontology import OntologyGraph +from process_semantics import ( + EvidenceItem, + PROCESS_NODE_SCHEMAS, + PROCESS_RELATIONSHIPS, +) +from artifact_models import ( + ExtractedNodeUpdate, + ExtractedRelationship, + ExtractedProcessConcept, + ExtractionResult, +) + + +class ArtifactLinker: + """ + Resolves extracted mentions to existing graph entities and normalizes + process-semantic concepts before graph writes. + + Uses GPT to match extracted names to existing graph nodes when a GPT + client is provided; falls back to substring matching otherwise. + """ + + def __init__(self, graph: OntologyGraph, gpt_client=None): + self._graph = graph + self._gpt = gpt_client + self._entity_cache: Dict[str, Dict[str, str]] = {} + self._entity_cache_raw: Dict[str, List[str]] = {} + self._gpt_resolved: Dict[str, Dict[str, Optional[str]]] = {} + self._gpt_visualizes: Dict[str, List[str]] = {} + + def load_entity_cache(self) -> Dict[str, List[str]]: + """ + Load known entity names from the graph for linking hints. + Returns dict mapping label -> list of names. 
+ """ + labels_to_query = [ + "Equipment", "AOI", "UDT", "ScadaTag", "View", "ViewComponent", + "ProcessMedium", "UnitOperation", "OperatingEnvelope", + "PhysicalPrinciple", "ChemicalSpecies", "Reaction", + "Process", "Operation", "CriticalControlPoint", + ] + cache: Dict[str, List[str]] = {} + with self._graph.session() as session: + for label in labels_to_query: + try: + if label == "ViewComponent": + result = session.run( + "MATCH (n:ViewComponent) RETURN n.path AS name LIMIT 1000" + ) + else: + result = session.run( + f"MATCH (n:{label}) RETURN n.name AS name LIMIT 500" + ) + names = [r["name"] for r in result if r["name"]] + if names: + cache[label] = names + except Exception: + pass + self._entity_cache = { + label: {n.lower(): n for n in names} + for label, names in cache.items() + } + self._entity_cache_raw = cache + return cache + + def _collect_extracted_mentions(self, raw: Dict[str, Any]) -> Dict[str, List[str]]: + """Gather all entity names from a raw GPT extraction keyed by label.""" + mentions: Dict[str, set] = {} + + for eq in raw.get("equipment_facts", []): + name = eq.get("equipment_name", "") + if name: + mentions.setdefault("Equipment", set()).add(name) + + for tag in raw.get("tag_facts", []): + name = tag.get("tag_name", "") + if name: + mentions.setdefault("ScadaTag", set()).add(name) + + for rel in raw.get("relationships", []): + src_label = rel.get("source_type", "") + src_name = rel.get("source_name", "") + tgt_label = rel.get("target_type", "") + tgt_name = rel.get("target_name", "") + if src_label and src_name: + mentions.setdefault(src_label, set()).add(src_name) + if tgt_label and tgt_name: + mentions.setdefault(tgt_label, set()).add(tgt_name) + + return {label: sorted(names) for label, names in mentions.items()} + + def run_gpt_entity_resolution( + self, raw: Dict[str, Any], verbose: bool = False + ) -> None: + """ + Use GPT to resolve extracted mentions against existing graph entities. + Populates self._gpt_resolved with the mappings. 
+ """ + if not self._gpt: + return + + self._gpt_visualizes = {} + extracted = self._collect_extracted_mentions(raw) + labels_to_resolve = { + label: names for label, names in extracted.items() + if label in self._entity_cache_raw and self._entity_cache_raw[label] + } + + has_vc = bool(self._entity_cache_raw.get("ViewComponent")) + if has_vc and "Equipment" in extracted and "Equipment" not in labels_to_resolve: + labels_to_resolve["Equipment"] = extracted["Equipment"] + + if not labels_to_resolve: + return + + import sys + if verbose: + total = sum(len(v) for v in labels_to_resolve.values()) + print( + f"[ArtifactLinker] Resolving {total} extracted mentions " + f"against {sum(len(v) for v in self._entity_cache_raw.values())} existing entities via GPT...", + file=sys.stderr, flush=True, + ) + + raw_result = self._gpt.resolve_entities( + labels_to_resolve, + self._entity_cache_raw, + verbose=verbose, + ) + + vis = raw_result.pop("visualizes", {}) + if isinstance(vis, dict): + for equip, vcs in vis.items(): + if isinstance(vcs, list): + self._gpt_visualizes[equip] = [v for v in vcs if isinstance(v, str)] + elif isinstance(vcs, str): + self._gpt_visualizes[equip] = [vcs] + + self._gpt_resolved = raw_result + + if verbose: + matched = sum( + 1 for mappings in self._gpt_resolved.values() + for v in mappings.values() if v + ) + vis_count = sum(len(v) for v in self._gpt_visualizes.values()) + print( + f"[ArtifactLinker] GPT matched {matched} mentions to existing entities, " + f"{vis_count} VISUALIZES links proposed", + file=sys.stderr, flush=True, + ) + for equip, vcs in self._gpt_visualizes.items(): + for vc in vcs: + print( + f"[ArtifactLinker] VISUALIZES: {vc} -> Equipment:{equip}", + file=sys.stderr, flush=True, + ) + + def resolve_name(self, label: str, raw_name: str) -> Tuple[str, bool]: + """ + Resolve an extracted name to an existing graph entity. + + Resolution order: + 1. GPT-resolved mapping (if available) + 2. Exact case-insensitive match + 3. 
Return raw name as-is (not matched) + """ + if not raw_name: + return raw_name, False + + # Check GPT resolution first + gpt_mappings = self._gpt_resolved.get(label, {}) + if raw_name in gpt_mappings: + resolved = gpt_mappings[raw_name] + if resolved: + return resolved, True + else: + return raw_name.strip(), False + + # Exact case-insensitive match + label_cache = self._entity_cache.get(label, {}) + lower = raw_name.lower().strip() + if lower in label_cache: + return label_cache[lower], True + + return raw_name.strip(), False + + def normalize_extraction( + self, + raw: Dict[str, Any], + source_file: str, + source_kind: str, + extraction_model: str = "gpt-5.4", + verbose: bool = False, + ) -> ExtractionResult: + """ + Convert raw GPT-5.4 JSON output into a normalized ExtractionResult. + + Performs: + - entity linking against known graph names + - process concept normalization + - relationship validation against allowed vocabulary + - evidence attachment + """ + result = ExtractionResult( + source_file=source_file, + source_kind=source_kind, + ) + + self.run_gpt_entity_resolution(raw, verbose=verbose) + + base_evidence = EvidenceItem( + source_file=source_file, + source_kind=source_kind, + extraction_model=extraction_model, + extraction_method="vision" if source_kind in ("pid", "diagram") else "text", + ) + + self._process_equipment_facts(raw, result, base_evidence) + self._process_tag_facts(raw, result, base_evidence) + self._process_media(raw, result, base_evidence) + self._process_operations(raw, result, base_evidence) + self._process_species(raw, result, base_evidence) + self._process_reactions(raw, result, base_evidence) + self._process_relationships(raw, result, base_evidence) + self._process_visualizes(result, base_evidence) + + return result + + # ------------------------------------------------------------------ + # Internal normalization helpers + # ------------------------------------------------------------------ + + def _make_evidence(self, base: 
EvidenceItem, **overrides) -> EvidenceItem: + from dataclasses import asdict + d = asdict(base) + d.update(overrides) + return EvidenceItem(**d) + + def _process_equipment_facts( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + for eq in raw.get("equipment_facts", []): + name = eq.get("equipment_name", "") + if not name: + continue + + resolved, _ = self.resolve_name("Equipment", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Equipment: {resolved}") + + props: Dict[str, Any] = {} + if eq.get("service"): + props["service"] = eq["service"] + if eq.get("function"): + props["process_function"] = eq["function"] + + result.node_updates.append(ExtractedNodeUpdate( + node_label="Equipment", + node_name=resolved, + properties=props, + evidence=[ev], + )) + + for medium in eq.get("media_handled", []): + med_resolved, _ = self.resolve_name("ProcessMedium", medium) + result.process_concepts.append(ExtractedProcessConcept( + label="ProcessMedium", + name=med_resolved, + properties={"category": "product"}, + evidence=[ev], + )) + result.relationships.append(ExtractedRelationship( + source_label="Equipment", source_name=resolved, + target_label="ProcessMedium", target_name=med_resolved, + rel_type="HANDLES_MEDIUM", + evidence=[ev], + )) + + for op in eq.get("operations_performed", []): + op_resolved, _ = self.resolve_name("UnitOperation", op) + result.process_concepts.append(ExtractedProcessConcept( + label="UnitOperation", + name=op_resolved, + properties={"category": "transfer"}, + evidence=[ev], + )) + result.relationships.append(ExtractedRelationship( + source_label="Equipment", source_name=resolved, + target_label="UnitOperation", target_name=op_resolved, + rel_type="PERFORMS_OPERATION", + evidence=[ev], + )) + + for param in eq.get("operating_parameters", []): + env_name = f"{resolved}/{param.get('parameter', 'unknown')}" + env_props = { + k: param[k] for k in [ + "parameter", "unit", + "normal_low", "normal_high", + "alarm_low", 
"alarm_high", + "trip_low", "trip_high", + ] if param.get(k) is not None + } + # Map alarm to warning for schema consistency + if "alarm_low" in env_props: + env_props["low_warning"] = env_props.pop("alarm_low") + if "alarm_high" in env_props: + env_props["high_warning"] = env_props.pop("alarm_high") + if "trip_low" in env_props: + env_props["trip_low"] = env_props["trip_low"] + if "trip_high" in env_props: + env_props["trip_high"] = env_props["trip_high"] + + result.process_concepts.append(ExtractedProcessConcept( + label="OperatingEnvelope", + name=env_name, + properties=env_props, + evidence=[ev], + )) + result.relationships.append(ExtractedRelationship( + source_label="Equipment", source_name=resolved, + target_label="OperatingEnvelope", target_name=env_name, + rel_type="HAS_OPERATING_ENVELOPE", + evidence=[ev], + )) + + def _process_tag_facts( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + for tag in raw.get("tag_facts", []): + name = tag.get("tag_name", "") + if not name: + continue + + resolved, _ = self.resolve_name("ScadaTag", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Tag: {resolved}") + + props: Dict[str, Any] = {} + if tag.get("process_context"): + props["process_context"] = tag["process_context"] + + result.node_updates.append(ExtractedNodeUpdate( + node_label="ScadaTag", + node_name=resolved, + properties=props, + evidence=[ev], + )) + + if tag.get("measures"): + principle_name = tag["measures"] + pp_resolved, _ = self.resolve_name("PhysicalPrinciple", principle_name) + result.process_concepts.append(ExtractedProcessConcept( + label="PhysicalPrinciple", + name=pp_resolved, + properties={"category": "analytical"}, + evidence=[ev], + )) + result.relationships.append(ExtractedRelationship( + source_label="ScadaTag", source_name=resolved, + target_label="PhysicalPrinciple", target_name=pp_resolved, + rel_type="MEASURES", + evidence=[ev], + )) + + def _process_media( + self, raw: Dict, result: 
ExtractionResult, base_ev: EvidenceItem + ) -> None: + for medium in raw.get("process_media", []): + name = medium.get("name", "") + if not name: + continue + resolved, _ = self.resolve_name("ProcessMedium", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Medium: {resolved}") + result.process_concepts.append(ExtractedProcessConcept( + label="ProcessMedium", + name=resolved, + properties={ + k: medium[k] for k in ["category", "phase", "description"] + if medium.get(k) + }, + evidence=[ev], + )) + + def _process_operations( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + for op in raw.get("unit_operations", []): + name = op.get("name", "") + if not name: + continue + resolved, _ = self.resolve_name("UnitOperation", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Operation: {resolved}") + result.process_concepts.append(ExtractedProcessConcept( + label="UnitOperation", + name=resolved, + properties={ + k: op[k] for k in ["category", "description"] + if op.get(k) + }, + evidence=[ev], + )) + + def _process_species( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + for sp in raw.get("chemical_species", []): + name = sp.get("name", "") + if not name: + continue + resolved, _ = self.resolve_name("ChemicalSpecies", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Species: {resolved}") + result.process_concepts.append(ExtractedProcessConcept( + label="ChemicalSpecies", + name=resolved, + properties={ + k: sp[k] for k in ["category", "cas_number", "description"] + if sp.get(k) + }, + evidence=[ev], + )) + + def _process_reactions( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + for rx in raw.get("reactions", []): + name = rx.get("name", "") + if not name: + continue + resolved, _ = self.resolve_name("Reaction", name) + ev = self._make_evidence(base_ev, source_excerpt=f"Reaction: {resolved}") + 
result.process_concepts.append(ExtractedProcessConcept( + label="Reaction", + name=resolved, + properties={ + k: rx[k] for k in ["category", "description"] + if rx.get(k) + }, + evidence=[ev], + )) + for species in rx.get("species_involved", []): + sp_resolved, _ = self.resolve_name("ChemicalSpecies", species) + result.relationships.append(ExtractedRelationship( + source_label="Reaction", source_name=resolved, + target_label="ChemicalSpecies", target_name=sp_resolved, + rel_type="INVOLVES_SPECIES", + evidence=[ev], + )) + + def _process_visualizes( + self, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + """Add VISUALIZES relationships proposed by GPT entity resolution.""" + for equip_name, vc_names in self._gpt_visualizes.items(): + for vc_name in vc_names: + ev = self._make_evidence( + base_ev, + source_excerpt=f"ViewComponent:{vc_name} -[VISUALIZES]-> Equipment:{equip_name}", + ) + result.relationships.append(ExtractedRelationship( + source_label="ViewComponent", source_name=vc_name, + target_label="Equipment", target_name=equip_name, + rel_type="VISUALIZES", + evidence=[ev], + )) + + def _process_relationships( + self, raw: Dict, result: ExtractionResult, base_ev: EvidenceItem + ) -> None: + allowed = set(PROCESS_RELATIONSHIPS.keys()) + for rel in raw.get("relationships", []): + rel_type = rel.get("relationship", "") + if rel_type not in allowed: + continue + + src_label = rel.get("source_type", "") + src_name = rel.get("source_name", "") + tgt_label = rel.get("target_type", "") + tgt_name = rel.get("target_name", "") + + if not all([src_label, src_name, tgt_label, tgt_name]): + continue + + src_resolved, _ = self.resolve_name(src_label, src_name) + tgt_resolved, _ = self.resolve_name(tgt_label, tgt_name) + + ev = self._make_evidence( + base_ev, + source_excerpt=f"{src_label}:{src_resolved} -{rel_type}-> {tgt_label}:{tgt_resolved}", + ) + result.relationships.append(ExtractedRelationship( + source_label=src_label, source_name=src_resolved, + 
#!/usr/bin/env python3
"""Normalized extraction models for GPT-5.4 artifact ingestion.

This is the intermediate schema sitting between raw GPT output and the
Neo4j write path: every extraction result is normalized into these
dataclasses before any graph mutation happens.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any

from process_semantics import EvidenceItem


# ============================================================================
# Extracted fact types
# ============================================================================

@dataclass
class ExtractedNodeUpdate:
    """An update to an existing ontology node extracted from a source."""
    node_label: str  # e.g. "Equipment", "AOI", "ScadaTag"
    node_name: str   # name used to MERGE/match
    properties: Dict[str, Any] = field(default_factory=dict)
    evidence: List[EvidenceItem] = field(default_factory=list)


@dataclass
class ExtractedRelationship:
    """A relationship extracted between two entities."""
    source_label: str
    source_name: str
    target_label: str
    target_name: str
    rel_type: str  # e.g. "HANDLES_MEDIUM", "PERFORMS_OPERATION"
    properties: Dict[str, Any] = field(default_factory=dict)
    evidence: List[EvidenceItem] = field(default_factory=list)


@dataclass
class ExtractedProcessConcept:
    """A new process-semantic concept to be induced."""
    label: str  # e.g. "ProcessMedium", "UnitOperation"
    name: str
    properties: Dict[str, Any] = field(default_factory=dict)
    evidence: List[EvidenceItem] = field(default_factory=list)


@dataclass
class ExtractionResult:
    """Complete normalized result from extracting one source artifact."""
    source_file: str
    source_kind: str  # "pid", "sop", "diagram"
    node_updates: List[ExtractedNodeUpdate] = field(default_factory=list)
    relationships: List[ExtractedRelationship] = field(default_factory=list)
    process_concepts: List[ExtractedProcessConcept] = field(default_factory=list)
    raw_mentions: List[Dict[str, Any]] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)


# ============================================================================
# GPT extraction prompt contract
# ============================================================================

EXTRACTION_RESPONSE_SCHEMA = {
    "type": "object",
    "properties": {
        "equipment_facts": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "equipment_name": {"type": "string"},
                    "service": {"type": "string"},
                    "function": {"type": "string"},
                    "media_handled": {"type": "array", "items": {"type": "string"}},
                    "operations_performed": {"type": "array", "items": {"type": "string"}},
                    "operating_parameters": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "parameter": {"type": "string"},
                                "unit": {"type": "string"},
                                "normal_low": {"type": "number"},
                                "normal_high": {"type": "number"},
                                "alarm_low": {"type": "number"},
                                "alarm_high": {"type": "number"},
                                "trip_low": {"type": "number"},
                                "trip_high": {"type": "number"},
                            },
                        },
                    },
                },
            },
        },
        "tag_facts": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "tag_name": {"type": "string"},
                    "measures": {"type": "string"},
                    "process_context": {"type": "string"},
                    "unit": {"type": "string"},
                },
            },
        },
        "process_media": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "category": {"type": "string"},
                    "phase": {"type": "string"},
                    "description": {"type": "string"},
                },
            },
        },
        "unit_operations": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "category": {"type": "string"},
                    "description": {"type": "string"},
                },
            },
        },
        "chemical_species": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "category": {"type": "string"},
                    "cas_number": {"type": "string"},
                    "description": {"type": "string"},
                },
            },
        },
        "reactions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "category": {"type": "string"},
                    "description": {"type": "string"},
                    "species_involved": {"type": "array", "items": {"type": "string"}},
                },
            },
        },
        "relationships": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "source_type": {"type": "string"},
                    "source_name": {"type": "string"},
                    "relationship": {"type": "string"},
                    "target_type": {"type": "string"},
                    "target_name": {"type": "string"},
                },
            },
        },
    },
}
+""" + +import os +import sys +import json +import base64 +import time +from pathlib import Path +from typing import Dict, List, Optional, Any + +try: + from dotenv import load_dotenv +except ImportError: + def load_dotenv(*_a, **_kw): + return False + +load_dotenv() + + +class GPT54Client: + """ + OpenAI GPT-5.4 client for multimodal artifact extraction. + + Supports: + - Image inputs (P&IDs, engineering diagrams) + - Text/PDF inputs (SOPs, procedures) + - Structured JSON output via response_format + """ + + DEFAULT_MODEL = "gpt-5.4" + + def __init__( + self, + api_key: Optional[str] = None, + model: Optional[str] = None, + ): + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + if not self.api_key: + raise ValueError( + "OPENAI_API_KEY not found. Set it in .env or pass api_key." + ) + self.model = model or os.getenv("OPENAI_MODEL", self.DEFAULT_MODEL) + + import openai + self.client = openai.OpenAI(api_key=self.api_key, timeout=300.0) + + # ------------------------------------------------------------------ + # Image encoding helpers + # ------------------------------------------------------------------ + + @staticmethod + def _encode_image(image_path: str) -> str: + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + + @staticmethod + def _image_media_type(path: str) -> str: + ext = Path(path).suffix.lower() + return { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", + }.get(ext, "image/png") + + # ------------------------------------------------------------------ + # Core extraction methods + # ------------------------------------------------------------------ + + def extract_from_image( + self, + image_path: str, + source_kind: str = "pid", + existing_entities: Optional[Dict[str, List[str]]] = None, + verbose: bool = False, + ) -> Dict[str, Any]: + """ + Extract structured process 
facts from an image (P&ID / diagram). + + Args: + image_path: Path to the image file. + source_kind: "pid", "diagram", or "sop". + existing_entities: Dict mapping label -> list of known names + for entity linking hints. + verbose: Print debug output. + + Returns: + Raw parsed JSON dict from GPT-5.4. + """ + b64 = self._encode_image(image_path) + media = self._image_media_type(image_path) + + system_prompt = self._build_system_prompt(source_kind, existing_entities) + user_content = [ + { + "type": "image_url", + "image_url": { + "url": f"data:{media};base64,{b64}", + "detail": "high", + }, + }, + { + "type": "text", + "text": self._build_user_prompt(source_kind, image_path), + }, + ] + + return self._call(system_prompt, user_content, verbose=verbose) + + def extract_from_text( + self, + text: str, + source_file: str = "", + source_kind: str = "sop", + existing_entities: Optional[Dict[str, List[str]]] = None, + verbose: bool = False, + ) -> Dict[str, Any]: + """ + Extract structured process facts from text (SOP / procedure). + + Args: + text: The document text content. + source_file: Original file path for reference. + source_kind: "sop", "procedure", or "manual". + existing_entities: Dict mapping label -> list of known names. + verbose: Print debug output. + + Returns: + Raw parsed JSON dict from GPT-5.4. + """ + system_prompt = self._build_system_prompt(source_kind, existing_entities) + user_content = [ + { + "type": "text", + "text": self._build_user_prompt(source_kind, source_file) + + f"\n\n--- DOCUMENT CONTENT ---\n{text[:60000]}", + }, + ] + + return self._call(system_prompt, user_content, verbose=verbose) + + def resolve_entities( + self, + extracted_mentions: Dict[str, List[str]], + existing_entities: Dict[str, List[str]], + verbose: bool = False, + ) -> Dict[str, Dict[str, str]]: + """ + Use GPT to match extracted entity mentions to existing graph nodes. + + Args: + extracted_mentions: Dict mapping label -> list of names from extraction. + e.g. 
{"Equipment": ["Brew Kettle BK-001", "HX-200"], "ScadaTag": ["TT501"]} + existing_entities: Dict mapping label -> list of known names in graph. + e.g. {"Equipment": ["BK-001", "HX-200-A"], "ScadaTag": ["Area5/TT501.PV"]} + + Returns: + Dict mapping label -> {extracted_name: resolved_name_or_null}. + resolved_name is the exact existing name if matched, or null if no match. + """ + system_prompt = """You are an expert at matching industrial equipment and tag names across different naming conventions. + +You will receive two lists per entity type: +- "extracted": names found in a P&ID, SOP, or engineering diagram +- "existing": names already in the plant's ontology database + +Your job is to determine which extracted names refer to the same entity as which existing names. + +Industrial naming conventions vary: a P&ID might say "Brew Kettle BK-001" while the SCADA system has "BK-001" or "BK_001_BrewKettle". Tags like "TT-501" might appear in SCADA as "Area5/TT501.PV" or "TT_501_Temperature". + +ViewComponents are SCADA UI elements that visualize equipment. A ViewComponent whose name or path references an equipment ID likely VISUALIZES that equipment. When you see Equipment in the extracted list AND ViewComponent in the existing list, also return a "visualizes" key mapping equipment names to the ViewComponent names that display them. + +Rules: +- Match based on tag numbers, equipment IDs, and functional identity -- not just substring overlap. +- If an extracted name clearly refers to an existing entity, map it to the EXACT existing name. +- If there is no plausible match, map it to null. +- When in doubt, prefer no match over a wrong match. + +Return JSON: +{ + "Equipment": { "extracted_name": "existing_name_or_null", ... }, + "ScadaTag": { "extracted_name": "existing_name_or_null", ... }, + "visualizes": { "equipment_name": ["ViewComponent_name", ...], ... 
} +}""" + + user_parts = [] + for label in extracted_mentions: + ext_list = extracted_mentions[label] + exist_list = existing_entities.get(label, []) + if not ext_list: + continue + user_parts.append(f"## {label}") + user_parts.append(f"Extracted: {json.dumps(ext_list)}") + user_parts.append(f"Existing: {json.dumps(exist_list[:200])}") + user_parts.append("") + + vc_list = existing_entities.get("ViewComponent", []) + if "Equipment" in extracted_mentions and vc_list: + user_parts.append("## ViewComponent (existing only — for VISUALIZES linking)") + user_parts.append(f"Existing: {json.dumps(vc_list[:300])}") + user_parts.append("") + + if not user_parts: + return {} + + user_content = [{"type": "text", "text": "\n".join(user_parts)}] + result = self._call(system_prompt, user_content, verbose=verbose) + + mappings: Dict[str, Any] = {} + for label, matches in result.items(): + if isinstance(matches, dict): + mappings[label] = { + k: v for k, v in matches.items() + if isinstance(k, str) + } + return mappings + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _call( + self, + system_prompt: str, + user_content: List[Dict], + verbose: bool = False, + ) -> Dict[str, Any]: + """Make the actual API call and parse JSON response.""" + if verbose: + print(f"[GPT54] Calling {self.model}...", file=sys.stderr, flush=True) + + start = time.time() + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_content}, + ], + max_completion_tokens=16000, + temperature=0.1, + response_format={"type": "json_object"}, + ) + elapsed = time.time() - start + + text = response.choices[0].message.content or "{}" + if verbose: + tokens = response.usage + print( + f"[GPT54] Done in {elapsed:.1f}s " + f"(in={tokens.prompt_tokens}, out={tokens.completion_tokens})", + 
file=sys.stderr, flush=True, + ) + + try: + return json.loads(text) + except json.JSONDecodeError: + return {"error": "Failed to parse GPT response as JSON", "raw": text[:2000]} + + def _build_system_prompt( + self, + source_kind: str, + existing_entities: Optional[Dict[str, List[str]]] = None, + ) -> str: + entity_hint = "" + if existing_entities: + parts = [] + for label, names in existing_entities.items(): + sample = names[:30] + parts.append(f" {label}: {json.dumps(sample)}") + entity_hint = ( + "\n\nKnown entities already in the ontology (use these exact names when possible):\n" + + "\n".join(parts) + ) + + return f"""You are an expert process engineer analyzing industrial plant documentation. + +Your job is to extract structured facts from a {source_kind.upper()} source and return them as JSON. + +Extract ALL of the following when present: +1. Equipment facts: names, services, functions, media handled, operations performed, operating parameters with ranges/limits. +2. Tag/instrument facts: tag names, what they measure, units, process context. +3. Process media: fluids, gases, utilities, products flowing through the system. +4. Unit operations: what operations each piece of equipment performs (pumping, heating, mixing, filtration, etc.). +5. Chemical species: chemicals, additives, reactants, products mentioned. +6. Reactions: any chemical or physical transformations described. +7. Relationships: connections between equipment, tags, media, operations, and species. + +Rules: +- Use exact equipment/tag names from the source when possible. +- When a name matches a known entity, use that exact name. +- For operating parameters, extract numeric limits when available. +- Classify media as: utility, product, waste, solvent, or gas. +- Classify operations as: transfer, thermal, mixing, separation, cleaning, or reaction. +- Return valid JSON matching the schema described below. 
+{entity_hint} + +Response schema: +{{ + "equipment_facts": [{{ + "equipment_name": "string", + "service": "string", + "function": "string", + "media_handled": ["string"], + "operations_performed": ["string"], + "operating_parameters": [{{ + "parameter": "string", + "unit": "string", + "normal_low": number_or_null, + "normal_high": number_or_null, + "alarm_low": number_or_null, + "alarm_high": number_or_null, + "trip_low": number_or_null, + "trip_high": number_or_null + }}] + }}], + "tag_facts": [{{ + "tag_name": "string", + "measures": "string", + "process_context": "string", + "unit": "string" + }}], + "process_media": [{{ + "name": "string", + "category": "string", + "phase": "string", + "description": "string" + }}], + "unit_operations": [{{ + "name": "string", + "category": "string", + "description": "string" + }}], + "chemical_species": [{{ + "name": "string", + "category": "string", + "cas_number": "string", + "description": "string" + }}], + "reactions": [{{ + "name": "string", + "category": "string", + "description": "string", + "species_involved": ["string"] + }}], + "relationships": [{{ + "source_type": "string", + "source_name": "string", + "relationship": "string", + "target_type": "string", + "target_name": "string" + }}] +}}""" + + def _build_user_prompt(self, source_kind: str, source_path: str) -> str: + kind_labels = { + "pid": "P&ID (Piping and Instrumentation Diagram)", + "diagram": "engineering diagram", + "sop": "Standard Operating Procedure", + "procedure": "operating procedure", + "manual": "equipment manual", + } + label = kind_labels.get(source_kind, source_kind) + return ( + f"Analyze this {label} and extract all process facts.\n" + f"Source file: {source_path}\n" + f"Return a single JSON object with all extracted facts." 
+ ) diff --git a/scripts/graph_api.py b/scripts/graph_api.py index e3bff45..4f89175 100644 --- a/scripts/graph_api.py +++ b/scripts/graph_api.py @@ -79,6 +79,13 @@ class GraphAPI: "vendor": "mes", "agentrun": "anomaly", "anomalyevent": "anomaly", + # Process-semantic layer + "processmedium": "process", + "unitoperation": "process", + "operatingenvelope": "process", + "physicalprinciple": "process", + "chemicalspecies": "process", + "reaction": "process", } # Color palette for node types @@ -94,9 +101,40 @@ class GraphAPI: "overview": "#607D8B", "mes": "#00897B", "anomaly": "#F44336", + "process": "#00ACC1", "other": "#9E9E9E", } + # Schema-driven contract for the richer ontology. + # Maps label -> metadata for display, search, edit, and relationship rules. + NODE_LABEL_META = { + "AOI": {"key": "name", "display": "name", "searchable": ["name", "purpose", "description"], "group": "plc"}, + "Tag": {"key": "name", "display": "name", "searchable": ["name", "description"], "group": "plc"}, + "UDT": {"key": "name", "display": "name", "searchable": ["name", "purpose"], "group": "scada"}, + "Equipment": {"key": "name", "display": "name", "searchable": ["name", "purpose", "type"], "group": "scada"}, + "View": {"key": "name", "display": "name", "searchable": ["name", "purpose"], "group": "scada"}, + "ViewComponent": {"key": "path", "display": "name", "searchable": ["path", "name", "purpose"], "group": "scada-component"}, + "ScadaTag": {"key": "name", "display": "name", "searchable": ["name", "purpose"], "group": "scada-tag"}, + "Script": {"key": "name", "display": "name", "searchable": ["name", "purpose"], "group": "scada"}, + "NamedQuery": {"key": "name", "display": "name", "searchable": ["name", "purpose"], "group": "scada"}, + "FaultSymptom": {"key": "symptom", "display": "symptom", "searchable": ["symptom"], "group": "troubleshooting"}, + "FaultCause": {"key": "cause", "display": "cause", "searchable": ["cause"], "group": "troubleshooting"}, + "OperatorPhrase": {"key": 
"phrase", "display": "phrase", "searchable": ["phrase"], "group": "troubleshooting"}, + "Material": {"key": "name", "display": "name", "searchable": ["name"], "group": "mes"}, + "Batch": {"key": "name", "display": "name", "searchable": ["name"], "group": "mes"}, + "ProductionOrder": {"key": "name", "display": "name", "searchable": ["name"], "group": "mes"}, + "Operation": {"key": "name", "display": "name", "searchable": ["name"], "group": "mes"}, + "CriticalControlPoint": {"key": "name", "display": "name", "searchable": ["name"], "group": "mes"}, + "ProcessMedium": {"key": "name", "display": "name", "searchable": ["name", "description", "category"], "group": "process"}, + "UnitOperation": {"key": "name", "display": "name", "searchable": ["name", "description", "category"], "group": "process"}, + "OperatingEnvelope": {"key": "name", "display": "name", "searchable": ["name", "parameter"], "group": "process"}, + "PhysicalPrinciple": {"key": "name", "display": "name", "searchable": ["name", "category"], "group": "process"}, + "ChemicalSpecies": {"key": "name", "display": "name", "searchable": ["name", "category"], "group": "process"}, + "Reaction": {"key": "name", "display": "name", "searchable": ["name", "category"], "group": "process"}, + "AgentRun": {"key": "run_id", "display": "run_id", "searchable": ["run_id"], "group": "anomaly"}, + "AnomalyEvent": {"key": "event_id", "display": "summary", "searchable": ["summary", "event_id"], "group": "anomaly"}, + } + def __init__(self, graph: Optional[OntologyGraph] = None): self._graph = graph self._owns_graph = False @@ -150,7 +188,7 @@ def _format_edge(self, record: Dict) -> Dict: # ========================================================================= def load_graph( - self, node_types: Optional[List[str]] = None, limit: int = 500 + self, node_types: Optional[List[str]] = None, limit: int = 10000 ) -> Dict: """ Load graph data for visualization. 
@@ -822,6 +860,8 @@ def get_schema(self) -> Dict: "nodeTypes": labels, "relationshipTypes": sorted(relationships), "groups": list(set(self.NODE_GROUPS.values())), + "labelMeta": self.NODE_LABEL_META, + "groupColors": self.NODE_COLORS, } @@ -833,7 +873,7 @@ def main(): # Load graph load_parser = subparsers.add_parser("load", help="Load graph data") load_parser.add_argument("--types", nargs="*", help="Node types to include") - load_parser.add_argument("--limit", type=int, default=500, help="Max nodes") + load_parser.add_argument("--limit", type=int, default=10000, help="Max nodes") # Get neighbors neighbors_parser = subparsers.add_parser("neighbors", help="Get node neighbors") @@ -910,6 +950,15 @@ def main(): # Schema subparsers.add_parser("schema", help="Get graph schema") + # Ingest artifact + ingest_parser = subparsers.add_parser("ingest-artifact", help="Ingest a P&ID/SOP/diagram") + ingest_parser.add_argument("file_path", help="Path to the artifact file") + ingest_parser.add_argument( + "--source-kind", default="pid", + choices=["pid", "sop", "diagram"], + help="Source type (default: pid)", + ) + args = parser.parse_args() if not args.command: @@ -957,6 +1006,10 @@ def main(): result = api.apply_batch(changes) elif args.command == "schema": result = api.get_schema() + elif args.command == "ingest-artifact": + from artifact_ingest import ArtifactIngester + ingester = ArtifactIngester(graph=api._get_graph(), verbose=True) + result = ingester.ingest_file(args.file_path, args.source_kind) else: output_error(f"Unknown command: {args.command}") return diff --git a/scripts/incremental_analyzer.py b/scripts/incremental_analyzer.py index 85894ea..30bd8ba 100644 --- a/scripts/incremental_analyzer.py +++ b/scripts/incremental_analyzer.py @@ -844,8 +844,38 @@ def _build_batch_context( "hmi": item.get("hmi", ""), } + # Inject process-semantic context when available + if item_type in ("Equipment", "AOI", "ScadaTag"): + context["process_semantics"] = 
self._get_process_context_for_items( + item_type, items + ) + return context + def _get_process_context_for_items( + self, item_type: str, items: List[Dict] + ) -> Dict[str, Any]: + """Fetch process-semantic context (media, operations, envelopes) for items.""" + ctx: Dict[str, Any] = {} + for item in items: + name = item.get("name", "") + if not name: + continue + try: + if item_type == "Equipment": + pctx = self._graph.get_process_context_for_equipment(name) + elif item_type == "ScadaTag": + pctx = self._graph.get_process_context_for_tag(name) + elif item_type == "AOI": + pctx = self._graph.get_process_context_for_equipment(name) + else: + continue + if pctx and any(pctx.get(k) for k in pctx if k != "name"): + ctx[name] = pctx + except Exception: + pass + return ctx + def _get_system_prompt(self, item_type: str) -> str: """Get the system prompt for analyzing a specific item type.""" base = """You are an expert in industrial automation and SCADA systems, specializing in Ignition by Inductive Automation. 
@@ -998,6 +1028,27 @@ def _get_user_prompt( if ctx.get("udts"): parts.append(f"Displays UDTs: {ctx['udts']}") + # Add process-semantic context if available + process_ctx = context.get("process_semantics", {}).get(name, {}) + if process_ctx: + if process_ctx.get("media"): + parts.append(f"Handles media: {process_ctx['media']}") + if process_ctx.get("operations"): + parts.append(f"Performs operations: {process_ctx['operations']}") + if process_ctx.get("envelopes"): + envs = process_ctx["envelopes"] + env_strs = [] + for e in envs: + if isinstance(e, dict) and e.get("name"): + rng = f"{e.get('normal_low', '?')}–{e.get('normal_high', '?')} {e.get('unit', '')}" + env_strs.append(f"{e['parameter']}: {rng}") + if env_strs: + parts.append(f"Operating envelopes: {', '.join(env_strs)}") + if process_ctx.get("measures"): + parts.append(f"Measures: {process_ctx['measures']}") + if process_ctx.get("controlled_operations"): + parts.append(f"Controls operations: {process_ctx['controlled_operations']}") + parts.append("") parts.append( diff --git a/scripts/neo4j_ontology.py b/scripts/neo4j_ontology.py index 92f6258..f2a9efc 100644 --- a/scripts/neo4j_ontology.py +++ b/scripts/neo4j_ontology.py @@ -95,6 +95,29 @@ class OntologyGraph: - HAS_SCREEN: HMIDevice -> HMIScreen - HAS_TEXT_LIST: HMIDevice -> HMITextList - MONITORS_TAG: HMIAlarm -> PLCTag + + Process-Semantic Node Types: + - ProcessMedium: A material/utility stream (water, steam, product, etc.) + - UnitOperation: A canonical plant operation (pumping, heating, mixing, etc.) 
+ - OperatingEnvelope: Normal ranges, alarm bands, trip windows for a parameter + - PhysicalPrinciple: A measurable physical quantity (temperature, pressure, flow) + - ChemicalSpecies: A chemical substance involved in plant processes + - Reaction: A chemical or physical transformation step + + Process-Semantic Relationship Types: + - HANDLES_MEDIUM: Equipment -> ProcessMedium + - PERFORMS_OPERATION: Equipment -> UnitOperation + - HAS_OPERATING_ENVELOPE: Equipment -> OperatingEnvelope + - MEASURES: ScadaTag -> PhysicalPrinciple + - MONITORS_ENVELOPE: ScadaTag -> OperatingEnvelope + - IMPLEMENTS_CONTROL_OF: AOI -> UnitOperation + - USES_PRINCIPLE: UnitOperation -> PhysicalPrinciple + - INVOLVES_SPECIES: Reaction -> ChemicalSpecies + - PROCESSES_SPECIES: UnitOperation -> ChemicalSpecies + - HAS_REACTION: UnitOperation -> Reaction + - MEDIUM_CONTAINS: ProcessMedium -> ChemicalSpecies + - ENVELOPE_FOR_PRINCIPLE: OperatingEnvelope -> PhysicalPrinciple + - VISUALIZES: ViewComponent -> Equipment """ def __init__(self, config: Optional[Neo4jConfig] = None): @@ -200,6 +223,13 @@ def create_indexes(self) -> None: "CREATE INDEX anomalyevent_state IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.state)", "CREATE INDEX anomalyevent_severity IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.severity)", "CREATE INDEX anomalyevent_dedup_key IF NOT EXISTS FOR (e:AnomalyEvent) ON (e.dedup_key)", + # Process-semantic layer indexes + "CREATE INDEX processmedium_name IF NOT EXISTS FOR (pm:ProcessMedium) ON (pm.name)", + "CREATE INDEX unitoperation_name IF NOT EXISTS FOR (uo:UnitOperation) ON (uo.name)", + "CREATE INDEX operatingenvelope_name IF NOT EXISTS FOR (oe:OperatingEnvelope) ON (oe.name)", + "CREATE INDEX physicalprinciple_name IF NOT EXISTS FOR (pp:PhysicalPrinciple) ON (pp.name)", + "CREATE INDEX chemicalspecies_name IF NOT EXISTS FOR (cs:ChemicalSpecies) ON (cs.name)", + "CREATE INDEX reaction_name IF NOT EXISTS FOR (rx:Reaction) ON (rx.name)", ] for constraint in constraints: @@ -2990,6 
+3020,280 @@ def get_item_with_context( context = {k: v for k, v in dict(record).items() if k != "item"} return {"item": item_data, "context": context} + # ========================================================================= + # Process-Semantic Layer Write Helpers + # ========================================================================= + + def create_process_medium( + self, name: str, category: str = "", phase: str = "", + description: str = "", purpose: str = "", + evidence_json: str = "", + ) -> str: + """Create or merge a ProcessMedium node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:ProcessMedium {name: $name}) + SET n.category = COALESCE(n.category, $category), + n.phase = COALESCE(n.phase, $phase), + n.description = COALESCE(n.description, $description), + n.purpose = COALESCE(n.purpose, $purpose), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + """, + {"name": name, "category": category, "phase": phase, + "description": description, "purpose": purpose, "ev": evidence_json}, + ) + return name + + def create_unit_operation( + self, name: str, category: str = "", + description: str = "", purpose: str = "", + evidence_json: str = "", + ) -> str: + """Create or merge a UnitOperation node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:UnitOperation {name: $name}) + SET n.category = COALESCE(n.category, $category), + n.description = COALESCE(n.description, $description), + n.purpose = COALESCE(n.purpose, $purpose), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + """, + {"name": name, "category": category, + "description": description, "purpose": purpose, "ev": evidence_json}, + ) + return name + + def 
create_operating_envelope( + self, name: str, parameter: str = "", unit: str = "", + low_limit: float = None, low_warning: float = None, + normal_low: float = None, normal_high: float = None, + high_warning: float = None, high_limit: float = None, + trip_low: float = None, trip_high: float = None, + description: str = "", evidence_json: str = "", + ) -> str: + """Create or merge an OperatingEnvelope node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:OperatingEnvelope {name: $name}) + SET n.parameter = COALESCE(n.parameter, $parameter), + n.unit = COALESCE(n.unit, $unit), + n.description = COALESCE(n.description, $description), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + FOREACH (_ IN CASE WHEN $low_limit IS NOT NULL THEN [1] ELSE [] END | + SET n.low_limit = $low_limit) + FOREACH (_ IN CASE WHEN $low_warning IS NOT NULL THEN [1] ELSE [] END | + SET n.low_warning = $low_warning) + FOREACH (_ IN CASE WHEN $normal_low IS NOT NULL THEN [1] ELSE [] END | + SET n.normal_low = $normal_low) + FOREACH (_ IN CASE WHEN $normal_high IS NOT NULL THEN [1] ELSE [] END | + SET n.normal_high = $normal_high) + FOREACH (_ IN CASE WHEN $high_warning IS NOT NULL THEN [1] ELSE [] END | + SET n.high_warning = $high_warning) + FOREACH (_ IN CASE WHEN $high_limit IS NOT NULL THEN [1] ELSE [] END | + SET n.high_limit = $high_limit) + FOREACH (_ IN CASE WHEN $trip_low IS NOT NULL THEN [1] ELSE [] END | + SET n.trip_low = $trip_low) + FOREACH (_ IN CASE WHEN $trip_high IS NOT NULL THEN [1] ELSE [] END | + SET n.trip_high = $trip_high) + """, + {"name": name, "parameter": parameter, "unit": unit, + "description": description, "ev": evidence_json, + "low_limit": low_limit, "low_warning": low_warning, + "normal_low": normal_low, "normal_high": normal_high, + "high_warning": high_warning, "high_limit": high_limit, + 
"trip_low": trip_low, "trip_high": trip_high}, + ) + return name + + def create_physical_principle( + self, name: str, category: str = "", unit_family: str = "", + description: str = "", evidence_json: str = "", + ) -> str: + """Create or merge a PhysicalPrinciple node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:PhysicalPrinciple {name: $name}) + SET n.category = COALESCE(n.category, $category), + n.unit_family = COALESCE(n.unit_family, $unit_family), + n.description = COALESCE(n.description, $description), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + """, + {"name": name, "category": category, + "unit_family": unit_family, "description": description, + "ev": evidence_json}, + ) + return name + + def create_chemical_species( + self, name: str, category: str = "", cas_number: str = "", + molecular_formula: str = "", description: str = "", + evidence_json: str = "", + ) -> str: + """Create or merge a ChemicalSpecies node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:ChemicalSpecies {name: $name}) + SET n.category = COALESCE(n.category, $category), + n.cas_number = COALESCE(n.cas_number, $cas_number), + n.molecular_formula = COALESCE(n.molecular_formula, $molecular_formula), + n.description = COALESCE(n.description, $description), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + """, + {"name": name, "category": category, + "cas_number": cas_number, "molecular_formula": molecular_formula, + "description": description, "ev": evidence_json}, + ) + return name + + def create_reaction( + self, name: str, category: str = "", description: str = "", + conditions: str = "", evidence_json: str = "", + ) -> str: + """Create or merge a 
Reaction node with provenance.""" + with self.session() as session: + session.run( + """ + MERGE (n:Reaction {name: $name}) + SET n.category = COALESCE(n.category, $category), + n.description = COALESCE(n.description, $description), + n.conditions = COALESCE(n.conditions, $conditions), + n.evidence_items = CASE + WHEN $ev = '' THEN n.evidence_items + WHEN n.evidence_items IS NULL THEN $ev + ELSE n.evidence_items + $ev + END, + n.last_evidence_at = datetime() + """, + {"name": name, "category": category, + "description": description, "conditions": conditions, + "ev": evidence_json}, + ) + return name + + def create_process_relationship( + self, source_label: str, source_name: str, + target_label: str, target_name: str, + rel_type: str, evidence_json: str = "", + properties: dict = None, + ) -> bool: + """Create a process-semantic relationship with provenance. + + Only allows relationship types defined in PROCESS_RELATIONSHIPS. + Returns True if the relationship was created/updated. + """ + from process_semantics import PROCESS_RELATIONSHIPS + if rel_type not in PROCESS_RELATIONSHIPS: + return False + + prop_sets = "" + params = { + "src_name": source_name, + "tgt_name": target_name, + "ev": evidence_json, + } + if properties: + for k, v in properties.items(): + param_key = f"prop_{k}" + prop_sets += f", r.{k} = ${param_key}" + params[param_key] = v + + with self.session() as session: + session.run( + f""" + MATCH (src:{source_label} {{name: $src_name}}) + MATCH (tgt:{target_label} {{name: $tgt_name}}) + MERGE (src)-[r:{rel_type}]->(tgt) + SET r.evidence_items = CASE + WHEN $ev = '' THEN r.evidence_items + WHEN r.evidence_items IS NULL THEN $ev + ELSE r.evidence_items + $ev + END, + r.last_evidence_at = datetime(){prop_sets} + """, + params, + ) + return True + + def get_process_context_for_equipment(self, equipment_name: str) -> Dict: + """Get process-semantic context for an equipment node. 
+ + Returns media handled, operations performed, operating envelopes, + and connected tags with their physical principles. + """ + with self.session() as session: + result = session.run( + """ + MATCH (e:Equipment {name: $name}) + OPTIONAL MATCH (e)-[:HANDLES_MEDIUM]->(pm:ProcessMedium) + OPTIONAL MATCH (e)-[:PERFORMS_OPERATION]->(uo:UnitOperation) + OPTIONAL MATCH (e)-[:HAS_OPERATING_ENVELOPE]->(oe:OperatingEnvelope) + OPTIONAL MATCH (e)<-[:MAPS_TO_SCADA]-(a:AOI)-[:IMPLEMENTS_CONTROL_OF]->(uo2:UnitOperation) + RETURN e.name AS name, + collect(DISTINCT pm.name) AS media, + collect(DISTINCT uo.name) AS operations, + collect(DISTINCT {name: oe.name, parameter: oe.parameter, + normal_low: oe.normal_low, normal_high: oe.normal_high, + unit: oe.unit}) AS envelopes, + collect(DISTINCT uo2.name) AS controlled_operations + """, + {"name": equipment_name}, + ) + record = result.single() + if not record: + return {} + return dict(record) + + def get_process_context_for_tag(self, tag_name: str) -> Dict: + """Get process-semantic context for a SCADA tag.""" + with self.session() as session: + result = session.run( + """ + MATCH (t:ScadaTag {name: $name}) + OPTIONAL MATCH (t)-[:MEASURES]->(pp:PhysicalPrinciple) + OPTIONAL MATCH (t)-[:MONITORS_ENVELOPE]->(oe:OperatingEnvelope) + RETURN t.name AS name, + collect(DISTINCT pp.name) AS measures, + collect(DISTINCT {name: oe.name, parameter: oe.parameter, + normal_low: oe.normal_low, normal_high: oe.normal_high, + unit: oe.unit}) AS envelopes + """, + {"name": tag_name}, + ) + record = result.single() + if not record: + return {} + return dict(record) + # ========================================================================= # Query Operations # ========================================================================= diff --git a/scripts/process_semantics.py b/scripts/process_semantics.py new file mode 100644 index 0000000..efc1ca3 --- /dev/null +++ b/scripts/process_semantics.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" 
+Process-semantics layer for the PLC/SCADA ontology. + +Defines canonical node types for physics, chemistry, and operating constraints +that can be induced from both PLC/SCADA structure and external documents +(P&IDs, SOPs, engineering diagrams). + +Node types: + ProcessMedium, UnitOperation, OperatingEnvelope, + PhysicalPrinciple, ChemicalSpecies, Reaction + +All write helpers follow the same MERGE-based pattern as the existing ontology +and attach provenance metadata to every asserted fact. +""" + +import json +from typing import Dict, List, Optional, Any +from dataclasses import dataclass, field, asdict +from datetime import datetime + + +# ============================================================================ +# Provenance metadata contract +# ============================================================================ + +@dataclass +class EvidenceItem: + """Single piece of evidence supporting a graph fact.""" + source_file: str = "" + source_kind: str = "" # "pid", "sop", "diagram", "plc", "scada" + source_page: Optional[int] = None + source_region: str = "" # bounding-box or section id + source_excerpt: str = "" # verbatim snippet + extraction_model: str = "" # "gpt-5.4", "claude-sonnet", "deterministic" + extraction_method: str = "" # "vision", "text", "structured_parse" + confidence: float = 1.0 + extracted_at: str = "" + + def __post_init__(self): + if not self.extracted_at: + self.extracted_at = datetime.utcnow().isoformat() + + +def evidence_to_json(items: List[EvidenceItem]) -> str: + return json.dumps([asdict(e) for e in items]) + + +def merge_evidence(existing_json: Optional[str], new_items: List[EvidenceItem]) -> str: + """Append new evidence items to an existing JSON array (append-only).""" + existing: list = [] + if existing_json: + try: + existing = json.loads(existing_json) + except (json.JSONDecodeError, TypeError): + existing = [] + existing.extend([asdict(e) for e in new_items]) + return json.dumps(existing) + + +# 
============================================================================ +# Canonical process-semantic schemas +# ============================================================================ + +PROCESS_NODE_SCHEMAS: Dict[str, Dict[str, Any]] = { + "ProcessMedium": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "Steam", "CIP-Caustic", "Product-A" + "category": "str", # "utility", "product", "waste", "solvent", "gas" + "phase": "str", # "liquid", "gas", "solid", "mixed" + "description": "str", + "purpose": "str", + }, + "description": "A material or utility stream handled by plant equipment.", + }, + "UnitOperation": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "Pumping", "CIP", "Heating" + "category": "str", # "transfer", "thermal", "mixing", "separation", "cleaning", "reaction" + "description": "str", + "purpose": "str", + }, + "description": "A canonical plant operation such as pumping, mixing, or filtration.", + }, + "OperatingEnvelope": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "BR-500-001/Temperature" + "parameter": "str", # "temperature", "pressure", "flow", "level", "pH" + "unit": "str", # "degC", "bar", "L/min" + "low_limit": "float", + "low_warning": "float", + "normal_low": "float", + "normal_high": "float", + "high_warning": "float", + "high_limit": "float", + "trip_low": "float", + "trip_high": "float", + "description": "str", + }, + "description": "Normal ranges, alarm bands, and trip windows for a measured parameter.", + }, + "PhysicalPrinciple": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "Temperature", "Pressure", "Flow" + "category": "str", # "thermal", "fluid", "electrical", "mechanical", "analytical" + "unit_family": "str", # "temperature", "pressure", "volumetric_flow", etc. 
+ "description": "str", + }, + "description": "A measurable physical quantity relevant to process control.", + }, + "ChemicalSpecies": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "NaOH", "Ethanol", "Product-X" + "cas_number": "str", + "category": "str", # "reactant", "product", "byproduct", "additive", "cleaning_agent" + "molecular_formula": "str", + "description": "str", + }, + "description": "A specific chemical substance involved in plant processes.", + }, + "Reaction": { + "key_property": "name", + "properties": { + "name": "str", # e.g. "Neutralization-CIP", "Fermentation-Stage1" + "category": "str", # "neutralization", "fermentation", "oxidation", "blending", etc. + "description": "str", + "conditions": "str", # brief summary of required conditions + }, + "description": "A chemical or physical transformation step in the process.", + }, +} + +# Allowed relationship types for the process layer +PROCESS_RELATIONSHIPS: Dict[str, Dict[str, str]] = { + "HANDLES_MEDIUM": {"from": "Equipment", "to": "ProcessMedium"}, + "PERFORMS_OPERATION": {"from": "Equipment", "to": "UnitOperation"}, + "HAS_OPERATING_ENVELOPE":{"from": "Equipment", "to": "OperatingEnvelope"}, + "MEASURES": {"from": "ScadaTag", "to": "PhysicalPrinciple"}, + "MONITORS_ENVELOPE": {"from": "ScadaTag", "to": "OperatingEnvelope"}, + "IMPLEMENTS_CONTROL_OF": {"from": "AOI", "to": "UnitOperation"}, + "USES_PRINCIPLE": {"from": "UnitOperation", "to": "PhysicalPrinciple"}, + "INVOLVES_SPECIES": {"from": "Reaction", "to": "ChemicalSpecies"}, + "PROCESSES_SPECIES": {"from": "UnitOperation", "to": "ChemicalSpecies"}, + "HAS_REACTION": {"from": "UnitOperation", "to": "Reaction"}, + "MEDIUM_CONTAINS": {"from": "ProcessMedium", "to": "ChemicalSpecies"}, + "ENVELOPE_FOR_PRINCIPLE":{"from": "OperatingEnvelope", "to": "PhysicalPrinciple"}, + "VISUALIZES": {"from": "ViewComponent", "to": "Equipment"}, +} + + +# ============================================================================ +# 
Schema metadata for graph API / UI contract +# ============================================================================ + +PROCESS_LABEL_META: Dict[str, Dict[str, Any]] = { + label: { + "key_property": schema["key_property"], + "display_property": "name", + "searchable_properties": ["name", "description", "purpose"] if "purpose" in schema["properties"] else ["name", "description"], + "editable_properties": list(schema["properties"].keys()), + "group": "process", + "description": schema["description"], + } + for label, schema in PROCESS_NODE_SCHEMAS.items() +}