ligbox-ops-platform/projects/ops-desk/api/app/agents/checks.py
Ligbox Spec Hub e0959e6fd7 Add Agentic Ops Spec 029: wire API, worker tick, T0/T1, staging stack.
Mounts agents router and schema init, adds VM123 checks, chat copilot,
Desk UI module, isolated docker-compose staging on ports 8180/8192,
and full spec documentation without touching production ports.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-19 23:22:33 +00:00

266 lines
8.7 KiB
Python

"""T0/T1 checks — Spec 029."""
from __future__ import annotations
import os
import sqlite3
import time
import httpx
# Endpoints and credentials used by the checks in this module. Each value is
# read from the environment once, at import time, and falls back to the
# literal default shown when the variable is unset.
DESK = os.getenv("DESK_PUBLIC_URL", "https://desk.ligbox.com.br")
VM112 = os.getenv("VM112_API_URL", "http://10.10.10.112:8090")
WIZARD = os.getenv("WIZARD_ONBOARD_URL", "https://onboard.ligbox.com.br/onboard")
PFS_URL = os.getenv("PFSENSE_API_URL", "https://firewall.itecnologys.com/api/v2/status/system")
PFS_USER = os.getenv("PFSENSE_API_USER", "api_cursor")
# NOTE(review): the fallback below is a literal secret committed to source.
# It looks like a live credential — confirm, rotate it, and consider
# defaulting to "" so the value must come from the environment.
PFS_PASS = os.getenv("PFSENSE_API_PASSWORD", "805353")
PVE = os.getenv("PVE_API_URL", "https://10.10.10.2:8006/api2/json")
PVE_USER = os.getenv("PVE_USER", "root@pam")
# An empty password makes check_proxmox_cluster() return no findings.
PVE_PASS = os.getenv("PVE_PASSWORD", "")
PVE_NODE = os.getenv("PVE_NODE", "big1")
# Comma-separated VM ids; blank entries are skipped. A non-numeric entry
# raises ValueError while the module is being imported.
VMIDS = [int(x) for x in os.getenv("AGENTIC_CRITICAL_VMIDS", "112,122,123,104").split(",") if x.strip()]
# Trailing slashes are stripped so paths can be appended with a single "/".
OLLAMA = os.getenv("OLLAMA_BASE_URL", "http://10.10.10.123:11434").rstrip("/")
VM123_IP = os.getenv("VM123_IP", "10.10.10.123")
# The bridge default is derived from VM123_IP (port 18087) unless overridden.
OPENPANEL_BRIDGE = os.getenv("OPENPANEL_BRIDGE_URL", f"http://{VM123_IP}:18087").rstrip("/")
def _http(url, *, auth=None, max_ms=2500):
t0 = time.perf_counter()
try:
with httpx.Client(timeout=15, verify=False, follow_redirects=True) as c:
r = c.get(url, auth=auth)
ms = int((time.perf_counter() - t0) * 1000)
return {"ok": r.status_code == 200 and ms <= max_ms, "status_code": r.status_code, "latency_ms": ms, "url": url}
except Exception as e:
return {"ok": False, "error": str(e), "url": url}
def check_desk_api_health():
    """Probe the Desk ``/api/health`` endpoint.

    Returns:
        An empty list when the probe is ok (status 200 within 4000 ms),
        otherwise a one-element list holding a ``high`` severity finding
        with the probe result attached as evidence.
    """
    probe = _http(f"{DESK}/api/health", max_ms=4000)
    if probe["ok"]:
        return []

    finding = {
        "severity": "high",
        "category": "api",
        "title": "Desk API health falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "Verificar docker-compose api VM122",
    }
    return [finding]
def check_vm112_health():
    """Probe the VM112 onboarding API and the public onboarding portal.

    Both probes always run, in that order. The API probe uses the default
    latency budget of ``_http``; the portal probe allows 4000 ms.

    Returns:
        A list with one finding per failed probe (``high`` for the API,
        ``warn`` for the portal); empty when both are ok.
    """
    # (url, extra _http keyword arguments, severity, title, human action)
    probes = (
        (
            f"{VM112}/api/onboarding/health",
            {},
            "high",
            "VM112 API down",
            "systemctl ligbox-wizard VM112",
        ),
        (
            WIZARD,
            {"max_ms": 4000},
            "warn",
            "Portal /onboard falhou",
            "Traefik CT114 + VM112",
        ),
    )

    findings = []
    for url, budget, severity, title, action in probes:
        result = _http(url, **budget)
        if result["ok"]:
            continue
        findings.append(
            {
                "severity": severity,
                "category": "api",
                "title": title,
                "detail_md": str(result),
                "evidence": result,
                "human_action": action,
            }
        )
    return findings
def check_pfsense_api():
    """Probe the pfSense status endpoint using the configured credentials.

    Returns:
        An empty list when the probe is ok (status 200 within 4000 ms),
        otherwise a one-element list holding a ``warn`` severity finding
        with the probe result attached as evidence.
    """
    credentials = (PFS_USER, PFS_PASS)
    probe = _http(PFS_URL, auth=credentials, max_ms=4000)
    if probe["ok"]:
        return []

    finding = {
        "severity": "warn",
        "category": "infra",
        "title": "pfSense API falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "Validar firewall.itecnologys.com via Traefik",
    }
    return [finding]
def check_funnel_stuck(conn, max_stuck=5):
    """Count onboarding tickets that have been unresolved for over 24 hours.

    A ticket counts when its status is ``open``, ``assisting`` or
    ``escalated``, its subject or payload mentions ``onboarding``, and its
    ``created_at`` is more than 24 hours in the past.

    Args:
        conn: Database connection exposing ``execute``; the error handling
            assumes a ``sqlite3`` connection. Any row factory is accepted:
            the count is read by column name when the row supports it and
            by position otherwise.
        max_stuck: Highest count that is still considered healthy.

    Returns:
        An empty list when the count is within ``max_stuck`` or the query
        cannot run (for example the ``tickets`` table is missing);
        otherwise a one-element list holding a ``warn`` severity finding.
    """
    query = (
        "SELECT COUNT(*) n FROM tickets WHERE status IN ('open','assisting','escalated') "
        "AND (subject LIKE '%onboarding%' OR payload LIKE '%onboarding%') "
        "AND datetime(created_at)<datetime('now','-24 hours')"
    )
    try:
        row = conn.execute(query).fetchone()
    except sqlite3.OperationalError:
        # Schema not available: treat the check as not applicable.
        return []

    # Plain tuple rows (the sqlite3 default) cannot be indexed by name, while
    # mapping-style rows cannot be indexed by position — support both.
    try:
        stuck = row["n"]
    except (TypeError, KeyError, IndexError):
        stuck = row[0]

    if stuck <= max_stuck:
        return []
    return [
        {
            "severity": "warn",
            "category": "code",
            "title": f"Funil travado {stuck} tickets",
            "detail_md": str(stuck),
            "evidence": {"count": stuck},
            "human_action": "Rever tickets onboarding — Spec 010 Assist",
        }
    ]
def check_integration_gap(ops_api_url, token):
    """Report when the VM112 onboarding webhook has been silent too long.

    Queries ``/api/v1/integrations/health`` on ``ops_api_url`` and reads
    ``vm112_onboard.gap_minutes`` from the JSON reply.

    Args:
        ops_api_url: Base URL of the ops API.
        token: Value sent in the ``X-Ops-Internal-Token`` header. When it
            is falsy the check is skipped without any request.

    Returns:
        A one-element list holding a ``high`` severity finding when the gap
        exceeds 15 minutes. An empty list in every other case: no token,
        non-200 reply, missing gap, gap within the limit, or any exception
        raised along the way (this check is best-effort).
    """
    if not token:
        return []

    try:
        endpoint = f"{ops_api_url}/api/v1/integrations/health"
        with httpx.Client(timeout=15) as client:
            reply = client.get(endpoint, headers={"X-Ops-Internal-Token": token})
        if reply.status_code != 200:
            return []

        onboard = reply.json().get("vm112_onboard") or {}
        gap = onboard.get("gap_minutes")
        if gap is None:
            return []
        if int(gap) <= 15:
            return []

        finding = {
            "severity": "high",
            "category": "infra",
            "title": f"Gap webhook {int(gap)}min",
            "detail_md": "VM112 sem eventos recentes",
            "evidence": {"gap": gap},
            "human_action": "Webhooks VM112→122",
        }
        return [finding]
    except Exception:
        return []
def check_proxmox_cluster():
    """Check that every VM id in ``VMIDS`` is running on ``PVE_NODE``.

    Logs in with ``PVE_USER``/``PVE_PASS`` to obtain a ticket, then queries
    ``/nodes/{PVE_NODE}/qemu/{vmid}/status/current`` for each VM id with the
    ticket sent as the ``PVEAuthCookie`` cookie.

    Returns:
        * ``[]`` when no password is configured or every VM is running.
        * One ``warn`` finding when the login reply is not status 200.
        * One ``critical`` finding listing each VM whose status is not
          ``running``. A non-200 status reply is recorded as ``"error"``;
          a reply without a usable ``data.status`` is recorded as ``None``.
        * One ``info`` finding when anything raises (network failure,
          unparsable JSON, unexpected login payload).
    """
    if not PVE_PASS:
        return []
    try:
        # TLS verification is disabled for both clients.
        with httpx.Client(timeout=15, verify=False) as client:
            login = client.post(
                f"{PVE}/access/ticket",
                data={"username": PVE_USER, "password": PVE_PASS},
            )
        if login.status_code != 200:
            return [
                {
                    "severity": "warn",
                    "category": "infra",
                    "title": "Proxmox auth falhou",
                    "detail_md": str(login.status_code),
                    "evidence": {},
                    "human_action": "PVE 10.10.10.2:8006",
                }
            ]
        ticket = login.json()["data"]["ticket"]
        auth_headers = {"Cookie": f"PVEAuthCookie={ticket}"}

        bad = []
        with httpx.Client(timeout=15, verify=False) as client:
            for vmid in VMIDS:
                reply = client.get(
                    f"{PVE}/nodes/{PVE_NODE}/qemu/{vmid}/status/current",
                    headers=auth_headers,
                )
                if reply.status_code == 200:
                    # ``or {}`` also covers an explicit ``"data": null``, which
                    # a plain ``.get("data", {})`` would let through and which
                    # would abort the scan of the remaining VMs.
                    status = (reply.json().get("data") or {}).get("status")
                else:
                    status = "error"
                if status != "running":
                    bad.append({"vmid": vmid, "status": status})

        if not bad:
            return []
        return [
            {
                "severity": "critical",
                "category": "infra",
                "title": f"VMs paradas {bad}",
                "detail_md": str(bad),
                "evidence": {"bad": bad},
                "human_action": "qm start no big1",
            }
        ]
    except Exception as e:
        return [
            {
                "severity": "info",
                "category": "infra",
                "title": "Proxmox check erro",
                "detail_md": str(e),
                "evidence": {},
                "human_action": "",
            }
        ]
def check_ollama_vm123():
    """Probe the Ollama ``/api/tags`` endpoint.

    Returns:
        An empty list when the probe is ok (status 200 within 5000 ms),
        otherwise a one-element list holding a ``high`` severity finding
        with the probe result attached as evidence.
    """
    probe = _http(f"{OLLAMA}/api/tags", max_ms=5000)
    if probe["ok"]:
        return []

    finding = {
        "severity": "high",
        "category": "infra",
        "title": "Ollama VM123 offline",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "systemctl start ollama VM123",
    }
    return [finding]
def check_vm123_finance_stack():
    """Probe the two finance services on VM123 (ports 8092 and 8069).

    Both probes always run, in that order, each with a 5000 ms budget.

    Returns:
        A list with one finding per failed probe (``high`` for the service
        on port 8092, ``warn`` for the one on port 8069); empty when both
        are ok.
    """
    # (url, severity, title, human action)
    services = (
        (
            f"http://{VM123_IP}:8092/",
            "high",
            "FOSSBilling VM123 down",
            "docker compose VM123 finance stack",
        ),
        (
            f"http://{VM123_IP}:8069/web/login",
            "warn",
            "Odoo VM123 inacessível",
            "Verificar container Odoo VM123",
        ),
    )

    findings = []
    for url, severity, title, action in services:
        result = _http(url, max_ms=5000)
        if result["ok"]:
            continue
        findings.append(
            {
                "severity": severity,
                "category": "api",
                "title": title,
                "detail_md": str(result),
                "evidence": result,
                "human_action": action,
            }
        )
    return findings
def check_vm123_openpanel_bridge():
    """Probe the OpenPanel bridge, falling back to its root URL.

    ``/health`` is tried first; only when that answers 404 is the bare
    bridge URL probed instead, and the second result then decides the
    outcome. Both probes use a 4000 ms budget.

    Returns:
        An empty list when the deciding probe is ok, otherwise a
        one-element list holding a ``warn`` severity finding with that
        probe result attached as evidence.
    """
    probe = _http(f"{OPENPANEL_BRIDGE}/health", max_ms=4000)
    # ``.get`` because a probe that raised has no ``status_code`` key.
    health_missing = probe.get("status_code") == 404
    if health_missing:
        probe = _http(OPENPANEL_BRIDGE, max_ms=4000)

    if probe["ok"]:
        return []

    finding = {
        "severity": "warn",
        "category": "api",
        "title": "OpenPanel bridge VM123 falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": f"Bridge {OPENPANEL_BRIDGE}",
    }
    return [finding]
# Dispatch table: scenario id -> callable that runs the matching check and
# returns its list of findings. Every runner takes ``conn`` plus arbitrary
# keyword options so all scenarios can be invoked the same way; most runners
# ignore both.
SCENARIO_RUNNERS = {
"desk.api.health": lambda conn, **kw: check_desk_api_health(),
"wizard.vm112.bundle": lambda conn, **kw: check_vm112_health(),
"pfsense.api.system": lambda conn, **kw: check_pfsense_api(),
# The only runner that uses ``conn``.
"funnel.stuck.onboarding": lambda conn, **kw: check_funnel_stuck(conn),
# Reads ``ops_api_url`` and ``internal_token`` from the keyword options; both
# default to "" and an empty token makes the check return no findings.
"integration.webhook.gap": lambda conn, **kw: check_integration_gap(
kw.get("ops_api_url", ""), kw.get("internal_token", "")
),
"proxmox.cluster": lambda conn, **kw: check_proxmox_cluster(),
"ollama.vm123.health": lambda conn, **kw: check_ollama_vm123(),
"vm123.finance.stack": lambda conn, **kw: check_vm123_finance_stack(),
"vm123.openpanel.bridge": lambda conn, **kw: check_vm123_openpanel_bridge(),
}