95 lines
5.4 KiB
Python
95 lines
5.4 KiB
Python
"""T0/T1 checks — Spec 029."""
|
|
from __future__ import annotations
|
|
import os, sqlite3, time
|
|
import httpx
|
|
|
|
# --- Endpoints and credentials ---------------------------------------------
# Each setting is read from the environment once, at import time; the second
# argument is the fallback used when the variable is unset.

# Base URL used for the Desk API health probe.
DESK = os.environ.get("DESK_PUBLIC_URL", "https://desk.ligbox.com.br")

# Base URL of the VM112 API and full URL of the onboarding portal page.
VM112 = os.environ.get("VM112_API_URL", "http://10.10.10.112:8090")
WIZARD = os.environ.get(
    "WIZARD_ONBOARD_URL", "https://onboard.ligbox.com.br/onboard"
)

# pfSense status endpoint and the user/password passed as the request `auth`.
PFS_URL = os.environ.get(
    "PFSENSE_API_URL", "https://firewall.itecnologys.com/api/v2/status/system"
)
PFS_USER = os.environ.get("PFSENSE_API_USER", "api_cursor")
# NOTE(review): the fallback below is a literal password embedded in source.
# Consider requiring PFSENSE_API_PASSWORD to be set and rotating this value.
PFS_PASS = os.environ.get("PFSENSE_API_PASSWORD", "805353")

# Proxmox API root, login credentials and the node whose VMs are inspected.
PVE = os.environ.get("PVE_API_URL", "https://10.10.10.2:8006/api2/json")
PVE_USER = os.environ.get("PVE_USER", "root@pam")
# NOTE(review): the fallback below is a literal password embedded in source.
# Consider requiring PVE_PASSWORD to be set and rotating this value.
PVE_PASS = os.environ.get("PVE_PASSWORD", "@betinplace")
PVE_NODE = os.environ.get("PVE_NODE", "big1")

# Comma-separated VM ids that must be running; blank entries are ignored.
VMIDS = [
    int(item)
    for item in os.environ.get(
        "AGENTIC_CRITICAL_VMIDS", "112,122,123,104"
    ).split(",")
    if item.strip()
]

# Ollama base URL, normalised to carry no trailing slash.
OLLAMA = os.environ.get(
    "OLLAMA_BASE_URL", "http://10.10.10.123:11434"
).rstrip("/")
|
|
|
|
def _http(url, *, auth=None, max_ms=2500):
    """Issue a GET to *url* and summarise the outcome as a dict.

    Args:
        url: Address to request.
        auth: Optional value forwarded as the ``auth`` argument of the GET.
        max_ms: Latency budget in milliseconds; a slower response is not ok.

    Returns:
        On a completed request: ``ok`` (status is 200 and latency is within
        ``max_ms``), ``status_code``, ``latency_ms`` and ``url``.
        On any exception: ``ok`` set to False, ``error`` (the exception
        text) and ``url``.
    """
    started = time.perf_counter()
    try:
        # NOTE(review): verify=False turns off TLS certificate verification
        # for every probe, including the ones that send credentials.
        with httpx.Client(
            timeout=15, verify=False, follow_redirects=True
        ) as client:
            response = client.get(url, auth=auth)
            elapsed_ms = int((time.perf_counter() - started) * 1000)
            healthy = response.status_code == 200 and elapsed_ms <= max_ms
            return {
                "ok": healthy,
                "status_code": response.status_code,
                "latency_ms": elapsed_ms,
                "url": url,
            }
    except Exception as exc:
        return {"ok": False, "error": str(exc), "url": url}
|
|
|
|
def check_desk_api_health():
    """Probe ``{DESK}/api/health`` and report a finding when it is not ok.

    Returns:
        An empty list when the probe is ok; otherwise a list holding one
        ``high``-severity finding with the probe result as evidence.
    """
    probe = _http(f"{DESK}/api/health")
    if probe["ok"]:
        return []
    finding = {
        "severity": "high",
        "category": "api",
        "title": "Desk API health falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "docker-compose logs api VM122",
    }
    return [finding]
|
|
|
|
def check_vm112_health():
    """Probe the VM112 onboarding API and the onboarding portal page.

    Returns:
        A list with zero, one or two findings: a ``high`` finding when the
        API health probe is not ok, and a ``warn`` finding when the portal
        probe (4000 ms latency budget) is not ok.
    """
    findings = []

    api_probe = _http(f"{VM112}/api/onboarding/health")
    if not api_probe["ok"]:
        findings.append(
            {
                "severity": "high",
                "category": "api",
                "title": "VM112 API down",
                "detail_md": str(api_probe),
                "evidence": api_probe,
                "human_action": "systemctl ligbox-wizard VM112",
            }
        )

    portal_probe = _http(WIZARD, max_ms=4000)
    if not portal_probe["ok"]:
        findings.append(
            {
                "severity": "warn",
                "category": "api",
                "title": "Portal /onboard falhou",
                "detail_md": str(portal_probe),
                "evidence": portal_probe,
                "human_action": "Traefik + VM112",
            }
        )

    return findings
|
|
|
|
def check_pfsense_api():
    """Probe the pfSense status endpoint using the configured credentials.

    Returns:
        An empty list when the probe is ok (4000 ms latency budget);
        otherwise a list holding one ``warn``-severity finding.
    """
    credentials = (PFS_USER, PFS_PASS)
    probe = _http(PFS_URL, auth=credentials, max_ms=4000)
    if probe["ok"]:
        return []
    finding = {
        "severity": "warn",
        "category": "infra",
        "title": "pfSense API falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "firewall.itecnologys.com",
    }
    return [finding]
|
|
|
|
def check_funnel_stuck(conn, max_stuck=5):
    """Count onboarding tickets stuck for more than 24 hours.

    A ticket is counted when its status is ``open``, ``assisting`` or
    ``escalated``, its subject or payload contains "onboarding", and its
    ``created_at`` is more than 24 hours before now.

    Args:
        conn: Open SQLite connection. Any row factory is accepted: rows
            addressable by column name (e.g. ``sqlite3.Row``) as well as
            the default plain tuples.
        max_stuck: Highest count that is still tolerated without a finding.

    Returns:
        An empty list when the count is at most ``max_stuck`` or when the
        query fails with ``sqlite3.OperationalError`` (for example, the
        ``tickets`` table does not exist); otherwise a list holding one
        ``warn``-severity finding.
    """
    query = (
        "SELECT COUNT(*) n FROM tickets "
        "WHERE status IN ('open','assisting','escalated') "
        "AND (subject LIKE '%onboarding%' OR payload LIKE '%onboarding%') "
        "AND datetime(created_at)<datetime('now','-24 hours')"
    )
    try:
        row = conn.execute(query).fetchone()
    except sqlite3.OperationalError:
        # Best-effort check: a missing table/column means nothing to report.
        return []

    # Default connections yield tuples, which cannot be indexed by name;
    # fall back to the first column instead of crashing the whole check.
    try:
        stuck = row["n"]
    except (TypeError, IndexError, KeyError):
        stuck = row[0]

    if stuck <= max_stuck:
        return []
    return [
        {
            "severity": "warn",
            "category": "code",
            "title": f"Funil travado {stuck} tickets",
            "detail_md": str(stuck),
            "evidence": {"count": stuck},
            "human_action": "ASM Spec 010",
        }
    ]
|
|
|
|
def check_integration_gap(ops_api_url, token):
    """Report a finding when the VM112 onboarding webhook gap exceeds 15 min.

    Requests ``{ops_api_url}/api/v1/integrations/health`` with the internal
    token header and reads ``vm112_onboard.gap_minutes`` from the JSON body.

    Args:
        ops_api_url: Base URL of the ops API.
        token: Value sent in the ``X-Ops-Internal-Token`` header. When it is
            falsy the check is skipped and no request is made.

    Returns:
        A list holding one ``high``-severity finding when ``gap_minutes``
        (truncated to an int) is greater than 15. An empty list in every
        other case: no token, non-200 response, missing gap value, or any
        error while requesting/parsing. The check stays best-effort, but
        failures are logged so a broken check is distinguishable from a
        healthy integration.
    """
    import logging  # local import keeps this block self-contained

    if not token:
        return []

    log = logging.getLogger(__name__)
    try:
        with httpx.Client(timeout=15) as client:
            resp = client.get(
                f"{ops_api_url}/api/v1/integrations/health",
                headers={"X-Ops-Internal-Token": token},
            )
            if resp.status_code != 200:
                log.warning(
                    "integration gap check skipped: HTTP %s from ops API",
                    resp.status_code,
                )
                return []
            # `or {}` guards against a present-but-null "vm112_onboard".
            gap = (resp.json().get("vm112_onboard") or {}).get("gap_minutes")
            if gap is None or int(gap) <= 15:
                return []
            return [
                {
                    "severity": "high",
                    "category": "infra",
                    "title": f"Gap webhook {int(gap)}min",
                    "detail_md": "VM112 sem eventos",
                    "evidence": {"gap": gap},
                    "human_action": "Webhooks VM112→122",
                }
            ]
    except Exception as exc:
        # Deliberately best-effort: never raise to the caller, but leave a
        # trace instead of swallowing the failure silently.
        log.warning("integration gap check failed: %s", exc)
        return []
|
|
|
|
def check_proxmox_cluster():
    """Log in to the Proxmox API and verify that every critical VM is running.

    Posts ``PVE_USER``/``PVE_PASS`` to ``{PVE}/access/ticket``, then queries
    ``status/current`` for each id in ``VMIDS`` on node ``PVE_NODE``, sending
    the obtained ticket as the ``PVEAuthCookie`` cookie.

    Returns:
        * ``[]`` when every VM reports status ``running``.
        * One ``warn`` finding when the login response is not HTTP 200.
        * One ``critical`` finding listing each VM whose status is not
          ``running``. A non-200 status response is recorded as ``"error"``;
          a 200 response without a status value is recorded as ``None``.
        * One ``info`` finding when any exception is raised along the way.
    """
    try:
        # One client for login and status calls so the connection is reused.
        # NOTE(review): verify=False sends the password without TLS
        # certificate verification.
        with httpx.Client(timeout=15, verify=False) as client:
            login = client.post(
                f"{PVE}/access/ticket",
                data={"username": PVE_USER, "password": PVE_PASS},
            )
            if login.status_code != 200:
                return [
                    {
                        "severity": "warn",
                        "category": "infra",
                        "title": "Proxmox auth falhou",
                        "detail_md": str(login.status_code),
                        "evidence": {},
                        "human_action": "PVE 10.10.10.2:8006",
                    }
                ]
            ticket = login.json()["data"]["ticket"]
            auth_headers = {"Cookie": f"PVEAuthCookie={ticket}"}

            bad = []
            for vmid in VMIDS:
                resp = client.get(
                    f"{PVE}/nodes/{PVE_NODE}/qemu/{vmid}/status/current",
                    headers=auth_headers,
                )
                if resp.status_code == 200:
                    # `or {}` also covers a body whose "data" is null, which
                    # would otherwise abort the check for all remaining VMs.
                    status = (resp.json().get("data") or {}).get("status")
                else:
                    status = "error"
                if status != "running":
                    bad.append({"vmid": vmid, "status": status})

        if not bad:
            return []
        return [
            {
                "severity": "critical",
                "category": "infra",
                "title": f"VMs paradas {bad}",
                "detail_md": str(bad),
                "evidence": {"bad": bad},
                "human_action": "qm start no big1",
            }
        ]
    except Exception as e:
        return [
            {
                "severity": "info",
                "category": "infra",
                "title": "Proxmox check erro",
                "detail_md": str(e),
                "evidence": {},
                "human_action": "",
            }
        ]
|
|
|
|
def check_ollama_vm123():
    """Probe ``{OLLAMA}/api/tags`` and report a finding when it is not ok.

    Returns:
        An empty list when the probe is ok (5000 ms latency budget);
        otherwise a list holding one ``high``-severity finding.
    """
    probe = _http(f"{OLLAMA}/api/tags", max_ms=5000)
    if probe["ok"]:
        return []
    finding = {
        "severity": "high",
        "category": "infra",
        "title": "Ollama VM123 offline",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "systemctl start ollama VM123",
    }
    return [finding]
|
|
|
|
# Scenario id -> runner. Every runner is called as ``runner(conn, **kwargs)``
# and returns that check's list of findings. Lambdas are used so the target
# function is looked up at call time, and so checks that need neither the
# connection nor the keyword options can simply ignore them.
SCENARIO_RUNNERS = {
    # HTTP probes: no connection or options needed.
    "desk.api.health": lambda conn, **opts: check_desk_api_health(),
    "wizard.vm112.bundle": lambda conn, **opts: check_vm112_health(),
    "pfsense.api.system": lambda conn, **opts: check_pfsense_api(),
    # Database-backed check: uses the connection only.
    "funnel.stuck.onboarding": lambda conn, **opts: check_funnel_stuck(conn),
    # Uses the `ops_api_url` / `internal_token` options (empty when absent).
    "integration.webhook.gap": lambda conn, **opts: check_integration_gap(
        opts.get("ops_api_url", ""),
        opts.get("internal_token", ""),
    ),
    # Infrastructure probes: no connection or options needed.
    "proxmox.cluster": lambda conn, **opts: check_proxmox_cluster(),
    "ollama.vm123.health": lambda conn, **opts: check_ollama_vm123(),
}
|