Specs stay at repo root (cross-VM). Move deploy and code into logical projects with README per domain, updated manifest.yaml, and symlinks at legacy paths for VM122 backward compatibility.
95 lines
5.4 KiB
Python
95 lines
5.4 KiB
Python
"""T0/T1 checks — Spec 029."""
|
|
from __future__ import annotations
|
|
import os, sqlite3, time
|
|
import httpx
|
|
|
|
# Probe targets and credentials. Each name is read from the environment
# variable shown in the lookup; the literal is only the fallback used when
# that variable is unset.

# Public / internal HTTP endpoints.
DESK = os.environ.get("DESK_PUBLIC_URL", "https://desk.ligbox.com.br")
VM112 = os.environ.get("VM112_API_URL", "http://10.10.10.112:8090")
WIZARD = os.environ.get("WIZARD_ONBOARD_URL", "https://onboard.ligbox.com.br/onboard")

# pfSense status endpoint and the basic-auth pair sent with it.
PFS_URL = os.environ.get("PFSENSE_API_URL", "https://firewall.itecnologys.com/api/v2/status/system")
PFS_USER = os.environ.get("PFSENSE_API_USER", "api_cursor")
# NOTE(review): a password literal is committed as the fallback value;
# consider rotating it and requiring PFSENSE_API_PASSWORD to be set.
PFS_PASS = os.environ.get("PFSENSE_API_PASSWORD", "805353")

# Proxmox API base URL, login and node name.
PVE = os.environ.get("PVE_API_URL", "https://10.10.10.2:8006/api2/json")
PVE_USER = os.environ.get("PVE_USER", "root@pam")
# NOTE(review): same concern as PFS_PASS; this fallback is a password for
# the account named in PVE_USER.
PVE_PASS = os.environ.get("PVE_PASSWORD", "@betinplace")
PVE_NODE = os.environ.get("PVE_NODE", "big1")

# Comma-separated VM ids; blank entries are dropped, every other entry must
# parse as an integer or the import fails with ValueError.
VMIDS = list(
    map(
        int,
        filter(str.strip, os.environ.get("AGENTIC_CRITICAL_VMIDS", "112,122,123,104").split(",")),
    )
)

# Trailing slashes are removed so paths can be appended with a single "/".
OLLAMA = os.environ.get("OLLAMA_BASE_URL", "http://10.10.10.123:11434").rstrip("/")
|
|
|
|
def _http(url, *, auth=None, max_ms=2500):
|
|
t0 = time.perf_counter()
|
|
try:
|
|
with httpx.Client(timeout=15, verify=False, follow_redirects=True) as c:
|
|
r = c.get(url, auth=auth)
|
|
ms = int((time.perf_counter()-t0)*1000)
|
|
return {"ok": r.status_code==200 and ms<=max_ms, "status_code": r.status_code, "latency_ms": ms, "url": url}
|
|
except Exception as e:
|
|
return {"ok": False, "error": str(e), "url": url}
|
|
|
|
def check_desk_api_health():
    """Probe the Desk ``/api/health`` endpoint.

    Returns:
        An empty list when the probe is ok, otherwise a single-element list
        with a ``high`` severity ``api`` finding that carries the raw probe
        result as evidence.
    """
    probe = _http(f"{DESK}/api/health")
    if probe["ok"]:
        return []
    finding = {
        "severity": "high",
        "category": "api",
        "title": "Desk API health falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "docker-compose logs api VM122",
    }
    return [finding]
|
|
|
|
def check_vm112_health():
    """Probe the VM112 onboarding API and the public onboarding portal.

    Both probes always run, in that order. The API probe uses the default
    latency budget of ``_http``; the portal probe allows 4000 ms.

    Returns:
        A list with zero, one or two findings: ``high`` for the API,
        ``warn`` for the portal.
    """
    probes = (
        (f"{VM112}/api/onboarding/health", {}, "high", "VM112 API down", "systemctl ligbox-wizard VM112"),
        (WIZARD, {"max_ms": 4000}, "warn", "Portal /onboard falhou", "Traefik + VM112"),
    )
    findings = []
    for target, options, severity, title, action in probes:
        result = _http(target, **options)
        if result["ok"]:
            continue
        findings.append(
            {
                "severity": severity,
                "category": "api",
                "title": title,
                "detail_md": str(result),
                "evidence": result,
                "human_action": action,
            }
        )
    return findings
|
|
|
|
def check_pfsense_api():
    """Probe the pfSense status endpoint using the configured credentials.

    The request is allowed 4000 ms before it counts as failed.

    Returns:
        An empty list when the probe is ok, otherwise a single-element list
        with a ``warn`` severity ``infra`` finding.
    """
    credentials = (PFS_USER, PFS_PASS)
    probe = _http(PFS_URL, auth=credentials, max_ms=4000)
    if probe["ok"]:
        return []
    finding = {
        "severity": "warn",
        "category": "infra",
        "title": "pfSense API falhou",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "firewall.itecnologys.com",
    }
    return [finding]
|
|
|
|
def check_funnel_stuck(conn, max_stuck=5, max_age_hours=24):
    """Count onboarding tickets that have been unresolved for too long.

    A ticket counts when its status is ``open``, ``assisting`` or
    ``escalated``, its subject or payload contains ``onboarding``, and its
    ``created_at`` is older than ``max_age_hours``.

    Args:
        conn: SQLite connection. Works with the default tuple rows as well
            as with name-addressable rows such as ``sqlite3.Row``.
        max_stuck: Highest count that is still tolerated.
        max_age_hours: Age threshold in hours (24 by default).

    Returns:
        An empty list when the count is within ``max_stuck`` or when the
        query fails with ``sqlite3.OperationalError`` (for example when the
        ``tickets`` table does not exist); otherwise a single-element list
        with a ``warn`` severity ``code`` finding.
    """
    query = (
        "SELECT COUNT(*) n FROM tickets "
        "WHERE status IN ('open','assisting','escalated') "
        "AND (subject LIKE '%onboarding%' OR payload LIKE '%onboarding%') "
        "AND datetime(created_at)<datetime('now',?)"
    )
    try:
        # The age modifier is bound as a parameter instead of being
        # formatted into the SQL text.
        row = conn.execute(query, (f"-{max_age_hours} hours",)).fetchone()
    except sqlite3.OperationalError:
        # Deliberate best effort: an unusable schema yields no finding.
        return []

    try:
        stuck = row["n"]
    except TypeError:
        # Plain tuple rows (the sqlite3 default) cannot be indexed by name.
        stuck = row[0]

    if stuck <= max_stuck:
        return []
    return [
        {
            "severity": "warn",
            "category": "code",
            "title": f"Funil travado {stuck} tickets",
            "detail_md": str(stuck),
            "evidence": {"count": stuck},
            "human_action": "ASM Spec 010",
        }
    ]
|
|
|
|
def check_integration_gap(ops_api_url, token):
    """Report when the ops API says VM112 webhooks have gone quiet.

    Fetches ``/api/v1/integrations/health`` from ``ops_api_url`` with the
    internal token header and reads ``vm112_onboard.gap_minutes`` from the
    JSON body.

    Args:
        ops_api_url: Base URL of the ops API.
        token: Internal token; when falsy the check is skipped.

    Returns:
        A single-element list with a ``high`` severity ``infra`` finding when
        the gap exceeds 15 minutes. An empty list in every other case,
        including a missing token, a non-200 response, a missing gap value
        and any exception raised along the way.
    """
    if not token:
        return []
    try:
        with httpx.Client(timeout=15) as client:
            response = client.get(
                f"{ops_api_url}/api/v1/integrations/health",
                headers={"X-Ops-Internal-Token": token},
            )
            if response.status_code != 200:
                return []
            section = response.json().get("vm112_onboard") or {}
            gap = section.get("gap_minutes")
            if gap is None:
                return []
            # NOTE(review): int() truncates, so a fractional gap such as 15.9
            # is compared as 15 and does not trigger the finding.
            minutes = int(gap)
            if minutes <= 15:
                return []
            return [
                {
                    "severity": "high",
                    "category": "infra",
                    "title": f"Gap webhook {minutes}min",
                    "detail_md": "VM112 sem eventos",
                    "evidence": {"gap": gap},
                    "human_action": "Webhooks VM112→122",
                }
            ]
    except Exception:
        # Best effort: an unreachable or malformed health endpoint is not
        # reported by this check.
        return []
|
|
|
|
def check_proxmox_cluster():
    """Log in to the Proxmox API and verify every critical VM is running.

    Requests a ticket from ``/access/ticket`` with ``PVE_USER`` and
    ``PVE_PASS``, then queries ``status/current`` for each id in ``VMIDS``
    on ``PVE_NODE``, sending the ticket as the ``PVEAuthCookie`` cookie.
    TLS verification is disabled for these requests.

    Returns:
        * ``[]`` when every VM reports ``running``.
        * One ``warn`` finding when the login does not return HTTP 200.
        * One ``critical`` finding listing each VM whose status is not
          ``running``. A non-200 status reply is recorded as ``"error"``;
          a 200 reply without a usable ``data`` object is recorded with
          status ``None``.
        * One ``info`` finding when anything raises.
    """
    try:
        # A single client serves both the login and the status queries.
        with httpx.Client(timeout=15, verify=False) as client:
            login = client.post(
                f"{PVE}/access/ticket",
                data={"username": PVE_USER, "password": PVE_PASS},
            )
            if login.status_code != 200:
                return [
                    {
                        "severity": "warn",
                        "category": "infra",
                        "title": "Proxmox auth falhou",
                        "detail_md": str(login.status_code),
                        "evidence": {},
                        "human_action": "PVE 10.10.10.2:8006",
                    }
                ]
            ticket = login.json()["data"]["ticket"]
            cookie_header = {"Cookie": f"PVEAuthCookie={ticket}"}

            bad = []
            for vmid in VMIDS:
                reply = client.get(
                    f"{PVE}/nodes/{PVE_NODE}/qemu/{vmid}/status/current",
                    headers=cookie_header,
                )
                if reply.status_code == 200:
                    # "data" may be present but null; treat that like a
                    # missing status instead of raising AttributeError.
                    status = (reply.json().get("data") or {}).get("status")
                else:
                    status = "error"
                if status != "running":
                    bad.append({"vmid": vmid, "status": status})

        if not bad:
            return []
        return [
            {
                "severity": "critical",
                "category": "infra",
                "title": f"VMs paradas {bad}",
                "detail_md": str(bad),
                "evidence": {"bad": bad},
                "human_action": "qm start no big1",
            }
        ]
    except Exception as exc:
        # The check itself failed (network, JSON shape, ...); surface it as
        # an informational finding rather than raising.
        return [
            {
                "severity": "info",
                "category": "infra",
                "title": "Proxmox check erro",
                "detail_md": str(exc),
                "evidence": {},
                "human_action": "",
            }
        ]
|
|
|
|
def check_ollama_vm123():
    """Probe the Ollama ``/api/tags`` endpoint with a 5000 ms budget.

    Returns:
        An empty list when the probe is ok, otherwise a single-element list
        with a ``high`` severity ``infra`` finding.
    """
    probe = _http(f"{OLLAMA}/api/tags", max_ms=5000)
    if probe["ok"]:
        return []
    finding = {
        "severity": "high",
        "category": "infra",
        "title": "Ollama VM123 offline",
        "detail_md": str(probe),
        "evidence": probe,
        "human_action": "systemctl start ollama VM123",
    }
    return [finding]
|
|
|
|
# Scenario id -> runner. Every runner is called as ``runner(conn, **options)``
# so the caller can treat them uniformly; each one forwards only what its
# check needs. Lambdas are used so the check functions are looked up when a
# runner is invoked, not when this table is built.
SCENARIO_RUNNERS = {
    # HTTP probes: need neither the connection nor any option.
    "desk.api.health": lambda conn, **options: check_desk_api_health(),
    "wizard.vm112.bundle": lambda conn, **options: check_vm112_health(),
    "pfsense.api.system": lambda conn, **options: check_pfsense_api(),
    # Database-backed check: uses the connection only.
    "funnel.stuck.onboarding": lambda conn, **options: check_funnel_stuck(conn),
    # Reads ``ops_api_url`` and ``internal_token`` from the options, each
    # defaulting to an empty string.
    "integration.webhook.gap": lambda conn, **options: check_integration_gap(
        options.get("ops_api_url", ""),
        options.get("internal_token", ""),
    ),
    # Infrastructure probes.
    "proxmox.cluster": lambda conn, **options: check_proxmox_cluster(),
    "ollama.vm123.health": lambda conn, **options: check_ollama_vm123(),
}
|