Fix purge 500 when SQLite database is locked (Spec 017).

Enable WAL/busy_timeout, retry writes, reject duplicate running jobs with HTTP 409,
use bcrypt directly instead of broken passlib 1.7.4 + bcrypt 4.x, and improve UI errors.
This commit is contained in:
Ligbox Spec Hub 2026-06-19 22:05:05 +00:00
parent 06d0dcc977
commit 0ee4845818
5 changed files with 229 additions and 61 deletions

View file

@ -13,7 +13,7 @@ from typing import Any
from fastapi import Depends, Header, HTTPException, Request from fastapi import Depends, Header, HTTPException, Request
from jose import JWTError, jwt from jose import JWTError, jwt
from passlib.context import CryptContext import bcrypt
from app.totp_util import verify_code as verify_totp_code from app.totp_util import verify_code as verify_totp_code
@ -26,8 +26,6 @@ DESK_BOOTSTRAP_PASSWORD = os.getenv("DESK_BOOTSTRAP_PASSWORD", "805353")
AUTH_LOGIN_RATE_LIMIT = int(os.getenv("AUTH_LOGIN_RATE_LIMIT", "5")) AUTH_LOGIN_RATE_LIMIT = int(os.getenv("AUTH_LOGIN_RATE_LIMIT", "5"))
OPS_INTERNAL_TOKEN = os.getenv("OPS_INTERNAL_TOKEN", "") OPS_INTERNAL_TOKEN = os.getenv("OPS_INTERNAL_TOKEN", "")
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
_login_attempts: dict[str, list[float]] = {} _login_attempts: dict[str, list[float]] = {}
_mfa_pending: dict[str, tuple[str, float]] = {} _mfa_pending: dict[str, tuple[str, float]] = {}
MFA_TOKEN_TTL_SEC = 300 MFA_TOKEN_TTL_SEC = 300
@ -54,17 +52,24 @@ class DeskUser:
def db() -> sqlite3.Connection: def db() -> sqlite3.Connection:
DB_PATH.parent.mkdir(parents=True, exist_ok=True) DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH, timeout=30.0)
conn.row_factory = sqlite3.Row conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
return conn return conn
def hash_password(password: str) -> str: def hash_password(password: str) -> str:
return pwd_context.hash(password) return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")
def verify_password(plain: str, hashed: str) -> bool: def verify_password(plain: str, hashed: str) -> bool:
return pwd_context.verify(plain, hashed) if not plain or not hashed:
return False
try:
return bcrypt.checkpw(plain.encode("utf-8"), hashed.encode("utf-8"))
except (ValueError, TypeError):
return False
def init_auth_schema(conn: sqlite3.Connection) -> None: def init_auth_schema(conn: sqlite3.Connection) -> None:

View file

@ -133,8 +133,10 @@ TICKET_COLUMNS = "id,tenant_id,subject,status,payload,created_at,assigned_to,ass
def db(): def db():
DB_PATH.parent.mkdir(parents=True, exist_ok=True) DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH, timeout=30.0)
conn.row_factory = sqlite3.Row conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
return conn return conn
@ -172,6 +174,11 @@ def init_db():
from app import billing_store from app import billing_store
migration_store.init_schema(conn) migration_store.init_schema(conn)
billing_store.init_schema(conn) billing_store.init_schema(conn)
from app.vm112_purge_jobs import init_purge_jobs_schema
init_purge_jobs_schema(conn)
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
conn.commit() conn.commit()

View file

@ -2,6 +2,8 @@
from __future__ import annotations from __future__ import annotations
import sqlite3
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -9,7 +11,13 @@ from pydantic import BaseModel, Field
from app import auth, vm112_domains from app import auth, vm112_domains
from app.permissions import can_manage_vm112_domains from app.permissions import can_manage_vm112_domains
from app.vm112_purge_stream import purge_sse_generator from app.vm112_purge_stream import purge_sse_generator
from app.vm112_purge_jobs import get_job_public, list_jobs, recover_job, start_job from app.vm112_purge_jobs import (
PurgeJobConflictError,
get_job_public,
list_jobs,
recover_job,
start_job,
)
router = APIRouter(prefix="/api/v1/vm112", tags=["vm112-domains"]) router = APIRouter(prefix="/api/v1/vm112", tags=["vm112-domains"])
@ -116,7 +124,17 @@ def start_purge_job(
): ):
"""Inicia purge em background; consultar GET /purge/jobs/{id} (recomendado via Traefik).""" """Inicia purge em background; consultar GET /purge/jobs/{id} (recomendado via Traefik)."""
domain = _validate_purge_request(domain, body) domain = _validate_purge_request(domain, body)
try:
job_id = start_job(domain, body.root_password, user.username) job_id = start_job(domain, body.root_password, user.username)
except PurgeJobConflictError as exc:
raise HTTPException(409, str(exc)) from exc
except sqlite3.OperationalError as exc:
if "locked" in str(exc).lower():
raise HTTPException(
503,
"Base de dados ocupada — aguarde o purge em curso ou tente novamente em alguns segundos",
) from exc
raise HTTPException(500, f"Erro SQLite ao iniciar purge: {exc}") from exc
return {"ok": True, "job_id": job_id, "domain": domain, "status": "running"} return {"ok": True, "job_id": job_id, "domain": domain, "status": "running"}

View file

@ -3,7 +3,9 @@
from __future__ import annotations from __future__ import annotations
import json import json
import sqlite3
import threading import threading
import time
import traceback import traceback
import uuid import uuid
from datetime import datetime, timezone from datetime import datetime, timezone
@ -12,6 +14,11 @@ from typing import Any, Callable
from app import auth, vm112_domains from app import auth, vm112_domains
_lock = threading.Lock() _lock = threading.Lock()
_schema_ready = False
class PurgeJobConflictError(Exception):
"""Já existe purge queued/running para o domínio."""
def _now() -> str: def _now() -> str:
@ -36,17 +43,42 @@ def init_purge_jobs_schema(conn) -> None:
) )
""" """
) )
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_vm112_purge_jobs_domain_status
ON vm112_purge_jobs(domain, status)
"""
)
conn.commit() conn.commit()
def _ensure_schema() -> None: def _ensure_schema() -> None:
global _schema_ready
if _schema_ready:
return
conn = auth.db() conn = auth.db()
try: try:
init_purge_jobs_schema(conn) init_purge_jobs_schema(conn)
_schema_ready = True
finally: finally:
conn.close() conn.close()
def _sqlite_retry(fn: Callable[[], Any], *, attempts: int = 8) -> Any:
last_err: Exception | None = None
for attempt in range(attempts):
try:
return fn()
except sqlite3.OperationalError as exc:
last_err = exc
if "locked" not in str(exc).lower() or attempt >= attempts - 1:
raise
time.sleep(min(0.05 * (2**attempt), 1.0))
if last_err:
raise last_err
return None
def _row_to_job(row) -> dict[str, Any]: def _row_to_job(row) -> dict[str, Any]:
return { return {
"id": row["id"], "id": row["id"],
@ -66,6 +98,8 @@ def _row_to_job(row) -> dict[str, Any]:
def _persist_job(job: dict[str, Any]) -> None: def _persist_job(job: dict[str, Any]) -> None:
_ensure_schema() _ensure_schema()
def _write() -> None:
conn = auth.db() conn = auth.db()
try: try:
job["updated_at"] = _now() job["updated_at"] = _now()
@ -103,9 +137,13 @@ def _persist_job(job: dict[str, Any]) -> None:
finally: finally:
conn.close() conn.close()
_sqlite_retry(_write)
def _load_job(job_id: str) -> dict[str, Any] | None: def _load_job(job_id: str) -> dict[str, Any] | None:
_ensure_schema() _ensure_schema()
def _read() -> dict[str, Any] | None:
conn = auth.db() conn = auth.db()
try: try:
row = conn.execute( row = conn.execute(
@ -115,15 +153,69 @@ def _load_job(job_id: str) -> dict[str, Any] | None:
finally: finally:
conn.close() conn.close()
return _sqlite_retry(_read)
def _find_active_job_locked(conn: sqlite3.Connection, domain: str) -> sqlite3.Row | None:
return conn.execute(
"""
SELECT id, domain, status, created_at FROM vm112_purge_jobs
WHERE domain = ? AND status IN ('queued', 'running')
ORDER BY created_at DESC LIMIT 1
""",
(domain.lower().strip(),),
).fetchone()
def _mutate_job(job_id: str, fn: Callable[[dict[str, Any]], None]) -> dict[str, Any] | None: def _mutate_job(job_id: str, fn: Callable[[dict[str, Any]], None]) -> dict[str, Any] | None:
with _lock: with _lock:
job = _load_job(job_id) def _mutate() -> dict[str, Any] | None:
if not job: conn = auth.db()
try:
row = conn.execute(
"SELECT * FROM vm112_purge_jobs WHERE id = ?", (job_id,)
).fetchone()
if not row:
return None return None
job = _row_to_job(row)
fn(job) fn(job)
_persist_job(job) job["updated_at"] = _now()
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
status = excluded.status,
timeline_json = excluded.timeline_json,
elapsed_vm112 = excluded.elapsed_vm112,
desk_json = excluded.desk_json,
vm112_json = excluded.vm112_json,
error = excluded.error,
by_user = excluded.by_user,
updated_at = excluded.updated_at
""",
(
job["id"],
job["domain"],
job["status"],
json.dumps(job.get("timeline") or [], ensure_ascii=False),
int(job.get("elapsed_vm112") or 0),
json.dumps(job.get("desk") or {}, ensure_ascii=False),
json.dumps(job.get("vm112") or {}, ensure_ascii=False),
job.get("error"),
job.get("by"),
job.get("created_at") or _now(),
job["updated_at"],
),
)
conn.commit()
return dict(job) return dict(job)
finally:
conn.close()
return _sqlite_retry(_mutate)
def _upsert_step(job_id: str, step: dict[str, str]) -> None: def _upsert_step(job_id: str, step: dict[str, str]) -> None:
@ -143,12 +235,13 @@ def _set_job(job_id: str, **fields: Any) -> None:
def create_job(domain: str, username: str) -> str: def create_job(domain: str, username: str) -> str:
domain = domain.lower().strip()
job_id = uuid.uuid4().hex[:16] job_id = uuid.uuid4().hex[:16]
now = _now() now = _now()
job = { job = {
"id": job_id, "id": job_id,
"job_id": job_id, "job_id": job_id,
"domain": domain.lower().strip(), "domain": domain,
"status": "queued", "status": "queued",
"timeline": [], "timeline": [],
"elapsed_vm112": 0, "elapsed_vm112": 0,
@ -159,8 +252,43 @@ def create_job(domain: str, username: str) -> str:
"created_at": now, "created_at": now,
"updated_at": now, "updated_at": now,
} }
def _create() -> None:
conn = auth.db()
try:
active = _find_active_job_locked(conn, domain)
if active:
raise PurgeJobConflictError(
f"Purge já em curso para {domain} (job {active['id']}, status {active['status']})"
)
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
job["id"],
job["domain"],
job["status"],
"[]",
0,
"{}",
"{}",
None,
job["by"],
job["created_at"],
job["updated_at"],
),
)
conn.commit()
finally:
conn.close()
with _lock: with _lock:
_persist_job(job) _ensure_schema()
_sqlite_retry(_create)
return job_id return job_id

View file

@ -140,6 +140,16 @@ const DeskServices = (() => {
let errText = typeof detail === 'string' ? detail : JSON.stringify(detail || `${res.status}`); let errText = typeof detail === 'string' ? detail : JSON.stringify(detail || `${res.status}`);
if (res.status === 504) { if (res.status === 504) {
errText = '504 Gateway Timeout — o purge pode demorar vários minutos. Verifique na VM112 se concluiu antes de repetir.'; errText = '504 Gateway Timeout — o purge pode demorar vários minutos. Verifique na VM112 se concluiu antes de repetir.';
} else if (res.status === 409) {
errText = typeof detail === 'string'
? detail
: 'Purge já em curso para este domínio — aguarde a conclusão no drawer lateral.';
} else if (res.status === 503) {
errText = typeof detail === 'string'
? detail
: 'Servidor ocupado — aguarde alguns segundos e tente novamente.';
} else if (res.status === 500 && errText === '500') {
errText = 'Erro interno ao iniciar purge — verifique logs da API Desk (VM122).';
} }
throw new Error(errText); throw new Error(errText);
} }