Fix purge 500 when SQLite database is locked (Spec 017).

Enable WAL/busy_timeout, retry writes, reject duplicate running jobs with HTTP 409,
use bcrypt directly instead of broken passlib 1.7.4 + bcrypt 4.x, and improve UI errors.
This commit is contained in:
Ligbox Spec Hub 2026-06-19 22:05:05 +00:00
parent 06d0dcc977
commit 0ee4845818
5 changed files with 229 additions and 61 deletions

View file

@ -13,7 +13,7 @@ from typing import Any
from fastapi import Depends, Header, HTTPException, Request
from jose import JWTError, jwt
from passlib.context import CryptContext
import bcrypt
from app.totp_util import verify_code as verify_totp_code
@ -26,8 +26,6 @@ DESK_BOOTSTRAP_PASSWORD = os.getenv("DESK_BOOTSTRAP_PASSWORD", "805353")
AUTH_LOGIN_RATE_LIMIT = int(os.getenv("AUTH_LOGIN_RATE_LIMIT", "5"))
OPS_INTERNAL_TOKEN = os.getenv("OPS_INTERNAL_TOKEN", "")
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
_login_attempts: dict[str, list[float]] = {}
_mfa_pending: dict[str, tuple[str, float]] = {}
MFA_TOKEN_TTL_SEC = 300
@ -54,17 +52,24 @@ class DeskUser:
def db() -> sqlite3.Connection:
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(DB_PATH)
conn = sqlite3.connect(DB_PATH, timeout=30.0)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
return conn
def hash_password(password: str) -> str:
return pwd_context.hash(password)
return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")
def verify_password(plain: str, hashed: str) -> bool:
return pwd_context.verify(plain, hashed)
if not plain or not hashed:
return False
try:
return bcrypt.checkpw(plain.encode("utf-8"), hashed.encode("utf-8"))
except (ValueError, TypeError):
return False
def init_auth_schema(conn: sqlite3.Connection) -> None:

View file

@ -133,8 +133,10 @@ TICKET_COLUMNS = "id,tenant_id,subject,status,payload,created_at,assigned_to,ass
def db():
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(DB_PATH)
conn = sqlite3.connect(DB_PATH, timeout=30.0)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
return conn
@ -172,6 +174,11 @@ def init_db():
from app import billing_store
migration_store.init_schema(conn)
billing_store.init_schema(conn)
from app.vm112_purge_jobs import init_purge_jobs_schema
init_purge_jobs_schema(conn)
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
conn.commit()

View file

@ -2,6 +2,8 @@
from __future__ import annotations
import sqlite3
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
@ -9,7 +11,13 @@ from pydantic import BaseModel, Field
from app import auth, vm112_domains
from app.permissions import can_manage_vm112_domains
from app.vm112_purge_stream import purge_sse_generator
from app.vm112_purge_jobs import get_job_public, list_jobs, recover_job, start_job
from app.vm112_purge_jobs import (
PurgeJobConflictError,
get_job_public,
list_jobs,
recover_job,
start_job,
)
router = APIRouter(prefix="/api/v1/vm112", tags=["vm112-domains"])
@ -116,7 +124,17 @@ def start_purge_job(
):
"""Inicia purge em background; consultar GET /purge/jobs/{id} (recomendado via Traefik)."""
domain = _validate_purge_request(domain, body)
job_id = start_job(domain, body.root_password, user.username)
try:
job_id = start_job(domain, body.root_password, user.username)
except PurgeJobConflictError as exc:
raise HTTPException(409, str(exc)) from exc
except sqlite3.OperationalError as exc:
if "locked" in str(exc).lower():
raise HTTPException(
503,
"Base de dados ocupada — aguarde o purge em curso ou tente novamente em alguns segundos",
) from exc
raise HTTPException(500, f"Erro SQLite ao iniciar purge: {exc}") from exc
return {"ok": True, "job_id": job_id, "domain": domain, "status": "running"}

View file

@ -3,7 +3,9 @@
from __future__ import annotations
import json
import sqlite3
import threading
import time
import traceback
import uuid
from datetime import datetime, timezone
@ -12,6 +14,11 @@ from typing import Any, Callable
from app import auth, vm112_domains
_lock = threading.Lock()
_schema_ready = False
class PurgeJobConflictError(Exception):
"""Já existe purge queued/running para o domínio."""
def _now() -> str:
@ -36,17 +43,42 @@ def init_purge_jobs_schema(conn) -> None:
)
"""
)
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_vm112_purge_jobs_domain_status
ON vm112_purge_jobs(domain, status)
"""
)
conn.commit()
def _ensure_schema() -> None:
global _schema_ready
if _schema_ready:
return
conn = auth.db()
try:
init_purge_jobs_schema(conn)
_schema_ready = True
finally:
conn.close()
def _sqlite_retry(fn: Callable[[], Any], *, attempts: int = 8) -> Any:
last_err: Exception | None = None
for attempt in range(attempts):
try:
return fn()
except sqlite3.OperationalError as exc:
last_err = exc
if "locked" not in str(exc).lower() or attempt >= attempts - 1:
raise
time.sleep(min(0.05 * (2**attempt), 1.0))
if last_err:
raise last_err
return None
def _row_to_job(row) -> dict[str, Any]:
return {
"id": row["id"],
@ -66,64 +98,124 @@ def _row_to_job(row) -> dict[str, Any]:
def _persist_job(job: dict[str, Any]) -> None:
_ensure_schema()
conn = auth.db()
try:
job["updated_at"] = _now()
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
status = excluded.status,
timeline_json = excluded.timeline_json,
elapsed_vm112 = excluded.elapsed_vm112,
desk_json = excluded.desk_json,
vm112_json = excluded.vm112_json,
error = excluded.error,
by_user = excluded.by_user,
updated_at = excluded.updated_at
""",
(
job["id"],
job["domain"],
job["status"],
json.dumps(job.get("timeline") or [], ensure_ascii=False),
int(job.get("elapsed_vm112") or 0),
json.dumps(job.get("desk") or {}, ensure_ascii=False),
json.dumps(job.get("vm112") or {}, ensure_ascii=False),
job.get("error"),
job.get("by"),
job.get("created_at") or _now(),
job["updated_at"],
),
)
conn.commit()
finally:
conn.close()
def _write() -> None:
conn = auth.db()
try:
job["updated_at"] = _now()
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
status = excluded.status,
timeline_json = excluded.timeline_json,
elapsed_vm112 = excluded.elapsed_vm112,
desk_json = excluded.desk_json,
vm112_json = excluded.vm112_json,
error = excluded.error,
by_user = excluded.by_user,
updated_at = excluded.updated_at
""",
(
job["id"],
job["domain"],
job["status"],
json.dumps(job.get("timeline") or [], ensure_ascii=False),
int(job.get("elapsed_vm112") or 0),
json.dumps(job.get("desk") or {}, ensure_ascii=False),
json.dumps(job.get("vm112") or {}, ensure_ascii=False),
job.get("error"),
job.get("by"),
job.get("created_at") or _now(),
job["updated_at"],
),
)
conn.commit()
finally:
conn.close()
_sqlite_retry(_write)
def _load_job(job_id: str) -> dict[str, Any] | None:
_ensure_schema()
conn = auth.db()
try:
row = conn.execute(
"SELECT * FROM vm112_purge_jobs WHERE id = ?", (job_id,)
).fetchone()
return _row_to_job(row) if row else None
finally:
conn.close()
def _read() -> dict[str, Any] | None:
conn = auth.db()
try:
row = conn.execute(
"SELECT * FROM vm112_purge_jobs WHERE id = ?", (job_id,)
).fetchone()
return _row_to_job(row) if row else None
finally:
conn.close()
return _sqlite_retry(_read)
def _find_active_job_locked(conn: sqlite3.Connection, domain: str) -> sqlite3.Row | None:
return conn.execute(
"""
SELECT id, domain, status, created_at FROM vm112_purge_jobs
WHERE domain = ? AND status IN ('queued', 'running')
ORDER BY created_at DESC LIMIT 1
""",
(domain.lower().strip(),),
).fetchone()
def _mutate_job(job_id: str, fn: Callable[[dict[str, Any]], None]) -> dict[str, Any] | None:
with _lock:
job = _load_job(job_id)
if not job:
return None
fn(job)
_persist_job(job)
return dict(job)
def _mutate() -> dict[str, Any] | None:
conn = auth.db()
try:
row = conn.execute(
"SELECT * FROM vm112_purge_jobs WHERE id = ?", (job_id,)
).fetchone()
if not row:
return None
job = _row_to_job(row)
fn(job)
job["updated_at"] = _now()
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
status = excluded.status,
timeline_json = excluded.timeline_json,
elapsed_vm112 = excluded.elapsed_vm112,
desk_json = excluded.desk_json,
vm112_json = excluded.vm112_json,
error = excluded.error,
by_user = excluded.by_user,
updated_at = excluded.updated_at
""",
(
job["id"],
job["domain"],
job["status"],
json.dumps(job.get("timeline") or [], ensure_ascii=False),
int(job.get("elapsed_vm112") or 0),
json.dumps(job.get("desk") or {}, ensure_ascii=False),
json.dumps(job.get("vm112") or {}, ensure_ascii=False),
job.get("error"),
job.get("by"),
job.get("created_at") or _now(),
job["updated_at"],
),
)
conn.commit()
return dict(job)
finally:
conn.close()
return _sqlite_retry(_mutate)
def _upsert_step(job_id: str, step: dict[str, str]) -> None:
@ -143,12 +235,13 @@ def _set_job(job_id: str, **fields: Any) -> None:
def create_job(domain: str, username: str) -> str:
domain = domain.lower().strip()
job_id = uuid.uuid4().hex[:16]
now = _now()
job = {
"id": job_id,
"job_id": job_id,
"domain": domain.lower().strip(),
"domain": domain,
"status": "queued",
"timeline": [],
"elapsed_vm112": 0,
@ -159,8 +252,43 @@ def create_job(domain: str, username: str) -> str:
"created_at": now,
"updated_at": now,
}
def _create() -> None:
conn = auth.db()
try:
active = _find_active_job_locked(conn, domain)
if active:
raise PurgeJobConflictError(
f"Purge já em curso para {domain} (job {active['id']}, status {active['status']})"
)
conn.execute(
"""
INSERT INTO vm112_purge_jobs (
id, domain, status, timeline_json, elapsed_vm112,
desk_json, vm112_json, error, by_user, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
job["id"],
job["domain"],
job["status"],
"[]",
0,
"{}",
"{}",
None,
job["by"],
job["created_at"],
job["updated_at"],
),
)
conn.commit()
finally:
conn.close()
with _lock:
_persist_job(job)
_ensure_schema()
_sqlite_retry(_create)
return job_id

View file

@ -140,6 +140,16 @@ const DeskServices = (() => {
let errText = typeof detail === 'string' ? detail : JSON.stringify(detail || `${res.status}`);
if (res.status === 504) {
errText = '504 Gateway Timeout — o purge pode demorar vários minutos. Verifique na VM112 se concluiu antes de repetir.';
} else if (res.status === 409) {
errText = typeof detail === 'string'
? detail
: 'Purge já em curso para este domínio — aguarde a conclusão no drawer lateral.';
} else if (res.status === 503) {
errText = typeof detail === 'string'
? detail
: 'Servidor ocupado — aguarde alguns segundos e tente novamente.';
} else if (res.status === 500 && errText === '500') {
errText = 'Erro interno ao iniciar purge — verifique logs da API Desk (VM122).';
}
throw new Error(errText);
}