Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions alertmanager/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NightMend-flavored Alertmanager: official image + envsubst + startup script.
# Only the entrypoint is replaced; the Alertmanager version itself is pinned by FROM.

# The official prom/alertmanager image is built on busybox and ships no package
# manager, so `apk add` cannot run inside it. Install gettext in a throwaway
# Alpine stage instead and copy envsubst (plus the runtime it links against)
# into the final image.
FROM alpine:3.20 AS gettext
RUN apk add --no-cache gettext

FROM prom/alertmanager:v0.27.0

USER root
# Alpine's envsubst is dynamically linked against musl and libintl, so both must
# travel with the binary. Globs keep the paths independent of CPU architecture
# and of the libintl patch version.
COPY --from=gettext /lib/ld-musl-*.so.1 /lib/
COPY --from=gettext /usr/lib/libintl.so.* /usr/lib/
COPY --from=gettext /usr/bin/envsubst /usr/bin/envsubst

COPY alertmanager.yml.template /etc/alertmanager/alertmanager.yml.template
COPY entrypoint.sh /usr/local/bin/nightmend-entrypoint.sh
RUN chmod +x /usr/local/bin/nightmend-entrypoint.sh

USER nobody
EXPOSE 9093
ENTRYPOINT ["/usr/local/bin/nightmend-entrypoint.sh"]
40 changes: 40 additions & 0 deletions alertmanager/alertmanager.yml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alertmanager configuration template — NightMend integration.
#
# High-level flow:
#   rules on the Prometheus sidecar (synced in M2) fire -> Alertmanager
#   (grouping / de-duplication / inhibition) -> webhook delivery to the
#   NightMend backend -> AI diagnosis + runbook.
#
# This file is a template: at container start the entrypoint script uses
# envsubst to replace placeholders such as $ALERTMANAGER_WEBHOOK_TOKEN with the
# actual values from .env, writes /etc/alertmanager/alertmanager.yml, and then
# starts Alertmanager.

global:
  resolve_timeout: 5m

route:
  # Default route: everything goes to the NightMend webhook.
  receiver: nightmend-webhook
  group_by:
    - alertname
    - instance
    - nightmend_rule_id
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts use the same receiver but shorter timings.
    - matchers:
        - severity = "critical"
      group_wait: 10s
      group_interval: 1m
      repeat_interval: 30m
      receiver: nightmend-webhook

inhibit_rules:
  # A critical alert mutes the warning with the same alertname and instance.
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal:
      - alertname
      - instance

receivers:
  - name: nightmend-webhook
    webhook_configs:
      # Both placeholders below are substituted verbatim by envsubst.
      - url: ${NIGHTMEND_WEBHOOK_URL}
        send_resolved: true
        http_config:
          authorization:
            type: Bearer
            credentials: ${ALERTMANAGER_WEBHOOK_TOKEN}
25 changes: 25 additions & 0 deletions alertmanager/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/sh
# Alertmanager entrypoint: render the config template with envsubst, then exec
# the official binary.
#
# Environment variables:
#   NIGHTMEND_WEBHOOK_URL        e.g. http://backend:8000/api/v1/webhooks/alertmanager
#   ALERTMANAGER_WEBHOOK_TOKEN   must equal NightMend settings.alertmanager_webhook_token
#   ALERTMANAGER_TEMPLATE        optional override of the template path
#   ALERTMANAGER_CONFIG          optional override of the rendered config path
set -eu

# Abort with the given message when a required variable is unset or empty.
: "${NIGHTMEND_WEBHOOK_URL:?NIGHTMEND_WEBHOOK_URL is required}"
: "${ALERTMANAGER_WEBHOOK_TOKEN:?ALERTMANAGER_WEBHOOK_TOKEN is required (must match NightMend settings)}"
export NIGHTMEND_WEBHOOK_URL ALERTMANAGER_WEBHOOK_TOKEN

template_file="${ALERTMANAGER_TEMPLATE:-/etc/alertmanager/alertmanager.yml.template}"
rendered_config="${ALERTMANAGER_CONFIG:-/etc/alertmanager/alertmanager.yml}"

# Restrict substitution to the known variables so that any other "$foo" in the
# template is left untouched.
render_vars='${NIGHTMEND_WEBHOOK_URL} ${ALERTMANAGER_WEBHOOK_TOKEN}'
envsubst "$render_vars" < "$template_file" > "$rendered_config"

# Replace this shell with Alertmanager; extra arguments are passed through.
exec /bin/alertmanager \
    --config.file="$rendered_config" \
    --storage.path=/alertmanager \
    --web.listen-address=:9093 \
    "$@"
2 changes: 2 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ class Settings(BaseSettings):
prometheus_remote_timeout_seconds: float = 15.0
# Remote Write 接收端点 Bearer token;空 = 拒绝所有 Bearer 调用(仅允许 User JWT)
prom_remote_write_token: str = ""
# Alertmanager sidecar 反向路由地址;空 = 反向 silence API 禁用
alertmanager_url: str = "http://alertmanager:9093"

# AlertManager Bridge 配置 (AlertManager Bridge Configuration)
alertmanager_webhook_token: str = "" # Bearer token for webhook auth, generate with: python -c "import secrets; print(secrets.token_urlsafe(32))"
Expand Down
2 changes: 2 additions & 0 deletions backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
from app.routers import promql
from app.routers import webhooks
from app.routers import prom_remote_write # noqa: E402 (main.py 存量 E402 debt 规避)
from app.routers import alertmanager_silences # noqa: E402
from app.routers import alert_stream
from app.api.v1 import data_retention
from app.api.v1 import alert_deduplication
Expand Down Expand Up @@ -316,6 +317,7 @@ async def _monitor_tasks():
app.include_router(promql.router) # PromQL 查询 (PromQL Query Engine)
app.include_router(webhooks.router) # 外部告警源 Webhook (External Alert Source Webhooks)
app.include_router(prom_remote_write.router) # Prometheus Remote Write 接收器 (Prometheus Remote Write Receiver)
app.include_router(alertmanager_silences.router) # Alertmanager Silence 反向路由 (Alertmanager Silence Reverse Routing)
app.include_router(alert_stream.router) # 告警诊断 SSE 流 (Alert Diagnosis SSE Stream)


Expand Down
102 changes: 102 additions & 0 deletions backend/app/routers/alertmanager_silences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Alertmanager Silence 管理端点。

反向路由(M7):NightMend → Alertmanager /api/v2/silences
让用户/Runbook 直接在 NightMend UI 上管理 Alertmanager silences,
不需要切到 AM Web UI 也不会丢失审计链路。
"""
from __future__ import annotations

import json
import logging
from datetime import timedelta
from typing import Any

from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.database import get_db
from app.core.deps import get_current_user, get_operator_user
from app.models.user import User
from app.services import alertmanager_client as am
from app.services.audit import log_audit

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/alertmanager", tags=["alertmanager"])


class Matcher(BaseModel):
    """A single silence matcher, dumped as-is into the Alertmanager payload."""

    # Label name the matcher applies to.
    name: str
    # Value to compare with; a regular expression when ``isRegex`` is true.
    value: str
    # camelCase on purpose: these keys are forwarded unchanged to Alertmanager.
    isRegex: bool = Field(default=False)
    isEqual: bool = Field(default=True)


class CreateSilenceRequest(BaseModel):
    """Request body for ``POST /silences``."""

    # Between 1 and 20 matchers are accepted.
    matchers: list[Matcher] = Field(
        ...,
        min_length=1,
        max_length=20,
    )
    # Silence length in seconds, bounded to one minute .. seven days.
    duration_seconds: int = Field(
        ...,
        ge=60,
        le=7 * 24 * 3600,
        description="60s ~ 7d",
    )
    # Mandatory free-text reason, at most 500 characters.
    comment: str = Field(
        ...,
        min_length=1,
        max_length=500,
    )


@router.post("/silences")
async def create_silence(
    payload: CreateSilenceRequest,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_operator_user),
):
    """Create a silence in Alertmanager and write an audit record for it.

    Args:
        payload: Matchers, duration in seconds and comment for the silence.
        request: Incoming request; only used to read the client address.
        db: Session used to persist the audit entry.
        user: Caller resolved by the ``get_operator_user`` dependency.

    Returns:
        ``{"silence_id": <id>}`` with the ID returned by the client helper.

    Raises:
        HTTPException: 502 when the client raises ``AlertmanagerUnavailable``.
    """
    try:
        silence_id = await am.create_silence(
            matchers=[m.model_dump() for m in payload.matchers],
            duration=timedelta(seconds=payload.duration_seconds),
            created_by=f"nightmend-user:{user.id}",
            comment=payload.comment,
        )
    except am.AlertmanagerUnavailable as exc:
        # Chain the cause so the upstream failure stays visible in tracebacks.
        raise HTTPException(status_code=502, detail=str(exc)) from exc

    # The audit entry is written only after Alertmanager accepted the silence.
    client_ip = request.client.host if request.client else None
    audit_detail = json.dumps({"silence_id": silence_id, **payload.model_dump()})
    # NOTE(review): 0 is log_audit's fifth positional argument; presumably a
    # target id placeholder — confirm against log_audit.
    await log_audit(
        db, user.id, "create_silence", "alertmanager", 0,
        audit_detail,
        client_ip,
    )
    await db.commit()
    return {"silence_id": silence_id}


@router.delete("/silences/{silence_id}", status_code=204)
async def delete_silence(
    silence_id: str,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_operator_user),
):
    """Expire a silence in Alertmanager and write an audit record for it.

    Args:
        silence_id: ID of the silence, taken from the URL path.
        request: Incoming request; only used to read the client address.
        db: Session used to persist the audit entry.
        user: Caller resolved by the ``get_operator_user`` dependency.

    Raises:
        HTTPException: 502 when the client raises ``AlertmanagerUnavailable``.
    """
    try:
        await am.delete_silence(silence_id)
    except am.AlertmanagerUnavailable as exc:
        # Chain the cause so the upstream failure stays visible in tracebacks.
        raise HTTPException(status_code=502, detail=str(exc)) from exc

    client_ip = request.client.host if request.client else None
    # NOTE(review): 0 is log_audit's fifth positional argument; presumably a
    # target id placeholder — confirm against log_audit.
    await log_audit(
        db, user.id, "delete_silence", "alertmanager", 0,
        json.dumps({"silence_id": silence_id}),
        client_ip,
    )
    await db.commit()


@router.get("/silences")
async def list_silences(
    active_only: bool = True,
    _user: User = Depends(get_current_user),
) -> list[dict[str, Any]]:
    """Return the silences reported by Alertmanager.

    Args:
        active_only: Passed through to the client helper; defaults to True.
        _user: Unused value of the ``get_current_user`` dependency.

    Raises:
        HTTPException: 502 when the client raises ``AlertmanagerUnavailable``.
    """
    try:
        return await am.list_silences(active_only=active_only)
    except am.AlertmanagerUnavailable as exc:
        # Chain the cause so the upstream failure stays visible in tracebacks.
        raise HTTPException(status_code=502, detail=str(exc)) from exc


@router.get("/health")
async def alertmanager_health(_user: User = Depends(get_current_user)):
    """Report Alertmanager reachability so the UI can display it."""
    reachable = await am.is_healthy()
    return {"healthy": reachable}
134 changes: 134 additions & 0 deletions backend/app/services/alertmanager_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Alertmanager 反向路由客户端 (Alertmanager Reverse Route Client)

正向:Prom → AM → NightMend webhook(由 alertmanager/alertmanager.yml 配置完成)
反向:NightMend → AM silence API

反向用途:
- Runbook 执行期间,主动把目标告警设成 silence,防止 flap / 重复告警噪音
- 用户在 UI 点"静默 1h"按钮直接投递到 AM(而不是只在 NightMend 自己记忆)
- 维护窗口:批量 silence 某一批主机
"""
from __future__ import annotations

import logging
from datetime import datetime, timedelta, timezone

import httpx

from app.core.config import settings

logger = logging.getLogger(__name__)


class AlertmanagerUnavailable(RuntimeError):
    """Raised by this module when Alertmanager is unconfigured, unreachable or returns an error."""


def _iso(dt: datetime) -> str:
    """Format *dt* as a UTC RFC 3339 timestamp ending in ``.000Z``.

    A naive datetime is taken to be UTC already; an aware one is converted to
    UTC. Sub-second precision is dropped (the fraction is always ``000``).
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    in_utc = aware.astimezone(timezone.utc)
    return f"{in_utc:%Y-%m-%dT%H:%M:%S}.000Z"


async def create_silence(
    matchers: list[dict],
    *,
    duration: timedelta,
    created_by: str,
    comment: str,
    starts_at: datetime | None = None,
) -> str:
    """Create a silence in Alertmanager and return its ID.

    Example matchers:
        [{"name": "alertname", "value": "HighCPU", "isRegex": False, "isEqual": True},
         {"name": "instance", "value": "web-0[12]", "isRegex": True, "isEqual": True}]

    Example production call (before running a runbook):
        await create_silence(
            [{"name": "alertname", "value": rule.name, "isRegex": False, "isEqual": True}],
            duration=timedelta(minutes=10),
            created_by="nightmend-runbook",
            comment=f"auto-silence during remediation run {log_id}",
        )

    Args:
        matchers: Matcher dicts sent unchanged as the ``matchers`` field.
        duration: Length of the silence; ``endsAt`` is start + duration.
        created_by: Value for the ``createdBy`` field.
        comment: Value for the ``comment`` field.
        starts_at: Start time; defaults to the current UTC time.

    Returns:
        The silence ID taken from the response body.

    Raises:
        AlertmanagerUnavailable: ``alertmanager_url`` is empty, the request
            fails, the status is >= 400, or the response body is not JSON or
            carries no silence ID.
    """
    if not settings.alertmanager_url:
        raise AlertmanagerUnavailable("alertmanager_url not configured")

    start = starts_at or datetime.now(timezone.utc)
    end = start + duration
    payload = {
        "matchers": matchers,
        "startsAt": _iso(start),
        "endsAt": _iso(end),
        "createdBy": created_by,
        "comment": comment,
    }

    url = f"{settings.alertmanager_url.rstrip('/')}/api/v2/silences"
    try:
        async with httpx.AsyncClient(timeout=settings.prometheus_remote_timeout_seconds) as client:
            response = await client.post(url, json=payload)
    except httpx.RequestError as exc:
        logger.warning("Alertmanager silence POST failed: %s", exc)
        raise AlertmanagerUnavailable(f"Alertmanager unreachable: {exc}") from exc

    if response.status_code >= 400:
        raise AlertmanagerUnavailable(
            f"Alertmanager rejected silence: status={response.status_code} body={response.text[:200]}"
        )

    # A proxy or misconfigured URL can answer 2xx with a non-JSON body; report
    # that through the module's own exception instead of leaking ValueError.
    try:
        data = response.json()
    except ValueError as exc:
        raise AlertmanagerUnavailable(
            f"Alertmanager returned a non-JSON response: body={response.text[:200]}"
        ) from exc

    # Only a JSON object can carry the ID; several key spellings are accepted.
    silence_id = None
    if isinstance(data, dict):
        silence_id = data.get("silenceID") or data.get("silenceId") or data.get("id")
    if not silence_id:
        raise AlertmanagerUnavailable(f"Alertmanager response missing silenceID: {data}")
    logger.info("silence created: id=%s by=%s duration=%s", silence_id, created_by, duration)
    return silence_id


async def delete_silence(silence_id: str) -> None:
    """Expire a silence ahead of time.

    A 404 from Alertmanager is not treated as an error.

    Args:
        silence_id: ID of the silence to delete.

    Raises:
        AlertmanagerUnavailable: ``alertmanager_url`` is empty, the request
            fails, or the status is >= 400 other than 404.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import section.
    from urllib.parse import quote

    if not settings.alertmanager_url:
        raise AlertmanagerUnavailable("alertmanager_url not configured")
    # The ID can originate from a request path; percent-encode it so characters
    # such as "/", "?" or "#" cannot change the path or add a query string.
    safe_id = quote(silence_id, safe="")
    url = f"{settings.alertmanager_url.rstrip('/')}/api/v2/silence/{safe_id}"
    try:
        async with httpx.AsyncClient(timeout=settings.prometheus_remote_timeout_seconds) as client:
            response = await client.delete(url)
    except httpx.RequestError as exc:
        raise AlertmanagerUnavailable(f"Alertmanager unreachable: {exc}") from exc
    if response.status_code >= 400 and response.status_code != 404:
        raise AlertmanagerUnavailable(
            f"delete silence failed: status={response.status_code} body={response.text[:200]}"
        )


async def list_silences(active_only: bool = True) -> list[dict]:
    """List silences from Alertmanager, for display in the UI.

    Args:
        active_only: When true, keep only entries whose ``status.state`` is
            ``"active"``.

    Returns:
        The silence objects as decoded from the response (empty list when the
        body is empty/null).

    Raises:
        AlertmanagerUnavailable: ``alertmanager_url`` is empty, the request
            fails, the status is >= 400, or the body is not a JSON list.
    """
    if not settings.alertmanager_url:
        raise AlertmanagerUnavailable("alertmanager_url not configured")
    url = f"{settings.alertmanager_url.rstrip('/')}/api/v2/silences"
    try:
        async with httpx.AsyncClient(timeout=settings.prometheus_remote_timeout_seconds) as client:
            response = await client.get(url)
    except httpx.RequestError as exc:
        raise AlertmanagerUnavailable(f"Alertmanager unreachable: {exc}") from exc
    if response.status_code >= 400:
        raise AlertmanagerUnavailable(f"list silences failed: {response.status_code}")

    # Report malformed bodies through the module's own exception rather than
    # letting ValueError / AttributeError escape to the caller.
    try:
        silences = response.json() or []
    except ValueError as exc:
        raise AlertmanagerUnavailable("list silences failed: response is not valid JSON") from exc
    if not isinstance(silences, list):
        raise AlertmanagerUnavailable(
            f"list silences failed: unexpected payload type {type(silences).__name__}"
        )

    if active_only:
        silences = [
            s
            for s in silences
            if isinstance(s, dict) and (s.get("status") or {}).get("state") == "active"
        ]
    return silences


async def is_healthy() -> bool:
    """Return True only when ``GET <alertmanager_url>/-/healthy`` answers 200.

    Returns False when ``alertmanager_url`` is empty or the request raises
    ``httpx.RequestError``.
    """
    base_url = settings.alertmanager_url
    if not base_url:
        return False
    try:
        # Fixed 3-second timeout, independent of the configured remote timeout.
        async with httpx.AsyncClient(timeout=3.0) as client:
            response = await client.get(f"{base_url.rstrip('/')}/-/healthy")
    except httpx.RequestError:
        return False
    return response.status_code == 200
Loading