modules/monitoring: add alert rules to loki

This commit is contained in:
xinyangli 2024-12-03 16:36:34 +08:00
parent 5b6f6ce735
commit 83f7700949
Signed by: xin
SSH key fingerprint: SHA256:UU5pRTl7NiLFJbWJZa+snLylZSXIz5rgHmwjzv8v4oE
3 changed files with 130 additions and 40 deletions

View file

@ -71,7 +71,7 @@ in
services.restic.server.prometheus = true; services.restic.server.prometheus = true;
# miniflux # miniflux
sops.templates."miniflux_metrics_env" = { sops.templates."miniflux_metrics_env" = {
content = '' content = ''
METRICS_COLLECTOR=1 METRICS_COLLECTOR=1

View file

@ -1,68 +1,158 @@
{ {
pkgs,
config, config,
lib, lib,
my-lib,
... ...
}: }:
let let
inherit (lib) inherit (lib)
mkOption
mkEnableOption mkEnableOption
mkIf mkIf
mkMerge mkMerge
types
literalExpression
;
inherit (my-lib.settings)
alertmanagerPort
; ;
cfg = config.custom.monitoring; cfg = config.custom.monitoring;
port-loki = 3100; lokiPort = 3100;
in in
{ {
options = { options = {
custom.monitoring = { custom.monitoring = {
loki.enable = mkEnableOption "loki"; loki = {
enable = mkEnableOption "loki";
# Alert rules handed to the Loki ruler, keyed by alert name: each attribute
# name becomes the rule's `alert` field and `time` becomes its `for` field
# when the rule file is generated further down in this module.
rules = mkOption {
  type = types.attrsOf (
    types.submodule {
      options = {
        # Required. Deliberately has no default: `null` is not a valid
        # `types.str` value, so a null default only produced a confusing
        # type error instead of the module system's "no value defined" one.
        condition = mkOption {
          type = types.str;
          description = ''
            Loki alert expression.
          '';
          example = ''count_over_time({job=~"secure"} |="sshd[" |~": Failed|: Invalid|: Connection closed by authenticating user" | __error__="" [15m]) > 15'';
        };
        # Required, for the same reason as `condition`.
        description = mkOption {
          type = types.str;
          description = ''
            Loki alert message.
          '';
          example = "Prometheus encountered value {{ $value }} with {{ $labels }}";
        };
        labels = mkOption {
          type = types.nullOr (types.attrsOf types.str);
          description = ''
            Additional alert labels.
          '';
          # The example must itself be a valid Nix expression, so the
          # semicolon terminates the binding inside the braces.
          example = literalExpression ''
            { severity = "page"; }
          '';
          default = { };
        };
        time = mkOption {
          type = types.str;
          description = ''
            Time until the alert is fired.
          '';
          example = "5m";
          default = "2m";
        };
      };
    }
  );
  description = ''
    Defines the loki rules.
  '';
  default = { };
};
};
promtail.enable = mkEnableOption "promtail"; promtail.enable = mkEnableOption "promtail";
}; };
}; };
config = mkMerge [ config = mkMerge [
(mkIf cfg.loki.enable { (
services.loki = { let
enable = true; rulerConfig = {
configuration = { groups = [
auth_enabled = false;
server.http_listen_address = "${config.networking.hostName}.coho-tet.ts.net";
server.http_listen_port = port-loki;
common = {
ring = {
instance_addr = "${config.networking.hostName}.coho-tet.ts.net";
kvstore.store = "inmemory";
};
replication_factor = 1;
path_prefix = "/var/lib/loki";
};
schema_config.configs = [
{ {
from = "2024-12-01"; name = "alerting-rules";
store = "boltdb-shipper"; rules = lib.mapAttrsToList (name: opts: {
object_store = "filesystem"; alert = name;
schema = "v13"; inherit (opts) condition labels;
index = { for = opts.time;
prefix = "index_"; annotations.description = opts.description;
period = "24h"; }) cfg.loki.rules;
};
} }
]; ];
};
rulerFile = pkgs.writeText "ruler.yml" (builtins.toJSON rulerConfig);
in
mkIf cfg.loki.enable {
services.loki = {
enable = true;
configuration = {
auth_enabled = false;
server.http_listen_address = "${config.networking.hostName}.coho-tet.ts.net";
server.http_listen_port = lokiPort;
storage_config = { common = {
filesystem.directory = "/var/lib/loki/chunks"; ring = {
}; instance_addr = "${config.networking.hostName}.coho-tet.ts.net";
kvstore.store = "inmemory";
};
replication_factor = 1;
path_prefix = "/var/lib/loki";
};
limits_config = { schema_config.configs = [
reject_old_samples = true; {
reject_old_samples_max_age = "168h"; from = "2024-12-01";
allow_structured_metadata = false; store = "boltdb-shipper";
object_store = "filesystem";
schema = "v13";
index = {
prefix = "index_";
period = "24h";
};
}
];
storage_config = {
filesystem.directory = "/var/lib/loki/chunks";
};
limits_config = {
reject_old_samples = true;
reject_old_samples_max_age = "168h";
allow_structured_metadata = false;
};
# Ruler: read rule files from local storage under the Loki data directory
# and send firing alerts to the Alertmanager listening on localhost.
# NOTE(review): Loki's local rule storage is documented as looking for rule
# files in a per-tenant subdirectory of `storage.local.directory` (tenant
# "fake" when auth_enabled = false) — confirm that a ruler.yml placed
# directly in this directory is actually loaded.
ruler = {
  storage.type = "local";
  storage.local.directory = "${config.services.loki.dataDir}/ruler";
  # Working directory the ruler uses for its own copies of the rules.
  rule_path = "${config.services.loki.dataDir}/rules";
  alertmanager_url = "http://127.0.0.1:${toString alertmanagerPort}";
};
}; };
}; };
}; systemd.tmpfiles.rules = [
}) "d /var/lib/loki 0700 loki loki - -"
"d /var/lib/loki/ruler 0700 loki loki - -"
"d /var/lib/loki/rules 0700 loki loki - -"
"L /var/lib/loki/ruler/ruler.yml - - - - ${rulerFile}"
];
systemd.services.loki.reloadTriggers = [ rulerFile ];
}
)
(mkIf cfg.promtail.enable { (mkIf cfg.promtail.enable {
services.promtail = { services.promtail = {
enable = true; enable = true;
@ -78,7 +168,7 @@ in
clients = [ clients = [
{ {
url = "http://thorite.coho-tet.ts.net:${toString port-loki}/loki/api/v1/push"; url = "http://thorite.coho-tet.ts.net:${toString lokiPort}/loki/api/v1/push";
} }
]; ];

View file

@ -39,7 +39,7 @@ let
echo "Creating snapshot for ${rootDir}" echo "Creating snapshot for ${rootDir}"
subvolumes=$(${pkgs.btrfs-progs}/bin/btrfs subvolume list -o "${rootDir}" | ${awk} '{print $NF}') subvolumes=$(${pkgs.btrfs-progs}/bin/btrfs subvolume list -o "${rootDir}" | ${awk} '{print $NF}')
mkdir -p "${backupDir}" mkdir -p "${backupDir}"
${pkgs.btrfs-progs}/bin/btrfs subvolume snapshot -r "${rootDir}" "${backupDir}/rootfs" ${pkgs.btrfs-progs}/bin/btrfs subvolume snapshot -r "${rootDir}" "${backupDir}/rootDirectory"
for subvol in $subvolumes; do for subvol in $subvolumes; do
${continueIfInExclude} ${continueIfInExclude}
[[ /"$subvol" == "${backupDir}"* ]] && continue [[ /"$subvol" == "${backupDir}"* ]] && continue