282 lines
8.6 KiB
Nix
282 lines
8.6 KiB
Nix
# NixOS module declaring `custom.monitoring.loki` and `custom.monitoring.promtail`
# and mapping them onto `services.loki` / `services.promtail`.
{ pkgs, config, lib, ... }:
let
  inherit (lib) literalExpression mkEnableOption mkIf mkMerge mkOption types;

  # Port of the Alertmanager instance the Loki ruler sends alerts to.
  inherit (config.my-lib.settings) alertmanagerPort;

  # Shorthand for this module's own option tree.
  cfg = config.custom.monitoring;

  # HTTP port Loki listens on; also used to build the promtail push URL.
  lokiPort = 3100;
in
|
|
{
|
|
# Option declarations for the Loki server and the promtail log shipper.
options = {
  custom.monitoring = {
    loki = {
      enable = mkEnableOption "loki";

      # Alerting rules, keyed by alert name. Each entry becomes one rule in
      # the generated ruler file.
      rules = mkOption {
        type = types.attrsOf (
          types.submodule {
            options = {
              # Required: intentionally has no default. A `null` default is not a
              # valid `types.str` value and would only produce a confusing type
              # error instead of "option is used but not defined".
              expr = mkOption {
                type = types.str;
                description = ''
                  Loki alert expression.
                '';
                example = ''count_over_time({job=~"secure"} |="sshd[" |~": Failed|: Invalid|: Connection closed by authenticating user" | __error__="" [15m]) > 15'';
              };

              # Required for the same reason as `expr`.
              description = mkOption {
                type = types.str;
                description = ''
                  Loki alert message.
                '';
                example = "Prometheus encountered value {{ $value }} with {{ $labels }}";
              };

              labels = mkOption {
                type = types.nullOr (types.attrsOf types.str);
                description = ''
                  Additional alert labels.
                '';
                # The example must itself be valid Nix: the semicolon goes
                # inside the braces.
                example = literalExpression ''{ severity = "page"; }'';
                default = { };
              };

              # Emitted as the rule's `for` field.
              time = mkOption {
                type = types.str;
                description = ''
                  Time until the alert is fired.
                '';
                example = "5m";
                default = "2m";
              };
            };
          }
        );
        description = ''
          Defines the loki rules.
        '';
        default = { };
      };
    };

    promtail.enable = mkEnableOption "promtail";
  };
};
|
|
|
|
config = mkMerge [
|
|
(
|
|
let
|
|
# Turns one `custom.monitoring.loki.rules.<name>` entry into a ruler
# alerting-rule attribute set.
mkAlertRule = alertName: rule: {
  alert = alertName;
  expr = rule.expr;
  labels = rule.labels;
  for = rule.time;
  annotations = {
    description = rule.description;
  };
};

# All configured rules live in a single rule group.
rulerConfig.groups = [
  {
    name = "alerting-rules";
    rules = lib.mapAttrsToList mkAlertRule cfg.loki.rules;
  }
];

# The rule file is written as JSON under a `.yml` name.
rulerFile = pkgs.writeText "ruler.yml" (builtins.toJSON rulerConfig);
|
|
in
|
|
mkIf cfg.loki.enable {
|
|
# Single-instance Loki: filesystem storage, in-memory ring, no authentication,
# listening on this host's tailnet name.
services.loki =
  let
    # Name this host is reachable under on the tailnet.
    tailnetAddress = "${config.networking.hostName}.coho-tet.ts.net";
    # Base directory for the ruler's rule storage and scratch space.
    lokiDataDir = config.services.loki.dataDir;
  in
  {
    enable = true;
    configuration = {
      auth_enabled = false;

      server = {
        http_listen_address = tailnetAddress;
        http_listen_port = lokiPort;
      };

      common = {
        path_prefix = "/var/lib/loki";
        replication_factor = 1;
        ring = {
          instance_addr = tailnetAddress;
          kvstore.store = "inmemory";
        };
      };

      schema_config = {
        configs = [
          {
            from = "2024-12-01";
            schema = "v13";
            store = "boltdb-shipper";
            object_store = "filesystem";
            index = {
              prefix = "index_";
              period = "24h";
            };
          }
        ];
      };

      storage_config.filesystem.directory = "/var/lib/loki/chunks";

      limits_config = {
        reject_old_samples = true;
        reject_old_samples_max_age = "168h";
        # NOTE(review): presumably disabled because the boltdb-shipper index
        # above does not support structured metadata - confirm.
        allow_structured_metadata = false;
      };

      ruler = {
        enable_api = true;
        alertmanager_url = "http://127.0.0.1:${toString alertmanagerPort}";
        rule_path = "${lokiDataDir}/rules-temp";
        storage = {
          type = "local";
          local.directory = "${lokiDataDir}/rules";
        };
      };
    };
  };
|
|
# Create the ruler directories and link the generated rule file into them.
# Paths are derived from `services.loki.dataDir` so they always match the
# `ruler.storage.local.directory` / `ruler.rule_path` settings, which use the
# same option.
systemd.tmpfiles.rules =
  let
    lokiDataDir = config.services.loki.dataDir;
  in
  [
    "d ${lokiDataDir} 0700 loki loki - -"
    "d ${lokiDataDir}/rules-temp 0700 loki loki - -"
    "d ${lokiDataDir}/rules 0700 loki loki - -"
    # `fake` is the tenant directory Loki reads rules from when
    # `auth_enabled = false`.
    "d ${lokiDataDir}/rules/fake 0700 loki loki - -"
    # `L+` (not `L`): a plain `L` leaves an existing symlink untouched, so after
    # a rule change the link would keep pointing at the previous store path and
    # Loki would keep loading stale rules.
    "L+ ${lokiDataDir}/rules/fake/ruler.yml - - - - ${rulerFile}"
  ];

# Restart Loki whenever the generated rule file changes.
systemd.services.loki.restartTriggers = [ rulerFile ];
|
|
}
|
|
)
|
|
(mkIf cfg.promtail.enable {
|
|
# Promtail: ships the systemd journal to the central Loki instance.
services.promtail = {
  enable = true;
  configuration = {
    server = {
      http_listen_address = "${config.networking.hostName}.coho-tet.ts.net";
      http_listen_port = 28183;
      grpc_listen_port = 0;
    };

    # NOTE(review): the read positions are kept under /tmp; if the unit runs
    # with a private or volatile /tmp they are lost on restart and the journal
    # is re-read up to `max_age` - confirm this is intended.
    positions.filename = "/tmp/positions.yml";

    clients = [
      {
        url = "http://thorite.coho-tet.ts.net:${toString lokiPort}/loki/api/v1/push";
      }
    ];

    scrape_configs = [
      {
        job_name = "journal";
        # Copied from Mic92's config
        journal = {
          max_age = "12h";
          json = true;
          labels.job = "systemd-journal";
        };
        pipeline_stages = [
          {
            # Extract the journal fields used by the later stages.
            json.expressions = {
              transport = "_TRANSPORT";
              unit = "_SYSTEMD_UNIT";
              msg = "MESSAGE";
              coredump_cgroup = "COREDUMP_CGROUP";
              coredump_exe = "COREDUMP_EXE";
              coredump_cmdline = "COREDUMP_CMDLINE";
              coredump_uid = "COREDUMP_UID";
              coredump_gid = "COREDUMP_GID";
            };
          }
          {
            # Set the unit (defaulting to the transport like audit and kernel)
            template = {
              source = "unit";
              template = "{{if .unit}}{{.unit}}{{else}}{{.transport}}{{end}}";
            };
          }
          {
            # Take the last path component of the coredump cgroup as the unit.
            regex = {
              expression = "(?P<coredump_unit>[^/]+)$";
              source = "coredump_cgroup";
            };
          }
          {
            template = {
              source = "msg";
              # FIXME would be cleaner to have this in a match block, but could not get it to work
              template = "{{if .coredump_exe}}{{.coredump_exe}} core dumped (user: {{.coredump_uid}}/{{.coredump_gid}}, command: {{.coredump_cmdline}}){{else}}{{.msg}}{{end}}";
            };
          }
          { labels.coredump_unit = "coredump_unit"; }
          {
            # Normalize session IDs (session-1234.scope -> session.scope) to limit number of label values
            replace = {
              source = "unit";
              # The dot is escaped so that only a literal ".scope" suffix matches.
              expression = "^(session-\\d+\\.scope)$";
              replace = "session.scope";
            };
          }
          { labels.unit = "unit"; }
          {
            # Write the proper message instead of JSON
            output.source = "msg";
          }
          # ignore random portscans on the internet
          { drop.expression = "refused connection: IN="; }
        ];
        relabel_configs = [
          {
            source_labels = [ "__journal__hostname" ];
            target_label = "host";
          }
        ];
      }
      # {
      #   job_name = "caddy";
      #   static_configs = [
      #     {
      #       targets = [ "localhost" ];
      #       labels = {
      #         job = "caddy";
      #         __path__ = "/var/log/caddy/*log";
      #         agent = "caddy-promtail";
      #       };
      #     }
      #   ];
      #   pipeline_stages = [
      #     {
      #       json = {
      #         expressions = {
      #           duration = "duration";
      #           status = "status";
      #         };
      #       };
      #     }
      #     {
      #       labels = {
      #         duration = null;
      #         status = null;
      #       };
      #     }
      #   ];
      # }
    ];
  };
};
|
|
|
|
# Caddy log directives, one per line: JSON output at INFO level.
services.caddy.logFormat = lib.concatMapStrings (directive: "${directive}\n") [
  "format json"
  "level INFO"
];
|
|
})
|
|
];
|
|
}
|