diff --git a/.sops.yaml b/.sops.yaml index 5056c87..8e9c1d8 100644 --- a/.sops.yaml +++ b/.sops.yaml @@ -2,12 +2,13 @@ keys: - &xin age1uw059wcwfvd9xuj0hpqzqpeg7qemecspjrsatg37wc7rs2pumfdsgken0c - &host-calcite age1ytwfqfeez3dqtazyjltn7mznccwx3ua8djhned7n8mxqhw4p6e5s97skfa - &host-raspite age1nugzw24upk8pz5lyz2z89qk8se4gpcsg3ypcs58nykncr56sevrsm8qpvj - - &host-sgp-00 age13s6rwd3wjk2x5wkn69tdczhl3l5d7mfmlv90efsv4q67jne43qss9tcakx - &host-la-00 age1fw2sqaa5s9c8ml6ncsexkj8ar4288387ju92ytjys4awf9aw6smqqz94dh - &host-massicot age1jle2auermhswqtehww9gqada8car5aczrx43ztzqf9wtcld0sfmqzaecta - &host-weilite age17r3fxfmt6hgwe984w4lds9u0cnkf5ttq8hnqt800ayfmx7t8t5gqjddyml - &host-hk-00 age1p2dlc8gfgyrvtta6mty2pezjycn244gmvh456qd3wvkfwesp253qnwyta9 - &host-fra-00 age18u4mqrhqkrpcytxfxfex6aeap04u38emhy6u4wrp5k62sz2vae4qm5jj7s + - &host-biotite age1v5h946jfke6ae8pcgz52mhj26cacqcpl9dmmrrkf37x55rnq2v3szqctvv + - &host-thorite age12ng08vjx5jde5ncqutwkd5vm4ygfwy33mzhzwe0lkxzglulgpqusc89r96 creation_rules: - path_regex: machines/calcite/secrets.yaml key_groups: @@ -24,19 +25,18 @@ creation_rules: - age: - *xin - *host-massicot + - path_regex: machines/thorite/secrets.yaml + key_groups: + - age: + - *xin + - *host-thorite - path_regex: machines/dolomite/secrets/secrets.yaml key_groups: - age: - *xin - - *host-sgp-00 - *host-la-00 - *host-hk-00 - *host-fra-00 - - path_regex: machines/dolomite/secrets/sgp-00.yaml - key_groups: - - age: - - *xin - - *host-sgp-00 - path_regex: machines/dolomite/secrets/la-00.yaml key_groups: - age: @@ -64,7 +64,6 @@ creation_rules: - *xin - *host-calcite - *host-raspite - - *host-sgp-00 - *host-la-00 - *host-hk-00 - *host-massicot diff --git a/flake.nix b/flake.nix index 55a8561..7e725f2 100644 --- a/flake.nix +++ b/flake.nix @@ -60,8 +60,8 @@ outputs = { self, - home-manager, nixpkgs, + home-manager, nixos-hardware, sops-nix, flake-utils, @@ -84,6 +84,7 @@ overlayModule = { ... 
}: { + _module.args.my-lib = import ./overlays/my-lib; nixpkgs.overlays = [ editorOverlay (import ./overlays/add-pkgs.nix) @@ -118,10 +119,6 @@ ./machines/dolomite/bandwagon.nix ./machines/dolomite/common.nix ]; - tok-00 = [ - ./machines/dolomite/lightsail.nix - ./machines/dolomite/common.nix - ]; fra-00 = [ ./machines/dolomite/fra.nix ./machines/dolomite/common.nix @@ -178,7 +175,6 @@ }; in { - nixpkgs = nixpkgs; nixosModules.default = { imports = [ ./modules/nixos @@ -190,7 +186,9 @@ colmenaHive = colmena.lib.makeHive { meta = { # FIXME: - nixpkgs = import nixpkgs { system = "x86_64-linux"; }; + nixpkgs = import nixpkgs { + system = "x86_64-linux"; + }; }; massicot = @@ -205,20 +203,6 @@ ] ++ sharedColmenaModules; }; - tok-00 = - { ... }: - { - imports = nodeNixosModules.tok-00 ++ sharedColmenaModules; - nixpkgs.system = "x86_64-linux"; - networking.hostName = "tok-00"; - system.stateVersion = "23.11"; - deployment = { - targetHost = "video01.namely.icu"; - buildOnTarget = false; - tags = [ "proxy" ]; - }; - }; - la-00 = { ... 
}: { @@ -310,7 +294,6 @@ osmium = mkNixos { hostname = "osmium"; }; - } // self.colmenaHive.nodes; } diff --git a/machines/biotite/default.nix b/machines/biotite/default.nix index 1b73ee4..5021dc8 100644 --- a/machines/biotite/default.nix +++ b/machines/biotite/default.nix @@ -12,11 +12,12 @@ networking.useNetworkd = true; systemd.network.enable = true; systemd.network.networks."10-wan" = { - matchConfig.MACAddress = "00:16:3e:0a:ec:45"; - networkConfig.DHCP = "ipv4"; - dhcpV4Config = { - UseDNS = true; + matchConfig.MACAddress = "b6:20:0d:9a:6c:34"; + networkConfig = { + DHCP = "ipv4"; + IPv6SendRA = true; }; + address = [ "2a03:4000:4a:148::1/64" ]; }; commonSettings = { diff --git a/machines/calcite/configuration.nix b/machines/calcite/configuration.nix index 1be4864..181c81f 100644 --- a/machines/calcite/configuration.nix +++ b/machines/calcite/configuration.nix @@ -368,8 +368,7 @@ in }; custom.prometheus = { - enable = true; - exporters.blackbox.enable = true; + exporters.node.enable = true; }; services.ollama = { diff --git a/machines/dolomite/claw.nix b/machines/dolomite/claw.nix index d169733..f7b64b7 100644 --- a/machines/dolomite/claw.nix +++ b/machines/dolomite/claw.nix @@ -31,14 +31,7 @@ fsType = "ext4"; }; - swapDevices = [ ]; - - # Enables DHCP on each ethernet and wireless interface. In case of scripted networking - # (the default) this is the recommended approach. When using systemd-networkd it's - # still possible to use this option, but it's recommended to use it in conjunction - # with explicit per-interface declarations with `networking.interfaces..useDHCP`. 
- # networking.useNetworkd = false; - + networking.useNetworkd = true; systemd.network.enable = true; systemd.network.networks."10-wan" = { matchConfig.MACAddress = "00:16:3e:0a:ec:45"; @@ -47,7 +40,6 @@ UseDNS = true; }; }; - # networking.interfaces.eth0.useDHCP = lib.mkDefault true; nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; }; diff --git a/machines/dolomite/common.nix b/machines/dolomite/common.nix index 3b511ef..23306c0 100644 --- a/machines/dolomite/common.nix +++ b/machines/dolomite/common.nix @@ -21,12 +21,20 @@ }; }; }; + swapDevices = [ + { + device = "/swapfile"; + size = 2 * 1024; + } + ]; - custom.prometheus = { - enable = lib.mkDefault true; - exporters.blackbox.enable = true; + custom.prometheus.exporters = { + enable = true; + node.enable = true; }; + services.tailscale.enable = true; + commonSettings = { auth.enable = true; proxyServer = { diff --git a/machines/dolomite/fra.nix b/machines/dolomite/fra.nix index 0caf650..c5a8d02 100644 --- a/machines/dolomite/fra.nix +++ b/machines/dolomite/fra.nix @@ -52,6 +52,12 @@ networkConfig = { DHCP = "no"; Gateway = "185.217.108.1"; + DNSSEC = true; + DNSOverTLS = true; + DNS = [ + "8.8.8.8#dns.google" + "8.8.4.4#dns.google" + ]; }; address = [ "185.217.108.59/24" ]; }; diff --git a/machines/massicot/default.nix b/machines/massicot/default.nix index e461039..da2cbd5 100644 --- a/machines/massicot/default.nix +++ b/machines/massicot/default.nix @@ -24,10 +24,6 @@ hedgedoc_env = { owner = "hedgedoc"; }; - grafana_cloud_api = { - owner = "prometheus"; - sopsFile = ../secrets.yaml; - }; grafana_oauth_secret = { owner = "grafana"; }; @@ -62,6 +58,8 @@ hostName = "massicot"; }; + services.tailscale.enable = true; + commonSettings = { auth.enable = true; nix = { diff --git a/machines/massicot/networking.nix b/machines/massicot/networking.nix index 7859b2e..2a4c529 100644 --- a/machines/massicot/networking.nix +++ b/machines/massicot/networking.nix @@ -3,8 +3,10 @@ networking.useNetworkd = true; 
systemd.network.networks."10-wan" = { matchConfig.MACAddress = "96:00:02:68:7d:2d"; - networkConfig.DHCP = "ipv4"; - networkConfig.Gateway = "fe80::1"; + networkConfig = { + DHCP = "ipv4"; + Gateway = "fe80::1"; + }; address = [ "2a01:4f8:c17:345f::3/64" ]; diff --git a/machines/massicot/services.nix b/machines/massicot/services.nix index 6a43aa3..a1e69a0 100644 --- a/machines/massicot/services.nix +++ b/machines/massicot/services.nix @@ -43,10 +43,14 @@ in environmentFile = config.sops.secrets.hedgedoc_env.path; }; - custom.prometheus = { + custom.prometheus.exporters = { enable = true; - exporters.blackbox.enable = true; - exporters.miniflux.enable = true; + blackbox = { + enable = true; + }; + node = { + enable = true; + }; }; security.acme = { diff --git a/machines/thorite/default.nix b/machines/thorite/default.nix index ef69751..7b7ec7e 100644 --- a/machines/thorite/default.nix +++ b/machines/thorite/default.nix @@ -1,29 +1,41 @@ -{ ... }: { imports = [ ./hardware-configurations.nix + ./monitoring.nix ]; - networking.hostName = "thorite"; - networking.useNetworkd = true; - systemd.network.enable = true; - systemd.network.networks."10-wan" = { - matchConfig.MACAddress = "00:51:d3:21:f3:28"; - networkConfig = { - DHCP = "no"; - Gateway = "23.165.200.1"; + config = { + networking.hostName = "thorite"; + networking.useNetworkd = true; + systemd.network.enable = true; + systemd.network.networks."10-wan" = { + matchConfig.MACAddress = "00:51:d3:21:f3:28"; + networkConfig = { + DHCP = "no"; + Gateway = "23.165.200.1"; + DNSSEC = true; + DNSOverTLS = true; + DNS = [ + "8.8.8.8#dns.google" + "8.8.4.4#dns.google" + ]; + }; + address = [ "23.165.200.99/24" ]; }; - address = [ "23.165.200.99/24" ]; + + networking.firewall.allowedTCPPorts = [ + 80 + 443 + ]; + + commonSettings = { + auth.enable = true; + autoupgrade.enable = true; + }; + + nixpkgs.system = "x86_64-linux"; + system.stateVersion = "24.11"; + + users.users.root.hashedPassword = 
"$y$j9T$NToEZWJBONjSgRnMd9Ur9/$o6n7a9b8eUILQz4d37oiHCCVnDJ8hZTZt.c.37zFfU."; }; - - nixpkgs.system = "x86_64-linux"; - - system.stateVersion = "24.11"; - - commonSettings = { - auth.enable = true; - autoupgrade.enable = true; - }; - - users.users.root.hashedPassword = "$y$j9T$NToEZWJBONjSgRnMd9Ur9/$o6n7a9b8eUILQz4d37oiHCCVnDJ8hZTZt.c.37zFfU."; } diff --git a/machines/thorite/monitoring.nix b/machines/thorite/monitoring.nix new file mode 100644 index 0000000..4f80743 --- /dev/null +++ b/machines/thorite/monitoring.nix @@ -0,0 +1,100 @@ +{ config, my-lib, ... }: +with my-lib; +{ + config = { + sops = { + defaultSopsFile = ./secrets.yaml; + age.sshKeyPaths = [ "/etc/ssh/ssh_host_ed25519_key" ]; + secrets = { + "grafana/oauth_secret" = { + owner = "grafana"; + }; + }; + }; + + custom.monitoring = { + grafana.enable = true; + }; + + services.caddy.virtualHosts."https://grafana.xinyang.life".extraConfig = + with config.services.grafana.settings.server; '' + reverse_proxy http://${http_addr}:${toString http_port} + ''; + + custom.prometheus = { + enable = true; + exporters = { + enable = true; + blackbox.enable = true; + node.enable = true; + }; + ruleModules = (mkCaddyRules [ { host = "thorite"; } ]) ++ (mkNodeRules [ { host = "thorite"; } ]); + }; + + services.prometheus.scrapeConfigs = + let + probeList = [ + "la-00.video.namely.icu:8080" + "fre-00.video.namely.icu:8080" + "hk-00.video.namely.icu:8080" + "49.13.13.122:443" + "45.142.178.32:22" + "home.xinyang.life:8000" + ]; + in + (mkScrapes [ + { + name = "immich"; + scheme = "http"; + address = "weilite.coho-tet.ts.net"; + port = 8082; + } + { + name = "gotosocial"; + address = "xinyang.life"; + } + { + name = "miniflux"; + address = "rss.xinyang.life"; + } + { + name = "ntfy"; + address = "ntfy.xinyang.life"; + } + { + name = "grafana-eu"; + address = "grafana.xinyang.life"; + } + ]) + ++ (mkCaddyScrapes [ + { address = "thorite.coho-tet.ts.net"; } + ]) + ++ (mkNodeScrapes [ + { address = "thorite.coho-tet.ts.net"; 
} + { address = "massicot.coho-tet.ts.net"; } + { address = "weilite.coho-tet.ts.net"; } + { address = "hk-00.coho-tet.ts.net"; } + { address = "la-00.coho-tet.ts.net"; } + { address = "fra-00.coho-tet.ts.net"; } + ]) + ++ (mkBlackboxScrapes [ + { + hostAddress = "thorite.coho-tet.ts.net"; + targetAddresses = probeList; + } + { + hostAddress = "massicot.coho-tet.ts.net"; + targetAddresses = probeList; + } + { + hostAddress = "weilite.coho-tet.ts.net"; + targetAddresses = [ + "la-00.video.namely.icu:8080" + "fre-00.video.namely.icu:8080" + "hk-00.video.namely.icu:8080" + ]; + } + ]); + + }; +} diff --git a/machines/thorite/secrets.yaml b/machines/thorite/secrets.yaml new file mode 100644 index 0000000..60d475f --- /dev/null +++ b/machines/thorite/secrets.yaml @@ -0,0 +1,31 @@ +grafana: + oauth_secret: ENC[AES256_GCM,data:angZR3sl8vGcbAXyKFBvCSm+YhF5OooCcxRiSxR2zBoXMz5wv5/uMJFynwOTRVI6,iv:hVpOlM89lNbK6AsGf4Is/tLv3xPfg/XdtA8vuEK52L8=,tag:zCER+IdRnTcG2WHQ/AhxZA==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1uw059wcwfvd9xuj0hpqzqpeg7qemecspjrsatg37wc7rs2pumfdsgken0c + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBoTXRtTlRES3V4MGhZaGdr + aXJ4UFNDT0Nrb0ZuWEkxUEFDU2orbzNBSVhVCkh2VitqMGwwOVdhMFJIeWU1eTgw + UVdxY0tLVDJNVnRnQmMyS0FPYS9LVmMKLS0tIEZaMTdIMU5SQUkxL2NFK2Jtbm9v + YVR3RHpDR3F2aFlCWGd5TjNOV2p4YzgK8OKpwcvTK/0j+kQCo0+8n6sQ5Pu9t9xZ + lPWeUGk1BudsyCqgIZWF5iXfu1pJnYq1XEAM0ttJl402xKeqIovM0Q== + -----END AGE ENCRYPTED FILE----- + - recipient: age12ng08vjx5jde5ncqutwkd5vm4ygfwy33mzhzwe0lkxzglulgpqusc89r96 + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBIYk1NTlhsYW8xbFppNTBE + WlJmNzhnclBoVENXa1cvcHY5NGdRZVAzV0FJClpsTHpTeG9CK3J2ZFEreG1BTWpG + WjdaYzlLQnU0LzJLSDBZZ2pvOWdvSEkKLS0tIExRT0p1aCttZG5MMW14emJmRk5w + M2pqMUJoMGlBZnpBaVBUTFFRZUMzb2sKrlWy26Cv55/8XQEl9hee8P29uj582sIx + mUjaYE0U2qOP9bklXUQyyzQjfkBLWTLc1PTX9BjqOOsqXwkRQIYppA== + -----END 
AGE ENCRYPTED FILE----- + lastmodified: "2024-11-28T17:02:03Z" + mac: ENC[AES256_GCM,data:14FOUXuKP+8+sad1UlhBW37fWzmutpyn6d4q2qKtBiOyT5ivHunFHJfHrtX83X2fLDmUfiD42bXf+rYfdtKzVUmQ6vutCUQk+Hal8NElhjcq5Ns5kT4VZRKG7/ya9+eNEEkajtq/7OFEM5KOQKTKjyOBqBq/AdYQ+ni9r45c1sM=,iv:WrdWSfrZrGalZO4WGk3JpgACY7W0odt3vP+pRkMXHfA=,tag:jeRBfR2QYjLBylOLHxU3hQ==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.9.1 diff --git a/machines/weilite/default.nix b/machines/weilite/default.nix index 8a58896..b2c761d 100644 --- a/machines/weilite/default.nix +++ b/machines/weilite/default.nix @@ -16,7 +16,6 @@ networking.hostName = "weilite"; commonSettings = { auth.enable = true; - autoupgrade.enable = true; nix = { enable = true; enableMirrors = true; @@ -61,8 +60,14 @@ }; }; - custom.prometheus = { + custom.prometheus.exporters = { enable = true; + blackbox = { + enable = true; + }; + node = { + enable = true; + }; }; systemd.mounts = [ diff --git a/modules/nixos/prometheus/blackbox.nix b/modules/nixos/prometheus/blackbox.nix deleted file mode 100644 index 1bfd896..0000000 --- a/modules/nixos/prometheus/blackbox.nix +++ /dev/null @@ -1,93 +0,0 @@ -{ - config, - lib, - pkgs, - ... 
-}: -let - cfg = config.custom.prometheus; -in -{ - config = lib.mkIf (cfg.enable && cfg.exporters.blackbox.enable) { - services.prometheus.exporters.blackbox = { - enable = true; - listenAddress = "127.0.0.1"; - configFile = pkgs.writeText "blackbox.config.yaml" ( - lib.generators.toYAML { } { - modules = { - tcp4_connect = { - prober = "tcp"; - tcp = { - ip_protocol_fallback = false; - preferred_ip_protocol = "ip4"; - tls = false; - }; - timeout = "15s"; - }; - }; - } - ); - }; - - services.prometheus.scrapeConfigs = [ - { - job_name = "blackbox"; - scrape_interval = "1m"; - metrics_path = "/probe"; - params = { - module = [ "tcp4_connect" ]; - }; - static_configs = [ - { - targets = [ - "tok-00.namely.icu:8080" - "la-00.video.namely.icu:8080" - "auth.xinyang.life:443" - "home.xinyang.life:8000" - ]; - } - ]; - relabel_configs = [ - { - source_labels = [ "__address__" ]; - target_label = "__param_target"; - } - { - source_labels = [ "__param_target" ]; - target_label = "instance"; - } - { - target_label = "__address__"; - replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"; - } - ]; - } - { - job_name = "blackbox_exporter"; - static_configs = [ - { targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}" ]; } - ]; - } - ]; - - custom.prometheus.ruleModules = [ - { - name = "probe_alerts"; - rules = [ - { - alert = "HighProbeLatency"; - expr = "probe_duration_seconds > 0.5"; - for = "2m"; - labels = { - severity = "warning"; - }; - annotations = { - summary = "High request latency on {{ $labels.instance }}"; - description = "95th percentile of request latency is above 0.5 seconds for the last 2 minutes."; - }; - } - ]; - } - ]; - }; -} diff --git a/modules/nixos/prometheus/caddy.nix b/modules/nixos/prometheus/caddy.nix deleted file mode 100644 index 98e6783..0000000 --- a/modules/nixos/prometheus/caddy.nix +++ /dev/null @@ -1,40 +0,0 @@ -{ config, lib, ... 
}: -let - cfg = config.custom.prometheus; -in -{ - config = lib.mkIf (cfg.enable && cfg.exporters.caddy.enable) { - services.caddy.globalConfig = lib.mkIf cfg.exporters.caddy.enable '' - servers { - metrics - } - ''; - - services.prometheus.scrapeConfigs = [ - { - job_name = "caddy"; - static_configs = [ { targets = [ "127.0.0.1:2019" ]; } ]; - } - ]; - - custom.prometheus.ruleModules = [ - { - name = "caddy_alerts"; - rules = [ - { - alert = "UpstreamHealthy"; - expr = "caddy_reverse_proxy_upstreams_healthy != 1"; - for = "5m"; - labels = { - severity = "critical"; - }; - annotations = { - summary = "Upstream {{ $labels.unstream }} not healthy"; - }; - } - ]; - } - ]; - }; - -} diff --git a/modules/nixos/prometheus/default.nix b/modules/nixos/prometheus/default.nix index ed2544a..e911def 100644 --- a/modules/nixos/prometheus/default.nix +++ b/modules/nixos/prometheus/default.nix @@ -1,21 +1,16 @@ { config, - pkgs, lib, ... }: - -with lib; - let + inherit (lib) + mkEnableOption + mkOption + mkIf + types + ; cfg = config.custom.prometheus; - mkExporterOption = - enableOption: - (mkOption { - type = types.bool; - default = enableOption; - description = "Enable this exporter"; - }); mkRulesOption = mkOption { type = types.listOf ( @@ -30,38 +25,36 @@ let in { imports = [ - ./blackbox.nix - ./caddy.nix - ./gotosocial.nix - ./immich.nix - ./miniflux.nix - ./ntfy-sh.nix - ./restic.nix + ./exporters.nix + ./grafana.nix ]; options = { + custom.monitoring = { + grafana = { + enable = mkEnableOption "grafana with oauth only"; + }; + }; custom.prometheus = { enable = mkEnableOption "Prometheus instance"; - exporters = { - enable = mkOption { - type = types.bool; - default = false; - description = "Enable Prometheus exporter on every supported services"; - }; - - restic.enable = mkExporterOption config.services.restic.server.enable; - blackbox.enable = mkExporterOption false; - caddy.enable = mkExporterOption config.services.caddy.enable; - gotosocial.enable = mkExporterOption 
config.services.gotosocial.enable; - immich.enable = mkExporterOption config.services.immich.enable; - miniflux.enable = mkExporterOption config.services.miniflux.enable; - ntfy-sh.enable = mkExporterOption config.services.ntfy-sh.enable; - }; - grafana = { - enable = mkEnableOption "Grafana Cloud"; - password_file = mkOption { type = types.path; }; - }; ruleModules = mkRulesOption; + exporters = { + enable = mkEnableOption "prometheus exporter on all supported and enable guarded services"; + node = { + enable = mkEnableOption "node exporter"; + listenAddress = mkOption { + type = types.str; + default = "${config.networking.hostName}.coho-tet.ts.net"; + }; + }; + blackbox = { + enable = mkEnableOption "blackbox exporter"; + listenAddress = mkOption { + type = types.str; + default = "${config.networking.hostName}.coho-tet.ts.net"; + }; + }; + }; }; }; @@ -78,46 +71,18 @@ in reverse_proxy 127.0.0.1:${toString config.services.prometheus.port} ''; }; - services.prometheus = mkIf cfg.enable { enable = true; port = 9091; globalConfig.external_labels = { hostname = config.networking.hostName; }; - remoteWrite = mkIf cfg.grafana.enable [ - { - name = "grafana"; - url = "https://prometheus-prod-24-prod-eu-west-2.grafana.net/api/prom/push"; - basic_auth = { - username = "1340065"; - password_file = cfg.grafana.password_file; - }; - } - ]; - exporters = { - node = { - enable = true; - enabledCollectors = [ - "loadavg" - "time" - "systemd" - ]; - listenAddress = "127.0.0.1"; - port = 9100; - }; - }; + scrapeConfigs = [ { job_name = "prometheus"; static_configs = [ { targets = [ "localhost:${toString config.services.prometheus.port}" ]; } ]; } - { - job_name = "node"; - static_configs = [ - { targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; } - ]; - } ]; alertmanager = { @@ -163,55 +128,8 @@ in }; custom.prometheus.ruleModules = [ { - name = "system_alerts"; + name = "prometheus_alerts"; rules = [ - { - alert = "SystemdFailedUnits"; - expr = 
"node_systemd_unit_state{state=\"failed\"} > 0"; - for = "5m"; - labels = { - severity = "critical"; - }; - annotations = { - summary = "Systemd has failed units on {{ $labels.instance }}"; - description = "There are {{ $value }} failed units on {{ $labels.instance }}. Immediate attention required!"; - }; - } - { - alert = "HighLoadAverage"; - expr = "node_load1 > 0.8 * count without (cpu) (node_cpu_seconds_total{mode=\"idle\"})"; - for = "1m"; - labels = { - severity = "warning"; - }; - annotations = { - summary = "High load average detected on {{ $labels.instance }}"; - description = "The 1-minute load average ({{ $value }}) exceeds 80% the number of CPUs."; - }; - } - { - alert = "HighTransmitTraffic"; - expr = "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) > 100000000"; - for = "1m"; - labels = { - severity = "warning"; - }; - annotations = { - summary = "High network transmit traffic on {{ $labels.instance }} ({{ $labels.device }})"; - description = "The network interface {{ $labels.device }} on {{ $labels.instance }} is transmitting data at a rate exceeding 100 MB/s for the last 1 minute."; - }; - } - { - alert = "NetworkTrafficExceedLimit"; - expr = ''increase(node_network_transmit_bytes_total{device!="lo",device!~"tailscale.*",device!~"wg.*",device!~"br.*"}[30d]) > 322122547200''; - for = "0m"; - labels = { - severity = "critical"; - }; - annotations = { - summary = "Outbound network traffic exceed 300GB for last 30 day"; - }; - } { alert = "JobDown"; expr = "up == 0"; diff --git a/modules/nixos/prometheus/exporters.nix b/modules/nixos/prometheus/exporters.nix new file mode 100644 index 0000000..15c7ba2 --- /dev/null +++ b/modules/nixos/prometheus/exporters.nix @@ -0,0 +1,65 @@ +{ + config, + pkgs, + lib, + ... 
+}: +let + inherit (lib) mkIf; + cfg = config.custom.prometheus.exporters; +in +{ + config = { + services.prometheus.exporters.node = mkIf cfg.node.enable { + enable = true; + enabledCollectors = [ + "loadavg" + "time" + "systemd" + ]; + listenAddress = cfg.node.listenAddress; + port = 9100; + }; + + services.prometheus.exporters.blackbox = mkIf cfg.blackbox.enable { + enable = true; + listenAddress = cfg.blackbox.listenAddress; + configFile = pkgs.writeText "blackbox.config.yaml" ( + lib.generators.toYAML { } { + modules = { + tcp4_connect = { + prober = "tcp"; + tcp = { + ip_protocol_fallback = false; + preferred_ip_protocol = "ip4"; + tls = false; + }; + timeout = "15s"; + }; + }; + } + ); + }; + + services.gotosocial.settings = { + metrics-enabled = true; + }; + + services.immich.environment = { + IMMICH_TELEMETRY_INCLUDE = "all"; + }; + + services.restic.server.prometheus = true; + systemd.services.miniflux.environment.METRICS_COLLECTOR = "1"; + services.ntfy-sh.settings.enable-metrics = true; + + services.caddy.globalConfig = '' + servers { + metrics + } + + admin ${config.networking.hostName}.coho-tet.ts.net:2019 { + } + ''; + }; +} diff --git a/modules/nixos/prometheus/gotosocial.nix b/modules/nixos/prometheus/gotosocial.nix deleted file mode 100644 index e5da05e..0000000 --- a/modules/nixos/prometheus/gotosocial.nix +++ /dev/null @@ -1,17 +0,0 @@ -{ config, lib, ... }: -let - cfg = config.custom.prometheus; -in -{ - config = lib.mkIf (cfg.enable && cfg.exporters.gotosocial.enable) { - services.gotosocial.settings = { - metrics-enabled = true; - }; - services.prometheus.scrapeConfigs = [ - { - job_name = "gotosocial"; - static_configs = [ { targets = [ "localhost:8080" ]; } ]; - } - ]; - }; -} diff --git a/modules/nixos/prometheus/grafana.nix b/modules/nixos/prometheus/grafana.nix new file mode 100644 index 0000000..e1b2cf3 --- /dev/null +++ b/modules/nixos/prometheus/grafana.nix @@ -0,0 +1,43 @@ +{ config, lib, ... 
}: +let + cfg = config.custom.monitoring.grafana; +in +{ + config = lib.mkIf cfg.enable { + sops.templates."grafana.env".content = '' + GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET=${config.sops.placeholder."grafana/oauth_secret"} + ''; + services.grafana = { + enable = true; + settings = { + server = { + http_addr = "127.0.0.1"; + http_port = 3003; + root_url = "https://grafana.xinyang.life"; + domain = "grafana.xinyang.life"; + }; + "auth.generic_oauth" = { + enabled = true; + name = "Kanidm"; + client_id = "grafana"; + scopes = "openid,profile,email,groups"; + auth_url = "https://auth.xinyang.life/ui/oauth2"; + token_url = "https://auth.xinyang.life/oauth2/token"; + api_url = "https://auth.xinyang.life/oauth2/openid/grafana/userinfo"; + use_pkce = true; + use_refresh_token = true; + allow_sign_up = true; + login_attribute_path = "preferred_username"; + groups_attribute_path = "groups"; + role_attribute_path = "contains(grafana_role[*], 'GrafanaAdmin') && 'GrafanaAdmin' || contains(grafana_role[*], 'Admin') && 'Admin' || contains(grafana_role[*], 'Editor') && 'Editor' || 'Viewer'"; + allow_assign_grafana_admin = true; + auto_login = true; + }; + "auth" = { + disable_login_form = true; + }; + }; + }; + systemd.services.grafana.serviceConfig.EnvironmentFile = config.sops.templates."grafana.env".path; + }; +} diff --git a/modules/nixos/prometheus/immich.nix b/modules/nixos/prometheus/immich.nix deleted file mode 100644 index 4b92500..0000000 --- a/modules/nixos/prometheus/immich.nix +++ /dev/null @@ -1,25 +0,0 @@ -{ config, lib, ... 
}: -let - cfg = config.custom.prometheus; - immichEnv = config.services.immich.environment; - metricPort = - if builtins.hasAttr "IMMICH_API_METRICS_PORT" immichEnv then - immichEnv.IMMICH_API_METRICS_PORT - else - 8081; -in -{ - config = lib.mkIf (cfg.enable && cfg.exporters.immich.enable) { - services.immich.environment = { - IMMICH_METRICS = "true"; - }; - - services.prometheus.scrapeConfigs = [ - { - job_name = "immich"; - static_configs = [ { targets = [ "127.0.0.1:${toString metricPort}" ]; } ]; - } - ]; - }; - -} diff --git a/modules/nixos/prometheus/miniflux.nix b/modules/nixos/prometheus/miniflux.nix deleted file mode 100644 index b437b00..0000000 --- a/modules/nixos/prometheus/miniflux.nix +++ /dev/null @@ -1,15 +0,0 @@ -{ config, lib, ... }: -let - cfg = config.custom.prometheus; -in -{ - config = lib.mkIf (cfg.enable && cfg.exporters.miniflux.enable) { - systemd.services.miniflux.environment.METRICS_COLLECTOR = "1"; - services.prometheus.scrapeConfigs = [ - { - job_name = "miniflux"; - static_configs = [ { targets = [ config.systemd.services.miniflux.environment.LISTEN_ADDR ]; } ]; - } - ]; - }; -} diff --git a/modules/nixos/prometheus/ntfy-sh.nix b/modules/nixos/prometheus/ntfy-sh.nix deleted file mode 100644 index 94e81f7..0000000 --- a/modules/nixos/prometheus/ntfy-sh.nix +++ /dev/null @@ -1,15 +0,0 @@ -{ config, lib, ... 
}:
-let
-  cfg = config.custom.prometheus;
-in
-{
-  config = lib.mkIf (cfg.enable && cfg.exporters.ntfy-sh.enable) {
-    services.ntfy-sh.settings.enable-metrics = true;
-    services.prometheus.scrapeConfigs = [
-      {
-        job_name = "ntfy-sh";
-        static_configs = [ { targets = [ "ntfy.xinyang.life" ]; } ];
-      }
-    ];
-  };
-}
diff --git a/modules/nixos/prometheus/restic.nix b/modules/nixos/prometheus/restic.nix
index a3ab710..24dcfa8 100644
--- a/modules/nixos/prometheus/restic.nix
+++ b/modules/nixos/prometheus/restic.nix
@@ -3,17 +3,26 @@ let
   cfg = config.custom.prometheus;
 in
 {
-  config = lib.mkIf (cfg.enable && cfg.exporters.restic.enable) {
+  config = {
     services.restic.server.prometheus = true;
 
-    services.prometheus.scrapeConfigs = [
-      (lib.mkIf cfg.exporters.restic.enable {
-        job_name = "restic";
-        static_configs = [ { targets = [ config.services.restic.server.listenAddress ]; } ];
-      })
-    ];
+    custom.prometheus.templates.scrape.mkResticScrapes =
+      {
+        address,
+        port ? null,
+        ...
+      }:
+      let
+        portStr = if port != null then ":${toString port}" else "";
+      in
+      [
+        (lib.mkIf cfg.exporters.restic.enable {
+          job_name = "restic";
+          static_configs = [ { targets = [ "${address}${portStr}" ]; } ];
+        })
+      ];
 
-    custom.prometheus.ruleModules = [
+    custom.prometheus.templates.rules.mkResticRules = [
       {
         name = "restic_alerts";
         rules = [
diff --git a/overlays/my-lib/default.nix b/overlays/my-lib/default.nix
new file mode 100644
index 0000000..8d07bc1
--- /dev/null
+++ b/overlays/my-lib/default.nix
@@ -0,0 +1,3 @@
+{
+}
+// (import ./prometheus.nix)
diff --git a/overlays/my-lib/prometheus.nix b/overlays/my-lib/prometheus.nix
new file mode 100644
index 0000000..29a0362
--- /dev/null
+++ b/overlays/my-lib/prometheus.nix
@@ -0,0 +1,213 @@
+# Helpers that build Prometheus scrape configs and alert rule groups.
+# Every mk* function takes a list of attribute sets and returns a list with
+# one generated entry per element.
+let
+  # Lift a per-target function `f` into a function over a list of targets.
+  mkFunction = f: (targets: (map f targets));
+  # Render ":<port>" for a target string, or "" when the port is null.
+  mkPort = port: if port == null then "" else ":${toString port}";
+in
+{
+  # Generic scrape job named "<name>(<address>)"; defaults to https on port 443.
+  mkScrapes = mkFunction (
+    {
+      name,
+      address,
+      port ? 443,
+      scheme ? "https",
+      ...
+    }:
+    {
+      job_name = "${name}(${address})";
+      scheme = scheme;
+      static_configs = [ { targets = [ "${address}${mkPort port}" ]; } ];
+    }
+  );
+
+  # Scrape job named "caddy_<address>"; port defaults to 2019.
+  mkCaddyScrapes = mkFunction (
+    {
+      address,
+      port ? 2019,
+      ...
+    }:
+    {
+      job_name = "caddy_${address}";
+      static_configs = [ { targets = [ "${address}${mkPort port}" ]; } ];
+    }
+  );
+
+  # Alert rule group on caddy_reverse_proxy_upstreams_healthy, named per host.
+  mkCaddyRules = mkFunction (
+    {
+      host ? "",
+      name ? "caddy_alerts_${host}",
+      ...
+    }:
+    {
+      inherit name;
+      rules = [
+        {
+          alert = "UpstreamHealthy";
+          expr = "caddy_reverse_proxy_upstreams_healthy != 1";
+          for = "5m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "Upstream {{ $labels.upstream }} not healthy";
+          };
+        }
+      ];
+    }
+  );
+
+  # Scrape job named "node_<address>"; port defaults to 9100.
+  mkNodeScrapes = mkFunction (
+    {
+      address,
+      port ? 9100,
+      ...
+    }:
+    {
+      job_name = "node_${address}";
+      static_configs = [ { targets = [ "${address}${mkPort port}" ]; } ];
+    }
+  );
+
+  # Alert rule group on node_* metrics (systemd units, load, network), named per host.
+  mkNodeRules = mkFunction (
+    {
+      host ? "",
+      name ? "system_alerts_${host}",
+      ...
+    }:
+    {
+      inherit name;
+      rules = [
+        {
+          alert = "SystemdFailedUnits";
+          expr = "node_systemd_unit_state{state=\"failed\"} > 0";
+          for = "5m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "Systemd has failed units on {{ $labels.instance }}";
+            description = "There are {{ $value }} failed units on {{ $labels.instance }}. Immediate attention required!";
+          };
+        }
+        {
+          alert = "HighLoadAverage";
+          expr = "node_load1 > 0.8 * count without (cpu) (node_cpu_seconds_total{mode=\"idle\"})";
+          for = "1m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "High load average detected on {{ $labels.instance }}";
+            description = "The 1-minute load average ({{ $value }}) exceeds 80% the number of CPUs.";
+          };
+        }
+        {
+          alert = "HighTransmitTraffic";
+          expr = "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) > 100000000";
+          for = "1m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "High network transmit traffic on {{ $labels.instance }} ({{ $labels.device }})";
+            description = "The network interface {{ $labels.device }} on {{ $labels.instance }} is transmitting data at a rate exceeding 100 MB/s for the last 1 minute.";
+          };
+        }
+        {
+          alert = "NetworkTrafficExceedLimit";
+          expr = ''increase(node_network_transmit_bytes_total{device!="lo",device!~"tailscale.*",device!~"wg.*",device!~"br.*"}[30d]) > 322122547200'';
+          for = "0m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "Outbound network traffic exceed 300GB for last 30 day";
+          };
+        }
+      ];
+    }
+  );
+
+  # Probe job: each target is relabelled into the `target` param and scraped via hostAddress:hostPort.
+  mkBlackboxScrapes = mkFunction (
+    {
+      hostAddress,
+      hostPort ? 9115,
+      targetAddresses,
+      ...
+    }:
+    {
+      job_name = "blackbox(${hostAddress})";
+      scrape_interval = "1m";
+      metrics_path = "/probe";
+      params = {
+        module = [ "tcp4_connect" ];
+      };
+      static_configs = [
+        {
+          targets = targetAddresses;
+        }
+      ];
+      relabel_configs = [
+        {
+          source_labels = [ "__address__" ];
+          target_label = "__param_target";
+        }
+        {
+          source_labels = [ "__param_target" ];
+          target_label = "instance";
+        }
+        {
+          target_label = "__address__";
+          replacement = "${hostAddress}${mkPort hostPort}";
+        }
+      ];
+    }
+  );
+
+  # Alert rule group on probe_duration_seconds: warning above 0.5s, critical above 1s.
+  mkBlackboxRules = mkFunction (
+    {
+      host ? "",
+      name ? "probe_alerts_${host}",
+      ...
+    }:
+    {
+      inherit name;
+      rules = [
+        {
+          alert = "HighProbeLatency";
+          expr = "probe_duration_seconds > 0.5";
+          for = "3m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "High request latency on {{ $labels.instance }}";
+            description = "Request latency is above 0.5 seconds for the last 3 minutes.";
+          };
+        }
+        {
+          alert = "VeryHighProbeLatency";
+          expr = "probe_duration_seconds > 1";
+          for = "3m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "High request latency on {{ $labels.instance }}";
+            description = "Request latency is above 1 second for the last 3 minutes.";
+          };
+        }
+      ];
+    }
+  );
+}