From ef4bdef838be88616dbfda6e6420e2458b869dc5 Mon Sep 17 00:00:00 2001 From: Matthew Grotke Date: Thu, 11 Jun 2026 01:31:57 -0400 Subject: [PATCH] Development --- ADMINISTRATORS_GUIDE.md | 163 +++++ routlin/DESCRIPTION.txt => DESCRIPTION.txt | 0 GUIDE.md => RADIUS_CAPTIVE_PORTAL.md | 0 README.md | 135 ++-- .../routlin-dash/app/pages/dnsserver/view.py | 7 - routlin/USAGE.md | 284 --------- routlin/core.py | 14 +- routlin/create_vpn_peer.py | 2 +- routlin/health.py | 11 +- routlin/maintenance.py | 575 +----------------- routlin/metrics.py | 517 ++++++++++++++++ routlin/mod_dns_queries.py | 205 ------- routlin/mod_metrics.py | 284 --------- routlin/mod_timers.py | 2 +- 14 files changed, 780 insertions(+), 1419 deletions(-) create mode 100644 ADMINISTRATORS_GUIDE.md rename routlin/DESCRIPTION.txt => DESCRIPTION.txt (100%) rename GUIDE.md => RADIUS_CAPTIVE_PORTAL.md (100%) delete mode 100644 routlin/USAGE.md create mode 100644 routlin/metrics.py delete mode 100644 routlin/mod_dns_queries.py delete mode 100644 routlin/mod_metrics.py diff --git a/ADMINISTRATORS_GUIDE.md b/ADMINISTRATORS_GUIDE.md new file mode 100644 index 0000000..603c348 --- /dev/null +++ b/ADMINISTRATORS_GUIDE.md @@ -0,0 +1,163 @@ +# Administrators Guide + +All scripts below live in the routlin project root and must be run as root (or with sudo) unless noted otherwise. + +--- + +## install.py + +Setup wizard. Installs system packages, configures Docker, sets up the dashboard container, and installs all systemd timers. + +```bash +sudo python3 install.py +sudo python3 install.py --yes # (or -y) accept all prompts automatically +``` + +Safe to re-run. Will not duplicate files. After install.py completes, run `core.py --apply` to push config.json into the live system for the first time. + +--- + +## core.py + +The main configuration engine. Reads config.json and applies it to systemd-networkd, per-VLAN dnsmasq instances, nftables, WireGuard, FreeRADIUS, and Avahi. 
Most dashboard Save, Apply, and Fix operations ultimately trigger core.py via the queue timer. + +```bash +sudo python3 core.py --apply # apply full config (safe to re-run multiple times) +sudo python3 core.py --apply --dry-run # preview changes without making them +sudo python3 core.py --disable # interactive wizard to revert the router to a plain client +sudo python3 core.py --disable --dry-run # preview what --disable would remove + +sudo python3 core.py --merge-blocklists # merge downloaded blocklists and reload dnsmasq +sudo python3 core.py --view-configs # print active per-VLAN dnsmasq config files +sudo python3 core.py --view-leases # print active DHCP leases +sudo python3 core.py --view-rules # print active nftables ruleset +sudo python3 core.py --reset-leases # reset all DHCP leases +sudo python3 core.py --reset-leases 30 # reset leases for VLAN 30 only +sudo python3 core.py --status # show service and timer status +``` + +**Dashboard equivalents:** + +| Flag | Dashboard | +|---|---| +| --apply | Actions page - Apply Now button | +| --merge-blocklists | DNS Blocking page - Download Blocklists button | +| --view-leases | DHCP Reservations page (active leases section) | + +--- + +## maintenance.py + +Runs on a timer (default every 10 minutes). Runs `ddns.py --update`, rotates FreeRADIUS logs, refreshes the ARP cache, and runs `metrics.py --collect`. Run manually to trigger all tasks immediately without waiting for the timer. + +```bash +sudo python3 maintenance.py +``` + +**Timer:** `routlin-maintenance.timer` - interval set in config.json under `ddns.general.timer_interval` (default 10 minutes). + +--- + +## metrics.py + +Collects DNS statistics from running dnsmasq instances and stores them in the metrics database. Also collects and prunes per-query DNS logs. Run `--collect` manually to update metrics immediately rather than waiting for the maintenance timer. Run `--view` to print an all-time summary to the terminal. 
The dashboard's DNS Metrics page shows the updated data after `--collect` runs. + +```bash +sudo python3 metrics.py --collect # collect and store metrics now +sudo python3 metrics.py --view # display all-time metrics summary +``` + +`maintenance.py` calls `metrics.py --collect` automatically on each tick. + +--- + +## health.py + +Runs health checks across all services, configuration files, nftables rules, VLAN interfaces, DHCP pools, disk space, upstream DNS, and FreeRADIUS logs. Writes results to `.health`. + +```bash +sudo python3 health.py --collect # run checks and write .health (used by timer) +sudo python3 health.py --view # run checks, write .health, and print results +``` + +`--view` gives an immediate health report without waiting for the timer. The dashboard health banner reads the same `.health` file, so running either flag manually refreshes what the dashboard shows on next page load. + +**Timer:** `routlin-health-check.timer` - every 5 minutes. + +--- + +## ddns.py + +Updates DDNS provider(s) with the current public IP. Called directly by `do_dashboard_queue.sh` for on-demand triggers from the dashboard, and by `maintenance.py` on each timer tick. Shares cache files (`.ddns-last-ip-*`, `.ddns-last-service`) and `ddns.log` across both callers. The DDNS page shows provider status and last update time. + +```bash +sudo python3 ddns.py --update # run one update, advancing the IP-check service rotation +sudo python3 ddns.py --update --force # update unconditionally, ignoring cached IP +sudo python3 ddns.py --getip # print current public IP and exit, advancing the IP-check service rotation +``` + +--- + +## dl_blocklists.py + +Downloads remote DNS blocklists defined in config.json to the `blocklists/` directory. Does not reload dnsmasq on its own - follow with `core.py --merge-blocklists` to merge and reload. 
+ +```bash +sudo python3 dl_blocklists.py +sudo python3 core.py --merge-blocklists +``` + +**Timer:** `routlin-dns-blocklist-update.timer` - daily at the time configured in config.json (default 02:30 local time). The timer runs both steps automatically. + +**Dashboard equivalent:** DNS Blocking page - Download Blocklists button (runs both steps). + +--- + +## create_vpn_peer.py + +Generates a WireGuard keypair, adds the peer to config.json, and writes a ready-to-import client `.conf` file. Follow with `core.py --apply` to make the peer(s) that you generate live. + +```bash +sudo python3 create_vpn_peer.py --name laptop --ip 192.168.40.2 --vlan-id 40 +sudo python3 create_vpn_peer.py --name phone --ip 192.168.40.3 --vlan-id 40 --split-tunnel +sudo python3 create_vpn_peer.py --name laptop --ip 192.168.40.2 --iface wg0 +sudo python3 core.py --apply +``` + +| Flag | Description | +|---|---| +| --name NAME | Peer name (required) | +| --ip IP | Peer IP within the VPN subnet (required) | +| --vlan-id ID | Target VPN VLAN ID (use this or --iface) | +| --iface IFACE | WireGuard interface name e.g. wg0 (use this or --vlan-id) | +| --split-tunnel | Route only VPN subnet traffic through the tunnel (default is full tunnel) | +| --output PATH | Output path for the client .conf file | + +Transfer the generated `.conf` file to the client device securely (scp, QR code, etc.). Never send it over unencrypted channels. + +**Dashboard equivalent:** VPN page - Add Peer form. + +--- + +## check_captive_users.py + +Expires captive portal sessions that have exceeded their time limit or account expiry, and writes the corresponding nftables disallow commands to the captive queue. The Captive Portal page shows active sessions. Expiry is handled automatically by the timer; there is no manual trigger in the dashboard. + +```bash +sudo python3 check_captive_users.py +``` + +**Timer:** `routlin-captive-check.timer` - every 5 minutes. 
+ +--- + +## Timer Reference + +| Timer | Script | Interval | +|---|---|---| +| `routlin-dashboard-queue.timer` | dashboard queue processor | 30 seconds | +| `routlin-captive-queue.timer` | captive portal queue | 10 seconds | +| `routlin-captive-check.timer` | check_captive_users.py | 5 minutes | +| `routlin-health-check.timer` | health.py --collect | 5 minutes | +| `routlin-maintenance.timer` | maintenance.py | 10 minutes (configurable) | +| `routlin-dns-blocklist-update.timer` | dl_blocklists.py + core.py --merge-blocklists | Daily at configured time | diff --git a/routlin/DESCRIPTION.txt b/DESCRIPTION.txt similarity index 100% rename from routlin/DESCRIPTION.txt rename to DESCRIPTION.txt diff --git a/GUIDE.md b/RADIUS_CAPTIVE_PORTAL.md similarity index 100% rename from GUIDE.md rename to RADIUS_CAPTIVE_PORTAL.md diff --git a/README.md b/README.md index 6b4599b..eddad61 100644 --- a/README.md +++ b/README.md @@ -1,113 +1,94 @@ # Routlin -A collection of Python scripts that transform an existing Linux server (with at least two Ethernet NICs) into a fully-featured enterprise-grade router, eliminating the need for a separate router appliance. +Turn any Linux machine with two NICs into an enterprise-grade router and firewall. Ditch vendor-gated appliances and opaque firmware while keeping your machine fully multipurpose and under your control. Manage VLANs, NAT, DNS, DHCP, VPN, RADIUS, mDNS, and content filtering through a modern web dashboard - all built on battle-tested Linux tools you already trust. + +Designed to integrate seamlessly with existing enterprise and prosumer networking hardware. + +--- ## Why Replace Your Router? -Consumer and prosumer router appliances are constrained by OEM firmware. Security patches depend entirely on the vendor's release schedule, features and functionality are often limited, and devices that reach end of life receive no vendor support at all, leaving gaping security vulnerabilities exposed on your network indefinitely. 
Running your router on a general-purpose Linux machine gives you: +Consumer and prosumer router appliances are constrained by OEM firmware. Security patches depend entirely on the vendor's release schedule, features are limited by design, and devices that reach end of life get abandoned with no further updates - leaving permanent security holes on your network. -- **Faster speeds** - Utilize full fledged computer hardware (typically exceeds that of consumer appliances) -- **Full flexibility** - Any configuration that Linux and its tooling support -- **Better security** - Patch your own kernel and packages on your own schedule, with no dependency on a vendor who may have abandoned your hardware +Running your router on a general-purpose Linux machine gives you: + +- **Faster speeds** - Use real computer hardware that outperforms consumer appliances +- **Full flexibility** - Any configuration that Linux supports +- **Better security** - Patch your kernel and packages on your schedule, not the vendor's --- -## Summary +## What Routlin Does -These scripts do not run continuously in the background. They install and facilitate the configuration of battle-hardened software (`dnsmasq` for DHCP and DNS, `nftables` for firewall and NAT, `chrony` for NTP, `freeradius` for RADIUS, `avahi` for mDNS discovery, and `wireguard` for VPN) using JSON files that you edit. A fully-featured, easy-to-use web management dashboard is included for users who prefer not to edit JSON directly. +Routlin installs and manages battle-hardened Linux networking software - `dnsmasq`, `nftables`, `freeradius`, `wireguard`, `avahi`, and `chrony` - using a configuration file and a web dashboard. The scripts are not daemons; they configure the real software and get out of the way. + +**VLANs and Network Segmentation** - Slice your network into isolated segments: a trusted LAN, a guest Wi-Fi, an IoT VLAN, a camera network. Each VLAN gets its own DHCP pool, DNS resolver, and firewall rules. 
Devices on different VLANs cannot reach each other unless you explicitly allow it. + +**DNS Filtering and Content Blocking** - Block ads, malware, and trackers at the DNS level across your whole network. Apply different blocklists to different VLANs - strict filtering on the kids' network, none on the server VLAN. Blocklists update automatically on a daily schedule. + +**WireGuard VPN** - Connect remotely to your home network from anywhere. Routlin configures the WireGuard server, generates keypairs, and produces ready-to-import client config files. Per-peer split-tunnel and full-tunnel routing supported. + +**RADIUS and 802.1X** - Assign devices to VLANs dynamically based on their MAC address using your managed switch or access point. No more manually configuring per-port VLANs on your switch - plug in a device and it lands in the right network automatically. + +**DDNS** - Keep a hostname pointed at your home IP even when your ISP changes it. Supports Cloudflare, No-IP, and DuckDNS with automatic rotation across multiple IP-check services. + +**Web Dashboard** - A clean, modern interface for managing everything without editing JSON by hand. Changes are validated, queued, and applied to the live system automatically. --- -## Capabilities +## Two Editions -The suite is organized into independent but complementary scripts, each managing one layer of the stack: +### Routlin CE - Free for Individual Use -### Core: DHCP, DNS, Firewall, RADIUS, mDNS, and WireGuard VPN (`core.py`) +Routlin CE is the core product and is free for personal, non-commercial use. It includes everything described above: VLANs, DHCP, DNS filtering, WireGuard VPN, RADIUS/802.1X, DDNS, mDNS reflection, port forwarding, the web dashboard, and more. 
-- Configures VLAN sub-interfaces via `systemd-networkd` -- Assigns static or dynamic DHCP reservations by MAC address and hostname -- Defines dynamic IP pools per VLAN -- Manages per-VLAN gateway, DNS, and NTP settings derived from `server_identities` -- Runs one `dnsmasq` instance per VLAN, each bound exclusively to its gateway IP, giving true per-VLAN DNS filtering -- Applies per-VLAN content filtering - VLANs with different blocklist sets each get their own merged blocklist (blocklists are downloaded and merged by `dns-blocklists.py`) -- Supports local hostname overrides (split DNS for DDNS hostnames) -- Installs a daily `systemd` timer that runs `dns-blocklists.py` to refresh blocklists -- Tracks lifetime DNS metrics (queries forwarded, cache hits, authoritative, TCP peaks, pool usage) -- Builds `nftables` tables atomically - safe to re-apply without service disruption -- Handles port forwarding (DNAT/SNAT) for externally accessible services -- Handles port wrangling - redirects DNS and NTP requests to the local resolver regardless of what the client device may have hardcoded -- Blocks traffic from specific IPs or subnets via `banned_ips` - supports single IPs, CIDR notation, wildcards, and ranges for both IPv4 and IPv6 -- Enforces inter-VLAN isolation by default (forward chain policy drop); specific cross-VLAN traffic is permitted via `inter_vlan_exceptions` -- Masquerades outbound traffic for all non-WireGuard VLANs automatically -- Auto-detects active container bridge interfaces (Docker, Podman, libvirt, etc.) 
and adds forward rules so VLAN clients can reach containerized services -- Auto-detects active container bridge interfaces and adds DNS listening on each bridge IP, so containers can reach the local DNS resolver during builds and at runtime -- Installs a `systemd` boot service (`routlin-nat.service`) to re-apply firewall rules on every boot -- Co-exists with Docker (does not touch Docker-managed `nat`/`filter` tables) -- Generates FreeRADIUS `clients.conf` and `users` files from `core.json` reservations, enabling dynamic VLAN assignment via MAC Authentication Bypass (MAB) for both wired (802.1X) and wireless clients -- Manages a `.radius-secret` shared secret file (generated automatically on first `--apply` if RADIUS is enabled) -- Configures `avahi-daemon` as an mDNS reflector to forward service discovery announcements (AirPrint, AirPlay, Chromecast, etc.) across VLANs -- Supports any number of WireGuard VPN interfaces (`is_vpn: true` VLANs); generates the server keypair on first apply, writes the server conf to `/etc/wireguard/`, and brings the interface up with `wg-quick`; subsequent applies sync peer changes live without restarting the interface -- Supports per-peer split-tunnel (VPN subnet only) or full-tunnel (all traffic) routing; peer data is stored directly in `core.json` +### Routlin Pro - Paid License -### Optional: DNS Blocklists (`dns-blocklists.py`) +Routlin Pro is a paid license tier with advanced security and monitoring features suited for businesses, managed networks, and power users who need deeper visibility and control. -- Downloads blocklists from upstream providers you choose (e.g. OISD, Hagezi) -- Merges them per unique VLAN combination into conf files loaded by `dnsmasq` -- Runs `core.py --apply` after a successful download to reload all instances -- Invoked by the daily `systemd` timer installed by `core.py --apply` +**Deep Packet Inspection and Device Identification** - See exactly what every device on your network is doing. 
Routlin Pro automatically classifies devices and identifies traffic types in real time - streaming, gaming, P2P, VoIP, and more - feeding into a Security Insights dashboard and per-device traffic rules. -### Optional: DDNS (`ddns.py`) +**Intrusion Detection and Prevention (IDS/IPS)** - Monitor your network for known threat signatures across all traffic, not just DNS. Choose alert-only mode or automatic blocking. Signature database updated regularly, with an optional extended commercial threat feed. -- Detects the current public IP by rotating through multiple IP-check services -- Updates the specified DNS providers (currently supporting Cloudflare, No-IP and DuckDNS), supporting multiple hostnames and subdomains per provider -- Caches the last known IP per provider to avoid unnecessary API calls -- Installs a `systemd` timer that runs every 5 minutes by default -- Logs all updates and errors to `ddns.log` +**SSL/TLS Traffic Inspection** - See inside encrypted HTTPS traffic for security monitoring and content filtering. Routlin Pro decrypts, inspects, and re-encrypts on the fly, enabling IDS/IPS and anomaly detection to work on traffic that would otherwise be completely opaque. -### Optional: Routlin Dashboard +**Traffic Flows (Session Logging)** - A full log of every TCP and UDP connection through the router: source, destination, port, bytes, and timing. Filter, sort, and save presets. Invaluable for diagnosing bandwidth problems or investigating unexpected activity. 
-- Web UI for managing all aspects of the router (VLANs, reservations, blocklists, VPN, DDNS, firewall, and more) without editing JSON by hand -- Runs as a Docker container alongside the existing scripts -- Changes made in the dashboard are queued and applied to the live system automatically via a 1-minute systemd timer +**Anomaly and Pattern Detection** - Routlin Pro watches for unusual behavior automatically: unexpected large transfers, SYN flood indicators, overnight activity on idle devices, new device types appearing, and more. Anomalies surface as dashboard alerts and can trigger automated responses like device isolation or rate limiting. + +**Restricted VLANs** - Prevent devices on a VLAN from ever contacting the internet. Perfect for IoT devices, security cameras, NAS, printers, or anything that should never phone home. Works alongside inter-VLAN exception rules so you can still reach quarantined devices from inside the LAN. + +**Supplicant-Based 802.1X Authentication** - Go beyond MAC-based authorization. Routlin Pro adds full EAP-PEAP, EAP-TTLS, and EAP-TLS support, letting devices authenticate with credentials or certificates. Revoke individual device access without changing network passwords - and block anyone spoofing a known MAC. + +**Captive Portal** - Turn any VLAN into a captive portal with a splash page, time-limited vouchers, or full RADIUS login using your existing user accounts. Ideal for guest networks, venues, or any situation where you need to control and track who gets access. --- ## Software Dependencies -These packages are required. `install.py` checks that they are installed and will prompt to install any that are missing. +`install.py` checks for these and will prompt to install any that are missing. 
-| Dependency | Purpose | Required By | -|---|---|---| -| `python3` | Runs all scripts | All | -| `systemd` | Service, timer, networkd, and timesyncd management | All | -| `dnsmasq` | DHCP server and DNS resolver/forwarder | `core.py` | -| `nftables` | Firewall, NAT, port forwarding, and port wrangling | `core.py` | -| `chrony` | NTP server - synchronizes system clock and serves time to VLAN clients | `core.py` | -| `freeradius` | RADIUS server for dynamic VLAN assignment via MAC auth | `core.py` | -| `avahi-daemon` | mDNS reflector for cross-VLAN service discovery | `core.py` | -| `wireguard-tools` | WireGuard VPN (`wg`, `wg-quick`) | `core.py` (when WireGuard VLANs are configured) | -| `docker` | Runs the Routlin Dashboard container | `install.py` (dashboard only) | -| `caddy` | Reverse proxy for external HTTPS access to the dashboard | `install.py` (external access only) | - ---- - -## Conflicting Software - -The following services conflict with this suite. No manual action is required: `core.py` disables them automatically on `--apply`. `core.py` re-enables them on `--disable`. - -- **systemd-resolved** - DNS stub resolver that conflicts with `dnsmasq` on port 53. Disabled on `--apply`; re-enabled on `--disable`. -- **systemd-timesyncd** - Basic SNTP client that cannot serve time to LAN clients; replaced by `chrony`. Disabled on `--apply`; re-enabled on `--disable`. -- **ufw** - Firewall manager that conflicts with the `nftables` ruleset. Disabled on `--apply` without removal. 
+| Dependency | Purpose | +|---|---| +| `python3` | Runs all scripts | +| `systemd` | Service, timer, networkd, and timesyncd management | +| `dnsmasq` | DHCP and DNS | +| `nftables` | Firewall, NAT, and port forwarding | +| `chrony` | NTP server for the router and VLAN clients | +| `freeradius` | RADIUS server for dynamic VLAN assignment | +| `avahi-daemon` | mDNS reflection across VLANs | +| `wireguard-tools` | WireGuard VPN (when VPN VLANs are configured) | +| `docker` | Runs the Routlin Dashboard container (dashboard only) | +| `caddy` | Reverse proxy for external HTTPS access to the dashboard (optional) | --- ## Hardware Requirements -- A Linux server with **at least two Ethernet NICs**: - - One NIC facing your ISP modem/ONT (WAN) - - One NIC facing your internal switch (LAN) +- A Linux server with **at least two Ethernet NICs** + - One NIC connected to your ISP modem or ONT (WAN) + - One NIC connected to your internal switch (LAN) --- -For manual configuration and command-line usage without the dashboard, see [USAGE.md](routlin/USAGE.md). - +For command-line usage without the dashboard, see [ADMINISTRATORS_GUIDE.md](ADMINISTRATORS_GUIDE.md). 
diff --git a/docker/routlin-dash/app/pages/dnsserver/view.py b/docker/routlin-dash/app/pages/dnsserver/view.py index 7fd948d..6bdc882 100644 --- a/docker/routlin-dash/app/pages/dnsserver/view.py +++ b/docker/routlin-dash/app/pages/dnsserver/view.py @@ -31,13 +31,6 @@ def _period_selector_html(current_period): def collect_tokens(cfg): - try: - new = mod_metrics.collect_metrics(cfg) - if new: - mod_metrics.update_metrics_db(new) - except Exception: - pass - tokens = config_utils.collect_layout_tokens(cfg) dns = cfg.get('upstream_dns', {}) servers = dns.get('upstream_servers', []) diff --git a/routlin/USAGE.md b/routlin/USAGE.md deleted file mode 100644 index 601f552..0000000 --- a/routlin/USAGE.md +++ /dev/null @@ -1,284 +0,0 @@ -# Routlin - Manual Usage - -This document covers manual configuration and operation via the command line and JSON files directly. If you are using the Routlin Dashboard web UI, most of this is handled for you and you do not need to follow these steps. - ---- - -## Configuration Files - -All configuration lives in two JSON files. Edit these to match your network before running any scripts. - -| File | Controls | -|---|---| -| `config.json` | VLANs, subnets, gateways, dynamic pools, static/dynamic reservations, RADIUS client flags, mDNS reflection scope, WireGuard interface settings and peers, upstream DNS servers, blocklist sources, per-VLAN blocklist assignments, host overrides, banned IPs, WAN interface, port forwarding rules, port wrangling, inter-VLAN exceptions | -| `ddns.json` | DDNS provider credentials, hostnames/subdomains, update interval, IP-check services | - -### Dotfiles (auto-generated, do not edit) - -| File | Purpose | -|---|---| -| `.radius-secret` | Shared secret between FreeRADIUS and RADIUS clients (APs, switches). Generated automatically on first `--apply` when RADIUS is enabled. Root-owned intentionally. | -| `..pub` | WireGuard server public key per interface (e.g. `.wg0.pub`). 
Written by `core.py --apply`; read by the dashboard to embed in client config downloads. | -| `.dashboard-queue` | Pending apply commands written by the dashboard; consumed by the 1-minute timer. | -| `.dashboard-done` | UUIDs of already-processed queue entries; prevents duplicate execution. | -| `.dashboard-last-run` | Epoch timestamp of the last timer execution. | -| `.dashboard-lock` | PID lock file preventing concurrent timer runs. | -| `.dashboard-pending` | Changes held back when Apply on Save is disabled; flushed to `.dashboard-queue` when Apply Now is clicked. | -| `.health` | JSON health check results written by `core.py --apply`, `core.py --status`, and the `routlin-health-check` timer (every 5 minutes). Read by the dashboard to display problem alerts. | -| `.dns-metrics` | Cumulative lifetime DNS metrics across all VLAN instances. Created and updated each time `--view-metrics` is run. | -| `.ddns-last-ip-*` | Cached public IP per DDNS provider. Managed by `ddns.py`. | -| `.ddns-last-service` | Tracks IP-check service rotation. Managed by `ddns.py`. | - ---- - -## Initial Configuration - -### 1. Edit Core Configuration (`config.json`) - -Edit the top-level `network_interfaces` block: - -- Set `wan_interface` to the name of your WAN-facing NIC (e.g. `eno2`). Run `ip link` to find it. - -Edit the top-level blocks: - -- Set `upstream_dns.upstream_servers` to your preferred DNS resolvers (e.g. `1.1.1.1`, `8.8.8.8`) -- Add blocklist sources under `dns_blocking.blocklists` with a name, URL, and format for each (e.g. OISD, Hagezi) -- Add entries to `host_overrides` for any local hostnames that should resolve to a specific IP (e.g. 
a DDNS hostname pointing to an internal server) -- Add entries to `port_forwarding` for any services that should be reachable from the internet (specify protocol, external port, destination IP, and destination port) -- Add entries to `banned_ips` to block traffic from specific IPs or networks (see below) - -Edit the `vlans` array to match your network topology. For each VLAN: - -- Set `subnet` and `subnet_mask`. The VLAN ID is derived automatically from the subnet: for a `/24` it is the third octet (e.g. `192.168.10.0/24` -> VLAN ID `10`); for a `/16` it is the second octet. Ensure this matches the 802.1Q tag configured on your switch. VLAN ID `1` (e.g. `192.168.1.0/24`) is treated as the untagged physical interface. -- For VLAN 1 (the untagged interface), the physical NIC name is taken from your `general.wan_interface` sibling - set `interface` in `general` to the LAN-facing NIC (e.g. `enp6s0`). Sub-interfaces for all other VLANs are named automatically (e.g. `enp6s0.10`). -- Set `radius_default` to `true` on exactly one VLAN - unknown MACs will be placed here (typically guest). All other VLANs set this to `false`. -- Set `use_blocklists` to a list of blocklist names for this VLAN - leave empty for unfiltered DNS -- Set `server_identities` to the IPs the router itself will hold on this VLAN. The lowest last-octet IP is auto-used as gateway, DNS, and NTP server unless overridden in `dhcp_information.explicit_overrides`. -- Set `dhcp_information` fields: pool start/end, `lease_time`, and optionally `explicit_overrides` for gateway, dns_server, or ntp_server -- Add `reservations` for devices that need a known VLAN assignment by MAC address. 
The `ip` field is optional: - - Omit `ip`, set it to `""`, or set it to `"dynamic"` to let DHCP assign from the pool (hostname is still set) - - Set `ip` to a specific address outside the dynamic pool to pin the device to that IP - - Set `radius_client: true` on any device (AP, switch) that will authenticate other devices via RADIUS -- Add per-VLAN `port_wrangling` entries to redirect DNS or NTP requests to the local resolver -- For WireGuard VLANs, set `is_vpn: true` and include a `vpn_information` block instead of `dhcp_information` and `server_identities`, and a `peers` array instead of `reservations`. WireGuard interface names (`wg0`, `wg1`, ...) are assigned automatically in ascending order of VLAN ID. - -```json -{ - "is_vpn": true, - "name": "vpn", - "subnet": "192.168.40.0", - "subnet_mask": "255.255.255.0", - "radius_default": false, - "use_blocklists": ["oisd-big"], - "server_identities": [ - { "description": "Router/Gateway", "ip": "192.168.40.1" } - ], - "vpn_information": { - "listen_port": 51820, - "server_endpoint": "vpn.example.com", - "domain": "local", - "explicit_overrides": { "gateway": "", "dns_servers": "", "mtu": "" } - }, - "peers": [], - "port_wrangling": [] -} -``` - -The gateway IP is derived from the `server_identities` entry with the lowest value in the last octet (same rule as non-WG VLANs). If `explicit_overrides.gateway` is set, it must match one of the `server_identities` IPs. - -### Banned IPs - -The top-level `banned_ips` array blocks inbound and outbound traffic to/from specific IPs or networks at the firewall level. This is useful for blocking known malicious hosts, entire ASNs, or geographic ranges. 
Entries support a flexible address syntax: - -```json -"banned_ips": [ - { "description": "Single IP", "enabled": true, "ip": "94.130.52.18" }, - { "description": "IPv4 /24 wildcard", "enabled": true, "ip": "94.130.52.*" }, - { "description": "IPv4 /16 wildcard", "enabled": true, "ip": "94.130.*.*" }, - { "description": "IPv4 CIDR", "enabled": true, "ip": "94.130.0.0/16" }, - { "description": "IPv4 range", "enabled": true, "ip": "94.130.52.1-20" }, - { "description": "IPv4 range+wildcard", "enabled": true, "ip": "94.130-133.52.*" }, - { "description": "Single IPv6", "enabled": true, "ip": "2a01:4f8:c17:b0f::2" }, - { "description": "IPv6 /48 wildcard", "enabled": true, "ip": "2a01:4f8:c17:*" }, - { "description": "IPv6 CIDR", "enabled": true, "ip": "2a01:4f8::/32" } -] -``` - -- `ip` - the address or range to block; supports single IPs, CIDR notation, wildcard octets (`*`), and numeric ranges within a quartet (e.g. `1-20`) -- `enabled` - set to `false` to disable without removing the entry -- Bans apply to both IPv4 and IPv6 traffic - -### Inter-VLAN Firewall - -All cross-VLAN traffic is blocked by default (nftables forward chain policy drop). 
To permit specific traffic between VLANs, add entries to the top-level `inter_vlan_exceptions` array: - -```json -{ - "description": "Kids -> Plex", - "enabled": true, - "protocol": "both", - "src_ip_or_subnet": "192.168.30.0/24", - "dst_ip_or_subnet": "192.168.1.20", - "dst_port": 32400 -} -``` - -- `src_ip_or_subnet` - single IP or CIDR subnet -- `dst_ip_or_subnet` - single IP or CIDR subnet -- `dst_port` - optional; omit to allow all ports to the destination -- `protocol` - `tcp`, `udp`, or `both` -- `enabled` - set to `false` to disable without removing - -### RADIUS / Dynamic VLAN Assignment - -When at least one reservation has `radius_client: true`, RADIUS is automatically enabled: - -- FreeRADIUS is configured to accept authentication requests from those devices (APs, switches) -- Every MAC reservation across all VLANs is mapped to its VLAN ID in the FreeRADIUS `users` file -- Unknown MACs are assigned to the `radius_default` VLAN -- The shared secret is stored in `.radius-secret` and generated on first `--apply` -- Port 1812 is restricted in nftables to accept connections only from `radius_client` IPs - -Point your AP/switch RADIUS configuration at `:1812` using the secret from `.radius-secret`. - -### mDNS Reflection - -mDNS (Multicast DNS) is the protocol devices use to advertise and discover services on a local network - it powers AirPrint (printer discovery), AirPlay, Chromecast, and similar zero-configuration protocols. mDNS uses the multicast address `224.0.0.251:5353`, which is intentionally scoped to a single subnet and does not cross VLAN boundaries on its own. - -**Single-VLAN networks:** mDNS works without any configuration - all devices share the same subnet and can hear each other's announcements directly. The `mdns_reflection` feature is unnecessary and should be left disabled or omitted entirely. - -**Multi-VLAN networks:** A device on the IoT VLAN (e.g. 
a network printer) advertising via mDNS is invisible to devices on the Kids or Trusted VLANs, because the multicast packets never leave the IoT subnet. The `mdns_reflection` feature solves this by running `avahi-daemon` as an mDNS proxy on the router, which has an interface on every VLAN. Avahi listens for mDNS announcements arriving on any of the designated reflection interfaces and re-broadcasts them on all the others, making services discoverable across VLANs without requiring any changes on the devices themselves. - -Configure mDNS reflection with the top-level `mdns_reflection` block in `config.json`: - -```json -"mdns_reflection": { - "enabled": true, - "reflect_vlans": ["iot", "guest", "kids"] -} -``` - -- `enabled` - set to `false` to disable entirely; avahi-daemon will be stopped and disabled on the next `--apply` -- `reflect_vlans` - list of VLAN names to participate in reflection; must contain at least two names; WireGuard VLANs are not supported - -**Important:** mDNS reflection makes services *discoverable* across VLANs, but the actual service traffic still requires appropriate `inter_vlan_exceptions` rules to pass through the firewall. For example, to print from the Kids VLAN to a printer on the IoT VLAN, you need both mDNS reflection (so the printer is discovered) and firewall exceptions for ports 9100/TCP and 631/TCP (so the print job can actually reach it). - -### 2. 
Edit DDNS Configuration (`ddns.json`) - -- Set `provider` to `noip`, `duckdns`, or `cloudflare` -- For No-IP: set `username`, `password`, and the `hostnames` array -- For DuckDNS: set `token` and the `subdomains` array -- For Cloudflare: set `api_token` and the relevant zone/record details -- Set `timer_interval` to how often the IP should be checked (default: `5m`) -- The `ip_check_services` list is used in rotation to detect your current public IP - the defaults can be left as-is - ---- - -## Initial Deployment - -```bash -sudo python3 install.py # Install required packages; optionally set up dashboard and HTTPS -sudo python3 core.py --apply # Apply VLANs, DHCP, DNS, firewall, RADIUS, mDNS, timers -sudo python3 dns-blocklists.py # Download and apply blocklists -``` - -Optional (if DDNS is desired): - -```bash -sudo python3 ddns.py --start # Run an immediate IP update and install the update timer -``` - -Optional (if WireGuard VPN is desired): - -1. Add a WireGuard VLAN to `config.json` with `is_vpn: true` (see configuration example above) -2. Run `sudo python3 core.py --apply` - this generates the server keypair, writes `/etc/wireguard/wg0.conf`, and brings the interface up -3. Add peers using `create_vpn_peer.py` (see below), then run `sudo python3 core.py --apply` again to sync them to the live interface - -```bash -python3 create_vpn_peer.py --name laptop --ip 192.168.40.2 -python3 create_vpn_peer.py --name laptop --ip 192.168.40.2 --iface wg0 -python3 create_vpn_peer.py --name phone --ip 192.168.40.3 --split-tunnel -python3 create_vpn_peer.py --name tablet --ip 192.168.40.4 --output ~/tablet.conf -``` - -The script reads the specified WireGuard VLAN from `config.json`, validates the IP against the VLAN subnet, generates a keypair, appends the peer to `config.json`, and writes the client `.conf` file. If the config has exactly one WireGuard VLAN, `--iface` is optional. Transfer the `.conf` to the peer device by secure means, then delete it from the server. 
- ---- - -## Usage Reference - -All scripts are designed to be run multiple times - re-running `--apply` replaces the previous configuration safely. - -### install.py - -``` -sudo python3 install.py -``` - -Interactive setup wizard. Detects the Linux package manager, installs required system packages, and optionally sets up the Routlin Dashboard (Docker container with SMTP configuration) and external HTTPS access via Caddy. Safe to re-run: skips already-installed packages and prompts before reconfiguring an existing dashboard. - -### core.py - -Commands that modify system state require `sudo`. Read-only commands do not. - -``` -sudo python3 core.py --apply # Apply full config: networkd, dnsmasq, nftables, RADIUS, mDNS, timers, boot service; runs health checks at end -sudo python3 core.py --apply --dry-run # Preview --apply actions without making changes -sudo python3 core.py --disable # Revert to network client (interactive wizard) -sudo python3 core.py --disable --dry-run # Preview --disable wizard without making changes -sudo python3 core.py --reset-leases # Stop dnsmasq, delete all lease files, restart (forces devices to re-acquire) -sudo python3 core.py --reset-leases VLAN # Reset leases for a specific VLAN only (e.g. trusted, iot, guest) - -python3 core.py --status # Service status, config checks, and log alerts for all managed components; writes .health -python3 core.py --view-configs # Active per-VLAN dnsmasq config files -python3 core.py --view-leases # Active DHCP leases across all VLANs with VLAN, type, and description -python3 core.py --view-rules # Active nftables ruleset -python3 core.py --view-metrics # Lifetime DNS metrics across all VLAN instances -``` - -### dns-blocklists.py - -``` -sudo python3 dns-blocklists.py -``` - -Downloads every blocklist referenced by at least one VLAN, merges them into per-combination conf files, then calls `core.py --apply` to reload dnsmasq instances. 
Run this after initial deployment and any time you add or change blocklist sources. The daily `systemd` timer installed by `core.py --apply` runs this automatically. - -### create_vpn_peer.py - -Does not require `sudo`. Requires `wireguard-tools` (`wg` must be on PATH) and a prior `core.py --apply` to generate the server keypair. - -``` -python3 create_vpn_peer.py --name NAME --ip IP [--iface IFACE] [--split-tunnel] [--output FILE] - - --name NAME Peer name (e.g. laptop) - --ip IP Peer IP within the VPN subnet (e.g. 192.168.40.2) - --iface IFACE WireGuard interface to add the peer to (e.g. wg0); optional if only one WireGuard VLAN exists - --split-tunnel Route only VPN subnet traffic through the tunnel (default: full tunnel) - --output FILE Output path for the client .conf file (default: vpn-client-.conf) -``` - -### ddns.py - -Only `--start` and `--disable` require `sudo` as they install/remove systemd timer files. All other commands run as a normal user. - -``` -sudo python3 ddns.py --start # Run update and install systemd timer -sudo python3 ddns.py --disable # Stop updates and remove systemd timer - -python3 ddns.py --update # Run one immediate DDNS update (used by timer) -python3 ddns.py --force # Force update regardless of cached IP -python3 ddns.py --status # Timer/service status -python3 ddns.py --getip # Print current public IP and exit -``` - ---- - -## Disabling / Uninstalling Components - -```bash -sudo python3 core.py --disable # Revert to network client (interactive wizard) -sudo python3 ddns.py --disable # Stop and remove DDNS timer -``` - -WireGuard interfaces are brought down automatically by `core.py --disable`. To stop a WireGuard interface independently: `sudo wg-quick down wg0`. 
diff --git a/routlin/core.py b/routlin/core.py index b7debb7..4ae6b32 100644 --- a/routlin/core.py +++ b/routlin/core.py @@ -100,7 +100,6 @@ import health as health import mod_avahi as avahi import mod_captive as captive import mod_dnsmasq as dnsmasq -import mod_metrics as metrics import mod_networkd as networkd import mod_nftables as nftables import mod_radius as radius @@ -765,7 +764,7 @@ def cmd_apply(data, dry_run=False): timer_files=[timers.HEALTH_TIMER_FILE], svc_files=[timers.HEALTH_TIMER_SVC_FILE], descriptions=["Router status health check"], - exec_starts=[f"/usr/bin/python3 {SCRIPT_DIR / 'health.py'}"], + exec_starts=[f"/usr/bin/python3 {SCRIPT_DIR / 'health.py'} --collect"], interval_secs=[timers.HEALTH_TIMER_INTERVAL_SEC], ) print() @@ -842,7 +841,6 @@ def main(): " sudo python3 core.py --view-leases Show active DHCP leases\n" " sudo python3 core.py --view-rules Show active nftables ruleset\n" " sudo python3 core.py --disable Stop instances, remove nftables, remove all config files\n" - " python3 core.py --view-metrics Show lifetime DNS metrics\n" "\n" " [--dry-run] may be combined with --apply or --disable\n" " to preview all actions verbosely without making any changes:\n" @@ -861,13 +859,11 @@ def main(): "Optionally specify a VLAN name to reset only that VLAN.") parser.add_argument("--view-rules", action="store_true", help="Show active nftables ruleset") parser.add_argument("--disable", action="store_true", help="Stop instances, remove nftables, remove all config files") - parser.add_argument("--view-metrics", action="store_true", help="Show lifetime DNS metrics across all instances") - args = parser.parse_args() if not any([args.apply, args.merge_blocklists, args.dry_run, args.status, args.view_configs, args.view_leases, - args.view_rules, args.disable, args.view_metrics, + args.view_rules, args.disable, args.reset_leases]): parser.print_help() sys.exit(0) @@ -908,12 +904,6 @@ def main(): nftables.show_rules() return - if args.view_metrics: - if 
not shared.is_root(): - die("This script must be run as root (sudo).") - metrics.show_metrics(data) - return - if args.disable: if not args.dry_run: if not shared.is_root(): diff --git a/routlin/create_vpn_peer.py b/routlin/create_vpn_peer.py index da1aa9c..7549e9f 100644 --- a/routlin/create_vpn_peer.py +++ b/routlin/create_vpn_peer.py @@ -88,7 +88,7 @@ def find_wg_vlan(data, iface=None, vlan_id=None): def server_pubkey(iface): - path = SCRIPT_DIR / f".wg-{iface}.pub" + path = SCRIPT_DIR / f".{iface}.pub" if not path.exists(): die( f"Server public key not found: {path}\n" diff --git a/routlin/health.py b/routlin/health.py index b053d42..20d8fe1 100644 --- a/routlin/health.py +++ b/routlin/health.py @@ -744,11 +744,20 @@ def print_table(status): # =================================================================== if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Routlin health checks") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--collect", action="store_true", help="Run checks and write .health (used by timer)") + group.add_argument("--view", action="store_true", help="Run checks, write .health, and print results") + args = parser.parse_args() + try: with open(CONFIG_FILE) as f: data = json.load(f) except Exception as ex: print(f"Error loading {CONFIG_FILE}: {ex}", file=sys.stderr) sys.exit(1) + _, status = run_and_write(data) - print_table(status) + if args.view: + print_table(status) diff --git a/routlin/maintenance.py b/routlin/maintenance.py index 1bf1e87..7a48e35 100644 --- a/routlin/maintenance.py +++ b/routlin/maintenance.py @@ -3,52 +3,32 @@ maintenance.py -- Periodic maintenance tasks run by the routlin-maintenance systemd timer. Tasks performed on each run: - 1. DDNS: fetch current public IP and update enabled provider(s) if changed. - 2. FreeRADIUS log rotation: truncate radius.log if it exceeds radius.general.log_max_kb. - -Reads config.json in the same directory. 
Designed to be invoked by core.py --apply -via the routlin-maintenance.timer systemd timer. - -IP check services are rotated each run using .ddns-last-service so -no single provider is spammed. If the selected service fails, the -script falls back through the remaining services in order. - -Per-provider cache files are named .ddns-last-ip-. -DDNS activity is logged to ddns.log in the same directory as this script. -DDNS log is cleared when it exceeds ddns.general.log_max_kb from config. + 1. DDNS update (delegates to ddns.py) + 2. FreeRADIUS log rotation + 3. ARP cache refresh + 4. DNS metrics collection (delegates to metrics.py) Usage: - python3 maintenance.py --update Run all tasks once (used by timer) - python3 maintenance.py --force Force DDNS update regardless of cached IP - python3 maintenance.py --getip Print current public IP and exit + python3 maintenance.py """ import ipaddress import json -import os -import subprocess import re -import urllib.request -import urllib.error +import subprocess import sys -import logging from pathlib import Path -import mod_metrics as metrics -import mod_dns_queries as dns_queries +SCRIPT_DIR = Path(__file__).parent +CONFIG_FILE = SCRIPT_DIR / "config.json" +DDNS_SCRIPT = SCRIPT_DIR / "ddns.py" +METRICS_SCRIPT = SCRIPT_DIR / "metrics.py" +RADIUS_LOG_FILE = Path("/var/log/freeradius/radius.log") +ARP_CACHE_FILE = Path("/var/lib/misc/arp-cache.json") -SCRIPT_DIR = Path(__file__).parent -CONFIG_FILE = SCRIPT_DIR / "config.json" -CACHE_SERVICE_FILE = SCRIPT_DIR / ".ddns-last-service" -LOG_FILE = SCRIPT_DIR / "ddns.log" -RADIUS_LOG_FILE = Path("/var/log/freeradius/radius.log") -ARP_CACHE_FILE = Path("/var/lib/misc/arp-cache.json") - -# log is assigned in setup_logging() after config is loaded -log = None # =================================================================== -# Load config +# Config # =================================================================== def load_config(): @@ -56,437 +36,15 @@ def load_config(): 
print(f"ERROR: Config file not found: {CONFIG_FILE}", file=sys.stderr) sys.exit(1) with open(CONFIG_FILE) as f: - full = json.load(f) - data = full.get("ddns", {}) - - # Validate general block - required_general = {"log_max_kb", "log_errors_only"} - missing = required_general - set(data.get("general", {}).keys()) - if missing: - print(f"ERROR: Missing keys in ddns.general block: {missing}", file=sys.stderr) - sys.exit(1) - services = data.get("ip_check_services", []) - if not services: - print("ERROR: ddns.general.ip_check_services is empty.", file=sys.stderr) - sys.exit(1) - for svc in services: - if not isinstance(svc, dict) or "type" not in svc: - print(f"ERROR: ip_check_services entry missing 'type': {svc}", file=sys.stderr) - sys.exit(1) - if svc["type"] == "http" and "url" not in svc: - print(f"ERROR: ip_check_services 'http' entry missing 'url': {svc}", file=sys.stderr) - sys.exit(1) - if svc["type"] == "dig" and "url" not in svc: - print(f"ERROR: ip_check_services 'dig' entry missing 'url': {svc}", file=sys.stderr) - sys.exit(1) - - # Validate providers block - if not data.get("providers"): - print("ERROR: No DDNS providers defined in config.", file=sys.stderr) - sys.exit(1) - for p in data["providers"]: - base_required = {"description", "provider", "enabled"} - missing = base_required - set(p.keys()) - if missing: - print(f"ERROR: Provider '{p.get('description', '?')}' missing keys: {missing}", file=sys.stderr) - sys.exit(1) - ptype = p.get("provider", "").lower() - if ptype == "noip": - extra = {"username", "password", "hostnames"} - elif ptype == "duckdns": - extra = {"api_token", "hostnames"} - elif ptype == "cloudflare": - extra = {"api_token", "hostnames"} - else: - print(f"ERROR: Provider '{p.get('description', '?')}' has unknown provider type: '{ptype}'", file=sys.stderr) - sys.exit(1) - missing = extra - set(p.keys()) - if missing: - print(f"ERROR: Provider '{p.get('description', '?')}' missing keys for {ptype}: {missing}", file=sys.stderr) - 
sys.exit(1) - - data['_radius'] = full.get("radius", {}) - return data - -# =================================================================== -# Helpers -# =================================================================== - -def chown_to_script_dir_owner(path): - """Chown a file to the owner of the script directory. - This works correctly whether invoked via sudo, directly as root (e.g. systemd timer), - or as a normal user - the script directory owner is always the right target. - """ - try: - stat = SCRIPT_DIR.stat() - os.chown(path, stat.st_uid, stat.st_gid) - except OSError: - pass # non-fatal - -# =================================================================== -# Logging -# =================================================================== - -def setup_logging(max_kb, errors_only): - """Clear log if oversized, then initialise logger. Must be called before log is used.""" - global log - max_bytes = int(max_kb * 1024) - try: - if LOG_FILE.exists() and LOG_FILE.stat().st_size > max_bytes: - LOG_FILE.write_text("") - if not LOG_FILE.exists(): - LOG_FILE.touch() - chown_to_script_dir_owner(LOG_FILE) - file_handler = logging.FileHandler(LOG_FILE) - except PermissionError: - print(f"WARNING: Cannot write to {LOG_FILE} (permission denied). 
" - f"Run with sudo or fix ownership: sudo chown $USER {LOG_FILE}") - file_handler = None - level = logging.ERROR if errors_only else logging.INFO - handlers = [logging.StreamHandler(sys.stdout)] - if file_handler: - handlers.insert(0, file_handler) - logging.basicConfig( - level=level, - format="%(asctime)s %(levelname)-8s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - handlers=handlers, - ) - log = logging.getLogger("ddns") - -# =================================================================== -# Per-provider IP cache -# =================================================================== - -def cache_file_for(description): - """Return the cache file path for a given provider description.""" - safe_name = description.replace(" ", "-") - return SCRIPT_DIR / f".ddns-last-ip-{safe_name}" - -def get_cached_ip(description): - f = cache_file_for(description) - if f.exists(): - return f.read_text().strip() - return None - -def save_cached_ip(description, ip): - f = cache_file_for(description) - f.write_text(ip) - chown_to_script_dir_owner(f) - -# =================================================================== -# Service rotation -# =================================================================== - -def get_next_service_index(total): - """Read last used index, increment, wrap around, return next index.""" - if CACHE_SERVICE_FILE.exists(): - try: - last = int(CACHE_SERVICE_FILE.read_text().strip()) - except ValueError: - last = -1 - else: - last = -1 - return (last + 1) % total - -def save_service_index(index): - CACHE_SERVICE_FILE.write_text(str(index)) - chown_to_script_dir_owner(CACHE_SERVICE_FILE) - -# =================================================================== -# Public IP detection -# =================================================================== - -def _extract_ip(body): - """Extract an IPv4 address from an HTTP response body. - Handles plain text, key=value (e.g. Cloudflare /cdn-cgi/trace), and HTML. 
- """ - for line in body.splitlines(): - if line.startswith("ip="): - candidate = line[3:].strip() - if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', candidate): - return candidate - plain = body.strip() - if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', plain): - return plain - match = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', body) - return match.group(1) if match else None - - -def _get_ip_via_http(spec): - """Fetch public IP from an HTTP endpoint. spec: {"type": "http", "url": "..."}""" - req = urllib.request.Request(spec["url"], headers={"User-Agent": "ddns-update/1.0"}) - with urllib.request.urlopen(req, timeout=10) as r: - return _extract_ip(r.read().decode().strip()) - - -_SAFE_DIG_RE = re.compile(r'^[a-zA-Z0-9.\-_@+:\s]+$') - -def _get_ip_via_dig(spec): - """Query public IP via dig. spec: {"type": "dig", "url": ""} - Requires the 'dig' utility to be installed. - """ - url = spec["url"] - if not _SAFE_DIG_RE.match(url): - log.warning(f"Skipping dig service with disallowed characters: {url!r}") - return None - cmd = ["dig", "+short"] + url.split() - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) - if result.returncode != 0: - return None - match = re.search(r'\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b', result.stdout) - if match: - return match.group(1) - except FileNotFoundError: - log.warning("'dig' not found; cannot use dig IP check service.") - except Exception: - pass - return None + return json.load(f) # =================================================================== - -def get_public_ip(services): - """ - Start at the next service in rotation. If it fails, fall through - the remaining services in order. Saves the index of the service - that succeeded so the next run starts with the following one. 
- """ - total = len(services) - start = get_next_service_index(total) - ordered = [services[(start + i) % total] for i in range(total)] - - for i, spec in enumerate(ordered): - stype = spec.get("type", "http") - label = spec.get("url", "?") - try: - if stype == "dig": - ip = _get_ip_via_dig(spec) - else: - ip = _get_ip_via_http(spec) - if ip: - save_service_index((start + i) % total) - log.info(f"Public IP retrieved from {label}: {ip}") - return ip - except Exception as ex: - log.warning(f"IP check failed for {label}: {ex}") - continue - - log.error("Could not determine public IP from any configured service.") - sys.exit(1) - -# =================================================================== -# No-IP update +# DDNS - delegate to ddns.py # =================================================================== -def update_noip(provider, ip): - """ - No-IP HTTP update API. - Docs: https://www.noip.com/integrate/request - Uses HTTP Basic Auth. Supports comma-separated list of hostnames. - """ - username = provider["username"] - password = provider["password"] - hostnames = ",".join(provider["hostnames"]) - - url = f"https://dynupdate.no-ip.com/nic/update?hostname={hostnames}&myip={ip}" - - password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm() - password_mgr.add_password(None, url, username, password) - handler = urllib.request.HTTPBasicAuthHandler(password_mgr) - opener = urllib.request.build_opener(handler) - - req = urllib.request.Request(url, headers={"User-Agent": "ddns-update/1.0"}) - - try: - with opener.open(req, timeout=10) as r: - return r.read().decode().strip() - except urllib.error.URLError as e: - log.error(f"Network error contacting No-IP: {e}") - return None - -def interpret_noip_response(response, hostnames, ip): - """ - No-IP response codes: - good -- update successful - nochg -- IP already set to this value (no change needed) - nohost -- hostname not found in account - badauth -- invalid credentials - badagent -- client blocked - !donator 
-- feature requires paid account - abuse -- account blocked for abuse - 911 -- server-side error, retry later - """ - if response is None: - return False - if response.startswith("good"): - log.info(f"No-IP updated successfully: {hostnames} -> {ip}") - return True - elif response.startswith("nochg"): - log.info(f"No-IP: no change needed ({hostnames} already set to {ip})") - return True - elif response == "nohost": - log.error(f"No-IP: hostname '{hostnames}' not found in account.") - elif response == "badauth": - log.error(f"No-IP: authentication failed for '{hostnames}'. Check username and password.") - elif response == "badagent": - log.error("No-IP: client blocked by No-IP.") - elif response == "!donator": - log.error("No-IP: this feature requires a paid account.") - elif response == "abuse": - log.error("No-IP: account blocked for abuse.") - elif response == "911": - log.error("No-IP: server error. Will retry on next run.") - else: - log.error(f"No-IP: unexpected response: {response}") - return False - - -# =================================================================== -# DuckDNS update -# =================================================================== - -def update_duckdns(provider, ip): - """ - DuckDNS HTTP update API. - Docs: https://www.duckdns.org/spec.jsp - Token-based, no username/password. Subdomains are the short name only - (e.g. "myhome", not "myhome.duckdns.org"). Supports multiple subdomains - as a comma-separated list. - Returns True on success, False on failure. 
- """ - token = provider["api_token"] - subdomains = ",".join(h.replace(".duckdns.org", "") for h in provider["hostnames"]) - description = provider["description"] - - url = f"https://www.duckdns.org/update?domains={subdomains}&token={token}&ip={ip}" - - try: - req = urllib.request.Request(url, headers={"User-Agent": "ddns-update/1.0"}) - with urllib.request.urlopen(req, timeout=10) as r: - response = r.read().decode().strip() - if response == "OK": - log.info(f"DuckDNS updated successfully: {subdomains} -> {ip}") - return True - else: - log.error(f"DuckDNS update failed for '{description}': response was '{response}'") - return False - except urllib.error.URLError as e: - log.error(f"Network error contacting DuckDNS: {e}") - return False - -# =================================================================== -# Cloudflare DNS update -# =================================================================== - -def _cf_api_get(url, headers): - req = urllib.request.Request(url, headers=headers) - try: - with urllib.request.urlopen(req, timeout=10) as r: - return json.loads(r.read().decode()) - except Exception as e: - log.error(f"Cloudflare API GET error ({url}): {e}") - return None - -def _cf_get_zone_id(zone_name, headers): - data = _cf_api_get( - f"https://api.cloudflare.com/client/v4/zones?name={zone_name}", headers - ) - if data and data.get("success") and data["result"]: - return data["result"][0]["id"] - return None - -def _cf_get_record_id(zone_id, hostname, headers): - data = _cf_api_get( - f"https://api.cloudflare.com/client/v4/zones/{zone_id}/dns_records?name={hostname}&type=A", - headers, - ) - if data and data.get("success") and data["result"]: - return data["result"][0]["id"] - return None - -def update_cloudflare(provider, ip): - """ - Cloudflare DNS update API. - Docs: https://developers.cloudflare.com/api/resources/dns/subresources/records/methods/edit/ - Bearer-token auth. Looks up zone and record IDs dynamically, then PATCHes each A record. 
- """ - token = provider["api_token"] - headers = { - "Authorization": f"Bearer {token}", - "Content-Type": "application/json", - "User-Agent": "ddns-update/1.0", - } - success = True - for hostname in provider["hostnames"]: - zone_name = ".".join(hostname.split(".")[-2:]) - zone_id = _cf_get_zone_id(zone_name, headers) - if not zone_id: - log.error(f"Cloudflare: zone '{zone_name}' not found in account.") - success = False - continue - record_id = _cf_get_record_id(zone_id, hostname, headers) - if not record_id: - log.error(f"Cloudflare: A record for '{hostname}' not found in zone '{zone_name}'.") - success = False - continue - url = f"https://api.cloudflare.com/client/v4/zones/{zone_id}/dns_records/{record_id}" - payload = json.dumps({"content": ip}).encode() - req = urllib.request.Request(url, data=payload, headers=headers, method="PATCH") - try: - with urllib.request.urlopen(req, timeout=10) as r: - data = json.loads(r.read().decode()) - if data.get("success"): - log.info(f"Cloudflare updated successfully: {hostname} -> {ip}") - else: - log.error(f"Cloudflare update failed for '{hostname}': {data.get('errors')}") - success = False - except Exception as e: - log.error(f"Cloudflare API PATCH error for '{hostname}': {e}") - success = False - return success - -# =================================================================== -# Process a single provider block -# =================================================================== - -def process_provider(provider, current_ip, force=False): - description = provider["description"] - - if not provider.get("enabled") is True: - log.info(f"Provider '{description}' is disabled, skipping.") - return - - cached_ip = get_cached_ip(description) - - if not force and current_ip == cached_ip: - log.info(f"[{description}] IP unchanged ({current_ip}), skipping update.") - return - - if force: - log.info(f"[{description}] Force update requested. 
Updating with {current_ip}...") - elif cached_ip: - log.info(f"[{description}] IP changed: {cached_ip} -> {current_ip}. Updating...") - else: - log.info(f"[{description}] No cached IP found. Updating with {current_ip}...") - - ptype = provider["provider"].lower() - - if ptype == "noip": - hostnames = ",".join(provider["hostnames"]) - response = update_noip(provider, current_ip) - success = interpret_noip_response(response, hostnames, current_ip) - elif ptype == "duckdns": - success = update_duckdns(provider, current_ip) - elif ptype == "cloudflare": - success = update_cloudflare(provider, current_ip) - else: - log.error(f"[{description}] Unknown provider type: '{ptype}'") - return - - if success: - save_cached_ip(description, current_ip) +def run_ddns(): + subprocess.run([sys.executable, str(DDNS_SCRIPT), "--update"]) # =================================================================== @@ -494,7 +52,6 @@ def process_provider(provider, current_ip, force=False): # =================================================================== def _clear_radius_log_dir(log_dir, reason): - """Delete all files in log_dir and print reason.""" try: files = [p for p in log_dir.iterdir() if p.is_file()] if not files: @@ -514,7 +71,6 @@ def _clear_radius_log_dir(log_dir, reason): def rotate_radius_log(radius_cfg): - """Clear the FreeRADIUS log dir if logging is disabled or total size exceeds log_max_kb.""" general = radius_cfg.get("general", {}) log_dir = RADIUS_LOG_FILE.parent if not log_dir.exists(): @@ -536,20 +92,15 @@ def rotate_radius_log(radius_cfg): # =================================================================== -# Main +# ARP cache # =================================================================== ARP_MAX_AGE_SECS = 4 * 3600 def refresh_arp_cache(cfg): - try: - with open(CONFIG_FILE) as f: - full_cfg = json.load(f) - except Exception: - full_cfg = {} vlan_networks = [] - for v in full_cfg.get('vlans', []): + for v in cfg.get('vlans', []): subnet = v.get('subnet') mask 
= v.get('subnet_mask') if subnet and mask: @@ -560,12 +111,12 @@ def refresh_arp_cache(cfg): try: result = subprocess.run(['ip', '-stats', 'neigh'], capture_output=True, text=True, timeout=5) - best = {} # mac -> (used_secs, entry_dict) + best = {} for line in result.stdout.splitlines(): parts = line.split() if 'lladdr' not in parts: continue - if ':' in parts[0]: # skip IPv6 + if ':' in parts[0]: continue try: addr = ipaddress.IPv4Address(parts[0]) @@ -592,87 +143,17 @@ def refresh_arp_cache(cfg): print(f"WARNING: Could not refresh ARP cache: {exc}") -def run_update(cfg, force=False, getip_only=False): - """Perform a single DDNS update pass. - If force=True, bypasses the cached IP check and always updates. - If getip_only=True, prints the detected public IP and returns without updating providers.""" - current_ip = get_public_ip(cfg["ip_check_services"]) - - if getip_only: - print(current_ip) - return - - enabled = [p for p in cfg["providers"] if p.get("enabled") is True] - - if not enabled: - log.error("No enabled providers found in config.") - sys.exit(1) - - for provider in enabled: - process_provider(provider, current_ip, force=force) - +# =================================================================== +# Main +# =================================================================== def main(): - import argparse - parser = argparse.ArgumentParser( - description="Routlin periodic maintenance (DDNS update + log rotation)", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "examples:\n" - " python3 maintenance.py --update Run all tasks once (used by timer)\n" - " python3 maintenance.py --force Force DDNS update regardless of cached IP\n" - " python3 maintenance.py --getip Print current public IP and exit\n" - ) - ) - parser.add_argument("--update", action="store_true", help="Run all tasks once (used by timer)") - parser.add_argument("--force", action="store_true", help="Force DDNS update regardless of cached IP") - 
parser.add_argument("--getip", action="store_true", help="Print current public IP and exit") - - args = parser.parse_args() - - if not any([args.update, args.force, args.getip]): - parser.print_help() - return - - if args.getip: - global log - log = logging.getLogger("ddns_quiet") - log.addHandler(logging.NullHandler()) - log.propagate = False - cfg = load_config() - run_update(cfg, getip_only=True) - return - - cfg = load_config() - general = cfg["general"] - setup_logging(general["log_max_kb"], general["log_errors_only"]) - - if args.update or args.force: - run_update(cfg, force=args.force) - - rotate_radius_log(cfg.get("_radius", {})) + run_ddns() + cfg = load_config() + rotate_radius_log(cfg.get("radius", {})) refresh_arp_cache(cfg) + subprocess.run([sys.executable, str(METRICS_SCRIPT), "--collect"]) - try: - with open(CONFIG_FILE) as f: - full_cfg = json.load(f) - new_metrics = metrics.collect_metrics(full_cfg) - if new_metrics: - metrics.update_metrics_db(new_metrics) - except Exception as e: - log.warning(f"DNS metrics collection failed: {e}") - - try: - with open(CONFIG_FILE) as f: - full_cfg = json.load(f) - inserted = dns_queries.collect(full_cfg) - if inserted: - log.info(f"DNS query collector: inserted {inserted} new rows.") - pruned = dns_queries.prune(full_cfg) - if pruned: - log.info(f"DNS query collector: pruned {pruned} old rows.") - except Exception as e: - log.warning(f"DNS query collection failed: {e}") if __name__ == "__main__": main() diff --git a/routlin/metrics.py b/routlin/metrics.py new file mode 100644 index 0000000..86769cc --- /dev/null +++ b/routlin/metrics.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 +""" +metrics.py -- DNS metrics collection and display. + +Collects DNS stats by sending SIGUSR1 to dnsmasq instances, parses output +from journalctl, and stores daily-aggregated totals in a SQLite database. +Also collects per-query DNS logs from journalctl and stores them for the +dashboard DNS Queries page. 
+ +Usage: + python3 metrics.py --collect Collect and store metrics (run by maintenance timer) + python3 metrics.py --view Display all-time metrics summary +""" + +import json +import os +import re +import signal +import sqlite3 +import subprocess +import sys +import time +from collections import defaultdict, deque +from datetime import date +from pathlib import Path + +import mod_shared as shared +import mod_validation as validation + +SCRIPT_DIR = Path(__file__).parent +CONFIG_FILE = SCRIPT_DIR / "config.json" +METRICS_DB_FILE = shared.SCRIPT_DIR / ".dns-metrics" +QUERIES_DB_FILE = shared.SCRIPT_DIR / ".dns-queries" + + +# =================================================================== +# Config +# =================================================================== + +def load_config(): + if not CONFIG_FILE.exists(): + print(f"ERROR: Config file not found: {CONFIG_FILE}", file=sys.stderr) + sys.exit(1) + with open(CONFIG_FILE) as f: + return json.load(f) + + +# =================================================================== +# Metrics database +# =================================================================== + +def open_metrics_db(): + con = sqlite3.connect(METRICS_DB_FILE, timeout=10) + con.execute('PRAGMA journal_mode=WAL') + con.executescript(''' + CREATE TABLE IF NOT EXISTS daily_totals ( + date TEXT PRIMARY KEY, + last_updated INTEGER, + queries_forwarded INTEGER NOT NULL DEFAULT 0, + queries_answered_locally INTEGER NOT NULL DEFAULT 0, + queries_authoritative INTEGER NOT NULL DEFAULT 0, + cache_reused INTEGER NOT NULL DEFAULT 0, + tcp_hwm INTEGER NOT NULL DEFAULT 0, + tcp_max_allowed INTEGER NOT NULL DEFAULT 0, + pool_memory_max INTEGER NOT NULL DEFAULT 0, + dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0, + dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0, + dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0 + ); + CREATE TABLE IF NOT EXISTS daily_servers ( + date TEXT NOT NULL, + address TEXT NOT NULL, + queries_sent INTEGER NOT NULL DEFAULT 0, + 
retried INTEGER NOT NULL DEFAULT 0, + failed INTEGER NOT NULL DEFAULT 0, + nxdomain INTEGER NOT NULL DEFAULT 0, + avg_latency_ms INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (date, address) + ); + ''') + con.commit() + return con + + +# =================================================================== +# Queries database +# =================================================================== + +QUERY_RE = re.compile(r'query\[(\w+)\] (\S+) from ([\d.]+)') +BLOCK_RE = re.compile(r'(\S+) is 0\.0\.0\.0$') +CACHED_RE = re.compile(r'cached (\S+) is ') +FWD_RE = re.compile(r'forwarded (\S+) to ') +REPLY_RE = re.compile(r'\breply (\S+) is ') +LOCAL_RE = re.compile(r'/\S+ (\S+) is ') + + +def open_queries_db(): + con = sqlite3.connect(QUERIES_DB_FILE, timeout=10) + con.execute('PRAGMA journal_mode=WAL') + con.executescript(''' + CREATE TABLE IF NOT EXISTS dns_queries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ts INTEGER NOT NULL, + domain TEXT NOT NULL, + qtype TEXT NOT NULL, + client_ip TEXT NOT NULL, + vlan TEXT NOT NULL, + blocked INTEGER NOT NULL DEFAULT 0 + ); + CREATE INDEX IF NOT EXISTS idx_dq_ts ON dns_queries(ts); + CREATE INDEX IF NOT EXISTS idx_dq_domain ON dns_queries(domain, blocked); + CREATE INDEX IF NOT EXISTS idx_dq_client ON dns_queries(client_ip); + CREATE TABLE IF NOT EXISTS collector_state ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ); + ''') + con.commit() + return con + + +def _get_cursor(con): + row = con.execute("SELECT value FROM collector_state WHERE key='cursor'").fetchone() + return row[0] if row else None + + +def _save_cursor(con, cursor_val): + con.execute( + "INSERT OR REPLACE INTO collector_state(key, value) VALUES ('cursor', ?)", + (cursor_val,) + ) + + +# =================================================================== +# Collect metrics +# =================================================================== + +def collect_metrics(data): + """ + Send SIGUSR1 to each running dnsmasq instance and parse stats from + journalctl. 
Returns a combined metrics dict, or None if unavailable. + """ + m = { + "queries_forwarded": 0, + "queries_answered_locally": 0, + "queries_authoritative": 0, + "cache_reused": 0, + "tcp_hwm": 0, + "tcp_max_allowed": 0, + "pool_memory_max": 0, + "dnssec_subqueries_hwm": 0, + "dnssec_crypto_hwm": 0, + "dnssec_sig_fails_hwm": 0, + "servers": [] + } + + t_signal = int(time.time()) + any_running = False + for vlan in data["vlans"]: + pid_file = shared.vlan_pid_file(vlan) + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, signal.SIGUSR1) + any_running = True + except Exception: + continue + + if not any_running: + print("No dnsmasq instances are running.") + return None + + time.sleep(2) + + server_map = {} + for vlan in data["vlans"]: + svc = shared.vlan_service_name(vlan, validation.derive_interface(vlan, data)) + result = subprocess.run( + ["journalctl", "-u", svc, f"--since=@{t_signal}", "--no-pager", "-o", "cat"], + capture_output=True, text=True + ) + for line in result.stdout.splitlines(): + r = re.search(r"cache size \d+, (\d+)/\d+ cache insertions re-used", line) + if r: + m["cache_reused"] += int(r.group(1)) + + r = re.search(r"queries forwarded (\d+), queries answered locally (\d+)", line) + if r: + m["queries_forwarded"] += int(r.group(1)) + m["queries_answered_locally"] += int(r.group(2)) + + r = re.search(r"queries for authoritative zones (\d+)", line) + if r: + m["queries_authoritative"] += int(r.group(1)) + + r = re.search(r"highest since last SIGUSR1 (\d+), max allowed (\d+)", line) + if r: + m["tcp_hwm"] = max(m["tcp_hwm"], int(r.group(1))) + m["tcp_max_allowed"] = max(m["tcp_max_allowed"], int(r.group(2))) + + r = re.search(r"pool memory in use \d+, max (\d+)", line) + if r: + m["pool_memory_max"] = max(m["pool_memory_max"], int(r.group(1))) + + r = re.search( + r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), " + r"nxdomain replies (\d+), avg\. 
latency (\d+)ms", + line + ) + if r: + addr = r.group(1) + if addr not in server_map: + server_map[addr] = { + "address": addr, "queries_sent": 0, "retried": 0, + "failed": 0, "nxdomain": 0, "avg_latency_ms": 0 + } + server_map[addr]["queries_sent"] += int(r.group(2)) + server_map[addr]["retried"] += int(r.group(3)) + server_map[addr]["failed"] += int(r.group(4)) + server_map[addr]["nxdomain"] += int(r.group(5)) + if int(r.group(6)) > 0: + server_map[addr]["avg_latency_ms"] = int(r.group(6)) + + m["servers"] = list(server_map.values()) + return m + + +def store_metrics(new_metrics): + today = date.today().isoformat() + con = open_metrics_db() + + con.execute(''' + INSERT INTO daily_totals( + date, last_updated, + queries_forwarded, queries_answered_locally, queries_authoritative, + cache_reused, tcp_hwm, tcp_max_allowed, pool_memory_max, + dnssec_subqueries_hwm, dnssec_crypto_hwm, dnssec_sig_fails_hwm + ) VALUES (?,strftime('%s','now'),?,?,?,?,?,?,?,?,?,?) + ON CONFLICT(date) DO UPDATE SET + last_updated = strftime('%s','now'), + queries_forwarded = queries_forwarded + excluded.queries_forwarded, + queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally, + queries_authoritative = queries_authoritative + excluded.queries_authoritative, + cache_reused = cache_reused + excluded.cache_reused, + tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm), + tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0 + THEN excluded.tcp_max_allowed ELSE tcp_max_allowed END, + pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max), + dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm), + dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm), + dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm) + ''', ( + today, + new_metrics["queries_forwarded"], + new_metrics["queries_answered_locally"], + new_metrics["queries_authoritative"], + new_metrics["cache_reused"], + 
new_metrics["tcp_hwm"], + new_metrics["tcp_max_allowed"], + new_metrics["pool_memory_max"], + new_metrics["dnssec_subqueries_hwm"], + new_metrics["dnssec_crypto_hwm"], + new_metrics["dnssec_sig_fails_hwm"], + )) + + for srv in new_metrics["servers"]: + con.execute(''' + INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms) + VALUES (?,?,?,?,?,?,?) + ON CONFLICT(date, address) DO UPDATE SET + queries_sent = queries_sent + excluded.queries_sent, + retried = retried + excluded.retried, + failed = failed + excluded.failed, + nxdomain = nxdomain + excluded.nxdomain, + avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0 + THEN excluded.avg_latency_ms + ELSE avg_latency_ms END + ''', ( + today, srv["address"], + srv["queries_sent"], srv["retried"], srv["failed"], + srv["nxdomain"], srv["avg_latency_ms"], + )) + + con.commit() + shared.chown_to_script_dir_owner(METRICS_DB_FILE) + con.close() + + +# =================================================================== +# Collect DNS queries +# =================================================================== + +def collect_queries(data): + """ + Fetch new dnsmasq query log entries from journalctl since the last + stored cursor, parse query/result pairs, and insert into dns_queries. + Returns the number of rows inserted. 
+ """ + unit_to_vlan = {} + for vlan in data.get('vlans', []): + if not vlan.get('dnsmasq_log_queries_days', 0): + continue + iface = validation.derive_interface(vlan, data) + svc = shared.vlan_service_name(vlan, iface) + unit_to_vlan[svc] = vlan['name'] + unit_to_vlan[svc + '.service'] = vlan['name'] + + if not unit_to_vlan: + return 0 + + con = open_queries_db() + journal_cursor = _get_cursor(con) + + cmd = ['journalctl', '-u', 'dnsmasq-routlin-*', '--no-pager', '-o', 'json'] + if journal_cursor: + cmd += ['--after-cursor', journal_cursor] + + result = subprocess.run(cmd, capture_output=True, text=True) + + pending = defaultdict(deque) + rows = [] + last_cursor = journal_cursor + + for line in result.stdout.splitlines(): + try: + entry = json.loads(line) + except Exception: + continue + + msg = entry.get('MESSAGE', '') + if not isinstance(msg, str): + continue + + raw_unit = entry.get('_SYSTEMD_UNIT', '') + vlan_name = unit_to_vlan.get(raw_unit) or unit_to_vlan.get(raw_unit.removesuffix('.service')) + jcursor = entry.get('__CURSOR', '') + ts = int(entry.get('__REALTIME_TIMESTAMP', 0)) // 1_000_000 + + if vlan_name: + qm = QUERY_RE.search(msg) + if qm: + pending[qm.group(2)].append({ + 'ts': ts, 'qtype': qm.group(1), + 'client_ip': qm.group(3), 'vlan': vlan_name, + }) + else: + domain = None + blocked = 0 + bm = BLOCK_RE.search(msg) + if bm: + domain = bm.group(1) + blocked = 1 + else: + for pat in (CACHED_RE, FWD_RE, REPLY_RE, LOCAL_RE): + pm = pat.search(msg) + if pm: + domain = pm.group(1) + break + + if domain and pending.get(domain): + p = pending[domain].popleft() + if not pending[domain]: + del pending[domain] + rows.append((p['ts'], domain, p['qtype'], p['client_ip'], p['vlan'], blocked)) + + if jcursor: + last_cursor = jcursor + + for domain, q in pending.items(): + for p in q: + rows.append((p['ts'], domain, p['qtype'], p['client_ip'], p['vlan'], 0)) + + if rows: + con.executemany( + 'INSERT INTO dns_queries(ts, domain, qtype, client_ip, vlan, blocked)' 
+ ' VALUES(?,?,?,?,?,?)', + rows + ) + + if last_cursor and last_cursor != journal_cursor: + _save_cursor(con, last_cursor) + + con.commit() + shared.chown_to_script_dir_owner(QUERIES_DB_FILE) + con.close() + return len(rows) + + +def prune_queries(data): + """ + Delete dns_queries rows older than the retention period configured per VLAN. + Returns the number of rows deleted. + """ + days = max( + (v.get('dnsmasq_log_queries_days', 0) for v in data.get('vlans', [])), + default=0 + ) + if not days or not QUERIES_DB_FILE.exists(): + return 0 + cutoff = int(time.time()) - days * 86400 + con = open_queries_db() + cur = con.execute('DELETE FROM dns_queries WHERE ts < ?', (cutoff,)) + deleted = cur.rowcount + con.commit() + con.close() + return deleted + + +# =================================================================== +# Display +# =================================================================== + +def show_metrics(data): + new = collect_metrics(data) + if new is None: + return + store_metrics(new) + + con = open_metrics_db() + row = con.execute(''' + SELECT + MIN(date), MAX(date), COUNT(*), + SUM(queries_forwarded), SUM(queries_answered_locally), + SUM(queries_authoritative), SUM(cache_reused), + MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max) + FROM daily_totals + ''').fetchone() + servers = con.execute(''' + SELECT + ds.address, + SUM(ds.queries_sent), + SUM(ds.retried), + SUM(ds.failed), + SUM(ds.nxdomain), + (SELECT avg_latency_ms FROM daily_servers d2 + WHERE d2.address = ds.address AND d2.avg_latency_ms > 0 + ORDER BY d2.date DESC LIMIT 1) + FROM daily_servers ds + GROUP BY ds.address + ORDER BY SUM(ds.queries_sent) DESC + ''').fetchall() + con.close() + + first, last, days, fwd, local, auth, reused, tcp_hwm, tcp_max, pool = row + + print("DNS Metrics (all-time totals across all VLAN instances)") + print(f" First recorded : {first or '-'}") + print(f" Last recorded : {last or '-'}") + print(f" Days tracked : {days or 0}") + print() + 
print("Queries") + print(f" Forwarded to upstream : {(fwd or 0):,}") + print(f" Answered from cache : {(local or 0):,}") + print(f" Authoritative : {(auth or 0):,}") + print(f" Cache reused : {(reused or 0):,}") + print() + print("TCP") + print(f" Peak concurrent (HWM) : {tcp_hwm or 0}") + print(f" Max allowed : {tcp_max or 0}") + print() + print(f"Pool memory peak : {pool or 0} bytes") + if servers: + print() + print("Upstream servers (all-time)") + for addr, sent, retried, failed, nxdomain, latency in servers: + print(f" {addr}") + print(f" Sent : {(sent or 0):,}") + print(f" Retried : {(retried or 0):,}") + print(f" Failed : {(failed or 0):,}") + print(f" NXDOMAIN : {(nxdomain or 0):,}") + print(f" Latency : {latency}ms (last recorded)" if latency else " Latency : -") + + +# =================================================================== +# Main +# =================================================================== + +def main(): + import argparse + parser = argparse.ArgumentParser( + description="DNS metrics collection and display", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "examples:\n" + " python3 metrics.py --collect Collect and store metrics (run by maintenance timer)\n" + " python3 metrics.py --view Display all-time metrics summary\n" + ) + ) + parser.add_argument("--collect", action="store_true", help="Collect and store metrics") + parser.add_argument("--view", action="store_true", help="Display all-time metrics summary") + + args = parser.parse_args() + + if not any([args.collect, args.view]): + parser.print_help() + return + + data = load_config() + + if args.view: + show_metrics(data) + return + + new = collect_metrics(data) + if new: + store_metrics(new) + + inserted = collect_queries(data) + if inserted: + print(f"DNS query collector: inserted {inserted} new rows.") + + pruned = prune_queries(data) + if pruned: + print(f"DNS query collector: pruned {pruned} old rows.") + + +if __name__ == "__main__": + main() diff --git 
a/routlin/mod_dns_queries.py b/routlin/mod_dns_queries.py deleted file mode 100644 index 99ac853..0000000 --- a/routlin/mod_dns_queries.py +++ /dev/null @@ -1,205 +0,0 @@ -""" -mod_dns_queries.py -- DNS query log collector. - -Reads dnsmasq query logs from journalctl using a cursor bookmark, -parses query/result line pairs, and appends rows to a SQLite database. - -Called by: - - maintenance.py on each timer tick - - routlin-dash overview page on each page load (background thread) - -Only VLANs with dnsmasq_log_queries_days > 0 are collected. -""" - -import json -import re -import sqlite3 -import subprocess -from collections import defaultdict, deque -from pathlib import Path - -import mod_shared as shared -import mod_validation as validation - -DB_FILE = shared.SCRIPT_DIR / ".dns-queries" - -QUERY_RE = re.compile(r'query\[(\w+)\] (\S+) from ([\d.]+)') -BLOCK_RE = re.compile(r'(\S+) is 0\.0\.0\.0$') -CACHED_RE = re.compile(r'cached (\S+) is ') -FWD_RE = re.compile(r'forwarded (\S+) to ') -REPLY_RE = re.compile(r'\breply (\S+) is ') -LOCAL_RE = re.compile(r'/\S+ (\S+) is ') - - -# =================================================================== -# Database -# =================================================================== - -def open_db(): - con = sqlite3.connect(DB_FILE, timeout=10) - con.execute('PRAGMA journal_mode=WAL') - con.executescript(''' - CREATE TABLE IF NOT EXISTS dns_queries ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - ts INTEGER NOT NULL, - domain TEXT NOT NULL, - qtype TEXT NOT NULL, - client_ip TEXT NOT NULL, - vlan TEXT NOT NULL, - blocked INTEGER NOT NULL DEFAULT 0 - ); - CREATE INDEX IF NOT EXISTS idx_dq_ts ON dns_queries(ts); - CREATE INDEX IF NOT EXISTS idx_dq_domain ON dns_queries(domain, blocked); - CREATE INDEX IF NOT EXISTS idx_dq_client ON dns_queries(client_ip); - CREATE TABLE IF NOT EXISTS collector_state ( - key TEXT PRIMARY KEY, - value TEXT NOT NULL - ); - ''') - con.commit() - return con - - -def _get_cursor(con): - row = 
con.execute("SELECT value FROM collector_state WHERE key='cursor'").fetchone() - return row[0] if row else None - - -def _save_cursor(con, cursor_val): - con.execute( - "INSERT OR REPLACE INTO collector_state(key, value) VALUES ('cursor', ?)", - (cursor_val,) - ) - - -# =================================================================== -# Collection -# =================================================================== - -def collect(data): - """ - Fetch new dnsmasq log entries from journalctl since the last stored - cursor, parse query/result pairs, and insert into dns_queries. - Returns the number of rows inserted. - """ - unit_to_vlan = {} - for vlan in data.get('vlans', []): - if not vlan.get('dnsmasq_log_queries_days', 0): - continue - iface = validation.derive_interface(vlan, data) - svc = shared.vlan_service_name(vlan, iface) - unit_to_vlan[svc] = vlan['name'] - unit_to_vlan[svc + '.service'] = vlan['name'] - - if not unit_to_vlan: - return 0 - - con = open_db() - journal_cursor = _get_cursor(con) - - cmd = ['journalctl', '-u', 'dnsmasq-routlin-*', '--no-pager', '-o', 'json'] - if journal_cursor: - cmd += ['--after-cursor', journal_cursor] - - result = subprocess.run(cmd, capture_output=True, text=True) - - # pending[domain] = deque of {ts, qtype, client_ip, vlan} - # FIFO so concurrent same-domain queries from different clients pair correctly. 
- pending = defaultdict(deque) - rows = [] - last_cursor = journal_cursor - - for line in result.stdout.splitlines(): - try: - entry = json.loads(line) - except Exception: - continue - - msg = entry.get('MESSAGE', '') - if not isinstance(msg, str): - continue - - raw_unit = entry.get('_SYSTEMD_UNIT', '') - vlan_name = unit_to_vlan.get(raw_unit) or unit_to_vlan.get(raw_unit.removesuffix('.service')) - jcursor = entry.get('__CURSOR', '') - ts = int(entry.get('__REALTIME_TIMESTAMP', 0)) // 1_000_000 - - if vlan_name: - m = QUERY_RE.search(msg) - if m: - # Incoming query line -- push to pending, wait for result line - pending[m.group(2)].append({ - 'ts': ts, 'qtype': m.group(1), - 'client_ip': m.group(3), 'vlan': vlan_name, - }) - else: - # Result line -- identify domain and whether it was blocked - domain = None - blocked = 0 - - bm = BLOCK_RE.search(msg) - if bm: - domain = bm.group(1) - blocked = 1 - else: - for pat in (CACHED_RE, FWD_RE, REPLY_RE, LOCAL_RE): - pm = pat.search(msg) - if pm: - domain = pm.group(1) - break - - if domain and pending.get(domain): - p = pending[domain].popleft() - if not pending[domain]: - del pending[domain] - rows.append((p['ts'], domain, p['qtype'], p['client_ip'], p['vlan'], blocked)) - - if jcursor: - last_cursor = jcursor - - # Flush any pending entries that never received a result line. - # This can happen when the collector runs mid-transaction. We - # record them as not-blocked since if they were blocked dnsmasq - # would have answered synchronously and the result line would be - # in the same journal batch. 
- for domain, q in pending.items(): - for p in q: - rows.append((p['ts'], domain, p['qtype'], p['client_ip'], p['vlan'], 0)) - - if rows: - con.executemany( - 'INSERT INTO dns_queries(ts, domain, qtype, client_ip, vlan, blocked)' - ' VALUES(?,?,?,?,?,?)', - rows - ) - - if last_cursor and last_cursor != journal_cursor: - _save_cursor(con, last_cursor) - - con.commit() - shared.chown_to_script_dir_owner(DB_FILE) - con.close() - return len(rows) - - -def prune(data): - """ - Delete dns_queries rows older than the retention period configured per VLAN. - Uses the maximum retention days across all logging-enabled VLANs. - Returns the number of rows deleted. - """ - days = max( - (v.get('dnsmasq_log_queries_days', 0) for v in data.get('vlans', [])), - default=0 - ) - if not days: - return 0 - if not DB_FILE.exists(): - return 0 - cutoff = int(__import__('time').time()) - days * 86400 - con = open_db() - cur = con.execute('DELETE FROM dns_queries WHERE ts < ?', (cutoff,)) - deleted = cur.rowcount - con.commit() - con.close() - return deleted diff --git a/routlin/mod_metrics.py b/routlin/mod_metrics.py deleted file mode 100644 index e166787..0000000 --- a/routlin/mod_metrics.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -mod_metrics.py -- DNS metrics collection and display. - -Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl, -and stores daily-aggregated totals in a SQLite database (.dns-metrics2). - -Each maintenance tick upserts into today's row, accumulating additive -counters and taking MAX for high-water marks. All-time totals are -derived with SUM/MAX across rows at read time. 
-""" - -import os -import re -import signal -import sqlite3 -import subprocess -import time -from datetime import date - -import mod_shared as shared -import mod_validation as validation - -DB_FILE = shared.SCRIPT_DIR / ".dns-metrics" - - -# =================================================================== -# Database -# =================================================================== - -def open_db(): - con = sqlite3.connect(DB_FILE, timeout=10) - con.execute('PRAGMA journal_mode=WAL') - con.executescript(''' - CREATE TABLE IF NOT EXISTS daily_totals ( - date TEXT PRIMARY KEY, - last_updated INTEGER, - queries_forwarded INTEGER NOT NULL DEFAULT 0, - queries_answered_locally INTEGER NOT NULL DEFAULT 0, - queries_authoritative INTEGER NOT NULL DEFAULT 0, - cache_reused INTEGER NOT NULL DEFAULT 0, - tcp_hwm INTEGER NOT NULL DEFAULT 0, - tcp_max_allowed INTEGER NOT NULL DEFAULT 0, - pool_memory_max INTEGER NOT NULL DEFAULT 0, - dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0, - dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0, - dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0 - ); - CREATE TABLE IF NOT EXISTS daily_servers ( - date TEXT NOT NULL, - address TEXT NOT NULL, - queries_sent INTEGER NOT NULL DEFAULT 0, - retried INTEGER NOT NULL DEFAULT 0, - failed INTEGER NOT NULL DEFAULT 0, - nxdomain INTEGER NOT NULL DEFAULT 0, - avg_latency_ms INTEGER NOT NULL DEFAULT 0, - PRIMARY KEY (date, address) - ); - ''') - con.commit() - return con - - -# =================================================================== -# Collect -# =================================================================== - -def collect_metrics(data): - """ - Send SIGUSR1 to each running dnsmasq instance and parse stats from - journalctl. Returns a combined metrics dict, or None if unavailable. 
- """ - metrics = { - "queries_forwarded": 0, - "queries_answered_locally": 0, - "queries_authoritative": 0, - "cache_reused": 0, - "tcp_hwm": 0, - "tcp_max_allowed": 0, - "pool_memory_max": 0, - "dnssec_subqueries_hwm": 0, - "dnssec_crypto_hwm": 0, - "dnssec_sig_fails_hwm": 0, - "servers": [] - } - - t_signal = int(time.time()) - any_running = False - for vlan in data["vlans"]: - pid_file = shared.vlan_pid_file(vlan) - try: - pid = int(pid_file.read_text().strip()) - os.kill(pid, signal.SIGUSR1) - any_running = True - except Exception: - continue - - if not any_running: - print("No dnsmasq instances are running.") - return None - - time.sleep(2) - - server_map = {} - for vlan in data["vlans"]: - svc = shared.vlan_service_name(vlan, validation.derive_interface(vlan, data)) - result = subprocess.run( - ["journalctl", "-u", svc, f"--since=@{t_signal}", - "--no-pager", "-o", "cat"], - capture_output=True, text=True - ) - for line in result.stdout.splitlines(): - m = re.search(r"cache size \d+, (\d+)/\d+ cache insertions re-used", line) - if m: - metrics["cache_reused"] += int(m.group(1)) - - m = re.search(r"queries forwarded (\d+), queries answered locally (\d+)", line) - if m: - metrics["queries_forwarded"] += int(m.group(1)) - metrics["queries_answered_locally"] += int(m.group(2)) - - m = re.search(r"queries for authoritative zones (\d+)", line) - if m: - metrics["queries_authoritative"] += int(m.group(1)) - - m = re.search(r"highest since last SIGUSR1 (\d+), max allowed (\d+)", line) - if m: - metrics["tcp_hwm"] = max(metrics["tcp_hwm"], int(m.group(1))) - metrics["tcp_max_allowed"] = max(metrics["tcp_max_allowed"], int(m.group(2))) - - m = re.search(r"pool memory in use \d+, max (\d+)", line) - if m: - metrics["pool_memory_max"] = max(metrics["pool_memory_max"], int(m.group(1))) - - m = re.search( - r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), " - r"nxdomain replies (\d+), avg\. 
latency (\d+)ms", - line - ) - if m: - addr = m.group(1) - if addr not in server_map: - server_map[addr] = { - "address": addr, "queries_sent": 0, "retried": 0, - "failed": 0, "nxdomain": 0, "avg_latency_ms": 0 - } - server_map[addr]["queries_sent"] += int(m.group(2)) - server_map[addr]["retried"] += int(m.group(3)) - server_map[addr]["failed"] += int(m.group(4)) - server_map[addr]["nxdomain"] += int(m.group(5)) - if int(m.group(6)) > 0: - server_map[addr]["avg_latency_ms"] = int(m.group(6)) - - metrics["servers"] = list(server_map.values()) - return metrics - - -# =================================================================== -# Store -# =================================================================== - -def update_metrics_db(new_metrics): - today = date.today().isoformat() - con = open_db() - - con.execute(''' - INSERT INTO daily_totals( - date, last_updated, - queries_forwarded, queries_answered_locally, queries_authoritative, - cache_reused, tcp_hwm, tcp_max_allowed, pool_memory_max, - dnssec_subqueries_hwm, dnssec_crypto_hwm, dnssec_sig_fails_hwm - ) VALUES (?,strftime('%s','now'),?,?,?,?,?,?,?,?,?,?) 
- ON CONFLICT(date) DO UPDATE SET - last_updated = strftime('%s','now'), - queries_forwarded = queries_forwarded + excluded.queries_forwarded, - queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally, - queries_authoritative = queries_authoritative + excluded.queries_authoritative, - cache_reused = cache_reused + excluded.cache_reused, - tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm), - tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0 - THEN excluded.tcp_max_allowed ELSE tcp_max_allowed END, - pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max), - dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm), - dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm), - dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm) - ''', ( - today, - new_metrics["queries_forwarded"], - new_metrics["queries_answered_locally"], - new_metrics["queries_authoritative"], - new_metrics["cache_reused"], - new_metrics["tcp_hwm"], - new_metrics["tcp_max_allowed"], - new_metrics["pool_memory_max"], - new_metrics["dnssec_subqueries_hwm"], - new_metrics["dnssec_crypto_hwm"], - new_metrics["dnssec_sig_fails_hwm"], - )) - - for srv in new_metrics["servers"]: - con.execute(''' - INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms) - VALUES (?,?,?,?,?,?,?) 
- ON CONFLICT(date, address) DO UPDATE SET - queries_sent = queries_sent + excluded.queries_sent, - retried = retried + excluded.retried, - failed = failed + excluded.failed, - nxdomain = nxdomain + excluded.nxdomain, - avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0 - THEN excluded.avg_latency_ms - ELSE avg_latency_ms END - ''', ( - today, srv["address"], - srv["queries_sent"], srv["retried"], srv["failed"], - srv["nxdomain"], srv["avg_latency_ms"], - )) - - con.commit() - shared.chown_to_script_dir_owner(DB_FILE) - con.close() - - -# =================================================================== -# Display -# =================================================================== - -def show_metrics(data): - new = collect_metrics(data) - if new is None: - return - update_metrics_db(new) - - con = open_db() - row = con.execute(''' - SELECT - MIN(date), MAX(date), COUNT(*), - SUM(queries_forwarded), SUM(queries_answered_locally), - SUM(queries_authoritative), SUM(cache_reused), - MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max) - FROM daily_totals - ''').fetchone() - servers = con.execute(''' - SELECT - ds.address, - SUM(ds.queries_sent), - SUM(ds.retried), - SUM(ds.failed), - SUM(ds.nxdomain), - (SELECT avg_latency_ms FROM daily_servers d2 - WHERE d2.address = ds.address AND d2.avg_latency_ms > 0 - ORDER BY d2.date DESC LIMIT 1) - FROM daily_servers ds - GROUP BY ds.address - ORDER BY SUM(ds.queries_sent) DESC - ''').fetchall() - con.close() - - first, last, days, fwd, local, auth, reused, tcp_hwm, tcp_max, pool = row - - print("DNS Metrics (all-time totals across all VLAN instances)") - print(f" First recorded : {first or '-'}") - print(f" Last recorded : {last or '-'}") - print(f" Days tracked : {days or 0}") - print() - print("Queries") - print(f" Forwarded to upstream : {(fwd or 0):,}") - print(f" Answered from cache : {(local or 0):,}") - print(f" Authoritative : {(auth or 0):,}") - print(f" Cache reused : {(reused or 0):,}") - print() - 
print("TCP") - print(f" Peak concurrent (HWM) : {tcp_hwm or 0}") - print(f" Max allowed : {tcp_max or 0}") - print() - print(f"Pool memory peak : {pool or 0} bytes") - if servers: - print() - print("Upstream servers (all-time)") - for addr, sent, retried, failed, nxdomain, latency in servers: - print(f" {addr}") - print(f" Sent : {(sent or 0):,}") - print(f" Retried : {(retried or 0):,}") - print(f" Failed : {(failed or 0):,}") - print(f" NXDOMAIN : {(nxdomain or 0):,}") - print(f" Latency : {latency}ms (last recorded)" if latency else " Latency : -") diff --git a/routlin/mod_timers.py b/routlin/mod_timers.py index 423550f..800d104 100644 --- a/routlin/mod_timers.py +++ b/routlin/mod_timers.py @@ -192,7 +192,7 @@ def install_maint_timer(data): "", "[Service]", "Type=oneshot", - f"ExecStart=/usr/bin/python3 {script_path} --update", + f"ExecStart=/usr/bin/python3 {script_path}", "", ]) timer_content = "\n".join([