From 181240eae774c0c6a9687ed5af67868f98c17fe4 Mon Sep 17 00:00:00 2001 From: Kevin Veen-Birkenbach Date: Tue, 12 May 2026 17:03:59 +0200 Subject: [PATCH] Added hal Python CLI Wraps the rescue/chroot/diagnose/fix workflows in a single tool with LUKS-passphrase keyring caching. Subcommands: status, connect rescue, connect chroot, diagnose, fix-boot, fix-network, downgrade-kernel, downgrade-initramfs, reinstall-grub, use-static-ip, upgrade-system, forget-passphrase. connect subcommands accept an optional remote command after the host for non-interactive execution. README updated to reference hal instead of the previous shell scripts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/settings.json | 29 ++ .codex | 0 .gitignore | 39 +++ Makefile | 39 +++ README.md | 48 ++- pyproject.toml | 29 ++ src/hetzner_arch_luks/__init__.py | 1 + src/hetzner_arch_luks/__main__.py | 4 + src/hetzner_arch_luks/cli.py | 212 ++++++++++++ src/hetzner_arch_luks/probe.py | 55 ++++ src/hetzner_arch_luks/remote.py | 303 ++++++++++++++++++ .../resources/diagnose/inside.sh | 155 +++++++++ src/hetzner_arch_luks/resources/fix/boot.sh | 55 ++++ src/hetzner_arch_luks/resources/fix/grub.sh | 92 ++++++ .../resources/fix/initramfs.sh | 119 +++++++ src/hetzner_arch_luks/resources/fix/kernel.sh | 110 +++++++ .../resources/fix/network.sh | 69 ++++ .../resources/fix/static_ip.sh | 124 +++++++ .../resources/maintain/upgrade.sh | 95 ++++++ src/hetzner_arch_luks/ssh.py | 145 +++++++++ 20 files changed, 1718 insertions(+), 5 deletions(-) create mode 100644 .claude/settings.json create mode 100644 .codex create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 pyproject.toml create mode 100644 src/hetzner_arch_luks/__init__.py create mode 100644 src/hetzner_arch_luks/__main__.py create mode 100644 src/hetzner_arch_luks/cli.py create mode 100644 src/hetzner_arch_luks/probe.py create mode 100644 src/hetzner_arch_luks/remote.py create mode 100644 
src/hetzner_arch_luks/resources/diagnose/inside.sh create mode 100644 src/hetzner_arch_luks/resources/fix/boot.sh create mode 100644 src/hetzner_arch_luks/resources/fix/grub.sh create mode 100644 src/hetzner_arch_luks/resources/fix/initramfs.sh create mode 100644 src/hetzner_arch_luks/resources/fix/kernel.sh create mode 100644 src/hetzner_arch_luks/resources/fix/network.sh create mode 100644 src/hetzner_arch_luks/resources/fix/static_ip.sh create mode 100644 src/hetzner_arch_luks/resources/maintain/upgrade.sh create mode 100644 src/hetzner_arch_luks/ssh.py diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..6d2e022 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,29 @@ +{ + "permissions": { + "allow": [ + "Edit", + "Write", + "Bash(*)", + "WebFetch(domain:pypi.org)", + "WebFetch(domain:files.pythonhosted.org)", + "Bash(python3 -c ' *)", + "WebFetch(domain:api.github.com)" + ], + "ask": [ + "Bash(*hal *)", + "Bash(*hetzner_arch_luks *)", + "Bash(ssh *)", + "Bash(scp *)", + "Bash(sftp *)" + ] + }, + "sandbox": { + "enabled": true, + "autoAllowBashIfSandboxed": true, + "network": { + "allowedDomains": [ + "*" + ] + } + } +} diff --git a/.codex b/.codex new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a7629fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +# Python build / runtime artifacts +__pycache__/ +*.py[cod] +*$py.class +*.egg-info/ +*.egg +.eggs/ +build/ +dist/ +wheels/ +pip-wheel-metadata/ + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# Tooling caches +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.tox/ +.coverage +.coverage.* +htmlcov/ + +# Editor / IDE +.idea/ +.vscode/ +*.swp +*~ +.DS_Store + +# Claude Code: personal overrides (settings.json itself is checked in) +.claude/settings.local.json + +# Diagnostic output from `hal diagnose ... 
| tee diagnose-*.log` +diagnose-*.log diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8aaa50e --- /dev/null +++ b/Makefile @@ -0,0 +1,39 @@ +# Top-level targets for the hetzner-arch-luks helper package. +# +# Usage: +# make install # editable install for the current user +# make uninstall +# make clean # remove Python build artifacts +# make check # quick smoke tests (imports + --help) + +PYTHON ?= python3 +PIP ?= $(PYTHON) -m pip + +.DEFAULT_GOAL := help +.PHONY: help install install-system uninstall clean check + +help: + @echo "Targets:" + @echo " install pip install --user -e ." + @echo " install-system pip install -e . (system-wide; needs sudo or venv)" + @echo " uninstall remove the installed package" + @echo " clean remove __pycache__, *.egg-info, build/, dist/" + @echo " check run package smoke tests" + +install: + $(PIP) install --user -e . + +install-system: + $(PIP) install -e . + +uninstall: + $(PIP) uninstall -y hetzner-arch-luks + +clean: + rm -rf build dist + find . -type d -name '__pycache__' -prune -exec rm -rf {} + + find . -type d -name '*.egg-info' -prune -exec rm -rf {} + + +check: + $(PYTHON) -m hetzner_arch_luks --help >/dev/null + $(PYTHON) -c "from hetzner_arch_luks import cli, ssh, probe, remote; print('imports OK')" diff --git a/README.md b/README.md index cc37b54..7386c9c 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,30 @@ The following symbols show in which environment the code is executed: * :ghost: Chroot from Rescue System into Arch * :minidisc: Arch OS +## CLI helper (`hal`) +This repo ships a small Python CLI (`hal`) that wraps the recurring SSH / LUKS / chroot dances. Install it once on your client: + +```bash +pip install --user -e . +``` + +After that, `hal` is on your `$PATH`. Subcommands used throughout the guide: + +| Command | What it does | +|---|---| +| `hal status ` | Probe reachability (ping, ports 22/222, SSH banner). No login. 
| +| `hal connect rescue ` | Wait for rescue, drop known_hosts entry, SSH in as root. | +| `hal connect chroot ` | Prompt LUKS passphrase **first** (hidden), then via rescue: assemble RAID → unlock LUKS → mount → drop into `chroot /mnt /bin/bash`. | +| `hal diagnose ` | Same setup as `connect chroot`, then runs a fixed diagnostic script inside the chroot and prints the report to stdout. | + +The passphrase prompt happens *before* the SSH connection is established, so you can type it once, walk away, and the rest runs unattended. + ## Guide ### 1. Configure and Install Image #### 1.1 Login to Hetzner Rescue System :computer: : ```bash -ssh root@your_server_ip +hal connect rescue your_server_ip ``` #### 1.2 Create the /autosetup @@ -154,8 +172,7 @@ reboot #### 4.3 Login to the rescue system :computer: : ```bash -ssh-keygen -f "$HOME/.ssh/known_hosts" -R your_server_ip -ssh root@your_server_ip +hal connect rescue your_server_ip ``` #### 4.4 Mount the "system" @@ -301,6 +318,26 @@ btrfs filesystem resize max / ## 8. Debugging ### 8.1 Login to System from Rescue System +With the rescue system already activated and running, drop straight into the chroot from your client: + +:computer: : +```bash +hal connect chroot your_server_ip +``` +You'll be prompted for the LUKS passphrase first (hidden input). The CLI then waits for rescue, assembles the RAID, opens LUKS, activates LVM, mounts `/mnt` + `/mnt/boot` + the pseudo-filesystems, and drops you into `chroot /mnt /bin/bash`. Idempotent — re-running while already mounted just re-enters the chroot. + +### 8.2 Collect diagnostics in one shot +If you want a non-interactive snapshot of the installed system's state (package versions, last-boot journal errors, sshd status, `/boot` contents, etc.): + +:computer: : +```bash +hal diagnose your_server_ip | tee "diagnose-$(date +%F-%H%M).log" +``` +The CLI runs the same setup as `connect chroot` and then a fixed inspection script inside the chroot. 
Output goes to stdout (and the log file via `tee`). + +<details>
+<summary>Manual equivalent of the unlock + mount sequence</summary> + :ambulance: : ```bash cryptsetup luksOpen /dev/md1 cryptroot @@ -311,7 +348,8 @@ mount --bind /sys /mnt/sys mount --bind /proc /mnt/proc chroot /mnt ``` -### 8.2 Logout from chroot environment +</details>
+### 8.3 Logout from chroot environment :ghost: :ambulance: : ```bash exit @@ -321,7 +359,7 @@ sync reboot ``` -### 8.3 Regenerate GRUB and Arch +### 8.4 Regenerate GRUB and Arch :ghost: : ```bash mkinitcpio -p linux diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c4bbedd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[build-system] +requires = ["setuptools>=64"] +build-backend = "setuptools.build_meta" + +[project] +name = "hetzner-arch-luks" +version = "0.1.0" +description = "CLI helpers for the hetzner-arch-luks setup: connect to rescue, drop into the encrypted chroot, probe reachability, collect diagnostics." +readme = "README.md" +requires-python = ">=3.9" +authors = [{ name = "Kevin Veen-Birkenbach" }] +license = { text = "Proprietary" } +classifiers = [ + "Environment :: Console", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", +] + +[project.scripts] +hal = "hetzner_arch_luks.cli:main" + +[tool.setuptools] +package-dir = { "" = "src" } + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +hetzner_arch_luks = ["resources/**/*.sh"] diff --git a/src/hetzner_arch_luks/__init__.py b/src/hetzner_arch_luks/__init__.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/src/hetzner_arch_luks/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/src/hetzner_arch_luks/__main__.py b/src/hetzner_arch_luks/__main__.py new file mode 100644 index 0000000..bfdcd0c --- /dev/null +++ b/src/hetzner_arch_luks/__main__.py @@ -0,0 +1,4 @@ +from .cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/hetzner_arch_luks/cli.py b/src/hetzner_arch_luks/cli.py new file mode 100644 index 0000000..6168b9c --- /dev/null +++ b/src/hetzner_arch_luks/cli.py @@ -0,0 +1,212 @@ +"""Command-line interface for the hetzner-arch-luks helpers. 
+ +Entry point: hal + +Subcommands: + status client-side reachability probe (no login) + connect rescue SSH into the rescue system + connect chroot LUKS unlock + mount + interactive chroot shell + diagnose LUKS unlock + mount + collect diagnostics + +For commands that need the LUKS passphrase, the prompt happens *first*, before +any network IO — so you can type the passphrase, walk away, and the rest runs +unattended. +""" +from __future__ import annotations + +import argparse +import sys + +from . import probe, remote + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="hal", + description="Helper CLI for the hetzner-arch-luks workflow.", + ) + sub = parser.add_subparsers(dest="cmd", required=True) + + p_status = sub.add_parser( + "status", + help="Probe reachability of a host (ping + ports + SSH banner). No login.", + ) + p_status.add_argument("host") + + p_connect = sub.add_parser( + "connect", + help="Open an interactive remote shell.", + ) + p_connect_sub = p_connect.add_subparsers(dest="target", required=True) + + p_rescue = p_connect_sub.add_parser( + "rescue", + help="SSH into the Hetzner rescue system (waits for port 22 to come up). " + "Pass extra args after the host to run them non-interactively.", + ) + p_rescue.add_argument("host") + p_rescue.add_argument( + "command", + nargs=argparse.REMAINDER, + help="Optional command + args to run on the rescue instead of opening " + "an interactive shell. Example: hal connect rescue HOST reboot", + ) + + p_chroot = p_connect_sub.add_parser( + "chroot", + help="Unlock LUKS via rescue, mount, and drop into chroot /mnt /bin/bash. 
" + "Pass extra args after the host to run them inside the chroot.", + ) + p_chroot.add_argument("host") + p_chroot.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + p_chroot.add_argument( + "command", + nargs=argparse.REMAINDER, + help="Optional command + args to run inside the chroot instead of " + "opening an interactive shell. Example: hal connect chroot HOST pacman -Q linux", + ) + + p_diag = sub.add_parser( + "diagnose", + help="Collect diagnostics from inside the installed system via rescue.", + ) + p_diag.add_argument("host") + p_diag.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_fix = sub.add_parser( + "fix-boot", + help="Apply boot/SSH fixes inside the chroot. MUTATES the installed system.", + ) + p_fix.add_argument("host") + p_fix.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_fixnet = sub.add_parser( + "fix-network", + help="Rewrite systemd-networkd .network files to use MACAddress= match. MUTATES.", + ) + p_fixnet.add_argument("host") + p_fixnet.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_dk = sub.add_parser( + "downgrade-kernel", + help="Roll the linux package back to the previous cached version. MUTATES. 
" + "Use after a kernel-bump pacman -Syu made the system unbootable.", + ) + p_dk.add_argument("host") + p_dk.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_fp = sub.add_parser( + "forget-passphrase", + help="Drop the cached LUKS passphrase for a host from the libsecret keyring.", + ) + p_fp.add_argument("host") + + p_rg = sub.add_parser( + "reinstall-grub", + help="Re-run grub-install on every disk backing /boot. MUTATES the MBR. " + "Use after a grub-package upgrade that didn't refresh the bootloader.", + ) + p_rg.add_argument("host") + p_rg.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_di = sub.add_parser( + "downgrade-initramfs", + help="Downgrade mkinitcpio + dropbear + cryptsetup + mdadm + lvm2 to the " + "version before the last pacman -Syu, then rebuild initramfs. MUTATES.", + ) + p_di.add_argument("host") + p_di.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_si = sub.add_parser( + "use-static-ip", + help="Replace ip=dhcp in /etc/default/grub with a static kernel-cmdline " + "network spec (derived from /etc/systemd/network/*.network). MUTATES.", + ) + p_si.add_argument("host") + p_si.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + p_us = sub.add_parser( + "upgrade-system", + help="Full pacman -Syyu + initramfs rebuild + grub-install on every boot disk " + "+ grub.cfg regen, all in one chroot session. Uses --disable-sandbox " + "to work around the Hetzner Rescue kernel's missing Landlock. 
MUTATES.", + ) + p_us.add_argument("host") + p_us.add_argument( + "--no-passphrase-prompt", + action="store_true", + help="Skip the early LUKS prompt (use when LUKS is already open from a prior run).", + ) + + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _build_parser().parse_args(argv) + + if args.cmd == "status": + return probe.status(args.host) + if args.cmd == "connect" and args.target == "rescue": + return remote.connect_rescue(args.host, command=args.command or None) + if args.cmd == "connect" and args.target == "chroot": + return remote.connect_chroot( + args.host, + ask_passphrase=not args.no_passphrase_prompt, + command=args.command or None, + ) + if args.cmd == "diagnose": + return remote.diagnose(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "fix-boot": + return remote.fix_boot(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "fix-network": + return remote.fix_network(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "downgrade-kernel": + return remote.downgrade_kernel(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "forget-passphrase": + return remote.forget_passphrase(args.host) + if args.cmd == "reinstall-grub": + return remote.reinstall_grub(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "downgrade-initramfs": + return remote.downgrade_initramfs(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "use-static-ip": + return remote.use_static_ip(args.host, ask_passphrase=not args.no_passphrase_prompt) + if args.cmd == "upgrade-system": + return remote.upgrade_system(args.host, ask_passphrase=not args.no_passphrase_prompt) + + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/hetzner_arch_luks/probe.py b/src/hetzner_arch_luks/probe.py new file mode 100644 index 0000000..f04f9b4 --- /dev/null +++ b/src/hetzner_arch_luks/probe.py @@ -0,0 +1,55 @@ 
+"""Client-side reachability probes that need no SSH credentials.""" +from __future__ import annotations + +import shutil +import socket +import subprocess + + +def _have(cmd: str) -> bool: + return shutil.which(cmd) is not None + + +def _ssh_banner(host: str, port: int = 22, timeout: float = 3) -> str: + """Read the first line the SSH server emits on connect. + + Distinguishes Hetzner rescue (Debian OpenSSH banner) from installed Arch + (Arch OpenSSH banner) from Dropbear (Dropbear banner). + """ + try: + with socket.create_connection((host, port), timeout=timeout) as s: + s.settimeout(2) + data = s.recv(256) + return data.decode("utf-8", errors="replace").splitlines()[0] if data else "" + except (OSError, socket.timeout, UnicodeDecodeError): + return "" + + +def status(host: str) -> int: + """Print a reachability report for `host`. Returns 0 always.""" + print(f"==> ping (ICMP) {host}") + try: + subprocess.run(["ping", "-c", "2", "-W", "2", host], check=False) + except FileNotFoundError: + print("(ping not available)") + + print() + print(f"==> ports 22, 222 on {host}") + if _have("nmap"): + subprocess.run(["nmap", "-Pn", "-p", "22,222", host], check=False) + else: + print("(nmap not installed; falling back to TCP probes)") + for port in (22, 222): + ok = False + try: + with socket.create_connection((host, port), timeout=3): + ok = True + except (OSError, socket.timeout): + pass + print(f" {port}: {'reachable' if ok else 'not reachable (filtered/closed/timeout)'}") + + print() + print(f"==> SSH banner on {host}:22") + banner = _ssh_banner(host, 22) + print(banner if banner else "(no banner)") + return 0 diff --git a/src/hetzner_arch_luks/remote.py b/src/hetzner_arch_luks/remote.py new file mode 100644 index 0000000..6053852 --- /dev/null +++ b/src/hetzner_arch_luks/remote.py @@ -0,0 +1,303 @@ +"""Orchestrates the rescue / chroot / diagnose flows over an SshSession. 
+ +Key UX choices: + - The LUKS passphrase is prompted *before* we touch the network, so the + user enters it once and can step away while the rest runs. + - On first prompt the passphrase is cached in the libsecret keyring + (GNOME Keyring / KWallet via secret-tool) so subsequent runs against + the same host skip the prompt entirely. +""" +from __future__ import annotations + +import getpass +import importlib.resources +import shlex +import shutil +import subprocess +import sys + +from .ssh import SshSession, wait_for_port + + +# Pre-LUKS step: assemble the RAID arrays. Idempotent (mdadm returns non-zero +# when arrays are already assembled — we swallow that). +_ASSEMBLE = "mdadm --assemble --scan 2>/dev/null || true" + +# Post-LUKS step: activate LVM, mount root + boot, bind /dev /proc /sys /run. +# Idempotent: every mount is guarded with `mountpoint -q`. +_MOUNT = r""" +set -e +vgchange -ay >/dev/null +if ! mountpoint -q /mnt; then + mount /dev/vg0/root /mnt + mkdir -p /mnt/boot + mount /dev/md0 /mnt/boot +fi +for d in dev proc sys run; do + mountpoint -q "/mnt/$d" || mount --rbind "/$d" "/mnt/$d" +done +""" + +# Schema for libsecret entries: +# service = hetzner-arch-luks +# host = +_KEYRING_SERVICE = "hetzner-arch-luks" + + +# ---- keyring helpers (libsecret via secret-tool) --------------------------- + + +def _have_secret_tool() -> bool: + return shutil.which("secret-tool") is not None + + +def _keyring_load(host: str) -> str | None: + """Look up the cached LUKS passphrase for `host`. 
None if not stored.""" + if not _have_secret_tool(): + return None + r = subprocess.run( + ["secret-tool", "lookup", "service", _KEYRING_SERVICE, "host", host], + capture_output=True, text=True, + ) + if r.returncode == 0 and r.stdout: + # secret-tool prints the secret raw, without trailing newline + return r.stdout + return None + + +def _keyring_store(host: str, passphrase: str) -> None: + """Persist `passphrase` in libsecret under (service, host).""" + if not _have_secret_tool(): + return + label = f"hetzner-arch-luks LUKS passphrase for {host}" + subprocess.run( + [ + "secret-tool", "store", "--label", label, + "service", _KEYRING_SERVICE, "host", host, + ], + input=passphrase, text=True, check=False, + ) + + +def _keyring_clear(host: str) -> bool: + """Drop the cached passphrase for `host`. Returns True if anything was deleted.""" + if not _have_secret_tool(): + return False + if _keyring_load(host) is None: + return False + subprocess.run( + ["secret-tool", "clear", "service", _KEYRING_SERVICE, "host", host], + check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + return True + + +# ---- passphrase prompt ----------------------------------------------------- + + +def _prompt_passphrase(host: str, *, force_prompt: bool = False) -> str: + """Get the LUKS passphrase for `host`. + + Order: + 1. Try the libsecret keyring (skipped if force_prompt=True or + secret-tool isn't installed). + 2. Hidden prompt via getpass. On success, store to the keyring for + next time. + + Empty input aborts the whole command. 
+ """ + if not force_prompt: + cached = _keyring_load(host) + if cached: + print(f"(passphrase from keyring for {host})", file=sys.stderr) + return cached + p = getpass.getpass(f"LUKS passphrase for {host}: ") + if not p: + print("Empty passphrase — aborting.", file=sys.stderr) + sys.exit(1) + _keyring_store(host, p) + return p + + +# ---- session helpers ------------------------------------------------------- + + +def _wait_rescue(host: str, timeout: int = 300) -> None: + print(f"==> Waiting for {host}:22 ...") + if not wait_for_port(host, 22, timeout=timeout): + print(f"Timeout: {host}:22 not reachable after {timeout}s", file=sys.stderr) + sys.exit(1) + + +def _luks_is_open(ssh: SshSession) -> bool: + r = ssh.run("test -e /dev/mapper/cryptroot", check=False, capture=True) + return r.returncode == 0 + + +def _ensure_unlocked(ssh: SshSession, host: str, passphrase: str | None) -> None: + """Open LUKS if needed. Retries once with a fresh prompt if the cached + passphrase from the keyring is rejected by cryptsetup. + + cryptsetup reads the passphrase from stdin (via --key-file=-) and stops + at EOF. We send raw bytes with no trailing newline. + """ + if _luks_is_open(ssh): + print("==> LUKS already open.") + return + if passphrase is None: + passphrase = _prompt_passphrase(host) + print("==> Opening LUKS ...") + try: + ssh.run( + "cryptsetup luksOpen --key-file=- /dev/md1 cryptroot", + input_=passphrase.encode(), + ) + except subprocess.CalledProcessError: + # Most likely: wrong passphrase. If we got it from the keyring, + # clear the bad entry and re-prompt once. + if _keyring_clear(host): + print( + "==> cryptsetup rejected the cached passphrase. 
Cleared keyring; re-prompting.", + file=sys.stderr, + ) + passphrase = _prompt_passphrase(host, force_prompt=True) + ssh.run( + "cryptsetup luksOpen --key-file=- /dev/md1 cryptroot", + input_=passphrase.encode(), + ) + else: + raise + + +def _setup(ssh: SshSession, host: str, passphrase: str | None) -> None: + """Full sequence: assemble + LUKS + LVM + mount + binds.""" + print("==> Assembling RAID ...") + ssh.run(_ASSEMBLE) + _ensure_unlocked(ssh, host, passphrase) + print("==> Activating LVM + mounting + binding ...") + ssh.run(_MOUNT) + + +# ---- public entry points (called by cli.py) -------------------------------- + + +def connect_rescue(host: str, *, command: list[str] | None = None) -> int: + """Wait for rescue to come up, then either open an interactive SSH shell + or run `command` non-interactively and print its output. + + No passphrase prompt — rescue itself isn't encrypted. + """ + _wait_rescue(host) + with SshSession(host) as ssh: + if command: + cmd_str = " ".join(shlex.quote(c) for c in command) + print(f"==> Running on rescue: {cmd_str}") + ssh.run(cmd_str, check=False) + else: + print("==> Connected to rescue. Type 'exit' to leave.") + ssh.run("exec bash -l", tty=True, check=False) + return 0 + + +def connect_chroot( + host: str, + *, + ask_passphrase: bool = True, + command: list[str] | None = None, +) -> int: + """Unlock LUKS via rescue, mount, then either open an interactive chroot + shell or run `command` inside the chroot non-interactively and print + its output.""" + passphrase = _prompt_passphrase(host) if ask_passphrase else None + _wait_rescue(host) + with SshSession(host) as ssh: + _setup(ssh, host, passphrase) + if command: + # Pipe the command into chroot's bash via stdin — avoids all the + # quoting layers of `bash -c ''` and is identical to how the + # diagnose/fix scripts are streamed in. 
+ cmd_str = " ".join(shlex.quote(c) for c in command) + print(f"==> Running in chroot: {cmd_str}") + ssh.run("chroot /mnt /bin/bash", input_=(cmd_str + "\n").encode()) + else: + print("==> Entering chroot. Type 'exit' to leave.") + ssh.run("chroot /mnt /bin/bash", tty=True, check=False) + return 0 + + +def diagnose(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + run the chrooted diagnose script. Output goes to stdout.""" + return _run_chroot_script(host, "diagnose/inside.sh", "diagnose", ask_passphrase) + + +def fix_boot(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + apply boot/SSH fixes inside chroot. MUTATES the system.""" + return _run_chroot_script(host, "fix/boot.sh", "fix-boot", ask_passphrase) + + +def fix_network(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + rewrite .network files to use MACAddress= match. MUTATES.""" + return _run_chroot_script(host, "fix/network.sh", "fix-network", ask_passphrase) + + +def downgrade_kernel(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + downgrade linux to the previous cached version. MUTATES.""" + return _run_chroot_script(host, "fix/kernel.sh", "downgrade-kernel", ask_passphrase) + + +def reinstall_grub(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + grub-install on every disk backing /boot's RAID. MUTATES MBR.""" + return _run_chroot_script(host, "fix/grub.sh", "reinstall-grub", ask_passphrase) + + +def downgrade_initramfs(host: str, *, ask_passphrase: bool = True) -> int: + """Downgrade mkinitcpio+dropbear+cryptsetup+mdadm+lvm2, rebuild initramfs. MUTATES.""" + return _run_chroot_script(host, "fix/initramfs.sh", "downgrade-initramfs", ask_passphrase) + + +def use_static_ip(host: str, *, ask_passphrase: bool = True) -> int: + """Replace ip=dhcp in /etc/default/grub with a static spec parsed from + the existing systemd-networkd .network file. Regenerates grub.cfg. 
MUTATES.""" + return _run_chroot_script(host, "fix/static_ip.sh", "use-static-ip", ask_passphrase) + + +def upgrade_system(host: str, *, ask_passphrase: bool = True) -> int: + """Unlock + mount + full `pacman -Syu` + rebuild initramfs + refresh GRUB + (config + MBR on all boot disks). Uses --disable-sandbox because the + Hetzner Rescue kernel lacks Landlock. MUTATES.""" + return _run_chroot_script(host, "maintain/upgrade.sh", "upgrade-system", ask_passphrase) + + +def forget_passphrase(host: str) -> int: + """Drop the stored LUKS passphrase for `host` from the libsecret keyring.""" + if not _have_secret_tool(): + print("secret-tool not installed — no keyring backend; nothing to clear.", + file=sys.stderr) + return 1 + if _keyring_clear(host): + print(f"Cleared cached LUKS passphrase for {host}.") + return 0 + print(f"No cached LUKS passphrase for {host}.") + return 0 + + +def _run_chroot_script(host: str, resource: str, label: str, ask_passphrase: bool) -> int: + """Shared driver: unlock + mount + pipe a packaged script into chrooted bash. + + The script is streamed as stdin to `chroot /mnt /bin/bash`; bash reads its + program from stdin, so it runs inside the chroot without leaving any file + on the target. + """ + passphrase = _prompt_passphrase(host) if ask_passphrase else None + _wait_rescue(host) + inside = ( + importlib.resources + .files("hetzner_arch_luks") + .joinpath(f"resources/{resource}") + .read_bytes() + ) + with SshSession(host) as ssh: + _setup(ssh, host, passphrase) + print(f"==> Running {label} inside chroot ...") + ssh.run("chroot /mnt /bin/bash", input_=inside) + return 0 diff --git a/src/hetzner_arch_luks/resources/diagnose/inside.sh b/src/hetzner_arch_luks/resources/diagnose/inside.sh new file mode 100644 index 0000000..a1955d4 --- /dev/null +++ b/src/hetzner_arch_luks/resources/diagnose/inside.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Runs INSIDE the chroot of the installed Arch system. Prints diagnostics +# grouped by banner. 
Read-only — no state changes. + +banner() { printf "\n========== %s ==========\n" "$1"; } + +banner "uname / os-release" +uname -a +cat /etc/os-release + +banner "package versions (boot/storage/net/ssh)" +pacman -Q linux mkinitcpio openssh systemd device-mapper lvm2 grub \ + cryptsetup mdadm dropbear 2>&1 +pacman -Q mkinitcpio-utils mkinitcpio-dropbear mkinitcpio-netconf 2>&1 || true + +banner "recent upgrades of boot/network/sshd components (last 60 matches)" +# Focused on the packages that most often break a Hetzner Arch+LUKS boot. +grep -E '\[ALPM\] (upgraded|installed|removed) (linux( |$)|systemd( |$)|mkinitcpio( |$)|openssh( |$)|dropbear( |$)|glibc( |$)|cryptsetup( |$)|lvm2( |$)|mdadm( |$)|grub( |$)|iproute2( |$)|nftables( |$)|iptables( |$)|firewalld( |$)|fail2ban( |$)|mkinitcpio-utils( |$)|mkinitcpio-dropbear( |$)|mkinitcpio-netconf( |$))' /var/log/pacman.log 2>/dev/null \ + | tail -60 \ + || echo "(no matches)" + +banner "last full-system upgrade transactions" +grep -nE 'starting full system upgrade|transaction completed' /var/log/pacman.log 2>/dev/null \ + | tail -10 || echo "(no matches)" + +banner "initcpio udev rules shipped on disk" +ls -l /usr/lib/initcpio/udev/ 2>&1 + +banner "is the historically broken file present?" +ls -l /usr/lib/initcpio/udev/11-dm-initramfs.rules 2>&1 || echo "absent" + +banner "encryptssh install hook still references it?" 
+grep -n "11-dm-initramfs.rules" \ + /usr/lib/initcpio/install/encryptssh \ + /etc/initcpio/install/encryptssh 2>/dev/null || echo "no match" + +banner "mkinitcpio.conf (HOOKS, MODULES, BINARIES, FILES, COMPRESSION)" +grep -E '^(HOOKS|MODULES|BINARIES|FILES|COMPRESSION)=' /etc/mkinitcpio.conf 2>&1 + +banner "/etc/crypttab" +cat /etc/crypttab 2>&1 || true + +banner "/etc/fstab" +cat /etc/fstab 2>&1 || true + +banner "/boot contents and free space" +ls -lh /boot 2>&1 +df -h /boot 2>&1 + +banner "GRUB config + bootloader state" +ls -lh /boot/grub/ 2>&1 +echo +if [ -f /boot/grub/grub.cfg ]; then + if command -v grub-script-check >/dev/null 2>&1; then + grub-script-check /boot/grub/grub.cfg 2>&1 && echo "grub.cfg: syntax OK" + else + echo "grub-script-check not available — skipping syntax check" + fi + echo + echo "-- menuentry / linux / initrd lines (first 40):" + grep -nE '^\s*(linux|initrd|menuentry)' /boot/grub/grub.cfg 2>&1 | head -40 + + echo + echo "-- referenced kernel/initramfs files exist?" + for p in $(grep -hE '^\s*(linux|initrd)\b' /boot/grub/grub.cfg 2>/dev/null \ + | awk '{print $2}' | sort -u); do + if [ -e "$p" ]; then echo "EXISTS $p" + elif [ -e "/boot${p}" ]; then echo "EXISTS /boot${p} (grub.cfg path: $p)" + else echo "MISSING $p" + fi + done +else + echo "/boot/grub/grub.cfg NOT FOUND" +fi +echo +echo "-- grubenv:" +grub-editenv /boot/grub/grubenv list 2>/dev/null || cat /boot/grub/grubenv 2>/dev/null | head -5 || echo "(no grubenv)" + +banner "initramfs contents — key tools actually packed in?" +if command -v lsinitcpio >/dev/null 2>&1; then + echo "-- matches in /boot/initramfs-linux.img:" + lsinitcpio /boot/initramfs-linux.img 2>/dev/null \ + | grep -E '(cryptsetup|dropbear|encryptssh|netconf|mdadm|lvm|/init$|hooks/)' \ + | sort -u | head -50 +else + echo "lsinitcpio not available" +fi + +banner "network: which service manages it?" 
+for u in systemd-networkd NetworkManager netctl-auto dhcpcd; do + printf " %-22s %s\n" "$u" "$(systemctl is-enabled "$u" 2>&1)" +done +# dhcpcd@interface units (Arch default for static-ish setups) +systemctl list-unit-files 'dhcpcd@*' --no-pager 2>/dev/null | grep -E 'dhcpcd@' || true + +banner "network: config files present" +echo "-- /etc/systemd/network/" +ls -la /etc/systemd/network/ 2>&1 | head -20 || echo "(empty/missing)" +echo +echo "-- /etc/NetworkManager/system-connections/" +ls -la /etc/NetworkManager/system-connections/ 2>&1 | head -20 || echo "(empty/missing)" +echo +echo "-- /etc/netctl/" +ls -la /etc/netctl/ 2>&1 | head -20 || echo "(empty/missing)" +echo +echo "-- /etc/hostname / /etc/hosts" +cat /etc/hostname 2>&1 || true +echo "---" +cat /etc/hosts 2>&1 || true + +banner "firewall units (would persist across reboots)" +for u in nftables iptables ip6tables firewalld ufw fail2ban docker; do + printf " %-12s %s\n" "$u" "$(systemctl is-enabled "$u" 2>&1)" +done +echo +if [ -f /etc/nftables.conf ]; then + echo "-- /etc/nftables.conf (first 60 lines):" + head -60 /etc/nftables.conf +fi +[ -f /etc/iptables/iptables.rules ] && { echo "-- /etc/iptables/iptables.rules (head 40):"; head -40 /etc/iptables/iptables.rules; } + +banner "sshd state + drop-ins" +sshd -t 2>&1 +systemctl is-enabled sshd 2>&1 +grep -nE '^Port|^ListenAddress|^PermitRootLogin' /etc/ssh/sshd_config 2>&1 || true +echo +echo "-- sshd_config.d/ drop-ins (can override main config!):" +ls -la /etc/ssh/sshd_config.d/ 2>&1 || echo "(no drop-ins dir)" +for f in /etc/ssh/sshd_config.d/*.conf; do + [ -e "$f" ] || continue + echo + echo "-- $f:" + cat "$f" +done + +banner "journal: which boots are actually recorded?" 
+journalctl --list-boots --no-pager 2>&1 | tail -15 + +banner "last recorded boot (-b 0): all errors" +journalctl -b 0 -p err --no-pager 2>&1 | head -100 || true + +banner "last recorded boot (-b 0): sshd" +journalctl -b 0 -u sshd --no-pager 2>&1 | head -40 || true + +banner "last recorded boot (-b 0): cryptsetup / dropbear / network units" +journalctl -b 0 \ + -u 'systemd-cryptsetup*' -u 'dropbear*' \ + -u 'systemd-networkd*' -u 'NetworkManager*' -u 'dhcpcd*' \ + --no-pager 2>&1 | head -80 || true + +banner "previous boot (-b -1): errors (only if a previous boot is recorded)" +journalctl -b -1 -p err --no-pager 2>&1 | head -50 || true + +banner "failed units of last boot" +systemctl --failed --no-pager 2>&1 || true diff --git a/src/hetzner_arch_luks/resources/fix/boot.sh b/src/hetzner_arch_luks/resources/fix/boot.sh new file mode 100644 index 0000000..371794a --- /dev/null +++ b/src/hetzner_arch_luks/resources/fix/boot.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Runs INSIDE the chroot of the installed Arch system. Applies the recommended +# boot / SSH fixes: +# +# 1. PermitRootLogin: rewrite a literal "no" line to "prohibit-password" +# in /etc/ssh/sshd_config AND any drop-in under /etc/ssh/sshd_config.d/. +# Backups are kept once as *.hal-backup. +# 2. Persistent journald: create /var/log/journal so journald survives +# reboot (next boot onwards). Helps catch the next failure if there is one. +# +# Idempotent: re-running is safe — no-op on already-fixed configs. 
+ +set -e + +banner() { printf "\n========== %s ==========\n" "$1"; } + +banner "PermitRootLogin (before)" +grep -rn '^PermitRootLogin' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/ 2>/dev/null \ + || echo "(no explicit setting found)" + +changed=0 +for f in /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*.conf; do + [ -e "$f" ] || continue + if grep -q '^PermitRootLogin no$' "$f"; then + [ -f "$f.hal-backup" ] || cp -a "$f" "$f.hal-backup" + sed -i 's/^PermitRootLogin no$/PermitRootLogin prohibit-password/' "$f" + echo "==> Patched: $f (backup at $f.hal-backup)" + changed=1 + fi +done +[ "$changed" -eq 0 ] && echo "==> Nothing to patch — PermitRootLogin is not 'no' anywhere." + +banner "PermitRootLogin (after)" +grep -rn '^PermitRootLogin' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/ 2>/dev/null \ + || echo "(no explicit setting found)" + +banner "sshd_config syntax check" +sshd -t && echo "syntax OK" + +banner "persistent journald" +if [ ! -d /var/log/journal ]; then + mkdir -p /var/log/journal + systemd-tmpfiles --create --prefix /var/log/journal 2>&1 || true + echo "==> Created /var/log/journal. journald will persist from next boot onwards." +else + echo "/var/log/journal already exists — journald is already persistent." +fi + +banner "/boot space" +df -h /boot +ls -lh /boot + +banner "summary" +echo "Done. The changes take effect on the NEXT boot of the installed system." +echo "Exit the chroot and reboot out of rescue when ready." diff --git a/src/hetzner_arch_luks/resources/fix/grub.sh b/src/hetzner_arch_luks/resources/fix/grub.sh new file mode 100644 index 0000000..4036f83 --- /dev/null +++ b/src/hetzner_arch_luks/resources/fix/grub.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Re-install GRUB stage1 + core.img to the MBR of every physical disk that +# backs /boot's RAID array. 
Needed when a `pacman -Syu` updated the grub +# package but grub-install was never re-run afterwards, leaving stale +# Stage1 code in the MBR that may not understand the new modules in +# /boot/grub/i386-pc/. +# +# Also regenerates /boot/grub/grub.cfg. +# +# Boot disks are auto-detected from the components of /dev/md0. +# Targets BIOS GRUB (--target=i386-pc); the existing /boot/grub/i386-pc/ +# directory confirms this is a BIOS setup. + +set -e + +banner() { printf "\n========== %s ==========\n" "$1"; } + +banner "current /boot/grub state" +ls -lh /boot/grub/ +echo +echo "-- /boot/grub/i386-pc/ — most recent files:" +ls -lt /boot/grub/i386-pc/ 2>/dev/null | head -8 + +banner "identifying boot disks (members of md0)" +if [ ! -e /dev/md0 ]; then + echo "ERROR: /dev/md0 does not exist. Was the RAID assembled before chroot?" + exit 1 +fi +echo "-- mdadm --detail /dev/md0 (member partitions):" +mdadm --detail /dev/md0 | awk '/active sync/ {print " " $NF}' + +# Convert a partition path to its parent disk. lsblk fails inside our chroot +# (can't resolve PKNAME against the rescue-bound /sys), so use the standard +# Linux device naming conventions instead. 
# parent_disk PARTITION
#
# Print the whole-disk device that contains PARTITION (e.g. /dev/sda1 ->
# /dev/sda, /dev/nvme0n1p2 -> /dev/nvme0n1).
#
# lsblk cannot resolve PKNAME inside the chroot (it reads the rescue-bound
# /sys), so the common Linux device naming schemes are handled with plain
# pattern matching first; lsblk is only a last resort.
#
# Output:  the parent disk path on stdout, or nothing if it cannot be resolved.
# Status:  always 0. The calling scripts run under `set -e` and use
#          `disk=$(parent_disk "$part")`; a non-zero status here would abort
#          the whole script before the caller's own "cannot resolve" warning
#          could be printed, so "not found" is signalled by empty output only.
parent_disk() {
    local part="$1"
    local d=""
    case "$part" in
        # Schemes with a "p" separator before the partition number:
        # strip the shortest trailing "p<digits>".
        /dev/nvme[0-9]*n[0-9]*p[0-9]* | /dev/mmcblk[0-9]*p[0-9]* | /dev/loop[0-9]*p[0-9]*)
            echo "${part%p[0-9]*}"
            ;;
        # Classic sdX / vdX / hdX naming: the partition number is simply
        # appended, so drop the trailing digits.
        /dev/sd[a-z]*[0-9]* | /dev/vd[a-z]*[0-9]* | /dev/hd[a-z]*[0-9]*)
            echo "$part" | sed -E 's/[0-9]+$//'
            ;;
        *)
            # Last resort - ask lsblk; it may print nothing (or be missing)
            # inside the chroot, which must not count as a failure.
            d=$(lsblk -no PKNAME "$part" 2>/dev/null | head -1) || d=""
            if [ -n "$d" ]; then
                echo "/dev/$d"
            fi
            ;;
    esac
    return 0
}
# prev_version PKG [LOGFILE]
#
# Print the version PKG had *before* its most recent upgrade, taken from the
# last "[ALPM] upgraded PKG (OLD -> NEW)" entry in the pacman log.
#
#   PKG      exact package name; matched literally (regex metacharacters such
#            as "+" or "." in names like "libsigc++" are escaped).
#   LOGFILE  pacman log to read; defaults to /var/log/pacman.log.
#
# Output:  OLD on stdout, or nothing when the log is missing/unreadable or
#          holds no upgrade entry for PKG.
# Status:  0 in all of those cases, so `prev=$(prev_version ...)` is safe
#          under `set -e`; callers test for an empty result instead.
prev_version() {
    local pkg="$1"
    local log="${2:-/var/log/pacman.log}"
    local pkg_re
    # Escape ERE metacharacters so the name cannot alter the pattern.
    pkg_re=$(printf '%s' "$pkg" | sed -E 's/[][\.*^$()+?{}|]/\\&/g')
    # The "NAME (" anchor (name, space, open paren) keeps "mkinitcpio" from
    # matching "mkinitcpio-utils"; only the last matching entry is reported.
    sed -nE "s/.*\[ALPM\] upgraded ${pkg_re} \(([^ ]+) -> [^)]+\).*/\1/p" "$log" 2>/dev/null \
        | tail -1
}
"FATAL: Could not parse a previous kernel version from /var/log/pacman.log." + echo " Pacman log entries for 'linux' upgrades:" + grep -E '\[ALPM\] (installed|upgraded) linux \(' /var/log/pacman.log 2>/dev/null \ + | tail -5 || echo " (none found)" + exit 1 +fi + +echo "Currently installed: linux-$CURR" +echo "Previous version: linux-$PREV" + +if [ "$PREV" = "$CURR" ]; then + echo "Already on the previous version. Nothing to do." + exit 0 +fi + +PKG_NAME="linux-${PREV}-x86_64.pkg.tar.zst" +CACHE_PATH="/var/cache/pacman/pkg/${PKG_NAME}" + +banner "locating package" +TARGET="" +if [ -e "$CACHE_PATH" ]; then + echo "Found in cache: $CACHE_PATH" + TARGET="$CACHE_PATH" +else + echo "Not in cache. Fetching from archive.archlinux.org ..." + URL="https://archive.archlinux.org/packages/l/linux/${PKG_NAME}" + echo "URL: $URL" + if curl -fsSL --connect-timeout 15 -o "/tmp/${PKG_NAME}" "$URL"; then + TARGET="/tmp/${PKG_NAME}" + echo "Downloaded: $TARGET ($(du -h "$TARGET" | cut -f1))" + else + cat <&2 + +Download failed from $URL. +Reasons might be: + - chroot has no working DNS / no outbound network + - the specific version is no longer on archive.archlinux.org + - upstream temporarily unavailable + +Workarounds: + 1. Test network from chroot: + curl -v https://archive.archlinux.org/ + 2. Manually download on your client: + curl -O $URL + and SCP into rescue, then place at: + /mnt/tmp/${PKG_NAME} + (Inside the chroot it appears as /tmp/${PKG_NAME}.) + 3. Pick a different version — list at: + https://archive.archlinux.org/packages/l/linux/ +EOF + exit 1 + fi +fi + +banner "/boot space before" +df -h /boot +ls -lh /boot + +banner "downgrading kernel (pacman -U)" +pacman -U --noconfirm "$TARGET" + +banner "regenerating initramfs" +mkinitcpio -P + +banner "regenerating GRUB config" +grub-mkconfig -o /boot/grub/grub.cfg 2>&1 | tail -10 + +banner "/boot space after" +df -h /boot +ls -lh /boot + +banner "result" +pacman -Q linux + +banner "next steps" +cat <.hal-backup. 
+ +set -e + +banner() { printf "\n========== %s ==========\n" "$1"; } + +banner "detecting NIC MAC" +# Pick the first non-loopback link with a colon-formatted MAC. +MAC=$(ip -br link show 2>/dev/null \ + | awk '$1 != "lo" && $1 != "" && $3 ~ /^([0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}$/ {print $3; exit}') + +if [ -z "$MAC" ]; then + echo "Could not auto-detect a non-loopback MAC. Aborting." >&2 + exit 1 +fi +echo "Detected MAC: $MAC" + +banner ".network files (before)" +for f in /etc/systemd/network/*.network; do + [ -e "$f" ] || continue + echo "-- $f:" + cat "$f" + echo +done + +banner "patching" +changed=0 +for f in /etc/systemd/network/*.network; do + [ -e "$f" ] || continue + if grep -qE '^[[:space:]]*MACAddress[[:space:]]*=' "$f"; then + echo "$f: already uses MACAddress= — skipping" + continue + fi + if ! grep -qE '^[[:space:]]*Name[[:space:]]*=' "$f"; then + echo "$f: no Name= match — skipping" + continue + fi + [ -f "$f.hal-backup" ] || cp -a "$f" "$f.hal-backup" + awk -v mac="$MAC" ' + BEGIN { replaced=0 } + /^[[:space:]]*Name[[:space:]]*=/ && !replaced { print "MACAddress=" mac; replaced=1; next } + { print } + ' "$f" > "$f.tmp" && mv "$f.tmp" "$f" + echo "$f: patched (backup at $f.hal-backup)" + changed=1 +done +[ "$changed" -eq 0 ] && echo "Nothing to patch — all .network files already use MACAddress=." + +banner ".network files (after)" +for f in /etc/systemd/network/*.network; do + [ -e "$f" ] || continue + echo "-- $f:" + cat "$f" + echo +done + +banner "summary" +echo "Done. The change takes effect on the NEXT boot of the installed system." +echo "Backups (if any) are at /etc/systemd/network/*.network.hal-backup." 
diff --git a/src/hetzner_arch_luks/resources/fix/static_ip.sh b/src/hetzner_arch_luks/resources/fix/static_ip.sh new file mode 100644 index 0000000..35acfe0 --- /dev/null +++ b/src/hetzner_arch_luks/resources/fix/static_ip.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Replaces `ip=dhcp` in /etc/default/grub with a static kernel-cmdline +# network spec derived from the existing /etc/systemd/network/*.network file. +# +# Why: Dropbear-in-initramfs relies on a working network for remote LUKS +# unlock. On Hetzner Dedicated, `ip=dhcp` is fragile — Hetzner's own docs +# recommend static configuration for FDE+Dropbear setups. A kernel/iproute2 +# upgrade can subtly change the DHCP request format and break the +# previously-working DHCP path. +# +# The .network file already has the correct values (IP, gateway). This +# script reuses them in the kernel cmdline so dropbear has network in +# initramfs without depending on Hetzner DHCP. +# +# Resulting cmdline format (Linux kernel `ip=` documented form): +# ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> +# We use: +# ip=46.4.224.77::46.4.224.65:255.255.255.255:echoserver:eth0:none +# +# Idempotent: re-running won't double-patch. +# Reversible: original /etc/default/grub backed up to .hal-backup. + +set -e + +banner() { printf "\n========== %s ==========\n" "$1"; } + +banner "locating systemd-networkd config" +NETFILE="" +for f in /etc/systemd/network/*.network; do + [ -e "$f" ] || continue + NETFILE="$f" + break +done +if [ -z "$NETFILE" ]; then + echo "ERROR: no /etc/systemd/network/*.network file found." + echo " Cannot derive static IP/gateway." + exit 1 +fi +echo "Using: $NETFILE" +echo +cat "$NETFILE" + +banner "parsing" +# IPv4 address: first Address= or [Address]/Address= line without colon. +IPV4=$(awk ' + /^[[:space:]]*Address[[:space:]]*=/ { + sub(/^[[:space:]]*Address[[:space:]]*=[[:space:]]*/, "") + if ($0 !~ /:/) { print; exit } + } +' "$NETFILE") +IPV4_BARE="${IPV4%%/*}" + +# Gateway: first IPv4 Gateway= line.
+GATEWAY=$(awk ' + /^[[:space:]]*Gateway[[:space:]]*=/ { + sub(/^[[:space:]]*Gateway[[:space:]]*=[[:space:]]*/, "") + if ($0 !~ /:/) { print; exit } + } +' "$NETFILE") + +HOST="$(cat /etc/hostname 2>/dev/null | head -1 | tr -d ' \t\n' || true)" +[ -z "$HOST" ] && HOST="host" + +# Device: 'eth0' matches the kernel pre-udev naming of the first ethernet +# interface and is what Hetzner uses in their FDE-static-IP docs. +DEVICE="eth0" + +echo " IPv4: $IPV4_BARE" +echo " Gateway: $GATEWAY" +echo " Hostname: $HOST" +echo " Device: $DEVICE" + +if [ -z "$IPV4_BARE" ] || [ -z "$GATEWAY" ]; then + echo "ERROR: could not parse IPv4 address or gateway from $NETFILE." + exit 1 +fi + +IPSPEC="ip=${IPV4_BARE}::${GATEWAY}:255.255.255.255:${HOST}:${DEVICE}:none" +echo +echo "Will set kernel cmdline param: $IPSPEC" + +banner "current /etc/default/grub" +cat /etc/default/grub + +banner "patching /etc/default/grub" +if grep -qE 'ip=dhcp' /etc/default/grub; then + [ -f /etc/default/grub.hal-backup ] || cp -a /etc/default/grub /etc/default/grub.hal-backup + # Replace just the ip=dhcp token (leaves all other kernel params untouched) + sed -i -E "s|ip=dhcp|${IPSPEC}|g" /etc/default/grub + echo "Replaced ip=dhcp → $IPSPEC" + echo "Backup: /etc/default/grub.hal-backup" +elif grep -qE "ip=${IPV4_BARE//./\\.}::" /etc/default/grub; then + echo "Static ip= already configured for $IPV4_BARE — no change." +elif grep -qE 'ip=' /etc/default/grub; then + echo "WARNING: /etc/default/grub has an ip= directive that's neither dhcp" + echo " nor the expected static spec. Manual review needed:" + grep -nE 'ip=' /etc/default/grub + echo "Aborting — won't blindly overwrite an unknown ip= value." + exit 1 +else + echo "No ip= directive found in GRUB_CMDLINE_LINUX. Manual edit may be needed." 
+ exit 1 +fi + +banner "patched /etc/default/grub" +cat /etc/default/grub + +banner "regenerating /boot/grub/grub.cfg" +grub-mkconfig -o /boot/grub/grub.cfg 2>&1 | tail -10 + +banner "verifying" +echo "-- ip= lines in new grub.cfg:" +grep -nE '\bip=' /boot/grub/grub.cfg | head -5 || echo "(no ip= line found — unexpected)" + +banner "next steps" +cat </dev/null | head -1) + [ -n "$d" ] && echo "/dev/$d" + ;; + esac +} + +banner "pre-upgrade state" +echo "-- key packages BEFORE:" +pacman -Q linux mkinitcpio systemd openssh dropbear cryptsetup mdadm lvm2 grub 2>&1 | head -15 +echo +echo "-- /boot space BEFORE:" +df -h /boot + +banner "running pacman -Syyu (with --disable-sandbox for Rescue kernel)" +pacman --disable-sandbox -Syyu --noconfirm + +banner "rebuilding initramfs" +mkinitcpio -P + +banner "identifying boot disks (members of md0)" +if [ ! -e /dev/md0 ]; then + echo "ERROR: /dev/md0 not present. RAID not assembled? Aborting GRUB step." + exit 1 +fi +BOOT_DISKS=() +for part in $(mdadm --detail /dev/md0 2>/dev/null | awk '/active sync/ {print $NF}'); do + disk=$(parent_disk "$part") + [ -z "$disk" ] && { echo "WARN: cannot resolve parent disk for $part"; continue; } + already=0 + for d in "${BOOT_DISKS[@]}"; do [ "$d" = "$disk" ] && already=1; done + [ "$already" -eq 0 ] && BOOT_DISKS+=("$disk") +done +echo "Boot disks: ${BOOT_DISKS[*]}" + +banner "refreshing GRUB on all boot disks" +for disk in "${BOOT_DISKS[@]}"; do + echo + echo "-- grub-install --target=i386-pc --recheck $disk" + grub-install --target=i386-pc --recheck "$disk" +done + +banner "regenerating /boot/grub/grub.cfg" +grub-mkconfig -o /boot/grub/grub.cfg 2>&1 | tail -10 + +banner "post-upgrade state" +echo "-- key packages AFTER:" +pacman -Q linux mkinitcpio systemd openssh dropbear cryptsetup mdadm lvm2 grub 2>&1 | head -15 +echo +echo "-- /boot space AFTER:" +df -h /boot + +banner "summary" +cat <\` afterwards to + harden the initramfs network against future DHCP issues. + 2. 
Exit chroot, umount -R /mnt, reboot, disable Rescue in Hetzner Robot. + 3. Watch with: hal status +EOF diff --git a/src/hetzner_arch_luks/ssh.py b/src/hetzner_arch_luks/ssh.py new file mode 100644 index 0000000..7038999 --- /dev/null +++ b/src/hetzner_arch_luks/ssh.py @@ -0,0 +1,145 @@ +"""SSH helpers using OpenSSH ControlMaster for connection reuse. + +The `SshSession` context manager opens a single SSH connection on enter +(interactive: password / host key accept happens here once) and then runs +follow-up commands over the same multiplexed channel without re-auth. + +We deliberately wrap the OpenSSH client rather than using a library like +paramiko so the user's existing config (~/.ssh/config, agent, key files, +known_hosts) just works. +""" +from __future__ import annotations + +import os +import shutil +import socket +import subprocess +import tempfile +import time + + +def remove_stale_known_hosts(host: str) -> None: + """Drop any cached host key for `host`. + + Each Hetzner rescue activation generates a fresh host key, so a stale + entry would otherwise block the connection with a MITM warning. + """ + known = os.path.expanduser("~/.ssh/known_hosts") + if not os.path.exists(known): + return + subprocess.run( + ["ssh-keygen", "-f", known, "-R", host], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + +def tcp_reachable(host: str, port: int, timeout: float = 3) -> bool: + try: + with socket.create_connection((host, port), timeout=timeout): + return True + except (OSError, socket.timeout): + return False + + +def wait_for_port(host: str, port: int = 22, timeout: int = 300, interval: int = 2) -> bool: + """Block until host:port accepts TCP or `timeout` elapses.""" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if tcp_reachable(host, port, timeout=2): + return True + time.sleep(interval) + return False + + +class SshSession: + """Persistent SSH connection to one host via OpenSSH ControlMaster. 
class SshSession:
    """Persistent SSH connection to one host via OpenSSH ControlMaster.

    Use as a context manager. The master is opened by running a no-op remote
    command during __enter__; this is where interactive prompts (password,
    host key acceptance) happen. Subsequent `run()` calls reuse the cached
    connection.

    Example:
        with SshSession("rescue.example.com") as ssh:
            ssh.run("uname -a")
            ssh.run("cat", input_=b"hello")
            ssh.run("/bin/bash", tty=True)   # interactive shell
    """

    def __init__(self, host: str, user: str = "root"):
        self.host = host
        self.user = user
        # Private temp dir holding the control socket; both are None while
        # no session is open.
        self._tmpdir: str | None = None
        self._sock: str | None = None

    # ---- context management -------------------------------------------------

    def __enter__(self) -> "SshSession":
        """Open the ControlMaster connection and return self.

        Raises subprocess.CalledProcessError if the initial `ssh` exits
        non-zero (auth failure, host unreachable, ...). The temp dir and any
        half-opened master are cleaned up before the error propagates,
        because __exit__ is not invoked when __enter__ raises.
        """
        self._tmpdir = tempfile.mkdtemp(prefix="hal-ssh-")
        self._sock = os.path.join(self._tmpdir, "ctl")
        try:
            remove_stale_known_hosts(self.host)
            # Open the master with a quick no-op. Auth (and any TTY prompts)
            # happen right here. After this returns, the socket at self._sock
            # is live and follow-up ssh invocations reusing it skip auth.
            subprocess.run(
                [
                    "ssh",
                    "-o", "ControlMaster=auto",
                    "-o", f"ControlPath={self._sock}",
                    "-o", "ControlPersist=10m",
                    "-o", "StrictHostKeyChecking=accept-new",
                    "-o", "ServerAliveInterval=30",
                    f"{self.user}@{self.host}",
                    "true",
                ],
                check=True,
            )
        except BaseException:
            # BaseException on purpose: Ctrl-C at the password prompt must
            # not leave the temp dir behind either.
            self._close()
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Shut the master connection down and remove the temp dir."""
        self._close()

    def _close(self) -> None:
        """Stop the master (if its socket exists) and delete the temp dir."""
        if self._sock and os.path.exists(self._sock):
            subprocess.run(
                [
                    "ssh", "-o", f"ControlPath={self._sock}",
                    "-O", "exit", f"{self.user}@{self.host}",
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        if self._tmpdir and os.path.isdir(self._tmpdir):
            shutil.rmtree(self._tmpdir, ignore_errors=True)
        self._tmpdir = None
        self._sock = None

    # ---- remote execution ---------------------------------------------------

    def run(
        self,
        remote_cmd: str,
        *,
        tty: bool = False,
        input_: bytes | None = None,
        check: bool = True,
        capture: bool = False,
    ) -> subprocess.CompletedProcess:
        """Run `remote_cmd` on the remote host over the multiplexed channel.

        remote_cmd : Shell command(s) as a single string. Newlines OK — the
                     remote shell parses them as multiple statements.
        tty        : Allocate a remote pseudo-tty (needed for interactive
                     tools like `bash` or things using /dev/tty).
        input_     : Bytes to feed to the remote command's stdin. Mutually
                     exclusive with tty (no terminal if stdin is a pipe).
        check      : Raise CalledProcessError on non-zero exit.
        capture    : Capture stdout/stderr in the returned CompletedProcess
                     instead of inheriting the parent's.

        Raises ValueError when both `tty` and `input_` are given.
        """
        if tty and input_ is not None:
            raise ValueError("tty=True is incompatible with feeding stdin via input_")
        cmd = ["ssh"]
        if self._sock is not None:
            cmd += ["-o", f"ControlPath={self._sock}"]
        # Outside an open session there is no control socket; ssh then makes
        # a plain connection rather than being handed "ControlPath=None".
        if tty:
            cmd += ["-t"]
        cmd += [f"{self.user}@{self.host}", remote_cmd]
        kwargs: dict = {"check": check}
        if input_ is not None:
            kwargs["input"] = input_
        if capture:
            kwargs["capture_output"] = True
        return subprocess.run(cmd, **kwargs)