nopenpilot/selfdrive/updated.py

474 lines
17 KiB
Python
Raw Normal View History

2020-01-17 12:03:22 -07:00
#!/usr/bin/env python3
# Safe Update: A simple service that waits for network access and tries to
# update every 10 minutes. It's intended to make the OP update process more
# robust against Git repository corruption. This service DOES NOT try to fix
# an already-corrupt BASEDIR Git repo, only prevent it from happening.
#
# During normal operation, both onroad and offroad, the update process makes
# no changes to the BASEDIR install of OP. All update attempts are performed
# in a disposable staging area provided by OverlayFS. It assumes the deleter
# process provides enough disk space to carry out the process.
#
# If an update succeeds, a flag is set, and the update is swapped in at the
# next reboot. If an update is interrupted or otherwise fails, the OverlayFS
# upper layer and metadata can be discarded before trying again.
#
# The swap on boot is triggered by launch_chffrplus.sh
# gated on the existence of $FINALIZED/.overlay_consistent and also the
# existence and mtime of $BASEDIR/.overlay_init.
#
# Other than build byproducts, BASEDIR should not be modified while this
# service is running. Developers modifying code directly in BASEDIR should
# disable this service.
import os
import datetime
import subprocess
import psutil
import shutil
import signal
import fcntl
2020-08-12 12:39:21 -06:00
import time
2020-01-17 12:03:22 -07:00
import threading
from pathlib import Path
from typing import List, Tuple, Optional
2020-01-17 12:03:22 -07:00
from common.basedir import BASEDIR
from common.markdown import parse_markdown
2020-01-17 12:03:22 -07:00
from common.params import Params
from selfdrive.hardware import EON, TICI, HARDWARE
2020-01-17 12:03:22 -07:00
from selfdrive.swaglog import cloudlog
2020-08-12 12:39:21 -06:00
from selfdrive.controls.lib.alertmanager import set_offroad_alert
from selfdrive.version import is_tested_branch
2020-01-17 12:03:22 -07:00
# Path of the lock file main() flock()s to ensure a single running instance.
# Overridable through the UPDATER_LOCK_FILE environment variable.
LOCK_FILE = os.getenv("UPDATER_LOCK_FILE", "/tmp/safe_staging_overlay.lock")
# Root directory that holds the whole OverlayFS staging area.
# Overridable through the UPDATER_STAGING_ROOT environment variable.
STAGING_ROOT = os.getenv("UPDATER_STAGING_ROOT", "/data/safe_staging")
2020-01-17 12:03:22 -07:00
2020-08-12 12:39:21 -06:00
# Directory that main() removes after the first fetch that finds no new version.
# NOTE(review): presumably holds downloaded NEOS update files - confirm.
# Overridable through the UPDATER_NEOSUPDATE_DIR environment variable.
NEOSUPDATE_DIR = os.getenv("UPDATER_NEOSUPDATE_DIR", "/data/neoupdate")
2020-01-17 12:03:22 -07:00
# OverlayFS layout under STAGING_ROOT (see the mount options in init_overlay()):
# "upper" is the upperdir, "metadata" is the workdir, "merged" is the mount point.
OVERLAY_UPPER = os.path.join(STAGING_ROOT, "upper")
OVERLAY_METADATA = os.path.join(STAGING_ROOT, "metadata")
OVERLAY_MERGED = os.path.join(STAGING_ROOT, "merged")
# Plain copy of the merged view made by finalize_update(); it carries the
# .overlay_consistent flag file once the copy is complete.
FINALIZED = os.path.join(STAGING_ROOT, "finalized")
DAYS_NO_CONNECTIVITY_MAX = 14 # do not allow to engage after this many days
DAYS_NO_CONNECTIVITY_PROMPT = 10 # send an offroad prompt after this many days
2020-01-17 12:03:22 -07:00
class WaitTimeHelper:
  """Interruptible sleep helper that also installs the updater's signal handlers.

  SIGTERM and SIGINT request a graceful shutdown: the ``shutdown`` flag is set
  and is meant to be polled by the caller. SIGHUP requests an immediate update
  check. All three signals set ``ready_event`` so a pending sleep() returns
  early.
  """

  def __init__(self, proc):
    """Store the process handle and register the signal handlers.

    proc must expose ``children(recursive=True)`` returning objects that have
    ``send_signal(signum)``; main() passes a ``psutil.Process``.
    """
    self.proc = proc
    self.ready_event = threading.Event()
    self.shutdown = False
    signal.signal(signal.SIGTERM, self.graceful_shutdown)
    signal.signal(signal.SIGINT, self.graceful_shutdown)
    signal.signal(signal.SIGHUP, self.update_now)

  def graceful_shutdown(self, signum: int, frame) -> None:
    """Signal handler: forward the signal to child processes and flag shutdown."""
    # umount -f doesn't appear effective in avoiding "device busy" on NEOS,
    # so don't actually die until the next convenient opportunity in main().
    cloudlog.info("caught SIGINT/SIGTERM, dismounting overlay at next opportunity")

    # forward the signal to all our child processes
    for child in self.proc.children(recursive=True):
      try:
        child.send_signal(signum)
      except psutil.NoSuchProcess:
        # The child exited between enumeration and delivery. Nothing is left to
        # signal, and raising out of a signal handler would skip the shutdown
        # flag below.
        continue

    self.shutdown = True
    self.ready_event.set()

  def update_now(self, signum: int, frame) -> None:
    """Signal handler: wake any pending sleep() so an update check runs now."""
    cloudlog.info("caught SIGHUP, running update check immediately")
    self.ready_event.set()

  def sleep(self, t: float) -> None:
    """Block for up to t seconds, returning early if ready_event is set."""
    self.ready_event.wait(timeout=t)
2020-01-17 12:03:22 -07:00
def run(cmd: List[str], cwd: Optional[str] = None, low_priority: bool = False):
  """Run cmd and return its combined stdout/stderr decoded as UTF-8 text.

  cwd: working directory for the command, or None for the current one.
  low_priority: when true, the command is launched through ``nice -n 19``.

  Raises subprocess.CalledProcessError if the command exits non-zero.
  """
  argv = ["nice", "-n", "19", *cmd] if low_priority else cmd
  return subprocess.check_output(argv, cwd=cwd, stderr=subprocess.STDOUT, encoding='utf8')
def set_consistent_flag(consistent: bool) -> None:
  """Create or remove the FINALIZED/.overlay_consistent flag file.

  The filesystem is synced both before and after the flag changes, so the flag
  never reaches disk ahead of the data it vouches for.
  """
  flag_path = Path(FINALIZED) / ".overlay_consistent"

  os.sync()
  if consistent:
    flag_path.touch()
  else:
    flag_path.unlink(missing_ok=True)
  os.sync()
2020-01-17 12:03:22 -07:00
def set_params(new_version: bool, failed_count: int, exception: Optional[str]) -> None:
  """Publish the outcome of an update attempt to Params and the offroad alerts.

  new_version: a new version was fetched; release notes are stored and
    UpdateAvailable is set.
  failed_count: number of consecutive failed attempts (0 means success).
  exception: text describing the last failure, or None to clear it.
  """
  params = Params()
  params.put("UpdateFailedCount", str(failed_count))

  # On success, stamp the current time. Otherwise fall back to the stored
  # timestamp, keeping "now" if it is missing or unparsable.
  last_update = datetime.datetime.utcnow()
  if failed_count == 0:
    params.put("LastUpdateTime", last_update.isoformat().encode('utf8'))
  else:
    try:
      stored = params.get("LastUpdateTime", encoding='utf8')
      last_update = datetime.datetime.fromisoformat(stored)
    except (TypeError, ValueError):
      pass

  if exception is not None:
    params.put("LastUpdateException", exception)
  else:
    params.delete("LastUpdateException")

  # Write out release notes for new versions
  if new_version:
    try:
      with open(os.path.join(FINALIZED, "RELEASES.md"), "rb") as releases:
        # Everything up to the first blank line is the latest release entry
        latest_notes = releases.read().split(b'\n\n', 1)[0]
      try:
        params.put("ReleaseNotes", parse_markdown(latest_notes.decode("utf-8")))
      except Exception:
        # Rendering failed, so store the raw text instead
        params.put("ReleaseNotes", latest_notes + b"\n")
    except Exception:
      params.put("ReleaseNotes", "")
    params.put_bool("UpdateAvailable", True)

  # Handle user prompt: clear every alert, then raise at most one of them
  for alert_name in ("Offroad_UpdateFailed", "Offroad_ConnectivityNeeded", "Offroad_ConnectivityNeededPrompt"):
    set_offroad_alert(alert_name, False)

  days_since_update = (datetime.datetime.utcnow() - last_update).days

  if failed_count > 15 and exception is not None:
    extra_text = "Ensure the software is correctly installed" if is_tested_branch() else exception
    set_offroad_alert("Offroad_UpdateFailed", True, extra_text=extra_text)
  elif days_since_update > DAYS_NO_CONNECTIVITY_MAX and failed_count > 1:
    set_offroad_alert("Offroad_ConnectivityNeeded", True)
  elif days_since_update > DAYS_NO_CONNECTIVITY_PROMPT:
    remaining = max(DAYS_NO_CONNECTIVITY_MAX - days_since_update, 1)
    plural = '' if remaining == 1 else 's'
    set_offroad_alert("Offroad_ConnectivityNeededPrompt", True, extra_text=f"{remaining} day{plural}.")
2020-01-17 12:03:22 -07:00
def setup_git_options(cwd: str) -> None:
  """Set the git config options the updater relies on in the repo at cwd.

  atimes and mtimes are synced, but ctimes are outside user control and change
  while the update is finalized. The finalize copy also changes inode numbers.
  Git is told to ignore both, avoiding a lot of unnecessary churn. This appears
  to be a common need on OSX as well:
  https://www.git-tower.com/blog/make-git-rebase-safe-on-osx/
  """
  options = {
    "core.trustctime": "false",
    "core.checkStat": "minimal",
  }
  for key, setting in options.items():
    run(["git", "config", key, setting], cwd)
def dismount_overlay() -> None:
  """Lazily unmount the merged overlay if it is mounted (via sudo on TICI)."""
  if not os.path.ismount(OVERLAY_MERGED):
    return

  cloudlog.info("unmounting existing overlay")
  umount_cmd = ["umount", "-l", OVERLAY_MERGED]
  run(["sudo", *umount_cmd] if TICI else umount_cmd)
2020-04-16 12:55:16 -06:00
def init_overlay() -> None:
  """Ensure a valid OverlayFS staging area is mounted on top of BASEDIR.

  An existing overlay is reused when nothing under BASEDIR/.git is newer than
  BASEDIR/.overlay_init. Otherwise the staging area is wiped, recreated and
  mounted, and the current ``git diff`` is stored in the GitDiff param.

  Raises RuntimeError if BASEDIR and the staging area are on different
  filesystems, and subprocess.CalledProcessError if a helper command fails.
  """
  init_marker = Path(BASEDIR) / ".overlay_init"

  # Re-create the overlay if BASEDIR/.git has changed since we created the overlay
  if init_marker.is_file():
    changed = run(["find", os.path.join(BASEDIR, ".git"), "-newer", str(init_marker)])
    if not changed.splitlines():
      # A valid overlay already exists
      return
    cloudlog.info(".git directory changed, recreating overlay")

  cloudlog.info("preparing new safe staging area")

  params = Params()
  params.put_bool("UpdateAvailable", False)
  set_consistent_flag(False)
  dismount_overlay()

  # Start from an empty staging root
  if TICI:
    run(["sudo", "rm", "-rf", STAGING_ROOT])
  if os.path.isdir(STAGING_ROOT):
    shutil.rmtree(STAGING_ROOT)
  for path in (STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED):
    os.mkdir(path, 0o755)

  base_dev = os.lstat(BASEDIR).st_dev
  merged_dev = os.lstat(OVERLAY_MERGED).st_dev
  if base_dev != merged_dev:
    raise RuntimeError("base and overlay merge directories are on different filesystems; not valid for overlay FS!")

  # Leave a timestamped canary in BASEDIR to check at startup. The device clock
  # should be correct by the time we get here. If the init file disappears, or
  # critical mtimes in BASEDIR are newer than .overlay_init, continue.sh can
  # assume that BASEDIR has been used for local development or otherwise
  # modified, and skips the update activation attempt.
  stale_flag = Path(BASEDIR) / ".overlay_consistent"
  if stale_flag.is_file():
    stale_flag.unlink()
  init_marker.touch()

  os.sync()

  mount_opts = f"lowerdir={BASEDIR},upperdir={OVERLAY_UPPER},workdir={OVERLAY_METADATA}"
  mount_cmd = ["mount", "-t", "overlay", "-o", mount_opts, "none", OVERLAY_MERGED]
  if not TICI:
    run(mount_cmd)
  else:
    run(["sudo", *mount_cmd])
    run(["sudo", "chmod", "755", os.path.join(OVERLAY_METADATA, "work")])

  git_diff = run(["git", "diff"], OVERLAY_MERGED, low_priority=True)
  params.put("GitDiff", git_diff)
  cloudlog.info(f"git diff output:\n{git_diff}")
2020-01-17 12:03:22 -07:00
def finalize_update() -> None:
  """Copy the merged OverlayFS view to FINALIZED, ready to be swapped in.

  The consistent flag is cleared before the old copy is touched and is set
  again only after the new copy has been made with shutil.copytree and its git
  state has been reset.
  """
  cloudlog.info("creating finalized version of the overlay")

  # Remove the update ready flag and any old updates
  set_consistent_flag(False)
  if os.path.exists(FINALIZED):
    shutil.rmtree(FINALIZED)

  # Copy the merged overlay view, then reset the git state of the copy
  shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True)
  git_commands = (
    ["git", "reset", "--hard"],
    ["git", "submodule", "foreach", "--recursive", "git", "reset"],
  )
  for git_cmd in git_commands:
    run(git_cmd, FINALIZED)

  # Set the update ready flag
  set_consistent_flag(True)
  cloudlog.info("done finalizing overlay")
2021-10-25 16:58:26 -06:00
def handle_agnos_update(wait_helper: WaitTimeHelper) -> None:
  """Flash the AGNOS version required by the staged openpilot, if it differs.

  The required version is read by sourcing launch_env.sh inside the merged
  overlay and is compared with the version reported by HARDWARE. wait_helper
  is accepted for signature parity with handle_neos_update and is not used.
  """
  from selfdrive.hardware.tici.agnos import flash_agnos_update, get_target_slot_number

  cur_version = HARDWARE.get_os_version()
  version_script = "unset AGNOS_VERSION && source launch_env.sh && \\\necho -n $AGNOS_VERSION"
  updated_version = run(["bash", "-c", version_script], OVERLAY_MERGED).strip()

  cloudlog.info(f"AGNOS version check: {cur_version} vs {updated_version}")
  if cur_version == updated_version:
    return

  # prevent an openpilot getting swapped in with a mismatched or partially downloaded agnos
  set_consistent_flag(False)

  cloudlog.info(f"Beginning background installation for AGNOS {updated_version}")
  set_offroad_alert("Offroad_NeosUpdate", True)

  manifest_path = os.path.join(OVERLAY_MERGED, "selfdrive/hardware/tici/agnos.json")
  slot = get_target_slot_number()
  flash_agnos_update(manifest_path, slot, cloudlog)

  set_offroad_alert("Offroad_NeosUpdate", False)
def handle_neos_update(wait_helper: WaitTimeHelper) -> None:
  """Download the NEOS version required by the staged openpilot, if it differs.

  The download is retried every 120 seconds for up to one day, or until
  wait_helper reports a shutdown request.

  Raises Exception("Failed to download NEOS update") if no attempt succeeded.
  """
  from selfdrive.hardware.eon.neos import download_neos_update

  cur_neos = HARDWARE.get_os_version()
  version_script = "unset REQUIRED_NEOS_VERSION && source launch_env.sh && \\\necho -n $REQUIRED_NEOS_VERSION"
  updated_neos = run(["bash", "-c", version_script], OVERLAY_MERGED).strip()

  cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}")
  if cur_neos == updated_neos:
    return

  cloudlog.info(f"Beginning background download for NEOS {updated_neos}")
  set_offroad_alert("Offroad_NeosUpdate", True)

  update_manifest = os.path.join(OVERLAY_MERGED, "selfdrive/hardware/eon/neos.json")
  one_day = 60*60*24
  start_time = time.monotonic()
  neos_downloaded = False

  # Try to download for one day
  while not (neos_downloaded or wait_helper.shutdown) and time.monotonic() - start_time < one_day:
    wait_helper.ready_event.clear()
    try:
      download_neos_update(update_manifest, cloudlog)
    except Exception:
      cloudlog.info("NEOS background download failed, retrying")
      wait_helper.sleep(120)
    else:
      neos_downloaded = True

  # If the download failed, we'll show the alert again when we retry
  set_offroad_alert("Offroad_NeosUpdate", False)
  if not neos_downloaded:
    raise Exception("Failed to download NEOS update")

  cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds")
2021-10-25 16:58:26 -06:00
def check_git_fetch_result(fetch_txt: str) -> bool:
  """Return True when git fetch output indicates that something was fetched.

  Empty output, or output consisting solely of the known-hosts warning, counts
  as nothing fetched.
  """
  known_hosts_warning = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
  if len(fetch_txt) == 0:
    return False
  return fetch_txt != known_hosts_warning
def check_for_update() -> Tuple[bool, bool]:
  """Run a ``git fetch --dry-run`` in the overlay as a lightweight update check.

  Returns (internet_ok, update_available). A failing git command is reported
  as (False, False).
  """
  setup_git_options(OVERLAY_MERGED)
  try:
    dry_run_output = run(["git", "fetch", "--dry-run"], OVERLAY_MERGED, low_priority=True)
  except subprocess.CalledProcessError:
    return False, False
  return True, check_git_fetch_result(dry_run_output)
def fetch_update(wait_helper: WaitTimeHelper) -> bool:
  """Fetch from upstream inside the staging overlay and stage any update.

  When HEAD differs from the upstream branch, the overlay is hard-reset to
  upstream, cleaned, its submodules are updated, and the matching OS update is
  handled. The overlay is finalized whenever there is a new version or the
  fetch output indicates something was fetched.

  Returns True if HEAD differed from upstream (a new version was staged).
  Raises subprocess.CalledProcessError if a git command fails.
  """
  cloudlog.info("attempting git fetch inside staging overlay")

  setup_git_options(OVERLAY_MERGED)

  fetch_output = run(["git", "fetch"], OVERLAY_MERGED, low_priority=True)
  cloudlog.info("git fetch success: %s", fetch_output)

  cur_hash = run(["git", "rev-parse", "HEAD"], OVERLAY_MERGED).rstrip()
  upstream_hash = run(["git", "rev-parse", "@{u}"], OVERLAY_MERGED).rstrip()
  new_version = cur_hash != upstream_hash
  something_fetched = check_git_fetch_result(fetch_output)

  cloudlog.info(f"comparing {cur_hash} to {upstream_hash}")
  if not (new_version or something_fetched):
    cloudlog.info("nothing new from git at this time")
    return new_version

  cloudlog.info("Running update")

  if new_version:
    cloudlog.info("git reset in progress")
    reset_steps = (
      ["git", "reset", "--hard", "@{u}"],
      ["git", "clean", "-xdf"],
      ["git", "submodule", "init"],
      ["git", "submodule", "update"],
    )
    outputs = [run(step, OVERLAY_MERGED, low_priority=True) for step in reset_steps]
    cloudlog.info("git reset success: %s", '\n'.join(outputs))

    if EON:
      handle_neos_update(wait_helper)
    elif TICI:
      handle_agnos_update(wait_helper)

  # Create the finalized, ready-to-swap update
  finalize_update()
  cloudlog.info("openpilot update successful!")

  return new_version
2020-01-17 12:03:22 -07:00
def main() -> None:
  """Run the updater service until a shutdown signal is received.

  Exits immediately with status 0 when the DisableUpdates param is set.
  Otherwise it takes an exclusive lock on LOCK_FILE, lowers its IO priority and
  loops while offroad: a lightweight update check every minute and a full fetch
  at most every 10 minutes (or immediately on SIGHUP). The result of every
  attempt is published through set_params(). The overlay is dismounted on the
  way out.

  Raises RuntimeError if the overlay lock cannot be acquired.
  """
  import sys

  params = Params()

  if params.get_bool("DisableUpdates"):
    cloudlog.warning("updates are disabled by the DisableUpdates param")
    # sys.exit instead of the site-provided exit() builtin, which is not
    # guaranteed to exist and closes stdin as a side effect
    sys.exit(0)

  # The handle is kept open (and the lock held) for as long as main() runs
  ov_lock_fd = open(LOCK_FILE, 'w')
  try:
    fcntl.flock(ov_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
  except OSError as e:
    # don't leak the handle when another instance already holds the lock
    ov_lock_fd.close()
    raise RuntimeError("couldn't get overlay lock; is another instance running?") from e

  # Set low io priority
  proc = psutil.Process()
  if psutil.LINUX:
    proc.ionice(psutil.IOPRIO_CLASS_BE, value=7)

  # Check if we just performed an update
  if Path(os.path.join(STAGING_ROOT, "old_openpilot")).is_dir():
    cloudlog.event("update installed")

  if not params.get("InstallDate"):
    t = datetime.datetime.utcnow().isoformat()
    params.put("InstallDate", t.encode('utf8'))

  # Drop the canary so the first init_overlay() call rebuilds the overlay
  overlay_init = Path(os.path.join(BASEDIR, ".overlay_init"))
  overlay_init.unlink(missing_ok=True)

  first_run = True
  last_fetch_time = 0.0
  update_failed_count = 0

  # Set initial params for offroad alerts
  set_params(False, 0, None)

  # Wait for IsOffroad to be set before our first update attempt
  wait_helper = WaitTimeHelper(proc)
  wait_helper.sleep(30)

  # Run the update loop
  #  * every 1m, do a lightweight internet/update check
  #  * every 10m, do a full git fetch
  while not wait_helper.shutdown:
    # A set event here means SIGHUP asked for an immediate update
    update_now = wait_helper.ready_event.is_set()
    wait_helper.ready_event.clear()

    # Don't run updater while onroad or if the time's wrong
    time_wrong = datetime.datetime.utcnow().year < 2019
    is_onroad = not params.get_bool("IsOffroad")
    if is_onroad or time_wrong:
      wait_helper.sleep(30)
      cloudlog.info("not running updater, not offroad")
      continue

    # Attempt an update; the attempt counts as failed until proven otherwise
    exception = None
    new_version = False
    update_failed_count += 1
    try:
      init_overlay()

      internet_ok, update_available = check_for_update()
      if internet_ok and not update_available:
        update_failed_count = 0

      # Fetch updates at most every 10 minutes
      if internet_ok and (update_now or time.monotonic() - last_fetch_time > 60*10):
        new_version = fetch_update(wait_helper)
        update_failed_count = 0
        last_fetch_time = time.monotonic()

        if first_run and not new_version and os.path.isdir(NEOSUPDATE_DIR):
          shutil.rmtree(NEOSUPDATE_DIR)
        first_run = False
    except subprocess.CalledProcessError as e:
      cloudlog.event(
        "update process failed",
        cmd=e.cmd,
        output=e.output,
        returncode=e.returncode
      )
      exception = f"command failed: {e.cmd}\n{e.output}"
      # Force the overlay to be rebuilt on the next attempt
      overlay_init.unlink(missing_ok=True)
    except Exception as e:
      cloudlog.exception("uncaught updated exception, shouldn't happen")
      exception = str(e)
      overlay_init.unlink(missing_ok=True)

    try:
      set_params(new_version, update_failed_count, exception)
    except Exception:
      cloudlog.exception("uncaught updated exception while setting params, shouldn't happen")

    wait_helper.sleep(60)

  dismount_overlay()
2020-01-17 12:03:22 -07:00
2020-01-17 12:03:22 -07:00
# Script entry point: start the updater service when executed directly.
if __name__ == "__main__":
  main()