nopenpilot/selfdrive/updated.py

394 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
# Safe Update: A simple service that waits for network access and tries to
# update every 10 minutes. It's intended to make the OP update process more
# robust against Git repository corruption. This service DOES NOT try to fix
# an already-corrupt BASEDIR Git repo, only prevent it from happening.
#
# During normal operation, both onroad and offroad, the update process makes
# no changes to the BASEDIR install of OP. All update attempts are performed
# in a disposable staging area provided by OverlayFS. It assumes the deleter
# process provides enough disk space to carry out the process.
#
# If an update succeeds, a flag is set, and the update is swapped in at the
# next reboot. If an update is interrupted or otherwise fails, the OverlayFS
# upper layer and metadata can be discarded before trying again.
#
# The swap on boot is triggered by launch_chffrplus.sh
# gated on the existence of $FINALIZED/.overlay_consistent and also the
# existence and mtime of $BASEDIR/.overlay_init.
#
# Other than build byproducts, BASEDIR should not be modified while this
# service is running. Developers modifying code directly in BASEDIR should
# disable this service.
import os
import datetime
import subprocess
import psutil
from stat import S_ISREG, S_ISDIR, S_ISLNK, S_IMODE, ST_MODE, ST_INO, ST_UID, ST_GID, ST_ATIME, ST_MTIME
import shutil
import signal
from pathlib import Path
import fcntl
import threading
import time
from cffi import FFI
from common.basedir import BASEDIR
from common.params import Params
from selfdrive.swaglog import cloudlog
STAGING_ROOT = "/data/safe_staging"
OVERLAY_UPPER = os.path.join(STAGING_ROOT, "upper")
OVERLAY_METADATA = os.path.join(STAGING_ROOT, "metadata")
OVERLAY_MERGED = os.path.join(STAGING_ROOT, "merged")
FINALIZED = os.path.join(STAGING_ROOT, "finalized")
NICE_LOW_PRIORITY = ["nice", "-n", "19"]
SHORT = os.getenv("SHORT") is not None
# Workaround for the EON/termux build of Python having os.link removed.
ffi = FFI()
ffi.cdef("int link(const char *oldpath, const char *newpath);")
libc = ffi.dlopen(None)
class WaitTimeHelper:
ready_event = threading.Event()
shutdown = False
def __init__(self):
signal.signal(signal.SIGTERM, self.graceful_shutdown)
signal.signal(signal.SIGINT, self.graceful_shutdown)
signal.signal(signal.SIGHUP, self.update_now)
def graceful_shutdown(self, signum, frame):
# umount -f doesn't appear effective in avoiding "device busy" on EON,
# so don't actually die until the next convenient opportunity in main().
cloudlog.info("caught SIGINT/SIGTERM, dismounting overlay at next opportunity")
self.shutdown = True
self.ready_event.set()
def update_now(self, signum, frame):
cloudlog.info("caught SIGHUP, running update check immediately")
self.ready_event.set()
def wait_between_updates(ready_event):
ready_event.clear()
if SHORT:
ready_event.wait(timeout=10)
else:
ready_event.wait(timeout=60 * 10)
def link(src, dest):
# Workaround for the EON/termux build of Python having os.link removed.
return libc.link(src.encode(), dest.encode())
def run(cmd, cwd=None):
return subprocess.check_output(cmd, cwd=cwd, stderr=subprocess.STDOUT, encoding='utf8')
def remove_consistent_flag():
os.system("sync")
consistent_file = Path(os.path.join(FINALIZED, ".overlay_consistent"))
try:
consistent_file.unlink()
except FileNotFoundError:
pass
os.system("sync")
def set_consistent_flag():
consistent_file = Path(os.path.join(FINALIZED, ".overlay_consistent"))
os.system("sync")
consistent_file.touch()
os.system("sync")
def set_update_available_params(new_version=False):
params = Params()
t = datetime.datetime.utcnow().isoformat()
params.put("LastUpdateTime", t.encode('utf8'))
if new_version:
try:
with open(os.path.join(FINALIZED, "RELEASES.md"), "rb") as f:
r = f.read()
r = r[:r.find(b'\n\n')] # Slice latest release notes
params.put("ReleaseNotes", r + b"\n")
except Exception:
params.put("ReleaseNotes", "")
params.put("UpdateAvailable", "1")
def dismount_ovfs():
if os.path.ismount(OVERLAY_MERGED):
cloudlog.error("unmounting existing overlay")
run(["umount", "-l", OVERLAY_MERGED])
def setup_git_options(cwd):
# We sync FS object atimes (which EON doesn't use) and mtimes, but ctimes
# are outside user control. Make sure Git is set up to ignore system ctimes,
# because they change when we make hard links during finalize. Otherwise,
# there is a lot of unnecessary churn. This appears to be a common need on
# OSX as well: https://www.git-tower.com/blog/make-git-rebase-safe-on-osx/
try:
trustctime = run(["git", "config", "--get", "core.trustctime"], cwd)
trustctime_set = (trustctime.strip() == "false")
except subprocess.CalledProcessError:
trustctime_set = False
if not trustctime_set:
cloudlog.info("Setting core.trustctime false")
run(["git", "config", "core.trustctime", "false"], cwd)
# We are temporarily using copytree to copy the directory, which also changes
# inode numbers. Ignore those changes too.
try:
checkstat = run(["git", "config", "--get", "core.checkStat"], cwd)
checkstat_set = (checkstat.strip() == "minimal")
except subprocess.CalledProcessError:
checkstat_set = False
if not checkstat_set:
cloudlog.info("Setting core.checkState minimal")
run(["git", "config", "core.checkStat", "minimal"], cwd)
def init_ovfs():
cloudlog.info("preparing new safe staging area")
Params().put("UpdateAvailable", "0")
remove_consistent_flag()
dismount_ovfs()
if os.path.isdir(STAGING_ROOT):
shutil.rmtree(STAGING_ROOT)
for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED, FINALIZED]:
os.mkdir(dirname, 0o755)
if not os.lstat(BASEDIR).st_dev == os.lstat(OVERLAY_MERGED).st_dev:
raise RuntimeError("base and overlay merge directories are on different filesystems; not valid for overlay FS!")
# Remove consistent flag from current BASEDIR so it's not copied over
if os.path.isfile(os.path.join(BASEDIR, ".overlay_consistent")):
os.remove(os.path.join(BASEDIR, ".overlay_consistent"))
# Leave a timestamped canary in BASEDIR to check at startup. The EON clock
# should be correct by the time we get here. If the init file disappears, or
# critical mtimes in BASEDIR are newer than .overlay_init, continue.sh can
# assume that BASEDIR has used for local development or otherwise modified,
# and skips the update activation attempt.
Path(os.path.join(BASEDIR, ".overlay_init")).touch()
overlay_opts = f"lowerdir={BASEDIR},upperdir={OVERLAY_UPPER},workdir={OVERLAY_METADATA}"
run(["mount", "-t", "overlay", "-o", overlay_opts, "none", OVERLAY_MERGED])
def inodes_in_tree(search_dir):
"""Given a search root, produce a dictionary mapping of inodes to relative
pathnames of regular files (no directories, symlinks, or special files)."""
inode_map = {}
for root, _, files in os.walk(search_dir, topdown=True):
for file_name in files:
full_path_name = os.path.join(root, file_name)
st = os.lstat(full_path_name)
if S_ISREG(st[ST_MODE]):
inode_map[st[ST_INO]] = full_path_name
return inode_map
def dup_ovfs_object(inode_map, source_obj, target_dir):
"""Given a relative pathname to copy, and a new target root, duplicate the
source object in the target root, using hardlinks for regular files."""
source_full_path = os.path.join(OVERLAY_MERGED, source_obj)
st = os.lstat(source_full_path)
target_full_path = os.path.join(target_dir, source_obj)
if S_ISREG(st[ST_MODE]):
# Hardlink all regular files; ownership and permissions are shared.
link(inode_map[st[ST_INO]], target_full_path)
else:
# Recreate all directories and symlinks; copy ownership and permissions.
if S_ISDIR(st[ST_MODE]):
os.mkdir(os.path.join(FINALIZED, source_obj), S_IMODE(st[ST_MODE]))
elif S_ISLNK(st[ST_MODE]):
os.symlink(os.readlink(source_full_path), target_full_path)
os.chmod(target_full_path, S_IMODE(st[ST_MODE]), follow_symlinks=False)
else:
# Ran into a FIFO, socket, etc. Should not happen in OP install dir.
# Ignore without copying for the time being; revisit later if needed.
cloudlog.error("can't copy this file type: %s" % source_full_path)
os.chown(target_full_path, st[ST_UID], st[ST_GID], follow_symlinks=False)
# Sync target mtimes to the cached lstat() value from each source object.
# Restores shared inode mtimes after linking, fixes symlinks and dirs.
os.utime(target_full_path, (st[ST_ATIME], st[ST_MTIME]), follow_symlinks=False)
def finalize_from_ovfs_hardlink():
"""Take the current OverlayFS merged view and finalize a copy outside of
OverlayFS, ready to be swapped-in at BASEDIR. Copy using hardlinks"""
cloudlog.info("creating finalized version of the overlay")
# The "copy" is done with hardlinks, but since the OverlayFS merge looks
# like a different filesystem, and hardlinks can't cross filesystems, we
# have to borrow a source pathname from the upper or lower layer.
inode_map = inodes_in_tree(BASEDIR)
inode_map.update(inodes_in_tree(OVERLAY_UPPER))
shutil.rmtree(FINALIZED)
os.umask(0o077)
os.mkdir(FINALIZED)
for root, dirs, files in os.walk(OVERLAY_MERGED, topdown=True):
for obj_name in dirs:
relative_path_name = os.path.relpath(os.path.join(root, obj_name), OVERLAY_MERGED)
dup_ovfs_object(inode_map, relative_path_name, FINALIZED)
for obj_name in files:
relative_path_name = os.path.relpath(os.path.join(root, obj_name), OVERLAY_MERGED)
dup_ovfs_object(inode_map, relative_path_name, FINALIZED)
cloudlog.info("done finalizing overlay")
def finalize_from_ovfs_copy():
"""Take the current OverlayFS merged view and finalize a copy outside of
OverlayFS, ready to be swapped-in at BASEDIR. Copy using shutil.copytree"""
cloudlog.info("creating finalized version of the overlay")
shutil.rmtree(FINALIZED)
shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True)
cloudlog.info("done finalizing overlay")
def attempt_update():
cloudlog.info("attempting git update inside staging overlay")
setup_git_options(OVERLAY_MERGED)
git_fetch_output = run(NICE_LOW_PRIORITY + ["git", "fetch"], OVERLAY_MERGED)
cloudlog.info("git fetch success: %s", git_fetch_output)
cur_hash = run(["git", "rev-parse", "HEAD"], OVERLAY_MERGED).rstrip()
upstream_hash = run(["git", "rev-parse", "@{u}"], OVERLAY_MERGED).rstrip()
new_version = cur_hash != upstream_hash
err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
git_fetch_result = len(git_fetch_output) > 0 and (git_fetch_output != err_msg)
cloudlog.info("comparing %s to %s" % (cur_hash, upstream_hash))
if new_version or git_fetch_result:
cloudlog.info("Running update")
if new_version:
cloudlog.info("git reset in progress")
r = [
run(NICE_LOW_PRIORITY + ["git", "reset", "--hard", "@{u}"], OVERLAY_MERGED),
run(NICE_LOW_PRIORITY + ["git", "clean", "-xdf"], OVERLAY_MERGED),
run(NICE_LOW_PRIORITY + ["git", "submodule", "init"], OVERLAY_MERGED),
run(NICE_LOW_PRIORITY + ["git", "submodule", "update"], OVERLAY_MERGED),
]
cloudlog.info("git reset success: %s", '\n'.join(r))
# Un-set the validity flag to prevent the finalized tree from being
# activated later if the finalize step is interrupted
remove_consistent_flag()
finalize_from_ovfs_copy()
# Make sure the validity flag lands on disk LAST, only when the local git
# repo and OP install are in a consistent state.
set_consistent_flag()
cloudlog.info("update successful!")
else:
cloudlog.info("nothing new from git at this time")
set_update_available_params(new_version=new_version)
def main():
update_failed_count = 0
overlay_init_done = False
params = Params()
if params.get("DisableUpdates") == b"1":
raise RuntimeError("updates are disabled by param")
if not os.geteuid() == 0:
raise RuntimeError("updated must be launched as root!")
# Set low io priority
p = psutil.Process()
if psutil.LINUX:
p.ionice(psutil.IOPRIO_CLASS_BE, value=7)
ov_lock_fd = open('/tmp/safe_staging_overlay.lock', 'w')
try:
fcntl.flock(ov_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except IOError:
raise RuntimeError("couldn't get overlay lock; is another updated running?")
# Wait a short time before our first update attempt
# Avoids race with IsOffroad not being set, reduces manager startup load
time.sleep(30)
wait_helper = WaitTimeHelper()
while True:
update_failed_count += 1
time_wrong = datetime.datetime.utcnow().year < 2019
ping_failed = subprocess.call(["ping", "-W", "4", "-c", "1", "8.8.8.8"])
# Wait until we have a valid datetime to initialize the overlay
if not (ping_failed or time_wrong):
try:
# If the git directory has modifcations after we created the overlay
# we need to recreate the overlay
if overlay_init_done:
overlay_init_fn = os.path.join(BASEDIR, ".overlay_init")
git_dir_path = os.path.join(BASEDIR, ".git")
new_files = run(["find", git_dir_path, "-newer", overlay_init_fn])
if len(new_files.splitlines()):
cloudlog.info(".git directory changed, recreating overlay")
overlay_init_done = False
if not overlay_init_done:
init_ovfs()
overlay_init_done = True
if params.get("IsOffroad") == b"1":
attempt_update()
update_failed_count = 0
else:
cloudlog.info("not running updater, openpilot running")
except subprocess.CalledProcessError as e:
cloudlog.event(
"update process failed",
cmd=e.cmd,
output=e.output,
returncode=e.returncode
)
overlay_init_done = False
except Exception:
cloudlog.exception("uncaught updated exception, shouldn't happen")
params.put("UpdateFailedCount", str(update_failed_count))
wait_between_updates(wait_helper.ready_event)
if wait_helper.shutdown:
break
# We've been signaled to shut down
dismount_ovfs()
if __name__ == "__main__":
main()