Improved updater robustness (#2046)
* git ping * cleanup overlay init * separate update available check from fetch * cleanup setting params * only fetch neos update on android * move that * type hints * lightweight update check with git ls-remote * git fetch dry run * cleanup
albatross
parent
cf46de13d2
commit
722a440eb6
|
@ -32,13 +32,14 @@ import fcntl
|
|||
import time
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
from common.hardware import ANDROID
|
||||
from common.basedir import BASEDIR
|
||||
from common.params import Params
|
||||
from selfdrive.swaglog import cloudlog
|
||||
from selfdrive.controls.lib.alertmanager import set_offroad_alert
|
||||
|
||||
TEST_IP = os.getenv("UPDATER_TEST_IP", "8.8.8.8")
|
||||
LOCK_FILE = os.getenv("UPDATER_LOCK_FILE", "/tmp/safe_staging_overlay.lock")
|
||||
STAGING_ROOT = os.getenv("UPDATER_STAGING_ROOT", "/data/safe_staging")
|
||||
|
||||
|
@ -60,7 +61,7 @@ class WaitTimeHelper:
|
|||
signal.signal(signal.SIGINT, self.graceful_shutdown)
|
||||
signal.signal(signal.SIGHUP, self.update_now)
|
||||
|
||||
def graceful_shutdown(self, signum, frame):
|
||||
def graceful_shutdown(self, signum: int, frame) -> None:
|
||||
# umount -f doesn't appear effective in avoiding "device busy" on NEOS,
|
||||
# so don't actually die until the next convenient opportunity in main().
|
||||
cloudlog.info("caught SIGINT/SIGTERM, dismounting overlay at next opportunity")
|
||||
|
@ -73,35 +74,42 @@ class WaitTimeHelper:
|
|||
self.shutdown = True
|
||||
self.ready_event.set()
|
||||
|
||||
def update_now(self, signum, frame):
|
||||
def update_now(self, signum: int, frame) -> None:
|
||||
cloudlog.info("caught SIGHUP, running update check immediately")
|
||||
self.ready_event.set()
|
||||
|
||||
def sleep(self, t):
|
||||
def sleep(self, t: float) -> None:
|
||||
self.ready_event.wait(timeout=t)
|
||||
|
||||
|
||||
def run(cmd, cwd=None, low_priority=False):
|
||||
def run(cmd: List[str], cwd: Optional[str] = None, low_priority: bool = False):
|
||||
if low_priority:
|
||||
cmd = ["nice", "-n", "19"] + cmd
|
||||
return subprocess.check_output(cmd, cwd=cwd, stderr=subprocess.STDOUT, encoding='utf8')
|
||||
|
||||
|
||||
def set_consistent_flag(consistent):
|
||||
os.system("sync")
|
||||
def set_consistent_flag(consistent: bool) -> None:
|
||||
os.sync()
|
||||
consistent_file = Path(os.path.join(FINALIZED, ".overlay_consistent"))
|
||||
if consistent:
|
||||
consistent_file.touch()
|
||||
elif not consistent and consistent_file.exists():
|
||||
consistent_file.unlink()
|
||||
os.system("sync")
|
||||
os.sync()
|
||||
|
||||
|
||||
def set_update_available_params(new_version):
|
||||
def set_params(new_version: bool, failed_count: int, exception: Optional[str]) -> None:
|
||||
params = Params()
|
||||
|
||||
t = datetime.datetime.utcnow().isoformat()
|
||||
params.put("LastUpdateTime", t.encode('utf8'))
|
||||
params.put("UpdateFailedCount", str(failed_count))
|
||||
if failed_count == 0:
|
||||
t = datetime.datetime.utcnow().isoformat()
|
||||
params.put("LastUpdateTime", t.encode('utf8'))
|
||||
|
||||
if exception is None:
|
||||
params.delete("LastUpdateException")
|
||||
else:
|
||||
params.put("LastUpdateException", exception)
|
||||
|
||||
if new_version:
|
||||
try:
|
||||
|
@ -114,13 +122,7 @@ def set_update_available_params(new_version):
|
|||
params.put("UpdateAvailable", "1")
|
||||
|
||||
|
||||
def dismount_ovfs():
|
||||
if os.path.ismount(OVERLAY_MERGED):
|
||||
cloudlog.error("unmounting existing overlay")
|
||||
run(["umount", "-l", OVERLAY_MERGED])
|
||||
|
||||
|
||||
def setup_git_options(cwd):
|
||||
def setup_git_options(cwd: str) -> None:
|
||||
# We sync FS object atimes (which NEOS doesn't use) and mtimes, but ctimes
|
||||
# are outside user control. Make sure Git is set up to ignore system ctimes,
|
||||
# because they change when we make hard links during finalize. Otherwise,
|
||||
|
@ -134,66 +136,128 @@ def setup_git_options(cwd):
|
|||
("core.checkStat", "minimal"),
|
||||
]
|
||||
for option, value in git_cfg:
|
||||
try:
|
||||
ret = run(["git", "config", "--get", option], cwd)
|
||||
config_ok = ret.strip() == value
|
||||
except subprocess.CalledProcessError:
|
||||
config_ok = False
|
||||
|
||||
if not config_ok:
|
||||
cloudlog.info(f"Setting git '{option}' to '{value}'")
|
||||
run(["git", "config", option, value], cwd)
|
||||
run(["git", "config", option, value], cwd)
|
||||
|
||||
|
||||
def init_ovfs():
|
||||
def dismount_overlay() -> None:
|
||||
if os.path.ismount(OVERLAY_MERGED):
|
||||
cloudlog.info("unmounting existing overlay")
|
||||
run(["umount", "-l", OVERLAY_MERGED])
|
||||
|
||||
|
||||
def init_overlay() -> None:
|
||||
|
||||
overlay_init_file = Path(os.path.join(BASEDIR, ".overlay_init"))
|
||||
|
||||
# Re-create the overlay if BASEDIR/.git has changed since we created the overlay
|
||||
if overlay_init_file.is_file():
|
||||
git_dir_path = os.path.join(BASEDIR, ".git")
|
||||
new_files = run(["find", git_dir_path, "-newer", str(overlay_init_file)])
|
||||
if not len(new_files.splitlines()):
|
||||
# A valid overlay already exists
|
||||
return
|
||||
else:
|
||||
cloudlog.info(".git directory changed, recreating overlay")
|
||||
|
||||
cloudlog.info("preparing new safe staging area")
|
||||
|
||||
Params().put("UpdateAvailable", "0")
|
||||
|
||||
set_consistent_flag(False)
|
||||
|
||||
dismount_ovfs()
|
||||
dismount_overlay()
|
||||
if os.path.isdir(STAGING_ROOT):
|
||||
shutil.rmtree(STAGING_ROOT)
|
||||
|
||||
for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED, FINALIZED]:
|
||||
for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED]:
|
||||
os.mkdir(dirname, 0o755)
|
||||
|
||||
if not os.lstat(BASEDIR).st_dev == os.lstat(OVERLAY_MERGED).st_dev:
|
||||
if os.lstat(BASEDIR).st_dev != os.lstat(OVERLAY_MERGED).st_dev:
|
||||
raise RuntimeError("base and overlay merge directories are on different filesystems; not valid for overlay FS!")
|
||||
|
||||
# Remove consistent flag from current BASEDIR so it's not copied over
|
||||
if os.path.isfile(os.path.join(BASEDIR, ".overlay_consistent")):
|
||||
os.remove(os.path.join(BASEDIR, ".overlay_consistent"))
|
||||
|
||||
# Leave a timestamped canary in BASEDIR to check at startup. The device clock
|
||||
# should be correct by the time we get here. If the init file disappears, or
|
||||
# critical mtimes in BASEDIR are newer than .overlay_init, continue.sh can
|
||||
# assume that BASEDIR has been used for local development or otherwise modified,
|
||||
# and skips the update activation attempt.
|
||||
Path(os.path.join(BASEDIR, ".overlay_init")).touch()
|
||||
consistent_file = Path(os.path.join(BASEDIR, ".overlay_consistent"))
|
||||
if consistent_file.is_file():
|
||||
consistent_file.unlink()
|
||||
overlay_init_file.touch()
|
||||
|
||||
os.system("sync")
|
||||
os.sync()
|
||||
overlay_opts = f"lowerdir={BASEDIR},upperdir={OVERLAY_UPPER},workdir={OVERLAY_METADATA}"
|
||||
run(["mount", "-t", "overlay", "-o", overlay_opts, "none", OVERLAY_MERGED])
|
||||
|
||||
|
||||
def finalize_from_ovfs():
|
||||
def finalize_update() -> None:
|
||||
"""Take the current OverlayFS merged view and finalize a copy outside of
|
||||
OverlayFS, ready to be swapped-in at BASEDIR. Copy using shutil.copytree"""
|
||||
|
||||
# Remove the update ready flag and any old updates
|
||||
cloudlog.info("creating finalized version of the overlay")
|
||||
set_consistent_flag(False)
|
||||
shutil.rmtree(FINALIZED)
|
||||
|
||||
# Copy the merged overlay view and set the update ready flag
|
||||
if os.path.exists(FINALIZED):
|
||||
shutil.rmtree(FINALIZED)
|
||||
shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True)
|
||||
|
||||
set_consistent_flag(True)
|
||||
cloudlog.info("done finalizing overlay")
|
||||
|
||||
|
||||
def attempt_update(wait_helper):
|
||||
cloudlog.info("attempting git update inside staging overlay")
|
||||
def handle_neos_update(wait_helper: WaitTimeHelper) -> None:
|
||||
with open(NEOS_VERSION, "r") as f:
|
||||
cur_neos = f.read().strip()
|
||||
|
||||
updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \
|
||||
echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip()
|
||||
|
||||
cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}")
|
||||
if cur_neos == updated_neos:
|
||||
return
|
||||
|
||||
cloudlog.info(f"Beginning background download for NEOS {updated_neos}")
|
||||
set_offroad_alert("Offroad_NeosUpdate", True)
|
||||
|
||||
updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater")
|
||||
update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json"
|
||||
|
||||
neos_downloaded = False
|
||||
start_time = time.monotonic()
|
||||
# Try to download for one day
|
||||
while not neos_downloaded and not wait_helper.shutdown and \
|
||||
(time.monotonic() - start_time < 60*60*24):
|
||||
wait_helper.ready_event.clear()
|
||||
try:
|
||||
run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True)
|
||||
neos_downloaded = True
|
||||
except subprocess.CalledProcessError:
|
||||
cloudlog.info("NEOS background download failed, retrying")
|
||||
wait_helper.sleep(120)
|
||||
|
||||
# If the download failed, we'll show the alert again when we retry
|
||||
set_offroad_alert("Offroad_NeosUpdate", False)
|
||||
if not neos_downloaded:
|
||||
raise Exception("Failed to download NEOS update")
|
||||
cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds")
|
||||
|
||||
|
||||
def check_git_fetch_result(fetch_txt):
|
||||
err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
|
||||
return len(fetch_txt) > 0 and (fetch_txt != err_msg)
|
||||
|
||||
|
||||
def check_for_update() -> Tuple[bool, bool]:
|
||||
setup_git_options(OVERLAY_MERGED)
|
||||
try:
|
||||
git_fetch_output = run(["git", "fetch", "--dry-run"], OVERLAY_MERGED, low_priority=True)
|
||||
return True, check_git_fetch_result(git_fetch_output)
|
||||
except subprocess.CalledProcessError:
|
||||
return False, False
|
||||
|
||||
|
||||
def fetch_update(wait_helper: WaitTimeHelper) -> bool:
|
||||
cloudlog.info("attempting git fetch inside staging overlay")
|
||||
|
||||
setup_git_options(OVERLAY_MERGED)
|
||||
|
||||
|
@ -203,9 +267,7 @@ def attempt_update(wait_helper):
|
|||
cur_hash = run(["git", "rev-parse", "HEAD"], OVERLAY_MERGED).rstrip()
|
||||
upstream_hash = run(["git", "rev-parse", "@{u}"], OVERLAY_MERGED).rstrip()
|
||||
new_version = cur_hash != upstream_hash
|
||||
|
||||
err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
|
||||
git_fetch_result = len(git_fetch_output) > 0 and (git_fetch_output != err_msg)
|
||||
git_fetch_result = check_git_fetch_result(git_fetch_output)
|
||||
|
||||
cloudlog.info("comparing %s to %s" % (cur_hash, upstream_hash))
|
||||
if new_version or git_fetch_result:
|
||||
|
@ -221,48 +283,15 @@ def attempt_update(wait_helper):
|
|||
]
|
||||
cloudlog.info("git reset success: %s", '\n'.join(r))
|
||||
|
||||
# Download the accompanying NEOS version if it doesn't match the current version
|
||||
with open(NEOS_VERSION, "r") as f:
|
||||
cur_neos = f.read().strip()
|
||||
|
||||
updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \
|
||||
echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip()
|
||||
|
||||
cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}")
|
||||
if cur_neos != updated_neos:
|
||||
cloudlog.info(f"Beginning background download for NEOS {updated_neos}")
|
||||
|
||||
set_offroad_alert("Offroad_NeosUpdate", True)
|
||||
updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater")
|
||||
update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json"
|
||||
|
||||
neos_downloaded = False
|
||||
start_time = time.monotonic()
|
||||
# Try to download for one day
|
||||
while (time.monotonic() - start_time < 60*60*24) and not wait_helper.shutdown:
|
||||
wait_helper.ready_event.clear()
|
||||
try:
|
||||
run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True)
|
||||
neos_downloaded = True
|
||||
break
|
||||
except subprocess.CalledProcessError:
|
||||
cloudlog.info("NEOS background download failed, retrying")
|
||||
wait_helper.sleep(120)
|
||||
|
||||
# If the download failed, we'll show the alert again when we retry
|
||||
set_offroad_alert("Offroad_NeosUpdate", False)
|
||||
if not neos_downloaded:
|
||||
raise Exception("Failed to download NEOS update")
|
||||
|
||||
cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds")
|
||||
if ANDROID:
|
||||
handle_neos_update(wait_helper)
|
||||
|
||||
# Create the finalized, ready-to-swap update
|
||||
finalize_from_ovfs()
|
||||
finalize_update()
|
||||
cloudlog.info("openpilot update successful!")
|
||||
else:
|
||||
cloudlog.info("nothing new from git at this time")
|
||||
|
||||
set_update_available_params(new_version)
|
||||
return new_version
|
||||
|
||||
|
||||
|
@ -272,7 +301,7 @@ def main():
|
|||
if params.get("DisableUpdates") == b"1":
|
||||
raise RuntimeError("updates are disabled by the DisableUpdates param")
|
||||
|
||||
if os.geteuid() != 0:
|
||||
if ANDROID and os.geteuid() != 0:
|
||||
raise RuntimeError("updated must be launched as root!")
|
||||
|
||||
# Set low io priority
|
||||
|
@ -290,45 +319,45 @@ def main():
|
|||
wait_helper = WaitTimeHelper(proc)
|
||||
wait_helper.sleep(30)
|
||||
|
||||
first_run = True
|
||||
last_fetch_time = 0
|
||||
update_failed_count = 0
|
||||
update_available = False
|
||||
overlay_initialized = False
|
||||
|
||||
# Run the update loop
|
||||
# * every 1m, do a lightweight internet/update check
|
||||
# * every 10m, do a full git fetch
|
||||
while not wait_helper.shutdown:
|
||||
update_now = wait_helper.ready_event.is_set()
|
||||
wait_helper.ready_event.clear()
|
||||
|
||||
# Check for internet every 30s
|
||||
# Don't run updater while onroad or if the time's wrong
|
||||
time_wrong = datetime.datetime.utcnow().year < 2019
|
||||
ping_failed = os.system(f"ping -W 4 -c 1 {TEST_IP}") != 0
|
||||
if ping_failed or time_wrong:
|
||||
is_onroad = params.get("IsOffroad") != b"1"
|
||||
if is_onroad or time_wrong:
|
||||
wait_helper.sleep(30)
|
||||
cloudlog.info("not running updater, not offroad")
|
||||
continue
|
||||
|
||||
# Attempt an update
|
||||
exception = None
|
||||
new_version = False
|
||||
update_failed_count += 1
|
||||
try:
|
||||
# Re-create the overlay if BASEDIR/.git has changed since we created the overlay
|
||||
if overlay_initialized:
|
||||
overlay_init_fn = os.path.join(BASEDIR, ".overlay_init")
|
||||
git_dir_path = os.path.join(BASEDIR, ".git")
|
||||
new_files = run(["find", git_dir_path, "-newer", overlay_init_fn])
|
||||
init_overlay()
|
||||
|
||||
if len(new_files.splitlines()):
|
||||
cloudlog.info(".git directory changed, recreating overlay")
|
||||
overlay_initialized = False
|
||||
|
||||
if not overlay_initialized:
|
||||
init_ovfs()
|
||||
overlay_initialized = True
|
||||
|
||||
if params.get("IsOffroad") == b"1":
|
||||
update_available = attempt_update(wait_helper) or update_available
|
||||
internet_ok, update_available = check_for_update()
|
||||
if internet_ok and not update_available:
|
||||
update_failed_count = 0
|
||||
if not update_available and os.path.isdir(NEOSUPDATE_DIR):
|
||||
shutil.rmtree(NEOSUPDATE_DIR)
|
||||
else:
|
||||
cloudlog.info("not running updater, openpilot running")
|
||||
|
||||
# Fetch updates at most every 10 minutes
|
||||
if internet_ok and (update_now or time.monotonic() - last_fetch_time > 60*10):
|
||||
new_version = fetch_update(wait_helper)
|
||||
update_failed_count = 0
|
||||
last_fetch_time = time.monotonic()
|
||||
|
||||
if first_run and not new_version and os.path.isdir(NEOSUPDATE_DIR):
|
||||
shutil.rmtree(NEOSUPDATE_DIR)
|
||||
first_run = False
|
||||
except subprocess.CalledProcessError as e:
|
||||
cloudlog.event(
|
||||
"update process failed",
|
||||
|
@ -336,21 +365,15 @@ def main():
|
|||
output=e.output,
|
||||
returncode=e.returncode
|
||||
)
|
||||
exception = e
|
||||
overlay_initialized = False
|
||||
except Exception:
|
||||
exception = f"command failed: {e.cmd}\n{e.output}"
|
||||
except Exception as e:
|
||||
cloudlog.exception("uncaught updated exception, shouldn't happen")
|
||||
exception = str(e)
|
||||
|
||||
params.put("UpdateFailedCount", str(update_failed_count))
|
||||
if exception is None:
|
||||
params.delete("LastUpdateException")
|
||||
else:
|
||||
params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}")
|
||||
set_params(new_version, update_failed_count, exception)
|
||||
wait_helper.sleep(60)
|
||||
|
||||
# Wait 10 minutes between update attempts
|
||||
wait_helper.sleep(60*10)
|
||||
|
||||
dismount_ovfs()
|
||||
dismount_overlay()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
Loading…
Reference in New Issue