Improved updater robustness (#2046)

* git ping

* cleanup overlay init

* separate update available check from fetch

* cleanup setting params

* only fetch neos update on android

* move that

* type hints

* lightweight update check with git ls-remote

* git fetch dry run

* cleanup
albatross
Adeeb Shihadeh 2020-09-22 12:09:41 -07:00 committed by GitHub
parent cf46de13d2
commit 722a440eb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 142 additions and 119 deletions

View File

@ -32,13 +32,14 @@ import fcntl
import time
import threading
from pathlib import Path
from typing import List, Tuple, Optional
from common.hardware import ANDROID
from common.basedir import BASEDIR
from common.params import Params
from selfdrive.swaglog import cloudlog
from selfdrive.controls.lib.alertmanager import set_offroad_alert
TEST_IP = os.getenv("UPDATER_TEST_IP", "8.8.8.8")
LOCK_FILE = os.getenv("UPDATER_LOCK_FILE", "/tmp/safe_staging_overlay.lock")
STAGING_ROOT = os.getenv("UPDATER_STAGING_ROOT", "/data/safe_staging")
@ -60,7 +61,7 @@ class WaitTimeHelper:
signal.signal(signal.SIGINT, self.graceful_shutdown)
signal.signal(signal.SIGHUP, self.update_now)
def graceful_shutdown(self, signum, frame):
def graceful_shutdown(self, signum: int, frame) -> None:
# umount -f doesn't appear effective in avoiding "device busy" on NEOS,
# so don't actually die until the next convenient opportunity in main().
cloudlog.info("caught SIGINT/SIGTERM, dismounting overlay at next opportunity")
@ -73,35 +74,42 @@ class WaitTimeHelper:
self.shutdown = True
self.ready_event.set()
def update_now(self, signum: int, frame) -> None:
    """Signal handler (installed for SIGHUP) that requests an immediate update check.

    Setting ``ready_event`` wakes any pending ``sleep()`` call early.

    Args:
        signum: number of the signal being handled.
        frame: current stack frame passed by the ``signal`` module (unused).
    """
    cloudlog.info("caught SIGHUP, running update check immediately")
    self.ready_event.set()
def sleep(self, t: float) -> None:
    """Wait up to ``t`` seconds, returning early once ``ready_event`` is set.

    If the event is already set the call returns immediately; callers clear
    the event themselves before waiting again.

    Args:
        t: maximum time to wait, in seconds.
    """
    self.ready_event.wait(timeout=t)
def run(cmd: List[str], cwd: Optional[str] = None, low_priority: bool = False) -> str:
    """Run a command and return its combined stdout/stderr as text.

    Args:
        cmd: argument vector to execute (no shell is involved).
        cwd: working directory for the child process, or None to inherit.
        low_priority: when True, the command is wrapped in ``nice -n 19``.

    Returns:
        The command output decoded as UTF-8, with stderr merged into stdout.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status; the captured output is available on the exception.
    """
    if low_priority:
        cmd = ["nice", "-n", "19"] + cmd
    return subprocess.check_output(cmd, cwd=cwd, stderr=subprocess.STDOUT, encoding='utf8')
def set_consistent_flag(consistent: bool) -> None:
    """Create or remove the ``.overlay_consistent`` marker inside FINALIZED.

    The filesystem is synced before and after the change so the marker never
    reaches disk ahead of (or behind) the data it describes.

    Args:
        consistent: True to create the marker, False to remove it if present.
    """
    os.sync()
    consistent_file = Path(os.path.join(FINALIZED, ".overlay_consistent"))
    if consistent:
        consistent_file.touch()
    elif consistent_file.exists():
        consistent_file.unlink()
    os.sync()
def set_update_available_params(new_version):
def set_params(new_version: bool, failed_count: int, exception: Optional[str]) -> None:
params = Params()
t = datetime.datetime.utcnow().isoformat()
params.put("LastUpdateTime", t.encode('utf8'))
params.put("UpdateFailedCount", str(failed_count))
if failed_count == 0:
t = datetime.datetime.utcnow().isoformat()
params.put("LastUpdateTime", t.encode('utf8'))
if exception is None:
params.delete("LastUpdateException")
else:
params.put("LastUpdateException", exception)
if new_version:
try:
@ -114,13 +122,7 @@ def set_update_available_params(new_version):
params.put("UpdateAvailable", "1")
def dismount_ovfs() -> None:
    """Lazily unmount OVERLAY_MERGED (``umount -l``) if it is currently a mount point."""
    if os.path.ismount(OVERLAY_MERGED):
        cloudlog.error("unmounting existing overlay")
        run(["umount", "-l", OVERLAY_MERGED])
def setup_git_options(cwd):
def setup_git_options(cwd: str) -> None:
# We sync FS object atimes (which NEOS doesn't use) and mtimes, but ctimes
# are outside user control. Make sure Git is set up to ignore system ctimes,
# because they change when we make hard links during finalize. Otherwise,
@ -134,66 +136,128 @@ def setup_git_options(cwd):
("core.checkStat", "minimal"),
]
for option, value in git_cfg:
try:
ret = run(["git", "config", "--get", option], cwd)
config_ok = ret.strip() == value
except subprocess.CalledProcessError:
config_ok = False
if not config_ok:
cloudlog.info(f"Setting git '{option}' to '{value}'")
run(["git", "config", option, value], cwd)
run(["git", "config", option, value], cwd)
def init_ovfs():
def dismount_overlay() -> None:
    """Lazily unmount OVERLAY_MERGED (``umount -l``) if it is currently a mount point.

    Raises:
        subprocess.CalledProcessError: if the ``umount`` command fails.
    """
    if os.path.ismount(OVERLAY_MERGED):
        cloudlog.info("unmounting existing overlay")
        run(["umount", "-l", OVERLAY_MERGED])
def init_overlay() -> None:
    """Ensure a usable overlay of BASEDIR is mounted at OVERLAY_MERGED.

    An existing overlay is reused only when the ``.overlay_init`` canary
    exists, nothing under BASEDIR/.git is newer than it, and OVERLAY_MERGED is
    actually mounted. Otherwise the staging area is wiped and rebuilt.

    Raises:
        RuntimeError: if BASEDIR and the overlay directories live on
            different filesystems.
        subprocess.CalledProcessError: if ``find``, ``umount`` or ``mount`` fails.
    """
    overlay_init_file = Path(os.path.join(BASEDIR, ".overlay_init"))

    # Re-create the overlay if BASEDIR/.git has changed since we created the overlay
    if overlay_init_file.is_file():
        git_dir_path = os.path.join(BASEDIR, ".git")
        new_files = run(["find", git_dir_path, "-newer", str(overlay_init_file)])
        if new_files.splitlines():
            cloudlog.info(".git directory changed, recreating overlay")
        elif os.path.ismount(OVERLAY_MERGED):
            # A valid overlay already exists
            return
        else:
            # The canary lives in BASEDIR and outlives the mount itself, so its
            # presence alone does not prove that the overlay is usable.
            cloudlog.info("overlay init file found but overlay is not mounted, recreating overlay")

    cloudlog.info("preparing new safe staging area")

    Params().put("UpdateAvailable", "0")
    set_consistent_flag(False)
    dismount_overlay()
    if os.path.isdir(STAGING_ROOT):
        shutil.rmtree(STAGING_ROOT)

    for dirname in [STAGING_ROOT, OVERLAY_UPPER, OVERLAY_METADATA, OVERLAY_MERGED]:
        os.mkdir(dirname, 0o755)

    if os.lstat(BASEDIR).st_dev != os.lstat(OVERLAY_MERGED).st_dev:
        raise RuntimeError("base and overlay merge directories are on different filesystems; not valid for overlay FS!")

    # Remove consistent flag from current BASEDIR so it's not copied over
    consistent_file = Path(os.path.join(BASEDIR, ".overlay_consistent"))
    if consistent_file.is_file():
        consistent_file.unlink()

    # Leave a timestamped canary in BASEDIR to check at startup. The device clock
    # should be correct by the time we get here. If the init file disappears, or
    # critical mtimes in BASEDIR are newer than .overlay_init, continue.sh can
    # assume that BASEDIR has been used for local development or otherwise
    # modified, and skips the update activation attempt.
    overlay_init_file.touch()

    os.sync()
    overlay_opts = f"lowerdir={BASEDIR},upperdir={OVERLAY_UPPER},workdir={OVERLAY_METADATA}"
    run(["mount", "-t", "overlay", "-o", overlay_opts, "none", OVERLAY_MERGED])
def finalize_update() -> None:
    """Take the current OverlayFS merged view and finalize a copy outside of
    OverlayFS, ready to be swapped-in at BASEDIR. Copy using shutil.copytree.

    The consistent flag is cleared before the old copy is touched and set
    again only after the new copy is complete, so a partially written
    FINALIZED tree is never marked as ready.
    """
    # Remove the update ready flag and any old updates
    cloudlog.info("creating finalized version of the overlay")
    set_consistent_flag(False)
    if os.path.exists(FINALIZED):
        shutil.rmtree(FINALIZED)

    # Copy the merged overlay view and set the update ready flag
    shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True)
    set_consistent_flag(True)
    cloudlog.info("done finalizing overlay")
def attempt_update(wait_helper):
cloudlog.info("attempting git update inside staging overlay")
def handle_neos_update(wait_helper: WaitTimeHelper) -> None:
    """Download the NEOS version required by the staged update, if it differs.

    Compares the contents of NEOS_VERSION with the REQUIRED_NEOS_VERSION that
    ``launch_env.sh`` in the overlay exports. On a mismatch, runs the staged
    updater in ``bgcache`` mode, retrying every 120 seconds for up to one day
    or until shutdown is requested.

    Args:
        wait_helper: supplies the shutdown flag and the interruptible sleep
            used between retries.

    Raises:
        Exception: if the download did not succeed before the loop ended.
    """
    with open(NEOS_VERSION, "r") as f:
        cur_neos = f.read().strip()

    updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \
echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip()

    cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}")
    if cur_neos == updated_neos:
        return

    cloudlog.info(f"Beginning background download for NEOS {updated_neos}")
    set_offroad_alert("Offroad_NeosUpdate", True)

    updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater")
    update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json"

    neos_downloaded = False
    start_time = time.monotonic()
    # Try to download for one day
    while (not neos_downloaded and not wait_helper.shutdown
           and (time.monotonic() - start_time < 60*60*24)):
        # Clear any pending wake-up so the retry sleep below actually waits
        wait_helper.ready_event.clear()
        try:
            run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True)
            neos_downloaded = True
        except subprocess.CalledProcessError:
            cloudlog.info("NEOS background download failed, retrying")
            wait_helper.sleep(120)

    # If the download failed, we'll show the alert again when we retry
    set_offroad_alert("Offroad_NeosUpdate", False)
    if not neos_downloaded:
        raise Exception("Failed to download NEOS update")
    cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds")
def check_git_fetch_result(fetch_txt: str) -> bool:
    """Return True when ``git fetch`` output indicates that something was fetched.

    Empty output means nothing changed. Output consisting solely of the
    known_hosts warning is also treated as "nothing fetched".

    Args:
        fetch_txt: combined stdout/stderr captured from ``git fetch``.
    """
    err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
    return bool(fetch_txt) and fetch_txt != err_msg
def check_for_update() -> Tuple[bool, bool]:
    """Lightweight update check using ``git fetch --dry-run`` in the overlay.

    Returns:
        A ``(fetch_ok, update_available)`` pair. ``fetch_ok`` is False when the
        dry-run fetch command exited with an error, in which case
        ``update_available`` is False too. Otherwise ``update_available`` is
        the verdict of ``check_git_fetch_result`` on the command output.
    """
    setup_git_options(OVERLAY_MERGED)
    try:
        git_fetch_output = run(["git", "fetch", "--dry-run"], OVERLAY_MERGED, low_priority=True)
    except subprocess.CalledProcessError:
        return False, False
    else:
        return True, check_git_fetch_result(git_fetch_output)
def fetch_update(wait_helper: WaitTimeHelper) -> bool:
cloudlog.info("attempting git fetch inside staging overlay")
setup_git_options(OVERLAY_MERGED)
@ -203,9 +267,7 @@ def attempt_update(wait_helper):
cur_hash = run(["git", "rev-parse", "HEAD"], OVERLAY_MERGED).rstrip()
upstream_hash = run(["git", "rev-parse", "@{u}"], OVERLAY_MERGED).rstrip()
new_version = cur_hash != upstream_hash
err_msg = "Failed to add the host to the list of known hosts (/data/data/com.termux/files/home/.ssh/known_hosts).\n"
git_fetch_result = len(git_fetch_output) > 0 and (git_fetch_output != err_msg)
git_fetch_result = check_git_fetch_result(git_fetch_output)
cloudlog.info("comparing %s to %s" % (cur_hash, upstream_hash))
if new_version or git_fetch_result:
@ -221,48 +283,15 @@ def attempt_update(wait_helper):
]
cloudlog.info("git reset success: %s", '\n'.join(r))
# Download the accompanying NEOS version if it doesn't match the current version
with open(NEOS_VERSION, "r") as f:
cur_neos = f.read().strip()
updated_neos = run(["bash", "-c", r"unset REQUIRED_NEOS_VERSION && source launch_env.sh && \
echo -n $REQUIRED_NEOS_VERSION"], OVERLAY_MERGED).strip()
cloudlog.info(f"NEOS version check: {cur_neos} vs {updated_neos}")
if cur_neos != updated_neos:
cloudlog.info(f"Beginning background download for NEOS {updated_neos}")
set_offroad_alert("Offroad_NeosUpdate", True)
updater_path = os.path.join(OVERLAY_MERGED, "installer/updater/updater")
update_manifest = f"file://{OVERLAY_MERGED}/installer/updater/update.json"
neos_downloaded = False
start_time = time.monotonic()
# Try to download for one day
while (time.monotonic() - start_time < 60*60*24) and not wait_helper.shutdown:
wait_helper.ready_event.clear()
try:
run([updater_path, "bgcache", update_manifest], OVERLAY_MERGED, low_priority=True)
neos_downloaded = True
break
except subprocess.CalledProcessError:
cloudlog.info("NEOS background download failed, retrying")
wait_helper.sleep(120)
# If the download failed, we'll show the alert again when we retry
set_offroad_alert("Offroad_NeosUpdate", False)
if not neos_downloaded:
raise Exception("Failed to download NEOS update")
cloudlog.info(f"NEOS background download successful, took {time.monotonic() - start_time} seconds")
if ANDROID:
handle_neos_update(wait_helper)
# Create the finalized, ready-to-swap update
finalize_from_ovfs()
finalize_update()
cloudlog.info("openpilot update successful!")
else:
cloudlog.info("nothing new from git at this time")
set_update_available_params(new_version)
return new_version
@ -272,7 +301,7 @@ def main():
if params.get("DisableUpdates") == b"1":
raise RuntimeError("updates are disabled by the DisableUpdates param")
if os.geteuid() != 0:
if ANDROID and os.geteuid() != 0:
raise RuntimeError("updated must be launched as root!")
# Set low io priority
@ -290,45 +319,45 @@ def main():
wait_helper = WaitTimeHelper(proc)
wait_helper.sleep(30)
first_run = True
last_fetch_time = 0
update_failed_count = 0
update_available = False
overlay_initialized = False
# Run the update loop
# * every 1m, do a lightweight internet/update check
# * every 10m, do a full git fetch
while not wait_helper.shutdown:
update_now = wait_helper.ready_event.is_set()
wait_helper.ready_event.clear()
# Check for internet every 30s
# Don't run updater while onroad or if the time's wrong
time_wrong = datetime.datetime.utcnow().year < 2019
ping_failed = os.system(f"ping -W 4 -c 1 {TEST_IP}") != 0
if ping_failed or time_wrong:
is_onroad = params.get("IsOffroad") != b"1"
if is_onroad or time_wrong:
wait_helper.sleep(30)
cloudlog.info("not running updater, not offroad")
continue
# Attempt an update
exception = None
new_version = False
update_failed_count += 1
try:
# Re-create the overlay if BASEDIR/.git has changed since we created the overlay
if overlay_initialized:
overlay_init_fn = os.path.join(BASEDIR, ".overlay_init")
git_dir_path = os.path.join(BASEDIR, ".git")
new_files = run(["find", git_dir_path, "-newer", overlay_init_fn])
init_overlay()
if len(new_files.splitlines()):
cloudlog.info(".git directory changed, recreating overlay")
overlay_initialized = False
if not overlay_initialized:
init_ovfs()
overlay_initialized = True
if params.get("IsOffroad") == b"1":
update_available = attempt_update(wait_helper) or update_available
internet_ok, update_available = check_for_update()
if internet_ok and not update_available:
update_failed_count = 0
if not update_available and os.path.isdir(NEOSUPDATE_DIR):
shutil.rmtree(NEOSUPDATE_DIR)
else:
cloudlog.info("not running updater, openpilot running")
# Fetch updates at most every 10 minutes
if internet_ok and (update_now or time.monotonic() - last_fetch_time > 60*10):
new_version = fetch_update(wait_helper)
update_failed_count = 0
last_fetch_time = time.monotonic()
if first_run and not new_version and os.path.isdir(NEOSUPDATE_DIR):
shutil.rmtree(NEOSUPDATE_DIR)
first_run = False
except subprocess.CalledProcessError as e:
cloudlog.event(
"update process failed",
@ -336,21 +365,15 @@ def main():
output=e.output,
returncode=e.returncode
)
exception = e
overlay_initialized = False
except Exception:
exception = f"command failed: {e.cmd}\n{e.output}"
except Exception as e:
cloudlog.exception("uncaught updated exception, shouldn't happen")
exception = str(e)
params.put("UpdateFailedCount", str(update_failed_count))
if exception is None:
params.delete("LastUpdateException")
else:
params.put("LastUpdateException", f"command failed: {exception.cmd}\n{exception.output}")
set_params(new_version, update_failed_count, exception)
wait_helper.sleep(60)
# Wait 10 minutes between update attempts
wait_helper.sleep(60*10)
dismount_ovfs()
dismount_overlay()
if __name__ == "__main__":
main()