android health daemon (#21965)

* androidd

* three strikes

* better logging

* persistent

* add to release files

* cleanup

* no cpu usage

* colon

Co-authored-by: Comma Device <device@comma.ai>
pull/21988/head
Adeeb Shihadeh 2021-08-19 16:00:05 -07:00 committed by GitHub
parent f7bd264db8
commit c8ca56dddd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 89 additions and 2 deletions

View File

@ -275,6 +275,7 @@ selfdrive/hardware/hw.h
selfdrive/hardware/eon/__init__.py
selfdrive/hardware/eon/hardware.h
selfdrive/hardware/eon/hardware.py
selfdrive/hardware/eon/androidd.py
selfdrive/hardware/tici/__init__.py
selfdrive/hardware/tici/hardware.py
selfdrive/hardware/tici/amplifier.py

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python3
import os
import time
import psutil
from typing import Optional
from common.realtime import set_core_affinity, set_realtime_priority
from selfdrive.swaglog import cloudlog
MAX_MODEM_CRASHES = 3
MODEM_PATH = "/sys/devices/soc/2080000.qcom,mss/subsys5"
WATCHED_PROCS = ["zygote", "zygote64", "/system/bin/servicemanager", "/system/bin/surfaceflinger"]
def get_modem_crash_count() -> Optional[int]:
try:
with open(os.path.join(MODEM_PATH, "crash_count")) as f:
return int(f.read())
except Exception:
cloudlog.exception("Error reading modem crash count")
return None
def get_modem_state() -> str:
try:
with open(os.path.join(MODEM_PATH, "state")) as f:
return f.read().strip()
except Exception:
cloudlog.exception("Error reading modem state")
return ""
def main():
set_core_affinity(1)
set_realtime_priority(1)
procs = {}
crash_count = 0
modem_killed = False
modem_state = "ONLINE"
while True:
# check critical android services
if any(p is None or not p.is_running() for p in procs.values()) or not len(procs):
cur = {p: None for p in WATCHED_PROCS}
for p in psutil.process_iter(attrs=['cmdline']):
cmdline = None if not len(p.info['cmdline']) else p.info['cmdline'][0]
if cmdline in WATCHED_PROCS:
cur[cmdline] = p
if len(procs):
for p in WATCHED_PROCS:
if cur[p] != procs[p]:
cloudlog.event("android service pid changed", proc=p, cur=cur[p], prev=procs[p])
procs.update(cur)
# check modem state
state = get_modem_state()
if state != modem_state and not modem_killed:
cloudlog.event("modem state changed", state=state)
modem_state = state
# check modem crashes
cnt = get_modem_crash_count()
if cnt is not None:
if cnt > crash_count:
cloudlog.event("modem crash", count=cnt)
crash_count = cnt
# handle excessive modem crashes
if crash_count > MAX_MODEM_CRASHES and not modem_killed:
cloudlog.event("killing modem")
with open("/sys/kernel/debug/msm_subsys/modem", "w") as f:
f.write("put")
modem_killed = True
time.sleep(1)
if __name__ == "__main__":
main()

View File

@ -30,12 +30,15 @@ procs = [
PythonProcess("paramsd", "selfdrive.locationd.paramsd"),
PythonProcess("plannerd", "selfdrive.controls.plannerd"),
PythonProcess("radard", "selfdrive.controls.radard"),
PythonProcess("rtshield", "selfdrive.rtshield", enabled=EON),
PythonProcess("thermald", "selfdrive.thermald.thermald", persistent=True),
PythonProcess("timezoned", "selfdrive.timezoned", enabled=TICI, persistent=True),
PythonProcess("tombstoned", "selfdrive.tombstoned", enabled=not PC, persistent=True),
PythonProcess("updated", "selfdrive.updated", enabled=not PC, persistent=True),
PythonProcess("uploader", "selfdrive.loggerd.uploader", persistent=True),
# EON only
PythonProcess("rtshield", "selfdrive.rtshield", enabled=EON),
PythonProcess("androidd", "selfdrive.hardware.eon.androidd", enabled=EON, persistent=True),
]
managed_processes = {p.name: p for p in procs}

View File

@ -13,7 +13,7 @@ from cereal.services import service_list
from common.basedir import BASEDIR
from common.timeout import Timeout
from common.params import Params
from selfdrive.hardware import TICI
from selfdrive.hardware import EON, TICI
from selfdrive.loggerd.config import ROOT
from selfdrive.test.helpers import set_params_enabled
from tools.lib.logreader import LogReader
@ -44,6 +44,11 @@ PROCS = {
"./logcatd": 0,
}
if EON:
PROCS.update({
"selfdrive.hardware.eon.androidd": 0.4,
})
if TICI:
PROCS.update({
"./loggerd": 60.0,