android health daemon (#21965)
* androidd * three strikes * better logging * persistent * add to release files * cleanup * no cpu usage * colon Co-authored-by: Comma Device <device@comma.ai> pull/21988/head
parent
f7bd264db8
commit
c8ca56dddd
|
@ -275,6 +275,7 @@ selfdrive/hardware/hw.h
|
|||
selfdrive/hardware/eon/__init__.py
|
||||
selfdrive/hardware/eon/hardware.h
|
||||
selfdrive/hardware/eon/hardware.py
|
||||
selfdrive/hardware/eon/androidd.py
|
||||
selfdrive/hardware/tici/__init__.py
|
||||
selfdrive/hardware/tici/hardware.py
|
||||
selfdrive/hardware/tici/amplifier.py
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env python3
|
||||
import os
|
||||
import time
|
||||
import psutil
|
||||
from typing import Optional
|
||||
|
||||
from common.realtime import set_core_affinity, set_realtime_priority
|
||||
from selfdrive.swaglog import cloudlog
|
||||
|
||||
|
||||
MAX_MODEM_CRASHES = 3
|
||||
MODEM_PATH = "/sys/devices/soc/2080000.qcom,mss/subsys5"
|
||||
WATCHED_PROCS = ["zygote", "zygote64", "/system/bin/servicemanager", "/system/bin/surfaceflinger"]
|
||||
|
||||
|
||||
def get_modem_crash_count() -> Optional[int]:
    """Read the modem crash counter from ``MODEM_PATH/crash_count``.

    Returns:
        The file contents parsed as an integer, or ``None`` when the file
        cannot be opened, read, or parsed. Any such failure is reported
        through ``cloudlog.exception`` rather than raised.
    """
    try:
        with open(os.path.join(MODEM_PATH, "crash_count")) as crash_file:
            raw_count = crash_file.read()
        # int() accepts surrounding whitespace, so no explicit strip is needed.
        return int(raw_count)
    except Exception:
        # Best effort: the caller treats None as "no reading this cycle".
        cloudlog.exception("Error reading modem crash count")
    return None
|
||||
|
||||
def get_modem_state() -> str:
    """Read the modem state string from ``MODEM_PATH/state``.

    Returns:
        The file contents with leading/trailing whitespace removed, or an
        empty string when the file cannot be opened or read. Any such
        failure is reported through ``cloudlog.exception`` rather than raised.
    """
    try:
        with open(os.path.join(MODEM_PATH, "state")) as state_file:
            raw_state = state_file.read()
        return raw_state.strip()
    except Exception:
        # Best effort: an empty string stands in for "state unknown".
        cloudlog.exception("Error reading modem state")
    return ""
|
||||
|
||||
def main():
    """Run the Android health daemon loop; this function never returns.

    On startup it calls ``set_core_affinity(1)`` and
    ``set_realtime_priority(1)``. Then, once per second, it:

    1. Tracks the processes listed in ``WATCHED_PROCS`` and logs an
       "android service pid changed" event when a tracked process object
       differs from the one recorded on the previous scan.
    2. Logs a "modem state changed" event when ``get_modem_state()`` returns
       a value different from the previous iteration (suppressed once the
       modem has been killed by this daemon).
    3. Logs a "modem crash" event when ``get_modem_crash_count()`` increases.
    4. When the crash count exceeds ``MAX_MODEM_CRASHES``, writes ``"put"``
       to ``/sys/kernel/debug/msm_subsys/modem`` exactly once.
    """
    set_core_affinity(1)
    set_realtime_priority(1)

    procs = {}
    crash_count = 0
    modem_killed = False
    modem_state = "ONLINE"
    while True:
        # check critical android services
        # Rescan the process table on the first pass, or whenever a tracked
        # process is missing or no longer running.
        if not procs or any(p is None or not p.is_running() for p in procs.values()):
            cur = {name: None for name in WATCHED_PROCS}
            for proc in psutil.process_iter(attrs=['cmdline']):
                # psutil stores None for an attribute it could not read
                # (e.g. access denied or zombie process), and an empty list
                # for processes without a command line. Treat both as
                # "no name" instead of crashing on len(None).
                argv = proc.info['cmdline']
                cmdline = argv[0] if argv else None
                if cmdline in WATCHED_PROCS:
                    cur[cmdline] = proc

            # Only report changes once there is a previous scan to compare to.
            if procs:
                for name in WATCHED_PROCS:
                    if cur[name] != procs[name]:
                        cloudlog.event("android service pid changed", proc=name, cur=cur[name], prev=procs[name])
            procs.update(cur)

        # check modem state
        state = get_modem_state()
        if state != modem_state and not modem_killed:
            cloudlog.event("modem state changed", state=state)
        modem_state = state

        # check modem crashes
        cnt = get_modem_crash_count()
        if cnt is not None:
            if cnt > crash_count:
                cloudlog.event("modem crash", count=cnt)
            # Always adopt the latest reading, even if it did not increase.
            crash_count = cnt

        # handle excessive modem crashes
        if crash_count > MAX_MODEM_CRASHES and not modem_killed:
            cloudlog.event("killing modem")
            # NOTE(review): presumably "put" releases the modem subsystem so
            # it stops restarting - confirm against the msm_subsys debugfs
            # interface.
            with open("/sys/kernel/debug/msm_subsys/modem", "w") as f:
                f.write("put")
            modem_killed = True

        time.sleep(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -30,12 +30,15 @@ procs = [
|
|||
PythonProcess("paramsd", "selfdrive.locationd.paramsd"),
|
||||
PythonProcess("plannerd", "selfdrive.controls.plannerd"),
|
||||
PythonProcess("radard", "selfdrive.controls.radard"),
|
||||
PythonProcess("rtshield", "selfdrive.rtshield", enabled=EON),
|
||||
PythonProcess("thermald", "selfdrive.thermald.thermald", persistent=True),
|
||||
PythonProcess("timezoned", "selfdrive.timezoned", enabled=TICI, persistent=True),
|
||||
PythonProcess("tombstoned", "selfdrive.tombstoned", enabled=not PC, persistent=True),
|
||||
PythonProcess("updated", "selfdrive.updated", enabled=not PC, persistent=True),
|
||||
PythonProcess("uploader", "selfdrive.loggerd.uploader", persistent=True),
|
||||
|
||||
# EON only
|
||||
PythonProcess("rtshield", "selfdrive.rtshield", enabled=EON),
|
||||
PythonProcess("androidd", "selfdrive.hardware.eon.androidd", enabled=EON, persistent=True),
|
||||
]
|
||||
|
||||
managed_processes = {p.name: p for p in procs}
|
||||
|
|
|
@ -13,7 +13,7 @@ from cereal.services import service_list
|
|||
from common.basedir import BASEDIR
|
||||
from common.timeout import Timeout
|
||||
from common.params import Params
|
||||
from selfdrive.hardware import TICI
|
||||
from selfdrive.hardware import EON, TICI
|
||||
from selfdrive.loggerd.config import ROOT
|
||||
from selfdrive.test.helpers import set_params_enabled
|
||||
from tools.lib.logreader import LogReader
|
||||
|
@ -44,6 +44,11 @@ PROCS = {
|
|||
"./logcatd": 0,
|
||||
}
|
||||
|
||||
if EON:
|
||||
PROCS.update({
|
||||
"selfdrive.hardware.eon.androidd": 0.4,
|
||||
})
|
||||
|
||||
if TICI:
|
||||
PROCS.update({
|
||||
"./loggerd": 60.0,
|
||||
|
|
Loading…
Reference in New Issue