nopenpilot/selfdrive/thermald/thermald.py

429 lines
17 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2020-01-17 12:03:22 -07:00
import datetime
import os
import queue
import threading
import time
from collections import OrderedDict, namedtuple
2021-03-28 22:22:31 -06:00
from pathlib import Path
from typing import Dict, Optional, Tuple
import psutil
import cereal.messaging as messaging
2020-01-17 12:03:22 -07:00
from cereal import log
from common.dict_helpers import strip_deprecated_keys
2020-01-17 12:03:22 -07:00
from common.filter_simple import FirstOrderFilter
from common.params import Params
from common.realtime import DT_TRML, sec_since_boot
from selfdrive.controls.lib.alertmanager import set_offroad_alert
from selfdrive.hardware import EON, HARDWARE, PC, TICI
2020-01-17 12:03:22 -07:00
from selfdrive.loggerd.config import get_available_percent
from selfdrive.statsd import statlog
from selfdrive.swaglog import cloudlog
from selfdrive.thermald.power_monitoring import PowerMonitoring
from selfdrive.thermald.fan_controller import EonFanController, UnoFanController, TiciFanController
from selfdrive.version import terms_version, training_version
ThermalStatus = log.DeviceState.ThermalStatus
NetworkType = log.DeviceState.NetworkType
NetworkStrength = log.DeviceState.NetworkStrength
2020-01-17 12:03:22 -07:00
CURRENT_TAU = 15. # 15s time constant
TEMP_TAU = 5. # 5s time constant
DISCONNECT_TIMEOUT = 5. # wait 5 seconds before going offroad after disconnect so you get an alert
PANDA_STATES_TIMEOUT = int(1000 * 2.5 * DT_TRML) # 2.5x the expected pandaState frequency
2020-01-17 12:03:22 -07:00
ThermalBand = namedtuple("ThermalBand", ['min_temp', 'max_temp'])
HardwareState = namedtuple("HardwareState", ['network_type', 'network_metered', 'network_strength', 'network_info', 'nvme_temps', 'modem_temps'])
# List of thermal bands. We will stay within this region as long as we are within the bounds.
# When exiting the bounds, we'll jump to the lower or higher band. Bands are ordered in the dict.
THERMAL_BANDS = OrderedDict({
ThermalStatus.green: ThermalBand(None, 80.0),
ThermalStatus.yellow: ThermalBand(75.0, 96.0),
ThermalStatus.red: ThermalBand(80.0, 107.),
ThermalStatus.danger: ThermalBand(94.0, None),
})
2020-03-12 12:55:33 -06:00
# Override to highest thermal band when offroad and above this temp
OFFROAD_DANGER_TEMP = 79.5 if TICI else 70.0
prev_offroad_states: Dict[str, Tuple[bool, Optional[str]]] = {}
2021-03-29 15:37:52 -06:00
tz_by_type: Optional[Dict[str, int]] = None
def populate_tz_by_type():
global tz_by_type
tz_by_type = {}
for n in os.listdir("/sys/devices/virtual/thermal"):
if not n.startswith("thermal_zone"):
continue
with open(os.path.join("/sys/devices/virtual/thermal", n, "type")) as f:
tz_by_type[f.read().strip()] = int(n.lstrip("thermal_zone"))
def read_tz(x):
if x is None:
2020-02-28 21:27:41 -07:00
return 0
if isinstance(x, str):
if tz_by_type is None:
populate_tz_by_type()
x = tz_by_type[x]
2020-01-17 12:03:22 -07:00
try:
with open(f"/sys/devices/virtual/thermal/thermal_zone{x}/temp") as f:
return int(f.read())
2020-01-17 12:03:22 -07:00
except FileNotFoundError:
return 0
def read_thermal(thermal_config):
dat = messaging.new_message('deviceState')
dat.deviceState.cpuTempC = [read_tz(z) / thermal_config.cpu[1] for z in thermal_config.cpu[0]]
dat.deviceState.gpuTempC = [read_tz(z) / thermal_config.gpu[1] for z in thermal_config.gpu[0]]
dat.deviceState.memoryTempC = read_tz(thermal_config.mem[0]) / thermal_config.mem[1]
dat.deviceState.ambientTempC = read_tz(thermal_config.ambient[0]) / thermal_config.ambient[1]
dat.deviceState.pmicTempC = [read_tz(z) / thermal_config.pmic[1] for z in thermal_config.pmic[0]]
2020-01-17 12:03:22 -07:00
return dat
2020-03-12 12:55:33 -06:00
def set_offroad_alert_if_changed(offroad_alert: str, show_alert: bool, extra_text: Optional[str]=None):
if prev_offroad_states.get(offroad_alert, None) == (show_alert, extra_text):
return
prev_offroad_states[offroad_alert] = (show_alert, extra_text)
set_offroad_alert(offroad_alert, show_alert, extra_text)
def hw_state_thread(end_event, hw_queue):
"""Handles non critical hardware state, and sends over queue"""
count = 0
registered_count = 0
prev_hw_state = None
2020-01-17 12:03:22 -07:00
while not end_event.is_set():
# these are expensive calls. update every 10s
if (count % int(10. / DT_TRML)) == 0:
try:
network_type = HARDWARE.get_network_type()
modem_temps = HARDWARE.get_modem_temperatures()
if len(modem_temps) == 0 and prev_hw_state is not None:
modem_temps = prev_hw_state.modem_temps
hw_state = HardwareState(
network_type=network_type,
network_metered=HARDWARE.get_network_metered(network_type),
network_strength=HARDWARE.get_network_strength(network_type),
network_info=HARDWARE.get_network_info(),
nvme_temps=HARDWARE.get_nvme_temperatures(),
modem_temps=modem_temps,
)
try:
hw_queue.put_nowait(hw_state)
except queue.Full:
pass
if TICI and (hw_state.network_info is not None) and (hw_state.network_info.get('state', None) == "REGISTERED"):
registered_count += 1
else:
registered_count = 0
if registered_count > 10:
cloudlog.warning(f"Modem stuck in registered state {hw_state.network_info}. nmcli conn up lte")
os.system("nmcli conn up lte")
registered_count = 0
prev_hw_state = hw_state
except Exception:
cloudlog.exception("Error getting hardware state")
count += 1
time.sleep(DT_TRML)
def thermald_thread(end_event, hw_queue):
pm = messaging.PubMaster(['deviceState'])
sm = messaging.SubMaster(["peripheralState", "gpsLocationExternal", "controlsState", "pandaStates"], poll=["pandaStates"])
2020-01-17 12:03:22 -07:00
count = 0
onroad_conditions: Dict[str, bool] = {
"ignition": False,
}
startup_conditions: Dict[str, bool] = {}
startup_conditions_prev: Dict[str, bool] = {}
2020-01-17 12:03:22 -07:00
off_ts = None
started_ts = None
started_seen = False
thermal_status = ThermalStatus.green
usb_power = True
last_hw_state = HardwareState(
network_type=NetworkType.none,
network_metered=False,
network_strength=NetworkStrength.unknown,
network_info=None,
nvme_temps=[],
modem_temps=[],
)
2020-02-03 12:28:38 -07:00
2020-01-17 12:03:22 -07:00
current_filter = FirstOrderFilter(0., CURRENT_TAU, DT_TRML)
temp_filter = FirstOrderFilter(0., TEMP_TAU, DT_TRML)
2020-01-17 12:03:22 -07:00
should_start_prev = False
in_car = False
is_uno = False
engaged_prev = False
2020-01-17 12:03:22 -07:00
params = Params()
power_monitor = PowerMonitoring()
2020-01-17 12:03:22 -07:00
HARDWARE.initialize_hardware()
thermal_config = HARDWARE.get_thermal_config()
fan_controller = None
while not end_event.is_set():
sm.update(PANDA_STATES_TIMEOUT)
pandaStates = sm['pandaStates']
peripheralState = sm['peripheralState']
msg = read_thermal(thermal_config)
2020-01-17 12:03:22 -07:00
if sm.updated['pandaStates'] and len(pandaStates) > 0:
# Set ignition based on any panda connected
onroad_conditions["ignition"] = any(ps.ignitionLine or ps.ignitionCan for ps in pandaStates if ps.pandaType != log.PandaState.PandaType.unknown)
pandaState = pandaStates[0]
in_car = pandaState.harnessStatus != log.PandaState.HarnessStatus.notConnected
usb_power = peripheralState.usbPowerMode != log.PeripheralState.UsbPowerMode.client
# Setup fan handler on first connect to panda
if fan_controller is None and peripheralState.pandaType != log.PandaState.PandaType.unknown:
is_uno = peripheralState.pandaType == log.PandaState.PandaType.uno
if TICI:
fan_controller = TiciFanController()
elif is_uno or PC:
fan_controller = UnoFanController()
else:
fan_controller = EonFanController()
try:
last_hw_state = hw_queue.get_nowait()
except queue.Empty:
pass
2020-02-03 12:28:38 -07:00
msg.deviceState.freeSpacePercent = get_available_percent(default=100.0)
msg.deviceState.memoryUsagePercent = int(round(psutil.virtual_memory().percent))
msg.deviceState.cpuUsagePercent = [int(round(n)) for n in psutil.cpu_percent(percpu=True)]
msg.deviceState.gpuUsagePercent = int(round(HARDWARE.get_gpu_usage_percent()))
msg.deviceState.networkType = last_hw_state.network_type
msg.deviceState.networkMetered = last_hw_state.network_metered
msg.deviceState.networkStrength = last_hw_state.network_strength
if last_hw_state.network_info is not None:
msg.deviceState.networkInfo = last_hw_state.network_info
msg.deviceState.nvmeTempC = last_hw_state.nvme_temps
msg.deviceState.modemTempC = last_hw_state.modem_temps
msg.deviceState.screenBrightnessPercent = HARDWARE.get_screen_brightness()
msg.deviceState.batteryPercent = HARDWARE.get_battery_capacity()
msg.deviceState.batteryCurrent = HARDWARE.get_battery_current()
msg.deviceState.usbOnline = HARDWARE.get_usb_present()
current_filter.update(msg.deviceState.batteryCurrent / 1e6)
2020-01-17 12:03:22 -07:00
max_comp_temp = temp_filter.update(
max(max(msg.deviceState.cpuTempC), msg.deviceState.memoryTempC, max(msg.deviceState.gpuTempC))
)
2020-01-17 12:03:22 -07:00
if fan_controller is not None:
msg.deviceState.fanSpeedPercentDesired = fan_controller.update(max_comp_temp, onroad_conditions["ignition"])
2020-01-17 12:03:22 -07:00
is_offroad_for_5_min = (started_ts is None) and ((not started_seen) or (off_ts is None) or (sec_since_boot() - off_ts > 60 * 5))
if is_offroad_for_5_min and max_comp_temp > OFFROAD_DANGER_TEMP:
# If device is offroad we want to cool down before going onroad
# since going onroad increases load and can make temps go over 107
2020-01-17 12:03:22 -07:00
thermal_status = ThermalStatus.danger
2021-01-16 15:23:06 -07:00
else:
current_band = THERMAL_BANDS[thermal_status]
band_idx = list(THERMAL_BANDS.keys()).index(thermal_status)
if current_band.min_temp is not None and max_comp_temp < current_band.min_temp:
thermal_status = list(THERMAL_BANDS.keys())[band_idx - 1]
elif current_band.max_temp is not None and max_comp_temp > current_band.max_temp:
thermal_status = list(THERMAL_BANDS.keys())[band_idx + 1]
2020-01-17 12:03:22 -07:00
# **** starting logic ****
# Ensure date/time are valid
now = datetime.datetime.utcnow()
startup_conditions["time_valid"] = (now.year > 2020) or (now.year == 2020 and now.month >= 10)
set_offroad_alert_if_changed("Offroad_InvalidTime", (not startup_conditions["time_valid"]))
2020-01-17 12:03:22 -07:00
startup_conditions["up_to_date"] = params.get("Offroad_ConnectivityNeeded") is None or params.get_bool("DisableUpdates") or params.get_bool("SnoozeUpdate")
startup_conditions["not_uninstalling"] = not params.get_bool("DoUninstall")
startup_conditions["accepted_terms"] = params.get("HasAcceptedTerms") == terms_version
2020-01-17 12:03:22 -07:00
# with 2% left, we killall, otherwise the phone will take a long time to boot
startup_conditions["free_space"] = msg.deviceState.freeSpacePercent > 2
2020-11-03 20:56:25 -07:00
startup_conditions["completed_training"] = params.get("CompletedTrainingVersion") == training_version or \
params.get_bool("Passive")
startup_conditions["not_driver_view"] = not params.get_bool("IsDriverViewEnabled")
startup_conditions["not_taking_snapshot"] = not params.get_bool("IsTakingSnapshot")
2020-01-17 12:03:22 -07:00
# if any CPU gets above 107 or the battery gets above 63, kill all processes
# controls will warn with CPU above 95 or battery above 60
onroad_conditions["device_temp_good"] = thermal_status < ThermalStatus.danger
set_offroad_alert_if_changed("Offroad_TemperatureTooHigh", (not onroad_conditions["device_temp_good"]))
2020-01-17 12:03:22 -07:00
# TODO: this should move to TICI.initialize_hardware, but we currently can't import params there
if TICI:
if not os.path.isfile("/persist/comma/living-in-the-moment"):
if not Path("/data/media").is_mount():
set_offroad_alert_if_changed("Offroad_StorageMissing", True)
else:
# check for bad NVMe
try:
with open("/sys/block/nvme0n1/device/model") as f:
model = f.read().strip()
if not model.startswith("Samsung SSD 980") and params.get("Offroad_BadNvme") is None:
set_offroad_alert_if_changed("Offroad_BadNvme", True)
cloudlog.event("Unsupported NVMe", model=model, error=True)
except Exception:
pass
# Handle offroad/onroad transition
should_start = all(onroad_conditions.values())
if started_ts is None:
should_start = should_start and all(startup_conditions.values())
if should_start != should_start_prev or (count == 0):
params.put_bool("IsOnroad", should_start)
params.put_bool("IsOffroad", not should_start)
params.put_bool("IsEngaged", False)
engaged_prev = False
HARDWARE.set_power_save(not should_start)
2020-01-17 12:03:22 -07:00
if sm.updated['controlsState']:
engaged = sm['controlsState'].enabled
if engaged != engaged_prev:
params.put_bool("IsEngaged", engaged)
engaged_prev = engaged
try:
with open('/dev/kmsg', 'w') as kmsg:
kmsg.write(f"<3>[thermald] engaged: {engaged}\n")
except Exception:
pass
if should_start:
2020-01-17 12:03:22 -07:00
off_ts = None
if started_ts is None:
started_ts = sec_since_boot()
started_seen = True
else:
if onroad_conditions["ignition"] and (startup_conditions != startup_conditions_prev):
cloudlog.event("Startup blocked", startup_conditions=startup_conditions, onroad_conditions=onroad_conditions)
2020-01-17 12:03:22 -07:00
started_ts = None
if off_ts is None:
off_ts = sec_since_boot()
# Offroad power monitoring
power_monitor.calculate(peripheralState, onroad_conditions["ignition"])
msg.deviceState.offroadPowerUsageUwh = power_monitor.get_power_used()
msg.deviceState.carBatteryCapacityUwh = max(0, power_monitor.get_car_battery_capacity())
current_power_draw = HARDWARE.get_current_power_draw() # pylint: disable=assignment-from-none
msg.deviceState.powerDrawW = current_power_draw if current_power_draw is not None else 0
# Check if we need to disable charging (handled by boardd)
msg.deviceState.chargingDisabled = power_monitor.should_disable_charging(onroad_conditions["ignition"], in_car, off_ts)
# Check if we need to shut down
if power_monitor.should_shutdown(peripheralState, onroad_conditions["ignition"], in_car, off_ts, started_seen):
cloudlog.warning(f"shutting device down, offroad since {off_ts}")
params.put_bool("DoShutdown", True)
msg.deviceState.chargingError = current_filter.x > 0. and msg.deviceState.batteryPercent < 90 # if current is positive, then battery is being discharged
msg.deviceState.started = started_ts is not None
msg.deviceState.startedMonoTime = int(1e9*(started_ts or 0))
2020-01-17 12:03:22 -07:00
last_ping = params.get("LastAthenaPingTime")
if last_ping is not None:
msg.deviceState.lastAthenaPingTime = int(last_ping)
msg.deviceState.thermalStatus = thermal_status
pm.send("deviceState", msg)
2020-01-17 12:03:22 -07:00
2021-03-08 18:00:58 -07:00
if EON and not is_uno:
set_offroad_alert_if_changed("Offroad_ChargeDisabled", (not usb_power))
2020-01-17 12:03:22 -07:00
should_start_prev = should_start
startup_conditions_prev = startup_conditions.copy()
2020-01-17 12:03:22 -07:00
# Log to statsd
statlog.gauge("free_space_percent", msg.deviceState.freeSpacePercent)
statlog.gauge("gpu_usage_percent", msg.deviceState.gpuUsagePercent)
statlog.gauge("memory_usage_percent", msg.deviceState.memoryUsagePercent)
for i, usage in enumerate(msg.deviceState.cpuUsagePercent):
statlog.gauge(f"cpu{i}_usage_percent", usage)
for i, temp in enumerate(msg.deviceState.cpuTempC):
statlog.gauge(f"cpu{i}_temperature", temp)
for i, temp in enumerate(msg.deviceState.gpuTempC):
statlog.gauge(f"gpu{i}_temperature", temp)
statlog.gauge("memory_temperature", msg.deviceState.memoryTempC)
statlog.gauge("ambient_temperature", msg.deviceState.ambientTempC)
for i, temp in enumerate(msg.deviceState.pmicTempC):
statlog.gauge(f"pmic{i}_temperature", temp)
for i, temp in enumerate(last_hw_state.nvme_temps):
statlog.gauge(f"nvme_temperature{i}", temp)
for i, temp in enumerate(last_hw_state.modem_temps):
statlog.gauge(f"modem_temperature{i}", temp)
statlog.gauge("fan_speed_percent_desired", msg.deviceState.fanSpeedPercentDesired)
statlog.gauge("screen_brightness_percent", msg.deviceState.screenBrightnessPercent)
# report to server once every 10 minutes
if (count % int(600. / DT_TRML)) == 0:
2021-04-28 01:09:03 -06:00
if EON and started_ts is None and msg.deviceState.memoryUsagePercent > 40:
cloudlog.event("High offroad memory usage", mem=msg.deviceState.memoryUsagePercent)
2020-01-17 12:03:22 -07:00
cloudlog.event("STATUS_PACKET",
2020-03-12 12:55:33 -06:00
count=count,
pandaStates=[strip_deprecated_keys(p.to_dict()) for p in pandaStates],
peripheralState=strip_deprecated_keys(peripheralState.to_dict()),
location=(strip_deprecated_keys(sm["gpsLocationExternal"].to_dict()) if sm.alive["gpsLocationExternal"] else None),
deviceState=strip_deprecated_keys(msg.to_dict()))
2020-01-17 12:03:22 -07:00
count += 1
def main():
hw_queue = queue.Queue(maxsize=1)
end_event = threading.Event()
threads = [
threading.Thread(target=hw_state_thread, args=(end_event, hw_queue)),
threading.Thread(target=thermald_thread, args=(end_event, hw_queue)),
]
for t in threads:
t.start()
try:
while True:
time.sleep(1)
if not all(t.is_alive() for t in threads):
break
finally:
end_event.set()
for t in threads:
t.join()
2020-01-17 12:03:22 -07:00
2020-03-12 12:55:33 -06:00
2020-01-17 12:03:22 -07:00
if __name__ == "__main__":
main()