From f3db94a0c480bc12df98ec679a2f84833e134ee3 Mon Sep 17 00:00:00 2001 From: Willem Melching Date: Mon, 18 Jan 2021 13:29:01 +0100 Subject: [PATCH] Linux tombstones: add stacktrace and upload (#19737) * Add stacktrace to tombstones * make sentry output prettier * Refactor * Generate upload filename * Actually move file * Fix spaces * copy and upload * dont delete just yet Co-authored-by: Comma Device --- selfdrive/loggerd/uploader.py | 3 +- selfdrive/tombstoned.py | 172 +++++++++++++++++++++++++++------- selfdrive/version.py | 3 +- 3 files changed, 144 insertions(+), 34 deletions(-) diff --git a/selfdrive/loggerd/uploader.py b/selfdrive/loggerd/uploader.py index 3874fbf0..1e84cbd0 100644 --- a/selfdrive/loggerd/uploader.py +++ b/selfdrive/loggerd/uploader.py @@ -58,6 +58,7 @@ class Uploader(): self.last_resp = None self.last_exc = None + self.immediate_folders = ["crash/"] self.immediate_priority = {"qlog.bz2": 0, "qcamera.ts": 1} self.high_priority = {"rlog.bz2": 0, "fcamera.hevc": 1, "dcamera.hevc": 2, "ecamera.hevc": 3} @@ -98,7 +99,7 @@ class Uploader(): # try to upload qlog files first for name, key, fn in upload_files: - if name in self.immediate_priority: + if name in self.immediate_priority or any(f in fn for f in self.immediate_folders): return (key, fn) if with_raw: diff --git a/selfdrive/tombstoned.py b/selfdrive/tombstoned.py index 1989dbd0..26c17b4a 100755 --- a/selfdrive/tombstoned.py +++ b/selfdrive/tombstoned.py @@ -1,23 +1,70 @@ #!/usr/bin/env python3 +import datetime import os +import re +import shutil +import signal +import subprocess import time +import glob from raven import Client from raven.transport.http import HTTPTransport +from common.file_helpers import mkdirs_exists_ok from selfdrive.hardware import TICI +from selfdrive.loggerd.config import ROOT from selfdrive.swaglog import cloudlog -from selfdrive.version import version, origin, branch, dirty +from selfdrive.version import branch, commit, dirty, origin, version MAX_SIZE = 100000 * 10 # Normal size is 40-100k, allow up to 1M if TICI: - MAX_SIZE = MAX_SIZE * 10 # Allow larger size for tici + MAX_SIZE = MAX_SIZE * 100 # Allow larger size for tici since files contain coredump +MAX_TOMBSTONE_FN_LEN = 85 + +TOMBSTONE_DIR = "/data/tombstones/" +APPORT_DIR = "/var/crash/" + + +def safe_fn(s): + extra = ['_'] + return "".join(c for c in s if c.isalnum() or c in extra).rstrip() + + +def sentry_report(client, fn, message, contents): + cloudlog.error({'tombstone': message}) + client.captureMessage( + message=message, + sdk={'name': 'tombstoned', 'version': '0'}, + extra={ + 'tombstone_fn': fn, + 'tombstone': contents + }, + ) + +def clear_apport_folder(): + for f in glob.glob(APPORT_DIR + '*'): + try: + os.remove(f) + except Exception: + pass + + +def get_apport_stacktrace(fn): + try: + cmd = f'apport-retrace -s <(cat <(echo "Package: openpilot") "{fn}")' + return subprocess.check_output(cmd, shell=True, encoding='utf8', timeout=30, executable='/bin/bash') # pylint: disable=unexpected-keyword-arg + except subprocess.CalledProcessError: + return "Error getting stacktrace" + except subprocess.TimeoutExpired: + return "Timeout getting stacktrace" + def get_tombstones(): """Returns list of (filename, ctime) for all tombstones in /data/tombstones and apport crashlogs in /var/crash""" files = [] - for folder in ["/data/tombstones/", "/var/crash/"]: + for folder in [TOMBSTONE_DIR, APPORT_DIR]: if os.path.exists(folder): with os.scandir(folder) as d: @@ -30,7 +77,7 @@ def get_tombstones(): return files -def report_tombstone(fn, client): +def report_tombstone_android(fn, client): f_size = os.path.getsize(fn) if f_size > MAX_SIZE: cloudlog.error(f"Tombstone {fn} too big, {f_size}. Skipping...") @@ -39,41 +86,99 @@ def report_tombstone(fn, client): with open(fn, encoding='ISO-8859-1') as f: contents = f.read() - # Get summary for sentry title - if fn.endswith(".crash"): - lines = contents.split('\n') - message = lines[6] + message = " ".join(contents.split('\n')[5:7]) - status_idx = contents.find('ProcStatus') - if status_idx >= 0: - lines = contents[status_idx:].split('\n') - message += " " + lines[1] - else: - message = " ".join(contents.split('\n')[5:7]) + # Cut off pid/tid, since that varies per run + name_idx = message.find('name') + if name_idx >= 0: + message = message[name_idx:] - # Cut off pid/tid, since that varies per run - name_idx = message.find('name') - if name_idx >= 0: - message = message[name_idx:] + # Cut off fault addr + fault_idx = message.find(', fault addr') + if fault_idx >= 0: + message = message[:fault_idx] - # Cut off fault addr - fault_idx = message.find(', fault addr') - if fault_idx >= 0: - message = message[:fault_idx] + sentry_report(client, fn, message, contents) - cloudlog.error({'tombstone': message}) - client.captureMessage( - message=message, - sdk={'name': 'tombstoned', 'version': '0'}, - extra={ - 'tombstone_fn': fn, - 'tombstone': contents - }, - ) +def report_tombstone_apport(fn, client): + f_size = os.path.getsize(fn) + if f_size > MAX_SIZE: + cloudlog.error(f"Tombstone {fn} too big, {f_size}. Skipping...") + return + + message = "" # One line description of the crash + contents = "" # Full file contents without coredump + path = "" # File path relative to openpilot directory + + proc_maps = False + + with open(fn) as f: + for line in f: + if "CoreDump" in line: + break + elif "ProcMaps" in line: + proc_maps = True + elif "ProcStatus" in line: + proc_maps = False + + if not proc_maps: + contents += line + + if "ExecutablePath" in line: + path = line.strip().split(': ')[-1] + path = path.replace('/data/openpilot/', '') + message += path + elif "Signal" in line: + message += " - " + line.strip() + + try: + sig_num = int(line.strip().split(': ')[-1]) + message += " (" + signal.Signals(sig_num).name + ")" # pylint: disable=no-member + except ValueError: + pass + + stacktrace = get_apport_stacktrace(fn) + stacktrace_s = stacktrace.split('\n') + crash_function = "No stacktrace" + + if len(stacktrace_s) > 2: + found = False + + # Try to find first entry in openpilot, fall back to first line + for line in stacktrace_s: + if "at selfdrive/" in line: + crash_function = line + found = True + break + + if not found: + crash_function = stacktrace_s[1] + + # Remove arguments that can contain pointers to make sentry one-liner unique + crash_function = re.sub(r'\(.*?\)', '', crash_function) + + contents = stacktrace + "\n\n" + contents + message = message + " - " + crash_function + sentry_report(client, fn, message, contents) + + # Copy crashlog to upload folder + clean_path = path.replace('/', '_') + date = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") + + new_fn = f"{date}_{commit[:8]}_{safe_fn(clean_path)}"[:MAX_TOMBSTONE_FN_LEN] + + crashlog_dir = os.path.join(ROOT, "crash") + mkdirs_exists_ok(crashlog_dir) + + # Files could be on different filesystems, copy, then delete + shutil.copy(fn, os.path.join(crashlog_dir, new_fn)) + os.remove(fn) def main(): + # TODO: turn on when all tombstones are recovered + # clear_apport_folder() # Clear apport folder on start, otherwise duplicate crashes won't register initial_tombstones = set(get_tombstones()) tags = { @@ -91,7 +196,10 @@ def main(): for fn, _ in (now_tombstones - initial_tombstones): try: cloudlog.info(f"reporting new tombstone {fn}") - report_tombstone(fn, client) + if fn.endswith(".crash"): + report_tombstone_apport(fn, client) + else: + report_tombstone_android(fn, client) except Exception: cloudlog.exception(f"Error reporting tombstone {fn}") diff --git a/selfdrive/version.py b/selfdrive/version.py index ef975f89..b0684183 100644 --- a/selfdrive/version.py +++ b/selfdrive/version.py @@ -52,6 +52,7 @@ comma_remote: bool = False tested_branch: bool = False origin = get_git_remote() branch = get_git_full_branchname() +commit = get_git_commit() if (origin is not None) and (branch is not None): try: @@ -74,7 +75,7 @@ if (origin is not None) and (branch is not None): try: dirty_files = run_cmd(["git", "diff-index", branch, "--"]) cloudlog.event("dirty comma branch", version=version, dirty=dirty, origin=origin, branch=branch, - dirty_files=dirty_files, commit=get_git_commit(), origin_commit=get_git_commit(branch)) + dirty_files=dirty_files, commit=commit, origin_commit=get_git_commit(branch)) except subprocess.CalledProcessError: pass