nopenpilot/tools/lib/url_file.py

# pylint: skip-file

import os
import time
import tempfile
import threading
import urllib.parse
import pycurl
from hashlib import sha256
from io import BytesIO
from tenacity import retry, wait_random_exponential, stop_after_attempt
from common.file_helpers import mkdirs_exists_ok, atomic_write_in_dir
#  Cache chunk size
K = 1000
CHUNK_SIZE = 1000 * K

CACHE_DIR = os.environ.get("COMMA_CACHE", "/tmp/comma_download_cache/")


def hash_256(link):
  hsh = str(sha256((link.split("?")[0]).encode('utf-8')).hexdigest())
  return hsh


class URLFile:
  _tlocal = threading.local()

  def __init__(self, url, debug=False, cache=None):
    self._url = url
    self._pos = 0
    self._length = None
    self._local_file = None
    self._debug = debug
    #  True by default, false if FILEREADER_CACHE is defined, but can be overwritten by the cache input
    self._force_download = not int(os.environ.get("FILEREADER_CACHE", "0"))
    if cache is not None:
      self._force_download = not cache

    try:
      self._curl = self._tlocal.curl
    except AttributeError:
      self._curl = self._tlocal.curl = pycurl.Curl()
    mkdirs_exists_ok(CACHE_DIR)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    if self._local_file is not None:
      os.remove(self._local_file.name)
      self._local_file.close()
      self._local_file = None

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def get_length_online(self):
    c = self._curl
    c.reset()
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.URL, self._url)
    c.setopt(c.NOBODY, 1)
    c.perform()
    length = int(c.getinfo(c.CONTENT_LENGTH_DOWNLOAD))
    c.reset()
    return length

  def get_length(self):
    if self._length is not None:
      return self._length
    file_length_path = os.path.join(CACHE_DIR, hash_256(self._url) + "_length")
    if os.path.exists(file_length_path) and not self._force_download:
      with open(file_length_path) as file_length:
          content = file_length.read()
          self._length = int(content)
          return self._length

    self._length = self.get_length_online()
    if not self._force_download:
      with atomic_write_in_dir(file_length_path, mode="w") as file_length:
        file_length.write(str(self._length))
    return self._length

  def read(self, ll=None):
    if self._force_download:
      return self.read_aux(ll=ll)

    file_begin = self._pos
    file_end = self._pos + ll if ll is not None else self.get_length()
    #  We have to align with chunks we store. Position is the begginiing of the latest chunk that starts before or at our file
    position = (file_begin // CHUNK_SIZE) * CHUNK_SIZE
    response = b""
    while True:
      self._pos = position
      chunk_number = self._pos / CHUNK_SIZE
      file_name = hash_256(self._url) + "_" + str(chunk_number)
      full_path = os.path.join(CACHE_DIR, str(file_name))
      data = None
      #  If we don't have a file, download it
      if not os.path.exists(full_path):
        data = self.read_aux(ll=CHUNK_SIZE)
        with atomic_write_in_dir(full_path, mode="wb") as new_cached_file:
          new_cached_file.write(data)
      else:
        with open(full_path, "rb") as cached_file:
          data = cached_file.read()

      response += data[max(0, file_begin - position): min(CHUNK_SIZE, file_end - position)]

      position += CHUNK_SIZE
      if position >= file_end:
        self._pos = file_end
        return response

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def read_aux(self, ll=None):
    download_range = False
    headers = ["Connection: keep-alive"]
    if self._pos != 0 or ll is not None:
      if ll is None:
        end = self.get_length() - 1
      else:
        end = min(self._pos + ll, self.get_length()) - 1
      if self._pos >= end:
        return b""
      headers.append(f"Range: bytes={self._pos}-{end}")
      download_range = True

    dats = BytesIO()
    c = self._curl
    c.setopt(pycurl.URL, self._url)
    c.setopt(pycurl.WRITEDATA, dats)
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.FOLLOWLOCATION, True)

    if self._debug:
      print("downloading", self._url)

      def header(x):
        if b'MISS' in x:
          print(x.strip())

      c.setopt(pycurl.HEADERFUNCTION, header)

      def test(debug_type, debug_msg):
       print("  debug(%d): %s" % (debug_type, debug_msg.strip()))

      c.setopt(pycurl.VERBOSE, 1)
      c.setopt(pycurl.DEBUGFUNCTION, test)
      t1 = time.time()

    c.perform()

    if self._debug:
      t2 = time.time()
      if t2 - t1 > 0.1:
        print(f"get {self._url} {headers!r} {t2 - t1:.f} slow")

    response_code = c.getinfo(pycurl.RESPONSE_CODE)
    if response_code == 416:  # Requested Range Not Satisfiable
      raise Exception(f"Error, range out of bounds {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if download_range and response_code != 206:  # Partial Content
      raise Exception(f"Error, requested range but got unexpected response {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if (not download_range) and response_code != 200:  # OK
      raise Exception(f"Error {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")

    ret = dats.getvalue()
    self._pos += len(ret)
    return ret

  def seek(self, pos):
    self._pos = pos

  @property
  def name(self):
    """Returns a local path to file with the URLFile's contents.

       This can be used to interface with modules that require local files.
    """
    if self._local_file is None:
      _, ext = os.path.splitext(urllib.parse.urlparse(self._url).path)
      local_fd, local_path = tempfile.mkstemp(suffix=ext)
      try:
        os.write(local_fd, self.read())
        local_file = open(local_path, "rb")
      except Exception:
        os.remove(local_path)
        raise
      finally:
        os.close(local_fd)

      self._local_file = local_file
      self.read = self._local_file.read
      self.seek = self._local_file.seek

    return self._local_file.name