# pylint: skip-file
import os
import time
import tempfile
import threading
import urllib.parse
import pycurl
from hashlib import sha256
from io import BytesIO
from tenacity import retry, wait_random_exponential, stop_after_attempt

from common.file_helpers import mkdirs_exists_ok, atomic_write_in_dir

# Cache chunk size: 1 MB
K = 1000
CHUNK_SIZE = 1000 * K

CACHE_DIR = os.environ.get("COMMA_CACHE", "/tmp/comma_download_cache/")


def hash_256(link):
  # Cache key: the URL without its query string, hashed with SHA-256
  return sha256(link.split("?")[0].encode('utf-8')).hexdigest()


class URLFile:
  """A read-only, seekable file-like object backed by a remote URL.

  Reads are issued as HTTP range requests via pycurl. When caching is
  enabled, data is stored on disk in CHUNK_SIZE-aligned chunks under
  CACHE_DIR so repeated reads avoid re-downloading.
  """
  # One pycurl handle per thread; pycurl handles are not thread-safe
  _tlocal = threading.local()

  def __init__(self, url, debug=False, cache=None):
    self._url = url
    self._pos = 0
    self._length = None
    self._local_file = None
    self._debug = debug
    # Downloads are forced (cache bypassed) by default; set FILEREADER_CACHE=1
    # to enable caching. The cache argument, if given, overrides the environment.
    self._force_download = not int(os.environ.get("FILEREADER_CACHE", "0"))
    if cache is not None:
      self._force_download = not cache

    try:
      self._curl = self._tlocal.curl
    except AttributeError:
      self._curl = self._tlocal.curl = pycurl.Curl()
    mkdirs_exists_ok(CACHE_DIR)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    if self._local_file is not None:
      name = self._local_file.name
      self._local_file.close()
      os.remove(name)
      self._local_file = None

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def get_length_online(self):
    # HEAD-style request (NOBODY): fetch only the Content-Length header
    c = self._curl
    c.reset()
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.URL, self._url)
    c.setopt(c.NOBODY, 1)
    c.perform()
    length = int(c.getinfo(c.CONTENT_LENGTH_DOWNLOAD))
    c.reset()
    return length

  def get_length(self):
    if self._length is not None:
      return self._length

    # The length is cached on disk alongside the data chunks
    file_length_path = os.path.join(CACHE_DIR, hash_256(self._url) + "_length")
    if os.path.exists(file_length_path) and not self._force_download:
      with open(file_length_path) as file_length:
        self._length = int(file_length.read())
      return self._length

    self._length = self.get_length_online()
    if not self._force_download:
      with atomic_write_in_dir(file_length_path, mode="w") as file_length:
        file_length.write(str(self._length))
    return self._length

  def read(self, ll=None):
    if self._force_download:
      return self.read_aux(ll=ll)

    file_begin = self._pos
    # Clamp the requested end to the file length so reads past EOF behave
    file_end = min(self._pos + ll, self.get_length()) if ll is not None else self.get_length()
    # Align to the cached chunk grid: start from the beginning of the latest
    # chunk that starts at or before our current file position
    position = (file_begin // CHUNK_SIZE) * CHUNK_SIZE
    response = b""
    while True:
      self._pos = position
      chunk_number = self._pos // CHUNK_SIZE  # integer chunk index, used in the cache file name
      file_name = hash_256(self._url) + "_" + str(chunk_number)
      full_path = os.path.join(CACHE_DIR, file_name)
      data = None
      # Download the chunk if it isn't cached yet
      if not os.path.exists(full_path):
        data = self.read_aux(ll=CHUNK_SIZE)
        with atomic_write_in_dir(full_path, mode="wb") as new_cached_file:
          new_cached_file.write(data)
      else:
        with open(full_path, "rb") as cached_file:
          data = cached_file.read()

      response += data[max(0, file_begin - position):min(CHUNK_SIZE, file_end - position)]
      position += CHUNK_SIZE
      if position >= file_end:
        self._pos = file_end
        return response

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def read_aux(self, ll=None):
    download_range = False
    headers = ["Connection: keep-alive"]
    if self._pos != 0 or ll is not None:
      if ll is None:
        end = self.get_length() - 1
      else:
        end = min(self._pos + ll, self.get_length()) - 1
      if self._pos > end:  # nothing left to request (end is inclusive)
        return b""
      headers.append(f"Range: bytes={self._pos}-{end}")
      download_range = True

    dats = BytesIO()
    c = self._curl
    c.setopt(pycurl.URL, self._url)
    c.setopt(pycurl.WRITEDATA, dats)
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.FOLLOWLOCATION, True)

    if self._debug:
      print("downloading", self._url)

      def header(x):
        if b'MISS' in x:
          print(x.strip())

      c.setopt(pycurl.HEADERFUNCTION, header)

      def test(debug_type, debug_msg):
        print("  debug(%d): %s" % (debug_type, debug_msg.strip()))

      c.setopt(pycurl.VERBOSE, 1)
      c.setopt(pycurl.DEBUGFUNCTION, test)

    t1 = time.time()
    c.perform()

    if self._debug:
      t2 = time.time()
      if t2 - t1 > 0.1:
        print(f"get {self._url} {headers!r} {t2 - t1:.3f} slow")

    response_code = c.getinfo(pycurl.RESPONSE_CODE)
    if response_code == 416:  # Requested Range Not Satisfiable
      raise Exception(f"Error, range out of bounds {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if download_range and response_code != 206:  # Partial Content
      raise Exception(f"Error, requested range but got unexpected response {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if (not download_range) and response_code != 200:  # OK
      raise Exception(f"Error {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")

    ret = dats.getvalue()
    self._pos += len(ret)
    return ret

  def seek(self, pos):
    self._pos = pos

  @property
  def name(self):
    """Returns a local path to a file with the URLFile's contents.

    This can be used to interface with modules that require local files.
    """
    if self._local_file is None:
      _, ext = os.path.splitext(urllib.parse.urlparse(self._url).path)
      local_fd, local_path = tempfile.mkstemp(suffix=ext)
      try:
        os.write(local_fd, self.read())
        local_file = open(local_path, "rb")
      except Exception:
        os.remove(local_path)
        raise
      finally:
        os.close(local_fd)

      self._local_file = local_file
      # From now on, reads and seeks go through the materialized local file
      self.read = self._local_file.read
      self.seek = self._local_file.seek

    return self._local_file.name
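

if __name__ == "__main__":
  # Minimal usage sketch of the cached read path. The URL below is a
  # placeholder for illustration; any HTTP(S) server that honors Range
  # requests will work. With cache=True, chunks land under CACHE_DIR.
  example_url = "https://example.com/some_large_file.bin"
  with URLFile(example_url, cache=True) as f:
    print("length:", f.get_length())  # HEAD request, or the cached length file
    f.seek(1024)
    data = f.read(16)                 # served from CHUNK_SIZE-aligned cache chunks
    print("16 bytes at offset 1024:", data.hex())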