// pytorch/binaries/benchmark_helper.cc
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <psapi.h>
#endif
#include <binaries/benchmark_helper.h>
#include "caffe2/core/blob_serialization.h"
#ifdef __CUDA_ARCH__
#include "caffe2/core/context_gpu.h"
#endif
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/utils/bench_utils.h"
#include "caffe2/utils/string_utils.h"
#include <observers/net_observer_reporter_print.h>
#include <observers/observer_config.h>
#include <observers/perf_observer.h>
#if defined(TARGET_OS_MAC) || \
    defined(TARGET_OS_IPHONE) || \
    defined(TARGET_IPHONE_SIMULATOR)
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
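
// Installs a PerfNetObserver on every net created in this process and routes
// its measurements through the printing reporter.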
void observerConfig() {
  caffe2::ClearGlobalNetObservers();
  caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) {
    return std::make_unique<caffe2::PerfNetObserver>(subject);
  });
  caffe2::ObserverConfig::setReporter(
      std::make_unique<caffe2::NetObserverReporterPrint>());
}
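
// Returns true iff the requested backend is "cuda"; throws if CUDA support is
// unavailable (not compiled in, or no GPU on the host).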
bool backendCudaSet(const string& backend) {
  bool run_on_gpu = false;
  if (backend == "cuda") {
#ifdef __CUDA_ARCH__
    if (caffe2::HasCudaGPU()) {
      run_on_gpu = true;
    } else {
      CAFFE_THROW("No GPU support on this host machine");
    }
#else
    CAFFE_THROW("No GPU support");
#endif
  }
  return run_on_gpu;
}
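
// Stamps every operator in the net with the given device type.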
void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) {
  for (int j = 0; j < net_def->op_size(); j++) {
    caffe2::OperatorDef* op = net_def->mutable_op(j);
    op->mutable_device_option()->set_device_type(caffe2::TypeToProto(run_dev));
  }
}
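
// Maps the --backend flag to an operator engine name and applies it to every
// operator in the net; "builtin" leaves the engines untouched.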
void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
  if (backend != "builtin") {
    string engine =
        backend == "nnpack"          ? "NNPACK"
        : backend == "eigen"         ? "EIGEN"
        : backend == "mkl"           ? "MKLDNN"
        : backend == "cuda"          ? "CUDA"
        : backend == "dnnlowp"       ? "DNNLOWP"
        : backend == "dnnlowp_acc16" ? "DNNLOWP_ACC16"
        : backend == "default"       ? ""
                                     : "NONE";
    CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
    for (int i = 0; i < net_def->op_size(); i++) {
      caffe2::OperatorDef* op_def = net_def->mutable_op(i);
      op_def->set_engine(engine);
    }
  }
}
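
// Prepares the input blobs in the workspace. Inputs come either from
// serialized TensorProtos files (--input_file) or as empty tensors shaped by
// --input_dims and typed by --input_type. Returns the number of entries per
// input blob (1 when inputs are synthesized from dims/type).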
int loadInput(
    shared_ptr<caffe2::Workspace> workspace,
    const bool run_on_gpu,
    map<string, caffe2::TensorProtos>& tensor_protos_map,
    const string& input,
    const string& input_file,
    const string& input_dims,
    const string& input_type) {
  // How many entries are in each input blob.
  int blob_num = 1;
  // Load input.
  if (input.size()) {
    vector<string> input_names = caffe2::split(',', input);
    if (input_file.size()) {
      vector<string> input_files = caffe2::split(',', input_file);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_files.size(),
          "Input names and files must have the same number of items.");
      for (size_t i = 0; i < input_names.size(); ++i) {
        caffe2::TensorProtos tensor_protos;
        CAFFE_ENFORCE(
            caffe2::ReadProtoFromFile(input_files[i], &tensor_protos));
        workspace->CreateBlob(input_names[i]);
        tensor_protos_map.insert(
            std::make_pair(input_names[i], tensor_protos));
      }
      // Check that all blobs have the same number of entries.
      blob_num = tensor_protos_map[input_names[0]].protos_size();
      for (size_t i = 1; i < input_names.size(); ++i) {
        int bnum = tensor_protos_map[input_names[i]].protos_size();
        CAFFE_ENFORCE_EQ(
            blob_num,
            bnum,
            "Number of blobs is not the same for all inputs");
      }
    } else if (input_dims.size() || input_type.size()) {
      CAFFE_ENFORCE_GT(
          input_dims.size(),
          0,
          "Input dims must be specified when input tensors are used.");
      CAFFE_ENFORCE_GT(
          input_type.size(),
          0,
          "Input type must be specified when input tensors are used.");
      vector<string> input_dims_list = caffe2::split(';', input_dims);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_dims_list.size(),
          "Input name and dims should have the same number of items.");
      vector<string> input_type_list = caffe2::split(';', input_type);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_type_list.size(),
          "Input name and type should have the same number of items.");
      for (size_t i = 0; i < input_names.size(); ++i) {
        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
        vector<int> input_dims;
        for (const string& s : input_dims_str) {
          input_dims.push_back(c10::stoi(s));
        }
        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
        if (blob == nullptr) {
          blob = workspace->CreateBlob(input_names[i]);
        }
        if (run_on_gpu) {
          LOG(INFO) << "Running on GPU.";
#ifdef __CUDA_ARCH__
          caffe2::TensorCUDA* tensor = blob->GetMutable<caffe2::TensorCUDA>();
          TORCH_CHECK_NOTNULL(tensor);
          tensor->Resize(input_dims);
          if (input_type_list[i] == "uint8_t") {
            tensor->mutable_data<uint8_t>();
          } else if (input_type_list[i] == "float") {
            tensor->mutable_data<float>();
          } else {
            CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
          }
#else
          CAFFE_THROW("GPU is not supported on mobile.");
#endif
        } else {
          if (input_type_list[i] == "uint8_t") {
            caffe2::int8::Int8TensorCPU* tensor =
                blob->GetMutable<caffe2::int8::Int8TensorCPU>();
            TORCH_CHECK_NOTNULL(tensor);
            tensor->t.Resize(input_dims);
            tensor->t.mutable_data<uint8_t>();
          } else if (input_type_list[i] == "float") {
            caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
            TORCH_CHECK_NOTNULL(tensor);
            tensor->Resize(input_dims);
            tensor->mutable_data<float>();
          } else if (input_type_list[i] == "int") {
            caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
            TORCH_CHECK_NOTNULL(tensor);
            tensor->Resize(input_dims);
            tensor->mutable_data<int>();
          } else {
            CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
          }
        }
      }
    } else {
      CAFFE_THROW(
          "You requested input tensors, but neither input_file nor "
          "input_dims is set.");
    }
  }
  return blob_num;
}
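
// Refreshes the input blobs before each run by deserializing the proto for
// the current iteration, cycling through the available entries per blob.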
void fillInputBlob(
    shared_ptr<caffe2::Workspace> workspace,
    map<string, caffe2::TensorProtos>& tensor_protos_map,
    int iteration) {
  if (tensor_protos_map.empty()) {
    return;
  }
  static caffe2::TensorDeserializer deserializer;
  for (auto& tensor_kv : tensor_protos_map) {
    caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first);
    if (blob == nullptr) {
      blob = workspace->CreateBlob(tensor_kv.first);
    }
    // TODO: support GPU and make this function a template.
    int protos_size = tensor_kv.second.protos_size();
    if (protos_size == 1 && iteration > 0) {
      // Do not override the input data if there is only one entry, since
      // doing so would clear all caches. Rely on wipe_cache to clear caches.
      continue;
    }
    caffe2::TensorProto* tensor_proto =
        tensor_kv.second.mutable_protos(iteration % protos_size);
    BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto));
    // TODO: handle other types.
  }
}
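
// Drives the benchmark: warmup runs, timed main runs (whole-net latency,
// averaged over --iter), and optional per-operator runs, with optional cache
// wipes and sleeps between phases.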
void runNetwork(
    shared_ptr<caffe2::Workspace> workspace,
    caffe2::NetBase* net,
    map<string, caffe2::TensorProtos>& tensor_protos_map,
    const bool wipe_cache,
    const bool run_individual,
    const bool run_on_gpu,
    const bool text_output,
    const int warmup,
    const int iter,
    const int num_blobs,
    const int sleep_before_run,
    const int sleep_between_iteration,
    const int sleep_between_net_and_operator,
    const std::string& output,
    const std::string& output_folder) {
  LOG(INFO) << "Starting benchmark.";
  caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
  LOG(INFO) << "Running warmup runs.";
  for (int i = 0; i < warmup; ++i) {
    fillInputBlob(workspace, tensor_protos_map, i);
    CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
  }
  if (wipe_cache) {
    caffe2::wipe_cache();
  }
  if (sleep_before_run > 0) {
    std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run));
  }
  LOG(INFO) << "Main runs.";
  CAFFE_ENFORCE(
      iter >= 0,
      "Number of main runs should be non-negative, provided ",
      iter,
      ".");
  LOG(INFO) << "net runs.";
  long long duration_sum = 0;
  for (int i = 0; i < iter; ++i) {
    caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup);
    fillInputBlob(workspace, tensor_protos_map, i);
    if (wipe_cache) {
      caffe2::wipe_cache();
    }
    auto start = std::chrono::high_resolution_clock::now();
    CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
    auto stop = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    duration_sum += duration.count();
    // Write the output for the first num_blobs iterations.
    writeOutput(
        workspace,
        run_on_gpu,
        output,
        output_folder,
        text_output,
        i,
        num_blobs);
    if (wipe_cache) {
      caffe2::wipe_cache();
    }
    if (sleep_between_iteration > 0) {
      std::this_thread::sleep_for(
          std::chrono::seconds(sleep_between_iteration));
    }
  }
  // Guard against division by zero when no main runs were requested.
  if (iter > 0) {
    std::cout << "Average Duration: " << (duration_sum / iter) << " us"
              << std::endl;
  }
  if (run_individual) {
    LOG(INFO) << "operator runs.";
    if (sleep_between_net_and_operator > 0) {
      std::this_thread::sleep_for(
          std::chrono::seconds(sleep_between_net_and_operator));
    }
    for (int i = 0; i < iter; ++i) {
      caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup);
      fillInputBlob(workspace, tensor_protos_map, i);
      CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
      if (wipe_cache) {
        caffe2::wipe_cache();
      }
      if (sleep_between_iteration > 0) {
        std::this_thread::sleep_for(
            std::chrono::seconds(sleep_between_iteration));
      }
    }
  }
}
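
// Dumps the requested output blobs ("*" for all) either as text files or as
// serialized blobs under output_folder.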
void writeOutput(
    shared_ptr<caffe2::Workspace> workspace,
    const bool run_on_gpu,
    const string& output,
    const string& output_folder,
    const bool text_output,
    const int index,
    const int num_blobs) {
  if (output.size() == 0) {
    return;
  }
  string output_prefix = output_folder.size() ? output_folder + "/" : "";
  vector<string> output_names = caffe2::split(',', output);
  if (output == "*") {
    output_names = workspace->Blobs();
  }
  for (const string& name : output_names) {
    CAFFE_ENFORCE(
        workspace->HasBlob(name),
        "You requested a non-existing blob: ",
        name);
    if (text_output) {
      if (run_on_gpu) {
#ifdef __CUDA_ARCH__
        writeTextOutput<caffe2::CUDAContext, caffe2::TensorCUDA>(
            workspace->GetBlob(name)->GetMutable<caffe2::TensorCUDA>(),
            output_prefix,
            name,
            index,
            num_blobs);
#else
        CAFFE_THROW("GPU is not supported.");
#endif
      } else {
        writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
            BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU),
            output_prefix,
            name,
            index,
            num_blobs);
      }
    } else {
      // Multiple entries per blob are not supported in binary output.
      CAFFE_ENFORCE(
          index == 0,
          "Binary file only supports one output.");
      string serialized = SerializeBlob(*workspace->GetBlob(name), name);
      string output_filename = output_prefix + name;
      caffe2::WriteStringToFile(serialized, output_filename.c_str());
    }
  }
}
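
// Emits one benchmark datum in the same JSON-style line format used by the
// net observer reporter, so downstream parsers can pick it up.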
void logBenchmarkResult(
    const std::string& type,
    const std::string& metric,
    const std::string& unit,
    const int value) {
  LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
            << "\"type\": \"" << type << "\", "
            << "\"metric\": \"" << metric << "\", "
            << "\"unit\": \"" << unit << "\", "
            << "\"value\": " << c10::to_string(value) << "}\n";
}
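
// Returns the allocator's view of memory currently in use (platform
// specific: malloc zone stats on Apple, private usage on Windows, mallinfo
// elsewhere); returns 0 when memory measurement is disabled.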
long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
  if (FLAGS_measure_memory) {
#if defined(TARGET_OS_IPHONE) || \
    defined(TARGET_OS_MAC) || \
    defined(TARGET_IPHONE_SIMULATOR)
    malloc_statistics_t stats = {0};
    malloc_zone_statistics(nullptr, &stats);
    return stats.size_allocated;
#elif defined(_WIN32)
    PROCESS_MEMORY_COUNTERS_EX pmc;
    GetProcessMemoryInfo(
        GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*)&pmc, sizeof(pmc));
    return pmc.PrivateUsage;
#else
    struct mallinfo info = mallinfo();
    return info.uordblks;
#endif
  }
  return 0;
}
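
// Entry point used by the benchmark binaries: validates the flag-specified
// files, runs the init net, loads inputs, runs the predict net, and
// optionally reports memory usage. As a rough sketch (flag names assumed to
// match the caller, e.g. binaries/benchmark.cc), an invocation looks like:
//
//   caffe2_benchmark --init_net init_net.pb --net predict_net.pb \
//       --input data --input_dims 1,3,224,224 --input_type float \
//       --warmup 10 --iter 50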
int benchmark(
    int argc,
    char* argv[],
    const string& FLAGS_backend,
    const string& FLAGS_init_net,
    const string& FLAGS_input,
    const string& FLAGS_input_dims,
    const string& FLAGS_input_file,
    const string& FLAGS_input_type,
    int FLAGS_iter,
    bool FLAGS_measure_memory,
    const string& FLAGS_net,
    const string& FLAGS_output,
    const string& FLAGS_output_folder,
    bool FLAGS_run_individual,
    int FLAGS_sleep_before_run,
    int FLAGS_sleep_between_iteration,
    int FLAGS_sleep_between_net_and_operator,
    bool FLAGS_text_output,
    int FLAGS_warmup,
    bool FLAGS_wipe_cache) {
  // Check that the arguments are valid.
  {
    // Need to check whether the files exist, as the file reader does not
    // assert if a file does not exist.
    std::ifstream net_file(FLAGS_net);
    CAFFE_ENFORCE(net_file.good(), "Cannot open net file: ", FLAGS_net);
    net_file.close();
    std::ifstream init_net_file(FLAGS_init_net);
    CAFFE_ENFORCE(
        init_net_file.good(), "Cannot open init net file: ", FLAGS_init_net);
    init_net_file.close();
    if (FLAGS_input_file.size() > 0) {
      vector<string> input_files = caffe2::split(',', FLAGS_input_file);
      for (const auto& input_file : input_files) {
        std::ifstream ifile(input_file);
        CAFFE_ENFORCE(ifile.good(), "Cannot open input file: ", input_file);
        ifile.close();
      }
    }
  }

  observerConfig();
  caffe2::ShowLogInfoToStderr();

  auto workspace = std::make_shared<caffe2::Workspace>();
  bool run_on_gpu = backendCudaSet(FLAGS_backend);

  // Run the initialization network and measure the resources it uses.
  long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
  caffe2::NetDef init_net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
  setOperatorEngine(&init_net_def, FLAGS_backend);
  CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
  init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;

  map<string, caffe2::TensorProtos> tensor_protos_map;
  int num_blobs = loadInput(
      workspace,
      run_on_gpu,
      tensor_protos_map,
      FLAGS_input,
      FLAGS_input_file,
      FLAGS_input_dims,
      FLAGS_input_type);

  // Run the main network.
  long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
  caffe2::NetDef net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
  setOperatorEngine(&net_def, FLAGS_backend);
  if (!net_def.has_name()) {
    net_def.set_name("benchmark");
  }
  caffe2::NetBase* net = workspace->CreateNet(net_def);
  TORCH_CHECK_NOTNULL(net);
  runNetwork(
      workspace,
      net,
      tensor_protos_map,
      FLAGS_wipe_cache,
      FLAGS_run_individual,
      run_on_gpu,
      FLAGS_text_output,
      FLAGS_warmup,
      FLAGS_iter,
      num_blobs,
      FLAGS_sleep_before_run,
      FLAGS_sleep_between_iteration,
      FLAGS_sleep_between_net_and_operator,
      FLAGS_output,
      FLAGS_output_folder);
  predict_vmem =
      getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - predict_vmem;
  if (FLAGS_measure_memory) {
    logBenchmarkResult(
        "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
  }
  return 0;
}