adding a separate file for GPU

soumyadipghosh · soumyadipghosh · commit a0bfb9f9678f · 2020-12-28T19:19:47.000-05:00
diff --git a/cpp/distributed/dist-mnist-gpu.cpp b/cpp/distributed/dist-mnist-gpu.cpp
@@ -0,0 +1,285 @@
+#include <cuda.h>
+#include <stdint.h>
+#include <torch/torch.h>
+#include <unistd.h>
+#include <iostream>
+#include "cuda_runtime.h"
+#include "mpi.h"
+#include "nccl.h"
+
+std::map<at::ScalarType, MPI_Datatype> mpiDatatype = {
+    {at::kByte, MPI_UNSIGNED_CHAR},
+    {at::kChar, MPI_CHAR},
+    {at::kDouble, MPI_DOUBLE},
+    {at::kFloat, MPI_FLOAT},
+    {at::kInt, MPI_INT},
+    {at::kLong, MPI_LONG},
+    {at::kShort, MPI_SHORT},
+};
+
+static uint64_t getHostHash(const char* string) {
+  // Based on DJB2, result = result * 33 + char
+  uint64_t result = 5381;
+  for (int c = 0; string[c] != '\0'; c++) {
+    result = ((result << 5) + result) + string[c];
+  }
+  return result;
+}
+
+static void getHostName(char* hostname, int maxlen) {
+  gethostname(hostname, maxlen);
+  for (int i = 0; i < maxlen; i++) {
+    if (hostname[i] == '.') {
+      hostname[i] = '\0';
+      return;
+    }
+  }
+}
+
+// Define a Convolutional Module
+struct Model : torch::nn::Module {
+  Model()
+      : conv1(torch::nn::Conv2dOptions(1, 10, 5)),
+        conv2(torch::nn::Conv2dOptions(10, 20, 5)),
+        fc1(320, 50),
+        fc2(50, 10) {
+    register_module("conv1", conv1);
+    register_module("conv2", conv2);
+    register_module("conv2_drop", conv2_drop);
+    register_module("fc1", fc1);
+    register_module("fc2", fc2);
+  }
+
+  torch::Tensor forward(torch::Tensor x) {
+    x = torch::relu(torch::max_pool2d(conv1->forward(x), 2));
+    x = torch::relu(
+        torch::max_pool2d(conv2_drop->forward(conv2->forward(x)), 2));
+    x = x.view({-1, 320});
+    x = torch::relu(fc1->forward(x));
+    x = torch::dropout(x, 0.5, is_training());
+    x = fc2->forward(x);
+    return torch::log_softmax(x, 1);
+  }
+
+  torch::nn::Conv2d conv1;
+  torch::nn::Conv2d conv2;
+  torch::nn::Dropout2d conv2_drop;
+  torch::nn::Linear fc1;
+  torch::nn::Linear fc2;
+};
+
+int main(int argc, char* argv[]) {
+  // MPI variables
+  int rank, numranks, localRank;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &numranks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  // end MPI variables
+
+  torch::Device device = (torch::kCPU);
+  if (torch::cuda::is_available()) {
+    std::cout << "CUDA is available! Training on GPU." << std::endl;
+    device = torch::kCUDA;
+  } else {
+    std::cout << "CUDA not available. Training on CPU." << std::endl;
+  }
+
+  // Calculating localRank based on hostname which is used in
+  // selecting a GPU
+  uint64_t hostHashs[numranks];
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  hostHashs[rank] = getHostHash(hostname);
+  MPI_Allgather(
+      MPI_IN_PLACE,
+      0,
+      MPI_DATATYPE_NULL,
+      hostHashs,
+      sizeof(uint64_t),
+      MPI_BYTE,
+      MPI_COMM_WORLD);
+  for (int p = 0; p < numranks; p++) {
+    if (p == rank)
+      break;
+    if (hostHashs[p] == hostHashs[rank])
+      localRank++;
+  }
+
+  ncclUniqueId id;
+  ncclComm_t comm;
+  float *sendbuff, *recvbuff;
+  cudaStream_t s;
+
+  // get NCCL unique ID at rank 0 and broadcast it to all others
+  if (rank == 0)
+    ncclGetUniqueId(&id);
+  MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+  // picking a GPU based on localRank, allocate device buffers
+  cudaSetDevice(localRank);
+  cudaStreamCreate(&s);
+
+  // initializing NCCL
+  ncclCommInitRank(&comm, numranks, id, rank);
+
+  // Timer variables
+  auto tstart = 0.0;
+  auto tend = 0.0;
+
+  // TRAINING
+  // Read train dataset
+  const char* kDataRoot = "../data";
+  auto train_dataset =
+      torch::data::datasets::MNIST(kDataRoot)
+          .map(torch::data::transforms::Normalize<>(0.1307, 0.3081))
+          .map(torch::data::transforms::Stack<>());
+
+  // Distributed Random Sampler
+  auto data_sampler = torch::data::samplers::DistributedRandomSampler(
+      train_dataset.size().value(), numranks, rank, false);
+
+  auto num_train_samples_per_proc = train_dataset.size().value() / numranks;
+
+  // Generate dataloader
+  auto total_batch_size = 64;
+  auto batch_size_per_proc =
+      total_batch_size / numranks; // effective batch size in each processor
+  auto data_loader = torch::data::make_data_loader(
+      std::move(train_dataset), data_sampler, batch_size_per_proc);
+
+  // setting manual seed
+  torch::manual_seed(0);
+
+  auto model = std::make_shared<Model>();
+  model->to(device);
+
+  auto learning_rate = 1e-2;
+
+  torch::optim::SGD optimizer(model->parameters(), learning_rate);
+
+  // Number of epochs
+  size_t num_epochs = 10;
+
+  // start timer
+  tstart = MPI_Wtime();
+
+  for (size_t epoch = 1; epoch <= num_epochs; ++epoch) {
+    size_t num_correct = 0;
+
+    for (auto& batch : *data_loader) {
+      auto ip = batch.data.to(device);
+      auto op = batch.target.squeeze().to(device);
+
+      // convert to required formats
+      ip = ip.to(torch::kF32);
+      op = op.to(torch::kLong);
+
+      // Reset gradients
+      model->zero_grad();
+
+      // Execute forward pass
+      auto prediction = model->forward(ip);
+
+      auto loss = torch::nll_loss(torch::log_softmax(prediction, 1), op);
+
+      // Backpropagation
+      loss.backward();
+
+      // Averaging the gradients of the parameters in all the processors
+      // Note: This may lag behind DistributedDataParallel (DDP) in performance
+      // since this synchronizes parameters after backward pass while DDP
+      // overlaps synchronizing parameters and computing gradients in backward
+      // pass
+
+      if (torch::cuda::is_available()) {
+        for (auto& param : model->named_parameters()) {
+          ncclAllReduce(
+              param.value().grad().data_ptr(),
+              param.value().grad().data_ptr(),
+              param.value().grad().numel(),
+              ncclFloat,
+              ncclSum,
+              comm,
+              s);
+          cudaStreamSynchronize(s);
+          param.value().grad().data() = param.value().grad().data() / numranks;
+        }
+      } else {
+        for (auto& param : model->named_parameters()) {
+          MPI_Allreduce(
+              MPI_IN_PLACE,
+              param.value().grad().data_ptr(),
+              param.value().grad().numel(),
+              mpiDatatype.at(param.value().grad().scalar_type()),
+              MPI_SUM,
+              MPI_COMM_WORLD);
+
+          param.value().grad().data() = param.value().grad().data() / numranks;
+        }
+      }
+
+      // Update parameters
+      optimizer.step();
+
+      auto guess = prediction.argmax(1);
+      num_correct += torch::sum(guess.eq_(op)).item<int64_t>();
+    } // end batch loader
+
+    auto accuracy = 100.0 * num_correct / num_train_samples_per_proc;
+
+    std::cout << "Accuracy in rank " << rank << " in epoch " << epoch << " - "
+              << accuracy << std::endl;
+
+  } // end epoch
+
+  // end timer
+  tend = MPI_Wtime();
+  if (rank == 0) {
+    std::cout << "Training time - " << (tend - tstart) << std::endl;
+  }
+
+  // TESTING ONLY IN RANK 0
+  if (rank == 0) {
+    auto test_dataset =
+        torch::data::datasets::MNIST(
+            kDataRoot, torch::data::datasets::MNIST::Mode::kTest)
+            .map(torch::data::transforms::Normalize<>(0.1307, 0.3081))
+            .map(torch::data::transforms::Stack<>());
+
+    auto num_test_samples = test_dataset.size().value();
+    auto test_loader = torch::data::make_data_loader(
+        std::move(test_dataset), num_test_samples);
+
+    model->eval(); // enable eval mode to prevent backprop
+
+    size_t num_correct = 0;
+
+    for (auto& batch : *test_loader) {
+      auto ip = batch.data.to(device);
+      auto op = batch.target.squeeze().to(device);
+
+      // convert to required format
+      ip = ip.to(torch::kF32);
+      op = op.to(torch::kLong);
+
+      auto prediction = model->forward(ip);
+
+      auto loss = torch::nll_loss(torch::log_softmax(prediction, 1), op);
+
+      std::cout << "Test loss - " << loss.item<float>() << std::endl;
+
+      auto guess = prediction.argmax(1);
+
+      num_correct += torch::sum(guess.eq_(op)).item<int64_t>();
+
+    } // end test loader
+
+    std::cout << "Num correct - " << num_correct << std::endl;
+    std::cout << "Test Accuracy - " << 100.0 * num_correct / num_test_samples
+              << std::endl;
+  } // end rank 0
+
+  ncclCommDestroy(comm);
+
+  MPI_Finalize();
+}