Adding comments #107

Merged
merged 3 commits on Aug 17, 2017
5 changes: 4 additions & 1 deletion chainermn/communicators/_base.py
@@ -96,7 +96,10 @@ def __init__(self, mpi_comm, use_nccl):

self._init_ranks()

# TODO(akiba): write why we delay initializing comms
# We have to delay the initialization of communicators because NCCL
# communicators are bound to the CUDA devices that are current at the
# time of initialization. Therefore, NCCL communicators must be
# initialized only after users have set the devices to use.
self.inter_mpi_comm = None
self.intra_mpi_comm = None
if self.use_nccl:
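The new comment above explains the ordering constraint; here is a minimal usage sketch of that ordering, assuming the usual ChainerMN setup pattern (the communicator name, device choice, and the exact point of lazy NCCL initialization are illustrative, not taken from this PR):

```python
# A minimal sketch of the ordering the comment above requires.  NCCL
# communicators are bound to whichever CUDA device is current when they
# are created, so each process must select its GPU before any
# NCCL-backed collective runs.
import chainer
import chainermn

comm = chainermn.create_communicator('hierarchical')

# Each process selects its GPU first, typically by intra-node rank.
device = comm.intra_rank
chainer.cuda.get_device_from_id(device).use()

# Only after this point may the communicator create its NCCL
# communicators (hence the delayed initialization in __init__ above).
```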
2 changes: 0 additions & 2 deletions chainermn/communicators/hierarchical_communicator.py
@@ -38,8 +38,6 @@ def allreduce_grad(self, model):
self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)

# TODO(akiba): sync necessary?

# Inter-node allreduce
if self.intra_rank == 0:
_communication_utility.inter_allreduce_gpu(
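This hunk sits inside the two-level allreduce (intra-node NCCL, then inter-node MPI among the node-local roots); the removed TODO asked whether a stream sync is needed between the two stages. As a rough, CPU-only analogue of that structure using mpi4py and NumPy (the node splitting and names are illustrative; this is not ChainerMN's implementation and sidesteps the GPU sync question):

```python
# Hierarchical allreduce sketch: reduce within each node, allreduce
# across nodes among the node-local roots, then broadcast back.
import numpy as np
from mpi4py import MPI

world = MPI.COMM_WORLD

# Intra-node communicator: ranks sharing the same node.
intra = world.Split_type(MPI.COMM_TYPE_SHARED, key=world.rank)

# Inter-node communicator: only the node-local roots (intra rank 0) join.
color = 0 if intra.rank == 0 else MPI.UNDEFINED
inter = world.Split(color, key=world.rank)

grad = np.full(4, float(world.rank), dtype=np.float32)
summed = np.zeros_like(grad)

# 1) Reduce gradients within each node to the node-local root.
intra.Reduce(grad, summed, op=MPI.SUM, root=0)

# 2) Allreduce across nodes, among node-local roots only.
if intra.rank == 0:
    inter.Allreduce(MPI.IN_PLACE, summed, op=MPI.SUM)

# 3) Broadcast the global sum back to every rank on the node.
intra.Bcast(summed, root=0)
```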
3 changes: 2 additions & 1 deletion chainermn/dataset.py
@@ -25,7 +25,8 @@ def scatter_dataset(dataset, comm):
assert hasattr(comm, 'send')
assert hasattr(comm, 'recv')

# TODO(akiba): write why we do not use mpi_comm.scatter
# We cannot use `mpi_comm.scatter` because of a bug in mpi4py:
# for large datasets, `mpi_comm.scatter` raises MemoryError.
if comm.rank == 0:
mine = None
n_total_samples = len(dataset)
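For reference, a simplified standalone analogue of the point-to-point scatter used here, written against mpi4py's pickle-based send/recv rather than the ChainerMN communicator object (the helper name and the even-slicing policy are illustrative assumptions): each rank only ever handles its own slice, avoiding the single large message that makes `mpi_comm.scatter` fail.

```python
# Scatter a dataset with point-to-point messages instead of scatter().
from mpi4py import MPI


def scatter_by_send_recv(dataset, mpi_comm=MPI.COMM_WORLD):
    if mpi_comm.rank == 0:
        n_total = len(dataset)
        n_sub = (n_total + mpi_comm.size - 1) // mpi_comm.size  # ceil division
        mine = None
        for rank in range(mpi_comm.size):
            begin = rank * n_sub
            end = min(n_total, begin + n_sub)
            sub = dataset[begin:end]
            if rank == 0:
                mine = sub                     # rank 0 keeps its own slice
            else:
                mpi_comm.send(sub, dest=rank)  # one slice per message
        return mine
    else:
        # Each worker receives only its own slice, so no single message
        # carries the whole dataset.
        return mpi_comm.recv(source=0)
```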