A tiny multiprocess data loader in ~100 lines, inspired by `torch.utils.data.DataLoader`, geohot/tinygrad, and karpathy/micrograd.

See blog post: DataLoaders Explained: Building a Multi-Process Data Loader from Scratch
Usage:

```python
from dataloader import DataLoader
import numpy as np


class Dataset:
    """Toy dataset: every item is a fixed 3x32x32 array paired with label 1."""

    def __init__(self, size):
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        return np.zeros((3, 32, 32)), 1


ds = Dataset(1024)
dl = DataLoader(ds, num_workers=4, batch_size=64)

x, y = next(dl)
print(x.shape)  # (64, 3, 32, 32)
print(y.shape)  # (64,)
```
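The actual worker logic lives in `dataloader.py`. Purely as a rough sketch of the general idea, not this library's implementation: the loader fetches samples in parallel worker processes and collates them into stacked arrays. The version below uses a `multiprocessing.Pool`; the names `iterate_batches` and `get_item` are illustrative only and not part of this library's API.

```python
import numpy as np
from multiprocessing import Pool


class Dataset:
    """Same toy dataset as in the usage example above."""

    def __init__(self, size):
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        return np.zeros((3, 32, 32)), 1


def get_item(args):
    # Module-level function so it can be pickled and run in a worker process.
    # (Shipping the whole dataset with every index is wasteful; it keeps the
    # sketch short, and real loaders avoid it.)
    dataset, index = args
    return dataset[index]


def iterate_batches(dataset, batch_size, num_workers):
    # Split indices into batches, fetch each batch's samples in parallel,
    # then collate the samples into stacked arrays.
    with Pool(num_workers) as pool:
        for start in range(0, len(dataset), batch_size):
            indices = range(start, min(start + batch_size, len(dataset)))
            items = pool.map(get_item, [(dataset, i) for i in indices])
            xs, ys = zip(*items)
            yield np.stack(xs), np.array(ys)


if __name__ == "__main__":
    ds = Dataset(1024)
    x, y = next(iterate_batches(ds, batch_size=64, num_workers=4))
    print(x.shape)  # (64, 3, 32, 32)
    print(y.shape)  # (64,)
```

A long-lived set of workers fed from an index queue (rather than a per-batch `pool.map`) avoids respawning overhead and is closer to what torch does, but the batching-and-collation step is the same.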
For comparison, the same program written with `torch.utils.data.DataLoader` (note that torch's loader is wrapped in `iter()` before calling `next()`, and the batches come back as tensors rather than numpy arrays):

```python
from torch.utils import data
import numpy as np


class Dataset(data.Dataset):
    def __init__(self, size):
        super().__init__()
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        return np.zeros((3, 32, 32)), 1


ds = Dataset(1024)
dl = data.DataLoader(ds, num_workers=4, batch_size=64)

x, y = next(iter(dl))
print(x.shape)  # torch.Size([64, 3, 32, 32])
print(y.shape)  # torch.Size([64])
```
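In either case, a training loop consumes the loader one batch at a time. A minimal sketch, assuming the loader follows the iterator protocol and ends an epoch by raising `StopIteration` (as torch's `DataLoader` does); `train_step` is a hypothetical placeholder for the per-batch work:

```python
def train_epoch(dl, train_step):
    # Pull (inputs, labels) batches from the loader until it is exhausted.
    for x, y in dl:
        train_step(x, y)


# For example, just inspect the batch shapes:
# train_epoch(dl, lambda x, y: print(x.shape, y.shape))
```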