# Training-set loader: shuffled mini-batches of 64 samples.
# NOTE(review): the comments below assume DataLoader is
# torch.utils.data.DataLoader — confirm against the file's imports.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=8,  # batches are prepared by 8 worker subprocesses
    pin_memory=True,  # batches are placed in page-locked (pinned) host memory
)

Enable pin_memory and set num_workers to speed up data loading and host-to-device transfers.