bug, good first issue
Description
System Info
accelerate version: 1.10.1
OS: ubuntu 20.04.6 LTS
python version: 3.11.13
numpy version: 2.3.3
torch version: 2.8.0
accelerate configuration:
# Keyword overrides for DDP (find_unused_parameters=True); handed to the
# Accelerator below through kwargs_handlers.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
# Accelerator logs with "tensorboard" under ./tensorboard_logs, takes its
# gradient-accumulation step count from self.grad_accumulation_steps, and
# receives any further options unchanged from accelerate_kwargs (defined
# outside this snippet).
self.accelerator = Accelerator(
log_with="tensorboard",
project_dir="./tensorboard_logs",
kwargs_handlers=[ddp_kwargs],
gradient_accumulation_steps=self.grad_accumulation_steps,
**accelerate_kwargs,
)
Information
- The official example scripts
- My own modified scripts
Tasks
- One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)
- My own task or dataset (give details below)
Reproduction
I use a dynamic batch sampler:
class DynamicBatchSampler(Sampler):
    """Batch sampler that packs length-sorted samples under a frame budget.

    All batches are built once, at construction time:

    * samples are ordered by ``dataset.get_frame_len(idx)`` so neighbours have
      similar lengths (keeps padding low);
    * samples are appended to the current batch while the summed frame count
      stays at or below ``frames_threshold`` and, when ``max_samples`` is
      non-zero, while the batch holds fewer than ``max_samples`` items;
    * a sample whose own length exceeds ``frames_threshold`` is dropped with a
      warning.

    Iteration yields lists of dataset indices. When ``random_seed`` is set the
    batch order is shuffled with a generator seeded by ``random_seed + epoch``;
    otherwise batches come out in construction order.
    """

    def __init__(
        self, dataset: Dataset, frames_threshold: int, max_samples=0, random_seed=None, drop_residual: bool = False
    ):
        """Pre-compute every batch for ``dataset``.

        Args:
            dataset: Object supporting ``len()`` and ``get_frame_len(idx)``.
            frames_threshold: Upper bound on the summed frame length of a batch.
            max_samples: Upper bound on items per batch; ``0`` disables the cap.
            random_seed: Base seed for per-epoch shuffling; ``None`` disables
                shuffling.
            drop_residual: When true, the final partially filled batch is
                discarded instead of being kept.
        """
        self.frames_threshold = frames_threshold
        self.max_samples = max_samples
        self.random_seed = random_seed
        self.epoch = 0

        logger.info("Sorting dataset by frame lengths... This can be slow if duration was not precomputed")
        # (index, frame length) pairs, shortest first; sorted() is stable, so
        # equal-length samples keep their dataset order.
        by_length = sorted(
            ((idx, dataset.get_frame_len(idx)) for idx in range(len(dataset))),
            key=lambda pair: pair[1],
        )

        logger.info(f"Creating dynamic batches with {frames_threshold} audio frames per gpu")
        batches = []
        pending, pending_frames = [], 0
        for idx, frame_len in by_length:
            within_budget = pending_frames + frame_len <= self.frames_threshold
            if within_budget and (max_samples == 0 or len(pending) < max_samples):
                pending.append(idx)
                pending_frames += frame_len
                continue

            # The sample does not fit: close the current batch (if it holds
            # anything) and start over with this sample, unless the sample
            # alone is already over budget.
            if pending:
                batches.append(pending)
            if frame_len <= self.frames_threshold:
                pending, pending_frames = [idx], frame_len
            else:
                logger.warning(
                    f"Single sample with {frame_len} frames exceeds the frames_threshold of {self.frames_threshold}, dropping it."
                )
                pending, pending_frames = [], 0

        # Whatever is left over forms the last batch unless the caller opted out.
        if pending and not drop_residual:
            batches.append(pending)

        self.batches = batches
        # Per the original author's note: set so accelerate's BatchSamplerShard
        # produces even batches under the frames-per-batch setting.
        self.drop_last = True

    def set_epoch(self, epoch: int) -> None:
        """Record the epoch number that is mixed into the shuffle seed."""
        self.epoch = epoch

    def __iter__(self):
        """Return an iterator over the batches (lists of dataset indices)."""
        if self.random_seed is None:
            # No seed configured: keep the construction order.
            return iter(self.batches)

        # Seed from random_seed + epoch: reproducible, but different per epoch.
        generator = torch.Generator()
        generator.manual_seed(self.random_seed + self.epoch)
        order = torch.randperm(len(self.batches), generator=generator).tolist()
        return iter([self.batches[position] for position in order])

    def __len__(self):
        """Return the number of pre-computed batches."""
        return len(self.batches)
How I init the dataloader:
def init_dataloader(
    data: list[Path],
    metrics: dict[str, dict[str, float]],
    sample_rate: int = 16000,
    batch_frame_per_gpu: int = 480000,
    max_samples_per_gpu: int = 32,
    num_workers: int = 4,
    prefetch: int = 100,
    feature_extractor: transformers.SequenceFeatureExtractor = None,
    seed: int = 42,
    is_train: bool = True,
) -> DataLoader:
    """Build a ``DataLoader`` over an ``SQADataset`` with dynamic batching.

    Args:
        data: Paths handed to ``SQADataset``.
        metrics: Per-item metric mapping handed to ``SQADataset``.
        sample_rate: Sample rate passed to the dataset and, when a feature
            extractor is given, to it as ``sampling_rate``.
        batch_frame_per_gpu: Frame budget per batch (``frames_threshold`` of
            ``DynamicBatchSampler``).
        max_samples_per_gpu: Cap on items per batch (``max_samples``).
        num_workers: Number of loader worker processes; ``0`` loads in the
            main process.
        prefetch: Batches prefetched per worker. Ignored when
            ``num_workers`` is ``0``.
        feature_extractor: Optional extractor bound into ``collate_fn`` as its
            ``feature_extractor`` keyword argument.
        seed: Shuffle seed for the batch sampler; only used when ``is_train``.
        is_train: When false the sampler gets no seed, so batches are not
            shuffled.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = SQADataset(data, metrics, sample_rate)
    batch_sampler = DynamicBatchSampler(
        dataset,
        batch_frame_per_gpu,
        max_samples=max_samples_per_gpu,
        random_seed=seed if is_train else None,
    )

    if feature_extractor:
        feature_extractor_partial = partial(feature_extractor, sampling_rate=sample_rate)
        collate_fn_partial = partial(collate_fn, feature_extractor=feature_extractor_partial)
    else:
        collate_fn_partial = collate_fn

    use_workers = num_workers != 0
    dataloader = DataLoader(
        dataset,
        collate_fn=collate_fn_partial,
        num_workers=num_workers,
        # DataLoader raises ValueError if prefetch_factor is set while
        # num_workers == 0, so only forward it when workers are used.
        # NOTE(review): PyTorch documents the total as
        # prefetch_factor * num_workers batches in flight; with the default of
        # 100 that is a lot of pinned host memory per job -- worth confirming
        # against the multi-job lockups.
        prefetch_factor=prefetch if use_workers else None,
        pin_memory=use_workers,
        persistent_workers=use_workers,
        batch_sampler=batch_sampler,
    )
    return dataloader
How I prepare the dataset:
# Pass both dataloaders through the Accelerator and rebind the names to
# whatever prepare() returns.
train_dataloader, cv_dataloader = self.accelerator.prepare(train_dataloader, cv_dataloader)
Expected behavior
When I train a single task, everything works as expected until now. However, when I run multiple tasks simultaneously that use the same dataset, it often results in a deadlock, showing the following error:
kernel:[7423642.314824] watchdog: BUG: soft lockup - CPU#9 stuck for 22s! [cuda-EvtHandlr:3217326]