verl-project/verl
View on GitHub
maybe_filter_out_long_prompts 过滤 prompts 时不支持图片路径:它读取的是 bytes 图像数据或者 Image.Image
Open
#6,623 opened on Jun 5, 2026
help wanted
Description
Feature request
。
Motivation
def maybe_filter_out_long_prompts(self, dataframe: datasets.Dataset = None):
    """Drop rows whose rendered prompt is longer than ``self.max_prompt_length`` tokens.

    When ``self.filter_overlong_prompts`` is falsy the dataset is returned
    untouched. Otherwise every row is rendered through the chat template and
    measured:

    - with a processor: the prompt text is rendered with
      ``processor.apply_chat_template`` and, if the row carries images or
      videos, tokenized together with them through ``processor(...)``;
      text-only rows are tokenized with ``processor.tokenizer``.
    - without a processor: ``tokenizer.apply_chat_template(..., tokenize=True)``
      is used and the result is normalized with ``normalize_token_ids``.

    A row that raises while being measured is reported on stdout and treated
    as too long (i.e. it is filtered out) instead of aborting the whole pass.

    Args:
        dataframe: Dataset to filter.

    Returns:
        The filtered dataset, or ``dataframe`` unchanged when filtering is
        disabled.
    """
    if not self.filter_overlong_prompts:
        return dataframe

    tokenizer = self.tokenizer
    processor = self.processor
    prompt_key = self.prompt_key
    image_key = self.image_key
    video_key = self.video_key

    def chat_template_kwargs() -> dict:
        """Build the per-call kwargs for ``apply_chat_template``."""
        kwargs = dict(**self.apply_chat_template_kwargs)
        if self.tool_schemas is not None:
            # pass tool schemas if available so the template can format prompts
            kwargs["tools"] = self.tool_schemas
        # `tokenize` is always passed explicitly below; a user-supplied value
        # would otherwise raise a duplicate-keyword TypeError.
        kwargs.pop("tokenize", None)
        return kwargs

    if processor is not None:
        from verl.utils.dataset.vision_utils import process_image, process_video

        def snapshot(items) -> list:
            """Copy a media list, shallow-copying dict entries."""
            return [dict(item) if isinstance(item, dict) else item for item in items]

        def doc2len(doc) -> int:
            try:
                # `_build_messages` pops the image/video columns from `doc` and
                # may add keys to image dicts in place, so the raw media must be
                # captured BEFORE it runs. Reading them from `doc` afterwards
                # would always find nothing and measure the prompt as text only.
                raw_images = snapshot(doc.get(image_key) or [])
                raw_videos = snapshot(doc.get(video_key) or [])

                messages = self._build_messages(doc)
                raw_prompt = processor.apply_chat_template(
                    messages, add_generation_prompt=True, tokenize=False, **chat_template_kwargs()
                )

                if raw_images:
                    images = [
                        process_image(image, image_patch_size=self.image_patch_size) for image in raw_images
                    ]
                else:
                    images = None

                if raw_videos:
                    videos, video_metadata = zip(
                        *[
                            process_video(
                                video, image_patch_size=self.image_patch_size, return_video_metadata=True
                            )
                            for video in raw_videos
                        ],
                        strict=True,
                    )
                    videos = list(videos)
                    video_metadata = list(video_metadata)
                    videos_kwargs = {"video_metadata": video_metadata, "do_sample_frames": False}
                else:
                    videos = None
                    videos_kwargs = {}

                if images is None and videos is None:
                    # only text prompt
                    return len(
                        processor.tokenizer(
                            text=raw_prompt,
                            add_special_tokens=False,  # avoid adding special tokens
                            return_attention_mask=False,
                        )["input_ids"]
                    )

                # multi-modal prompt
                return len(
                    processor(text=[raw_prompt], images=images, videos=videos, videos_kwargs=videos_kwargs)[
                        "input_ids"
                    ][0]
                )
            except Exception:
                # Best effort: a broken sample is dropped, not fatal.
                print("Error processing one of the samples, skipping...")
                traceback.print_exc()
                return self.max_prompt_length + 1

    else:

        def doc2len(doc) -> int:
            try:
                apply_kwargs = chat_template_kwargs()
                # Keep explicit tokenization to avoid transformers version default changes.
                apply_kwargs.pop("return_dict", None)
                apply_kwargs.pop("return_tensors", None)
                tokenized_prompt = tokenizer.apply_chat_template(
                    doc[prompt_key], add_generation_prompt=True, tokenize=True, **apply_kwargs
                )
                return len(normalize_token_ids(tokenized_prompt))
            except Exception:
                # Best effort: a broken sample is dropped, not fatal.
                print("Error processing one of the samples, skipping...")
                traceback.print_exc()
                return self.max_prompt_length + 1

    dataframe = dataframe.filter(
        lambda doc: doc2len(doc) <= self.max_prompt_length,
        num_proc=self.num_workers,
        desc=f"Filtering prompts longer than {self.max_prompt_length} tokens",
    )
    print(f"filter dataset len: {len(dataframe)}")
    return dataframe
def _build_messages(self, example: dict):
    """Expand ``<image>`` / ``<video>`` placeholders into structured content parts.

    Every message whose ``content`` is a plain string is split on the
    placeholders and rewritten in place as a list of parts, the format
    required by ``processor.apply_chat_template``:

    - ``<image>`` becomes ``{"type": "image", **image}``
    - ``<video>`` becomes ``{"type": "video", **video}``
    - any other text becomes ``{"type": "text", "text": ...}``

    The image and video columns are popped from ``example``. Messages whose
    content is not a string are left as they are, and nothing is rewritten
    when the row carries neither images nor videos.

    Args:
        example: Row dictionary from dataframe.

    Returns:
        messages: List of messages with replaced placeholder.

    Raises:
        TypeError: If an image entry is neither a dict nor a ``PIL.Image``.
        AssertionError: If the placeholders and the provided images/videos
            do not match one to one, or media is present without a processor.
    """
    messages: list = example[self.prompt_key]

    # When concatenating image and video datasets, pop will return None for image or video sample
    images = example.pop(self.image_key, None) or []
    videos = example.pop(self.video_key, None) or []
    has_media = bool(images) or bool(videos)

    next_image = 0
    next_video = 0
    for message in messages:
        if not has_media:
            continue
        assert self.processor is not None, "processor is needed to process image and video"

        raw_content = message["content"]
        if not isinstance(raw_content, str):
            continue

        parts = []
        # The capture group keeps the placeholders in the split result; empty
        # pieces (e.g. between adjacent placeholders) are skipped.
        for piece in (s for s in re.split("(<image>|<video>)", raw_content) if s != ""):
            if piece == "<video>":
                assert next_video < len(videos), f"video_offset {next_video} >= len(videos) {len(videos)}"
                parts.append({"type": "video", **videos[next_video]})
                next_video += 1
                continue

            if piece != "<image>":
                parts.append({"type": "text", "text": piece})
                continue

            assert next_image < len(images), f"image_offset {next_image} >= len(images) {len(images)}"
            entry = images[next_image]
            if isinstance(entry, Image.Image):
                parts.append({"type": "image", "image": entry.convert("RGB")})
            elif isinstance(entry, dict):
                if "bytes" in entry:
                    # Decode raw bytes; the decoded image is stored on the entry itself.
                    entry["image"] = Image.open(BytesIO(entry["bytes"]))
                parts.append({"type": "image", **entry})
            else:
                raise TypeError(f"image must be dict or PIL.Image, unsupported image type: {type(entry)}")
            next_image += 1

        message["content"] = parts

    # Every provided image/video must have been consumed by a placeholder.
    assert next_image == len(images), f"image_offset {next_image} != len(images) {len(images)}"
    assert next_video == len(videos), f"video_offset {next_video} != len(videos) {len(videos)}"
    return messages
Your contribution
。