verl-project/verl

maybe_filter_out_long_prompts 在过滤 prompts 时不支持图片路径:它只能读取 bytes 图像数据或者 Image.Image

Open

#6,623 opened on Jun 5, 2026

View on GitHub
 (2 comments) (0 reactions) (0 assignees)Python (21,533 stars) (3,940 forks)auto 404
help wanted

Description

Feature request

Motivation

                  def maybe_filter_out_long_prompts(self, dataframe: datasets.Dataset = None):
                          """Drop rows whose rendered prompt is longer than ``self.max_prompt_length`` tokens.

                          Nothing happens unless ``self.filter_overlong_prompts`` is truthy. When a
                          processor is configured, the prompt is rendered through the processor's chat
                          template and, if the row carries images or videos, run through the processor
                          together with them; otherwise the tokenizer's chat template tokenizes the
                          prompt directly. A row whose length cannot be computed is logged with a
                          traceback and treated as over-long, i.e. it is filtered out.

                          Args:
                              dataframe: Dataset to filter.

                          Returns:
                              The filtered dataset, or ``dataframe`` unchanged when filtering is disabled.
                          """
                          if not self.filter_overlong_prompts:
                              return dataframe

                          tokenizer = self.tokenizer
                          processor = self.processor
                          prompt_key = self.prompt_key
                          image_key = self.image_key
                          video_key = self.video_key

                          def chat_template_kwargs() -> dict:
                              # Copy so that adding "tools" never touches the shared config dict.
                              kwargs = dict(**self.apply_chat_template_kwargs)
                              if self.tool_schemas is not None:
                                  # pass tool schemas if available so the template can format prompts
                                  kwargs["tools"] = self.tool_schemas
                              return kwargs

                          if processor is not None:
                              from verl.utils.dataset.vision_utils import process_image, process_video

                              def prompt_length(doc) -> int:
                                  # _build_messages pops the image/video columns from the row and adds
                                  # an "image" entry to image dicts that carry "bytes". Snapshot the
                                  # media first (copying the dicts), otherwise the lookups below would
                                  # always find nothing and every prompt would be measured as text-only.
                                  raw_images = doc[image_key] if image_key in doc else None
                                  raw_videos = doc[video_key] if video_key in doc else None
                                  if raw_images:
                                      raw_images = [dict(img) if isinstance(img, dict) else img for img in raw_images]

                                  messages = self._build_messages(doc)
                                  raw_prompt = processor.apply_chat_template(
                                      messages, add_generation_prompt=True, tokenize=False, **chat_template_kwargs()
                                  )

                                  images = None
                                  if raw_images:
                                      images = [
                                          process_image(image, image_patch_size=self.image_patch_size)
                                          for image in raw_images
                                      ]

                                  videos = None
                                  videos_kwargs = {}
                                  if raw_videos:
                                      videos, video_metadata = zip(
                                          *[
                                              process_video(
                                                  video, image_patch_size=self.image_patch_size, return_video_metadata=True
                                              )
                                              for video in raw_videos
                                          ],
                                          strict=True,
                                      )
                                      videos = list(videos)
                                      videos_kwargs = {"video_metadata": list(video_metadata), "do_sample_frames": False}

                                  if images is None and videos is None:
                                      # only text prompt
                                      encoded = processor.tokenizer(
                                          text=raw_prompt,
                                          add_special_tokens=False,  # avoid adding special tokens
                                          return_attention_mask=False,
                                      )
                                      return len(encoded["input_ids"])

                                  # multi-modal prompt
                                  encoded = processor(
                                      text=[raw_prompt], images=images, videos=videos, videos_kwargs=videos_kwargs
                                  )
                                  return len(encoded["input_ids"][0])

                          else:

                              def prompt_length(doc) -> int:
                                  apply_kwargs = chat_template_kwargs()

                                  # Keep explicit tokenization to avoid transformers version default changes.
                                  apply_kwargs.pop("tokenize", None)
                                  apply_kwargs.pop("return_dict", None)
                                  apply_kwargs.pop("return_tensors", None)

                                  tokenized_prompt = tokenizer.apply_chat_template(
                                      doc[prompt_key], add_generation_prompt=True, tokenize=True, **apply_kwargs
                                  )
                                  return len(normalize_token_ids(tokenized_prompt))

                          def doc2len(doc) -> int:
                              # Best effort: a row that cannot be measured is reported and counted as
                              # one token over the limit so that the filter below discards it.
                              try:
                                  return prompt_length(doc)
                              except Exception:
                                  print("Error processing one of the samples, skipping...")
                                  traceback.print_exc()
                                  return self.max_prompt_length + 1

                          dataframe = dataframe.filter(
                              lambda doc: doc2len(doc) <= self.max_prompt_length,
                              num_proc=self.num_workers,
                              desc=f"Filtering prompts longer than {self.max_prompt_length} tokens",
                          )

                          print(f"filter dataset len: {len(dataframe)}")
                          return dataframe
                  
                  
                  
                  def _build_messages(self, example: dict):
                          """Replace <image> and <video> placeholder in messages with corresponding image and video
                          which is required by processor.apply_chat_template.
                          - <image>: {"type": "image", **image}
                          - <video>: {"type": "video", **video}
                  
                          Args:
                              example: Row dictionary from dataframe.
                  
                          Returns:
                              messages: List of messages with replaced placeholder.
                          """
                          messages: list = example[self.prompt_key]
                          # When concatenating image and video datasets, pop will return None for image or video sample
                          images = example.pop(self.image_key, None) or []
                          videos = example.pop(self.video_key, None) or []
                      
                  
                          image_offset, video_offset = 0, 0
                          for message in messages:
                              if not images and not videos:
                                  continue
                              assert self.processor is not None, "processor is needed to process image and video"
                  
                              content = message["content"]
                              if not isinstance(content, str):
                                  continue
                  
                              content_list = []
                              segments = re.split("(<image>|<video>)", content)
                              segments = [item for item in segments if item != ""]
                              for segment in segments:
                                  if segment == "<image>":
                                      assert image_offset < len(images), f"image_offset {image_offset} >= len(images) {len(images)}"
                                      image = images[image_offset]
                                      if isinstance(image, Image.Image):
                                          image = image.convert("RGB")
                                          content_list.append({"type": "image", "image": image})
                                      elif isinstance(image, dict):
                                          if "bytes" in image:
                                              image["image"] = Image.open(BytesIO(image["bytes"]))
                                          content_list.append({"type": "image", **image})
                                      else:
                                          raise TypeError(f"image must be dict or PIL.Image, unsupported image type: {type(image)}")
                                      image_offset += 1
                                  elif segment == "<video>":
                                      assert video_offset < len(videos), f"video_offset {video_offset} >= len(videos) {len(videos)}"
                                      content_list.append({"type": "video", **videos[video_offset]})
                                      video_offset += 1
                                  else:
                                      content_list.append({"type": "text", "text": segment})
                              message["content"] = content_list
                  
                          assert image_offset == len(images), f"image_offset {image_offset} != len(images) {len(images)}"
                          assert video_offset == len(videos), f"video_offset {video_offset} != len(videos) {len(videos)}"
                          return messages

Your contribution

Contributor guide