Description
I have been using the script, with some modifications, to fully fine-tune Qwen 2.5 VL 7B. I have 2x 3090 cards, 256 GB of CPU RAM, and a 24-core AMD CPU. The script works with LoRA adapters (either 4-bit or 16-bit LoRA), but it does not work for full fine-tuning. I have commented my script below extensively to show what is happening.
from unsloth import FastVisionModel # FastLanguageModel for LLMs import torch from datasets import load_from_disk from unsloth import is_bf16_supported from unsloth.trainer import UnslothVisionDataCollator from trl import SFTTrainer, SFTConfig
model used 'unsloth/Qwen2.5-7B' 16bit
https://huggingface.co/unsloth/Qwen2.5-7B
# Load the local checkpoint with full fine-tuning enabled (no 4-bit quantisation).
model, tokenizer = FastVisionModel.from_pretrained(
    "./model_dir/qwen25_full_unsloth",
    full_finetuning=True,
    load_in_4bit=False,  # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
)
# Load the dataset that was previously saved to this local directory.
dataset = load_from_disk("./train_data/subset_10_transformed")
The dataset has two columns: 'text', which contains bounding-box information as JSON,
and 'image', which is just a PIL Image object.
The dataset is derived from the HF dataset PubLayNet.
# Keep only the first 5% of the rows so a trial run finishes quickly.
num_samples = int(len(dataset) * 0.05)
dataset = dataset.select(range(num_samples))
For testing, take a small subset (the first 5% of the dataset).
# convert_to_conversation() reads the picture from the "image" column,
# so expose the processed images under that name.
dataset = dataset.rename_column("image_processed", "image")
# Prompt fragments. convert_to_conversation() joins them as
# instruction1 + <image-size sentence> + instruction2.
instruction1 = "Extract bounding box information from this image, "
# Parenthesised so the concatenation may span several lines; a bare
# leading/trailing "+" across lines is a syntax error.
instruction2 = (
    "There are multiple bounding boxes and categories. "
    + " The categories are as follows: Text, Title , List , Table and Figure. "
    + "Format output as JSON with a delimiter <###> at the end to denote end of output. "
)
def get_image_info(img):
    """Return a prompt sentence stating the image's width and height.

    Args:
        img: Object exposing a ``size`` attribute whose first two items are
            the width and the height (e.g. a PIL image).

    Returns:
        str: Sentence of the form ``"width is W, height is H. all bounding
        boxes are relative to image size. "`` (note the trailing space, so
        it can be concatenated directly with the next prompt fragment).
    """
    size = img.size
    return f"width is {size[0]}, height is {size[1]}. all bounding boxes are relative to image size. "
def convert_to_conversation(sample):
    """Wrap one dataset row in a two-turn chat structure.

    Args:
        sample: Mapping with an ``"image"`` entry (described via
            ``get_image_info`` and embedded in the user turn) and a
            ``"text"`` entry (used verbatim as the assistant reply).

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}`` where the user
        turn carries the instruction text followed by the image, and the
        assistant turn carries the target text.
    """
    image = sample["image"]
    # Instruction = fixed prefix + image-size sentence + fixed suffix.
    instruction = instruction1 + get_image_info(image) + instruction2

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": image},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["text"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}
The dataset gets converted to a list, which works fine for QLoRA;
however, the model is expecting a dataset, not a list,
so when you run this script an error comes up.
# Materialise every row as a {"messages": [...]} dict; the result is a plain
# Python list, which the print below confirms.
converted_dataset = [convert_to_conversation(sample) for sample in dataset]
print(type(converted_dataset))

FastVisionModel.for_training(model)  # Enable for training!
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer, resize='max'),  # Must use!
    # As you can see, the dataset gets converted to a list of dictionaries.
    # This works for QLoRA training; however, when I do a full fine-tune
    # I get this message: "AttributeError: 'list' object has no attribute 'map'".
    # I think somewhere in the code it is expecting a dataset, not a list.
    train_dataset=converted_dataset,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        warmup_steps=5,
        max_steps=800,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        # NOTE(review): 2e-4 is unchanged from the original script; full
        # fine-tuning may need a lower rate than LoRA - confirm.
        learning_rate=2e-4,
        bf16=True,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="./outputs_deepspeed_full",
        report_to="none",  # For Weights and Biases
        # You MUST put the below items for vision finetuning:
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)
# Run training, then write the resulting model to disk.
trainer.train()
trainer.save_model('./tuned_model_full_bf16')