[ { "role":"system", "content":[{"type":"text","text":"You are a Vision Language Model specialized in interpreting visual data from chart images..."}] }, { "role":"user", "content":[ {"type":"image","image":"<image_data>"}, {"type":"text","text":"What is the highest value in the bar chart?"} ] }, { "role":"assistant", "content":[{"type":"text","text":"42"}] } ]
# Messages containing a list of images as a video and a text query
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [
                    "file:///path/to/frame1.jpg",
                    "file:///path/to/frame2.jpg",
                    "file:///path/to/frame3.jpg",
                    "file:///path/to/frame4.jpg",
                ],
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
# Messages containing a local video path and a text query
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "file:///path/to/video1.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
# Messages containing a video URL and a text query
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
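On their own these message dicts are just metadata; the conversation still has to be rendered to text and the frames loaded before the model can consume them. A sketch of the usual Qwen2-VL pattern (the demo URL above comes from that model), assuming the `qwen_vl_utils` helper package and the 7B instruct checkpoint:

from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Render the conversation to text and load the image/video inputs it references
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)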
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",  # Directory for model checkpoints
    per_device_train_batch_size=4,  # Batch size per device (GPU/TPU)
    num_train_epochs=3,  # Total training epochs
    learning_rate=5e-5,  # Learning rate
    save_steps=1000,  # Save a checkpoint every 1000 steps
    bf16=True,  # Use mixed precision for training
    gradient_checkpointing=True,  # Enable to reduce activation memory usage
    gradient_accumulation_steps=16,  # Accumulate gradients over 16 steps
    logging_steps=50,  # Log metrics every 50 steps
)
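These arguments only take effect once they are handed to a `Trainer`. A minimal wiring sketch, assuming `model`, `train_dataset`, and a collator such as the `collate_fn` defined below already exist:

from transformers import Trainer

trainer = Trainer(
    model=model,                  # assumed: the VLM loaded earlier
    args=training_args,
    train_dataset=train_dataset,  # assumed: the processed training split
    data_collator=collate_fn,     # assumed: a collator like the one defined below
)
trainer.train()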
def collate_fn(examples):
    # System message template for the VLM
    system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information.
Avoid additional explanation unless absolutely necessary."""
    # Initialize lists for text and image inputs
    text_inputs = []
    image_inputs = []
    # Process all examples in one loop
    for example in examples:
        # Format the chat structure for the processor
        formatted_example = {
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_message}],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": example["query"]},
                    ],
                },
            ]
        }
        # Apply the chat template and strip extra whitespace
        text_inputs.append(
            processor.apply_chat_template(formatted_example["messages"], tokenize=False).strip()
        )
        # Ensure images are in RGB mode
        image = example["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        image_inputs.append([image])
    # Tokenize the texts and process the images
    batch = processor(
        text=text_inputs,
        images=image_inputs,
        return_tensors="pt",
        padding=True,
    )
    # Clone input IDs for labels
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
    # Ensure the image token is converted to a string if it is an AddedToken.
    # Some processors expose a list of image tokens (one per image);
    # the SmolVLM-Instruct processor exposes a single image_token.
    image_token_id = processor.tokenizer.convert_tokens_to_ids(str(processor.image_token))
    # Mask image token IDs in the labels
    labels[labels == image_token_id] = -100
    # Add labels back to the batch and return it
    batch["labels"] = labels
    return batch
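Before handing the collator to a trainer, it is worth smoke-testing it on a couple of raw samples. A sketch, assuming `ds` is the chart-QA dataset split used in the trainer setup below:

batch = collate_fn([ds["train"][0], ds["train"][1]])

print(batch["input_ids"].shape)                # (2, seq_len)
print(batch["labels"].shape)                   # same shape as input_ids
print((batch["labels"] == -100).sum().item())  # padding and image tokens are masked out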
from trl import SFTConfig, SFTTrainer

# Configure the Trainer
training_args = SFTConfig(
    output_dir="sft_output",  # Directory to save the model
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size per device during training
    gradient_accumulation_steps=16,  # Number of steps before performing a backward/update pass
    gradient_checkpointing=True,  # Use gradient checkpointing to save memory
    optim="adamw_torch_fused",  # Use fused AdamW optimizer
    logging_steps=5,  # Log every 5 steps
    save_strategy="epoch",  # Save a checkpoint every epoch
    learning_rate=2e-4,  # Learning rate, based on the QLoRA paper
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use tf32 precision
    max_grad_norm=0.3,  # Max gradient norm, based on the QLoRA paper
    warmup_ratio=0.03,  # Warmup ratio, based on the QLoRA paper
    lr_scheduler_type="constant",  # Use a constant learning rate scheduler
    push_to_hub=True,  # Push the model to the Hub
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Use non-reentrant checkpointing
    # dataloader_num_workers=16,
    dataset_text_field="",  # Dummy field required by the collator
    dataset_kwargs={"skip_prepare_dataset": True},  # Important for the custom collator
    remove_unused_columns=False,  # Necessary, otherwise all features except labels are removed
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=collate_fn,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
)
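From here, fine-tuning is a single call; because `push_to_hub=True` is set above, the checkpoint can also be published once training finishes:

trainer.train()
trainer.save_model(training_args.output_dir)
trainer.push_to_hub()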