# Pull the text and image entries out of the `data` record.
text = data["text"]
images = data["image"]

# Feed both modalities to the processor in one call, then move the result
# onto `device`.
# NOTE(review): return_tensors="pt" presumably requests PyTorch tensors and
# padding=True presumably pads the batch to a common length -- confirm
# against the processor class actually in use.
inputs = processor(
    text=text,
    images=images,
    return_tensors="pt",
    padding=True,
).to(device)
>>> inputs.keys()
dict_keys(['input_ids', 'attention_mask', 'pixel_values'])