query_text = "Dog running on grass"

# Pure inference: disable autograd so no computation graph is built
# (saves memory and time for retrieval-only use).
with torch.no_grad():
    # Preprocess text (tokenize, pad) and move tensors to the model's device.
    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    # Generate the text embedding for the query.
    text_features = model.get_text_features(**inputs)
    # L2-normalize so the dot product below is cosine similarity.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Cosine similarity against all image embeddings.
    # NOTE(review): assumes image_emb rows are already L2-normalized — confirm
    # at the point where image_emb is built.
    similarity = torch.mm(text_features, image_emb.T)

# Get top-k matches; cap k at the collection size.
# NOTE(review): assumes len(data) == image_emb.shape[0] — verify upstream.
values, indices = similarity[0].topk(min(top_k, len(data)))