query_text = "Dog running on grass"

# Pure inference: disable autograd so no computation graph is built
# (saves memory and time for retrieval-only use).
with torch.no_grad():
    # Preprocess text (tokenize, pad) and move tensors to the model's device.
    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    # Generate the text embedding for the query.
    text_features = model.get_text_features(**inputs)
    # L2-normalize so the dot product below is cosine similarity.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Cosine similarity against all image embeddings.
    # NOTE(review): assumes image_emb rows are already L2-normalized — confirm
    # at the point where image_emb is built.
    similarity = torch.mm(text_features, image_emb.T)

# Get top-k matches; cap k at the collection size.
# NOTE(review): assumes len(data) == image_emb.shape[0] — verify upstream.
values, indices = similarity[0].topk(min(top_k, len(data)))