# Side-by-side comparison of the attention call for the two model families.
# Only the `is_causal` flag differs; q, k, v and F (torch.nn.functional)
# are assumed to be in scope from the surrounding context — TODO confirm.

# Autoregressive model (GPT-style): causal mask — each token attends only
# to itself and earlier positions.
attn_output = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# dLLM (diffusion LM, bidirectional): no causal mask — every token attends
# to the full sequence. NOTE: this second call overwrites the first
# `attn_output`; the two lines are illustrative alternatives, not a pipeline.
attn_output = F.scaled_dot_product_attention(q, k, v, is_causal=False)