# Autoregressive model (GPT-style): is_causal=True has SDPA apply a
# lower-triangular mask, so each token attends only to itself and earlier tokens.
attn_output = F.scaled_dot_product_attention(query=q, key=k, value=v, is_causal=True)

# dLLM (bidirectional): no causal mask — every token attends to all positions.
attn_output = F.scaled_dot_product_attention(query=q, key=k, value=v, is_causal=False)