1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
| import torch import torch.nn as nn import math
class MultiHeadAttention(nn.Module): def __init__(self, d_model, n_heads): super().__init__() self.d_k = d_model // n_heads self.n_heads = n_heads self.W_q = nn.Linear(d_model, d_model) self.W_k = nn.Linear(d_model, d_model) self.W_v = nn.Linear(d_model, d_model) self.W_o = nn.Linear(d_model, d_model) def forward(self, Q, K, V, mask=None): batch_size = Q.size(0) Q = self.W_q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) K = self.W_k(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) V = self.W_v(V).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k) if mask is not None: scores = scores.masked_fill(mask == 0, -1e9) attn = torch.softmax(scores, dim=-1) output = torch.matmul(attn, V) output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_k) return self.W_o(output)
|