Transformer & Bert

Pre-training of Deep Bidirectional Transformers for Language Understanding

Transformer

关于Bert,多的不说了,网上的扫盲帖、详解帖也一大堆。我们针对源码来一步一步的看。
Transformer部分的源码来自于pytorch-transformer

首先是Multi-head Self-Attention,这也是Transformer中最重要的一部分。 \[ \operatorname{Attention}(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}\right) V \]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Implements Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V across
    ``head_size`` parallel heads; the head outputs are concatenated and
    projected back to ``hidden_size`` by ``output_layer``.
    """

    def __init__(self, hidden_size: int, dropout_rate: float, head_size: int = 8):
        super(MultiHeadAttention, self).__init__()

        self.head_size = head_size

        # Per-head dimension d_k (integer division: assumes hidden_size
        # is a multiple of head_size).
        self.att_size = att_size = hidden_size // head_size
        # Softmax temperature 1 / sqrt(d_k) from "Scaled Dot-Product
        # Attention" (note: att_size ** -0.5, i.e. the *inverse* sqrt).
        self.scale = att_size ** -0.5

        # Q/K/V projections, bias-free as in the original Transformer.
        self.linear_q = nn.Linear(hidden_size, head_size * att_size, bias=False)
        self.linear_k = nn.Linear(hidden_size, head_size * att_size, bias=False)
        self.linear_v = nn.Linear(hidden_size, head_size * att_size, bias=False)
        initialize_weight(self.linear_q)
        initialize_weight(self.linear_k)
        initialize_weight(self.linear_v)

        # Dropout applied to the attention-weight matrix, not the values.
        self.att_dropout = nn.Dropout(dropout_rate)

        # Final projection of the concatenated heads back to hidden_size.
        self.output_layer = nn.Linear(head_size * att_size, hidden_size,
                                      bias=False)
        initialize_weight(self.output_layer)

    def forward(self, q, k, v, mask, cache=None):
        """Attend q over (k, v).

        q/k/v: [batch, len, hidden_size].  mask: positions where it is
        True/nonzero are blocked (filled with -1e9 before softmax);
        unsqueeze(1) below implies a [batch, q_len, k_len]-broadcastable
        layout — confirm against the caller's mask builder.
        cache, if given, memoizes the projected encoder-side K/V so
        encoder-decoder attention computes them only once per source.
        Returns a tensor with the same shape as q.
        """
        orig_q_size = q.size()

        d_k = self.att_size
        d_v = self.att_size
        batch_size = q.size(0)

        # head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i)

        q = self.linear_q(q).view(batch_size, -1, self.head_size, d_k)
        if cache is not None and 'encdec_k' in cache:
            # Cache hit: reuse the K/V projections from a previous step.
            k, v = cache['encdec_k'], cache['encdec_v']
        else:
            k = self.linear_k(k).view(batch_size, -1, self.head_size, d_k)
            v = self.linear_v(v).view(batch_size, -1, self.head_size, d_v)

            if cache is not None:
                cache['encdec_k'], cache['encdec_v'] = k, v

        q = q.transpose(1, 2)                  # [b, h, q_len, d_k]
        v = v.transpose(1, 2)                  # [b, h, v_len, d_v]
        k = k.transpose(1, 2).transpose(2, 3)  # [b, h, d_k, k_len]

        # Scaled Dot-Product Attention.
        # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V
        # In-place scale is safe: q is a fresh tensor from linear_q above.
        q.mul_(self.scale)
        x = torch.matmul(q, k)  # [b, h, q_len, k_len]
        # Blocked positions get -1e9 so they vanish after the softmax.
        x.masked_fill_(mask.unsqueeze(1), -1e9)
        x = torch.softmax(x, dim=3)
        x = self.att_dropout(x)
        x = x.matmul(v)  # [b, h, q_len, attn]

        # Concatenate heads: [b, q_len, h * d_v] == [b, q_len, hidden_size].
        x = x.transpose(1, 2).contiguous()  # [b, q_len, h, attn]
        x = x.view(batch_size, -1, self.head_size * d_v)

        x = self.output_layer(x)

        assert x.size() == orig_q_size
        return x

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class EncoderLayer(nn.Module):
    """One pre-norm Transformer encoder layer.

    Two residual sub-layers, each shaped as
    LayerNorm -> sublayer -> Dropout -> skip-connection add:
    first multi-head self-attention, then the position-wise FFN.
    """

    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(EncoderLayer, self).__init__()

        # Self-attention sub-layer.
        self.self_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        # Position-wise feed-forward sub-layer.
        self.ffn_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):  # pylint: disable=arguments-differ
        # Self-attention with residual (skip) connection; Q = K = V here.
        normed = self.self_attention_norm(x)
        attended = self.self_attention(normed, normed, normed, mask)
        x = x + self.self_attention_dropout(attended)

        # Feed-forward network with residual connection.
        normed = self.ffn_norm(x)
        x = x + self.ffn_dropout(self.ffn(normed))
        return x

Decoder的结构与Encoder基本类似,不过除了自身的输入之外还要接收Encoder的输出以及之前已生成序列的结果;另外Decoder的自注意力层只允许关注输出序列中当前位置之前的位置(做法是在self-attention计算的softmax之前,把未来位置的得分置为-inf进行mask)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
class Transformer(nn.Module):
    """Encoder-decoder Transformer ("Attention Is All You Need" style).

    Token embeddings are scaled by sqrt(hidden_size), summed with a
    sinusoidal positional encoding, and fed to an Encoder / Decoder
    stack.  The output projection is weight-tied to the target
    embedding table.
    """

    def __init__(self, i_vocab_size, t_vocab_size,
                 n_layers=6,
                 hidden_size=512,
                 filter_size=2048,
                 dropout_rate=0.1,
                 share_target_embedding=True,
                 has_inputs=True,
                 src_pad_idx=None,
                 trg_pad_idx=None):
        """
        :param i_vocab_size: source vocabulary size (unused when the
            target embedding is shared).
        :param t_vocab_size: target vocabulary size.
        :param share_target_embedding: reuse the target embedding table
            for source tokens when True.
        :param has_inputs: build the encoder side; when False the model
            is decoder-only.
        :param src_pad_idx / trg_pad_idx: padding token ids used to
            build the padding masks in forward().
        """
        super(Transformer, self).__init__()

        self.hidden_size = hidden_size
        # Embeddings are multiplied by sqrt(hidden_size) before the
        # positional encoding is added.
        self.emb_scale = hidden_size ** 0.5
        self.has_inputs = has_inputs
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        self.t_vocab_embedding = nn.Embedding(t_vocab_size, hidden_size)
        nn.init.normal_(self.t_vocab_embedding.weight, mean=0,
                        std=hidden_size**-0.5)
        self.t_emb_dropout = nn.Dropout(dropout_rate)
        self.decoder = Decoder(hidden_size, filter_size,
                               dropout_rate, n_layers)

        if has_inputs:
            # Source embedding: either its own table or shared with the
            # target side.
            if not share_target_embedding:
                self.i_vocab_embedding = nn.Embedding(i_vocab_size,
                                                      hidden_size)
                nn.init.normal_(self.i_vocab_embedding.weight, mean=0,
                                std=hidden_size**-0.5)
            else:
                self.i_vocab_embedding = self.t_vocab_embedding

            self.i_emb_dropout = nn.Dropout(dropout_rate)

            self.encoder = Encoder(hidden_size, filter_size,
                                   dropout_rate, n_layers)

        # For positional encoding: precompute the inverse timescales of
        # the sinusoids — a geometric progression of hidden_size//2
        # frequencies between min_timescale and max_timescale
        # (tensor2tensor-style sinusoidal encoding).
        num_timescales = self.hidden_size // 2
        max_timescale = 10000.0
        min_timescale = 1.0
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            max(num_timescales - 1, 1))
        inv_timescales = min_timescale * torch.exp(
            torch.arange(num_timescales, dtype=torch.float32) *
            -log_timescale_increment)
        # Registered as a buffer so it moves with .to(device) but is not
        # a trainable parameter.
        self.register_buffer('inv_timescales', inv_timescales)

    def forward(self, inputs, targets):
        """Run encoder (if any) and decoder; return decoder logits."""
        enc_output, i_mask = None, None
        if self.has_inputs:
            # Padding mask over the source sequence.
            i_mask = utils.create_pad_mask(inputs, self.src_pad_idx)
            enc_output = self.encode(inputs, i_mask)

        # t_mask: padding mask; t_self_mask: causal (subsequent-position)
        # mask so each target position only attends to earlier positions.
        t_mask = utils.create_pad_mask(targets, self.trg_pad_idx)
        target_size = targets.size()[1]
        t_self_mask = utils.create_trg_self_mask(target_size,
                                                 device=targets.device)
        return self.decode(targets, enc_output, i_mask, t_self_mask, t_mask)

    def encode(self, inputs, i_mask):
        # Input embedding; pad positions are zeroed in place
        # (squeeze/unsqueeze suggests i_mask is [b, 1, len] with True at
        # pad positions — confirm against utils.create_pad_mask).
        input_embedded = self.i_vocab_embedding(inputs)
        input_embedded.masked_fill_(i_mask.squeeze(1).unsqueeze(-1), 0)
        input_embedded *= self.emb_scale
        input_embedded += self.get_position_encoding(inputs)
        input_embedded = self.i_emb_dropout(input_embedded)

        return self.encoder(input_embedded, i_mask)

    # Decoder mask = causal (sequence) mask + padding mask.
    def decode(self, targets, enc_output, i_mask, t_self_mask, t_mask,
               cache=None):
        # Target embedding with pad positions zeroed in place.
        target_embedded = self.t_vocab_embedding(targets)
        target_embedded.masked_fill_(t_mask.squeeze(1).unsqueeze(-1), 0)

        # Shifting: drop the last position and left-pad one zero vector,
        # so position t is predicted from tokens < t (teacher forcing).
        target_embedded = target_embedded[:, :-1]
        target_embedded = F.pad(target_embedded, (0, 0, 1, 0))

        target_embedded *= self.emb_scale
        target_embedded += self.get_position_encoding(targets)
        target_embedded = self.t_emb_dropout(target_embedded)

        # decoder
        decoder_output = self.decoder(target_embedded, enc_output, i_mask,
                                      t_self_mask, cache)
        # Output projection: weight-tied to the target embedding table
        # (matmul with its transpose), giving vocabulary logits.
        output = torch.matmul(decoder_output,
                              self.t_vocab_embedding.weight.transpose(0, 1))

        return output

    def get_position_encoding(self, x):
        """Return the sinusoidal encoding [1, seq_len, hidden_size] for
        the sequence length of x (content of x is not used)."""
        max_length = x.size()[1]
        position = torch.arange(max_length, dtype=torch.float32,
                                device=x.device)
        # Outer product position x frequency -> phase for each sinusoid.
        scaled_time = position.unsqueeze(1) * self.inv_timescales.unsqueeze(0)
        signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)],
                           dim=1)
        # Zero-pad one channel when hidden_size is odd.
        signal = F.pad(signal, (0, 0, 0, self.hidden_size % 2))
        signal = signal.view(1, max_length, self.hidden_size)
        return signal

Bert

先来看一下论文的摘要:

We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications.

这里可以把Bert的核心概括为几个点:首先它是Pre-trained model——先借助未标记数据做预训练,再针对具体任务做fine-tune;其次它在所有层中同时编码左右双向的上下文,而不是只看单侧。

上图展现了几个Pre-train model的结构对比。ELMo借助了LSTM作为编码器,鉴于LSTM对比Self-attention的劣势,且它只是将左右两个单向表示做浅层拼接,并非真正的深层双向编码;而GPT则是单向语言模型,对比Bert相当于把Encoder layer换为Decoder layer,也就只获得了对应的单向视野。而Bert的Transformer,其实也就相当于n个Encoder的堆叠。

the BERT Transformer uses bidirectional self-attention, while the GPT Transformer uses constrained self-attention where every token can only attend to context to its left.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Bert中的Transform block
import torch.nn as nn
from .attention import MultiHeadedAttention
from .utils import SublayerConnection, PositionwiseFeedForward


class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: head sizes of multi-head attention
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """
        super().__init__()
        # The two sub-layers of an encoder block...
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(
            d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        # ...each wrapped in a residual "sublayer connection".
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        # Final dropout on the block output.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        # Self-attention: queries, keys and values are all x.
        def attend(hidden_states):
            return self.attention.forward(
                hidden_states, hidden_states, hidden_states, mask=mask)

        x = self.input_sublayer(x, attend)
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

在Bert原论文中,Bert的Bidirectional还体现在具体的训练任务上:

Unlike left-toright language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pretrain a deep bidirectional Transformer.

具体来说,训练任务包括两个,一是Masked Language Model,另一个是Next Sentence Prediction用于理解两个句子间的关系

对于具体下游任务来说,句子层面的我们可以取[CLS]的vector并做Softmax之类的操作。利用bert获取上下文相关的词向量则可以针对某个token,取其所在位置对应的某些层的hidden states,然后做特征融合(Feature-based)。

I do not accept rewards, but you can donate to the public welfare of China.
0%