
Sample code for k-fold cross-validation in a deep learning notebook

This sample code is based on @abhishek's "BERT Base Uncased using PyTorch" notebook for the Tweet Sentiment Extraction competition. If you decide to use the code, please give abhishek's original notebook an upvote.
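The training file train_folds.csv already contains a kfold column that the run() function below relies on. If you need to build it yourself, a minimal sketch with sklearn's StratifiedKFold (assuming the competition's raw train.csv and 5 folds stratified by sentiment) could look like this:

import pandas as pd
from sklearn import model_selection

# Read the raw competition training data (path assumed)
df = pd.read_csv("../input/tweet-sentiment-extraction/train.csv")
df = df.dropna().reset_index(drop=True)
df["kfold"] = -1

# Shuffle once, then assign each row to one of 5 folds, stratified by sentiment
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=df, y=df.sentiment.values)):
    df.loc[valid_idx, "kfold"] = fold

df.to_csv("train_folds.csv", index=False)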

Import the required packages

import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
# tqdm is used to display a progress bar during training in the notebook
from tqdm.autonotebook import tqdm

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
# helper script with AverageMeter, EarlyStopping and the jaccard metric
# (https://www.kaggle.com/abhishek/utils, printed in full at the end of this post)
import utils
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 6
    ROBERTA_PATH = "../input/roberta-base/"
    MODEL_PATH = "pytorch_model.bin"
    TRAINING_FILE = "../input/tweet-train-folds/train_folds.csv"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json",
        merges_file=f"{ROBERTA_PATH}/merges.txt",
        lowercase=True,
        add_prefix_space=True
    )

Data Processing

def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    # Locate the character span of selected_text inside the tweet
    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind + len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    # Mark every character belonging to the selected span with 1
    char_targets = [0] * len(tweet)
    if idx0 is not None and idx1 is not None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets

    # Token indices whose character offsets overlap the selected span
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)

    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    # Token ids of the sentiment words in the RoBERTa vocabulary
    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }

    # Build the input as: <s> sentiment </s> </s> tweet </s>
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    # Pad everything up to max_len (RoBERTa's pad token id is 1)
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }
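As a quick sanity check (a small sketch, not part of the original notebook, assuming the tokenizer files referenced in config are available), process_data can be called on a single example; the returned ids, mask, and offsets all have length MAX_LEN, and targets_start / targets_end point at the tokens covering the selected text:

example = process_data(
    tweet="I am so happy today",
    selected_text="so happy",
    sentiment="positive",
    tokenizer=config.TOKENIZER,
    max_len=config.MAX_LEN
)
print(len(example["ids"]), len(example["mask"]))          # 128 128
print(example["targets_start"], example["targets_end"])   # indices of the tokens covering "so happy"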

Data loader

class TweetDataset:
    """
    Dataset which stores the tweets and returns them as processed features
    """
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        # Return the processed data where the lists are converted to `torch.tensor`s
        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

Model definition

class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l_1 = nn.Linear(768 * 2, 400)
        self.l0 = nn.Linear(400, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        # With output_hidden_states=True, the third return value holds all hidden states
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        # Concatenate the hidden states of the last two layers, then project each token to 2 logits
        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l_1(out)
        logits = self.l0(logits)

        # Split into start and end logits, each of shape (batch_size, seq_len)
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
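A quick shape check (a sketch, assuming the roberta-base files are present at config.ROBERTA_PATH and a transformers version whose models return tuples): both outputs have shape (batch_size, MAX_LEN).

model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True  # required because forward() unpacks the hidden states
model = TweetModel(conf=model_config)

dummy = torch.zeros((2, config.MAX_LEN), dtype=torch.long)  # batch of 2 dummy sequences
start_logits, end_logits = model(ids=dummy, mask=dummy, token_type_ids=dummy)
print(start_logits.shape, end_logits.shape)  # torch.Size([2, 128]) torch.Size([2, 128])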

Custom loss function (optional, depends on the task)

def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """
    Return the sum of the cross entropy losses for both the start and end logits
    """
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss
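For example (a tiny sketch with dummy tensors), loss_fn simply sums two standard CrossEntropyLoss terms, one over the start positions and one over the end positions:

# 2 examples in a batch, sequence length 8
dummy_start_logits = torch.randn(2, 8)
dummy_end_logits = torch.randn(2, 8)
dummy_start_positions = torch.tensor([1, 3])
dummy_end_positions = torch.tensor([2, 5])
print(loss_fn(dummy_start_logits, dummy_end_logits,
              dummy_start_positions, dummy_end_positions))  # a single scalar tensor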

Training Function

def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """
    Trains the bert model on the twitter data
    """
    # Set model to training mode (dropout + sampled batchnorm is activated)
    model.train()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    # Set tqdm to add loading screen and set the length
    tk0 = tqdm(data_loader, total=len(data_loader))

    # Train the model on each batch
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"]

        # Move ids, masks, and targets to gpu while setting as torch.long
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        # Reset gradients
        model.zero_grad()
        # Use ids, masks, and token types as input to the model
        # Predict logits for each of the input tokens for each batch
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )  # (bs x SL), (bs x SL)
        # Calculate batch loss based on CrossEntropy
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        # Calculate gradients based on loss
        loss.backward()
        # Adjust weights based on calculated gradients
        optimizer.step()
        # Update scheduler
        scheduler.step()

        # Apply softmax to the start and end logits
        # This squeezes each of the logits in a sequence to a value between 0 and 1, while ensuring that they sum to 1
        # This is similar to the characteristics of "probabilities"
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        # Calculate the jaccard score based on the predictions for this batch
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,  # Full text of the px'th tweet in the batch
                target_string=selected_tweet,  # Span containing the specified sentiment for the px'th tweet in the batch
                sentiment_val=tweet_sentiment,  # Sentiment of the px'th tweet in the batch
                idx_start=np.argmax(outputs_start[px, :]),  # Predicted start index for the px'th tweet in the batch
                idx_end=np.argmax(outputs_end[px, :]),  # Predicted end index for the px'th tweet in the batch
                offsets=offsets[px]  # Offsets for each of the tokens for the px'th tweet in the batch
            )
            jaccard_scores.append(jaccard_score)
        # Update the jaccard score and loss
        # For details, refer to `AverageMeter` in https://www.kaggle.com/abhishek/utils
        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        # Print the running average loss and jaccard score after each batch
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

Evaluation Functions

def calculate_jaccard_score(
        original_tweet,
        target_string,
        sentiment_val,
        idx_start,
        idx_end,
        offsets,
        verbose=False):
    """
    Calculate the jaccard score between the predicted span and the actual span for a single tweet
    """

    # A span's end index has to be greater than or equal to the start index
    # If this doesn't hold, the end index is set to equal the start index (the span is a single token)
    if idx_end < idx_start:
        idx_end = idx_start

    # Combine into a string the tokens that belong to the predicted span
    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        # If the token is not the last token in the tweet, and the ending offset of the current token is less
        # than the beginning offset of the following token, add a space.
        # Basically, add a space when the next token (word piece) corresponds to a new word
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    # Set the predicted output as the original tweet when the tweet's sentiment is "neutral", or the tweet only contains one word
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    # Calculate the jaccard score between the predicted span and the actual span
    # The IOU (intersection over union) approach is detailed in the utils module's `jaccard` function:
    # https://www.kaggle.com/abhishek/utils
    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output


def eval_fn(data_loader, model, device):
    """
    Evaluation function to predict on the validation set
    """
    # Set model to evaluation mode
    # I.e., turn off dropout and set batchnorm to use overall mean and variance (from training), rather than batch level mean and variance
    # Reference: https://github.com/pytorch/pytorch/issues/5406
    model.eval()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    # Turns off gradient calculations (https://datascience.stackexchange.com/questions/32651/what-is-the-use-of-torch-no-grad-in-pytorch)
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        # Make predictions and calculate loss / jaccard score for each batch
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            # Move tensors to GPU for faster matrix calculations
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            # Predict logits for start and end indexes
            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            # Calculate loss for the batch
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            # Apply softmax to the predicted logits for the start and end indexes
            # This converts the "logits" to "probability-like" scores
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            # Calculate jaccard scores for each tweet in the batch
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)

            # Update running jaccard score and loss
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            # Print the running average loss and jaccard score
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg
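The competition metric itself is a word-level Jaccard (intersection over union) between the predicted and the actual span. A small worked example using utils.jaccard (the function is printed in full at the end of this post):

# intersection = {"so", "happy"} (2 words), union = {"i", "am", "so", "happy", "today"} (5 words)
print(utils.jaccard("I am so happy today", "so happy"))  # 2 / 5 = 0.4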

Training

def run(fold):
    """
    Train the model for a specified fold
    """
    # Read training csv
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train / validation split based on the kfold column
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    # Instantiate TweetDataset with training data
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    # Instantiate TweetDataset with validation data
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    # Set device as `cuda` (GPU)
    device = torch.device("cuda")
    # Load the pretrained RoBERTa config (roberta-base)
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = TweetModel(conf=model_config)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period."
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    # Apply early stopping with patience of 2
    # This means to stop training new epochs when 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    # Train for at most config.EPOCHS epochs (the scheduler was built for that many steps);
    # early stopping usually ends training sooner
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
run(fold=0)
run(fold=1)
run(fold=2)
run(fold=3)
run(fold=4)
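Equivalently, the five folds can be trained in a single loop (a small convenience, not part of the original notebook):

for f in range(5):
    run(fold=f)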

Run inference on the test data

df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_test.loc[:, "selected_text"] = df_test.text.values
df_test
textID text sentiment selected_text
0 f87dea47db Last session of the day http://twitpic.com/67ezh neutral Last session of the day http://twitpic.com/67ezh
1 96d74cb729 Shanghai is also really exciting (precisely -... positive Shanghai is also really exciting (precisely -...
2 eee518ae67 Recession hit Veronique Branquinho, she has to... negative Recession hit Veronique Branquinho, she has to...
3 01082688c6 happy bday! positive happy bday!
4 33987a8ee5 http://twitpic.com/4w75p - I like it!! positive http://twitpic.com/4w75p - I like it!!
... ... ... ... ...
3529 e5f0e6ef4b its at 3 am, im very tired but i can`t sleep ... negative its at 3 am, im very tired but i can`t sleep ...
3530 416863ce47 All alone in this old house again. Thanks for... positive All alone in this old house again. Thanks for...
3531 6332da480c I know what you mean. My little dog is sinkin... negative I know what you mean. My little dog is sinkin...
3532 df1baec676 _sutra what is your next youtube video gonna b... positive _sutra what is your next youtube video gonna b...
3533 469e15c5a8 http://twitpic.com/4woj2 - omgssh ang cute n... positive http://twitpic.com/4woj2 - omgssh ang cute n...

3534 rows × 4 columns

device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True
# Load each of the five trained models and move to GPU
model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load("model_0.bin"))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load("model_1.bin"))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load("model_2.bin"))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load("model_3.bin"))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load("model_4.bin"))
model5.eval()
TweetModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1)-(11): eleven more BertLayer blocks identical to (0), omitted here for brevity
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (drop_out): Dropout(p=0.1, inplace=False)
  (l_1): Linear(in_features=1536, out_features=400, bias=True)
  (l0): Linear(in_features=400, out_features=2, bias=True)
)
final_output = []

# Instantiate TweetDataset with the test data
test_dataset = TweetDataset(
    tweet=df_test.text.values,
    sentiment=df_test.sentiment.values,
    selected_text=df_test.selected_text.values
)

# Instantiate DataLoader with `test_dataset`
data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=config.VALID_BATCH_SIZE,
    num_workers=1
)

# Turn off gradient calculations
with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    # Predict the span containing the sentiment for each batch
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        # Predict start and end logits for each of the five models
        outputs_start1, outputs_end1 = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start3, outputs_end3 = model3(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start4, outputs_end4 = model4(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start5, outputs_end5 = model5(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Get the average start and end logits across the five models and use these as predictions
        # This is a form of "ensembling"
        outputs_start = (
            outputs_start1
            + outputs_start2
            + outputs_start3
            + outputs_start4
            + outputs_start5
        ) / 5
        outputs_end = (
            outputs_end1
            + outputs_end2
            + outputs_end3
            + outputs_end4
            + outputs_end5
        ) / 5

        # Apply softmax to the predicted start and end logits
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        # Convert the start and end scores to actual predicted spans (in string form)
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            final_output.append(output_sentence)


# post-process trick:
# Note: This trick comes from: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/140942
# When the LB resets, this trick won't help
def post_process(selected):
    return " ".join(set(selected.lower().split()))
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")
sample.loc[:, 'selected_text'] = final_output
sample.selected_text = sample.selected_text.map(post_process)
sample.to_csv("submission.csv", index=False)
sample.head()
textID selected_text
0 f87dea47db http://twitpic.com/67ezh the of day session last
1 96d74cb729 exciting
2 eee518ae67 such shame! a
3 01082688c6 happy bday!
4 33987a8ee5 i like
import utils
import inspect
source_DF = inspect.getsource(utils)
print(source_DF)
import numpy as np
import torch

class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class EarlyStopping:
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        if self.mode == "min":
            self.val_score = np.Inf
        else:
            self.val_score = -np.Inf

    def __call__(self, epoch_score, model, model_path):

        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))