# BERT Tutorial


## Install the libraries
First you need to install the following libraries:

    pip install transformers
    pip install ipywidgets
    pip install bertviz

Once everything is installed you can download 

In [1]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']

FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo


## Imports and definitions

In [2]:
from bertviz import model_view, head_view
from transformers import *

import numpy as np
import pprint

# Get the interactive Tools for Matplotlib
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel
import torch

In [3]:
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [4]:
#model_path = 'nboost/pt-bert-base-uncased-msmarco'
model_path = 'bert-base-uncased'

CLS_token = "[CLS]"
SEP_token = "[SEP]"


# Load the required tokenizer, configuration and model

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path,  output_hidden_states=True, output_attentions=True)  
model = AutoModel.from_pretrained(model_path, config=config)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Tokenization

See here for details: https://huggingface.co/docs/transformers/tokenizer_summary

In [6]:
sentence_a = "Is throat cancer treatable nowadays?"
sentence_b = "Tell me about lung cancer."
sentence_a = "58-year-old woman with hypertension"
sentence_b = "BACKGROUND : Longitudinal studies hypertension"
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True, max_length = 512, truncation = True)
pprint.pprint(inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  5388,  1011,  2095,  1011,  2214,  2450,  2007, 23760, 29048,
           102,  4281,  1024, 20134,  2913, 23760, 29048,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])}


In [7]:
print(tokenizer.decode(inputs["input_ids"][0].tolist()))

[CLS] 58 - year - old woman with hypertension [SEP] background : longitudinal studies hypertension [SEP]


In [8]:
input_ids = inputs['input_ids']
pprint.pprint(input_ids[0].tolist())

[101,
 5388,
 1011,
 2095,
 1011,
 2214,
 2450,
 2007,
 23760,
 29048,
 102,
 4281,
 1024,
 20134,
 2913,
 23760,
 29048,
 102]


In [9]:
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
pprint.pprint(tokens)

['[CLS]',
 '58',
 '-',
 'year',
 '-',
 'old',
 'woman',
 'with',
 'hyper',
 '##tension',
 '[SEP]',
 'background',
 ':',
 'longitudinal',
 'studies',
 'hyper',
 '##tension',
 '[SEP]']


# Model inference output

In [10]:
with torch.no_grad():
    outputs = model(**inputs)

In [11]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])

## Layer embeddings

In [12]:
# the last layer is the output embedding layer
output_embeddings = outputs['last_hidden_state']

In [39]:
token_throat = 2
token_lung = 11
# out[0][token]
throat_output_embedding = output_embeddings[0][token_throat]
throat_output_embedding

tensor([-9.2372e-01,  8.5944e-02, -1.2626e-01,  1.0111e-01,  8.1886e-01,
         9.4940e-01, -3.9501e-01,  4.4476e-01, -3.4203e-02,  3.1099e-01,
        -7.0724e-01, -9.2198e-01,  2.6819e-02,  1.0012e+00, -5.1365e-02,
         6.2856e-01,  1.1207e+00, -5.4419e-01, -3.7237e-01, -4.1753e-01,
        -4.6319e-01, -5.0991e-01,  8.6580e-01,  8.8857e-01,  1.1531e+00,
        -9.0972e-03,  6.0069e-01,  8.9090e-02,  1.5814e-01, -4.1446e-01,
         5.5867e-01,  7.8422e-02, -7.3513e-01, -3.8700e-01,  1.3769e-01,
         3.5607e-01, -7.5665e-01, -3.4882e-01, -7.3833e-02, -6.2202e-03,
        -8.8341e-01, -4.1256e-01, -3.7885e-01,  4.3063e-01,  2.4179e-01,
        -3.2909e-01,  7.7296e-01, -5.9851e-01,  1.0237e+00, -2.1382e-01,
        -7.0192e-01, -3.8709e-01,  3.6224e-01,  1.2725e-01, -1.3441e+00,
         7.9420e-01,  4.2507e-01,  3.7870e-01, -4.5034e-01,  9.5753e-01,
         2.9034e-01, -8.9403e-02,  1.4516e-01, -1.6441e+00,  1.7425e-01,
         3.2065e-01, -6.3529e-01,  1.4575e+00, -3.5

In [14]:
hidden_states = outputs['hidden_states']

In [40]:
#This is the output token embedding for the word throat
# hidden_states[layer][0][token])
layer = 0
throat_input_embedding = hidden_states[layer][0][token_throat]
throat_input_embedding

tensor([-2.6892e-02,  1.4420e-01,  1.9869e-01, -3.3330e-01,  2.0060e-01,
        -3.1660e-01,  5.9768e-01,  1.8488e-01,  3.9649e-01,  1.8929e-02,
         1.6458e-02, -1.7443e-02,  5.4390e-01, -9.6431e-01,  2.1967e-01,
         3.9405e-01,  4.9111e-01,  1.6861e-01,  3.6628e-01,  7.3393e-01,
        -2.2867e-01, -1.1707e-01,  9.9808e-02, -5.6193e-02,  5.3661e-01,
         3.8328e-01, -4.6531e-01,  8.5566e-01,  5.5343e-02, -5.9750e-01,
         8.1373e-01,  4.2629e-01, -5.7305e-01, -1.5089e-01, -3.6476e-01,
         2.9699e-01,  5.7970e-01,  1.5513e-01,  1.1959e-01, -9.3418e-01,
         5.7652e-01, -3.0385e-02, -2.0056e-01,  1.6055e-01, -1.0179e-01,
         4.7861e-01, -4.3599e-01, -2.3868e-01,  8.4883e-01, -2.3793e-01,
        -7.8738e-01,  5.0590e-02,  7.0244e-01,  2.9986e-01,  4.6054e-01,
        -3.8442e-01,  1.2574e-01, -4.0470e-01, -3.5727e-01,  1.6641e-01,
        -9.8675e-01,  4.1528e-01, -5.0538e-01,  5.0751e-01,  7.5293e-03,
        -7.0122e-01, -4.1537e-01,  8.8603e-02, -2.3

## Self-attention matrices

In [16]:
attention = outputs['attentions']
# The format of the attention tensor is:
# attention[layer][0][head][token1][token2]
layer = 3
head = 3

In [18]:
# this will given the attention from one token vs the other token
attention[layer][0][head][token_throat][token_lung]

tensor(0.0023)

In [17]:
# There's a softmax, so, the sum should be 1 
attention[layer][0][head][token_throat].sum()

tensor(1.)

In [41]:
attention[layer][0][head][token_throat].sum()

tensor(1.)

# Extract Word embeddings



In [20]:
def get_word_idx(sent: str, word: str):
    return sent.split(" ").index(word)

def get_word_vector(inputs, outputs, idx, layer):
    """Get a word vector by averaging the embeddings of 
       all word occurrences of that word in the input"""

    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(inputs.word_ids()) == idx)
    word_tokens_output = outputs.hidden_states[layer][0][token_ids_word]

    return word_tokens_output.mean(dim=0)

# The code below converts the tokens into a space delimited string.
# This will allow computing in which position of the BERT input sequence a given word is.
sentence_a = tokenizer.decode(inputs["input_ids"][0].tolist()).replace("[CLS] ", '').replace(" [SEP]", '')
word = "hypertension"
idx = get_word_idx(sentence_a, word)
print("Input sequence:", sentence_a)
print("The word \"", word, "\" occurs in position", idx, "of the BERT input sequence.")

word_embedding = get_word_vector(inputs, outputs, idx, 4)


Input sequence: 58 - year - old woman with hypertension background : longitudinal studies hypertension
The word " hypertension " occurs in position 7 of the BERT input sequence.


In [21]:
import torch
import re
from transformers import AutoTokenizer, AutoModel

def get_word_vector_from_ab(inputs, outputs, word, layer = '-1', ab = 'A'):
    """
    This method extracts a word embedding from the requested layer 
    for sentence_a or sentence_b. If the word is divided into tokens, 
    the word embedding will be the average of the corresponding token 
    embeddings.

    NOTE: If the same word occurs multiple times in the sentence, 
    this method returns the word embedding of the first occurrence.

    Keyword arguments:
        inputs -- input passed to the transformer
        outputs -- output of the transformer
        word -- target word
        layer -- layer from where the word embedding vector should 
        be extracted.
        ab -- should be 'A' or 'B' indication if the word embedding is to be extracted 
        from sentence_a or sentence_b, i.e., query or document.
    """
       
    sep_token = np.where(np.array(inputs["input_ids"][0].tolist()) == 102)[0][0]
    if ab == 'A':
        tokens_a = inputs["input_ids"][0][1:sep_token]
        sent = tokenizer.decode(tokens_a.tolist())
    else:
        tokens_b = inputs["input_ids"][0][sep_token+1:-1]
        sent = tokenizer.decode(tokens_b.tolist())

    word_ids = get_word_idx(sent, word)

    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(inputs.word_ids()) == word_ids)[0]
    sep_word = np.where(np.array(inputs.word_ids()) == None)[0][1]

    if ab == 'A':
        token_pos = token_ids_word < sep_word
    else:
        token_pos = token_ids_word > sep_word
        
    token_ids_word = token_ids_word[token_pos]
    word_tokens_output = outputs.hidden_states[layer][0][token_ids_word]

    # Change this to True for inspection
    details = True
    if details:
        input_id_list = input_ids[0].tolist() # Batch index 0
        tokens = tokenizer.convert_ids_to_tokens(input_id_list)
        str1 = " "

        print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
        print("INPUT SEQUENCE TOKENS: ", str1.join(tokens))
        print("TARGET WORD:", word)
        print("TARGET SENTENCE:", ab)
        print("TARGET SENTENCE WORDS [", sent, "]")
        print("The word [", word, "] occurs in position", idx, "of the BERT input sentence", ab)
        print("The word [", word, "] corresponds to the token(s)", token_ids_word, "of the BERT input sequence", ab)

    return word_tokens_output.mean(dim=0)


word_embedding = get_word_vector_from_ab(inputs, outputs, "woman", 4, 'A')

word_embedding = get_word_vector_from_ab(inputs, outputs, "hypertension", 4, 'B')

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
INPUT SEQUENCE TOKENS:  [CLS] 58 - year - old woman with hyper ##tension [SEP] background : longitudinal studies hyper ##tension [SEP]
TARGET WORD: woman
TARGET SENTENCE: A
TARGET SENTENCE WORDS [ 58 - year - old woman with hypertension ]
The word [ woman ] occurs in position 7 of the BERT input sentence A
The word [ woman ] corresponds to the token(s) [6] of the BERT input sequence A
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
INPUT SEQUENCE TOKENS:  [CLS] 58 - year - old woman with hyper ##tension [SEP] background : longitudinal studies hyper ##tension [SEP]
TARGET WORD: hypertension
TARGET SENTENCE: B
TARGET SENTENCE WORDS [ background : longitudinal studies hypertension ]
The word [ hypertension ] occurs in position 7 of the BERT input sentence B
The word [ hypertension ] corresponds to the token(s) [15 16] of the BERT input sequence B


In [26]:
word_embedding.shape

torch.Size([768])

# Attention visualization

More details are available here: https://github.com/jessevig/bertviz

In [23]:
call_html()
head_view(attention, tokens)

<IPython.core.display.Javascript object>

In [24]:
model_view(attention, tokens)

<IPython.core.display.Javascript object>

## Other pre-trained BERT models

There are many other models available for download (https://huggingface.co/models).

BioBERT is a popular BERT model trained on biomedical literature (https://academic.oup.com/bioinformatics/article/36/4/1234/5566506):

    model_path = 'dmis-lab/biobert-v1.1'

Another popular BERT is the SciBERT trained on scientific literature (https://arxiv.org/abs/1903.10676):

    model_path = 'allenai/scibert_scivocab_uncased'

See above where the variable 'model_path' is defined.