# Lab 1 - Basic text processing

https://spacy.io/usage/linguistic-features

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html


## Word and Character based tokenization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: each string is one document.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Custom stop words for the commented-out variant below. Kept as a list
# because recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list, or None.
my_stop_words = ['is', 'the']

# ACTIVE CONFIGURATION: word UNIGRAMS and BIGRAMS, no stop-word filtering
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='word', stop_words=None)

# Alternatives: uncomment exactly one line to override the vectorizer above.
# UNIGRAMS with the built-in English stop words / with the custom stop words
#vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', stop_words = 'english')
#vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', stop_words = my_stop_words)

# UNIGRAMS and BIGRAMS
#vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')

# Character GRAMS
#vectorizer = CountVectorizer(ngram_range=(3,4), analyzer='char')

# Learn the vocabulary and build the sparse document-term count matrix.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement and returns the vocabulary as a NumPy array.
print(vectorizer.get_feature_names_out())

# toarray() yields a plain ndarray (rows = documents, columns = vocabulary
# entries) instead of the legacy np.matrix produced by todense().
X_dense = X.toarray()
print(X_dense)


In [None]:
## Exercise 1: With the corpus tokenized and stored as a dense matrix, find the indexes of the documents that contain a given word.
# Print both the indexes and the number of resulting documents.
# Hint: use get_feature_names_out() (get_feature_names() on scikit-learn < 1.0) to obtain the corpus vocabulary, and the np.where function.
import numpy as np

# Word to look up in the corpus vocabulary.
word = "document"

# Placeholder: the solution goes here.
...

In [None]:
# Exercise 2: Repeat the previous exercise, but now assume that you have N words.
# Query words to look up in the corpus vocabulary.
words = ["third", "one"]


# Writing a Custom Tokenizer - The Whitespace tokenizer

In [None]:
import spacy
from spacy import displacy
from pathlib import Path
from spacy.tokens import Doc

class WhitespaceTokenizer:
    """Tokenizer that splits text on single space characters only.

    Instances are callable: given a string they return a ``Doc`` whose
    ``words`` / ``spaces`` lists are built from ``text.split(" ")``.
    """

    def __init__(self, vocab):
        """Keep the vocabulary that every produced ``Doc`` is created with."""
        self.vocab = vocab

    def __call__(self, text):
        """Split *text* on ``" "`` and wrap the pieces in a ``Doc``."""
        words = []
        spaces = []
        for piece in text.split(" "):
            if piece:
                # Regular token; assume a space follows until proven otherwise.
                words.append(piece)
                spaces.append(True)
            else:
                # An empty piece comes from consecutive/leading/trailing
                # spaces. Zero-length tokens are avoided by emitting an
                # explicit " " token that carries no trailing space.
                words.append(" ")
                spaces.append(False)

        if words[-1] == " ":
            # The text ended with a space: drop the placeholder token so the
            # previous token's trailing space accounts for it.
            words.pop()
            spaces.pop()
        else:
            # Nothing follows the final token.
            spaces[-1] = False

        return Doc(self.vocab, words=words, spaces=spaces)

# Start from a blank English pipeline and replace its tokenizer with the
# whitespace-only tokenizer.
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# Only spaces separate tokens, so punctuation stays attached to the words.
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])

In [None]:
## Exercise 3: Adapt the Whitespace Tokenizer to remove punctuation.


# Tokenization with Spacy

In [None]:
import spacy
from spacy import displacy
from pathlib import Path

# Load the small English pipeline and analyse one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Set to True to also write the rendered figures below ./images/.
save_figures = False

# One row per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, orthographic shape, and the is_alpha / is_stop flags.
print("token".ljust(10), "lemma".ljust(10), "pos".ljust(6), "tag".ljust(6), "dep".ljust(10),
            "shape".ljust(10), "alpha", "stop")
print("------------------------------------------------------------------------------")
for token in doc:
    print(token.text.ljust(10), token.lemma_.ljust(10), token.pos_.ljust(6), token.tag_.ljust(6), token.dep_.ljust(10),
            token.shape_.ljust(10), token.is_alpha, token.is_stop)

# Show the dependency parse inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

# With jupyter=True displaCy only displays the figure and returns no markup,
# so the markup used for saving is requested separately with jupyter=False.
html_dep = displacy.render(doc, style="dep", jupyter=False)

if save_figures:
    file_name = "demo-dep.html"
    output_path = Path("./images") / file_name
    # Create the target directory on first use; write_text closes the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html_dep, encoding="utf-8")


# Word Piece Tokenizer

Download the Word Piece Vocabulary.


In [None]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt

In [None]:
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc
import spacy


class BertTokenizer:
    """Callable tokenizer that turns WordPiece output into a ``Doc``.

    The text is encoded with a ``BertWordPieceTokenizer`` built from
    *vocab_file*; every resulting piece becomes one token of the ``Doc``.
    """

    def __init__(self, vocab, vocab_file, lowercase=True):
        """Store *vocab* and build the WordPiece tokenizer.

        *vocab_file* and *lowercase* are forwarded unchanged to
        ``BertWordPieceTokenizer``.
        """
        self.vocab = vocab
        self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)

    def __call__(self, text):
        """Encode *text* and return a ``Doc`` holding the word pieces."""
        encoding = self._tokenizer.encode(text)
        pieces = encoding.tokens
        offsets = encoding.offsets
        last_index = len(pieces) - 1

        words = []
        spaces = []
        for idx, (piece, (_, piece_end)) in enumerate(zip(pieces, offsets)):
            words.append(piece)
            if idx >= last_index:
                # The final piece is always flagged as followed by a space.
                spaces.append(True)
                continue
            # A gap between this piece's end offset and the next piece's
            # start offset is taken to mean a space sat between them.
            following_start, _ = offsets[idx + 1]
            spaces.append(following_start > piece_end)

        return Doc(self.vocab, words=words, spaces=spaces)

# Plug the WordPiece tokenizer into a blank English pipeline.
nlp = spacy.blank("en")
# The downloaded vocabulary is the *cased* one, so lowercasing is disabled;
# with lowercase=True the input is lowercased before lookup and capitalised
# vocabulary entries can never match.
nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-large-cased-vocab.txt", lowercase=False)
#doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
# Print each word piece on its own line, then the text rebuilt from the Doc.
for token in doc:
    print(token)
print(doc.text)

# Named Entities

In [None]:
import spacy
from spacy import displacy
from pathlib import Path

# Load the small English pipeline and run it on the example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per named entity: text, label, and character offsets in the text.
for ent in doc.ents:
    print(ent.text.ljust(12), ent.label_.ljust(10), ent.start_char, ent.end_char)

# Show the entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# With jupyter=True displaCy only displays the figure and returns no markup,
# so the markup used for saving is requested separately with jupyter=False.
html_ent = displacy.render(doc, style="ent", jupyter=False)

# NOTE: save_figures is assigned in the "Tokenization with Spacy" cell; run
# that cell first.
if save_figures:
    file_name = "demo-ent.html"
    output_path = Path("./images") / file_name
    # Create the target directory on first use; write_text closes the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html_ent, encoding="utf-8")
