- A common simple character encoding is ASCII,
- We can encode each word as a number (token) — `Tokenizer`.
- Tokenize words > build all the words to make a corpus > turn your sentences into lists of values based on these tokens. > manipulate these lists (make the same length, for example)
1from tensorflow.keras.preprocessing.text import Tokenizer
2
3sentences = [
4 'i love my dog',
5 'I, love my cat',
6 'You love my dog so much!'
7]
8
9tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
10 # num_words: max of words to be tokenized & pick
11 # the most common 100 words.
12 # More words, more accuracy, more time to train
13 # oov_token: replace unseen words by "<OOV>"
14tokenizer.fit_on_texts(sentences) # fix texts based on tokens
1# indexing words
2word_index = tokenizer.word_index
3print(word_index)
4# {'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'so': 8, 'much': 9}
5# "!", ",", capital, ... are removed
1# encode sentences
2sequences = tokenizer.texts_to_sequences(sentences)
3print(sequences)
4# [[4, 2, 3, 5],
5# [4, 2, 3, 6],
6# [7, 2, 3, 5, 8, 9]]
7# if a word is not in the word index, it will be lost in the texts_to_sequences()
1# make encoded sentences equal
2from tensorflow.keras.preprocessing.sequence import pad_sequences
3
4padded = pad_sequences(sequences, value=-1,
5 maxlen=5, padding="post", truncating="post")
6 # maxlen: max len of encoded sentence
7 # value: value to be filled (default 0)
8 # padding: add missing values at beginning or ending of sentence?
9 # truncating: longer than maxlen? cut at beginning or ending?
10print(padded)
11# [[ 4 2 3 5 -1]
12# [ 4 2 3 6 -1]
13# [ 7 2 3 5 8]]
1# read json text
2import json
3with open("/tmp/sarcasm.json", 'r') as f:
4 datastore = json.load(f)
5
6sentences = []
7labels = []
8urls = []
9for item in datastore:
10 sentences.append(item['headline'])
11 labels.append(item['is_sarcastic'])
12 urls.append(item['article_link'])
- Word embeddings = the idea in which words and associated words are clustered as vectors in a multi-dimensional space. That allows words with similar meaning to have a similar representation.
- The meaning of the words can come from labeling of the dataset.
- Example: "dull" and "boring" show up a lot in negative reviews → they have similar sentiments → they are close to each other in the sentence → thus their vectors will be similar → the NN trains and learns these vectors, associating them with the labels to come up with what's called an embedding.
- The embedding dimension is the number of dimensions of the vector representing each word's encoding.
1import tensorflow as tf
2print(tf.__version__) # check version of tensorflow
3
4# If you are using tf1, you need below code
5tf.enable_eager_execution()
1# IMDB reviews dataset
2import tensorflow_datasets as tfds
3imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
4
5train_data, test_data = imdb['train'], imdb['test']
6
7for s,l in train_data: # "s" for sentences "l" for labels
8 # The values for "s" and "l" are tensors
9 # so we need to extract their values
10 training_sentences.append(s.numpy().decode('utf8'))
11 training_labels.append(l.numpy())
1# Prepare for the NN
2vocab_size = 10000
3embedding_dim = 16 # embedding to dim 16
4max_length = 120 # of each sentence
5trunc_type='post' # cut the last words
6oov_tok = "<OOV>" # replace not-encoded words by this
7
8from tensorflow.keras.preprocessing.text import Tokenizer
9from tensorflow.keras.preprocessing.sequence import pad_sequences
10
11tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
12tokenizer.fit_on_texts(training_sentences)
13 # encoding the words
14word_index = tokenizer.word_index
15 # list of word index (built based on training set)
16 # there may be many oov_tok in test set
17sequences = tokenizer.texts_to_sequences(training_sentences)
18 # apply on sentences
19padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
20 # padding the sentences
21
22# apply to the test set
23testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
24testing_padded = pad_sequences(testing_sequences,maxlen=max_length)
1# Simple NN
2model = tf.keras.Sequential([
3 tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
4 # The result of embedding will be a 2D array:
5 # length of sentence x embedding_dim
6 tf.keras.layers.Flatten(),
7 # Alternatively (a little diff on speed and accuracy):
8 # tf.keras.layers.GlobalAveragePooling1D()
9 # average across the vectors to flatten it out
10 tf.keras.layers.Dense(6, activation='relu'),
11 tf.keras.layers.Dense(1, activation='sigmoid')
12])
13model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
14model.summary()
1# Training
2model.fit(padded, training_labels_final, epochs=10, validation_data=(testing_padded, testing_labels_final))
1# the result
2e = model.layers[0] # get the result of the embedding layers
3weights = e.get_weights()[0]
4print(weights.shape) # shape: (vocab_size, embedding_dim)
If you wanna visualize the result (in 3D) with Embedding projector
1import io
2
3out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
4out_m = io.open('meta.tsv', 'w', encoding='utf-8')
5for word_num in range(1, vocab_size):
6 word = reverse_word_index[word_num]
7 embeddings = weights[word_num]
8 out_m.write(word + "\\n")
9 out_v.write('\\t'.join([str(x) for x in embeddings]) + "\\n")
10out_v.close()
11out_m.close()
12
13try:
14 from google.colab import files
15except ImportError:
16 pass
17else:
18 files.download('vecs.tsv')
19 files.download('meta.tsv')
📙 Notebook: Train Sarcasm dataset.
- In text data, it often happens that the accuracy increases over the training epochs but the loss also increases sharply. We can "play" with the hyperparameters to see the effect.
1# Run this to ensure TensorFlow 2.x is used
2try:
3 # %tensorflow_version only exists in Colab.
4 %tensorflow_version 2.x
5except Exception:
6 pass
👉 datasets/imdb_reviews.md at master · tensorflow/datasets
👉 tfds.features.text.SubwordTextEncoder | TensorFlow Datasets
📙 Notebook: Pre-tokenizer example.
👉 Video explaining the code.
👉 tfds.features.text.SubwordTextEncoder | TensorFlow Datasets
📙 Notebook: Pre-tokenizer example.
👉 Video explaining the code.
- Some people have already done the work (tokenization) for you.
- Try on IMDB dataset that has been pre-tokenized.
- The tokenization is done on subwords!
- The sequence of words can be just as important as their existence.
1# load imdb dataset from tensorflow
2import tensorflow_datasets as tfds
3imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
4
5# extract train/test sets
6train_data, test_data = imdb['train'], imdb['test']
7
8# take the tokenizer
9tokenizer = info.features['text'].encoder
10
11print(tokenizer.subwords)
12# ['the_', ', ', '. ', 'a_', 'and_', 'of_', 'to_', 's_', 'is_',...
1sample_string = 'TensorFlow, from basics to mastery'
2
3tokenized_string = tokenizer.encode(sample_string)
4print ('Tokenized string is {}'.format(tokenized_string))
5# Tokenized string is [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]
6
7original_string = tokenizer.decode(tokenized_string)
8print ('The original string: {}'.format(original_string))
9# The original string: TensorFlow, from basics to mastery
1# take a look on tokenized string
2# case sensitive + punctuation maintained
3for ts in tokenized_string:
4 print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))
5
6# 6307 ----> Ten
7# 2327 ----> sor
8# 4043 ----> Fl
9# ...
- The code takes quite long to run (4 minutes per epoch when using a GPU on Colab) because there are a lot of hyperparameters and sub-words.
- Result: 50% acc & loss is decreasing but very small.
- Because we are using sub-words, not full words → the sub-words are often nonsensical on their own → they only become meaningful when we put them together in sequences → learning from sequences would be a great way forward → RNN (Recurrent Neural Networks)
- The relative ordering, the sequence of words, matters for the meaning of the sentence.
- For NN to take into account for the ordering of the words: RNN (Recurrent Neural Networks), LSTM (Long short-term memory).
- Why LSTM and not a plain RNN? With an RNN, the context is preserved from timestamp to timestamp, BUT it may get lost in longer sentences → LSTM does better because it has a cell state.
- Example of using LSTM: "I grew up in Ireland, I went to school and at school, they made me learn how to speak..." → "speak" is the context and we go back to the beginning to catch "Ireland", then the next word could be "learn how to speak Gaelic"!
- The usual NN, something like "f(data, labels)=rules", cannot take sequences into account.
- An example of using sequences: Fibonacci sequence → the result of current function is the input of next function itself,...
- Sometimes, the sequence context leads to a loss of information, as in the earlier example of "Ireland" and "Gaelic".
- LSTM has an additional pipeline called the Cell State. It can pass through the network to impact it and helps keep the context from earlier tokens relevant.
1# SINGLE LAYER LSTM
2model = tf.keras.Sequential([
3 tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
4 tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
5 # 64: #outputs desired (but the result may be different)
6 tf.keras.layers.Dense(64, activation='relu'),
7 tf.keras.layers.Dense(1, activation='sigmoid')
8])
📙 Notebook: IMDB Subwords 8K with Single Layer LSTM
1# MULTI LAYER LSTM
2model = tf.keras.Sequential([
3 tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
4 tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
5 # return_sequences=True: required if we wanna feed LSTM into another one
6 # It ensures that the output of LSTM match the desired inputs of the next one
7 tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
8 tf.keras.layers.Dense(64, activation='relu'),
9 tf.keras.layers.Dense(1, activation='sigmoid')
10])
📙 Notebook: IMDB Subwords 8K with Multi Layer LSTM
1# WITHOUT LSTM (like previous section)
2model = tf.keras.Sequential([
3 tf.keras.layers.Embedding(vocab_size, embedding_dim,
4 input_length=max_length),
5 #
6 tf.keras.layers.Flatten(),
7 tf.keras.layers.GlobalmaxPooling1D(),
8 #
9 tf.keras.layers.Dense(6, activation='relu'),
10 tf.keras.layers.Dense(1, activation='sigmoid')
11])
1# WITH LSTM
2model = tf.keras.Sequential([
3 tf.keras.layers.Embedding(vocab_size, embedding_dim,
4 input_length=max_length),
5 #
6 tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
7 #
8 tf.keras.layers.Dense(6, activation='relu'),
9 tf.keras.layers.Dense(1, activation='sigmoid')
10])
1model = tf.keras.Sequential([
2 tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
3 #
4 tf.keras.layers.Conv1D(128, 5, activation='relu'),
5 #
6 tf.keras.layers.GlobalAveragePooling1D(),
7 tf.keras.layers.Dense(64, activation='relu'),
8 tf.keras.layers.Dense(1, activation='sigmoid')
9])
Try with 4 different choices:
- Simple NN: 5s/epoch, 170K params, nice acc, overfitting.
- LSTM: 43s/epoch, 30K params, acc better, overfitting.
- GRU (Gated Recurrent Unit layer, a different type of RNN): 20s/epoch, 169K params, very good acc, overfitting.
- Conv1D: 6s/epoch, 171K params, good acc, overfitting.
Remark: With text, you'll probably get a bit more overfitting than you would with images, because there are out-of-vocabulary words in the validation data.
One application of sequence models: read text then generate another look-alike text.
- How do they predict a new word in the notebook? → Check this video.
1input_sequences = []
2for line in corpus:
3 # convert each sentence to list of numbers
4 token_list = tokenizer.texts_to_sequences([line])[0]
5 # convert each list to n-gram sequence
6 # eg. from [1,2,3,4,5]
7 # to [1,2], [1,2,3], [1,2,3,4], [1,2,3,4,5]
8 for i in range(1, len(token_list)):
9 n_gram_sequence = token_list[:i+1]
10 input_sequences.append(n_gram_sequence)
11
12# pad sequences to the maximum length of all sentences
13max_sequence_len = max([len(x) for x in input_sequences])
14input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
15
16# create predictors and label
17# [0,0,1,2] -> 2 is label
18# [0,1,2,3] -> 3 is label
19# [1,2,3,4] -> 4 is label
20xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
21
22# one-hot encoding the labels (classification problem)
23ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
1model = Sequential()
2model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
3model.add(Bidirectional(LSTM(20))) # take only 20 units (bi-direction) to train
4model.add(Dense(total_words, activation='softmax'))
5model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
6history = model.fit(xs, ys, epochs=500, verbose=1)
1seed_text = "Laurence went to dublin"
2next_words = 100
3
4for _ in range(next_words):
5 token_list = tokenizer.texts_to_sequences([seed_text])[0]
6 # "went to dublin" -> [134, 13, 59]
7 token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
8 # [0, 0, 0, 0, 0, 0, 0, 134, 13, 59]
9 predicted = model.predict_classes(token_list, verbose=0)
10 output_word = ""
11 # revert an index back to the word
12 for word, index in tokenizer.word_index.items():
13 if index == predicted:
14 output_word = word
15 break
16 # add predicted word to the seed text and make another prediction
17 seed_text += " " + output_word
18print(seed_text)
19# all the words are predicted based on the probability
20# next one will be less certain than the previous
21# -> less meaningful
- Using more words will help.
1# read from a file
2tokenizer = Tokenizer()
3data = open('/tmp/irish-lyrics-eof.txt').read()
4corpus = data.lower().split("\\n")
A few small changes from the previous model:
1model = Sequential()
2model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
3model.add(Bidirectional(LSTM(150)))
4model.add(Dense(total_words, activation='softmax'))
5adam = Adam(lr=0.01) # customized optimizer
6model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
7#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
8history = model.fit(xs, ys, epochs=100, verbose=1)
- Different convergences can create different poetry.
- If we use one-hot for a very big corpus → take a lot of RAM → use character-based prediction → #unique characters is far less than #unique words. → notebook "Text generation with RNN"