# coding=utf-8
# Copyright 2018 deepset team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import re
import numpy as np
from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
from transformers.tokenization_albert import AlbertTokenizer
from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
logger = logging.getLogger(__name__)
# Special characters used by the different tokenizers to indicate start of word / whitespace
SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)"
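# e.g. re.sub(SPECIAL_TOKENIZER_CHARS, "", "##ization") -> "ization"; leading
# Ġ (GPT-2/RoBERTa BPE) and ▁ (SentencePiece) markers are stripped the same way.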

class Tokenizer:
    """
    Simple wrapper for Tokenizers from the transformers package. Enables loading of different Tokenizer classes with
    a uniform interface.
    """

    @classmethod
    def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs: Additional keyword arguments that are passed on to the tokenizer's `from_pretrained()` call
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)

        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            else:
                raise ValueError(f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                                 f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                                 f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                                 f"XLNetTokenizer.")
        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        # `ret` stays None if an explicitly passed tokenizer_class is not supported
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
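

# Usage sketch for Tokenizer.load() (hypothetical example helper, not part of the API above;
# the model name and path below are only illustrative and must be available locally or
# downloadable via `from_pretrained`):
def _example_load_tokenizer():
    # Class is inferred from the name ("bert" -> BertTokenizer); extra kwargs such as
    # do_lower_case are forwarded to from_pretrained().
    bert_tok = Tokenizer.load("bert-base-uncased", do_lower_case=True)
    # For a saved model whose name doesn't reveal its type, name the class explicitly
    # ("path/to/my_saved_model" is a placeholder):
    roberta_tok = Tokenizer.load("path/to/my_saved_model", tokenizer_class="RobertaTokenizer")
    return bert_tok, roberta_tok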


def _words_to_tokens(words, word_offsets, tokenizer):
    """
    Tokenize "words" into subword tokens while keeping track of offsets and of whether a token is the start of a word.

    :param words: list of words
    :type words: list
    :param word_offsets: Character indices where each word begins in the original text
    :type word_offsets: list
    :param tokenizer: Tokenizer (e.g. from Tokenizer.load())
    :return: tokens, offsets, start_of_word
    """
    tokens = []
    token_offsets = []
    start_of_word = []
    for w, w_off in zip(words, word_offsets):
        # Get (subword) tokens of a single word.

        # empty / pure whitespace
        if len(w) == 0:
            continue
        # For the first word of a text: we just call the regular tokenize function.
        # For later words: we need to call it with add_prefix_space=True to get the same results as the roberta / gpt2 tokenizers
        # (see the discussion at https://github.com/huggingface/transformers/issues/1196)
        elif len(tokens) == 0:
            tokens_word = tokenizer.tokenize(w)
        else:
            try:
                tokens_word = tokenizer.tokenize(w, add_prefix_space=True)
            except TypeError:
                tokens_word = tokenizer.tokenize(w)

        # Sometimes the tokenizer returns no tokens
        if len(tokens_word) == 0:
            continue
        tokens += tokens_word

        # get global offset for each token in word + save marker for first tokens of a word
        first_tok = True
        for tok in tokens_word:
            token_offsets.append(w_off)
            # Depending on the tokenizer type, special chars are added to distinguish tokens with preceding
            # whitespace (=> "start of a word"). We need to get rid of these to calculate the original length of the token.
            orig_tok = re.sub(SPECIAL_TOKENIZER_CHARS, "", tok)
            w_off += len(orig_tok)
            if first_tok:
                start_of_word.append(True)
                first_tok = False
            else:
                start_of_word.append(False)

    assert len(tokens) == len(token_offsets) == len(start_of_word)
    return tokens, token_offsets, start_of_word
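

# Minimal sketch of _words_to_tokens() (hypothetical example helper; assumes the
# "bert-base-uncased" vocab is cached or downloadable, and the subword splits shown
# in the comments are only indicative of a typical WordPiece vocabulary):
def _example_words_to_tokens():
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)
    words = ["Some", "question", "about", "tokenization"]
    word_offsets = [0, 5, 14, 20]  # character positions of the words in the original text
    tokens, offsets, start_of_word = _words_to_tokens(words, word_offsets, tokenizer)
    # tokens        -> e.g. ['some', 'question', 'about', 'token', '##ization']
    # offsets       -> e.g. [0, 5, 14, 20, 25]   (offsets stay character-based per subword)
    # start_of_word -> e.g. [True, True, True, True, False]
    return tokens, offsets, start_of_word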


def truncate_sequences(seq_a, seq_b, tokenizer, max_seq_len, truncation_strategy='longest_first',
                       with_special_tokens=True, stride=0):
    """
    Reduces a single sequence or a pair of sequences to a maximum sequence length.
    The sequences can contain tokens or any other elements (offsets, masks, ...).
    If `with_special_tokens` is enabled, it'll remove some additional tokens to have exactly enough space for later
    adding special tokens (CLS, SEP etc.)

    Supported truncation strategies:

    - longest_first: (default) Iteratively removes tokens from the longest sequence (when there is a pair of input
      sequences) until the total length fits into max_seq_len. The returned overflowing tokens only contain overflow
      from the first sequence.
    - only_first: Only truncate the first sequence. Raises an error if the first sequence is shorter than or equal to
      num_tokens_to_remove.
    - only_second: Only truncate the second sequence.
    - do_not_truncate: Do not truncate (raises an error if the input sequence is longer than max_seq_len).

    :param seq_a: First sequence of tokens/offsets/...
    :type seq_a: list
    :param seq_b: Optional second sequence of tokens/offsets/...
    :type seq_b: None or list
    :param tokenizer: Tokenizer (e.g. from Tokenizer.load())
    :param max_seq_len: Maximum length of the resulting sequence(s), including the special tokens if `with_special_tokens` is enabled
    :type max_seq_len: int
    :param truncation_strategy: How the sequence(s) should be truncated. Default: "longest_first" (see above for the other options).
    :type truncation_strategy: str
    :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.)
    :type with_special_tokens: bool
    :param stride: optional stride of the window during truncation
    :type stride: int
    :return: truncated seq_a, truncated seq_b, overflowing tokens
    """
    pair = bool(seq_b is not None)
    len_a = len(seq_a)
    len_b = len(seq_b) if pair else 0
    num_special_tokens = tokenizer.num_added_tokens(pair=pair) if with_special_tokens else 0
    total_len = len_a + len_b + num_special_tokens
    overflowing_tokens = []

    if max_seq_len and total_len > max_seq_len:
        seq_a, seq_b, overflowing_tokens = tokenizer.truncate_sequences(seq_a, pair_ids=seq_b,
                                                                        num_tokens_to_remove=total_len - max_seq_len,
                                                                        truncation_strategy=truncation_strategy,
                                                                        stride=stride)
    return (seq_a, seq_b, overflowing_tokens)
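

# Minimal sketch of truncate_sequences() for a question/passage pair (hypothetical
# example helper; assumes a cached or downloadable "bert-base-uncased" vocab):
def _example_truncate_sequences():
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)
    seq_a = tokenizer.tokenize("how do subword tokenizers handle long questions ?")
    seq_b = tokenizer.tokenize("a passage that is clearly too long for the chosen maximum length")
    seq_a, seq_b, overflow = truncate_sequences(seq_a=seq_a, seq_b=seq_b, tokenizer=tokenizer,
                                                max_seq_len=16)
    # For BERT, 3 special tokens ([CLS], [SEP], [SEP]) are reserved for a pair, so
    # len(seq_a) + len(seq_b) <= 16 - 3 after truncation. With the default
    # "longest_first" strategy, `overflow` only covers overflow from the first sequence.
    return seq_a, seq_b, overflow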


def insert_at_special_tokens_pos(seq, special_tokens_mask, insert_element):
    """
    Adds elements to a sequence at the positions that align with special tokens.
    This is useful for expanding label ids or masks, so that they align with the corresponding tokens (incl. the special tokens).

    Example:

    .. code-block:: python

        # Tokens:  ["CLS", "some", "words", "SEP"]
        >>> special_tokens_mask = [1, 0, 0, 1]
        >>> lm_label_ids = [12, 200]
        >>> insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)
        [-1, 12, 200, -1]

    :param seq: List where you want to insert new elements
    :type seq: list
    :param special_tokens_mask: list with "1" at the positions of the special tokens
    :type special_tokens_mask: list
    :param insert_element: the value you want to insert
    :return: list
    """
    new_seq = seq.copy()
    special_tokens_indices = np.where(np.array(special_tokens_mask) == 1)[0]
    for idx in special_tokens_indices:
        new_seq.insert(idx, insert_element)
    return new_seq