Source code for farm.data_handler.input_features

"""
Contains functions that turn readable clear text input into dictionaries of features
"""


import logging
import collections
from dotmap import DotMap
import numpy as np

from farm.data_handler.samples import Sample
from farm.data_handler.utils import (
    expand_labels,
    pad,
    mask_random_words,
)
from farm.modeling.tokenization import insert_at_special_tokens_pos

logger = logging.getLogger(__name__)


def sample_to_features_text(
    sample, tasks, max_seq_len, tokenizer
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by a text classification model.

    :param sample: Sample object that contains human readable text and label fields from a single text classification data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask" and "segment_ids"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    # TODO It might be cleaner to adjust the data structure in sample.tokenized
    # Verify if this current quickfix really works for pairs
    tokens_a = sample.tokenized["tokens"]
    tokens_b = sample.tokenized.get("tokens_b", None)

    inputs = tokenizer.encode_plus(
        tokens_a,
        tokens_b,
        add_special_tokens=True,
        max_length=max_seq_len,
        truncation_strategy='do_not_truncate'  # We've already truncated our tokens before
    )

    input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len

    feat_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
    }

    # Add labels for the different tasks
    for task_name, task in tasks.items():
        try:
            label_name = task["label_name"]
            label_raw = sample.clear_text[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                try:
                    label_ids = [label_list.index(label_raw)]
                except ValueError as e:
                    raise ValueError(f'[Task: {task_name}] Observed label {label_raw} not in defined label_list')
            elif task["task_type"] == "multilabel_classification":
                # multi-hot-format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            elif task["task_type"] == "regression":
                label_ids = [float(label_raw)]
            else:
                raise ValueError(task["task_type"])
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
        if label_ids is not None:
            feat_dict[task["label_tensor_name"]] = label_ids
    return [feat_dict]
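
# Usage sketch (not part of the original module): a minimal, hedged example of how
# sample_to_features_text() is typically fed. The Sample constructor arguments and the
# transformers tokenizer shown here are assumptions and may differ from your installed versions.
def _example_sample_to_features_text():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    text = "FARM makes transfer learning simple."
    # Assumption: Sample takes id, clear_text and tokenized dicts like this
    sample = Sample(
        id="doc1-0",
        clear_text={"text": text, "label": "positive"},
        tokenized={"tokens": tokenizer.tokenize(text)},
    )
    tasks = {
        "sentiment": {
            "task_type": "classification",
            "label_name": "label",
            "label_list": ["negative", "positive"],
            "label_tensor_name": "label_ids",
        }
    }
    features = sample_to_features_text(sample, tasks, max_seq_len=32, tokenizer=tokenizer)
    # features[0] contains "input_ids", "padding_mask", "segment_ids" and "label_ids"
    return features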
def samples_to_features_ner(
    sample,
    tasks,
    max_seq_len,
    tokenizer,
    non_initial_token="X",
    **kwargs
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word-initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    tokens = sample.tokenized["tokens"]

    inputs = tokenizer.encode_plus(text=tokens,
                                   text_pair=None,
                                   add_special_tokens=True,
                                   max_length=max_seq_len,
                                   truncation_strategy='do_not_truncate'  # We've already truncated our tokens before
                                   )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # We construct a mask to identify the first token of a word. We will later only use them for predicting entities.
    # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
    # For BERT we add a 0 at the start and end (for CLS and SEP)
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    initial_mask = insert_at_special_tokens_pos(initial_mask, special_tokens_mask, insert_element=0)
    assert len(initial_mask) == len(input_ids)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            label_ids = [label_list.index(lt) for lt in labels_token]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf you are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your "
                           "processor and check that labels in the input data are correct.")

    # This mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
    }

    if label_ids:
        feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
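
# Illustration (not part of the original module): a hedged, standalone sketch of the
# word-to-token label expansion that expand_labels() performs above. Word-level NER tags
# stay on word-initial subword tokens; every other subword position receives the
# placeholder non_initial_token ("X"), so predictions are only scored on word starts.
def _example_expand_ner_labels():
    word_labels = ["B-PER", "I-PER", "O"]      # one label per whitespace-separated word
    initial_mask = [1, 0, 1, 0, 0, 1]          # 1 = first subword token of a word
    non_initial_token = "X"

    token_labels = []
    word_idx = 0
    for is_initial in initial_mask:
        if is_initial:
            token_labels.append(word_labels[word_idx])
            word_idx += 1
        else:
            token_labels.append(non_initial_token)
    # token_labels == ["B-PER", "X", "I-PER", "X", "X", "O"]
    return token_labels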
def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in BERT, is_next_label_id = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]

    # encode string tokens to input_ids and add special tokens
    inputs = tokenizer.encode_plus(text=tokens_a,
                                   text_pair=tokens_b,
                                   add_special_tokens=True,
                                   max_length=max_seq_len,
                                   truncation_strategy='do_not_truncate'  # We've already truncated our tokens before
                                   )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "lm_label_ids": lm_label_ids,
    }

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
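
# Note (not part of the original module): positions whose lm_label_id is -1 (unmasked tokens,
# special tokens, padding) are meant to be ignored by the LM loss. A hedged sketch of how such
# labels are typically consumed downstream, using PyTorch's ignore_index mechanism:
def _example_ignore_lm_padding_labels():
    import torch

    logits = torch.randn(6, 30522)                       # (seq_len, vocab_size) for one sequence
    lm_label_ids = torch.tensor([-1, 2023, -1, -1, 1012, -1])
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
    loss = loss_fn(logits, lm_label_ids)                 # only the two labeled positions contribute
    return loss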
def sample_to_features_squad(sample, tokenizer, max_seq_len, max_answers=6):
    """ Prepares data for processing by the model. Supports cases where there are
    multiple answers for the one question/document pair. max_answers is by default set to 6 since
    that is the maximum number of answers in the squad2.0 dev set."""

    # Initialize some basic variables
    is_impossible = sample.clear_text["is_impossible"]
    question_tokens = sample.tokenized["question_tokens"]
    question_start_of_word = sample.tokenized["question_start_of_word"]
    question_len_t = len(question_tokens)
    passage_start_t = sample.tokenized["passage_start_t"]
    passage_tokens = sample.tokenized["passage_tokens"]
    passage_start_of_word = sample.tokenized["passage_start_of_word"]
    passage_len_t = len(passage_tokens)
    answers = sample.tokenized["answers"]

    # Turn sample_id into a list of ints (len 3) where the ints are the two halves of the squad_id
    # converted to base 10 (see utils.encode_squad_id) and the passage_id
    sample_id = [int(x) for x in sample.id.split("-")]

    # Generates a numpy array of shape (max_answers, 2) where row i holds the start and end token indices
    # of the ith answer. The array is filled with -1 since the number of answers is often less than max_answers.
    # No-answer labels are represented by (0, 0)
    labels = generate_labels(answers,
                             passage_len_t,
                             question_len_t,
                             tokenizer,
                             max_answers)

    # Generate a start of word vector for the full sequence (i.e. question + passage + special tokens).
    # This will allow us to perform evaluation during training without clear text.
    # Note that in the current implementation, special tokens do not count as start of word.
    start_of_word = combine_vecs(question_start_of_word, passage_start_of_word, tokenizer, spec_tok_val=0)

    # Combines question_tokens and passage_tokens (str) into a single encoded vector of token indices (int)
    # called input_ids. This encoded vector also contains special tokens (e.g. [CLS]). It will have length =
    # (question_len_t + passage_len_t + n_special_tokens). This may be less than max_seq_len but will not be greater
    # than max_seq_len since truncation was already performed when the document was chunked into passages
    # (c.f. create_samples_squad() )
    encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"],
                                    text_pair=sample.tokenized["passage_tokens"],
                                    add_special_tokens=True,
                                    max_length=None,
                                    truncation_strategy='only_second',
                                    return_tensors=None)
    input_ids = encoded["input_ids"]
    segment_ids = encoded["token_type_ids"]

    # seq_2_start_t is the index of the first token in the second text sequence (e.g. passage)
    seq_2_start_t = segment_ids.index(1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1)
    pad_idx = tokenizer.pad_token_id
    padding = [pad_idx] * (max_seq_len - len(input_ids))
    zero_padding = [0] * (max_seq_len - len(input_ids))

    input_ids += padding
    padding_mask += zero_padding
    segment_ids += zero_padding
    start_of_word += zero_padding

    # The Roberta tokenizer generates a segment_ids vector that separates the first sequence from the second.
    # However, when this is passed in to the forward fn of the Roberta model, it throws an error since
    # Roberta has only a single token type embedding (!!!). To get around this, we want to have a segment_ids
    # vec that is only 0s
    if tokenizer.__class__.__name__ == "RobertaTokenizer":
        segment_ids = np.zeros_like(segment_ids)

    # Todo: explain how only the first of labels will be used in train, and the full array will be used in eval
    # TODO Offset, start of word and spec_tok_mask are not actually needed by model.forward() but are needed for model.formatted_preds()
    # TODO passage_start_t is index of passage's first token relative to document
    # I don't think we actually need offsets anymore
    feature_dict = {"input_ids": input_ids,
                    "padding_mask": padding_mask,
                    "segment_ids": segment_ids,
                    "is_impossible": is_impossible,
                    "id": sample_id,
                    "passage_start_t": passage_start_t,
                    "start_of_word": start_of_word,
                    "labels": labels,
                    "seq_2_start_t": seq_2_start_t}

    return [feature_dict]
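
# Note (not part of the original module): a hedged sketch of the padding step above. Unlike the
# attention and segment vectors, input_ids must be padded with the tokenizer's own pad id, which
# is not 0 for every model (e.g. RoBERTa uses 1, with <s> = 0 and </s> = 2).
def _example_pad_input_ids(input_ids, max_seq_len, pad_token_id):
    n_pad = max_seq_len - len(input_ids)
    return input_ids + [pad_token_id] * n_pad, [1] * len(input_ids) + [0] * n_pad

# e.g. _example_pad_input_ids([0, 9064, 2], 6, pad_token_id=1)
# -> ([0, 9064, 2, 1, 1, 1], [1, 1, 1, 0, 0, 0])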
def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answers):
    """
    Creates a QA label for each answer in answers. The labels are the indices of the start and end token
    relative to the combined question + passage sequence (including special tokens). They are contained in
    an array of size (max_answers, 2). -1 is used to fill the array since the number of answers is often
    less than max_answers. When the answer is not fully contained in the passage, or the question
    is impossible to answer, the start_idx and end_idx are 0, i.e. start and end are on the very first token
    (in most models, this is the [CLS] token).
    """

    label_idxs = np.full((max_answers, 2), fill_value=-1)

    # If there are no answers
    if len(answers) == 0:
        label_idxs[0, :] = 0
        return label_idxs

    for i, answer in enumerate(answers):
        start_idx = answer["start_t"]
        end_idx = answer["end_t"]

        # We are going to operate on one-hot label vectors which will later be converted back to label indices.
        # This is to take advantage of tokenizer.encode_plus() which applies model dependent special token conventions.
        # The two label vectors (start and end) are composed of sections that correspond to the question and
        # passage tokens. These are initialized here. The section corresponding to the question
        # will always be composed of 0s.
        start_vec_question = [0] * question_len_t
        end_vec_question = [0] * question_len_t
        start_vec_passage = [0] * passage_len_t
        end_vec_passage = [0] * passage_len_t

        # If the answer is in the current passage, populate the label vector with 1s for start and end
        if answer_in_passage(start_idx, end_idx, passage_len_t):
            start_vec_passage[start_idx] = 1
            end_vec_passage[end_idx] = 1

        # Combine the sections of the label vectors. The length of each of these will be:
        # question_len_t + passage_len_t + n_special_tokens
        start_vec = combine_vecs(start_vec_question,
                                 start_vec_passage,
                                 tokenizer,
                                 spec_tok_val=0)
        end_vec = combine_vecs(end_vec_question,
                               end_vec_passage,
                               tokenizer,
                               spec_tok_val=0)

        start_label_present = 1 in start_vec
        end_label_present = 1 in end_vec

        # This is triggered if the answer is not in the passage or the question is_impossible
        # In both cases, the token at idx=0 (in BERT, this is the [CLS] token) is given both the start and end label
        if start_label_present is False and end_label_present is False:
            start_vec[0] = 1
            end_vec[0] = 1
        elif start_label_present is False or end_label_present is False:
            raise Exception("The label vectors are lacking either a start or end label")

        # Ensure label vectors are one-hot
        assert sum(start_vec) == 1
        assert sum(end_vec) == 1

        start_idx = start_vec.index(1)
        end_idx = end_vec.index(1)

        label_idxs[i, 0] = start_idx
        label_idxs[i, 1] = end_idx

    assert np.max(label_idxs) > -1

    return label_idxs
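
# Illustration (not part of the original module): the (max_answers, 2) layout that generate_labels()
# fills. Each row holds (start_idx, end_idx) for one answer in the full question + passage sequence;
# unused rows stay at -1, and an out-of-passage or no-answer annotation is encoded as (0, 0).
def _example_label_layout():
    max_answers = 6
    label_idxs = np.full((max_answers, 2), fill_value=-1)
    label_idxs[0] = (15, 17)   # first annotation spans tokens 15..17 of the encoded sequence
    label_idxs[1] = (0, 0)     # second annotation falls outside this passage chunk
    return label_idxs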
def combine_vecs(question_vec, passage_vec, tokenizer, spec_tok_val=-1):
    """ Combine a question_vec and passage_vec in a style that is appropriate to the model. Will add slots in
    the returned vector for special tokens like [CLS] where the value is determined by spec_tok_val."""

    # Join question_label_vec and passage_label_vec and add slots for special tokens
    vec = tokenizer.build_inputs_with_special_tokens(token_ids_0=question_vec,
                                                     token_ids_1=passage_vec)
    spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=question_vec,
                                                       token_ids_1=passage_vec)

    # If a value in vec corresponds to a special token, it will be replaced with spec_tok_val
    combined = [v if not special_token else spec_tok_val for v, special_token in zip(vec, spec_toks_mask)]
    return combined
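
# Usage sketch (not part of the original module), assuming a BERT-style tokenizer from the
# transformers library: combine_vecs() delegates to build_inputs_with_special_tokens(), so slots
# for [CLS]/[SEP] get inserted and are then overwritten with spec_tok_val.
def _example_combine_vecs():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    question_vec = [0, 0, 0]      # e.g. start-label section for 3 question tokens
    passage_vec = [0, 1, 0, 0]    # answer starts at the 2nd passage token
    combined = combine_vecs(question_vec, passage_vec, tokenizer, spec_tok_val=-1)
    # BERT layout: [CLS] q q q [SEP] p p p p [SEP]
    # combined == [-1, 0, 0, 0, -1, 0, 1, 0, 0, -1]
    return combined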
def answer_in_passage(start_idx, end_idx, passage_len):
    if passage_len > start_idx > 0 and passage_len > end_idx > 0:
        return True
    return False
def sample_to_features_squadOLD(
    sample, tokenizer, max_seq_len, doc_stride, max_query_length, tasks,
):
    sample.clear_text = DotMap(sample.clear_text, _dynamic=False)
    is_training = sample.clear_text.is_training

    unique_id = 1000000000
    features = []

    query_tokens = tokenizer.tokenize(sample.clear_text.question_text)

    if len(query_tokens) > max_query_length:
        query_tokens = query_tokens[0:max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(sample.clear_text.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = None
    tok_end_position = None
    if is_training and sample.clear_text.is_impossible:
        tok_start_position = -1
        tok_end_position = -1
    if is_training and not sample.clear_text.is_impossible:
        tok_start_position = orig_to_tok_index[sample.clear_text.start_position]
        if sample.clear_text.end_position < len(sample.clear_text.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[sample.clear_text.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _SQUAD_improve_answer_span(
            all_doc_tokens,
            tok_start_position,
            tok_end_position,
            tokenizer,
            sample.clear_text.orig_answer_text,
        )

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = max_seq_len - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"]
    )
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        padding_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_len:
            input_ids.append(0)
            padding_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_len
        assert len(padding_mask) == max_seq_len
        assert len(segment_ids) == max_seq_len

        start_position = 0
        end_position = 0
        if is_training and not sample.clear_text.is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we keep it but set the start and end position to unanswerable
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            out_of_span = False
            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True
            if out_of_span:
                start_position = 0
                end_position = 0
            else:
                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        if is_training and sample.clear_text.is_impossible:
            start_position = 0
            end_position = 0

        inp_feat = {}
        inp_feat["input_ids"] = input_ids
        inp_feat["padding_mask"] = padding_mask  # attention_mask
        inp_feat["segment_ids"] = segment_ids  # token_type_ids
        inp_feat["start_position"] = start_position
        inp_feat["end_position"] = end_position
        inp_feat["is_impossible"] = sample.clear_text.is_impossible
        inp_feat["sample_id"] = sample.id
        inp_feat["passage_shift"] = doc_span.start
        features.append(inp_feat)
        unique_id += 1

    return features
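
# Illustration (not part of the original module): a hedged, standalone sketch of the sliding-window
# chunking used above. Long documents are split into overlapping spans of at most
# max_tokens_for_doc tokens, moving forward by doc_stride tokens each time.
def _example_doc_spans(n_doc_tokens=20, max_tokens_for_doc=8, doc_stride=4):
    doc_spans = []
    start_offset = 0
    while start_offset < n_doc_tokens:
        length = min(max_tokens_for_doc, n_doc_tokens - start_offset)
        doc_spans.append((start_offset, length))
        if start_offset + length == n_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    # -> [(0, 8), (4, 8), (8, 8), (12, 8)]
    return doc_spans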
def _SQUAD_improve_answer_span(
    doc_tokens, input_start, input_end, tokenizer, orig_answer_text
):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)
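
# Usage sketch (not part of the original module): the "(1895-1943)." example from the docstring,
# using a toy whitespace tokenizer stand-in (the real call sites pass a WordPiece tokenizer). The
# initial whitespace-based span covers "( 1895 - 1943 ) ." and is tightened to just "1895".
def _example_improve_answer_span():
    class _ToyTokenizer:
        def tokenize(self, text):
            return text.split()

    doc_tokens = ["The", "leader", "was", "John", "Smith", "(", "1895", "-", "1943", ")", "."]
    start, end = _SQUAD_improve_answer_span(
        doc_tokens, input_start=5, input_end=10, tokenizer=_ToyTokenizer(), orig_answer_text="1895"
    )
    # (start, end) == (6, 6)
    return start, end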