Source code for farm.data_handler.input_features

"""
Contains functions that turn human-readable clear text input into dictionaries of features
"""


import logging

from farm.data_handler.samples import Sample
from farm.data_handler.utils import (
    expand_labels,
    pad)
from farm.modeling.tokenization import insert_at_special_tokens_pos

import numpy as np

logger = logging.getLogger(__name__)


def sample_to_features_text(sample, tasks, max_seq_len, tokenizer):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by a text
    classification model.

    :param sample: Sample object that contains human readable text and label fields from a single text
                   classification data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of
                  the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask" and "segment_ids"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata(). \n"
                         f"Further processing is likely to be wrong.")
    else:
        # TODO It might be cleaner to adjust the data structure in sample.tokenized
        tokens_a = sample.tokenized["tokens"]
        tokens_b = sample.tokenized.get("tokens_b", None)

        inputs = tokenizer.encode_plus(
            tokens_a,
            tokens_b,
            add_special_tokens=True,
            truncation=False,  # truncation_strategy is deprecated
            return_token_type_ids=True,
            is_split_into_words=False,
        )

    input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len

    feat_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
    }

    # Add labels for the different tasks
    for task_name, task in tasks.items():
        try:
            label_name = task["label_name"]
            label_raw = sample.clear_text[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                try:
                    label_ids = [label_list.index(label_raw)]
                except ValueError:
                    raise ValueError(f"[Task: {task_name}] Observed label {label_raw} not in defined label_list")
            elif task["task_type"] == "multilabel_classification":
                # multi-hot format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            elif task["task_type"] == "regression":
                label_ids = [float(label_raw)]
            else:
                raise ValueError(task["task_type"])
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
        if label_ids is not None:
            feat_dict[task["label_tensor_name"]] = label_ids
    return [feat_dict]
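
# Usage sketch (illustrative only, not part of the original module): the field
# names follow FARM's conventions, but the concrete sample text, task dict and
# tokenizer choice below are assumptions made for demonstration.
#
#     from transformers import BertTokenizer
#     from farm.modeling.tokenization import tokenize_with_metadata
#
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # slow tokenizer path
#     sample = Sample(id="0-0",
#                     clear_text={"text": "the movie was great", "label": "positive"},
#                     tokenized=tokenize_with_metadata("the movie was great", tokenizer))
#     tasks = {"text_classification": {"label_name": "label",
#                                      "label_list": ["negative", "positive"],
#                                      "task_type": "classification",
#                                      "label_tensor_name": "label_ids"}}
#     features = sample_to_features_text(sample, tasks, max_seq_len=64, tokenizer=tokenizer)
#     # features -> [{"input_ids": [...], "padding_mask": [...], "segment_ids": [...], "label_ids": [1]}]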
#TODO remove once NQ processing is adjusted
def get_roberta_seq_2_start(input_ids):
    # This commit in huggingface transformers
    # (https://github.com/huggingface/transformers/commit/dfe012ad9d6b6f0c9d30bc508b9f1e4c42280c07)
    # means that RobertaTokenizer.encode_plus returns only zeros in token_type_ids. Therefore, we need
    # another way to infer the start of the second input sequence in RoBERTa. RoBERTa input sequences
    # have the following format: <s> P1 </s> </s> P2 </s>
    # <s> has index 0 and </s> has index 2. To find the beginning of the second sequence, this function
    # first finds the index of the second </s>.
    first_backslash_s = input_ids.index(2)
    second_backslash_s = input_ids.index(2, first_backslash_s + 1)
    return second_backslash_s + 1
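
# Example with made-up token ids: in a RoBERTa pair encoded as
# <s> P1 </s> </s> P2 </s>, e.g. [0, 9226, 16, 2, 2, 9226, 2],
# the second </s> (id 2) sits at position 4, so the second sequence starts at index 5:
#
#     get_roberta_seq_2_start([0, 9226, 16, 2, 2, 9226, 2])  # -> 5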
#TODO remove once NQ processing is adjusted
def get_camembert_seq_2_start(input_ids):
    # CamembertTokenizer.encode_plus returns only zeros in token_type_ids (same as RobertaTokenizer).
    # This is another way to find the start of the second sequence (following get_roberta_seq_2_start).
    # CamemBERT input sequences have the following format: <s> P1 </s> </s> P2 </s>
    # <s> has index 5 and </s> has index 6. To find the beginning of the second sequence, this function
    # first finds the index of the second </s>.
    first_backslash_s = input_ids.index(6)
    second_backslash_s = input_ids.index(6, first_backslash_s + 1)
    return second_backslash_s + 1
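
# Analogous example with CamemBERT's special-token ids (<s> = 5, </s> = 6);
# the surrounding ids are invented for illustration:
#
#     get_camembert_seq_2_start([5, 147, 30, 6, 6, 147, 6])  # -> 5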
# def _SQUAD_improve_answer_span(
#     doc_tokens, input_start, input_end, tokenizer, orig_answer_text
# ):
#     """Returns tokenized answer spans that better match the annotated answer."""
#
#     # The SQuAD annotations are character based. We first project them to
#     # whitespace-tokenized words. But then after WordPiece tokenization, we can
#     # often find a "better match". For example:
#     #
#     #   Question: What year was John Smith born?
#     #   Context: The leader was John Smith (1895-1943).
#     #   Answer: 1895
#     #
#     # The original whitespace-tokenized answer will be "(1895-1943).". However
#     # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
#     # the exact answer, 1895.
#     #
#     # However, this is not always possible. Consider the following:
#     #
#     #   Question: What country is the top exporter of electronics?
#     #   Context: The Japanese electronics industry is the largest in the world.
#     #   Answer: Japan
#     #
#     # In this case, the annotator chose "Japan" as a character sub-span of
#     # the word "Japanese". Since our WordPiece tokenizer does not split
#     # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
#     # in SQuAD, but does happen.
#     tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
#
#     for new_start in range(input_start, input_end + 1):
#         for new_end in range(input_end, new_start - 1, -1):
#             text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
#             if text_span == tok_answer_text:
#                 return (new_start, new_end)
#
#     return (input_start, input_end)
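
# A minimal sketch of the span-shrinking loop above, using a toy sub-token list
# instead of a real tokenizer (tokens and indices are invented for illustration):
#
#     doc_tokens = ["(", "1895", "-", "1943", ")", "."]  # sub-tokens of "(1895-1943)."
#     tok_answer_text = "1895"                           # " ".join(tokenizer.tokenize("1895"))
#     for new_start in range(len(doc_tokens)):
#         for new_end in range(len(doc_tokens) - 1, new_start - 1, -1):
#             if " ".join(doc_tokens[new_start:new_end + 1]) == tok_answer_text:
#                 print((new_start, new_end))  # -> (1, 1): narrowed to the exact answer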