Source code for farm.data_handler.processor

import abc
import inspect
import json
import logging
import os
import random
from abc import ABC
from inspect import signature
from pathlib import Path
from random import randint

import numpy as np
import torch
from numpy.random import random as random_float
from sklearn.preprocessing import StandardScaler
from transformers import AutoConfig
from tokenizers import Encoding

from farm.data_handler.dataset import convert_features_to_dataset
from farm.data_handler.input_features import get_roberta_seq_2_start, get_camembert_seq_2_start
from farm.data_handler.input_features import sample_to_features_text
from farm.data_handler.nq_utils import (
    sample_to_features_qa_Natural_Questions,
    create_samples_qa_Natural_Question,
    convert_qa_input_dict,
)

from farm.data_handler.samples import (
    Sample,
    SampleBasket,
    get_passage_offsets,
    offset_to_token_idx_vecorized
)

from farm.data_handler.utils import (
    pad,
    expand_labels,
    read_tsv,
    read_tsv_sentence_pair,
    read_docs_from_txt,
    read_ner_file,
    read_squad_file,
    read_jsonl,
    read_dpr_json,
    is_json,
    get_sentence_pair,
    split_with_metadata,
)
from farm.modeling.tokenization import (
    Tokenizer,
    tokenize_with_metadata,
    truncate_sequences,
    tokenize_batch_question_answering,
    _get_start_of_word
)
from farm.utils import MLFlowLogger as MlLogger
from farm.utils import try_get

from tokenizers.pre_tokenizers import WhitespaceSplit

ID_NAMES = ["example_id", "external_id", "doc_id", "id"]


logger = logging.getLogger(__name__)


class Processor(ABC):
    """
    Is used to generate PyTorch Datasets from input data. An implementation of this abstract class
    should be created for each new data source.
    Implement the abstract methods: file_to_dicts(), _dict_to_samples(), _sample_to_features()
    to be compatible with your data format.
    """

    subclasses = {}
[docs] def __init__( self, tokenizer, max_seq_len, train_filename, dev_filename, test_filename, dev_split, data_dir, tasks={}, proxies=None, multithreading_rust=True, ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: The name of the file containing test data. :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param data_dir: The directory in which the train, test and perhaps dev files can be found. :type data_dir: str :param tasks: Tasks for which the processor shall extract labels from the input data. Usually this includes a single, default task, e.g. text classification. In a multitask setting this includes multiple tasks, e.g. 2x text classification. The task name will be used to connect with the related PredictionHead. :type tasks: dict :param proxies: proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers. Note: Enabling multithreading in Rust AND multiprocessing in python might cause deadlocks. :type multithreading_rust: bool """ if not multithreading_rust: os.environ["RAYON_RS_NUM_CPUS"] = "1" self.tokenizer = tokenizer self.max_seq_len = max_seq_len self.tasks = tasks self.proxies = proxies # data sets self.train_filename = train_filename self.dev_filename = dev_filename self.test_filename = test_filename self.dev_split = dev_split if data_dir: self.data_dir = Path(data_dir) else: self.data_dir = None self.baskets = [] self._log_params() self.problematic_sample_ids = set()
    def __init_subclass__(cls, **kwargs):
        """ This automatically keeps track of all available subclasses.
        Enables generic load() and load_from_dir() for all specific Processor implementations.
        """
        super().__init_subclass__(**kwargs)
        cls.subclasses[cls.__name__] = cls
[docs] @classmethod def load( cls, processor_name, data_dir, tokenizer, max_seq_len, train_filename, dev_filename, test_filename, dev_split, **kwargs, ): """ Loads the class of processor specified by processor name. :param processor_name: The class of processor to be loaded. :type processor_name: str :param data_dir: Directory where data files are located. :type data_dir: str :param tokenizer: A tokenizer object :param max_seq_len: Sequences longer than this will be truncated. :type max_seq_len: int :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: The name of the file containing test data. :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param kwargs: placeholder for passing generic parameters :type kwargs: object :return: An instance of the specified processor. """ sig = signature(cls.subclasses[processor_name]) unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters} logger.debug( f"Got more parameters than needed for loading {processor_name}: {unused_args}. " f"Those won't be used!" ) processor = cls.subclasses[processor_name]( data_dir=data_dir, tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, **kwargs, ) return processor
    @classmethod
    def load_from_dir(cls, load_dir):
        """
        Infers the specific type of Processor from a config file (e.g. GNADProcessor) and loads an instance of it.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of a Processor Subclass (e.g. GNADProcessor)
        """
        # read config
        processor_config_file = Path(load_dir) / "processor_config.json"
        config = json.load(open(processor_config_file))
        config["inference"] = True
        # init tokenizer
        if "lower_case" in config.keys():
            logger.warning("Loading tokenizer from deprecated FARM config. "
                           "If you used `custom_vocab` or `never_split_chars`, this won't work anymore.")
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
        else:
            tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])

        # we have to delete the tokenizer string from config, because we pass it as Object
        del config["tokenizer"]

        processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)

        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name,
                               metric=task["metric"],
                               label_list=task["label_list"],
                               label_column_name=task["label_column_name"],
                               text_column_name=task.get("text_column_name", None),
                               task_type=task["task_type"])

        if processor is None:
            raise Exception
        return processor
[docs] @classmethod def convert_from_transformers(cls, tokenizer_name_or_path, task_type, max_seq_len, doc_stride, revision=None, tokenizer_class=None, tokenizer_args=None, use_fast=True): config = AutoConfig.from_pretrained(tokenizer_name_or_path, revision=revision) tokenizer_args = tokenizer_args or {} tokenizer = Tokenizer.load(tokenizer_name_or_path, tokenizer_class=tokenizer_class, use_fast=use_fast, revision=revision, **tokenizer_args, ) # TODO infer task_type automatically from config (if possible) if task_type == "question_answering": processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=max_seq_len, label_list=["start_token", "end_token"], metric="squad", data_dir="data", doc_stride=doc_stride ) elif task_type == "embeddings": processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len) elif task_type == "text_classification": label_list = list(config.id2label[id] for id in range(len(config.id2label))) processor = TextClassificationProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir="data", label_list=label_list, label_column_name="label", metric="acc", quote_char='"', ) elif task_type == "ner": label_list = list(config.id2label.values()) processor = NERProcessor( tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir="data", metric="seq_f1", label_list=label_list ) else: raise ValueError(f"`task_type` {task_type} is not supported yet. " f"Valid options for arg `task_type`: 'question_answering', " f"'embeddings', 'text_classification', 'ner'") return processor
    def save(self, save_dir):
        """
        Saves the vocabulary to file and also creates a json file containing all the
        information needed to load the same processor.

        :param save_dir: Directory where the files are to be saved
        :type save_dir: str
        """
        os.makedirs(save_dir, exist_ok=True)
        config = self.generate_config()
        # save tokenizer incl. attributes
        config["tokenizer"] = self.tokenizer.__class__.__name__

        # Because the fast tokenizers expect a str and not Path
        # always convert Path to str here.
        self.tokenizer.save_pretrained(str(save_dir))

        # save processor
        config["processor"] = self.__class__.__name__
        output_config_file = Path(save_dir) / "processor_config.json"
        with open(output_config_file, "w") as file:
            json.dump(config, file)
    def generate_config(self):
        """
        Generates a config file from class and instance attributes (only for sensible config parameters).
        """
        config = {}
        # self.__dict__ doesn't give parent class attributes
        for key, value in inspect.getmembers(self):
            if is_json(value) and key[0] != "_":
                if issubclass(type(value), Path):
                    value = str(value)
                config[key] = value
        return config
    def add_task(self, name, metric, label_list, label_column_name=None,
                 label_name=None, task_type=None, text_column_name=None):
        if type(label_list) is not list:
            raise ValueError(f"Argument `label_list` must be of type list. Got: {type(label_list)}")

        if label_name is None:
            label_name = f"{name}_label"
        label_tensor_name = label_name + "_ids"
        self.tasks[name] = {
            "label_list": label_list,
            "metric": metric,
            "label_tensor_name": label_tensor_name,
            "label_name": label_name,
            "label_column_name": label_column_name,
            "text_column_name": text_column_name,
            "task_type": task_type
        }
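    # Example (illustrative sketch, not part of the original module): registering a second
    # task so that one processor extracts labels for multitask training. The column names
    # and label lists below are assumptions about the input file.
    #
    #   processor.add_task(name="sentiment", metric="acc",
    #                      label_list=["negative", "neutral", "positive"],
    #                      label_column_name="sentiment", task_type="classification")
    #   processor.add_task(name="topic", metric="f1_macro",
    #                      label_list=["politics", "sports", "tech"],
    #                      label_column_name="topic", task_type="classification")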
    @abc.abstractmethod
    def file_to_dicts(self, file: str) -> [dict]:
        raise NotImplementedError()
    def _dict_to_samples(cls, dictionary: dict, all_dicts=None) -> [Sample]:
        raise NotImplementedError()

    def _sample_to_features(cls, sample: Sample) -> dict:
        raise NotImplementedError()

    def _dict_to_samples_and_features(self, dictionary: dict, all_dicts=None) -> [Sample]:
        raise NotImplementedError()

    def _init_samples_in_baskets(self):
        all_dicts = [b.raw for b in self.baskets]
        for basket in self.baskets:
            try:
                basket.samples = self._dict_to_samples(dictionary=basket.raw, all_dicts=all_dicts)
                for num, sample in enumerate(basket.samples):
                    sample.id = f"{basket.id_internal}-{num}"
            except Exception as e:
                logger.error(f"Could not create sample(s) from this dict: \n {basket.raw}")
                logger.error(f"Error message: {e}")

    def _featurize_samples(self):
        curr_problematic_sample_ids = []
        for basket in self.baskets:
            for sample in basket.samples:
                try:
                    sample.features = self._sample_to_features(sample=sample)
                except Exception as e:
                    curr_problematic_sample_ids.append(sample.id)
        if curr_problematic_sample_ids:
            self.problematic_sample_ids.update(curr_problematic_sample_ids)
    @staticmethod
    def log_problematic(problematic_sample_ids):
        if problematic_sample_ids:
            n_problematic = len(problematic_sample_ids)
            problematic_id_str = ", ".join(problematic_sample_ids)
            logger.error(f"Unable to convert {n_problematic} samples to features. Their ids are: {problematic_id_str}")
    def _init_and_featurize_samples_in_baskets(self):
        for basket in self.baskets:
            all_dicts = [b.raw for b in self.baskets]
            try:
                basket.samples = self._dict_to_samples_and_features(dictionary=basket.raw,
                                                                    all_dicts=all_dicts,
                                                                    basket_id_internal=basket.id_internal)
                for num, sample in enumerate(basket.samples):
                    sample.id = f"{basket.id_internal}-{num}"
            except Exception as e:
                logger.error(f"Could not create sample(s) from this dict: \n {basket.raw}")
                logger.error(f"Error message: {e}")

    @staticmethod
    def _check_sample_features(basket):
        """Check if all samples in the basket have computed their features.

        Args:
            basket: the basket containing the samples

        Returns:
            True if all samples in the basket have computed their features, False otherwise
        """
        if len(basket.samples) == 0:
            return False
        for sample in basket.samples:
            if sample.features is None:
                return False
        return True

    def _create_dataset(self):
        features_flat = []
        basket_to_remove = []
        for basket in self.baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        if len(basket_to_remove) > 0:
            # if basket_to_remove is not empty, remove the related baskets
            for basket in basket_to_remove:
                self.baskets.remove(basket)
        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names
[docs] def dataset_from_dicts(self, dicts, indices=None, return_baskets = False): """ Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a list of tensor names. This can be used for inference mode. :param dicts: List of dictionaries where each contains the data of one input sample. :type dicts: list of dicts :return: a Pytorch dataset and a list of tensor names. """ # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID self.baskets = [] for id_internal, d in enumerate(dicts): id_external = self._id_from_dict(d) if indices: id_internal = indices[id_internal] self.baskets.append(SampleBasket(raw=d, id_external=id_external, id_internal=id_internal)) self._init_samples_in_baskets() self._featurize_samples() if indices: if 0 in indices: self._log_samples(1) else: self._log_samples(1) dataset, tensor_names = self._create_dataset() # This mode is for inference where we need to keep baskets if return_baskets: #TODO simplify return dataset, tensor_names, self.problematic_sample_ids, self.baskets # This mode is for training where we can free ram by removing baskets else: return dataset, tensor_names, self.problematic_sample_ids
    def _log_samples(self, n_samples):
        logger.info("*** Show {} random examples ***".format(n_samples))
        for i in range(n_samples):
            random_basket = random.choice(self.baskets)
            random_sample = random.choice(random_basket.samples)
            logger.info(random_sample)

    def _log_params(self):
        params = {
            "processor": self.__class__.__name__,
            "tokenizer": self.tokenizer.__class__.__name__,
        }
        names = ["max_seq_len", "dev_split"]
        for name in names:
            value = getattr(self, name)
            params.update({name: str(value)})
        MlLogger.log_params(params)

    @staticmethod
    def _id_from_dict(d):
        ext_id = try_get(ID_NAMES, d)
        if not ext_id and "qas" in d:
            ext_id = try_get(ID_NAMES, d["qas"][0])
        return ext_id
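# ------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal Processor subclass that
# implements the three abstract methods named in the class docstring above. The file format
# ("label<TAB>text" per line) and the class name are assumptions for demonstration only;
# it is instantiated with the same constructor arguments as any other Processor.
# ------------------------------------------------------------------------------------------
class _ExampleTabSeparatedProcessor(Processor):
    def file_to_dicts(self, file: str) -> [dict]:
        # one dict per input sample
        dicts = []
        with open(file) as f:
            for line in f:
                label, text = line.rstrip("\n").split("\t", 1)
                dicts.append({"text": text, "label": label})
        return dicts

    def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]:
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len (mirrors InferenceProcessor)
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                           tokenizer=self.tokenizer,
                                                           max_seq_len=self.max_seq_len)
        return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

    def _sample_to_features(self, sample: Sample) -> dict:
        # sample_to_features_text returns a list with one feature dict per sample,
        # which is what _featurize_samples() / _create_dataset() expect
        return sample_to_features_text(sample=sample, tasks=self.tasks,
                                       max_seq_len=self.max_seq_len, tokenizer=self.tokenizer)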
class TextClassificationProcessor(Processor):
    """
    Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)
    """
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, label_list=None, metric=None, train_filename="train.tsv", dev_filename=None, test_filename="test.tsv", dev_split=0.1, delimiter="\t", quote_char="'", skiprows=None, label_column_name="label", multilabel=False, header=0, proxies=None, max_samples=None, text_column_name="text", **kwargs ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"] :type label_list: list :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro". Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value. For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn]. :type metric: str, function, or list :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param delimiter: Separator used in the input tsv / csv file :type delimiter: str :param quote_char: Character used for quoting strings in the input tsv/ csv file :type quote_char: str :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers) :type skiprows: int :param label_column_name: name of the column in the input csv/tsv that shall be used as training labels :type label_column_name: str :param multilabel: set to True for multilabel classification :type multilabel: bool :param header: which line to use as a header in the input csv/tsv :type header: int :param proxies: proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param text_column_name: name of the column in the input csv/tsv that shall be used as training text :type text_column_name: str :param kwargs: placeholder for passing generic parameters :type kwargs: object """ #TODO If an arg is misspelt, e.g. 
metrics, it will be swallowed silently by kwargs # Custom processor attributes self.delimiter = delimiter self.quote_char = quote_char self.skiprows = skiprows self.header = header self.max_samples = max_samples super(TextClassificationProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies, ) if metric and label_list: if multilabel: task_type = "multilabel_classification" else: task_type = "classification" self.add_task(name="text_classification", metric=metric, label_list=label_list, label_column_name=label_column_name, text_column_name=text_column_name, task_type=task_type) else: logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for " "using the default task or add a custom task later via processor.add_task()")
    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]
            column_mapping[task["text_column_name"]] = "text"
        dicts = read_tsv(
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            quotechar=self.quote_char,
            rename_columns=column_mapping,
            header=self.header,
            proxies=self.proxies,
            max_samples=self.max_samples
        )
        return dicts
[docs] def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False): self.baskets = [] # Tokenize in batches texts = [x["text"] for x in dicts] tokenized_batch = self.tokenizer.batch_encode_plus( texts, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, return_attention_mask=True, truncation=True, max_length=self.max_seq_len, padding="max_length" ) input_ids_batch = tokenized_batch["input_ids"] segment_ids_batch = tokenized_batch["token_type_ids"] padding_masks_batch = tokenized_batch["attention_mask"] tokens_batch = [x.tokens for x in tokenized_batch.encodings] # From here we operate on a per sample basis for dictionary, input_ids, segment_ids, padding_mask, tokens in zip( dicts, input_ids_batch, segment_ids_batch, padding_masks_batch, tokens_batch ): tokenized = {} if debug: tokenized["tokens"] = tokens feat_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids} # Create labels # i.e. not inference if not return_baskets: label_dict = self.convert_labels(dictionary) feat_dict.update(label_dict) # Add Basket to self.baskets curr_sample = Sample(id=None, clear_text=dictionary, tokenized=tokenized, features=[feat_dict]) curr_basket = SampleBasket(id_internal=None, raw=dictionary, id_external=None, samples=[curr_sample]) self.baskets.append(curr_basket) if indices and 0 not in indices: pass else: self._log_samples(1) # TODO populate problematic ids problematic_ids = set() logger.warning("Currently no support in Processor for returning problematic ids") dataset, tensornames = self._create_dataset() if return_baskets: return dataset, tensornames, problematic_ids, self.baskets else: return dataset, tensornames, problematic_ids
    def convert_labels(self, dictionary):
        ret = {}
        # Add labels for different tasks
        for task_name, task in self.tasks.items():
            label_name = task["label_name"]
            label_raw = dictionary[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                label_ids = [label_list.index(label_raw)]
            elif task["task_type"] == "multilabel_classification":
                # multi-hot-format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            ret[task["label_tensor_name"]] = label_ids
        return ret
    def _create_dataset(self):
        # TODO this is the proposed new version to replace the mother function
        features_flat = []
        basket_to_remove = []
        for basket in self.baskets:
            if self._check_sample_features(basket):
                for sample in basket.samples:
                    features_flat.extend(sample.features)
            else:
                # remove the entire basket
                basket_to_remove.append(basket)
        dataset, tensor_names = convert_features_to_dataset(features=features_flat)
        return dataset, tensor_names
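# Illustrative usage sketch (not part of the original module). The model name, data_dir and
# label list are assumptions; any tokenizer and TSV layout matching the constructor arguments
# would work the same way.
def _example_text_classification_usage():
    tokenizer = Tokenizer.load("bert-base-uncased")
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="data/my_corpus",            # assumed folder containing train.tsv / test.tsv
        label_list=["negative", "positive"],
        metric="acc",
        label_column_name="label",
        text_column_name="text",
    )
    # file -> dicts -> PyTorch dataset
    dicts = processor.file_to_dicts(processor.data_dir / processor.train_filename)
    dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(dicts)

    # the processor (incl. tokenizer files) can be saved and later restored via load_from_dir()
    processor.save("saved_processor")
    restored = Processor.load_from_dir("saved_processor")
    return dataset, tensor_names, restored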
class RegressionProcessor(TextClassificationProcessor):
    """
    Processor to handle a regression dataset in tab separated text + label format.
    It uses the text conversion functionality of a TextClassificationProcessor
    but adds special label conversion (scaled float value as label).
    """
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, train_filename="train.tsv", dev_filename=None, test_filename="test.tsv", dev_split=0.1, delimiter="\t", quote_char="'", skiprows=None, label_column_name="label", label_name="regression_label", scaler_mean=None, scaler_scale=None, proxies=None, text_column_name="text", **kwargs ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"] :type label_list: list :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro". Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value. For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn]. :type metric: str, function, or list :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param delimiter: Separator used in the input tsv / csv file :type delimiter: str :param quote_char: Character used for quoting strings in the input tsv/ csv file :type quote_char: str :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers) :type skiprows: int :param label_column_name: name of the column in the input csv/tsv that shall be used as training labels :type label_column_name: str :param label_name: name for the internal label variable in FARM (only needed to adjust in rare cases) :type label_name: str :param scaler_mean: Value to substract from the label for normalization :type scaler_mean: float :param scaler_scale: Value to divide the label by for normalization :type scaler_scale: float :param proxies: proxy configuration to allow downloads of remote datasets. 
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param text_column_name: name of the column in the input csv/tsv that shall be used as training text :type text_column_name: str :param kwargs: placeholder for passing generic parameters :type kwargs: object """ # Custom processor attributes self.delimiter = delimiter self.quote_char = quote_char self.skiprows = skiprows super(RegressionProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, proxies=proxies ) # Note that label_list is being hijacked to store the scaling mean and scale self.add_task(name="regression", metric="mse", label_list=[scaler_mean, scaler_scale], label_column_name=label_column_name, task_type="regression", label_name=label_name, text_column_name=text_column_name)
    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]
            column_mapping[task["text_column_name"]] = "text"
        dicts = read_tsv(
            rename_columns=column_mapping,
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            quotechar=self.quote_char,
            proxies=self.proxies
        )

        # collect all labels and compute scaling stats
        train_labels = []
        if self.train_filename in str(file):
            for d in dicts:
                train_labels.append(float(d[self.tasks["regression"]["label_name"]]))
            scaler = StandardScaler()
            scaler.fit(np.reshape(train_labels, (-1, 1)))
            # add to label list in regression task
            self.tasks["regression"]["label_list"] = [scaler.mean_.item(), scaler.scale_.item()]

        return dicts
    def convert_labels(self, dictionary: dict):
        # For regression the label should be scaled
        ret = {}
        for task_name, task in self.tasks.items():
            label_name = task["label_name"]
            label_raw = dictionary[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "regression":
                label = float(label_raw)
                scaled_label = (label - label_list[0]) / label_list[1]
                ret[task["label_tensor_name"]] = [scaled_label]
        return ret
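# Worked example (sketch, not part of the original module) of the label scaling above:
# `label_list` is hijacked to hold [scaler_mean, scaler_scale] fitted on the train file, and
# convert_labels() turns a raw label into (label - mean) / scale. The numbers are made up.
def _example_regression_label_scaling():
    scaler_mean, scaler_scale = 3.0, 2.0
    raw_label = 5.0
    scaled_label = (raw_label - scaler_mean) / scaler_scale       # -> 1.0
    # predictions are expected to be mapped back with the inverse transform
    restored = scaled_label * scaler_scale + scaler_mean          # -> 5.0
    return scaled_label, restored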
#########################################
# Processor for Basic Inference      ####
#########################################
class InferenceProcessor(TextClassificationProcessor):
    """
    Generic processor used at inference time:
    - fast
    - no labels
    - pure encoding of text into pytorch dataset
    - Doesn't read from file, but only consumes dictionaries (e.g. coming from API requests)
    """
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        **kwargs,
    ):
        super(InferenceProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=None,
            dev_filename=None,
            test_filename=None,
            dev_split=None,
            data_dir=None,
            tasks={},
        )
    @classmethod
    def load_from_dir(cls, load_dir):
        """
        Overwriting method from parent class to **always** load the InferenceProcessor
        instead of the specific class stored in the config.

        :param load_dir: str, directory that contains a 'processor_config.json'
        :return: An instance of an InferenceProcessor
        """
        # read config
        processor_config_file = Path(load_dir) / "processor_config.json"
        config = json.load(open(processor_config_file))
        # init tokenizer
        tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
        # we have to delete the tokenizer string from config, because we pass it as Object
        del config["tokenizer"]

        processor = cls.load(tokenizer=tokenizer, processor_name="InferenceProcessor", **config)
        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])

        if processor is None:
            raise Exception
        return processor
    def file_to_dicts(self, file: str) -> [dict]:
        raise NotImplementedError
    def convert_labels(self, dictionary: dict):
        # For inference we do not need labels
        ret = {}
        return ret
[docs] def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False): """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. For slow tokenizers, s3e or wordembedding tokenizers the function works on _dict_to_samples and _sample_to_features """ # TODO remove this sections once tokenizers work the same way for slow/fast and our special tokenizers if not self.tokenizer.is_fast: self.baskets = [] for d in dicts: sample = self._dict_to_samples(dictionary=d) features = self._sample_to_features(sample) sample.features = features basket = SampleBasket(id_internal=None, raw=d, id_external=None, samples=[sample]) self.baskets.append(basket) if indices and 0 not in indices: pass else: self._log_samples(1) problematic_ids = set() logger.warning("Currently no support in InferenceProcessor for returning problematic ids") dataset, tensornames = self._create_dataset() ret = [dataset, tensornames, problematic_ids] if return_baskets: ret.append(self.baskets) return ret else: return super().dataset_from_dicts(dicts=dicts, indices=indices, return_baskets=return_baskets, debug=debug)
    # Private method to keep s3e pooling and embedding extraction working
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        # this tokenization also stores offsets
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                           tokenizer=self.tokenizer,
                                                           max_seq_len=self.max_seq_len)
        return Sample(id=None, clear_text=dictionary, tokenized=tokenized)

    # Private method to keep s3e pooling and embedding extraction working
    def _sample_to_features(self, sample) -> dict:
        features = sample_to_features_text(
            sample=sample,
            tasks=self.tasks,
            max_seq_len=self.max_seq_len,
            tokenizer=self.tokenizer,
        )
        return features
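# Illustrative usage sketch (not part of the original module): the InferenceProcessor consumes
# dictionaries directly (no files, no labels), e.g. for embedding extraction. The model name
# is an assumption.
def _example_inference_processor_usage():
    tokenizer = Tokenizer.load("bert-base-uncased")
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)
    dicts = [{"text": "A sentence we only want to encode."},
             {"text": "Another one, e.g. coming from an API request."}]
    dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(dicts)
    return dataset, tensor_names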
# TODO remove inheritance from superclass to be able to convert text + text_b
class TextPairClassificationProcessor(TextClassificationProcessor):
    """
    Used to handle text pair classification datasets (e.g. Answer Selection or Natural Inference) that come in
    tsv format. The columns should be called text, text_b and label.
    """

    def __init__(self, **kwargs):
        super(TextPairClassificationProcessor, self).__init__(**kwargs)
    def file_to_dicts(self, file: str) -> [dict]:
        column_mapping = {task["label_column_name"]: task["label_name"] for task in self.tasks.values()}
        dicts = read_tsv_sentence_pair(
            rename_columns=column_mapping,
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            proxies=self.proxies,
        )
        return dicts
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: tokenized_a = tokenize_with_metadata(dictionary["text"], self.tokenizer) tokenized_b = tokenize_with_metadata(dictionary["text_b"], self.tokenizer) if len(tokenized_a["tokens"]) == 0: text = dictionary["text"] logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}") return [] if len(tokenized_b["tokens"]) == 0: text_b = dictionary["text_b"] logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text_b}") return [] tokenized = {"tokens": tokenized_a["tokens"], "tokens_b": tokenized_b["tokens"]} tokenized["tokens"], tokenized["tokens_b"], _ = truncate_sequences(seq_a=tokenized["tokens"], seq_b=tokenized["tokens_b"], tokenizer=self.tokenizer, max_seq_len=self.max_seq_len) return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)] def _dict_to_samples_and_features(self, dictionary: dict, **kwargs) -> [Sample]: """This method is used so that we need to tokenize only once when using a fast tokenizer.""" seq_a = dictionary["text"] seq_b = dictionary["text_b"] inputs = self.tokenizer.encode_plus( text=seq_a, text_pair=seq_b, max_length=self.max_seq_len, truncation=True, add_special_tokens=True, return_offsets_mapping=False, return_token_type_ids=True, return_special_tokens_mask=True, ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] # Find position of [SEP]-token # seq_2_start_t is the index of the first token in the second text sequence (e.g. passage) if "RobertaTokenizer" in self.tokenizer.__class__.__name__: seq_2_start_t = get_roberta_seq_2_start(input_ids) elif "CamembertTokenizer" in self.tokenizer.__class__.__name__: seq_2_start_t = get_camembert_seq_2_start(input_ids) else: seq_2_start_t = segment_ids.index(1) # Get tokens as text with metadata tokens_a = [] tokens_b = [] for idx, (token_id, is_special_token) in enumerate(zip(input_ids, inputs["special_tokens_mask"])): if not is_special_token: if idx < seq_2_start_t: tokens_a.append(self.tokenizer.convert_ids_to_tokens(token_id)) else: tokens_b.append(self.tokenizer.convert_ids_to_tokens(token_id)) token_dict = {"tokens": tokens_a, "tokens_b": tokens_b} if len(token_dict["tokens"]) == 0: logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {seq_a}") return [] if len(token_dict["tokens_b"]) == 0: logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {seq_b}") return [] # Build feature dict # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. padding_mask = [1] * len(input_ids) # Padding up to the sequence length. 
# Normal case: adding multiple 0 to the right # Special cases: # a) xlnet pads on the left and uses "4" for padding token_type_ids if "XLNetTokenizer" in self.tokenizer.__class__.__name__: pad_on_left = True segment_ids = pad(segment_ids, self.max_seq_len, 4, pad_on_left=pad_on_left) else: pad_on_left = False segment_ids = pad(segment_ids, self.max_seq_len, 0, pad_on_left=pad_on_left) input_ids = pad(input_ids, self.max_seq_len, self.tokenizer.pad_token_id, pad_on_left=pad_on_left) padding_mask = pad(padding_mask, self.max_seq_len, 0, pad_on_left=pad_on_left) assert len(input_ids) == self.max_seq_len assert len(padding_mask) == self.max_seq_len assert len(segment_ids) == self.max_seq_len feat_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids} # Add labels for different tasks for task_name, task in self.tasks.items(): try: label_name = task["label_name"] label_raw = dictionary[label_name] label_list = task["label_list"] if task["task_type"] == "classification": # id of label try: label_ids = [label_list.index(label_raw)] except ValueError as e: raise ValueError(f'[Task: {task_name}] Observed label {label_raw} not in defined label_list') elif task["task_type"] == "multilabel_classification": # multi-hot-format label_ids = [0] * len(label_list) for l in label_raw.split(","): if l != "": label_ids[label_list.index(l)] = 1 elif task["task_type"] == "regression": label_ids = [float(label_raw)] else: raise ValueError(task["task_type"]) except KeyError: # For inference mode we don't expect labels label_ids = None if label_ids is not None: feat_dict[task["label_tensor_name"]] = label_ids return [Sample(id=None, clear_text=dictionary, tokenized=token_dict, features=[feat_dict])]
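# Illustrative sketch (not part of the original module): input format expected by
# TextPairClassificationProcessor. Rows must provide "text", "text_b" and a label column;
# the model name, data_dir and label list are assumptions.
def _example_text_pair_processor():
    tokenizer = Tokenizer.load("bert-base-uncased")
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="data/answer_selection",     # assumed folder with train.tsv / test.tsv
        label_list=["0", "1"],
        metric="acc",
        label_column_name="label",
    )
    # after file_to_dicts() the label column is renamed to the task's label_name
    example_dict = {"text": "How is the weather today?",
                    "text_b": "It is sunny and warm.",
                    "text_classification_label": "1"}
    samples = processor._dict_to_samples_and_features(dictionary=example_dict)
    return samples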
#########################################
# Processors for NER data            ####
#########################################
class NERProcessor(Processor):
    """
    Used to handle most NER datasets, like CoNLL or GermEval 2014
    """
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, label_list=None, metric=None, train_filename="train.txt", dev_filename="dev.txt", test_filename="test.txt", dev_split=0.0, delimiter="\t", proxies=None, **kwargs ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param label_list: list of labels to predict (strings). :type label_list: list :param metric: name of metric that shall be used for evaluation, e.g. "seq_f1". Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value. For using multiple metrics supply them as a list, e.g ["seq_f1", my_custom_metric_fn]. :type metric: str, function, or list :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param delimiter: Separator used in the input tsv / csv file. German version of Conll03 uses a whitespace. GermEval 2014 is tab separated \t :type delimiter: str :param proxies: proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param kwargs: placeholder for passing generic parameters :type kwargs: object """ # Custom processor attributes self.delimiter = delimiter self.pre_tokenizer = WhitespaceSplit() super(NERProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies ) if metric and label_list: self.add_task("ner", metric, label_list) else: logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for " "using the default task or add a custom task later via processor.add_task()")
    def file_to_dicts(self, file: str) -> [dict]:
        dicts = read_ner_file(filename=file, sep=self.delimiter, proxies=self.proxies)
        return dicts
[docs] def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"): self.baskets = [] # Perform batch tokenization texts = [x["text"] for x in dicts] words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts] words = [[x[0] for x in y] for y in words_and_spans] # word_spans_batch is the char span for each whitespace split word word_spans_batch = [[x[1] for x in y] for y in words_and_spans] tokenized_batch = self.tokenizer.batch_encode_plus( words, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, return_attention_mask=True, truncation=True, max_length=self.max_seq_len, padding="max_length", is_split_into_words=True ) # Create features by iterating over samples for i in range(len(dicts)): tokenized = tokenized_batch[i] d = dicts[i] # Either try to extract an ID from the dictionary, or else create an id # based on the order of the dictionaries coming in, taking into account # the indices generated by chunking and multiprocessing id_external = self._id_from_dict(d) if indices: id_internal = indices[i] else: id_internal = i input_ids = tokenized.ids segment_ids = tokenized.type_ids # We construct a mask to identify the first token of a word. We will later only use them for predicting entities. # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens # For BERT we add a 0 in the start and end (for CLS and SEP) initial_mask = self._get_start_of_word(tokenized.words) assert len(initial_mask) == len(input_ids) # This mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. padding_mask = tokenized.attention_mask # i.e. if inference, we need to populate the tokenized_dict so that formatted preds can align # the prediction to the original text if return_baskets: token_to_word_map = tokenized.words word_spans = word_spans_batch[i] tokenized_dict = { "tokens": tokenized.tokens, "word_spans": word_spans, "token_to_word_map": token_to_word_map, "start_of_word": initial_mask } else: tokenized_dict = {} feature_dict = { "input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids, "initial_mask": initial_mask, } for task_name, task in self.tasks.items(): try: label_list = task["label_list"] label_name = task["label_name"] label_tensor_name = task["label_tensor_name"] labels_word = d[label_name] labels_token = expand_labels(labels_word, initial_mask, non_initial_token) label_ids = [label_list.index(lt) for lt in labels_token] except ValueError: # Usually triggered if label is not in label list label_ids = None problematic_labels = set(labels_token).difference(set(label_list)) logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!" f"\nWe found a problem with labels {str(problematic_labels)}") # TODO change this when inference flag is implemented except KeyError: # Usually triggered if there is no label in the sample # This is expected during inference since there are no labels # During training, this is a problem label_ids = None logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!" "\nIf your are running in *inference* mode: Don't worry!" 
"\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.") if label_ids: feature_dict[label_tensor_name] = label_ids curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict]) curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample]) self.baskets.append(curr_basket) # Don't log if we are processing a dataset chunk other than the first chunk if indices and 0 not in indices: pass else: self._log_samples(1) dataset, tensor_names = self._create_dataset() ret = [dataset, tensor_names, self.problematic_sample_ids] # This is for inference where we need to keep baskets # By contrast, in training, we can remove baskets to free up RAM if return_baskets: ret.append(self.baskets) return tuple(ret)
    @staticmethod
    def _get_start_of_word(word_ids):
        # word_ids of special tokens are None; map them to -1 so they never count as word starts
        words = np.array(word_ids)
        words[words == None] = -1
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single
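# Worked example (sketch, not part of the original module) of the initial_mask logic above:
# the fast tokenizer's word_ids contain None at special tokens, only the first sub-token of
# each word is marked as a start, and per-word labels are expanded with non_initial_token "X".
def _example_initial_mask():
    # e.g. "[CLS] Angela Mer ##kel spoke [SEP]"
    word_ids = [None, 0, 1, 1, 2, None]
    initial_mask = NERProcessor._get_start_of_word(word_ids)        # -> [0, 1, 1, 0, 1, 0]
    labels_word = ["B-PER", "I-PER", "O"]
    labels_token = expand_labels(labels_word, initial_mask, "X")    # -> ["X", "B-PER", "I-PER", "X", "O", "X"]
    return initial_mask, labels_token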
#####################
# LM Processors   ####
#####################
class BertStyleLMProcessor(Processor):
    """
    Prepares data for masked language model training and next sentence prediction in the style of BERT
    """
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, train_filename="train.txt", dev_filename="dev.txt", test_filename="test.txt", dev_split=0.0, next_sent_pred=True, next_sent_pred_style="bert-style", max_docs=None, proxies=None, masked_lm_prob=0.15, **kwargs ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"] :type label_list: list :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro". Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value. For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn]. :type metric: str, function, or list :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param next_sent_pred: Whether to use next_sentence_prediction objective or not :type next_sent_pred: bool :param next_sent_pred_style: Two different styles for next sentence prediction available: - "sentence": Use of a single sentence for Sequence A and a single sentence for Sequence B - "bert-style": Fill up all of max_seq_len tokens and split into Sequence A and B at sentence border. If there are too many tokens, Sequence B will be truncated. :type next_sent_pred_style: str :param max_docs: maximum number of documents to include from input dataset :type max_docs: int :param proxies: proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param masked_lm_prob: probability of masking a token :type masked_lm_prob: float :param kwargs: placeholder for passing generic parameters :type kwargs: object """ self.delimiter = "" self.max_docs = max_docs if not tokenizer.is_fast: raise ValueError("This processor only supports FastTokenizers. " "Load one by calling Tokenizer.load(..., use_fast=True)") super(BertStyleLMProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies ) self.next_sent_pred = next_sent_pred self.next_sent_pred_style = next_sent_pred_style added_tokens = self.get_added_tokens() self.add_task("lm", "acc", list(self.tokenizer.vocab) + added_tokens) if self.next_sent_pred: self.add_task("nextsentence", "acc", ["False", "True"]) self.masked_lm_prob = masked_lm_prob
    def get_added_tokens(self):
        dictionary = self.tokenizer.get_added_vocab()
        sorted_tuples = sorted(dictionary.items(), key=lambda x: x[0])
        return [x[1] for x in sorted_tuples]
    def file_to_dicts(self, file: str) -> list:
        dicts = read_docs_from_txt(filename=file, delimiter=self.delimiter, max_docs=self.max_docs, proxies=self.proxies)
        return dicts
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False):
        dicts = [d["doc"] for d in dicts]

        # 1) Create samples & truncate (sentence pairs)
        # next sentence prediction ...
        if self.next_sent_pred:
            assert len(dicts) > 1, "Need at least 2 documents to sample random sentences from"
            # ...with single sentences
            if self.next_sent_pred_style == "sentence":
                samples = self._create_sequence_pairs_by_line(dicts)
            # ...bert style
            elif self.next_sent_pred_style == "bert-style":
                samples = self._create_sequence_pairs_bert_style(dicts)
            else:
                raise NotImplementedError("next_sent_pred_style has to be 'sentence' or 'bert-style'")
        # no next sentence prediction
        else:
            samples = self._create_sequence_pairs_no_next_sent(dicts)

        # 2) Create labels (masking words + NSP)
        features = []
        vocab_length = len(self.tokenizer.vocab) - 1
        for sample in samples:
            features.append(self._create_labels(sample=sample, vocab_length=vocab_length))

        # 3) Create dataset
        dataset, tensor_names = convert_features_to_dataset(features=features)
        return dataset, tensor_names, set()
def _create_sequence_pairs_by_line(self, docs): samples = [] raw_pairs = [] labels = [] for doc in docs: # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible) for idx in range(len(doc) - 1): text_a, text_b, is_next_label = get_sentence_pair(doc, docs, idx) raw_pairs.append((text_a, text_b)) labels.append(is_next_label) # Tokenize + Encode masks encoded_pairs = self.tokenizer.batch_encode_plus(raw_pairs, max_length=self.max_seq_len, truncation=True, truncation_strategy="longest_first", add_special_tokens=True, padding='max_length' ) assert len(encoded_pairs.input_ids) == len(raw_pairs) # Create "Start of word mask" start_of_word = [] for e in encoded_pairs.encodings: start_of_word.append(_get_start_of_word(e.words, e.special_tokens_mask)) # Create Sample objects for idx in range(len(raw_pairs)): if len(encoded_pairs.input_ids[idx]) == 0: logger.warning( f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {raw_pairs[idx]}") continue # We don't populate 'tokenized' here as we skiped the intermediate string token stage abeoce to improve the speed ... samples.append(Sample(id=None, clear_text={"text_a": raw_pairs[idx][0], "text_b": raw_pairs[idx][1], "nextsentence_label": labels[idx]}, tokenized={"tokens": encoded_pairs.encodings[idx].tokens, "start_of_word": start_of_word[idx], "special_tokens_mask": encoded_pairs.encodings[idx].special_tokens_mask, "offsets": encoded_pairs.encodings[idx].offsets}, features={"input_ids": encoded_pairs.input_ids[idx], "segment_ids": encoded_pairs.token_type_ids[idx], "padding_mask": encoded_pairs.attention_mask[idx], } )) return samples def _create_sequence_pairs_bert_style(self, docs): samples = [] # 1) Tokenize + Encode all docs # TODO optimize for single batch call encoded_docs = [] for doc in docs: encoded_sentences = self.tokenizer.batch_encode_plus(doc, add_special_tokens=False) # Create "Start of word mask" for e in encoded_sentences.encodings: e.start_of_word = _get_start_of_word(e.words, e.special_tokens_mask) encoded_docs.append(encoded_sentences) # 2) Create sequence pairs that utilize full possible length up to max_seq_len # TODO make num special tokens more general # account for [CLS], [SEP], [SEP] max_num_tokens = self.max_seq_len - 3 for enc_doc in encoded_docs: current_chunk = [] current_length = 0 i = 0 while i < len(enc_doc.encodings): current_length += len(enc_doc[i].tokens) current_chunk.append(enc_doc[i]) if current_length >= max_num_tokens: # split our list of sequences (=chunk) into two sequences and create a sample out of it # (incl. 
special tokens and all other masks) sample, num_unused_segments = self._create_sample_bert_style( chunk=current_chunk, random_doc=encoded_docs[random.randint(0, len(encoded_docs)-1)], max_num_tokens=max_num_tokens, ) samples.append(sample) i -= num_unused_segments current_chunk = [] current_length = 0 i += 1 return samples def _create_sequence_pairs_no_next_sent(self, docs): samples = [] # flatten into list of sentences docs = [sent for doc in docs for sent in doc] # Tokenize + Encode masks #TODO fill up sequences rather than creating one-sentence-samples to make this more efficient encoded_pairs = self.tokenizer.batch_encode_plus(docs, max_length=self.max_seq_len, truncation=True, truncation_strategy="longest_first", add_special_tokens=True, padding='max_length' ) assert len(encoded_pairs.input_ids) == len(docs) # Create "Start of word mask" start_of_word = [] for e in encoded_pairs.encodings: start_of_word.append(_get_start_of_word(e.words, e.special_tokens_mask)) # Create Sample objects for idx in range(len(docs)): if len(encoded_pairs.input_ids[idx]) == 0: logger.warning( f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {docs[idx]}") continue # We don't populate 'tokenized' here as we skiped the intermediate string token stage abeoce to improve the speed ... samples.append(Sample(id=None, clear_text={"text_a": docs[idx]}, tokenized={"tokens": encoded_pairs.encodings[idx].tokens, "start_of_word": start_of_word[idx], "special_tokens_mask": encoded_pairs.encodings[idx].special_tokens_mask, "offsets": encoded_pairs.encodings[idx].offsets}, features={"input_ids": encoded_pairs.input_ids[idx], "segment_ids": encoded_pairs.token_type_ids[idx], "padding_mask": encoded_pairs.attention_mask[idx], } )) return samples def _create_sample_bert_style(self, chunk, random_doc, max_num_tokens, prob_next_sentence=0.5): """ Get one sample from corpus consisting of two sequences. A sequence can consist of more than one sentence. With prob. 50% these are two subsequent sequences from one doc. With 50% the second sequence will be a random one from another document. :param chunk: List of subsequent, tokenized and encoded sentences. :type chunk: [Encoding] :param random_doc: A random doc where we can sample a random next "sentence" from. :type random_doc: [str] :param max_num_tokens: Samples are truncated after this many tokens. 
:type max_num_tokens: int :return: (Sample, int) sample, number of unused sentences in chunk """ # edge case: if we have only a single sequence, we split that one in half if len(chunk) == 1: # Define splitting point if int(len(chunk[0].tokens) / 2) >= max_num_tokens: boundary = int(max_num_tokens / 2) else: boundary = int(len(chunk[0].tokens) / 2) # Insert special tokens input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=chunk[0].ids[:boundary], token_ids_1=chunk[0].ids[ boundary:max_num_tokens]) segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=chunk[0].ids[:boundary], token_ids_1=chunk[0].ids[ boundary:max_num_tokens]) # TODO make this general for other model types start_of_word = [0] + chunk[0].start_of_word[:boundary] + [0] + chunk[0].start_of_word[boundary:max_num_tokens] + [0] padding_mask = [1] * len(input_ids) assert len(start_of_word) == len(input_ids) assert len(padding_mask) == len(input_ids) assert len(segment_ids) == len(input_ids) sample = Sample(id=None, clear_text= {"text_a": None, "text_b": None, "nextsentence_label": True}, tokenized= {"start_of_word": start_of_word}, features= {"input_ids": input_ids, "segment_ids": segment_ids, "padding_mask": padding_mask, } ) num_unused_segments = 0 return sample, num_unused_segments else: # determine how many segments from chunk go into sequence A a_end = random.randrange(1, len(chunk)) sequence_a = chunk[:a_end] length_a = sum([len(seq) for seq in sequence_a]) # Build sequence B target_b_length = max_num_tokens - length_a # a) .. using actual next sequence if (random.random() > prob_next_sentence) and (len(chunk) > 1): sequence_b = chunk[a_end:] label = True num_unused_segments = 0 # b) ... using random next sequence else: sequence_b = [] length_b = 0 if len(random_doc.encodings) == 1: sequence_b.append(random_doc[0]) else: # pick random start sentence and then fill up to target length random_start = random.randrange(len(random_doc.encodings)-1) for i in range(random_start, len(random_doc.encodings)): sequence_b.append(random_doc[i]) length_b += len(random_doc[i].ids) if length_b >= target_b_length: break label = False # We didn't use all of the segments in this chunk as we sampled a random sequence => put them back num_unused_segments = len(chunk) - a_end # Join everything to single sample def merge_start_of_word(sequences): start_of_word = [] for s in sequences: start_of_word.extend(s.start_of_word) return start_of_word start_of_word_a = merge_start_of_word(sequence_a) start_of_word_b = merge_start_of_word(sequence_b) sequence_a = Encoding.merge(sequence_a) sequence_b = Encoding.merge(sequence_b) assert len(sequence_a.ids) > 0 assert len(sequence_b.ids) > 0 # Insert special tokens input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=sequence_a.ids, token_ids_1=sequence_b.ids[:target_b_length]) segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=sequence_a.ids, token_ids_1=sequence_b.ids[:target_b_length]) # TODO make this general for other model types start_of_word = [0] + start_of_word_a + [0] + start_of_word_b[:target_b_length] + [0] padding_mask = [1] * len(input_ids) if len(input_ids) < self.max_seq_len: # Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. 
Roberta where it is 1) pad_idx = self.tokenizer.pad_token_id padding = [pad_idx] * (self.max_seq_len - len(input_ids)) zero_padding = [0] * (self.max_seq_len - len(input_ids)) input_ids += padding padding_mask += zero_padding segment_ids += zero_padding start_of_word += zero_padding assert len(start_of_word) == len(input_ids) assert len(padding_mask) == len(input_ids) assert len(segment_ids) == len(input_ids) sample = Sample(id=None, clear_text={"text_a": None, "text_b": None, "nextsentence_label": label}, tokenized={"start_of_word": start_of_word}, features={"input_ids": input_ids, "segment_ids": segment_ids, "padding_mask": padding_mask, } ) return sample, num_unused_segments def _create_labels(self, sample, vocab_length) -> dict: # Mask random words input_ids, lm_label_ids = self._mask_random_words(sample.features["input_ids"], vocab_length, token_groups=sample.tokenized["start_of_word"]) sample.features["lm_label_ids"] = lm_label_ids sample.features["input_ids"] = input_ids # NSP label if self.next_sent_pred: # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true! if sample.clear_text["nextsentence_label"]: sample.features["nextsentence_label_ids"] = [0] else: sample.features["nextsentence_label_ids"] = [1] assert len(sample.features["input_ids"]) == self.max_seq_len assert len(sample.features["padding_mask"]) == self.max_seq_len assert len(sample.features["segment_ids"]) == self.max_seq_len assert len(sample.features["lm_label_ids"]) == self.max_seq_len return sample.features def _mask_random_words(self, tokens, vocab_length, token_groups=None, max_predictions_per_seq=20): """ Masking some random tokens for Language Model task with probabilities as in the original BERT paper. num_masked. If token_groups is supplied, whole word masking is applied, so *all* tokens of a word are either masked or not. This option was added by the BERT authors later and showed solid improvements compared to the original objective. Whole Word Masking means that if we mask all of the wordpieces corresponding to an original word. When a word has been split intoWordPieces, the first token does not have any marker and any subsequence tokens are prefixed with ##. So whenever we see the ## token, we append it to the previous set of word indexes. Note that Whole Word Masking does *not* change the training code at all -- we still predict each WordPiece independently, softmaxed over the entire vocabulary. This implementation is mainly a copy from the original code by Google, but includes some simplifications. :param tokens: tokenized sentence. :type tokens: [str] :param vocab_length: number of tokens in the vocabulary :type vocab_length: int :param token_groups: If supplied, only whole groups of tokens get masked. This can be whole words but also other types (e.g. spans). Booleans indicate the start of a group. :type token_groups: [bool] :param max_predictions_per_seq: maximum number of masked tokens :type max_predictions_per_seq: int :return: (list of int, list of int), masked tokens and related labels for LM prediction """ # 1. Combine tokens to one group (e.g. 
all subtokens of a word) cand_indices = [] for (i, token) in enumerate(tokens): if token == 101 or token == 102 or token == 0: continue if (token_groups and len(cand_indices) >= 1 and not token_groups[i]): cand_indices[-1].append(i) else: cand_indices.append([i]) num_to_mask = min(max_predictions_per_seq, max(1, int(round(len(tokens) * self.masked_lm_prob )))) random.shuffle(cand_indices) output_label = [-1] * len(tokens) num_masked = 0 assert 103 not in tokens #mask token # 2. Mask the first groups until we reach the number of tokens we wanted to mask (num_to_mask) for index_set in cand_indices: if num_masked >= num_to_mask: break # If adding a whole-word mask would exceed the maximum number of # predictions, then just skip this candidate. if num_masked + len(index_set) > num_to_mask: continue for index in index_set: prob = random.random() num_masked += 1 original_token = tokens[index] # 80% randomly change token to mask token if prob < 0.8: tokens[index] = 103 # 10% randomly change token to random token # TODO currently custom vocab is not included here elif prob < 0.9: tokens[index] = random.randint(0, vocab_length) # -> rest 10% randomly keep current token # append current token to output (we will predict these later) try: output_label[index] = original_token except KeyError: # For unknown words (should not occur with BPE vocab) output_label[index] = 100 # UNK token logger.warning( "Cannot find token '{}' in vocab. Using [UNK] instead".format(original_token) ) return tokens, output_label
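# The following is an illustrative, self-contained sketch of the whole word masking rule
# described in _mask_random_words() above. It is not part of the original FARM source:
# the helper name is made up, and the hard-coded ids (101=[CLS], 102=[SEP], 103=[MASK],
# 0=padding) mirror the BERT-base assumptions already made in _mask_random_words().
def sketch_mask_whole_words(input_ids, start_of_word, vocab_length,
                            masked_lm_prob=0.15, max_predictions_per_seq=20):
    # 1. Group subword indices into whole words, skipping special and padding tokens
    groups = []
    for i, token in enumerate(input_ids):
        if token in (101, 102, 0):
            continue
        if groups and not start_of_word[i]:
            groups[-1].append(i)
        else:
            groups.append([i])
    labels = [-1] * len(input_ids)
    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(input_ids) * masked_lm_prob))))
    random.shuffle(groups)
    num_masked = 0
    # 2. Mask whole groups until num_to_mask tokens are selected
    for group in groups:
        if num_masked >= num_to_mask:
            break
        if num_masked + len(group) > num_to_mask:
            continue
        for idx in group:
            labels[idx] = input_ids[idx]   # predict the original token at this position
            prob = random.random()
            if prob < 0.8:
                input_ids[idx] = 103       # 80%: replace with [MASK]
            elif prob < 0.9:
                input_ids[idx] = random.randint(0, vocab_length - 1)  # 10%: random token
            # remaining 10%: keep the original token unchanged
            num_masked += 1
    return input_ids, labels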
[docs] def estimate_n_samples(self, filepath, max_docs=500): """ Estimates the number of samples from a given file BEFORE preprocessing. Used in StreamingDataSilo to estimate the number of steps before actually processing the data. The estimated number of steps will impact some types of Learning Rate Schedules. :param filepath: str or Path, file with data used to create samples (e.g. train.txt) :param max_docs: int, maximum number of docs to read in & use for our estimate of n_samples :return: int, number of samples in the given dataset """ total_lines = sum(1 for line in open(filepath, encoding="utf-8")) empty_lines = sum(1 if line == "\n" else 0 for line in open(filepath, encoding="utf-8")) if self.next_sent_pred_style == "sentence": # one sample = two lines (except last line in doc) n_samples = total_lines - (2 * empty_lines) elif self.next_sent_pred_style == "bert-style": # Original BERT LM training (filling up sequence pairs with sentences until max_seq_len) # (This is a very rough heuristic, as we can only estimate the real number of samples AFTER tokenization) logging.info(f"Estimating total number of samples ...") # read in subset of docs if self.max_docs: temp = self.max_docs self.max_docs = min(max_docs, temp) dicts = list(self.file_to_dicts(filepath)) self.max_docs = temp else: self.max_docs = max_docs dicts = list(self.file_to_dicts(filepath)) self.max_docs = None # count samples dicts = [d["doc"] for d in dicts] n_samples = len(self._create_sequence_pairs_bert_style(docs=dicts)) # extrapolate to the whole file n_samples = int(n_samples / len(dicts)) * (empty_lines+1) logging.info(f"Heuristic estimate of number of samples in {filepath} based on {len(dicts)} docs: {n_samples}") else: raise NotImplementedError(f"No estimate logic for next_sent_pred_style={self.next_sent_pred_style} implemented") return n_samples
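# Quick sanity check of the "sentence" estimate above (illustrative only, assuming one
# sentence per line and exactly one empty separator line after every document, including
# the last one): 2 documents with 4 and 3 sentences give 9 lines in total, 2 of them empty.
total_lines, empty_lines = 9, 2
assert total_lines - (2 * empty_lines) == (4 - 1) + (3 - 1)  # 5 sentence-pair samples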
#########################################
# QA Processors ####
#########################################
[docs]class SquadProcessor(Processor): """ Used to handle the SQuAD dataset"""
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, label_list=None, metric="squad", train_filename=Path("train-v2.0.json"), dev_filename=Path("dev-v2.0.json"), test_filename=None, dev_split=0, doc_stride=128, max_query_length=64, proxies=None, max_answers=6, **kwargs ): """ :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"] :type label_list: list :param metric: name of metric that shall be used for evaluation, can be "squad" or "top_n_accuracy" :type metric: str :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param doc_stride: When the document containing the answer is too long it gets split into part, strided by doc_stride :type doc_stride: int :param max_query_length: Maximum length of the question (in number of subword tokens) :type max_query_length: int :param kwargs: placeholder for passing generic parameters :type kwargs: object """ self.target = "classification" self.ph_output_type = "per_token_squad" assert doc_stride < (max_seq_len - max_query_length), \ "doc_stride is longer than max_seq_len minus space reserved for query tokens. \nThis means that there will be gaps " \ "as the passage windows slide, causing the model to skip over parts of the document.\n" \ "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n " \ "Or decrease max_query_length" self.doc_stride = doc_stride self.max_query_length = max_query_length self.max_answers = max_answers super(SquadProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies ) self._initialize_special_tokens_count() if metric and label_list: self.add_task("question_answering", metric, label_list) else: logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for " "using the default task or add a custom task later via processor.add_task()")
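# A hedged usage sketch (not part of the original source): building a SquadProcessor for
# SQuAD v2.0. The model name and data_dir are placeholders; any tokenizer loadable via
# farm.modeling.tokenization.Tokenizer should work the same way.
def _example_build_squad_processor():
    tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-uncased")
    return SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        doc_stride=128,
        max_query_length=64,
        label_list=["start_token", "end_token"],
        metric="squad",
        data_dir="data/squad20",            # placeholder directory containing the json files
        train_filename="train-v2.0.json",
        dev_filename="dev-v2.0.json",
    )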
[docs] def dataset_from_dicts(self, dicts, indices, return_baskets=False): """ Convert input dictionaries into a pytorch dataset for Question Answering. For this we have an internal representation called "baskets". Each basket is a question-document pair. Each stage adds or transforms specific information to our baskets. @param dicts: dict, input dictionary with SQuAD style information present @param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique @param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference) """ # Convert to standard format pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion # Tokenize documents and questions baskets = tokenize_batch_question_answering(pre_baskets, self.tokenizer, indices) # Split documents into smaller passages to fit max_seq_len baskets = self._split_docs_into_passages(baskets) # Convert answers from string to token space, skip this step for inference if not return_baskets: baskets = self._convert_answers(baskets) # Convert internal representation (nested baskets + samples with mixed types) to pytorch features (arrays of numbers) baskets = self._passages_to_pytorch_features(baskets, return_baskets) # Convert features into pytorch dataset, this step also removes potential errors during preprocessing dataset, tensor_names, baskets = self._create_dataset(baskets) # Logging if 0 in indices: self._log_samples(1, baskets) # During inference we need to keep the information contained in baskets. if return_baskets: return dataset, tensor_names, self.problematic_sample_ids, baskets else: return dataset, tensor_names, self.problematic_sample_ids
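# An illustrative inference-style call (question and text are made up). With
# return_baskets=True the baskets needed during inference are returned alongside the dataset.
def _example_squad_inference_dataset(processor):
    qa_dict = {
        "text": "FARM is an open-source transfer learning framework built by deepset.",
        "questions": ["Who built FARM?"],
    }
    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=[qa_dict], indices=[0], return_baskets=True
    )
    return dataset, tensor_names, baskets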
[docs] def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) dicts = [y for x in nested_dicts for y in x["paragraphs"]] return dicts
# TODO use Input Objects instead of this function
[docs] def convert_qa_input_dict(self, infer_dict): """ Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or ["text", "questions"] (api format). This function converts the latter into the former. It also converts the is_impossible field to answer_type so that NQ and SQuAD dicts have the same format. """ # check doc_stride vs max_seq_len again here, since these parameters can be changed for already initialized models (e.g. in haystack) assert self.doc_stride < (self.max_seq_len - self.max_query_length), \ "doc_stride is longer than max_seq_len minus space reserved for query tokens. \nThis means that there will be gaps " \ "as the passage windows slide, causing the model to skip over parts of the document.\n" \ "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n " \ "Or decrease max_query_length" try: # Check if infer_dict is already in internal json format if "context" in infer_dict and "qas" in infer_dict: return infer_dict # converts dicts from inference mode to data structure used in FARM questions = infer_dict["questions"] text = infer_dict["text"] uid = infer_dict.get("id", None) qas = [{"question": q, "id": uid, "answers": [], "answer_type": None} for i, q in enumerate(questions)] converted = {"qas": qas, "context": text} return converted except KeyError: raise Exception("Input does not have the expected format")
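# For illustration, convert_qa_input_dict() maps the api format (first dict) to the
# internal SQuAD-style format (second dict); all values below are made up.
_api_style_input = {"text": "Berlin is the capital of Germany.",
                    "questions": ["What is the capital of Germany?"],
                    "id": "42"}
_internal_equivalent = {"context": "Berlin is the capital of Germany.",
                        "qas": [{"question": "What is the capital of Germany?",
                                 "id": "42",
                                 "answers": [],
                                 "answer_type": None}]}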
def _initialize_special_tokens_count(self): vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"], token_ids_1=["b"]) self.sp_toks_start = vec.index("a") self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1 self.sp_toks_end = len(vec) - vec.index("b") - 1 def _split_docs_into_passages(self, baskets): """ Because of the sequence length limitation of Language Models, the documents need to be divided into smaller parts that we call passages. """ n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) for basket in baskets: samples = [] ########## perform some basic checking # TODO, eventually move checking into input validation functions # ignore samples with empty context if basket.raw["document_text"] == "": logger.warning("Ignoring sample with empty context") continue ########## end checking # Calculate the number of tokens that can be reserved for the passage. This is calculated by considering # the max_seq_len, the number of tokens in the question and the number of special tokens that will be added # when the question and passage are joined (e.g. [CLS] and [SEP]) passage_len_t = self.max_seq_len - len(basket.raw["question_tokens"][:self.max_query_length]) - n_special_tokens # passage_spans is a list of dictionaries where each defines the start and end of each passage # on both token and character level try: passage_spans = get_passage_offsets(basket.raw["document_offsets"], self.doc_stride, passage_len_t, basket.raw["document_text"]) except Exception as e: logger.warning(f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n" f"With error: {e}") passage_spans = [] for passage_span in passage_spans: # Unpack each variable in the dictionary. The "_t" and "_c" indicate # whether the index is on the token or character level passage_start_t = passage_span["passage_start_t"] passage_end_t = passage_span["passage_end_t"] passage_start_c = passage_span["passage_start_c"] passage_end_c = passage_span["passage_end_c"] passage_start_of_word = basket.raw["document_start_of_word"][passage_start_t: passage_end_t] passage_tokens = basket.raw["document_tokens"][passage_start_t: passage_end_t] passage_text = basket.raw["document_text"][passage_start_c: passage_end_c] clear_text = {"passage_text": passage_text, "question_text": basket.raw["question_text"], "passage_id": passage_span["passage_id"], } tokenized = {"passage_start_t": passage_start_t, "passage_start_c": passage_start_c, "passage_tokens": passage_tokens, "passage_start_of_word": passage_start_of_word, "question_tokens": basket.raw["question_tokens"][:self.max_query_length], "question_offsets": basket.raw["question_offsets"][:self.max_query_length], "question_start_of_word": basket.raw["question_start_of_word"][:self.max_query_length], } # The sample ID consists of internal_id and a passage numbering sample_id = f"{basket.id_internal}-{passage_span['passage_id']}" samples.append(Sample(id=sample_id, clear_text=clear_text, tokenized=tokenized)) basket.samples=samples return baskets def _convert_answers(self, baskets): """ Converts answers that are pure strings into the token based representation with start and end token offset. Can handle multiple answers per question document pair as is common for development/text sets """ for basket in baskets: error_in_answer = False for num, sample in enumerate(basket.samples): # Dealing with potentially multiple answers (e.g. 
Squad dev set) # Initializing a numpy array of shape (max_answers, 2), filled with -1 for missing values label_idxs = np.full((self.max_answers, 2), fill_value=-1) if error_in_answer or (len(basket.raw["answers"]) == 0): # If there are no answers we set label_idxs[0, :] = 0 else: # For all other cases we use start and end token indices, that are relative to the passage for i, answer in enumerate(basket.raw["answers"]): # Calculate start and end relative to document answer_len_c = len(answer["text"]) answer_start_c = answer["answer_start"] answer_end_c = answer_start_c + answer_len_c - 1 # Convert character offsets to token offsets on document level answer_start_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_start_c) answer_end_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_end_c) # TODO remove after testing 'offset_to_token_idx_vecorized()' # answer_start_t2 = offset_to_token_idx(doc_offsets, answer_start_c) # answer_end_t2 = offset_to_token_idx(doc_offsets, answer_end_c) # if (answer_start_t != answer_start_t2) or (answer_end_t != answer_end_t2): # pass # Adjust token offsets to be relative to the passage answer_start_t -= sample.tokenized["passage_start_t"] answer_end_t -= sample.tokenized["passage_start_t"] # Initialize some basic variables question_len_t = len(sample.tokenized["question_tokens"]) passage_len_t = len(sample.tokenized["passage_tokens"]) # Check that start and end are contained within this passage # answer_end_t is 0 if the first token is the answer # answer_end_t is passage_len_t if the last token is the answer if passage_len_t > answer_start_t >= 0 and passage_len_t >= answer_end_t >= 0: # Then adjust the start and end offsets by adding question and special token label_idxs[i][0] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_start_t label_idxs[i][1] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_end_t # If the start or end of the span answer is outside the passage, treat passage as no_answer else: label_idxs[i][0] = 0 label_idxs[i][1] = 0 ########## answer checking ############################## # TODO, move this checking into input validation functions and delete wrong examples there # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn if answer_start_t < 0 or answer_end_t >= passage_len_t: pass else: doc_text = basket.raw["document_text"] answer_indices = doc_text[answer_start_c: answer_end_c + 1] answer_text = answer["text"] # check if answer string can be found in context if answer_text not in doc_text: logger.warning(f"Answer '{answer['text']}' not contained in context.\n" f"Example will not be converted for training/evaluation.") error_in_answer = True label_idxs[i][0] = -100 # TODO remove this hack also from featurization label_idxs[i][1] = -100 break # Break loop around answers, so the error message is not shown multiple times elif answer_indices.strip() != answer_text.strip(): logger.warning(f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n" f"Example will not be converted for training/evaluation.") error_in_answer = True label_idxs[i][0] = -100 # TODO remove this hack also from featurization label_idxs[i][1] = -100 break # Break loop around answers, so the error message is not shown multiple times ########## end of checking #################### sample.tokenized["labels"] = label_idxs return baskets def _passages_to_pytorch_features(self, baskets, return_baskets): 
""" Convert internal representation (nested baskets + samples with mixed types) to python features (arrays of numbers). We first join question and passages into on large vector. Then we add additional vectors for: - #TODO """ for basket in baskets: # Add features to samples for num, sample in enumerate(basket.samples): # Initialize some basic variables question_tokens = sample.tokenized["question_tokens"] question_start_of_word = sample.tokenized["question_start_of_word"] question_len_t = len(question_tokens) passage_start_t = sample.tokenized["passage_start_t"] passage_tokens = sample.tokenized["passage_tokens"] passage_start_of_word = sample.tokenized["passage_start_of_word"] passage_len_t = len(passage_tokens) sample_id = [int(x) for x in sample.id.split("-")] # - Combines question_tokens and passage_tokens into a single vector called input_ids # - input_ids also contains special tokens (e.g. CLS or SEP tokens). # - It will have length = question_len_t + passage_len_t + n_special_tokens. This may be less than # max_seq_len but never greater since truncation was already performed when the document was chunked into passages question_input_ids = sample.tokenized["question_tokens"] passage_input_ids = sample.tokenized["passage_tokens"] input_ids = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=question_input_ids, token_ids_1=passage_input_ids) segment_ids = self.tokenizer.create_token_type_ids_from_sequences(token_ids_0=question_input_ids, token_ids_1=passage_input_ids) # To make the start index of passage tokens the start manually seq_2_start_t = self.sp_toks_start + question_len_t + self.sp_toks_mid start_of_word = [0] * self.sp_toks_start + \ question_start_of_word + \ [0] * self.sp_toks_mid + \ passage_start_of_word + \ [0] * self.sp_toks_end # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. padding_mask = [1] * len(input_ids) # The passage mask has 1 for tokens that are valid start or ends for QA spans. # 0s are assigned to question tokens, mid special tokens, end special tokens and padding # Note that start special tokens are assigned 1 since they can be chosen for a no_answer prediction span_mask = [1] * self.sp_toks_start span_mask += [0] * question_len_t span_mask += [0] * self.sp_toks_mid span_mask += [1] * passage_len_t span_mask += [0] * self.sp_toks_end # Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1) pad_idx = self.tokenizer.pad_token_id padding = [pad_idx] * (self.max_seq_len - len(input_ids)) zero_padding = [0] * (self.max_seq_len - len(input_ids)) input_ids += padding padding_mask += zero_padding segment_ids += zero_padding start_of_word += zero_padding span_mask += zero_padding # TODO possibly remove these checks after input validation is in place len_check = len(input_ids) == len(padding_mask) == len(segment_ids) == len(start_of_word) == len(span_mask) id_check = len(sample_id) == 3 label_check = return_baskets or len(sample.tokenized.get("labels",[])) == self.max_answers label_check2 = return_baskets or np.all(sample.tokenized["labels"] > -99) # labels are set to -100 when answer cannot be found if len_check and id_check and label_check and label_check2: # - The first of the labels will be used in train, and the full array will be used in eval. 
# - start_of_word and spec_tok_mask are not actually needed by model.forward() but are needed for # model.formatted_preds() during inference for creating answer strings # - passage_start_t is index of passage's first token relative to document feature_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids, "passage_start_t": passage_start_t, "start_of_word": start_of_word, "labels": sample.tokenized.get("labels",[]), "id": sample_id, "seq_2_start_t": seq_2_start_t, "span_mask": span_mask} sample.features = [feature_dict] # other processor's features can be lists else: self.problematic_sample_ids.add(sample.id) sample.features = None return baskets def _create_dataset(self, baskets): """ Convert python features into pytorch dataset. Also removes potential errors during preprocessing. Flattens nested basket structure to create a flat list of features """ features_flat = [] basket_to_remove = [] for basket in baskets: if self._check_sample_features(basket): for sample in basket.samples: features_flat.extend(sample.features) else: # remove the entire basket basket_to_remove.append(basket) if len(basket_to_remove) > 0: for basket in basket_to_remove: # if basket_to_remove is not empty remove the related baskets baskets.remove(basket) dataset, tensor_names = convert_features_to_dataset(features=features_flat) return dataset, tensor_names, baskets def _log_samples(self, n_samples, baskets): logger.info("*** Show {} random examples ***".format(n_samples)) for i in range(n_samples): random_basket = random.choice(baskets) random_sample = random.choice(random_basket.samples) logger.info(random_sample)
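# Illustrative layout of the QA features built in _passages_to_pytorch_features() above,
# assuming a BERT-style tokenizer with one special token at the start, between the
# sequences and at the end (the 1/1/1 counts that _initialize_special_tokens_count()
# computes for BERT), a 5-token question and a 7-token passage:
_question_len_t, _passage_len_t = 5, 7
_sp_start, _sp_mid, _sp_end = 1, 1, 1
_seq_2_start_t = _sp_start + _question_len_t + _sp_mid       # first passage token sits at index 7
_span_mask = ([1] * _sp_start          # [CLS] stays selectable for no_answer predictions
              + [0] * _question_len_t
              + [0] * _sp_mid
              + [1] * _passage_len_t
              + [0] * _sp_end)
assert len(_span_mask) == 15 and _span_mask[_seq_2_start_t] == 1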
[docs]class NaturalQuestionsProcessor(Processor): """ Used to handle the Natural Question QA dataset"""
[docs] def __init__( self, tokenizer, max_seq_len, data_dir, train_filename=Path("train-v2.0.json"), dev_filename=Path("dev-v2.0.json"), test_filename=None, dev_split=0, doc_stride=128, max_query_length=64, proxies=None, keep_no_answer=0.02, downsample_context_size=None, inference=False, max_answers=6, **kwargs): """ Deals with all the preprocessing steps needed for Natural Questions. Follows Alberti 2019 et al. (https://arxiv.org/abs/1901.08634) in merging multiple disjoint short answers into the one longer label span and also by downsampling samples of no_answer during training :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: The name of the file containing the test data. :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param doc_stride: When the document containing the answer is too long it gets split into parts, strided by doc_stride :type doc_stride: int :param max_query_length: Maximum length of the question (in number of subword tokens) :type max_query_length: int :param keep_no_answer: The probability that a sample with an no_answer label is kept (0.0 < keep_no_answer <= 1.0). Only works if inference is False :type keep_no_answer: float :param downsample_context_size: Downsampling before any data conversion by taking a short text window of size downsample_context_size around the long answer span. To disable set to None :type downsample_context_size: int :param inference: Whether we are currently using the Processsor for model inference. If True, the keep_no_answer will be overridden and set to 1 :type inference: bool :param kwargs: placeholder for passing generic parameters :type kwargs: object """ self.target = "classification" self.ph_output_type = "per_token_squad" # These are classification labels from Natural Questions. 
Note that in this implementation, we are merging # the "long_answer" and "short_answer" labels into the one "span" label self.answer_type_list = ["no_answer", "span", "yes", "no"] self.doc_stride = doc_stride self.max_query_length = max_query_length self.keep_no_answer = keep_no_answer self.downsample_context_size = downsample_context_size self.inference = inference self.max_answers = max_answers super(NaturalQuestionsProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies ) # Todo rename metric from squad to maybe QA spans or something like that self.add_task("question_answering", "squad", ["start_token", "end_token"]) self.add_task("text_classification", "f1_macro", self.answer_type_list, label_name="answer_type") self._initialize_special_tokens_count()
def _initialize_special_tokens_count(self): vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"], token_ids_1=["b"]) self.sp_toks_start = vec.index("a") self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1 self.sp_toks_end = len(vec) - vec.index("b") - 1
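# To make the bookkeeping above concrete (illustrative only): for a BERT tokenizer the
# probe sequence is [CLS] a [SEP] b [SEP], for a RoBERTa tokenizer it is
# <s> a </s> </s> b </s>, so the derived counts differ.
_probe_bert = ["[CLS]", "a", "[SEP]", "b", "[SEP]"]
_start = _probe_bert.index("a")                       # 1
_mid = _probe_bert.index("b") - _start - 1            # 1
_end = len(_probe_bert) - _probe_bert.index("b") - 1  # 1
assert (_start, _mid, _end) == (1, 1, 1)
# the same arithmetic on <s> a </s> </s> b </s> yields (1, 2, 1) for RoBERTa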
[docs] def file_to_dicts(self, file: str) -> [dict]: dicts = read_jsonl(file, proxies=self.proxies) return dicts
def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]: """ This method will split question-document pairs from the SampleBasket into question-passage pairs which will each form one sample. The "t" and "c" in variables stand for token and character respectively. This uses many methods that the SquadProcessor calls but note that the SquadProcessor overwrites Processor._dicts_to_baskets() while the NaturalQuestionsProcessor does not. This was done in Squad to avoid retokenizing documents that are paired with multiple questions. This is not necessary for Natural Questions since there is generally a 1 to 1 mapping from document to question. Input dictionaries can have either ["context", "qas"] (internal format) as keys or ["text", "questions"] (api format). Both are supported. """ # Turns NQ dictionaries into a SQuAD style dictionaries if self._is_nq_dict(dictionary): dictionary = self._prepare_dict(dictionary=dictionary) dictionary_tokenized = self._apply_tokenization(dictionary, self.tokenizer, self.answer_type_list)[0] n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) samples = create_samples_qa_Natural_Question(dictionary_tokenized, self.max_query_length, self.max_seq_len, self.doc_stride, n_special_tokens) # Downsample the number of samples with an no_answer label. This fn will always return at least one sample # so that we don't end up with a basket with 0 samples if not self.inference: samples = self._downsample(samples, self.keep_no_answer) return samples @staticmethod def _is_nq_dict(dictionary): if set(dictionary.keys()) == {'document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'}: return True return False def _downsample(self, samples, keep_prob): # Downsamples samples with a no_answer label (since there is an overrepresentation of these in NQ) # This method will always return at least one sample. 
This is done so that we don't end up with SampleBaskets # with 0 samples ret = [] for s in samples: if self._check_no_answer_sample(s): if random_float() > 1 - keep_prob: ret.append(s) else: ret.append(s) if len(ret) == 0: ret = [random.choice(samples)] return ret def _downsample_unprocessed(self, dictionary): doc_text = dictionary["document_text"] doc_tokens = doc_text.split(" ") annotations = dictionary.get("annotations",[]) # for simplicity we only downsample wiki pages with one long answer annotation if len(annotations) == 1: annotation = annotations[0] # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer if self._check_no_answer(annotation): dictionary["document_text"] = " ".join(doc_tokens[:self.max_seq_len+randint(1,self.downsample_context_size)]) else: # finding earliest start and latest end labels long_answer_start = annotation['long_answer']['start_token'] long_answer_end = annotation['long_answer']['end_token'] short_answer_start = 1e10 short_answer_end = -1 for s in annotation["short_answers"]: if s["start_token"] < short_answer_start: short_answer_start = s["start_token"] if s["end_token"] > short_answer_end: short_answer_end = s["end_token"] start_threshold = min(long_answer_start,short_answer_start) - randint(1,self.downsample_context_size) start_threshold = max(0, start_threshold) end_threshold = max(long_answer_end,short_answer_end) + randint(1,self.downsample_context_size) # taking subset of doc text and shift labels sub_document_text = " ".join( doc_tokens[start_threshold:end_threshold] ) dictionary["document_text"] = sub_document_text # change of offsets happens in place (of dictionary) annotation['long_answer']['start_token'] -= start_threshold annotation['long_answer']['end_token'] -= start_threshold for s in annotation["short_answers"]: s["start_token"] -= start_threshold s["end_token"] -= start_threshold return dictionary def _prepare_dict(self, dictionary): """ Casts a Natural Questions dictionary that is loaded from a jsonl file into SQuAD format so that the same featurization functions can be called for both tasks. 
Each annotation can be one of four answer types, ["yes", "no", "span", "no_answer"]""" if self.downsample_context_size is not None: dictionary = self._downsample_unprocessed(dictionary) converted_answers = [] doc_text = dictionary["document_text"] _, tok_to_ch = split_with_metadata(doc_text) for annotation in dictionary["annotations"]: # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer if self._check_no_answer(annotation): continue sa_text, sa_start_c = self._unify_short_answers(annotation["short_answers"], doc_text, tok_to_ch) la_text, la_start_c = self._retrieve_long_answer(annotation["long_answer"]["start_token"], annotation["long_answer"]["end_token"], tok_to_ch, doc_text) # Picks the span to be considered as annotation by choosing between short answer, long answer and no_answer text, start_c = self._choose_span(sa_text, sa_start_c, la_text, la_start_c) converted_answers.append({"text": text, "answer_start": start_c}) if len(converted_answers) == 0: answer_type = "no_answer" else: answer_type = dictionary["annotations"][0]["yes_no_answer"].lower() if answer_type == "none": answer_type = "span" # TODO: answer_type should be in answers since in NQ, each annotator can give either a span, no_answer, yes or no converted = {"id": dictionary["example_id"], "context": doc_text, "qas": [{"question": dictionary["question_text"], "id": dictionary["example_id"], "answers": converted_answers, "answer_type": answer_type}]} return converted @staticmethod def _check_no_answer(annotation): if annotation["long_answer"]["start_token"] > -1 or annotation["long_answer"]["end_token"] > -1: return False for sa in annotation["short_answers"]: if sa["start_token"] > -1 or sa["end_token"] > -1: return False else: return True @staticmethod def _check_no_answer_sample(sample): sample_tok = sample.tokenized if len(sample_tok["answers"]) == 0: return True first_answer = sample_tok["answers"][0] if first_answer["start_t"] < sample_tok["passage_start_t"]: return True if first_answer["end_t"] > sample_tok["passage_start_t"] + len(sample_tok["passage_tokens"]): return True if first_answer["answer_type"] == "no_answer": return True else: return False def _retrieve_long_answer(self, start_t, end_t, tok_to_ch, doc_text): """ Retrieves the string long answer and also its starting character index""" start_c, end_c = self._convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text) text = doc_text[start_c: end_c] return text, start_c @staticmethod def _choose_span(sa_text, sa_start_c, la_text, la_start_c): if sa_text: return sa_text, sa_start_c elif la_text: return la_text, la_start_c else: return "", -1 def _unify_short_answers(self, short_answers, doc_text, tok_to_ch): """ In cases where an NQ sample has multiple disjoint short answers, this fn generates the single shortest span that contains all the answers""" if not short_answers: return "", -1 short_answer_idxs = [] # TODO write comment explaining this for short_answer in short_answers: short_answer_idxs.append(short_answer["start_token"]) short_answer_idxs.append(short_answer["end_token"]) answer_start_t = min(short_answer_idxs) answer_end_t = max(short_answer_idxs) answer_start_c, answer_end_c = self._convert_tok_to_ch(answer_start_t, answer_end_t, tok_to_ch, doc_text) answer_text = doc_text[answer_start_c: answer_end_c] assert answer_text == " ".join(doc_text.split()[answer_start_t: answer_end_t]) return answer_text, answer_start_c @staticmethod def _convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): n_tokens = 
len(tok_to_ch) if start_t == -1 and end_t == -1: return -1, -1 start_c = tok_to_ch[start_t] # when the end of the answer span is the end of the text if end_t == n_tokens: end_c = len(doc_text) else: next_word_start_c = tok_to_ch[end_t] span = doc_text[:next_word_start_c].strip() end_c = len(span) return start_c, end_c def _sample_to_features(self, sample: Sample) -> dict: self._check_valid_answer(sample) features = sample_to_features_qa_Natural_Questions(sample=sample, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len, sp_toks_start=self.sp_toks_start, sp_toks_mid=self.sp_toks_mid, sp_toks_end=self.sp_toks_end, answer_type_list=self.answer_type_list, max_answers=self.max_answers) return features def _check_valid_answer(self, sample): passage_text = sample.clear_text["passage_text"] for answer in sample.clear_text["answers"]: len_passage = len(passage_text) start = answer["start_c"] end = answer["end_c"] # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn if start < 0 or end >= len_passage: continue answer_indices = passage_text[start: end + 1] answer_text = answer["text"] if answer_indices != answer_text: raise ValueError( f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'""") def _dict_to_samples_and_features(self, dictionary: dict, **kwargs) -> [Sample]: """ This method will split the question-document pair from the dictionary into question-passage pairs which will each form one sample. The "t" and "c" in variables stand for token and character respectively. Input dictionaries can have either ["context", "qas"] (internal format) as keys or ["text", "questions"] (api format). Both are supported. """ if self._is_nq_dict(dictionary): dictionary = self._prepare_dict(dictionary=dictionary) basket_id_internal = kwargs["basket_id_internal"] dictionary_tokenized = self._apply_tokenization(dictionary, self.tokenizer, self.answer_type_list)[0] n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) samples = create_samples_qa_Natural_Question(dictionary_tokenized, self.max_query_length, self.max_seq_len, self.doc_stride, n_special_tokens) # Downsample the number of samples with an no_answer label. This fn will always return at least one sample # so that we don't end up with a basket with 0 samples. 
if not self.inference: samples = self._downsample(samples, self.keep_no_answer) # Get features for each sample for num, sample in enumerate(samples): sample.id = f"{basket_id_internal}-{num}" features = self._sample_to_features(sample) sample.features = features return samples def _apply_tokenization(self, dictionary, tokenizer, answer_types_list=[]): raw_baskets = [] dictionary = convert_qa_input_dict(dictionary) dictionary["qas"] = self._is_impossible_to_answer_type(dictionary["qas"]) document_text = dictionary["context"] document_tokenized = tokenize_with_metadata(document_text, tokenizer) document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] questions = dictionary["qas"] for question in questions: answers = [] # For training and dev with labelled examples try: external_id = question["id"] question_text = question["question"] for answer in question["answers"]: if 'answer_type' in answer.keys() and answer['answer_type'] in answer_types_list: answer_type = answer['answer_type'] else: if answer["text"] == "": answer_type = "no_answer" else: answer_type = "span" a = {"text": answer["text"], "offset": answer["answer_start"], "answer_type": answer_type} answers.append(a) # For inference where samples are read in as dicts without an id or answers except TypeError: external_id = try_get(ID_NAMES, dictionary) question_text = question question_tokenized = tokenize_with_metadata(question_text, tokenizer) question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] # During inference, there is no_answer type. Also, question might be a str instead of a dict if type(question) == str: answer_type = None elif type(question) == dict: answer_type = question.get("answer_type", None) else: raise Exception("Question was neither in str nor dict format") raw = {"document_text": document_text, "document_tokens": document_tokenized["tokens"], "document_offsets": document_tokenized["offsets"], "document_start_of_word": document_start_of_word, "question_text": question_text, "question_tokens": question_tokenized["tokens"], "question_offsets": question_tokenized["offsets"], "question_start_of_word": question_start_of_word, "answers": answers, "answer_type": answer_type, "external_id": external_id} raw_baskets.append(raw) return raw_baskets def _is_impossible_to_answer_type(self, qas): """ Converts questions from having an is_impossible field to having an answer_type field""" new_qas = [] for q in qas: answer_type = "span" if "is_impossible" in q: if q["is_impossible"] == True: answer_type = "no_answer" del q["is_impossible"] q["answer_type"] = answer_type new_qas.append(q) return new_qas
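# Illustration of _is_impossible_to_answer_type() above (made-up values): SQuAD v2 style
# entries with an is_impossible flag are rewritten to the answer_type representation used
# by this processor.
_qas_in = [{"question": "Who wrote it?", "answers": [], "is_impossible": True},
           {"question": "Who sang it?",
            "answers": [{"text": "Linda Davis", "answer_start": 98}],
            "is_impossible": False}]
# after conversion:
_qas_out = [{"question": "Who wrote it?", "answers": [], "answer_type": "no_answer"},
            {"question": "Who sang it?",
             "answers": [{"text": "Linda Davis", "answer_start": 98}],
             "answer_type": "span"}]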
[docs]class TextSimilarityProcessor(Processor): """ Used to handle the text DPR datasets that come in json format, example: nq-train.json, nq-dev.json, trivia-train.json, trivia-dev.json Datasets can be downloaded from the official DPR github repository (https://github.com/facebookresearch/DPR) dataset format: list of dictionaries with keys: 'dataset', 'question', 'answers', 'positive_ctxs', 'negative_ctxs', 'hard_negative_ctxs' Each sample is a dictionary of format: {"dataset": str, "question": str, "answers": list of str "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} } Example of 1 sample in DPR data json: { "dataset": "nq_dev_psgs_w100", "question": "who sings does he love me with reba", "answers": ["Linda Davis"], "positive_ctxs": [ { "title": "Does He Love You", "text": "Does He Love You \"Does He Love You\" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba's album \"Greatest Hits Volume Two\". It is one of country music's several songs about a love triangle. \"Does He Love You\" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members", "score": 1000, "title_score": 1, "passage_id": "11828866" }, { "title": "Does He Love You", "text": "Does He Love You \"Does He Love You\" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba's album \"Greatest Hits Volume Two\". It is one of country music's several songs about a love triangle. \"Does He Love You\" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members", "score": 13.394315, "title_score": 0, "passage_id": "11828866" }, .... ] "negative_ctxs": [ { "title": "Cormac McCarthy", "text": "chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from \"The New York Times\", \"McCarthy doesn't drink anymore \u2013 he quit 16 years ago in El Paso, with one of his young", "score": 0, "title_score": 0, "passage_id": "2145653" }, { "title": "Pragmatic Sanction of 1549", "text": "one heir, Charles effectively united the Netherlands as one entity. After Charles' abdication in 1555, the Seventeen Provinces passed to his son, Philip II of Spain. The Pragmatic Sanction is said to be one example of the Habsburg contest with particularism that contributed to the Dutch Revolt. Each of the provinces had its own laws, customs and political practices. 
The new policy, imposed from the outside, angered many inhabitants, who viewed their provinces as distinct entities. It and other monarchical acts, such as the creation of bishoprics and promulgation of laws against heresy, stoked resentments, which fired the eruption of", "score": 0, "title_score": 0, "passage_id": "2271902" }, ..... ] "hard_negative_ctxs": [ { "title": "Why Don't You Love Me (Beyonce\u0301 song)", "text": "song. According to the lyrics of \"Why Don't You Love Me\", Knowles impersonates a woman who questions her love interest about the reason for which he does not value her fabulousness, convincing him she's the best thing for him as she sings: \"Why don't you love me... when I make me so damn easy to love?... I got beauty... I got class... I got style and I got ass...\". The singer further tells her love interest that the decision not to choose her is \"entirely foolish\". Originally released as a pre-order bonus track on the deluxe edition of \"I Am...", "score": 14.678405, "title_score": 0, "passage_id": "14525568" }, { "title": "Does He Love You", "text": "singing the second chorus. Reba stays behind the wall the whole time, while Linda is in front of her. It then briefly goes back to the dressing room, where Reba continues to smash her lover's picture. The next scene shows Reba approaching Linda's house in the pouring rain at night, while Linda stands on her porch as they sing the bridge. The scene then shifts to the next day, where Reba watches from afar as Linda and the man are seen on a speedboat, where he hugs her, implying that Linda is who he truly loves. Reba finally smiles at", "score": 14.385411, "title_score": 0, "passage_id": "11828871" }, ...] """
[docs] def __init__( self, tokenizer, passage_tokenizer, max_seq_len_query, max_seq_len_passage, data_dir="", metric=None, train_filename="train.json", dev_filename=None, test_filename="test.json", dev_split=0.1, proxies=None, max_samples=None, embed_title=True, num_positives=1, num_hard_negatives=1, shuffle_negatives=True, shuffle_positives=False, label_list=None, **kwargs ): """ :param tokenizer: Used to split a question (str) into tokens :param passage_tokenizer: Used to split a passage (str) into tokens. :param max_seq_len_query: Query samples are truncated after this many tokens. :type max_seq_len_query: int :param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens. :type max_seq_len_passage: int :param data_dir: The directory in which the train and dev files can be found. If not available the dataset will be loaded automaticaly if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_. :type data_dir: str :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro". Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value. For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn]. :type metric: str, function, or list :param train_filename: The name of the file containing training data. :type train_filename: str :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set will be a slice of the train set. :type dev_filename: str or None :param test_filename: None :type test_filename: str :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None :type dev_split: float :param proxies: proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :type proxies: dict :param max_samples: maximum number of samples to use :type max_samples: int :param embed_title: Whether to embed title in passages during tensorization (bool), :param num_hard_negatives: maximum number to hard negative context passages in a sample :param num_positives: maximum number to positive context passages in a sample :param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the num_hard_negative number of passages :type shuffle_negatives: bool :param shuffle_positives: Whether to shuffle all the positive passages before selecting the num_positive number of passages :type shuffle_positives: bool :param label_list: list of labels to predict. Usually ["hard_negative", "positive"] :type label_list: list[str] :param kwargs: placeholder for passing generic parameters :type kwargs: object """ #TODO If an arg is misspelt, e.g. 
metrics, it will be swallowed silently by kwargs # Custom processor attributes self.max_samples = max_samples self.query_tokenizer = tokenizer self.passage_tokenizer = passage_tokenizer self.embed_title = embed_title self.num_hard_negatives = num_hard_negatives self.num_positives = num_positives self.shuffle_negatives = shuffle_negatives self.shuffle_positives = shuffle_positives self.max_seq_len_query = max_seq_len_query self.max_seq_len_passage = max_seq_len_passage super(TextSimilarityProcessor, self).__init__( tokenizer=tokenizer, max_seq_len=max_seq_len_query, train_filename=train_filename, dev_filename=dev_filename, test_filename=test_filename, dev_split=dev_split, data_dir=data_dir, tasks={}, proxies=proxies, ) if metric: self.add_task(name="text_similarity", metric=metric, label_list=label_list, label_name="label", task_type="text_similarity") else: logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for " "using the default task or add a custom task later via processor.add_task()")
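# A hedged usage sketch (not part of the original source): building a TextSimilarityProcessor
# for DPR-style training data. The checkpoint names, data_dir and the metric name
# "text_similarity_metric" are assumptions; any compatible query/passage tokenizers and any
# registered metric (or custom function) can be used instead.
def _example_build_dpr_processor():
    query_tokenizer = Tokenizer.load("facebook/dpr-question_encoder-single-nq-base")
    passage_tokenizer = Tokenizer.load("facebook/dpr-ctx_encoder-single-nq-base")
    return TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=64,
        max_seq_len_passage=256,
        data_dir="data/retriever",          # placeholder directory with the DPR json files
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        metric="text_similarity_metric",    # assumed metric name
        label_list=["hard_negative", "positive"],
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1,
    )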
[docs] @classmethod def load_from_dir(cls, load_dir): """ Overrides the method from the parent class to **always** load the TextSimilarityProcessor instead of the specific class stored in the config. :param load_dir: str, directory that contains a 'processor_config.json' :return: An instance of a TextSimilarityProcessor """ # read config processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"]) passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"]) # we have to delete the tokenizer strings from config, because we pass the tokenizers as objects del config["tokenizer"] del config["passage_tokenizer"] processor = cls.load(tokenizer=tokenizer, passage_tokenizer=passage_tokenizer, processor_name="TextSimilarityProcessor", **config) for task_name, task in config["tasks"].items(): processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"]) if processor is None: raise Exception return processor
[docs] def save(self, save_dir): """ Saves the vocabulary to file and also creates a json file containing all the information needed to load the same processor. :param save_dir: Directory where the files are to be saved :type save_dir: str """ os.makedirs(save_dir, exist_ok=True) config = self.generate_config() # save tokenizer incl. attributes config["tokenizer"] = self.tokenizer.__class__.__name__ config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__ # Because the fast tokenizers expect a str and not Path # always convert Path to str here. self.tokenizer.save_pretrained(str(save_dir)) self.passage_tokenizer.save_pretrained(str(save_dir)) # save processor config["processor"] = self.__class__.__name__ output_config_file = Path(save_dir) / "processor_config.json" with open(output_config_file, "w") as file: json.dump(config, file)
[docs] def file_to_dicts(self, file: str) -> [dict]: """ Converts a Dense Passage Retrieval (DPR) data file in json format to a list of dictionaries. :param file: filename of DPR data in json format Returns: list of dictionaries: List[dict] each dictionary: {"query": str, "passages": [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123}, {"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134}, ...]} """ dicts = read_dpr_json(file, max_samples=self.max_samples) return dicts
def _normalize_question(self, question: str) -> str: """ Removes '?' from queries/questions :param question: string representing the question Returns: Question without the '?' """ if question[-1] == '?': question = question[:-1] return question def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: """ Creates one sample from one dict consisting of the query, positive passages and hard negative passages :param dictionary: {"query": str, "passages": List[ {'title': str, 'text': str, 'label': 'hard_negative', 'external_id': str}, {'title': str, 'text': str, 'label': 'positive', 'external_id': str}, .... ] } Returns: sample: instance of Sample """ clear_text = {} tokenized = {} features = {} # extract query, positive context passages and titles, hard-negative passages and titles if "query" in dictionary.keys(): query = self._normalize_question(dictionary["query"]) # featurize the query query_inputs = self.query_tokenizer.encode_plus( text=query, max_length=self.max_seq_len_query, add_special_tokens=True, truncation=True, truncation_strategy='longest_first', padding="max_length", return_token_type_ids=True, ) query_input_ids, query_segment_ids, query_padding_mask = query_inputs["input_ids"], query_inputs[ "token_type_ids"], query_inputs["attention_mask"] # tokenize query tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_input_ids) if len(tokenized_query) == 0: logger.warning( f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize") return None clear_text["query_text"] = query tokenized["query_tokens"] = tokenized_query features["query_input_ids"] = query_input_ids features["query_segment_ids"] = query_segment_ids features["query_attention_mask"] = query_padding_mask if "passages" in dictionary.keys(): positive_context = list(filter(lambda x: x["label"] == "positive", dictionary["passages"])) if self.shuffle_positives: random.shuffle(positive_context) positive_context = positive_context[:self.num_positives] hard_negative_context = list(filter(lambda x: x["label"] == "hard_negative", dictionary["passages"])) if self.shuffle_negatives: random.shuffle(hard_negative_context) hard_negative_context = hard_negative_context[:self.num_hard_negatives] positive_ctx_titles = [passage.get("title", None) for passage in positive_context] positive_ctx_texts = [passage["text"] for passage in positive_context] hard_negative_ctx_titles = [passage.get("title", None) for passage in hard_negative_context] hard_negative_ctx_texts = [passage["text"] for passage in hard_negative_context] # all context passages and labels: 1 for positive context and 0 for hard-negative context ctx_label = [1]*self.num_positives + [0]*self.num_hard_negatives #(self.num_positives if self.num_positives < len(positive_context) else len(positive_context)) + \ # +(self.num_hard_negatives if self.num_hard_negatives < len(hard_negative_context) else len(hard_negative_context)) # featurize context passages if self.embed_title: # concatenate title with positive context passages + negative context passages def _combine_title_context(titles, texts): res = [] for title, ctx in zip(titles, texts): if title is None: title = "" logger.warning( f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. 
Related passage text: '{ctx}' ") res.append(tuple((title, ctx))) return res all_ctx = _combine_title_context(positive_ctx_titles, positive_ctx_texts) + _combine_title_context( hard_negative_ctx_titles, hard_negative_ctx_texts) else: all_ctx = positive_ctx_texts + hard_negative_ctx_texts # assign empty string tuples if hard_negative passages less than num_hard_negatives all_ctx += [('', '')] * ((self.num_positives + self.num_hard_negatives)-len(all_ctx)) ctx_inputs = self.passage_tokenizer.batch_encode_plus( all_ctx, add_special_tokens=True, truncation=True, padding="max_length", max_length=self.max_seq_len_passage, return_token_type_ids=True ) ctx_input_ids, ctx_segment_ids_, ctx_padding_mask = ctx_inputs["input_ids"], ctx_inputs["token_type_ids"], \ ctx_inputs["attention_mask"] ctx_segment_ids = list(torch.zeros((len(ctx_segment_ids_), len(ctx_segment_ids_[0]))).numpy()) # tokenize query and contexts tokenized_passage = [self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in ctx_input_ids] if len(tokenized_passage) == 0: logger.warning(f"The context could not be tokenized, likely because it contains a character that the context tokenizer does not recognize") return None clear_text["passages"] = positive_context + hard_negative_context tokenized["passages_tokens"] = tokenized_passage features["passage_input_ids"] = ctx_input_ids features["passage_segment_ids"] = ctx_segment_ids features["passage_attention_mask"] = ctx_padding_mask features["label_ids"] = ctx_label sample = Sample(id=None, clear_text=clear_text, tokenized=tokenized, features=features) return [sample] def _sample_to_features(self, sample) -> dict: return [sample.features] def _dict_to_samples_and_features(self, dictionary: dict, **kwargs) -> [Sample]: samples = self._dict_to_samples(dictionary, **kwargs) for sample in samples: sample.features = self._sample_to_features(sample) return samples
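# Small illustration of how passages are arranged before tensorization in _dict_to_samples()
# above, assuming num_positives=1, num_hard_negatives=1 and embed_title=True (text shortened
# from the docstring example of this class): each passage becomes a (title, text) tuple and
# label_ids marks positives with 1 and hard negatives with 0, in that order.
_positive = {"title": "Does He Love You",
             "text": "... recorded as a duet by Reba McEntire and Linda Davis ...",
             "label": "positive"}
_hard_negative = {"title": "Cormac McCarthy",
                  "text": "... lived in Ibiza, Spain, in the 1960s ...",
                  "label": "hard_negative"}
_all_ctx = [(_positive["title"], _positive["text"]),
            (_hard_negative["title"], _hard_negative["text"])]
_ctx_label = [1] * 1 + [0] * 1   # num_positives positives first, then num_hard_negatives hard negatives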