# Source code for farm.data_handler.data_silo

import copy
import logging
import torch.multiprocessing as mp
from contextlib import ExitStack
from functools import partial
import random
from pathlib import Path
from itertools import chain, groupby

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import ConcatDataset, Dataset, Subset, IterableDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
import torch
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm import tqdm

from farm.data_handler.dataloader import NamedDataLoader
from farm.data_handler.processor import Processor, BertStyleLMProcessor
from farm.data_handler.utils import grouper
from farm.modeling.tokenization import EmbeddingTokenizer
from farm.utils import MLFlowLogger as MlLogger
from farm.utils import log_ascii_workers, calc_chunksize
from farm.utils import get_dict_checksum
from farm.visual.ascii.images import TRACTOR_SMALL


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)



class DataSilo:
    """ Generates and stores PyTorch DataLoader objects for the train, dev and test datasets.
    Relies upon functionality in the processor to do the conversion of the data. Will also
    calculate and display some statistics.
     """

[docs]    def __init__(
        self,
        processor,
        batch_size,
        eval_batch_size=None,
        distributed=False,
        automatic_loading=True,
        max_multiprocessing_chunksize=2000,
        max_processes=128,
        caching=False,
        cache_path=Path("cache/data_silo"),
    ):
        """
        :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
        :type processor: Processor
        :param batch_size: The size of batch that should be returned by the DataLoader for the training set.
        :type batch_size: int
        :param eval_batch_size: The size of batch that should be returned by the DataLoaders for the dev and test set.
        :type eval_batch_size: int
        :param distributed: Set to True if you are running in a distributed evn, e.g. using DistributedDataParallel.
                            The DataSilo will init the DataLoader with a DistributedSampler() to distribute batches.
        :type distributed: bool
        :param automatic_loading: Set to False, if you don't want to automatically load data at initialization.
        :type automatic_loading: bool
        :param max_multiprocessing_chunksize: max possible value for chunksize as calculated by `calc_chunksize()`
            in `farm.utils`. For certain cases like lm_finetuning, a smaller value can be set, as the default chunksize
            values are rather large that might cause memory issues.
        :type max_multiprocessing_chunksize: int
        :param max_processes: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
                              It can be set to 1 to disable the use of multiprocessing or make debugging easier.
        :type max_processes: int
        :param caching: save the processed datasets on disk to save time/compute if the same train data is used to run
                        multiple experiments. Each cache has a checksum based on the train_filename of the Processor
                        and the batch size.
        :type caching: bool
        :param cache_path: root dir for storing the datasets' cache.
        :type cache_path: Path
        """
        self.distributed = distributed
        self.processor = processor
        self.data = {}
        self.batch_size = batch_size
        self.class_weights = None
        self.max_processes = max_processes
        self.max_multiprocessing_chunksize = max_multiprocessing_chunksize
        self.caching = caching
        self.cache_path = cache_path
        self.tensor_names = None
        if eval_batch_size is None:
            self.eval_batch_size = batch_size
        else:
            self.eval_batch_size = eval_batch_size

        if len(self.processor.tasks) == 0:
            raise Exception("No task initialized. Try initializing the processor with a metric and a label list. "
                            "Alternatively you can add a task using Processor.add_task()")

        if type(self.processor.tokenizer) == EmbeddingTokenizer:
            if max_processes != 1:
                logger.warning("Multiprocessing not efficient for WordEmbedding Tokenizers. Please set max_process \n"
                            "argument in DataSilo to 1.")

        loaded_from_cache = False
        if self.caching:  # Check if DataSets are present in cache
            checksum = self._get_checksum()
            dataset_path = self.cache_path / checksum

            if dataset_path.exists():
                self._load_dataset_from_cache(dataset_path)
                loaded_from_cache = True

        if not loaded_from_cache and automatic_loading:
            # In most cases we want to load all data automatically, but in some cases we rather want to do this
            # later or load from dicts instead of file (https://github.com/deepset-ai/FARM/issues/85)
            self._load_data()

    @classmethod
    def _dataset_from_chunk(cls, chunk, processor):
        """
        Creating a dataset for a chunk (= subset) of dicts. In multiprocessing:
          * we read in all dicts from a file
          * split all dicts into chunks
          * feed *one chunk* to *one process*
          => the *one chunk*  gets converted to *one dataset* (that's what we do here)
          * all datasets get collected and concatenated
        :param chunk: Instead of only having a list of dicts here we also supply an index (ascending int) for each.
            => [(0, dict), (1, dict) ...]
        :type chunk: list of tuples
        :param processor: FARM Processor (e.g. TextClassificationProcessor)
        :return: PyTorch Dataset
        """
        dicts = [d[1] for d in chunk]
        indices = [x[0] for x in chunk]
        dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices)
        return dataset, tensor_names, problematic_sample_ids

    def _get_dataset(self, filename, dicts=None):
        if not filename and not dicts:
            raise ValueError("You must either supply `filename` or `dicts`")

        # loading dicts from file (default)
        if dicts is None:
            dicts = list(self.processor.file_to_dicts(filename))
            #shuffle list of dicts here if we later want to have a random dev set splitted from train set
            if str(self.processor.train_filename) in str(filename):
                if not self.processor.dev_filename:
                    if self.processor.dev_split > 0.0:
                        random.shuffle(dicts)

        num_dicts = len(dicts)
        multiprocessing_chunk_size, num_cpus_used = calc_chunksize(
            num_dicts=num_dicts,
            max_processes=self.max_processes,
            max_chunksize=self.max_multiprocessing_chunksize,
        )

        with ExitStack() as stack:
            if self.max_processes > 1:  # use multiprocessing only when max_processes > 1
                p = stack.enter_context(mp.Pool(processes=num_cpus_used))

                logger.info(
                    f"Got ya {num_cpus_used} parallel workers to convert {num_dicts} dictionaries "
                    f"to pytorch datasets (chunksize = {multiprocessing_chunk_size})..."
                )
                log_ascii_workers(num_cpus_used, logger)

                results = p.imap(
                    partial(self._dataset_from_chunk, processor=self.processor),
                    grouper(dicts, multiprocessing_chunk_size),
                    chunksize=1,
                )
            else:
                logger.info(
                    f"Multiprocessing disabled, using a single worker to convert {num_dicts}"
                    f"dictionaries to pytorch datasets."
                )

                results = map(partial(self._dataset_from_chunk, processor=self.processor), grouper(dicts, num_dicts))

            datasets = []
            problematic_ids_all = set()

            desc = f"Preprocessing Dataset"
            if filename:
                desc += f" {filename}"
            with tqdm(total=len(dicts), unit=' Dicts', desc=desc) as pbar:
                for dataset, tensor_names, problematic_samples in results:
                    datasets.append(dataset)
                    # update progress bar (last step can have less dicts than actual chunk_size)
                    pbar.update(min(multiprocessing_chunk_size, pbar.total-pbar.n))
                    problematic_ids_all.update(problematic_samples)

            self.processor.log_problematic(problematic_ids_all)
            # _dataset_from_chunk can return a None in cases where downsampling has occurred
            datasets = [d for d in datasets if d]
            concat_datasets = ConcatDataset(datasets)
            return concat_datasets, tensor_names

    def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
        """
        Loading the train, dev and test datasets either from files (default) or from supplied dicts.
        The processor is called to handle the full conversion from "raw data" to a Pytorch Dataset.
        The resulting datasets are loaded into DataSilo.data. Afterwards the datasets are cached
        (if caching is enabled), summary statistics are computed and the data loaders are created.

        :param train_dicts: (Optional) dicts containing examples for training.
        :param dev_dicts: (Optional) dicts containing examples for dev.
        :param test_dicts: (Optional) dicts containing examples for test.
        :return: None
        """

        logger.info("\nLoading data into the data silo ..."
                    "{}".format(TRACTOR_SMALL))
        # train data
        logger.info("LOADING TRAIN DATA")
        logger.info("==================")
        if train_dicts:
            # either from supplied dicts
            logger.info("Loading train set from supplied dicts ")
            self.data["train"], self.tensor_names = self._get_dataset(filename=None, dicts=train_dicts)
        elif self.processor.train_filename:
            # or from a file (default)
            train_file = self.processor.data_dir / self.processor.train_filename
            logger.info("Loading train set from: {} ".format(train_file))
            self.data["train"], self.tensor_names = self._get_dataset(train_file)
        else:
            logger.info("No train set is being loaded")
            self.data["train"] = None

        # dev data
        logger.info("")
        logger.info("LOADING DEV DATA")
        logger.info("=================")
        if dev_dicts:
            # either from supplied dicts
            logger.info("Loading dev set from supplied dicts ")
            self.data["dev"], self.tensor_names = self._get_dataset(filename=None, dicts=dev_dicts)
        elif self.processor.dev_filename:
            # or from file (default)
            dev_file = self.processor.data_dir / self.processor.dev_filename
            logger.info("Loading dev set from: {}".format(dev_file))
            if self.tensor_names:
                self.data["dev"], _ = self._get_dataset(dev_file)
            else:
                # no train set was loaded, so the tensor names have to come from the dev set
                self.data["dev"], self.tensor_names = self._get_dataset(dev_file)
        elif self.processor.dev_split > 0.0 and self.data["train"] is not None:
            # or split it apart from train set (only possible if a train set exists)
            logger.info("Loading dev set as a slice of train set")
            self._create_dev_from_train()
        else:
            logger.info("No dev set is being loaded")
            self.data["dev"] = None

        logger.info("")
        logger.info("LOADING TEST DATA")
        logger.info("=================")
        # test data
        if test_dicts:
            # either from supplied dicts
            logger.info("Loading test set from supplied dicts ")
            self.data["test"], self.tensor_names = self._get_dataset(filename=None, dicts=test_dicts)
        elif self.processor.test_filename:
            # or from file (default)
            test_file = self.processor.data_dir / self.processor.test_filename
            logger.info("Loading test set from: {}".format(test_file))
            if self.tensor_names:
                self.data["test"], _ = self._get_dataset(test_file)
            else:
                self.data["test"], self.tensor_names = self._get_dataset(test_file)
        else:
            logger.info("No test set is being loaded")
            self.data["test"] = None

        if self.caching:
            self._save_dataset_to_cache()

        # derive stats and meta data
        self._calculate_statistics()
        # self.calculate_class_weights()

        self._initialize_data_loaders()

    def _load_dataset_from_cache(self, cache_dir):
        """
        Restore the serialized datasets and tensor names from a cache directory,
        then recompute the statistics and rebuild the data loaders.

        The train dataset and the tensor names are always read; dev and test are
        only read when their file exists and are set to None otherwise.

        :param cache_dir: directory holding the files written by `_save_dataset_to_cache()`
        :type cache_dir: Path
        """
        logger.info(f"Loading datasets from cache at {cache_dir}")
        # NOTE: torch.load unpickles the files, so the cache directory must be trusted.
        self.data["train"] = torch.load(cache_dir / "train_dataset")

        for split, file_name in (("dev", "dev_dataset"), ("test", "test_dataset")):
            split_path = cache_dir / file_name
            self.data[split] = torch.load(split_path) if split_path.exists() else None

        self.tensor_names = torch.load(cache_dir / "tensor_names")

        # derive stats and meta data
        self._calculate_statistics()
        # self.calculate_class_weights()

        self._initialize_data_loaders()

    def _get_checksum(self):
        """
        Compute the checksum that identifies the cache belonging to this DataSilo.

        The checksum is derived from the processor's absolute train filename and data dir,
        its max_seq_len, dev_split and tasks.

        :return: checksum as returned by `get_dict_checksum()`
        """
        proc = self.processor

        # every entry contributes to the uniqueness of a given DataSilo
        payload_dict = {}
        payload_dict["train_filename"] = str(Path(proc.train_filename).absolute())
        payload_dict["data_dir"] = str(proc.data_dir.absolute())
        payload_dict["max_seq_len"] = proc.max_seq_len
        payload_dict["dev_split"] = proc.dev_split
        payload_dict["tasks"] = proc.tasks

        return get_dict_checksum(payload_dict)

    def _save_dataset_to_cache(self):
        """
        Serialize the datasets and the tensor names into the cache directory of this DataSilo.

        The directory is `cache_path / checksum` and is created if missing. The train dataset
        is always written; dev and test are only written when they are truthy.
        """
        cache_dir = self.cache_path / self._get_checksum()
        cache_dir.mkdir(parents=True, exist_ok=True)

        torch.save(self.data["train"], cache_dir / "train_dataset")

        for split, file_name in (("dev", "dev_dataset"), ("test", "test_dataset")):
            if self.data[split]:
                torch.save(self.data[split], cache_dir / file_name)

        torch.save(self.tensor_names, cache_dir / "tensor_names")
        logger.info(f"Cached the datasets at {cache_dir}")

    def _initialize_data_loaders(self):
        """ Initializing train, dev and test data loaders for the already loaded datasets """

        if self.data["train"] is not None:
            if self.distributed:
                sampler_train = DistributedSampler(self.data["train"])
            else:
                sampler_train = RandomSampler(self.data["train"])

            data_loader_train = NamedDataLoader(
                dataset=self.data["train"],
                sampler=sampler_train,
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_train = None

        if self.data["dev"] is not None:
            data_loader_dev = NamedDataLoader(
                dataset=self.data["dev"],
                sampler=SequentialSampler(self.data["dev"]),
                batch_size=self.eval_batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_dev = None

        if self.data["test"] is not None:
            data_loader_test = NamedDataLoader(
                dataset=self.data["test"],
                sampler=SequentialSampler(self.data["test"]),
                batch_size=self.eval_batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_test = None

        self.loaders = {
            "train": data_loader_train,
            "dev": data_loader_dev,
            "test": data_loader_test,
        }

    def _create_dev_from_train(self):
        """ Split a dev set apart from the train dataset """
        n_dev = int(self.processor.dev_split * len(self.data["train"]))
        n_train = len(self.data["train"]) - n_dev

        train_dataset, dev_dataset = self.random_split_ConcatDataset(self.data["train"], lengths=[n_train, n_dev])
        self.data["train"] = train_dataset
        if(len(dev_dataset) > 0):
            self.data["dev"] = dev_dataset
        else:
            logger.warning("No dev set created. Please adjust the dev_split parameter.")

        logger.info(
            f"Took {len(dev_dataset)} samples out of train set to create dev set (dev split is roughly {self.processor.dev_split})"
        )

[docs]    def random_split_ConcatDataset(self, ds, lengths):
        """
        Roughly split a Concatdataset into non-overlapping new datasets of given lengths.
        Samples inside Concatdataset should already be shuffled

        :param ds: Dataset to be split
        :type ds: Dataset
        :param lengths: lengths of splits to be produced
        :type lengths: list
        """
        if sum(lengths) != len(ds):
            raise ValueError("Sum of input lengths does not equal the length of the input dataset!")

        try:
            idx_dataset = np.where(np.array(ds.cumulative_sizes) > lengths[0])[0][0]
        except IndexError:
            raise Exception("All dataset chunks are being assigned to train set leaving no samples for dev set. "
                            "Either consider increasing dev_split or setting it to 0.0\n"
                            f"Cumulative chunk sizes: {ds.cumulative_sizes}\n"
                            f"train/dev split: {lengths}")

        assert idx_dataset >= 1, "Dev_split ratio is too large, there is no data in train set. " \
                             f"Please lower dev_split = {self.processor.dev_split}"

        train = ConcatDataset(ds.datasets[:idx_dataset])
        test = ConcatDataset(ds.datasets[idx_dataset:])
        return train, test

    def _calculate_statistics(self):
        """
        Compute, log and store simple summary statistics of the datasets.

        Sets `self.counts` (number of samples per split), `self.ave_len` (mean number of
        non-padding entries in the first tensor of the train samples) and `self.clipped`
        (share of train samples whose length equals the tensor width). The values are also
        sent to the MlLogger.
        """
        logger.info("")
        logger.info("DATASETS SUMMARY")
        logger.info("================")

        self.counts = {}
        for split in ("train", "dev", "test"):
            split_data = self.data[split]
            self.counts[split] = len(split_data) if split_data else 0

        seq_lens = []
        train_data = self.data["train"]
        if train_data:
            for chunk_dataset in train_data.datasets:
                # first tensor of every sample; presumably the input ids -- verify against the processor
                first_tensor = chunk_dataset[:][0]
                is_not_padding = first_tensor.numpy() != self.processor.tokenizer.pad_token_id
                seq_lens.extend(np.sum(is_not_padding, axis=1))
            # width of the (last) chunk's tensor is taken as the maximum sequence length
            max_seq_len = first_tensor.shape[1]

        if seq_lens:
            self.clipped = np.mean(np.array(seq_lens) == max_seq_len)
            self.ave_len = np.mean(seq_lens)
        else:
            self.clipped = 0
            self.ave_len = 0

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        if train_data:
            logger.info("Longest sequence length observed after clipping:     {}".format(max(seq_lens)))
            logger.info("Average sequence length after clipping: {}".format(self.ave_len))
            logger.info("Proportion clipped:      {}".format(self.clipped))
            if self.clipped > 0.5:
                logger.info("[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                            "Consider increasing max_seq_len. "
                            "This will lead to higher memory consumption but is likely to "
                            "improve your model performance".format(round(self.clipped * 100, 1), max_seq_len))

        summary_params = {
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["dev"],
            "n_samples_test": self.counts["test"],
            "batch_size": self.batch_size,
            "ave_seq_len": self.ave_len,
            "clipped": self.clipped,
        }
        MlLogger.log_params(summary_params)

[docs]    def calculate_class_weights(self, task_name, source="train"):
        """ For imbalanced datasets, we can calculate class weights that can be used later in the
        loss function of the prediction head to upweight the loss of minorities.

        :param task_name: name of the task as used in the processor
        :type task_name: str
        """
        
        tensor_name = self.processor.tasks[task_name]["label_tensor_name"]
        label_list = self.processor.tasks[task_name]["label_list"]
        tensor_idx = list(self.tensor_names).index(tensor_name)
        # we need at least ONE observation for each label to avoid division by zero in compute_class_weights.
        observed_labels = copy.deepcopy(label_list)
        if source == "all":
            datasets = self.data.values()
        elif source == "train":
            datasets = [self.data["train"]]
        else:
            raise Exception("source argument expects one of [\"train\", \"all\"]")
        for dataset in datasets:
            if "multilabel" in self.processor.tasks[task_name]["task_type"]:
                for x in dataset:
                    observed_labels += [label_list[label_id] for label_id in (x[tensor_idx] == 1).nonzero()]
            else:
                observed_labels += [label_list[x[tensor_idx].item()] for x in dataset]

        #TODO scale e.g. via logarithm to avoid crazy spikes for rare classes
        class_weights = compute_class_weight("balanced", classes=np.asarray(label_list), y=observed_labels)

        # conversion necessary to have class weights of same type as model weights
        class_weights = class_weights.astype(np.float32)
        return class_weights

[docs]    def get_data_loader(self, dataset_name):
        return self.loaders[dataset_name]

[docs]    def n_samples(self, dataset_name):
        """
        Returns the number of samples in a given dataset.

        :param dataset_name: Choose from train, dev or test
        :type dataset_name: str
        """
        return self.counts[dataset_name]


class StreamingDataSilo:
    """
    Streaming Data Silo loads and preprocesses datasets in parallel to the model training.

    The samples are lazily created from the input file and batches are yielded on-the-fly when required during training.
    This is useful if you:
    - work with large datasets that don't fit in memory
    - want to save time (by not preprocessing the entire dataset before starting training)

    For optimal training performance and efficient utilization of shiny GPUs, the pipeline always keeps a few
    pre-computed batches ready to avoid any waiting time when a batch is requested during training.

    To parallelize the creation of batches, PyTorch DataLoader provide an option to use
    multiple workers that utilize the available CPU cores and ensure enough pre-computed batches.
    """

    def __init__(self, processor, batch_size, distributed=False, dataloader_workers=8):
        """
        :param processor: A dataset specific Processor object which will turn input file into a Pytorch Dataset.
        :type processor: Processor
        :param batch_size: The size of batch to use for model training.
        :type batch_size: int
        :param distributed: passed on unchanged to the streaming dataset created in `get_data_loader()`.
        :type distributed: bool
        :param dataloader_workers: number of workers for PyTorch DataLoader to create batches in parallel
        :type dataloader_workers: int
        """

        self.processor = processor
        self.batch_size = batch_size
        self.dataloader_workers = dataloader_workers
        self.distributed = distributed

    def get_data_loader(self, dataset_name):
        """
        Returns a new instance of dataloader for the given dataset.

        The dataloader lazily yields from Iterable DataSets. After a complete iteration
        over the input data, the generators gets exhausted. So, for instance, in the
        case of model training, a new train dataloader must be used for each train epoch.

        :param dataset_name: 'train', 'dev', or 'test' set.
        :type dataset_name: str
        :return: a new data loader, or None if the processor has no filename for the requested dataset
        :raises NotImplementedError: if the dev set is requested while the processor's dev_split is > 0
        """
        filename = None
        if dataset_name == "train":
            filename = self.processor.train_filename
        elif dataset_name == "dev":
            if self.processor.dev_split > 0.0:
                # NotImplemented is a non-callable constant; the exception class is NotImplementedError
                raise NotImplementedError(
                            "StreamingDataSilo does not have dev_split implemented. "
                            "To use dev data, supply a dev filename when creating the Processor."
                )
            elif self.processor.dev_filename:
                filename = self.processor.dev_filename
        elif dataset_name == "test":
            if self.processor.test_filename:
                filename = self.processor.test_filename

        if not filename:
            return None

        #  Batching:
        #
        #  The model Trainer is passed a PyTorch DataLoader instance that yields dataset batches for training.
        #
        #  By default, the PyTorch DataLoader prefetch (2 * num_workers) samples. However, given the higher
        #  batch sizes(usually >64) for model training, the default prefetch is not sufficient to keep the
        #  model Training saturated with datasets.
        #
        #  As a workaround, we yield batches of samples instead of yielding individual samples. The DataLoader
        #  can then prefetch (2 * num_workers) number of batches of samples.
        #
        #  Since the batching is now handled within _StreamingDataSet, we disable the batching on DataLoader side
        #  by initializing the data loader with batch_size as 1.

        # a Path pointing to an existing directory is used as is, anything else is resolved against data_dir
        if isinstance(filename, Path) and filename.is_dir():
            filepath = filename
        else:
            filepath = self.processor.data_dir / filename

        data_set = _StreamingDataSet(
            processor=self.processor,
            filepath=filepath,
            batch_size=self.batch_size,
            dataloader_workers=self.dataloader_workers,
            distributed=self.distributed,
        )

        data_loader = NamedDataLoader(
            dataset=data_set, batch_size=1, num_workers=self.dataloader_workers, pin_memory=True
        )
        return data_loader


class _StreamingDataSet(IterableDataset):
    """
    Iterable dataset that converts the dicts of one file (or of every file in a directory)
    into samples on the fly and yields them as lists of ``batch_size`` samples.

    NOTE: ``file_to_dicts_generator`` is a single ``itertools.chain`` iterator created in
    ``__init__``; within one process it can therefore be consumed only once.
    """

    def __init__(self, processor, filepath, batch_size, dataloader_workers, distributed=False, n_samples=None):
        """
        :param processor: A dataset specific Processor object which will turn input file into a Pytorch Dataset.
        :type processor: Processor
        :param filepath: input file to load the dataset from, or a directory whose entries are all loaded
        :type filepath: Path
        :param batch_size: The number of samples in every batch yielded by ``__iter__``.
        :type batch_size: int
        :param dataloader_workers: number of workers for PyTorch Dataloader
        :type dataloader_workers: int
        :param distributed: if True, rank and world size are read from ``torch.distributed`` and used
            to split the data between processes.
        :type distributed: bool
        :param n_samples: number of samples in the data. If falsy, it is estimated by calling
            ``processor.estimate_n_samples()`` on the first file and multiplying by the number of files.
        :type n_samples: int
        :raises AttributeError: if ``n_samples`` is not given and the estimate fails with an AttributeError
            (e.g. the Processor has no ``estimate_n_samples()``).
        """

        self.batch_size = batch_size
        self.processor = processor
        self.filepath = filepath
        self.dataloader_workers = dataloader_workers
        self.distributed = distributed

        # calculate or estimate number of samples so that the data loader can derive number of training steps
        if filepath.is_file():
            files = [filepath]
        else:
            files = list(filepath.iterdir())

        if n_samples:
            self.n_samples = n_samples
        else:
            try:
                self.n_samples = self.processor.estimate_n_samples(files[0]) * len(files)
            except AttributeError as err:
                # The exception used to be created but never raised, which surfaced one line later
                # as an unrelated "no attribute 'n_samples'" error.
                raise AttributeError(
                    f"Could not estimate n_samples for {self.processor.__class__.__name__} in StreamingDataSilo. "
                    f"Make sure that your Processor has `estimate_n_samples()` implemented"
                ) from err
        logger.info(f"Found data for {self.n_samples} samples")
        self.shuffle_files(files)

        dicts_from_files = [processor.file_to_dicts(file) for file in files]
        self.file_to_dicts_generator = chain(*dicts_from_files)

        if self.distributed:
            self.rank = torch.distributed.get_rank()
            self.world_size = torch.distributed.get_world_size()

    def __len__(self):
        """
        Return the (estimated) number of samples; in distributed mode the share of one rank.
        """
        if self.distributed:
            # only a heuristic as we don't necessarily split samples equally across ranks
            return self.n_samples // self.world_size
        return self.n_samples

    def __iter__(self):
        """
        Yield lists of at most ``batch_size`` samples built from the dicts assigned to this worker.
        """
        #  With IterableDataset, the same __iter__ is copied over to the multiple workers of
        #  a Dataloader. Hence, we need to configure the __iter__ to not yield duplicated data
        #  when more than 1 workers are used.
        #
        #  To avoid duplicates, we need to split the input dicts between the workers.
        #  The grouper() converts a dict generator given as input and yields only the
        #  dicts that are to be processed by the given worker_id.
        #
        #  For instance, consider input as [dictA, dictB, dictC, ...], then the grouper
        #  (with n=2) will return, [[dictA, dictB], [dictE, dictF] ...] for worker 1 and
        #  [[dictC, dictD], [dictG, dictH] ...] for worker 2.

        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # Single-process data loading: get_worker_info() returns None in the main process,
            # so this process is treated as the only worker instead of crashing on `None.id`.
            local_id, local_workers, loader_workers = 0, 1, 1
        else:
            local_id = worker_info.id
            local_workers = worker_info.num_workers
            loader_workers = self.dataloader_workers

        if self.distributed:
            worker_id = self.rank * local_workers + local_id
            total_workers = self.world_size * local_workers
        else:
            worker_id = local_id
            total_workers = loader_workers

        dicts = grouper(self.file_to_dicts_generator, n=10, worker_id=worker_id, total_workers=total_workers)
        results = map(self._dataset_from_chunk, dicts)

        batch = []
        for datasets, tensor_names in results:
            # skipped chunks come back as (None, None)
            if not datasets:
                continue
            self.tensor_names = tensor_names
            for ds in datasets:
                batch.append(ds)
                if len(batch) == self.batch_size:
                    yield batch
                    batch = []
        # last, possibly smaller batch
        if batch:
            yield batch

    def _dataset_from_chunk(self, chunk):
        """
        Creating a dataset for a chunk (= subset) of dicts.

        :param chunk: Instead of only having a list of dicts here we also supply an index (ascending int) for each.
            => [(0, dict), (1, dict) ...]
        :type chunk: list of tuples
        :return: tuple ``(datasets, tensor_names)`` as returned by ``processor.dataset_from_dicts()``,
            or ``(None, None)`` if the chunk was skipped
        """
        dicts = [d[1] for d in chunk]
        # need at least 2 documents to sample random sentences from
        if len(dicts) < 2 and type(self.processor) == BertStyleLMProcessor:
            logger.info("Skipping a dict chunk as it contains less than 2 documents ...")
            return None, None
        indices = [x[0] for x in chunk]
        datasets, tensor_names, _ = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
        return datasets, tensor_names

    def shuffle_files(self, files, seed=None):
        """
        Shuffle ``files`` in place and return the same list.

        :param files: list of files to shuffle
        :type files: list
        :param seed: seed for the shuffle. If None, a seed is drawn with ``random.randrange(100)``.
            Note that the seed is applied to the global ``random`` module.
        :type seed: int
        :return: the shuffled list
        """
        # `is None` instead of truthiness, so that an explicit seed of 0 is honoured
        if seed is None:
            seed = random.randrange(100)
        random.seed(seed)
        random.shuffle(files)
        return files


[docs]class DataSiloForCrossVal:
    """
    Perform cross validation or nested cross validation.

    For performing cross validation or nested cross validation, we really want to combine all the
    instances from all the sets or just some of the sets, then create a different data silo
    instance for each fold or nested fold.
    Calling DataSiloForCrossVal.make() creates a list of DataSiloForCrossVal instances - one for each fold.
    """

[docs]    def __init__(self, origsilo, trainset, devset, testset):
        self.tensor_names = origsilo.tensor_names
        self.data = {"train": trainset, "dev": devset, "test": testset}
        self.processor = origsilo.processor
        self.batch_size = origsilo.batch_size
        # should not be necessary, xval makes no sense with huge data
        # sampler_train = DistributedSampler(self.data["train"])
        sampler_train = RandomSampler(trainset)

        self.data_loader_train = NamedDataLoader(
            dataset=trainset,
            sampler=sampler_train,
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_dev = NamedDataLoader(
            dataset=devset,
            sampler=SequentialSampler(devset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_test = NamedDataLoader(
            dataset=testset,
            sampler=SequentialSampler(testset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.loaders = {
            "train": self.data_loader_train,
            "dev": self.data_loader_dev,
            "test": self.data_loader_test,
        }

[docs]    def get_data_loader(self, which):
        return self.loaders[which]

[docs]    @classmethod
    def make(cls, datasilo, sets=["train", "dev", "test"], n_splits=5, shuffle=True, random_state=None,
             stratified=True, n_neg_answers_per_question=1, n_inner_splits=None):
        """
        Create number of folds data-silo-like objects which can be used for training from the
        original data silo passed on.

        :param datasilo: the data silo that contains the original data
        :type datasilo: DataSilo
        :param sets: which sets to use to create the xval folds (strings)
        :type sets: list
        :param n_splits: number of folds to create
        :type n_splits: int
        :param shuffle: shuffle each class' samples before splitting
        :type shuffle: bool
        :param random_state: random state for shuffling
        :type random_state: int
        :param stratified: If class stratification should be done.
            It is never done with question answering.
        :type stratified: bool
        :param n_neg_answers_per_question: number of negative answers per question to include for training
        :type n_neg_answers_per_question: int
        :param n_inner_splits: Number of inner splits of a nested cross validation.
            Default is ``None`` which means to do a normal (not nested) cross validation.
            If at least 2 is given a nested cross validation is done. In that case the ``n_splits``
            parameter is the number of outer splits.
            The outer cross validation splits the data into a test set and a rest set.
            The inner cross validation splits the rest data into a train set and a dev set.
            The advantage of a nested cross validation is that it is doing the inner split
            not just by random but in a more systematic way. When doing model evaluation
            this also reduces the variance. This is because you train on more different
            iterations with more different data constellations.
        :type n_inner_splits: int
        """
        # check n_inner_splits param
        if (n_inner_splits is not None) and (not n_inner_splits >= 2):
            raise ValueError("'n_inner_splits' must be at least 2!")

        if "question_answering" in datasilo.processor.tasks and n_inner_splits is None:
            return cls._make_question_answering(
                datasilo, sets, n_splits, shuffle, random_state, n_neg_answers_per_question
            )
        elif "question_answering" in datasilo.processor.tasks and n_inner_splits is not None:
            raise NotImplementedError()
        elif n_inner_splits is None:
            return cls._make(
                datasilo, sets, n_splits, shuffle, random_state, stratified
            )
        elif n_inner_splits is not None:
            return cls._make_nested(
                datasilo, sets, n_splits, shuffle, random_state, stratified,
                n_inner_splits
            )
        else:
            raise RuntimeError("Cross validation can not be done under these conditions!")


    @classmethod
    def _make_question_answering(cls, datasilo, sets=["train", "dev", "test"], n_splits=5, shuffle=True,
                                 random_state=None, n_neg_answers_per_question=1):
        """
        Create one data-silo-like object per fold from the original data silo, taking the structure
        of question answering data into account: samples are grouped by document before splitting,
        and the number of negative (no answer) samples per question in the train set is limited.

        :param datasilo: the data silo that contains the original data
        :type datasilo: DataSilo
        :param sets: which sets to use to create the xval folds (strings)
        :type sets: list
        :param n_splits: number of folds to create
        :type n_splits: int
        :param shuffle: shuffle the documents before splitting
        :type shuffle: bool
        :param random_state: random state for shuffling
        :type random_state: int
        :param n_neg_answers_per_question: number of negative answers per question to include for training
        :type n_neg_answers_per_question: int
        :return: list of DataSiloForCrossVal, one per fold
        """
        assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}"
        assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}"

        id_index = datasilo.tensor_names.index("id")
        label_index = datasilo.tensor_names.index("labels")

        def document_key(sample):
            # first entry of the id tensor is used as the document key
            return sample[id_index][0]

        def question_key(sample):
            # second entry of the id tensor is used as the question key
            return sample[id_index][1]

        # pool the samples of every requested set that is present
        pooled_samples = []
        for setname in sets:
            if datasilo.data[setname]:
                pooled_samples.extend(datasilo.data[setname])
        pooled = ConcatDataset(pooled_samples)

        # groupby() only merges neighbours, hence sort by document first
        samples_by_document = sorted(pooled.datasets, key=document_key)
        documents = [list(group) for _, group in groupby(samples_by_document, key=document_key)]

        xval_split = cls._split_for_qa(
            documents=documents,
            id_index=id_index,
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state,
        )

        silos = []
        for train_set, test_set in xval_split:
            # Each training set is further divided into actual train and dev set
            if datasilo.processor.dev_split > 0:
                dev_split = datasilo.processor.dev_split
                n_dev = int(np.ceil(dev_split * len(train_set)))
                assert n_dev > 0, f"dev split of {dev_split} is not large enough to split away a development set"
                n_actual_train = len(train_set) - n_dev
                actual_train_set = train_set[:n_actual_train]
                ds_dev = [sample for document in train_set[n_actual_train:] for sample in document]
            else:
                ds_dev = None
                actual_train_set = train_set

            ds_train = []
            for doc in actual_train_set:
                for _, question in groupby(sorted(doc, key=question_key), key=question_key):
                    candidates = list(question)
                    negative_positions = []
                    for position, sample in enumerate(candidates):
                        first_label = sample[label_index][0]
                        if first_label[0] or first_label[1]:
                            # every sample with an answer goes into the train set
                            ds_train.append(sample)
                        else:
                            negative_positions.append(position)
                    # keep at most n_neg_answers_per_question randomly chosen negatives
                    if len(negative_positions) > n_neg_answers_per_question:
                        negative_positions = random.sample(negative_positions, n_neg_answers_per_question)
                    ds_train.extend([candidates[position] for position in negative_positions])

            ds_test = [sample for document in test_set for sample in document]
            silos.append(DataSiloForCrossVal(datasilo, ds_train, ds_dev, ds_test))
        return silos

    @staticmethod
    def _make(datasilo, sets=["train", "dev", "test"], n_splits=5, shuffle=True,
              random_state=None, stratified=True):
        """
        Create number of folds data-silo-like objects which can be used for training from the
        original data silo passed on.

        :param datasilo: the data silo that contains the original data
        :param sets: which sets to use to create the xval folds; sets whose entry in
            ``datasilo.data`` is None are skipped
        :param n_splits: number of folds to create
        :param shuffle: shuffle each class' samples before splitting
        :param random_state: random state for shuffling
        :param stratified: if class stratification should be done
        :return: list of DataSiloForCrossVal, one per fold
        """
        # Skip sets the silo does not hold, as _make_question_answering() does for missing sets.
        # A None entry would otherwise make ConcatDataset fail when it asks for the entry's length.
        candidate_sets = (datasilo.data[setname] for setname in sets)
        setstoconcat = [dataset for dataset in candidate_sets if dataset is not None]
        ds_all = ConcatDataset(setstoconcat)
        idxs = list(range(len(ds_all)))
        dev_split = datasilo.processor.dev_split
        if stratified:
            # get all the labels for stratification
            # NOTE(review): assumes the label tensor is the 4th tensor of every sample - confirm
            ytensors = [t[3][0] for t in ds_all]
            Y = torch.stack(ytensors)
            xval = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
            xval_split = xval.split(idxs, Y)
        else:
            xval = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
            xval_split = xval.split(idxs)
        # for each fold create a DataSiloForCrossVal instance, where the training set is further
        # divided into actual train and dev set
        silos = []
        for train_idx, test_idx in xval_split:
            n_dev = int(dev_split * len(train_idx))
            n_actual_train = len(train_idx) - n_dev
            # TODO: this split into actual train and dev set could/should also be stratified, for now
            # we just do this by taking the first/last indices from the train set (which should be
            # shuffled by default)
            actual_train_idx = train_idx[:n_actual_train]
            dev_idx = train_idx[n_actual_train:]
            # create the actual datasets
            ds_train = Subset(ds_all, actual_train_idx)
            ds_dev = Subset(ds_all, dev_idx)
            ds_test = Subset(ds_all, test_idx)
            silos.append(DataSiloForCrossVal(datasilo, ds_train, ds_dev, ds_test))
        return silos

    @staticmethod
    def _split_for_qa(documents, id_index, n_splits=5, shuffle=True, random_state=None):
        keyfunc = lambda x: x[id_index][1]
        if shuffle:
            random.shuffle(documents, random_state)

        questions_per_doc = []
        for doc in documents:
            # group samples in current doc by question id
            doc = sorted(doc, key=keyfunc)
            questions = list(groupby(doc, key=keyfunc))
            questions_per_doc.append(len(questions))

        # split documents into n_splits splits with approximately same number of questions per split
        questions_per_doc = np.array(questions_per_doc)
        accumulated_questions_per_doc = questions_per_doc.cumsum()
        questions_per_fold = accumulated_questions_per_doc[-1] // n_splits
        accumulated_questions_per_fold = np.array(range(1, n_splits)) * questions_per_fold
        if accumulated_questions_per_fold[0] < accumulated_questions_per_doc[0]:
            accumulated_questions_per_fold[0] = accumulated_questions_per_doc[0] + 1
        indices_to_split_at = np.searchsorted(accumulated_questions_per_doc, accumulated_questions_per_fold, side="right")
        splits = np.split(documents, indices_to_split_at)

        for split in splits:
            assert len(split) > 0

        for idx, split in enumerate(splits):
            current_test_set = split
            current_train_set = np.hstack(np.delete(splits, idx, axis=0))

            yield current_train_set, current_test_set

    @staticmethod
    def _make_nested(datasilo, sets=["train", "dev", "test"],
                     n_splits=5, shuffle=True, random_state=None,
                     stratified=True, n_inner_splits=5):
        """
        Create data-silo-like objects for a nested cross validation: the outer split separates a
        test set from the rest, the inner split divides the rest into train and dev set. One object
        is created per combination of outer and inner fold.

        :param datasilo: the data silo that contains the original data
        :param sets: which sets to use to create the xval folds; sets whose entry in
            ``datasilo.data`` is None are skipped
        :param n_splits: number of outer folds
        :param shuffle: shuffle the samples before splitting (outer and inner split)
        :param random_state: random state for shuffling (used for outer and inner split)
        :param stratified: if class stratification should be done
        :param n_inner_splits: number of inner folds
        :return: list of DataSiloForCrossVal with ``n_splits * n_inner_splits`` entries
        """
        # Skip sets the silo does not hold, as _make_question_answering() does for missing sets.
        # A None entry would otherwise make ConcatDataset fail when it asks for the entry's length.
        candidate_sets = (datasilo.data[setname] for setname in sets)
        setstoconcat = [dataset for dataset in candidate_sets if dataset is not None]
        ds_all = ConcatDataset(setstoconcat)
        idxs = list(range(len(ds_all)))

        silos = []

        # outer cross validation where we split all data to test and rest
        if stratified:
            # get all the labels for stratification
            # NOTE(review): assumes the label tensor is the 4th tensor of every sample - confirm
            ytensors = [t[3][0] for t in ds_all]
            y = torch.stack(ytensors)
            outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
            outer_split = outer_cv.split(idxs, y)
        else:
            outer_cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
            outer_split = outer_cv.split(idxs)
        for idxs_rest, idxs_test in outer_split:

            # inner cross validation where we split rest data into train and dev
            if stratified:
                y_rest = y[idxs_rest]
                inner_cv = StratifiedKFold(n_splits=n_inner_splits, shuffle=shuffle, random_state=random_state)
                inner_split = inner_cv.split(idxs_rest, y_rest)
            else:
                inner_cv = KFold(n_splits=n_inner_splits, shuffle=shuffle, random_state=random_state)
                inner_split = inner_cv.split(idxs_rest)
            for idxs_train_idxs, idxs_dev_idxs in inner_split:

                # the inner split yields positions within idxs_rest, map them back to
                # indices into ds_all
                idxs_train = idxs_rest[idxs_train_idxs]
                idxs_dev = idxs_rest[idxs_dev_idxs]

                ds_train = Subset(ds_all, idxs_train)
                ds_dev = Subset(ds_all, idxs_dev)
                ds_test = Subset(ds_all, idxs_test)
                silos.append(DataSiloForCrossVal(datasilo, ds_train, ds_dev, ds_test))
        return silos