datasplitters

Classes for splitting data.

Classes

DatasetSplitter

class DatasetSplitter():

Parent class for different types of dataset splits.

Ancestors

abc.ABC

Subclasses

PercentageSplitter
SplitterDefinedInData
bitfount.data.datasplitters._InferenceSplitter

Static methods

create

def create(    splitter_name: str, **kwargs: Any,) ‑> DatasetSplitter:

Create a DatasetSplitter of the requested type.

splitter_name

def splitter_name() ‑> str:

Returns string name for splitter type.

Methods

get_dataset_split_indices

def get_dataset_split_indices(    self,    data: pd.DataFrame,) ‑> tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]]]:

Returns the indices for each of the dataset splits as a tuple of three integer arrays.

get_filenames

def get_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> list[str]:

Returns a list of filenames for a given split.

Only used for file system sources.

Arguments

datasource: A FileSystemIterableSource object.
split: The relevant split to return filenames for.

Returns: A list of filenames.

iter_dataset_split

def iter_dataset_split(    self, datasource: BaseSource, split: DataSplit, **kwargs: Any,) ‑> Iterable[pd.DataFrame]:

Yield data for a given split.

Arguments

datasource: The datasource to iterate over.
split: The split to yield data for.
kwargs: Additional args to pass to the underlying datasource yield_data().

iter_dataset_split_indices

def iter_dataset_split_indices(    self, datasource: BaseSource, split: DataSplit,) ‑> Iterable[int]:

Yield indices/keys for a given split.

iter_filenames

def iter_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> Iterable[str]:

Yield filenames for a given split.

Only used for file system sources.

Arguments

datasource: A FileSystemIterableSource object.
split: The relevant split to return filenames for.

PercentageSplitter

class PercentageSplitter(    validation_percentage: int = 10,    test_percentage: int = 10,    shuffle: bool = True,    time_series_sort_by: Optional[Union[str, list[str]]] = None,):

Splits data into sets based on percentages.

By default, 80% of the data is used for training, and 10% each for validation and testing.

Arguments

validation_percentage: The percentage of data to be used for validation. Defaults to 10.
test_percentage: The percentage of data to be used for testing. Defaults to 10.
time_series_sort_by: A string/list of strings to be used for sorting time series. The strings should correspond to feature names from the dataset. This sorts the dataframe by the values of those features, ensuring that the validation and test sets come after the training set data, to remove potential bias during training and evaluation. Defaults to None.
shuffle: A bool indicating whether we shuffle the data for the splits. Defaults to True.

Ancestors

DatasetSplitter
abc.ABC
bitfount.types.UsedForConfigSchemas

Variables

static shuffle : bool

static test_percentage : int

static time_series_sort_by : Optional[Union[str, list[str]]]

static validation_percentage : int

Static methods

create

def create(    splitter_name: str, **kwargs: Any,) ‑> DatasetSplitter:

Inherited from:

DatasetSplitter.create :

Create a DatasetSplitter of the requested type.

splitter_name

def splitter_name() ‑> str:

Define the name of the splitter.

Methods

get_dataset_split_indices

def get_dataset_split_indices(    self,    data: pd.DataFrame,) ‑> tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]]]:

Returns indices for dataset splits.

get_filenames

def get_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> list[str]:

Returns a list of filenames for a given split.

iter_dataset_split

def iter_dataset_split(    self, datasource: BaseSource, split: DataSplit, **kwargs: Any,) ‑> Iterable[pd.DataFrame]:

Yield data for a given split.

iter_dataset_split_indices

def iter_dataset_split_indices(    self, datasource: BaseSource, split: DataSplit,) ‑> Iterable[int]:

Yield indices for a given split.

iter_filenames

def iter_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> Iterable[str]:

Yield filenames for a given split.

SplitterDefinedInData

class SplitterDefinedInData(    column_name: str = 'BITFOUNT_SPLIT_CATEGORY',    training_set_label: str = 'TRAIN',    validation_set_label: str = 'VALIDATION',    test_set_label: str = 'TEST',    infer_data_split_labels: bool = False,):

Splits data into sets based on value in each row.

The splitting is done based on the values in a user specified column.

Arguments

column_name: The name of the column which contains the labels for splitting. Defaults to "BITFOUNT_SPLIT_CATEGORY".
training_set_label: The label for the data points to be included in the training set. Defaults to "TRAIN".
validation_set_label: The label for the data points to be included in the validation set. Defaults to "VALIDATION".
test_set_label: The label for the data points to be included in the test set. Defaults to "TEST".

Ancestors

DatasetSplitter
abc.ABC
bitfount.types.UsedForConfigSchemas

Variables

static column_name : str

static infer_data_split_labels : bool

static test_set_label : str

static training_set_label : str

static validation_set_label : str

Static methods

create

def create(    splitter_name: str, **kwargs: Any,) ‑> DatasetSplitter:

Inherited from:

DatasetSplitter.create :

Create a DatasetSplitter of the requested type.

splitter_name

def splitter_name() ‑> str:

Define the name of the splitter.

Methods

get_dataset_split_indices

def get_dataset_split_indices(    self,    data: pd.DataFrame,) ‑> tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]], numpy.ndarray[typing.Any, numpy.dtype[numpy.integer]]]:

Returns indices for dataset splits.

get_filenames

def get_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> list[str]:

Returns a list of filenames for a given split.

iter_dataset_split

def iter_dataset_split(    self, datasource: BaseSource, split: DataSplit, **kwargs: Any,) ‑> Iterable[pd.DataFrame]:

Yield data for a given split.

iter_dataset_split_indices

def iter_dataset_split_indices(    self, datasource: BaseSource, split: DataSplit,) ‑> Iterable[int]:

Yield indices for a given split.

iter_filenames

def iter_filenames(    self, datasource: FileSystemIterableSource, split: DataSplit,) ‑> Iterable[str]:

Yield filenames for a given split.

Classes​

DatasetSplitter​

Ancestors​

Subclasses​

Static methods​

create​

splitter_name​

Methods​

get_dataset_split_indices​

get_filenames​

iter_dataset_split​

iter_dataset_split_indices​

iter_filenames​

PercentageSplitter​

Ancestors​

Variables​

Static methods​

create​

splitter_name​

Methods​

get_dataset_split_indices​

get_filenames​

iter_dataset_split​

iter_dataset_split_indices​

iter_filenames​

SplitterDefinedInData​

Ancestors​

Variables​

Static methods​

create​

splitter_name​

Methods​

get_dataset_split_indices​

get_filenames​

iter_dataset_split​

iter_dataset_split_indices​

iter_filenames​

Classes

DatasetSplitter

Ancestors

Subclasses

Static methods

create

splitter_name

Methods

get_dataset_split_indices

get_filenames

iter_dataset_split

iter_dataset_split_indices

iter_filenames

PercentageSplitter

Ancestors

Variables

Static methods

create

splitter_name

Methods

get_dataset_split_indices

get_filenames

iter_dataset_split

iter_dataset_split_indices

iter_filenames

SplitterDefinedInData

Ancestors

Variables

Static methods

create

splitter_name

Methods

get_dataset_split_indices

get_filenames

iter_dataset_split

iter_dataset_split_indices

iter_filenames