Source code for neural_network.util.data_splitter

import math
from typing import List, Tuple
import pandas as pd


class DataSplitter:
    """Split a dataset into training, validation and testing partitions.

    The data is read from a .csv file and cut into consecutive,
    non-overlapping row ranges whose sizes follow a
    ``train:validation:test`` ratio. Rows keep their file order (no
    shuffling is performed) and every returned frame is re-indexed from 0.
    """

    def __init__(self, path: str, proportions: List[int]):
        """Validate the split ratio and load the data.

        Parameters
        ----------
        path : str
            Path to the .csv file containing the data. The first column of
            the file is used as the index.
        proportions : typing.List
            One to three non-negative weights in the sequence
            training:validation:testing.

        Raises
        ------
        ValueError
            If ``proportions`` does not have 1-3 elements, contains a
            negative weight, or sums to zero.
        """
        # Validate before touching the file so a bad ratio fails fast
        # instead of after a potentially expensive read.
        if not 1 <= len(proportions) <= 3:
            raise ValueError("proportions must have 1-3 elements denoting the "
                             "train:validation:test ratio")
        if any(weight < 0 for weight in proportions) or sum(proportions) <= 0:
            raise ValueError("proportions must be non-negative and sum to a "
                             "positive value")
        self._df = pd.read_csv(path, index_col=0)
        self._proportions = proportions

    def split(self) -> Tuple[pd.DataFrame, ...]:
        """Split the data into consecutive train:valid:test partitions.

        Every partition except the last is sized as
        ``floor(n_rows * weight / sum(weights))`` with a minimum of one row;
        the last partition receives all remaining rows. When the data has
        too few rows to honour the one-row minimum, trailing partitions are
        returned empty.

        Returns
        -------
        Tuple[pd.DataFrame, ...]
            A tuple containing the training, validation and testing
            dataframes, or fewer if fewer proportions have been passed.
            Each dataframe has a fresh 0-based index.
        """
        n_rows = len(self._df)
        weight_total = sum(self._proportions)

        frames = []
        start = 0
        for weight in self._proportions[:-1]:
            # Multiply before dividing: with integer weights the product is
            # exact, so ratios such as 29/100 of 100 rows do not floor to 28
            # through floating-point error.
            size = max(1, math.floor(n_rows * weight / weight_total))
            # Clamp so the one-row minimum cannot push a boundary past the
            # end of the data.
            end = min(start + size, n_rows)
            frames.append(self._df.iloc[start:end].reset_index(drop=True))
            start = end

        # Whatever is left belongs to the final partition.
        frames.append(self._df.iloc[start:].reset_index(drop=True))
        return tuple(frames)