Source code for neural_network.util.weighted_partitioner
import math
import random
from typing import List
import pandas as pd
from .partitioner import Partitioner
[docs]
class WeightedPartitioner(Partitioner):
    """Class to create sets from a list of `n` integers, balanced by class.

    Every integer ``i`` in ``range(n)`` is assigned to a class derived from
    the ``'y'`` value that ``df.loc[i, 'y']`` returns. Calling the instance
    first draws a class uniformly at random and then draws an integer from
    that class, so sparsely populated classes are sampled as often as
    densely populated ones.
    """

    def __init__(self, n: int, m: int, df: pd.DataFrame,
                 do_regression: bool = False, bins: int = 10):
        """Constructor method

        Parameters
        ----------
        n : int
            Number of integers
        m : int
            Size of each set produced by ``__call__`` (the final set may be
            smaller)
        df : pd.DataFrame
            The classes for the integers, read from the ``'y'`` column.
            Rows are looked up by label (``df.loc[i, 'y']``), so the index
            must contain the labels ``0 .. n - 1``
        do_regression : bool
            Whether we are partitioning regressional or classificational
            data
        bins : int
            If `do_regression` is True, this represents the number of
            equal-width bins to split the data in to. Otherwise, this
            parameter is ignored

        Raises
        ------
        ValueError
            If ``len(df) != n``, or if `do_regression` is True and `bins`
            is smaller than 1
        """
        super().__init__(n, m)
        if len(df) != n:
            raise ValueError(f"n must equal the length of the dataframe "
                             f"(n = {n}, len(df) = {len(df)})")
        if do_regression:
            groups = self._group_by_bin(df, bins)
        else:
            groups = self._group_by_label(df)
        # Only non-empty groups are kept and they are relabelled 0..K-1, so
        # `__call__` can pick any class index and always find a member.
        self._class_dict = dict(enumerate(groups))
        self._num_classes = len(self._class_dict)

    @staticmethod
    def _group_by_bin(df: pd.DataFrame, bins: int) -> List[List[int]]:
        """Group row labels into equal-width bins of the ``'y'`` column.

        Returns the non-empty bins in ascending order of their value range.
        """
        if bins < 1:
            raise ValueError(
                f"bins must be a positive integer (bins = {bins})")
        targets = df['y']
        min_y, max_y = min(targets), max(targets)
        class_width = (max_y - min_y) / bins
        binned = [[] for _ in range(bins)]
        for i in range(len(df)):
            y = df.loc[i, 'y']
            if class_width > 0:
                # The maximum lands exactly on the upper edge, so it is
                # clamped into the last bin.
                chosen_bin = min(int((y - min_y) / class_width), bins - 1)
            else:
                # Every target is identical: a single bin holds them all
                # (avoids dividing by a zero width).
                chosen_bin = 0
            binned[chosen_bin].append(i)
        # Need to account for potentially empty class lists
        return [members for members in binned if members]

    @staticmethod
    def _group_by_label(df: pd.DataFrame) -> List[List[int]]:
        """Group row labels by the integer value of the ``'y'`` column.

        Returns one group per distinct ``int(y)``, in ascending label order.
        For labels that are already ``0 .. K - 1`` the position of a group
        equals its label.
        """
        by_label = {}
        for i in range(len(df)):
            by_label.setdefault(int(df.loc[i, 'y']), []).append(i)
        return [by_label[label] for label in sorted(by_label)]

    def __call__(self) -> List[List[int]]:
        """Samples `n` integers and splits them in to sets of size `m`.

        For every one of the `n` slots a class is drawn uniformly at random
        and an integer is then drawn from that class (with replacement).

        Returns
        -------
        List[List[int]]
            The list of sets. Every set holds `m` integers, except the last
            one which holds the remainder when `m` does not divide `n`
        """
        # NOTE(review): `_n` and `_m` are presumably stored by
        # `Partitioner.__init__` -- confirm against the base class.
        # Produces a list of `n` class indices, every class equally likely
        chosen_classes = random.choices(population=range(self._num_classes),
                                        k=self._n)
        sampled = [random.choice(self._class_dict[chosen_class])
                   for chosen_class in chosen_classes]
        return [sampled[start:start + self._m]
                for start in range(0, self._n, self._m)]