Source code for EDAspy.optimization.custom.probabilistic_models.univariate_categorical

#!/usr/bin/env python
# coding: utf-8

import numpy as np
import pandas as pd

from ._probabilistic_model import ProbabilisticModel


def obtain_probabilities(array) -> dict:
    """Returns a dictionary mapping each distinct value in the array to its relative frequency."""
    res = {}
    unique, counts = np.unique(array, return_counts=True)
    for i in range(len(unique)):
        res[unique[i]] = counts[i] / array.size
    # for label in list(set(labels) - set(unique)):
    #     res[label] = 0
    return res
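
# For example, obtain_probabilities(np.array(['a', 'a', 'b'])) returns
# {'a': 0.666..., 'b': 0.333...}: each distinct value is mapped to its
# relative frequency in the array.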

class UniCategorical(ProbabilisticModel):
    """
    This probabilistic model is discrete and univariate.
    """

    def __init__(self, variables: list):
        super().__init__(variables)
        self.prob_table = {}  # dictionary with variable: {value: prob}

    def learn(self, dataset: np.array, *args, **kwargs):
        """
        Estimates the independent categorical probability distribution for each variable.

        :param dataset: dataset from which to learn the probabilistic model.
        """
        for i in range(self.len_variables):
            label = self.variables[i]
            probs = obtain_probabilities(dataset[:, i])
            self.prob_table[label] = probs

    def sample(self, size: int) -> np.array:
        """
        Samples new solutions from the probabilistic model. In each solution, each variable is
        sampled from its respective categorical distribution.

        :param size: number of samplings of the probabilistic model.
        :return: array with the sampled dataset
        :rtype: np.array
        """
        result = pd.DataFrame(columns=self.variables)
        for i in range(self.len_variables):
            label = self.variables[i]
            result[label] = np.random.choice(list(self.prob_table[label].keys()), size=size,
                                             p=list(self.prob_table[label].values())).tolist()

        return result.to_numpy()
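
    # Note: each column is drawn independently from its own categorical
    # distribution, so dependencies between variables present in the training
    # data are not reproduced in the sampled solutions.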

    def print_structure(self) -> list:
        """
        Prints the arcs between the nodes that represent the variables in the dataset. This
        function must be used after the learning process. Univariate approaches produce graphs
        with no edges, so an empty list is returned.

        :return: list of arcs between variables
        :rtype: list
        """
        return list()
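

# Usage sketch (illustrative, not part of the original module): learns a
# categorical distribution per column from a toy dataset and samples new
# solutions. The dataset and variable names are invented for the example.
# Because of the relative import above, this block only runs when the module
# is executed as part of the package (e.g. with python -m ...).
if __name__ == '__main__':
    toy_data = np.array([['red', 'small'],
                         ['red', 'large'],
                         ['blue', 'small'],
                         ['red', 'small']])

    model = UniCategorical(variables=['colour', 'size'])
    model.learn(toy_data)
    # model.prob_table now holds, e.g.:
    # {'colour': {'blue': 0.25, 'red': 0.75}, 'size': {'large': 0.25, 'small': 0.75}}

    new_solutions = model.sample(size=5)  # ndarray of shape (5, 2)
    print(new_solutions)
    print(model.print_structure())        # [] -> no arcs in a univariate model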