# Source code for EDAspy.optimization.custom.probabilistic_models.kde_bayesian_network

#!/usr/bin/env python
# coding: utf-8

import numpy as np
from pybnesian import KDENetwork, hc
from ._probabilistic_model import ProbabilisticModel
import pandas as pd


class KDEBN(ProbabilisticModel):

    """
    This probabilistic model is a Kernel Density Estimation Bayesian network [1]. It allows dependencies
    between variables which have been estimated using KDE.

    The structure is learned with the ``hc`` routine of PyBNesian and can be constrained through
    mandatory (white list) and forbidden (black list) arcs.

    References:

        [1]: Atienza, D., Bielza, C., & Larrañaga, P. (2022). PyBNesian: an extensible Python package
        for Bayesian networks. Neurocomputing, 504, 204-209.

    """

    def __init__(self, variables: list, white_list: list = None, black_list: list = None):
        """
        :param variables: List of variable names. They are used as the nodes of the network and as the
            column labels of the datasets handled by the model.
        :param white_list: List of tuples with mandatory arcs in the BN structure
        :param black_list: List of tuples with forbidden arcs in the BN structure
        """

        super().__init__(variables)

        self.variables = variables
        # Empty network; it is replaced by a freshly learned one on every call to ``learn``.
        self.pm = KDENetwork(variables)

        self.white_list = white_list
        self.black_list = black_list

        self.id = 6

    def learn(self, dataset: np.array, num_folds: int = 10, *args, **kwargs):
        """
        Learn a KDE Bayesian network from the dataset passed as argument.

        The structure is searched starting from an empty network, considering only arc operators, and
        the resulting network is then fitted to the same data.

        :param dataset: dataset from which learn the KDEBN. Its columns must follow the order of
            ``variables``.
        :param num_folds: Number of folds forwarded to the structure learning of the KDEBN. The higher,
            the more accurate, but also higher CPU demand. By default, it is set to 10.
        :param args: accepted for interface compatibility; not used.
        :param kwargs: accepted for interface compatibility; not used.
        """
        data = pd.DataFrame(dataset, columns=self.variables)

        # Always restart from an empty network so previous structures do not bias the search.
        self.pm = KDENetwork(self.variables)

        # Arc restrictions are forwarded only when they were provided (``None`` and empty lists
        # are both treated as "no restriction").
        search_options = {"operators": ["arcs"], "num_folds": num_folds}
        if self.white_list:
            search_options["arc_whitelist"] = self.white_list
        if self.black_list:
            search_options["arc_blacklist"] = self.black_list

        self.pm = hc(data, start=self.pm, **search_options)
        self.pm.fit(data)

    def sample(self, size: int) -> np.array:
        """
        Samples the KDE Bayesian network as many times as requested by the user. The dataset is
        returned as a numpy matrix whose columns follow the order of ``variables``. The sampling
        process is implemented using probabilistic logic sampling.

        :param size: number of samplings of the KDE Bayesian network.
        :return: array with the dataset sampled.
        :rtype: np.array
        """

        sampled = self.pm.sample(size, ordered=True).to_pandas()
        # Select the columns explicitly so the output order always matches ``self.variables``.
        return sampled[self.variables].to_numpy()

    def print_structure(self) -> list:
        """
        Returns the arcs between the nodes that represent the variables in the dataset. Despite its
        name, nothing is printed: the arcs are returned to the caller. This function must be used
        after the learning process.

        :return: list of arcs between variables
        :rtype: list
        """

        return self.pm.arcs()

    def logl(self, data: pd.DataFrame):
        """
        Returns the log-likelihood of some data in the model.

        :param data: dataset to evaluate its likelihood in the model.
        :return: log-likelihood of the instances in the model.
        :rtype: np.array
        """
        return self.pm.logl(data)