Source code for expandLHS

from __future__ import annotations # postponed evaluation of annotations (PEP 563)

import numpy as np
from scipy.stats.qmc import LatinHypercube as LHSampler, discrepancy, geometric_discrepancy
from numba import jit, types as ntypes
from typing import Tuple, Literal
from warnings import warn
 


@jit(nopython=True)
def _regridding_jitted(
    N : ntypes.u8,
    P : ntypes.u8,
    samples : np.ndarray,
    M : ntypes.u8
    ) -> np.ndarray:
    """
Numba JITed code for ExpandLHS._regridding().
    """
    
    intervals = np.floor(samples * (N + M)).astype(ntypes.u8)
    voids = np.ones((N + M, P), dtype=ntypes.b1)
    
    for j in range(P):
        voids[intervals[:, j], j] = 0

    return voids
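
# Worked example (illustrative, not part of the library): with N = 2 samples
# at x = 0.1 and x = 0.6 in one dimension and M = 1, the regridding uses
# 3 intervals of width 1/3:
#     samples = np.array([[0.1], [0.6]])           # N = 2, P = 1
#     _regridding_jitted(2, 1, samples, 1)
#     # floor([0.1, 0.6] * 3) = [0, 1] -> voids = [False, False, True]
# so only the last interval is a void in that dimension.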


@jit(nopython=True)
def _count_samples_jitted(
    N : ntypes.u8,
    P : ntypes.u8,
    samples : np.ndarray,
    M : ntypes.u8
    ) -> np.ndarray:
    """
Numba JITed code for ExpandLHS._count_samples().
    """
    
    intervals = np.floor(samples * (N + M)).astype(ntypes.u8)
    population = np.zeros((N + M, P), dtype=ntypes.u8)
    
    for i in range(N):
        for j in range(P):
            population[intervals[i, j], j] += 1
        
    return population
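
# Illustrative note: adding M intervals can push two samples into the same
# bin. With N = 2 samples at x = 0.1 and x = 0.25 and M = 1,
# floor([0.1, 0.25] * 3) = [0, 0], so population = [2, 0, 0]: two intervals
# are empty, and a single new sample can fill only one of them, leaving a
# permanent void.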


@jit(nopython=True)
def _degree_jitted(
    N : ntypes.u8,
    P : ntypes.u8,
    samples : np.ndarray,
    M : ntypes.u8
    ) -> ntypes.f8:
    """
    Numba JITed code for ExpandLHS.degree().
    """
    
    voids = _regridding_jitted(N, P, samples, M)
    return 1 - (np.sum(voids) - M * P) / ((N + M) * P)
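
# Worked example (illustrative): for the population [2, 0, 0] above
# (N = 2, P = 1, M = 1) there are 2 voids, so
#     D = 1 - (2 - 1 * 1) / (3 * 1) = 2/3.
# A perfect expansion leaves exactly M voids per dimension, i.e.
# sum(voids) = M * P, and hence D = 1.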


@jit(nopython=True)
def _optimal_expansion_jitted(
    N : ntypes.u8,
    P : ntypes.u8,
    samples : np.ndarray,
    radius : Tuple[ntypes.u8, ntypes.u8]
    ) -> list[tuple[ntypes.u8, ntypes.f8]]:
    """
    Numba JITed code for ExpandLHS.optimal_expansion().
    """
        
    scaling_up = [(_degree_jitted(N, P, samples, radius[0]), radius[0])]
    for m in range(radius[0]+1, radius[1]+1):
        scaling_up += [(_degree_jitted(N, P, samples, m), m)]
        
    scaling_up = [(_degree_jitted(N, P, samples, 0), 0)] + \
        sorted(scaling_up, reverse=True)
    scaling_up = [(tmp[1], tmp[0]) for tmp in scaling_up]
    
    return scaling_up
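
# Illustrative note: the returned list starts with (0, D0), the degree of
# the unexpanded set, followed by the candidate (m, D) pairs sorted by
# decreasing degree, e.g. [(0, 0.92), (5, 0.97), (3, 0.95), (4, 0.88)]
# (values are hypothetical).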


@jit(nopython=True)
def _LHSinLHS_sampling_jitted(
    N : ntypes.u8,
    P : ntypes.u8,
    M : ntypes.u8, 
    voids : np.ndarray,
    rng : np.random.Generator
    ) -> np.ndarray:
    """
Numba JITed code for ExpandLHS._LHSinLHS_sampling().
    """
    
    new_samples = np.empty((M, P))
    intervals = np.arange(0, 1, 1 / (N + M))
    
    for j in range(P):
        # TODO: change np.random.choice to rng.choice when Numba supports it
        l_bounds = np.random.choice(intervals[voids[:, j]], M, replace=False)
        u_bounds = l_bounds + 1 / (N + M)

        # draw one uniform point inside each of the M selected void intervals
        for i in range(M):
            new_samples[i, j] = rng.uniform(l_bounds[i], u_bounds[i])
    
    return new_samples
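
# Illustrative note: in each dimension the sampler picks M distinct void
# intervals uniformly at random and draws one point uniformly inside each,
# so the expanded set keeps at most one sample per interval wherever possible.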
    


class ExpandLHS:
    """
    Class implementing the Latin Hypercube Sampling (LHS) Expansion
    algorithm.

    Latin Hypercube Sampling is a stratified sampling method that
    generates N near-random samples in the P-dimensional hypercube
    [0, 1)^P. It is a space-filling sampling strategy that ensures the
    one-dimensional projection property, i.e. the samples are uniformly
    distributed in each one-dimensional projection.

    Given an initial LHS set of size N defined in the P-dimensional
    hypercube [0, 1)^P, this class expands the set by adding M new
    samples. This procedure requires a new partition of the interval
    [0, 1) in each of the P dimensions, which may result in losing the
    LHS projection property. The case M = k * N, where k is a natural
    number, always preserves the LHS projection property.

    This code computes a metric D, with 0 < D <= 1, corresponding to the
    degree-of-LHS of a sample set, where D = 1 corresponds to a perfect
    LHS. This quantifies the impact of an expansion on the initial set.
    Note that the degree does not depend on the sampling strategy, but
    only on the initial set and the new partition of the intervals.

    Attributes
    ----------
    N : int
        Cardinality of the sample set.
    P : int
        Number of dimensions.
    samples : numpy.ndarray with shape (N, P)
        Initial Latin Hypercube sample set.

    Methods
    -------
    _regridding
        Compute the new partition of the interval [0, 1).
    _count_samples
        Compute the number of samples in each interval and dimension.
    _LHSinLHS_sampling
        Sample M additional samples filling M empty intervals created by
        the new partition in each dimension P.
    _LHSinLHS_optimized
        Sample M additional samples and optimize their spatial
        distribution.
    degree
        Compute the degree-of-LHS of a sample set.
    optimal_expansion
        Find the optimal expansion size M maximizing the final degree.
    __call__
        Expand a given sample set by adding M additional samples.
    """

    def __init__(
        self,
        samples : np.ndarray | None = None,
        *,
        N : int | None = None,
        P : int | None = None,
        **kwargs
        ) -> None:
        """
        Initialize the ExpandLHS class.

        Parameters
        ----------
        samples : numpy.ndarray | None (optional)
            A 2D array of shape (N, P) representing the initial Latin
            Hypercube sample set with N samples in P dimensions. If None,
            the sample set will be generated using the
            scipy.stats.qmc.LatinHypercube sampler with the given N and P.
        N : int | None (optional)
            Number of samples in the initial set. If None, it must be
            provided through the samples parameter. The initial set is
            sampled using scipy.stats.qmc.LatinHypercube.
        P : int | None (optional)
            Number of dimensions in the initial set. If None, it must be
            provided through the samples parameter. The initial set is
            sampled using scipy.stats.qmc.LatinHypercube.

        Keyword Args
        ------------
        **kwargs :
            Additional keyword arguments passed to the Scipy Latin
            Hypercube sampler if N and P are provided.

        Raises
        ------
        ValueError
            If samples is None and N or P are not provided.
        ValueError
            If samples is not a 2D array with shape (N, P).

        Warns
        -----
        UserWarning
            If samples is not None and N or P are provided, or if the
            shape of samples does not match the given N and P.
        """

        if samples is None:
            if N is None or P is None:
                raise ValueError("Either sample set or both N and P must be provided.")
            self.N = N
            self.P = P
            sampler = LHSampler(d=P, **kwargs)
            self.samples = sampler.random(N)
        else:
            if len(samples.shape) != 2:
                raise ValueError("Sample set must be a 2D array with shape (N, P).")
            if N is not None or P is not None:
                warn("Sample set is provided, N and P will be inferred from the shape of samples.")
            if N is not None and samples.shape[0] != N:
                warn(f"Shape of samples ({samples.shape[0]}) does not match given N ({N}).")
            if P is not None and samples.shape[1] != P:
                warn(f"Shape of samples ({samples.shape[1]}) does not match given P ({P}).")
            self.samples = samples
            self.N = samples.shape[0]
            self.P = samples.shape[1]

        self.rng = np.random.default_rng()

    def _regridding(
        self,
        M : int = 1,
        *,
        samples : np.ndarray | None = None
        ) -> np.ndarray:
        """
        Regrid the Latin Hypercube samples by adding M new intervals in
        each dimension.

        Parameters
        ----------
        M : int (optional)
            Number of new intervals to add. Defaults to 1.
        samples : numpy.ndarray | None (optional)
            If given, the regridding will be performed on this set instead
            of the default one.

        Returns
        -------
        voids : numpy.ndarray(N + M, P)
            A boolean array indicating the empty intervals. In each
            dimension the number of voids is >= M, as adding M intervals
            may cause two samples to fall in the same bin, thus leaving a
            permanent void.
        """

        if samples is None:
            N = self.N
            P = self.P
            samples = self.samples
        else:
            N = samples.shape[0]
            P = samples.shape[1]

        voids = _regridding_jitted(N, P, samples, M)

        return voids

    def _count_samples(
        self,
        M : int = 1,
        *,
        samples : np.ndarray | None = None
        ) -> np.ndarray:
        """
        Count the number of samples in each of the (N + M) x P intervals.

        Parameters
        ----------
        M : int (optional)
            Number of new intervals to add. Defaults to 1.
        samples : numpy.ndarray | None (optional)
            If given, the counting of samples will be performed on this
            set instead of the default one.

        Returns
        -------
        population : numpy.ndarray(N + M, P)
            An array indicating the number of samples in each interval.
            Adding M intervals may cause two samples to fall in the same
            bin, thus leaving a permanent void.
        """

        if samples is None:
            N = self.N
            P = self.P
            samples = self.samples
        else:
            N = samples.shape[0]
            P = samples.shape[1]

        population = _count_samples_jitted(N, P, samples, M)

        return population

    def _LHSinLHS_sampling(
        self,
        M : int,
        voids : np.ndarray,
        ) -> np.ndarray:
        """
        Generate new samples within the voids of the current sample set,
        preserving as much as possible the properties of a Latin
        Hypercube.

        Parameters
        ----------
        M : int
            Number of new samples to generate.
        voids : numpy.ndarray(N + M, P)
            Boolean array indicating the empty intervals.

        Returns
        -------
        new_samples : numpy.ndarray(M, P)
            New samples generated within the voids.
        """

        new_samples = _LHSinLHS_sampling_jitted(self.N, self.P, M, voids, self.rng)

        return new_samples

    def _LHSinLHS_optimized(
        self,
        M : int,
        voids : np.ndarray,
        criterion : str,
        trials : int,
        tol : float,
        ) -> np.ndarray:
        """
        Generate new samples within the voids of the current sample set,
        preserving as much as possible the properties of a Latin
        Hypercube. The spatial distribution of the samples is optimized
        to achieve the lowest centered discrepancy or the highest
        geometric discrepancy within a number of samplings equal to
        trials.

        Parameters
        ----------
        M : int
            Number of new samples to generate.
        voids : numpy.ndarray(N + M, P)
            Boolean array indicating the empty intervals.
        criterion : str
            Optimization strategy. Available methods are centered
            discrepancy and geometric discrepancy.
        trials : int
            Number of expansions to sample. Defaults to 1000.
        tol : float
            Tolerance for the optimization. Defaults to 1e-4.

        Returns
        -------
        opt_samples : numpy.ndarray(M, P)
            New samples generated within the voids and optimized with the
            given criterion.
        """

        opt_mode = {
            'discrepancy' : discrepancy,
            'geometric_discrepancy' : geometric_discrepancy
            }

        opt_samples = _LHSinLHS_sampling_jitted(self.N, self.P, M, voids, self.rng)
        opt_criterion = opt_mode[criterion](
            np.concatenate([self.samples, opt_samples], axis=0))

        for _ in range(trials):
            tmp_samples = _LHSinLHS_sampling_jitted(self.N, self.P, M, voids, self.rng)
            tmp_criterion = opt_mode[criterion](
                np.concatenate([self.samples, tmp_samples], axis=0))

            # stop early once the change in the criterion falls below tol
            if np.abs(tmp_criterion - opt_criterion) < tol:
                break
            elif criterion == 'discrepancy' and tmp_criterion < opt_criterion:
                opt_samples = tmp_samples
                opt_criterion = tmp_criterion
            elif criterion == 'geometric_discrepancy' and \
                tmp_criterion > opt_criterion:
                opt_samples = tmp_samples
                opt_criterion = tmp_criterion

        return opt_samples
    def degree(
        self,
        M : int = 1
        ) -> float:
        """
        Compute the degree-of-LHS of the current sample set when expanded
        to size N + M, assuming M new samples will be generated. If M = 0,
        compute the degree of the initial set.

        Parameters
        ----------
        M : int
            Number of new intervals to add. Defaults to 1.

        Returns
        -------
        lhs_degree : float
            Degree of the Latin Hypercube Sampling, with 0 < D <= 1. A
            perfect Latin Hypercube has degree D = 1.
        """

        lhs_degree = _degree_jitted(self.N, self.P, self.samples, M)

        return lhs_degree
    def optimal_expansion(
        self,
        radius : int | tuple[int, int],
        verbose : bool = False
        ) -> list[tuple[int, float]] | tuple[int, float]:
        """
        Find the optimal expansion size, i.e. the expansion with the
        highest degree-of-LHS in a given range.

        Parameters
        ----------
        radius : int | (int, int)
            Range of values to consider for the expansion. If a single
            value is provided, it is interpreted as [1, upper bound]. If
            a tuple is provided, it is interpreted as
            [lower bound, upper bound]. Both bounds are included.
        verbose : bool
            If False, return the expansion size with the highest degree;
            if True, return all the expansion sizes within radius.
            Defaults to False.

        Returns
        -------
        expansions : list of tuples (int, float) or tuple (int, float)
            Expansion sizes and their corresponding degree-of-LHS. If
            verbose is True, returns a list of tuples with the expansion
            size and the corresponding degree-of-LHS; the tuple with
            expansion size 0 holds the degree of the current set (= 1 for
            a perfect Latin Hypercube). If verbose is False, returns only
            the tuple with the highest degree-of-LHS.
        """

        if not isinstance(radius, tuple):
            radius = (1, radius)

        expansions = _optimal_expansion_jitted(self.N, self.P, self.samples, radius)

        if verbose:
            return expansions
        else:
            # expansions[0] is always the degree of the initial set
            return expansions[1]
    def __call__(
        self,
        M : int = 1,
        *,
        seed : int | None = None,
        optimize : Literal['discrepancy', 'geometric_discrepancy'] | None = None,
        trials : int = 1000,
        tol : float = 1e-4
        ) -> np.ndarray:
        """
        Build the Latin Hypercube expansion of size M.

        The optional argument 'optimize' searches for an optimal set of
        new samples with lower centered discrepancy (space-filling
        metric) or higher geometric discrepancy (pairwise minimum
        distance). Both metrics are implemented in scipy.stats.qmc. The
        code samples a number of candidate expansions equal to 'trials'.
        It is not guaranteed that the final set attains the absolute
        minimum of the discrepancy or the absolute maximum of the
        geometric discrepancy for the given sample. The optimization
        through geometric discrepancy may fail if the minimum pairwise
        distance is set by the initial set.

        Parameters
        ----------
        M : int (optional)
            Number of new samples to generate. Defaults to 1.
        seed : int | None (optional)
            Seed for the random number generator. Defaults to None.
        optimize : str | None (optional)
            Optimization criterion; the available options are
            'discrepancy' (centered discrepancy) and
            'geometric_discrepancy'. Defaults to None.
        trials : int (optional)
            Number of trials for the optimal expansion set. Defaults to
            1000.
        tol : float (optional)
            Tolerance for the optimization. Defaults to 1e-4.

        Returns
        -------
        expansion : numpy.ndarray(N + M, P)
            Expanded Latin Hypercube sample set.
        """

        if M == 0:
            return self.samples

        if seed is not None:
            self.rng = np.random.default_rng(seed)

        voids = self._regridding(M)

        if optimize is None:
            new_samples = self._LHSinLHS_sampling(M, voids)
        else:
            new_samples = self._LHSinLHS_optimized(M, voids, optimize, trials, tol)

        return np.concatenate([self.samples, new_samples], axis=0)
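

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library):
    # build a fresh 10-sample LHS in 3 dimensions, pick the expansion
    # size with the highest degree-of-LHS in [1, 10], then expand with
    # discrepancy-optimized new samples.
    lhs = ExpandLHS(N=10, P=3, seed=42)        # seed is forwarded to scipy's sampler
    print("initial degree:", lhs.degree(0))    # 1.0 for a fresh LHS
    best_M, best_D = lhs.optimal_expansion(10)
    print("best expansion:", best_M, "with degree", best_D)
    expanded = lhs(best_M, seed=0, optimize='discrepancy', trials=200)
    print("expanded shape:", expanded.shape)   # (10 + best_M, 3)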