Source code for pal.variables

"""Multi-dimensional stochastic variables for actuarial modeling.

This module provides the ProteusVariable class, which represents multi-dimensional
stochastic variables commonly used in actuarial and risk modeling. A ProteusVariable
can contain different types of stochastic objects across multiple dimensions,
enabling complex risk factor modeling.

Key features:
- Multi-dimensional stochastic variables with named dimensions
- Support for various stochastic types (StochasticScalar, FreqSevSims, etc.)
- Mathematical operations across dimensions and simulations
- Correlation analysis and upsampling capabilities
- Export functionality for analysis and reporting

NOTE: The serialization/deserialization methods (from_csv, from_dict, from_series)
      are currently incomplete and have significant limitations. A comprehensive
      codec system is planned to address these issues.
      See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22

The ProteusVariable is designed for actuarial applications such as:
- Multi-factor risk modeling (e.g., frequency, severity, inflation)
- Portfolio-level aggregation across risk dimensions
- Scenario analysis with correlated risk factors
- Capital modeling with interdependent variables

Example:
    >>> from pal.stochastic_scalar import StochasticScalar
    >>> from pal.frequency_severity import FreqSevSims
    >>>
    >>> # Create a multi-dimensional risk variable
    >>> risk_var = ProteusVariable(
    ...     dim_name="insurance_risk",
    ...     values={
    ...         "frequency": StochasticScalar([10, 12, 8, 15]),
    ...         "severity": StochasticScalar([5000, 6000, 4500, 7000]),
    ...         "expense_ratio": StochasticScalar([0.3, 0.32, 0.28, 0.35])
    ...     }
    ... )
    >>> total_cost = (
    ...     risk_var["frequency"]
    ...     * risk_var["severity"]
    ...     * (1 + risk_var["expense_ratio"])
    ... )
"""

# standard library imports
from __future__ import annotations

import os
import typing as t
from numbers import Number

# third-party imports
import numpy as np
import numpy.typing as npt
import pandas as pd
import plotly.graph_objects as go  # type: ignore
import plotly.io as pio  # type: ignore
import scipy.stats

from . import maths as pnp

# local imports
from ._compat import Self
from .couplings import ProteusStochasticVariable
from .frequency_severity import FreqSevSims
from .stochastic_scalar import StochasticScalar
from .types import VectorLike

pio.templates.default = "none"

T = t.TypeVar("T")

__all__ = [
    "ProteusVariable",
]


def _format_number(val: int | float) -> str:
    if isinstance(val, int) or abs(val) >= 100:
        return f"{val:,.0f}"
    if abs(val) >= 1:
        return f"{val:,.4f}"
    return f"{val:.6g}"



[docs]
class ProteusVariable(t.Generic[T]):
    """A generic, homogeneous container for multivariate variables in simulations.

    ProteusVariable is a hierarchical structure that holds multiple variables of
    the SAME type (homogeneous container). Each instance must contain either all
    scalars, all vectors (like StochasticScalar), or all nested ProteusVariables
    - but never a mix of different types.

    Type Parameter:
        T: The type of values stored. By convention, T should be a ScalarOrVector
           type (NumericLike | VectorLike), though the parameter is unconstrained
           to allow flexible type inference. Usage with non-ScalarOrVector types
           may not be fully supported by all operations.

    Key Features:
    - **Homogeneous**: All values in a single instance must be the same type.
      Like List[T], you cannot mix types within one container.
    - **Type Safety**: Operations like mean() return type T, preserving type
      information through the computation.
    - **Nesting**: ProteusVariable containing ProteusVariable enables hierarchical
      data structures (e.g., risks by region by peril)
    - **Dictionary Access**: Sub-elements accessed via [] notation with
      string keys or integer indices

    Examples:
        >>> # Homogeneous scalar container
        >>> scalar_risks = ProteusVariable(
        ...     dim_name="risk_amounts",
        ...     values={"fire": 100000, "flood": 200000}  # All int
        ... )

        >>> # Homogeneous vector container
        >>> vector_risks = ProteusVariable(
        ...     dim_name="stochastic_losses",
        ...     values={
        ...         "fire": StochasticScalar([100, 200, 300]),
        ...         "flood": StochasticScalar([150, 250, 350])
        ...     }  # All StochasticScalar
        ... )

        >>> # Homogeneous nested container
        >>> nested_risks = ProteusVariable(
        ...     dim_name="regions",
        ...     values={
        ...         "north": scalar_risks,
        ...         "south": scalar_risks
        ...     }  # All ProteusVariable instances
        ... )

        >>> # INVALID - mixing types not allowed
        >>> # mixed = ProteusVariable(values={"a": 100, "b": StochasticScalar([1])})
        >>> # This would violate homogeneity and cause type errors

    Note: Statistical operations should be performed using numpy and scipy functions
    directly on ProteusVariable instances. For example:
    - Use np.percentile(variable, p)
    - Use np.mean(variable)
    - Use pal.stats.tvar(variable, p)
    """

    dim_name: str
    values: dict[str, T]
    dimensions: list[str]


[docs]
    def __init__(
        self,
        dim_name: str,
        values: dict[str, T],
    ):
        """Initialize a ProteusVariable.

        Args:
            dim_name: Name of the dimension.
            values: A dict containing variables that must
                support PAL variable operations.

        Raises:
            TypeError: If values is not a dict.
            ValueError: If this variable's dimension name re-appears anywhere
                in a nested ProteusVariable's hierarchy, or if stochastic
                values disagree on the number of simulations.
        """
        # Validate the container type up front so a failed construction never
        # leaves a half-initialised instance behind.
        if not isinstance(values, dict):  # type: ignore[redundant-expr]
            raise TypeError(f"Expected a mapping (dict-like) for 'values', got {type(values).__name__}")

        self.dim_name: str = dim_name
        # TODO: Clarify whether the values dict is intended to be mutable during the
        # variable's lifetime, or if it should be treated as immutable after
        # initialization. Consider using a frozen dict if immutability is desired.
        # See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/20
        self.values = values
        self.dimensions = [dim_name]
        self._dimension_set = set(self.dimensions)
        # check the number of simulations in each variable
        self.n_sims = None
        for value in self.values.values():
            if isinstance(value, ProteusVariable):
                # A dimension name may not re-appear further down its own
                # branch. Sibling children are allowed (and expected) to share
                # dimension names, so only this variable's own name is compared
                # against the child's full list of dimensions.
                if self.dim_name in value.dimensions:
                    raise ValueError("Duplicate dimension names in ProteusVariable hierarchy.")
                # Merge the child's dimensions, recording each name only once
                # even when several siblings carry the same dimensions.
                for nested_name in value.dimensions:
                    if nested_name not in self._dimension_set:
                        self._dimension_set.add(nested_name)
                        self.dimensions.append(nested_name)

            if self.n_sims is None:
                # The first value decides the initial simulation count;
                # non-stochastic values count as a single simulation.
                if isinstance(value, ProteusStochasticVariable):
                    self.n_sims = value.n_sims
                else:
                    self.n_sims = 1
            elif isinstance(value, ProteusStochasticVariable):
                if value.n_sims != self.n_sims:
                    if self.n_sims == 1:
                        # A count of 1 is replaced by the larger stochastic count.
                        self.n_sims = value.n_sims
                    else:
                        raise ValueError("Number of simulations do not match.")


    def __len__(self) -> int:
        """Return the number of elements in the variable."""
        return len(self.values)

    def __array__(self, dtype: t.Any = None) -> npt.NDArray[t.Any]:
        """Convert ProteusVariable to numpy array for basic operations.

        This method enables ProteusVariable to work with numpy functions like
        np.sum(), making it VectorLike protocol compliant. Current implementation
        provides basic functionality by concatenating all values into a 1D array.

        NOTE: This is a simplified implementation. Complex nested container
              scenarios and multi-dimensional operations need architectural
              decisions about data representation and operation semantics.
              See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/23

        Args:
            dtype: Optional data type for the resulting array.

        Returns:
            A numpy array created by concatenating all values.

        Raises:
            NotImplementedError: For mismatched simulation lengths or other
                               complex scenarios requiring architectural decisions.
        """
        # For basic 1D operations like np.sum(), concatenate all values
        arrays = [np.asarray(value) for value in self.values.values()]

        # If we have any scalars (0D arrays), convert them to 1D arrays with single
        # element
        processed_arrays: list[npt.NDArray[t.Any]] = []
        for arr in arrays:
            if arr.ndim == 0:
                # Convert scalar to 1D array with single element
                processed_arrays.append(np.array([arr.item()]))
            else:
                processed_arrays.append(arr)

        # Now check if all 1D arrays have the same length (simulation dimension)
        lengths = [len(arr) for arr in processed_arrays]
        if len(set(lengths)) > 1:
            raise NotImplementedError(
                "Array conversion not supported for ProteusVariable with "
                "mismatched simulation lengths. Use .upsample() first."
            )

        # Concatenate arrays - this creates a 1D array suitable for np.sum()
        result = np.concatenate(processed_arrays)

        if dtype is not None:
            result = result.astype(dtype)

        return result

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: t.Any, **kwargs: t.Any) -> ProteusVariable[T]:
        """Handle numpy universal functions applied to ProteusVariable objects.

        This method enables ProteusVariable objects to work with numpy ufuncs by
        recursively applying the ufunc to the hierarchical structure of values.

        The keys and ``dim_name`` of the result are taken from the first
        ProteusVariable found among the inputs. Every other ProteusVariable input
        is indexed with those same keys, while non-container inputs are passed
        unchanged to every key.

        Args:
            ufunc: The numpy universal function to apply.
            method: The method name (only "__call__" is supported).
            *inputs: Input arguments to the ufunc.
            **kwargs: Keyword arguments to pass to the ufunc. They are forwarded
                unchanged at every level of the recursion.

        Returns:
            A new ProteusVariable with the ufunc applied to its values.

        Raises:
            NotImplementedError: If method is not "__call__".
            TypeError: If a ProteusVariable input holds non-dict values.
            KeyError: If another ProteusVariable input lacks one of the first
                container's keys.
        """
        if method != "__call__":
            raise NotImplementedError(f"Method {method} not implemented for ProteusVariable.")

        def recursive_apply(*items: t.Any, **kwargs: t.Any) -> t.Any:
            # If none of the items is a ProteusVariable (i.e. a container), then
            # assume they are leaf nodes (e.g., numbers or stochastic types) and
            # simply call ufunc.
            if not any(isinstance(item, ProteusVariable) for item in items):
                # For stochastic types that implement __array_ufunc__, this call will
                # automatically delegate to their own __array_ufunc__.
                return ufunc(*items, **kwargs)

            # Otherwise, at least one of the items is a container.
            # We assume that the container structure is consistent across items.

            # Pick the first ProteusVariable among the items; its keys and dim_name
            # define the shape of the result at this level.
            first_container = items[[i for i, item in enumerate(items) if isinstance(item, ProteusVariable)][0]]

            # if the first container is a ProteusVariable, we can assume that
            # all other items are also ProteusVariables or compatible types.
            # NOTE(review): given the selection above this check looks purely
            # defensive - confirm before relying on the TypeError.
            if not isinstance(first_container, ProteusVariable):
                raise TypeError(f"No {type(self).__name__} found in inputs, cannot apply ufunc.")

            # Process dictionary containers.
            if isinstance(
                first_container.values,  # type: ignore[reportUnknownMemberType]
                dict,
            ):
                new_data: dict[str, t.Any] = {}
                # Iterate over each key in the container.
                # Type ignore: Runtime type narrowing - we're intentionally checking
                # types at runtime to handle heterogeneous inputs from numpy ufuncs
                for key in first_container.values:  # type: ignore[reportUnknownMemberType]  # noqa: E501
                    new_items: list[t.Any] = []
                    for item in items:
                        # Assumes that data types are homogeneous across nodes ie. if
                        # the parent ProteusVariable contains dicts, then children
                        # should also contain dicts.
                        if isinstance(item, ProteusVariable):
                            # Type ignore: Runtime type checking for structural
                            # validation. We need to verify dict structure at runtime
                            # for ufunc recursion
                            vals = item.values  # type: ignore[reportUnknownMemberType]
                            if not isinstance(vals, dict):  # type: ignore[redundant-expr]
                                raise TypeError(
                                    f"Expected dict values in {type(self).__name__}, but got {type(vals).__name__}."  # type: ignore[reportArgumentType]  # noqa: E501
                                )
                            # Plain dict lookup: a missing key raises KeyError.
                            new_items.append(vals[key])
                        else:
                            # Non-container inputs are reused as-is for every key.
                            new_items.append(item)
                    # Recurse so that nested ProteusVariables are handled level by level.
                    new_data[key] = recursive_apply(*new_items, **kwargs)
                # Return ProteusVariable without type parameter: The type is determined
                # at runtime through recursive_apply, not statically knowable
                return ProteusVariable(first_container.dim_name, new_data)

            # In case data is not a dict, try applying ufunc directly.
            # Type ignore: Return type depends on runtime ufunc behavior and value
            # types, not statically determinable in this dynamic dispatch context
            return ufunc(first_container.values, **kwargs)  # type: ignore[return-value]

        # Type ignore: recursive_apply's return type is determined at runtime based on
        # the actual ufunc and input types - this method handles arbitrary numpy
        # operations with dynamic type resolution
        return recursive_apply(*inputs, **kwargs)  # type: ignore[return-value]

    def __array_function__(
        self,
        func: t.Any,
        _: tuple[t.Any, ...],
        args: tuple[t.Any, ...],
        kwargs: dict[str, t.Any],
    ) -> t.Any:
        """Handle numpy array functions applied to ProteusVariable objects.

        This method enables ProteusVariable objects to work with numpy array functions
        by extracting the underlying values, applying the function, and reconstructing
        the ProteusVariable with the result.

        Special handling for mean(): Returns a ProteusVariable where each key's value
        is replaced by its mean, preserving the original structure.

        For every other function, each occurrence of ``self`` in ``args`` is replaced
        by a 2D array with one column per key. When the caller gave no ``axis``, a
        default of ``axis=1`` is inserted into ``kwargs`` for cumsum/cumprod/diff,
        and for sum/mean/std/var when the first value is VectorLike.

        Args:
            func: The numpy array function to apply.
            _: Tuple of types involved in the operation (unused).
            args: Positional arguments to the function.
            kwargs: Keyword arguments to pass to the function. May be mutated in
                place to add an ``axis`` entry.

        Returns:
            A new ProteusVariable with the function applied to its values. A 0D
            result is returned as a plain Python scalar, and a 1D reduction
            (sum/mean/std/var) over VectorLike values is returned as a single
            StochasticScalar.

        Raises:
            NotImplementedError: If the function returns an array with more than
                two dimensions.
        """
        # Special handling for mean() to preserve ProteusVariable structure
        if func.__name__ == "mean" and len(args) == 1 and args[0] is self:
            mean_values: dict[str, T] = {}
            for key, value in self.values.items():
                # Use pnp.mean() for all values to ensure consistent behavior
                # across all PAL types (StochasticScalar, FreqSevSims, etc.)
                mean_values[key] = pnp.mean(value)

            return ProteusVariable(dim_name=self.dim_name, values=mean_values)

        parsed_args: list[t.Any] = []
        for arg in args:
            if arg is self:
                # For the ProteusVariable itself, stack its dictionary values as columns
                value_arrays = [np.asarray(value) for value in self.values.values()]
                parsed_args.append(np.column_stack(value_arrays))
            else:
                # Only `self` is unpacked; any other argument is passed through as-is.
                parsed_args.append(arg)

        if "axis" in kwargs:
            # Nothing to do here as the axis is already specified.
            pass
        else:
            if func.__name__ in ["cumsum", "cumprod", "diff"]:
                # For functions that need axis specification, add axis=1 to kwargs
                kwargs["axis"] = 1
            elif func.__name__ in ["sum", "mean", "std", "var"]:
                # For reduction operations, check the type of values to determine axis
                # behavior...
                first_value = next(iter(self.values.values())) if self.values else None

                # Check if we have vector-like values (StochasticScalar, FreqSevSims,
                # etc.)
                if first_value is not None and isinstance(first_value, VectorLike):
                    # Vector-like values: use axis=1 to sum across dimensions (row-wise)
                    # This preserves simulation structure for StochasticScalar objects
                    kwargs["axis"] = 1

                # In case this if doesn't match, for scalar values, use default (no
                # axis) which will return a scalar result.
            else:
                # In all other cases, do not specify axis and let numpy decide
                pass

        # NOTE(review): the branches below read `temp.ndim`, so they assume func
        # returns an ndarray-like object - confirm for functions returning tuples.
        temp = func(*parsed_args, **kwargs)

        # Handle 0D (scalar), 1D and 2D results
        if temp.ndim == 0:
            # If result is scalar (0D), return the scalar directly
            return temp.item()

        if temp.ndim == 1:
            # Check if this is a reduction result from vector-like values
            first_value = next(iter(self.values.values())) if self.values else None

            if (
                func.__name__ in ["sum", "mean", "std", "var"]
                and first_value is not None
                and isinstance(first_value, VectorLike)
            ):
                # Reduction of vector-like values: return a single StochasticScalar
                result = StochasticScalar(temp)

                # Merge coupling groups from all original values
                for value in self.values.values():
                    if hasattr(value, "coupled_variable_group"):
                        # Type ignore: we know that this attribute exists because we've
                        # done a runtime check above.
                        result.coupled_variable_group.merge(
                            value.coupled_variable_group  # type: ignore[attr-defined]
                        )

                return result

            # Other 1D results: distribute evenly across keys
            # Integer division: when len(temp) is not a multiple of the key count,
            # the trailing elements are not assigned to any key.
            n_keys = len(self.values.keys())
            chunk_size = len(temp) // n_keys
            return ProteusVariable(
                self.dim_name,
                {
                    key: StochasticScalar(temp[i * chunk_size : (i + 1) * chunk_size])
                    for i, key in enumerate(self.values.keys())
                },
            )

        if temp.ndim == 2:
            # If result is 2D, use columns
            # Column i is paired with the i-th key in the dict's iteration order.
            return ProteusVariable(
                self.dim_name,
                {key: StochasticScalar(temp[:, i]) for i, key in enumerate(self.values.keys())},
            )

        # This should be unreachable - we've handled 0D, 1D, and 2D arrays
        raise NotImplementedError(
            f"Unexpected array dimensionality: {temp.ndim}D array returned by "
            f"{func.__name__}. Only 0D (scalar), 1D, and 2D arrays are supported."
        )

    def __bool__(self) -> bool:
        raise ValueError(
            "ProteusVariable does not have a single truth value. Use explicit checks on its values instead."
        )

    def __iter__(self) -> t.Iterator[T]:
        """Iterate over the values in the variable."""
        return iter(self.values.values())

    def __contains__(self, value: object) -> bool:
        """Check if value is in the container.

        Required for Sequence protocol compatibility.
        """
        return value in self.values.values()

    def __reversed__(self) -> t.Iterator[T]:
        """Return a reverse iterator over the values.

        Required for Sequence protocol compatibility.
        """
        return reversed(list(self.values.values()))

    def __repr__(self) -> str:
        lines = [f"ProteusVariable ({self.dim_name}):"]
        max_key_len = max(len(k) for k in self.values)
        for key, val in self.values.items():
            if isinstance(val, ProteusVariable):
                nested = repr(val).replace("\n", "\n  ")
                lines.append(f"  {key}: {nested}")
            elif isinstance(val, (int, float)):
                lines.append(f"  {key:<{max_key_len}}  {_format_number(val):>14}")
            else:
                lines.append(f"  {key:<{max_key_len}}  {val!r}")
        return "\n".join(lines)

    # Arithmetic operations
    def __add__(self, other: t.Any) -> Self:
        """Compute ``self + other`` by delegating to ``_binary_operation``."""

        def add(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs + rhs

        return t.cast(Self, self._binary_operation(other, add))

    def __radd__(self, other: t.Any) -> Self:
        """Compute ``other + self`` by reusing ``__add__`` (operand order is not swapped)."""
        total = self.__add__(other)
        return total

    def __sub__(self, other: t.Any) -> Self:
        """Compute ``self - other`` by delegating to ``_binary_operation``."""

        def subtract(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs - rhs

        return t.cast(Self, self._binary_operation(other, subtract))

    def __rsub__(self, other: t.Any) -> Self:
        """Compute ``other - self`` by delegating to ``_binary_operation``."""

        def subtract_from(own: t.Any, outside: t.Any) -> t.Any:
            # Reflected form: the outside operand is the minuend.
            return outside - own

        return t.cast(Self, self._binary_operation(other, subtract_from))

    def __mul__(self, other: t.Any) -> Self:
        """Compute ``self * other`` by delegating to ``_binary_operation``."""

        def multiply(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs * rhs

        return t.cast(Self, self._binary_operation(other, multiply))

    def __rmul__(self, other: t.Any) -> Self:
        """Compute ``other * self`` by reusing ``__mul__`` (operand order is not swapped)."""
        product = self.__mul__(other)
        return product

    def __truediv__(self, other: t.Any) -> Self:
        """Compute ``self / other`` by delegating to ``_binary_operation``."""

        def divide(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs / rhs

        return t.cast(Self, self._binary_operation(other, divide))

    def __rtruediv__(self, other: t.Any) -> Self:
        """Compute ``other / self`` by delegating to ``_binary_operation``."""

        def divide_into(own: t.Any, outside: t.Any) -> t.Any:
            # Reflected form: the outside operand is the numerator.
            return outside / own

        return t.cast(Self, self._binary_operation(other, divide_into))

    def __pow__(self, other: t.Any) -> Self:
        """Compute ``self ** other`` by delegating to ``_binary_operation``."""

        def power(base: t.Any, exponent: t.Any) -> t.Any:
            return base**exponent

        return t.cast(Self, self._binary_operation(other, power))

    def __rpow__(self, other: t.Any) -> Self:
        """Compute ``other ** self`` by delegating to ``_binary_operation``."""

        def power_of(own: t.Any, outside: t.Any) -> t.Any:
            # Reflected form: the outside operand is the base.
            return outside**own

        return t.cast(Self, self._binary_operation(other, power_of))

    def __neg__(self) -> Self:
        """Compute ``-self``, routed through ``_binary_operation`` with self as both operands."""

        def negate(operand: t.Any, _unused: t.Any) -> t.Any:
            # The second operand exists only to satisfy the binary signature.
            return -operand

        return t.cast(Self, self._binary_operation(self, negate))

    # Comparison operations
    def __lt__(self, other: t.Any) -> Self:
        """Compute ``self < other`` by delegating to ``_binary_operation``."""

        def less_than(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs < rhs

        return t.cast(Self, self._binary_operation(other, less_than))

    def __rlt__(self, other: t.Any) -> Self:
        """Compute the reflected comparison ``other < self``.

        ``other < self`` is equivalent to ``self > other``, so this forwards to
        ``__gt__``. Forwarding to ``__ge__`` would give the wrong answer whenever
        the operands are equal.

        NOTE: Python's data model has no ``__rlt__`` hook; the interpreter uses
        ``__gt__`` as the reflection of ``<``. This method only runs when called
        explicitly.
        """
        return self.__gt__(other)

    def __le__(self, other: t.Any) -> Self:
        """Compute ``self <= other`` by delegating to ``_binary_operation``."""

        def less_or_equal(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs <= rhs

        return t.cast(Self, self._binary_operation(other, less_or_equal))

    def __rle__(self, other: t.Any) -> Self:
        """Compute the reflected comparison ``other <= self``.

        ``other <= self`` is equivalent to ``self >= other``, so this forwards to
        ``__ge__``. Forwarding to ``__gt__`` would give the wrong answer whenever
        the operands are equal.

        NOTE: Python's data model has no ``__rle__`` hook; the interpreter uses
        ``__ge__`` as the reflection of ``<=``. This method only runs when called
        explicitly.
        """
        return self.__ge__(other)

    def __gt__(self, other: t.Any) -> Self:
        """Compute ``self > other`` by delegating to ``_binary_operation``."""

        def greater_than(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs > rhs

        return t.cast(Self, self._binary_operation(other, greater_than))

    def __rgt__(self, other: t.Any) -> Self:
        """Compute the reflected comparison ``other > self``.

        ``other > self`` is equivalent to ``self < other``, so this forwards to
        ``__lt__``. Forwarding to ``__le__`` would give the wrong answer whenever
        the operands are equal.

        NOTE: Python's data model has no ``__rgt__`` hook; the interpreter uses
        ``__lt__`` as the reflection of ``>``. This method only runs when called
        explicitly.
        """
        return self.__lt__(other)

    def __ge__(self, other: t.Any) -> Self:
        """Compute ``self >= other`` by delegating to ``_binary_operation``."""

        def greater_or_equal(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs >= rhs

        return t.cast(Self, self._binary_operation(other, greater_or_equal))

    def __rge__(self, other: t.Any) -> Self:
        """Compute the reflected comparison ``other >= self``.

        ``other >= self`` is equivalent to ``self <= other``, so this forwards to
        ``__le__``. Forwarding to ``__lt__`` would give the wrong answer whenever
        the operands are equal.

        NOTE: Python's data model has no ``__rge__`` hook; the interpreter uses
        ``__le__`` as the reflection of ``>=``. This method only runs when called
        explicitly.
        """
        return self.__le__(other)

    # Equality operations
    def __eq__(self, other: object) -> Self:  # type: ignore[override]
        """Compute ``self == other`` via ``_binary_operation``; the result is not a plain bool."""

        def equal(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs == rhs

        return t.cast(Self, self._binary_operation(other, equal))

    def __ne__(self, other: object) -> Self:  # type: ignore[override]
        """Compute ``self != other`` via ``_binary_operation``; the result is not a plain bool."""

        def not_equal(lhs: t.Any, rhs: t.Any) -> t.Any:
            return lhs != rhs

        return t.cast(Self, self._binary_operation(other, not_equal))

    def __getitem__(self, key: int | str) -> T:
        # FIXME: This assumes that the ordering of the values never changes. At the
        # moment, this is not true. The values are stored in mutable container!
        if isinstance(key, int):
            return list(self.values.values())[key]
        if isinstance(key, str):  # type: ignore[redundant-expr]
            return self.values[key]
        raise TypeError(f"Key must be an integer or string, got {type(key).__name__}.")

    def __setitem__(self, key: int | str, value: T) -> None:
        if isinstance(key, int):
            dict_key = list(self.values.keys())[key]
            self.values[dict_key] = value
        if isinstance(key, str):  # type: ignore[redundant-expr]
            self.values[key] = value


[docs]
    def count(self, value: T) -> int:
        """Count occurrences of value in the container.

        Required for Sequence protocol compatibility.
        """
        return list(self.values.values()).count(value)



[docs]
    def index(self, value: T, start: int = 0, stop: int | None = None) -> int:
        """Return index of first occurrence of value.

        Required for Sequence protocol compatibility.

        Raises:
            ValueError: If value is not found.
        """
        values_list = list(self.values.values())
        if stop is None:
            stop = len(values_list)
        try:
            return values_list.index(value, start, stop)
        except ValueError as error:
            raise ValueError(f"{value!r} is not in ProteusVariable") from error



[docs]
    def get_value_at_sim(self, sim_no: int | StochasticScalar) -> ProteusVariable[T | StochasticScalar]:
        """Build a new variable holding each entry's value at the requested simulation(s).

        Args:
            sim_no: Simulation index(es) to extract. Can be a single numeric value,
                a list of integers, or a VectorLike object such as StochasticScalar.

        Returns:
            A new ProteusVariable with the same dimension name and keys, where
            every value has been passed through ``_get_value_at_sim_helper``.
        """
        # FIXME: this makes a bit of a mess of the interface. Would make sense to just
        # make use of the __getitem__ method instead. Since ProteusVariable is
        # SequenceLike, it should support indexing with integers and strings.
        # For this to work, we need to be sure that the contents of values is indeed
        # VectorLike. Remember that ProteusVariables may be nested and a ProteusVariable
        # will not be VectorLike.
        extracted: dict[str, t.Any] = {}
        for name, entry in self.values.items():
            extracted[name] = self._get_value_at_sim_helper(entry, sim_no)
        return ProteusVariable(dim_name=self.dim_name, values=extracted)



[docs]
    def upsample(self, n_sims: int) -> ProteusVariable[T]:
        """Upsample the variable to the specified number of simulations."""
        if self.n_sims == n_sims:
            return self
        return ProteusVariable(
            dim_name=self.dim_name,
            values={
                key: (value.upsample(n_sims) if isinstance(value, ProteusStochasticVariable) else value)
                for key, value in self.values.items()
            },
        )



[docs]
    def sum(self) -> T:
        """Return the sum across the outer dimension."""
        return sum(self)  # type: ignore[arg-type]



[docs]
    def validate_freqsev_consistency(self, _is_nested: bool = False) -> tuple[bool, str, npt.NDArray[t.Any] | None]:
        """Validate that all FreqSevSims have consistent sim_index.

        When a ProteusVariable contains multiple FreqSevSims objects, operations like
        sum() or aggregation require that all FreqSevSims have identical simulation
        indices for meaningful results. This method recursively checks for that
        consistency across nested ProteusVariable structures.

        All leaf values in the ProteusVariable tree must be FreqSevSims with matching
        simulation indices. Nested ProteusVariable structures are supported and will
        be recursively validated.

        Use this validation before performing aggregation operations on ProteusVariable
        instances containing FreqSevSims to ensure the results will be valid.

        Args:
            _is_nested: Internal parameter for tracking recursion depth.
                       Do not set manually.

        Returns:
            A tuple of (is_valid, error_message, sim_index):
            - is_valid: True if all leaf values are FreqSevSims with matching sim_index,
                       or if there are 0 FreqSevSims (trivially consistent)
            - error_message: Empty string if valid, descriptive error message otherwise
            - sim_index: Representative sim_index array if valid and FreqSevSims found,
                        None if no FreqSevSims or invalid

        Example:
            >>> freq_sev_1 = FreqSevSims([0, 1, 2], [10, 20, 30], 3)
            >>> freq_sev_2 = FreqSevSims([0, 1, 2], [15, 25, 35], 3)
            >>> var = ProteusVariable(
            ...     "losses", {"fire": freq_sev_1, "flood": freq_sev_2}
            ... )
            >>> is_valid, msg, sim_idx = var.validate_freqsev_consistency()
            >>> if is_valid:
            ...     total = var.sum()  # Safe to sum
        """
        try:
            reference_sim_index: npt.NDArray[t.Any] | None = None

            for key, value in self.values.items():
                if isinstance(value, FreqSevSims):
                    if reference_sim_index is None:
                        reference_sim_index = value.sim_index
                    elif not np.array_equal(value.sim_index, reference_sim_index):
                        return False, f"Simulation index mismatch at key {key}", None
                elif isinstance(value, ProteusVariable):
                    # Recursively validate nested ProteusVariable
                    is_valid, error, nested_sim_index = value.validate_freqsev_consistency(_is_nested=True)
                    if not is_valid:
                        return False, error, None
                    # Check consistency with current level's sim_index
                    if nested_sim_index is not None:
                        if reference_sim_index is None:
                            reference_sim_index = nested_sim_index
                        elif not np.array_equal(nested_sim_index, reference_sim_index):
                            return (
                                False,
                                f"Simulation index mismatch at key {key}",
                                None,
                            )
                else:
                    # Found a non-FreqSevSims, non-ProteusVariable value
                    level = "Immediate" if not _is_nested else "Nested"
                    return (
                        False,
                        f"{level} value for key {key} is {type(value).__name__}, not FreqSevSims",
                        None,
                    )

            return True, "", reference_sim_index

        except Exception as e:
            return False, f"Error validating FreqSevSims consistency: {str(e)}", None



[docs]
    @classmethod
    def from_csv(
        cls,
        file_name: str,
        dim_name: str,
        values_column: str,
        simulation_column: str = "Simulation",
    ) -> ProteusVariable[StochasticScalar]:
        """Load a one-dimensional ProteusVariable from a long-format CSV file.

        The file is pivoted so that each distinct entry of the ``dim_name``
        column becomes one key, with its values ordered by ``simulation_column``.

        This method currently has significant limitations and will be replaced
        with a more comprehensive serialization system.

        Current Limitations:
        - Only supports one-dimensional variables
        - Always creates StochasticScalar values regardless of intended type
        - Cannot preserve generic type information through deserialization
        - No support for nested ProteusVariable structures

        Args:
            file_name: Path to the CSV file to read
            dim_name: Name of the dimension column in the CSV
            values_column: Name of the column containing the values
            simulation_column: Name of the column containing simulation indices

        Returns:
            ProteusVariable with StochasticScalar values loaded from the CSV

        TODO: Implement comprehensive codec system for proper serialization
              See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22
        """
        # Type ignore: pandas-stubs has complex overloads causing Pyright to report
        # the function signature as "partially unknown" despite correct usage
        frame: pd.DataFrame = pd.read_csv(file_name)  # type: ignore[misc]
        wide = frame.pivot(index=simulation_column, columns=dim_name, values=values_column)
        rows_per_label = frame[dim_name].value_counts()
        # Type ignore: pandas-stubs overloads cause "partially unknown" warnings
        wide = wide.sort_index()  # type: ignore[misc]

        # Each label keeps only as many leading rows of its pivoted column as it
        # had rows in the original file.
        loaded: dict[str, t.Any] = {}
        for label in frame[dim_name].unique():  # type: ignore[misc]
            column_values = wide[label].values  # type: ignore[misc]
            loaded[str(label)] = StochasticScalar(column_values[: rows_per_label[label]])

        # classmethods can't preserve generic type parameters so we need a type ignore
        # here. When data is loaded, the contents of the ProteusVariable will be
        # whatever was present in the CSV file. It may be necessary to separate these
        # factory functions from ProteusVariable completely.
        result = cls(dim_name, loaded)
        # Override the count derived by __init__ with the largest per-label row count.
        result.n_sims = max(rows_per_label)

        return result  # type: ignore



[docs]
    @classmethod
    def from_dict(
        cls,
        data: dict[str, list[float]],
    ) -> ProteusVariable[StochasticScalar]:
        """Build a one-dimensional ProteusVariable from a mapping of value lists.

        Each entry of ``data`` becomes a StochasticScalar stored under the string
        form of its key. The dimension is always named ``"Dim1"``.

        This constructor is a stop-gap and is expected to be replaced by a proper
        serialization system.

        Current Limitations:
        - Only supports one-dimensional variables
        - Always creates StochasticScalar values from float lists
        - Cannot preserve generic type information
        - No support for nested structures or other value types

        Args:
            data: Dictionary mapping dimension labels to lists of float values

        Returns:
            ProteusVariable with StochasticScalar values created from the data, with
            ``n_sims`` set to the length of the longest list

        Raises:
            ValueError: If ``data`` is empty (raised by ``max`` on an empty sequence,
                provided the constructor itself accepts empty values).

        TODO: Implement comprehensive codec system for proper serialization
              See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22
        """
        # Type ignore: Classmethods can't preserve generic type parameters.
        # This always creates StochasticScalar values regardless of T.
        members = {
            str(label): StochasticScalar(samples)  # type: ignore[arg-type]
            for label, samples in data.items()
        }
        result = cls(dim_name="Dim1", values=members)  # type: ignore[arg-type]
        result.n_sims = max(len(samples) for samples in data.values())

        return result  # type: ignore



[docs]
    @classmethod
    def from_series(cls, data: pd.Series) -> ProteusVariable[float]:
        """Build a one-dimensional ProteusVariable from a pandas Series.

        The Series index supplies the labels (converted to strings) and the index
        name supplies the dimension name. An unnamed index therefore yields the
        dimension name ``"None"``, because the name is passed through ``str``.

        This constructor is a stop-gap and is expected to be replaced by a proper
        serialization system.

        Current Limitations:
        - Only supports one-dimensional variables
        - Creates scalar values, not StochasticScalar
        - Cannot preserve generic type information
        - Limited to single simulation (n_sims=1)

        Args:
            data: Pandas Series with values to load

        Returns:
            ProteusVariable with scalar values from the Series and ``n_sims`` of 1

        TODO: Implement comprehensive codec system for proper serialization
              See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22
        """
        dimension = str(data.index.name)

        # Look each label up individually so the stored value is exactly what
        # label-based indexing on the Series returns.
        members = {}
        for label in data.index:
            members[str(label)] = data[label]

        # Type ignore: Classmethods can't preserve generic type parameters.
        # The values type depends on the Series content, not the generic T.
        result = cls(dim_name=dimension, values=members)  # type: ignore[arg-type]
        result.n_sims = 1

        return result  # type: ignore



[docs]
    def correlation_matrix(self, correlation_type: str = "spearman") -> list[list[float]]:
        """Compute the pairwise correlation matrix between the variable's elements.

        Elements are accessed positionally (``self[0]``, ``self[1]``, ...) and each
        one is treated as an array of observations.

        Args:
            correlation_type: Case-insensitive name of the measure to use:
                ``"linear"`` (Pearson, via ``np.corrcoef``), ``"spearman"``
                (Pearson correlation of the ranks) or ``"kendall"`` (Kendall's tau,
                via ``scipy.stats.kendalltau``).

        Returns:
            A square list of lists where entry ``[i][j]`` is the correlation
            between element ``i`` and element ``j``. Every row is an independent
            list, and the result is two-dimensional even for a single element.

        Raises:
            ValueError: If ``correlation_type`` is not one of the supported names.
            TypeError: If the first element does not expose a ``values`` attribute.
        """
        # Normalise once so every later comparison can be a plain equality test.
        correlation_type = correlation_type.lower()
        if correlation_type not in ("linear", "spearman", "kendall"):
            raise ValueError(
                f"Invalid correlation_type: '{correlation_type}'. Must be one of: 'linear', 'spearman', 'kendall'"
            )
        if not hasattr(self[0], "values"):
            raise TypeError(f"First element must have 'values' attribute, got {type(self[0]).__name__}")

        n = len(self.values)
        values: list[npt.NDArray[t.Any]] = [t.cast(npt.NDArray[t.Any], self[i]) for i in range(n)]
        if correlation_type in ("spearman", "kendall"):
            # Rank-based measures work on the ranks of the observations.
            values = [scipy.stats.rankdata(value) for value in values]  # type: ignore[misc]

        if correlation_type == "kendall":
            # Build each row separately: ``[[0.0] * n] * n`` would repeat one shared
            # row object, so every assignment would overwrite all rows at once.
            result: list[list[float]] = [[0.0] * n for _ in range(n)]
            for i, value1 in enumerate(values):
                for j, value2 in enumerate(values):
                    result[i][j] = float(
                        scipy.stats.kendalltau(value1, value2).statistic  # type: ignore[arg-type]
                    )
            return result

        # np.corrcoef collapses to a scalar for a single variable; atleast_2d keeps
        # the documented list-of-lists shape in that case.
        return np.atleast_2d(np.corrcoef(values)).tolist()



[docs]
    def show_histogram(self, title: str | None = None) -> None:
        """Show a histogram of the variable values.

        Args:
            title (str | None): The title of the histogram. If None, no title is set.

        """
        if os.getenv("PAL_SUPPRESS_PLOTS", "").lower() == "true":
            return
        fig = go.Figure(layout=go.Layout(title=title))
        for label, value in self.values.items():
            try:
                # Type ignore: plotly-stubs has incomplete type information
                fig.add_trace(go.Histogram(x=value.values(), name=label))  # type: ignore[union-attr,misc]
            except AttributeError:
                # not all values are ProteusVariable or StochasticScalar and therefore
                # do not have a values() method.
                pass
        # Type ignore: plotly-stubs has incomplete type information
        fig.show()  # type: ignore[misc]



[docs]
    def show_cdf(self, title: str | None = None) -> None:
        """Plot the cumulative distribution function (cdf) of the variable values.

        Args:
            title: Optional title for the cdf. If None, no title is set.
        """
        if os.getenv("PAL_SUPPRESS_PLOTS", "").lower() == "true":
            return
        fig = go.Figure(layout=go.Layout(title=title))
        for label, value in self.values.items():
            if not isinstance(value, (ProteusVariable, ProteusStochasticVariable)):
                raise TypeError(f"{type(value).__name__} does not support CDF plotting. ")
            if value.n_sims is None or value.n_sims <= 1:
                raise ValueError("CDF can only be plotted for variables with multiple simulations.")
            # Type ignore: plotly-stubs has incomplete type information
            fig.add_trace(  # type: ignore[misc]
                go.Scatter(
                    # Type ignore: value.values is known to exist due to isinstance
                    # check
                    x=np.sort(np.array(value.values)),  # type: ignore[attr-defined]
                    y=np.arange(value.n_sims) / value.n_sims,
                    name=label,
                )
            )
        # Type ignore: plotly-stubs has incomplete type information
        fig.update_xaxes(title_text="Value")  # type: ignore[misc]
        # Type ignore: plotly-stubs has incomplete type information
        fig.update_yaxes(title_text="Cumulative Probability")  # type: ignore[misc]
        # Type ignore: plotly-stubs has incomplete type information
        fig.show()  # type: ignore[misc]


    def _binary_operation(
        self,
        other: object,
        operation: t.Callable[[t.Any, t.Any], t.Any],
    ) -> t.Any:
        """Apply a two-argument operation element by element.

        Args:
            other: Right-hand operand. A ProteusVariable is combined key by key
                with this variable; anything else is applied to every element.
            operation: Callable invoked as ``operation(own_value, other_value)``.

        Returns:
            A new ProteusVariable with this variable's ``dim_name`` and keys, holding
            the result of ``operation`` for each element.

        Raises:
            ValueError: If ``other`` is a ProteusVariable whose ``dimensions`` differ
                from this variable's.
        """
        if not isinstance(other, ProteusVariable):
            # Broadcast the single right-hand operand across all elements.
            broadcast = {}
            for key, value in self.values.items():
                broadcast[key] = operation(value, other)
            return ProteusVariable(dim_name=self.dim_name, values=broadcast)

        if self.dimensions != other.dimensions:
            raise ValueError("Dimensions of the two variables do not match.")

        paired = {}
        for key, value in self.values.items():
            # Type ignore: Runtime type checking - values is dict-like at this
            # point. We've had to lean on runtime checks here over static.
            paired[key] = operation(value, other.values[key])  # type: ignore[index]
        return ProteusVariable(dim_name=self.dim_name, values=paired)

    def _get_value_at_sim_helper(
        self,
        x: T,
        sim_no: int | StochasticScalar,
    ) -> T | StochasticScalar:
        """Return the value of a single element at the requested simulation(s).

        Args:
            x: The element to read from: a ProteusVariable, StochasticScalar,
                FreqSevSims or plain number.
            sim_no: Either one simulation index (a Python or NumPy integer) or a
                StochasticScalar whose values are used as an array of indices.

        Returns:
            - For a ProteusVariable, whatever its ``get_value_at_sim`` returns.
            - For a StochasticScalar / FreqSevSims with ``n_sims`` of None or at
              most 1, the element itself.
            - For a StochasticScalar / FreqSevSims with more simulations, a new
              StochasticScalar of the selected values when ``sim_no`` is a
              StochasticScalar, the single entry of ``x.values`` when ``sim_no`` is
              an integer, and the element itself for any other ``sim_no``.
            - For a plain number, the number itself.

        Raises:
            TypeError: If ``x`` is none of the supported types.
        """
        if isinstance(x, ProteusVariable):
            # Type ignore: Private helper method with runtime type checks ensures
            # correct return type based on isinstance branching - static analyzer cannot
            # infer the precise type through the generic parameter T
            return x.get_value_at_sim(sim_no)  # pyright: ignore[reportReturnType, reportUnknownVariableType]

        if isinstance(x, (StochasticScalar, FreqSevSims)):
            # A missing or single simulation has nothing to select from. Guard
            # None explicitly, since ``None <= 1`` would raise a TypeError.
            if x.n_sims is None or x.n_sims <= 1:
                return x

            if isinstance(sim_no, StochasticScalar):
                # Use the values of sim_no as integer positions into x.values.
                indices = sim_no.values.astype(int)
                return StochasticScalar(x.values[indices])

            # Main case: a single simulation index. NumPy integers are accepted
            # too, so an index taken from an array does not fall through and
            # return the whole variable.
            if isinstance(sim_no, (int, np.integer)):  # type: ignore[redundant-expr]
                # Type ignore: numpy array indexing returns element type which is compatible
                # with T | StochasticScalar in practice but type checker can't infer this
                return x.values[sim_no]  # type: ignore[return-value]

            return x

        if isinstance(x, Number):  # type: ignore[uneccesaryIsInstance]
            # Plain numbers do not vary by simulation.
            return x

        raise TypeError(
            f"Unsupported type for value at simulation: {type(x).__name__}.\n"
            f"Value: {x}\n"
            f"Expected one of: ProteusVariable, StochasticScalar, FreqSevSims, or "
            f"Number."
        )