Source code for pal.variables

"""Multi-dimensional stochastic variables for actuarial modeling.

This module provides the ProteusVariable class, which represents multi-dimensional
stochastic variables commonly used in actuarial and risk modeling. A ProteusVariable
can contain different types of stochastic objects across multiple dimensions,
enabling complex risk factor modeling.

Key features:
- Multi-dimensional stochastic variables with named dimensions
- Support for various stochastic types (StochasticScalar, FreqSevSims, etc.)
- Mathematical operations across dimensions and simulations
- Correlation analysis and upsampling capabilities
- Export functionality for analysis and reporting

NOTE: The serialization/deserialization methods (from_csv, from_dict, from_series)
      are currently incomplete and have significant limitations. A comprehensive
      codec system is planned to address these issues.
      See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22

The ProteusVariable is designed for actuarial applications such as:
- Multi-factor risk modeling (e.g., frequency, severity, inflation)
- Portfolio-level aggregation across risk dimensions
- Scenario analysis with correlated risk factors
- Capital modeling with interdependent variables

Example:
    >>> from pal.stochastic_scalar import StochasticScalar
    >>> from pal.frequency_severity import FreqSevSims
    >>>
    >>> # Create a multi-dimensional risk variable
    >>> risk_var = ProteusVariable(
    ...     dim_name="insurance_risk",
    ...     values={
    ...         "frequency": StochasticScalar([10, 12, 8, 15]),
    ...         "severity": StochasticScalar([5000, 6000, 4500, 7000]),
    ...         "expense_ratio": StochasticScalar([0.3, 0.32, 0.28, 0.35])
    ...     }
    ... )
    >>> total_cost = (
    ...     risk_var["frequency"]
    ...     * risk_var["severity"]
    ...     * (1 + risk_var["expense_ratio"])
    ... )
"""

# standard library imports
from __future__ import annotations

import os
import typing as t
from numbers import Number

# third-party imports
import numpy as np
import numpy.typing as npt
import pandas as pd
import plotly.graph_objects as go  # type: ignore
import plotly.io as pio  # type: ignore
import scipy.stats

from . import maths as pnp

# local imports
from ._compat import Self
from .couplings import ProteusStochasticVariable
from .frequency_severity import FreqSevSims
from .stochastic_scalar import StochasticScalar
from .types import VectorLike

pio.templates.default = "none"

T = t.TypeVar("T")

__all__ = [
    "ProteusVariable",
]


def _format_number(val: int | float) -> str:
    if isinstance(val, int) or abs(val) >= 100:
        return f"{val:,.0f}"
    if abs(val) >= 1:
        return f"{val:,.4f}"
    return f"{val:.6g}"


[docs] class ProteusVariable(t.Generic[T]): """A generic, homogeneous container for multivariate variables in simulations. ProteusVariable is a hierarchical structure that holds multiple variables of the SAME type (homogeneous container). Each instance must contain either all scalars, all vectors (like StochasticScalar), or all nested ProteusVariables - but never a mix of different types. Type Parameter: T: The type of values stored. By convention, T should be a ScalarOrVector type (NumericLike | VectorLike), though the parameter is unconstrained to allow flexible type inference. Usage with non-ScalarOrVector types may not be fully supported by all operations. Key Features: - **Homogeneous**: All values in a single instance must be the same type. Like List[T], you cannot mix types within one container. - **Type Safety**: Operations like mean() return type T, preserving type information through the computation. - **Nesting**: ProteusVariable containing ProteusVariable enables hierarchical data structures (e.g., risks by region by peril) - **Dictionary Access**: Sub-elements accessed via [] notation with string keys or integer indices Examples: >>> # Homogeneous scalar container >>> scalar_risks = ProteusVariable( ... dim_name="risk_amounts", ... values={"fire": 100000, "flood": 200000} # All int ... ) >>> # Homogeneous vector container >>> vector_risks = ProteusVariable( ... dim_name="stochastic_losses", ... values={ ... "fire": StochasticScalar([100, 200, 300]), ... "flood": StochasticScalar([150, 250, 350]) ... } # All StochasticScalar ... ) >>> # Homogeneous nested container >>> nested_risks = ProteusVariable( ... dim_name="regions", ... values={ ... "north": scalar_risks, ... "south": scalar_risks ... } # All ProteusVariable instances ... ) >>> # INVALID - mixing types not allowed >>> # mixed = ProteusVariable(values={"a": 100, "b": StochasticScalar([1])}) >>> # This would violate homogeneity and cause type errors Note: Statistical operations should be performed using numpy and scipy functions directly on ProteusVariable instances. For example: - Use np.percentile(variable, p) - Use np.mean(variable) - Use pal.stats.tvar(variable, p) """ dim_name: str values: dict[str, T] dimensions: list[str]
[docs] def __init__( self, dim_name: str, values: dict[str, T], ): """Initialize a ProteusVariable. Args: dim_name: Name of the dimension. values: A dict containing variables that must support PAL variable operations. Raises: TypeError: If values is not a mapping type. """ self.dim_name: str = dim_name # TODO: Clarify whether the values dict is intended to be mutable during the # variable's lifetime, or if it should be treated as immutable after # initialization. Consider using a frozen dict if immutability is desired. # See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/20 self.values = values self.dimensions = [dim_name] self._dimension_set = set(self.dimensions) # Ensure that values is a mapping type if not isinstance(values, dict): # type: ignore[redundant-expr] raise TypeError(f"Expected a mapping (dict-like) for 'values', got {type(values).__name__}") # check the number of simulations in each variable self.n_sims = None for value in ( self.values.values() if isinstance(self.values, dict) else self.values # type: ignore[reportUnknownMemberType] ): if isinstance(value, ProteusVariable): if self._dimension_set.intersection(value._dimension_set) or self.dim_name == value.dim_name: raise ValueError("Duplicate dimension names in ProteusVariable hierarchy.") self._dimension_set.intersection_update(value.dimensions) self.dimensions.extend(value.dimensions) if self.n_sims is None: if isinstance(value, ProteusStochasticVariable): self.n_sims = value.n_sims else: self.n_sims = 1 elif isinstance(value, ProteusStochasticVariable): if value.n_sims != self.n_sims: if self.n_sims == 1: self.n_sims = value.n_sims else: raise ValueError("Number of simulations do not match.")
def __len__(self) -> int: """Return the number of elements in the variable.""" return len(self.values) def __array__(self, dtype: t.Any = None) -> npt.NDArray[t.Any]: """Convert ProteusVariable to numpy array for basic operations. This method enables ProteusVariable to work with numpy functions like np.sum(), making it VectorLike protocol compliant. Current implementation provides basic functionality by concatenating all values into a 1D array. NOTE: This is a simplified implementation. Complex nested container scenarios and multi-dimensional operations need architectural decisions about data representation and operation semantics. See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/23 Args: dtype: Optional data type for the resulting array. Returns: A numpy array created by concatenating all values. Raises: NotImplementedError: For mismatched simulation lengths or other complex scenarios requiring architectural decisions. """ # For basic 1D operations like np.sum(), concatenate all values arrays = [np.asarray(value) for value in self.values.values()] # If we have any scalars (0D arrays), convert them to 1D arrays with single # element processed_arrays: list[npt.NDArray[t.Any]] = [] for arr in arrays: if arr.ndim == 0: # Convert scalar to 1D array with single element processed_arrays.append(np.array([arr.item()])) else: processed_arrays.append(arr) # Now check if all 1D arrays have the same length (simulation dimension) lengths = [len(arr) for arr in processed_arrays] if len(set(lengths)) > 1: raise NotImplementedError( "Array conversion not supported for ProteusVariable with " "mismatched simulation lengths. Use .upsample() first." ) # Concatenate arrays - this creates a 1D array suitable for np.sum() result = np.concatenate(processed_arrays) if dtype is not None: result = result.astype(dtype) return result def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: t.Any, **kwargs: t.Any) -> ProteusVariable[T]: """Handle numpy universal functions applied to ProteusVariable objects. This method enables ProteusVariable objects to work with numpy ufuncs by recursively applying the ufunc to the hierarchical structure of values. Args: ufunc: The numpy universal function to apply. method: The method name (only "__call__" is supported). *inputs: Input arguments to the ufunc. **kwargs: Keyword arguments to pass to the ufunc. Returns: A new ProteusVariable with the ufunc applied to its values. Raises: NotImplementedError: If method is not "__call__". """ if method != "__call__": raise NotImplementedError(f"Method {method} not implemented for ProteusVariable.") def recursive_apply(*items: t.Any, **kwargs: t.Any) -> t.Any: # If none of the items is a ProteusVariable (i.e. a container), then # assume they are leaf nodes (e.g., numbers or stochastic types) and # simply call ufunc. if not any(isinstance(item, ProteusVariable) for item in items): # For stochastic types that implement __array_ufunc__, this call will # automatically delegate to their own __array_ufunc__. return ufunc(*items, **kwargs) # Otherwise, at least one of the items is a container. # We assume that the container structure is consistent across items. first_container = items[[i for i, item in enumerate(items) if isinstance(item, ProteusVariable)][0]] # if the first container is a ProteusVariable, we can assume that # all other items are also ProteusVariables or compatible types. if not isinstance(first_container, ProteusVariable): raise TypeError(f"No {type(self).__name__} found in inputs, cannot apply ufunc.") # Process dictionary containers. if isinstance( first_container.values, # type: ignore[reportUnknownMemberType] dict, ): new_data: dict[str, t.Any] = {} # Iterate over each key in the container. # Type ignore: Runtime type narrowing - we're intentionally checking # types at runtime to handle heterogeneous inputs from numpy ufuncs for key in first_container.values: # type: ignore[reportUnknownMemberType] # noqa: E501 new_items: list[t.Any] = [] for item in items: # Assumes that data types are homogeneous across nodes ie. if # the parent ProteusVariable contains dicts, then children # should also contain dicts. if isinstance(item, ProteusVariable): # Type ignore: Runtime type checking for structural # validation. We need to verify dict structure at runtime # for ufunc recursion vals = item.values # type: ignore[reportUnknownMemberType] if not isinstance(vals, dict): # type: ignore[redundant-expr] raise TypeError( f"Expected dict values in {type(self).__name__}, but got {type(vals).__name__}." # type: ignore[reportArgumentType] # noqa: E501 ) new_items.append(vals[key]) else: new_items.append(item) new_data[key] = recursive_apply(*new_items, **kwargs) # Return ProteusVariable without type parameter: The type is determined # at runtime through recursive_apply, not statically knowable return ProteusVariable(first_container.dim_name, new_data) # In case data is not a dict, try applying ufunc directly. # Type ignore: Return type depends on runtime ufunc behavior and value # types, not statically determinable in this dynamic dispatch context return ufunc(first_container.values, **kwargs) # type: ignore[return-value] # Type ignore: recursive_apply's return type is determined at runtime based on # the actual ufunc and input types - this method handles arbitrary numpy # operations with dynamic type resolution return recursive_apply(*inputs, **kwargs) # type: ignore[return-value] def __array_function__( self, func: t.Any, _: tuple[t.Any, ...], args: tuple[t.Any, ...], kwargs: dict[str, t.Any], ) -> t.Any: """Handle numpy array functions applied to ProteusVariable objects. This method enables ProteusVariable objects to work with numpy array functions by extracting the underlying values, applying the function, and reconstructing the ProteusVariable with the result. Special handling for mean(): Returns a ProteusVariable where each key's value is replaced by its mean, preserving the original structure. Args: func: The numpy array function to apply. _: Tuple of types involved in the operation (unused). args: Positional arguments to the function. kwargs: Keyword arguments to pass to the function. Returns: A new ProteusVariable with the function applied to its values. """ # Special handling for mean() to preserve ProteusVariable structure if func.__name__ == "mean" and len(args) == 1 and args[0] is self: mean_values: dict[str, T] = {} for key, value in self.values.items(): # Use pnp.mean() for all values to ensure consistent behavior # across all PAL types (StochasticScalar, FreqSevSims, etc.) mean_values[key] = pnp.mean(value) return ProteusVariable(dim_name=self.dim_name, values=mean_values) parsed_args: list[t.Any] = [] for arg in args: if arg is self: # For the ProteusVariable itself, stack its dictionary values as columns value_arrays = [np.asarray(value) for value in self.values.values()] parsed_args.append(np.column_stack(value_arrays)) else: parsed_args.append(arg) if "axis" in kwargs: # Nothing to do here as the axis is already specified. pass else: if func.__name__ in ["cumsum", "cumprod", "diff"]: # For functions that need axis specification, add axis=1 to kwargs kwargs["axis"] = 1 elif func.__name__ in ["sum", "mean", "std", "var"]: # For reduction operations, check the type of values to determine axis # behavior... first_value = next(iter(self.values.values())) if self.values else None # Check if we have vector-like values (StochasticScalar, FreqSevSims, # etc.) if first_value is not None and isinstance(first_value, VectorLike): # Vector-like values: use axis=1 to sum across dimensions (row-wise) # This preserves simulation structure for StochasticScalar objects kwargs["axis"] = 1 # In case this if doesn't match, for scalar values, use default (no # axis) which will return a scalar result. else: # In all other cases, do not specify axis and let numpy decide pass temp = func(*parsed_args, **kwargs) # Handle 0D (scalar), 1D and 2D results if temp.ndim == 0: # If result is scalar (0D), return the scalar directly return temp.item() if temp.ndim == 1: # Check if this is a reduction result from vector-like values first_value = next(iter(self.values.values())) if self.values else None if ( func.__name__ in ["sum", "mean", "std", "var"] and first_value is not None and isinstance(first_value, VectorLike) ): # Reduction of vector-like values: return a single StochasticScalar result = StochasticScalar(temp) # Merge coupling groups from all original values for value in self.values.values(): if hasattr(value, "coupled_variable_group"): # Type ignore: we know that this attribute exists because we've # done a runtime check above. result.coupled_variable_group.merge( value.coupled_variable_group # type: ignore[attr-defined] ) return result # Other 1D results: distribute evenly across keys n_keys = len(self.values.keys()) chunk_size = len(temp) // n_keys return ProteusVariable( self.dim_name, { key: StochasticScalar(temp[i * chunk_size : (i + 1) * chunk_size]) for i, key in enumerate(self.values.keys()) }, ) if temp.ndim == 2: # If result is 2D, use columns return ProteusVariable( self.dim_name, {key: StochasticScalar(temp[:, i]) for i, key in enumerate(self.values.keys())}, ) # This should be unreachable - we've handled 0D, 1D, and 2D arrays raise NotImplementedError( f"Unexpected array dimensionality: {temp.ndim}D array returned by " f"{func.__name__}. Only 0D (scalar), 1D, and 2D arrays are supported." ) def __bool__(self) -> bool: raise ValueError( "ProteusVariable does not have a single truth value. Use explicit checks on its values instead." ) def __iter__(self) -> t.Iterator[T]: """Iterate over the values in the variable.""" return iter(self.values.values()) def __contains__(self, value: object) -> bool: """Check if value is in the container. Required for Sequence protocol compatibility. """ return value in self.values.values() def __reversed__(self) -> t.Iterator[T]: """Return a reverse iterator over the values. Required for Sequence protocol compatibility. """ return reversed(list(self.values.values())) def __repr__(self) -> str: lines = [f"ProteusVariable ({self.dim_name}):"] max_key_len = max(len(k) for k in self.values) for key, val in self.values.items(): if isinstance(val, ProteusVariable): nested = repr(val).replace("\n", "\n ") lines.append(f" {key}: {nested}") elif isinstance(val, (int, float)): lines.append(f" {key:<{max_key_len}} {_format_number(val):>14}") else: lines.append(f" {key:<{max_key_len}} {val!r}") return "\n".join(lines) # Arithmetic operations def __add__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a + b)) def __radd__(self, other: t.Any) -> Self: return self.__add__(other) def __sub__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a - b)) def __rsub__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: b - a)) def __mul__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a * b)) def __rmul__(self, other: t.Any) -> Self: return self.__mul__(other) def __truediv__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a / b)) def __rtruediv__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: b / a)) def __pow__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a**b)) def __rpow__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: b**a)) def __neg__(self) -> Self: """Return the negation of the variable.""" return t.cast(Self, self._binary_operation(self, lambda a, _: -a)) # Comparison operations def __lt__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a < b)) def __rlt__(self, other: t.Any) -> Self: return self.__ge__(other) def __le__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a <= b)) def __rle__(self, other: t.Any) -> Self: return self.__gt__(other) def __gt__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a > b)) def __rgt__(self, other: t.Any) -> Self: return self.__le__(other) def __ge__(self, other: t.Any) -> Self: return t.cast(Self, self._binary_operation(other, lambda a, b: a >= b)) def __rge__(self, other: t.Any) -> Self: return self.__lt__(other) # Equality operations def __eq__(self, other: object) -> Self: # type: ignore[override] return t.cast(Self, self._binary_operation(other, lambda a, b: a == b)) def __ne__(self, other: object) -> Self: # type: ignore[override] return t.cast(Self, self._binary_operation(other, lambda a, b: a != b)) def __getitem__(self, key: int | str) -> T: # FIXME: This assumes that the ordering of the values never changes. At the # moment, this is not true. The values are stored in mutable container! if isinstance(key, int): return list(self.values.values())[key] if isinstance(key, str): # type: ignore[redundant-expr] return self.values[key] raise TypeError(f"Key must be an integer or string, got {type(key).__name__}.") def __setitem__(self, key: int | str, value: T) -> None: if isinstance(key, int): dict_key = list(self.values.keys())[key] self.values[dict_key] = value if isinstance(key, str): # type: ignore[redundant-expr] self.values[key] = value
[docs] def count(self, value: T) -> int: """Count occurrences of value in the container. Required for Sequence protocol compatibility. """ return list(self.values.values()).count(value)
[docs] def index(self, value: T, start: int = 0, stop: int | None = None) -> int: """Return index of first occurrence of value. Required for Sequence protocol compatibility. Raises: ValueError: If value is not found. """ values_list = list(self.values.values()) if stop is None: stop = len(values_list) try: return values_list.index(value, start, stop) except ValueError as error: raise ValueError(f"{value!r} is not in ProteusVariable") from error
[docs] def get_value_at_sim(self, sim_no: int | StochasticScalar) -> ProteusVariable[T | StochasticScalar]: """Get values at specific simulation number(s). Args: sim_no: Simulation index(es) to extract. Can be a single numeric value, a list of integers, or a VectorLike object such as StochasticScalar. Returns: A new ProteusVariable with values at the specified simulation indices. """ # FIXME: this makes a bit of a mess of the interface. Would make sense to just # make use of the __getitem__ method instead. Since ProteusVariable is # SequenceLike, it should support indexing with integers and strings. # For this to work, we need to be sure that the contents of values is indeed # VectorLike. Remember that ProteusVariables may be nested and a ProteusVariable # will not be VectorLike. return ProteusVariable( dim_name=self.dim_name, values={k: self._get_value_at_sim_helper(v, sim_no) for k, v in self.values.items()}, )
[docs] def upsample(self, n_sims: int) -> ProteusVariable[T]: """Upsample the variable to the specified number of simulations.""" if self.n_sims == n_sims: return self return ProteusVariable( dim_name=self.dim_name, values={ key: (value.upsample(n_sims) if isinstance(value, ProteusStochasticVariable) else value) for key, value in self.values.items() }, )
[docs] def sum(self) -> T: """Return the sum across the outer dimension.""" return sum(self) # type: ignore[arg-type]
[docs] def validate_freqsev_consistency(self, _is_nested: bool = False) -> tuple[bool, str, npt.NDArray[t.Any] | None]: """Validate that all FreqSevSims have consistent sim_index. When a ProteusVariable contains multiple FreqSevSims objects, operations like sum() or aggregation require that all FreqSevSims have identical simulation indices for meaningful results. This method recursively checks for that consistency across nested ProteusVariable structures. All leaf values in the ProteusVariable tree must be FreqSevSims with matching simulation indices. Nested ProteusVariable structures are supported and will be recursively validated. Use this validation before performing aggregation operations on ProteusVariable instances containing FreqSevSims to ensure the results will be valid. Args: _is_nested: Internal parameter for tracking recursion depth. Do not set manually. Returns: A tuple of (is_valid, error_message, sim_index): - is_valid: True if all leaf values are FreqSevSims with matching sim_index, or if there are 0 FreqSevSims (trivially consistent) - error_message: Empty string if valid, descriptive error message otherwise - sim_index: Representative sim_index array if valid and FreqSevSims found, None if no FreqSevSims or invalid Example: >>> freq_sev_1 = FreqSevSims([0, 1, 2], [10, 20, 30], 3) >>> freq_sev_2 = FreqSevSims([0, 1, 2], [15, 25, 35], 3) >>> var = ProteusVariable( ... "losses", {"fire": freq_sev_1, "flood": freq_sev_2} ... ) >>> is_valid, msg, sim_idx = var.validate_freqsev_consistency() >>> if is_valid: ... total = var.sum() # Safe to sum """ try: reference_sim_index: npt.NDArray[t.Any] | None = None for key, value in self.values.items(): if isinstance(value, FreqSevSims): if reference_sim_index is None: reference_sim_index = value.sim_index elif not np.array_equal(value.sim_index, reference_sim_index): return False, f"Simulation index mismatch at key {key}", None elif isinstance(value, ProteusVariable): # Recursively validate nested ProteusVariable is_valid, error, nested_sim_index = value.validate_freqsev_consistency(_is_nested=True) if not is_valid: return False, error, None # Check consistency with current level's sim_index if nested_sim_index is not None: if reference_sim_index is None: reference_sim_index = nested_sim_index elif not np.array_equal(nested_sim_index, reference_sim_index): return ( False, f"Simulation index mismatch at key {key}", None, ) else: # Found a non-FreqSevSims, non-ProteusVariable value level = "Immediate" if not _is_nested else "Nested" return ( False, f"{level} value for key {key} is {type(value).__name__}, not FreqSevSims", None, ) return True, "", reference_sim_index except Exception as e: return False, f"Error validating FreqSevSims consistency: {str(e)}", None
[docs] @classmethod def from_csv( cls, file_name: str, dim_name: str, values_column: str, simulation_column: str = "Simulation", ) -> ProteusVariable[StochasticScalar]: """Import a ProteusVariable from a CSV file. This method currently has significant limitations and will be replaced with a more comprehensive serialization system. Current Limitations: - Only supports one-dimensional variables - Always creates StochasticScalar values regardless of intended type - Cannot preserve generic type information through deserialization - No support for nested ProteusVariable structures Args: file_name: Path to the CSV file to read dim_name: Name of the dimension column in the CSV values_column: Name of the column containing the values simulation_column: Name of the column containing simulation indices Returns: ProteusVariable with StochasticScalar values loaded from the CSV TODO: Implement comprehensive codec system for proper serialization See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22 """ # Type ignore: pandas-stubs has complex overloads causing Pyright to report # the function signature as "partially unknown" despite correct usage df: pd.DataFrame = pd.read_csv(file_name) # type: ignore[misc] pivoted_df = df.pivot(index=simulation_column, columns=dim_name, values=values_column) count = df[dim_name].value_counts() # Type ignore: pandas-stubs overloads cause "partially unknown" warnings pivoted_df.sort_index(inplace=True) # type: ignore[misc] # classmethods can't preserve generic type parameters so we need a type ignore # here. When data is loaded, the contents of the ProteusVariable will be # whatever was present in the CSV file. It may be necessary to separate these # factory functions from ProteusVariable completely. result = cls( dim_name, { str(label): StochasticScalar(pivoted_df[label].values[: count[label]]) # type: ignore[misc] for label in df[dim_name].unique() # type: ignore[misc] }, ) result.n_sims = max(count) return result # type: ignore
[docs] @classmethod def from_dict( cls, data: dict[str, list[float]], ) -> ProteusVariable[StochasticScalar]: """Create a ProteusVariable from a dictionary. This method currently has significant limitations and will be replaced with a more comprehensive serialization system. Current Limitations: - Only supports one-dimensional variables - Always creates StochasticScalar values from float lists - Cannot preserve generic type information - No support for nested structures or other value types Args: data: Dictionary mapping dimension labels to lists of float values Returns: ProteusVariable with StochasticScalar values created from the data TODO: Implement comprehensive codec system for proper serialization See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22 """ # Type ignore: Classmethods can't preserve generic type parameters. # This always creates StochasticScalar values regardless of T. result = cls( # type: ignore[arg-type] dim_name="Dim1", values={str(label): StochasticScalar(data[label]) for label in data.keys()}, # type: ignore[arg-type] ) result.n_sims = max([len(v) for v in data.values()]) return result # type: ignore
[docs] @classmethod def from_series(cls, data: pd.Series) -> ProteusVariable[float]: """Create a ProteusVariable from a pandas Series. This method currently has significant limitations and will be replaced with a more comprehensive serialization system. Current Limitations: - Only supports one-dimensional variables - Creates scalar values, not StochasticScalar - Cannot preserve generic type information - Limited to single simulation (n_sims=1) Args: data: Pandas Series with values to load Returns: ProteusVariable with scalar values from the Series TODO: Implement comprehensive codec system for proper serialization See: https://github.com/ProteusLLP/proteusllp-actuarial-library/issues/22 """ # Type ignore: Classmethods can't preserve generic type parameters. # The values type depends on the Series content, not the generic T. result = cls( # type: ignore[arg-type] dim_name=str(data.index.name), values={str(label): data[label] for label in data.index}, # type: ignore[arg-type] ) result.n_sims = 1 return result # type: ignore
[docs] def correlation_matrix(self, correlation_type: str = "spearman") -> list[list[float]]: """Compute correlation matrix between variables.""" # validate type correlation_type = correlation_type.lower() if correlation_type not in ["linear", "spearman", "kendall"]: raise ValueError( f"Invalid correlation_type: '{correlation_type}'. Must be one of: 'linear', 'spearman', 'kendall'" ) if not hasattr(self[0], "values"): raise TypeError(f"First element must have 'values' attribute, got {type(self[0]).__name__}") n = len(self.values) result: list[list[float]] = [[0.0] * n] * n values: list[npt.NDArray[t.Any]] = [t.cast(npt.NDArray[t.Any], self[i]) for i in range(len(self.values))] if correlation_type.lower() in ["spearman", "kendall"]: # rank the variables first for i, value in enumerate(values): values[i] = scipy.stats.rankdata(value) # type: ignore[assignment] if correlation_type == "kendall": for i, value1 in enumerate(values): for j, value2 in enumerate(values): result[i][j] = float( scipy.stats.kendalltau(value1, value2).statistic # type: ignore[arg-type] ) else: result = np.corrcoef(values).tolist() return result
[docs] def show_histogram(self, title: str | None = None) -> None: """Show a histogram of the variable values. Args: title (str | None): The title of the histogram. If None, no title is set. """ if os.getenv("PAL_SUPPRESS_PLOTS", "").lower() == "true": return fig = go.Figure(layout=go.Layout(title=title)) for label, value in self.values.items(): try: # Type ignore: plotly-stubs has incomplete type information fig.add_trace(go.Histogram(x=value.values(), name=label)) # type: ignore[union-attr,misc] except AttributeError: # not all values are ProteusVariable or StochasticScalar and therefore # do not have a values() method. pass # Type ignore: plotly-stubs has incomplete type information fig.show() # type: ignore[misc]
[docs] def show_cdf(self, title: str | None = None) -> None: """Plot the cumulative distribution function (cdf) of the variable values. Args: title: Optional title for the cdf. If None, no title is set. """ if os.getenv("PAL_SUPPRESS_PLOTS", "").lower() == "true": return fig = go.Figure(layout=go.Layout(title=title)) for label, value in self.values.items(): if not isinstance(value, (ProteusVariable, ProteusStochasticVariable)): raise TypeError(f"{type(value).__name__} does not support CDF plotting. ") if value.n_sims is None or value.n_sims <= 1: raise ValueError("CDF can only be plotted for variables with multiple simulations.") # Type ignore: plotly-stubs has incomplete type information fig.add_trace( # type: ignore[misc] go.Scatter( # Type ignore: value.values is known to exist due to isinstance # check x=np.sort(np.array(value.values)), # type: ignore[attr-defined] y=np.arange(value.n_sims) / value.n_sims, name=label, ) ) # Type ignore: plotly-stubs has incomplete type information fig.update_xaxes(title_text="Value") # type: ignore[misc] # Type ignore: plotly-stubs has incomplete type information fig.update_yaxes(title_text="Cumulative Probability") # type: ignore[misc] # Type ignore: plotly-stubs has incomplete type information fig.show() # type: ignore[misc]
def _binary_operation( self, other: object, operation: t.Callable[[t.Any, t.Any], t.Any], ) -> t.Any: if isinstance(other, ProteusVariable): if self.dimensions != other.dimensions: raise ValueError("Dimensions of the two variables do not match.") return ProteusVariable( dim_name=self.dim_name, values={ # Type ignore: Runtime type checking - values is dict-like at this # point. We've had to lean on runtime checks here over static. key: operation(value, other.values[key]) # type: ignore[index] for key, value in self.values.items() }, ) return ProteusVariable( dim_name=self.dim_name, values={key: operation(value, other) for key, value in self.values.items()}, ) def _get_value_at_sim_helper( self, x: T, sim_no: int | StochasticScalar, ) -> T | StochasticScalar: """Helper method to get value at simulation for a single element.""" if isinstance(x, ProteusVariable): # Type ignore: Private helper method with runtime type checks ensures # correct return type based on isinstance branching - static analyzer cannot # infer the precise type through the generic parameter T return x.get_value_at_sim(sim_no) # pyright: ignore[reportReturnType, reportUnknownVariableType] if isinstance(x, StochasticScalar) or isinstance(x, FreqSevSims): # Handle StochasticScalar and FreqSevSims types if x.n_sims <= 1: # If n_sims is 1 or None, return the value directly return x if isinstance(sim_no, StochasticScalar): # Extract all values and return a new StochasticScalar with those # indices indices = sim_no.values.astype(int) return StochasticScalar(x.values[indices]) # Handle the main case: extract value at specific simulation index if isinstance(sim_no, int): # type: ignore[redundant-expr] # Type ignore: numpy array indexing returns element type which is compatible # with T | StochasticScalar in practice but type checker can't infer this return x.values[sim_no] # type: ignore[return-value] return x if isinstance(x, Number): # type: ignore[uneccesaryIsInstance] # If x is a numeric type, return it directly return x raise TypeError( f"Unsupported type for value at simulation: {type(x).__name__}.\n" f"Value: {x}\n" f"Expected one of: ProteusVariable, StochasticScalar, FreqSevSims, or " f"Number." )