Source code for catsim.estimation.numerical

"""Numerical estimation methods for ability estimation."""

import numpy
import numpy.typing as npt
from scipy.optimize import minimize_scalar

from .. import cat, irt
from ..irt import THETA_MAX_EXTENDED, THETA_MIN_EXTENDED
from ..item_bank import ItemBank
from .base import BaseEstimator

# Mathematical constants
GOLDEN_RATIO = (1 + 5**0.5) / 2



[docs]
class NumericalSearchEstimator(BaseEstimator):
  """Implement search algorithms in unimodal functions to find the maximum of the log-likelihood function.

  This class provides multiple numerical search methods for ability estimation in IRT,
  including ternary search, dichotomous search, Fibonacci search, and golden-section
  search, according to [Veliz20]_. Also check [Brent02]_. It is also possible to use
  the methods from :py:func:`scipy.optimize.minimize_scalar`.

  Parameters
  ----------
  tol : float, optional
      Tolerance for convergence in the optimization algorithm. Default is 1e-6.
  dodd : bool, optional
      Whether to employ Dodd's estimation heuristic [Dod90]_ when the response vector
      only has one kind of response (all correct or all incorrect, see
      :py:func:`catsim.cat.dodd`). Default is True.
  verbose : bool, optional
      Whether to print detailed information during optimization. Default is False.
  method : str, optional
      The search method to employ. Must be one of: 'ternary', 'dichotomous', 'fibonacci',
      'golden', 'brent', 'bounded', or 'golden2'. Default is 'bounded'.
  """

  __methods = frozenset(["ternary", "dichotomous", "fibonacci", "golden", "brent", "bounded", "golden2"])


[docs]
  @staticmethod
  def available_methods() -> frozenset[str]:
    """Get a set of available estimation methods.

    Returns
    -------
    frozenset[str]
        Set of available estimation methods: {'ternary', 'dichotomous', 'fibonacci',
        'golden', 'brent', 'bounded', 'golden2'}.
    """
    return NumericalSearchEstimator.__methods


  def __str__(self) -> str:
    """Return a string representation of the estimator."""
    return f"Numerical Search Estimator ({self.__search_method})"

  def __init__(
    self,
    tol: float = 1e-6,
    dodd: bool = True,
    verbose: bool = False,
    method: str = "bounded",
  ) -> None:
    """Initialize the estimator.

    Parameters
    ----------
    tol : float, optional
        Tolerance for convergence in the optimization algorithm. Default is 1e-6.
    dodd : bool, optional
        Whether to use Dodd's estimation heuristic for edge cases (all correct or
        all incorrect responses). Default is True.
    verbose : bool, optional
        Whether to print detailed information during optimization. Default is False.
    method : str, optional
        The numerical search method to use for optimization. Default is "bounded".

    Raises
    ------
    ValueError
        If the parameter `method` is not one of the available methods.
    """
    super().__init__(verbose)

    if method not in NumericalSearchEstimator.__methods:
      msg = f"Parameter 'method' must be one of {NumericalSearchEstimator.__methods}."
      raise ValueError(msg)

    self._tol = tol
    self._dodd = dodd
    self.__search_method = method


[docs]
  def estimate(
    self,
    index: int | None = None,
    item_bank: ItemBank | None = None,
    administered_items: list[int] | None = None,
    response_vector: list[bool] | None = None,
    est_theta: float | None = None,
  ) -> float:
    r"""Compute the theta value that maximizes the log-likelihood function for the given examinee.

    When this method is used inside a simulator, its arguments are automatically filled.
    Outside of a simulation, the user can also specify the arguments to use the Estimator
    as a standalone object.

    Parameters
    ----------
    index : int or None, optional
        Index of the current examinee in the simulator. Default is None.
    item_bank : ItemBank or None, optional
        An ItemBank containing item parameters. Default is None.
    administered_items : list[int] or None, optional
        A list containing the indexes of items that were already administered.
        Default is None.
    response_vector : list[bool] or None, optional
        A boolean list containing the examinee's answers to the administered items.
        Default is None.
    est_theta : float or None, optional
        A float containing the current estimated ability. Default is None.

    Returns
    -------
    float
        The current estimated ability :math:`\hat\theta`.

    Raises
    ------
    ValueError
        If required parameters are None when not using a simulator.
    """
    item_bank, administered_items, response_vector, est_theta = self._prepare_args(
      return_item_bank=True,
      return_administered_items=True,
      return_response_vector=True,
      return_est_theta=True,
      index=index,
      item_bank=item_bank,
      administered_items=administered_items,
      response_vector=response_vector,
      est_theta=est_theta,
    )

    if item_bank is None:
      msg = "item_bank parameter cannot be None"
      raise ValueError(msg)
    if administered_items is None:
      msg = "administered_items parameter cannot be None"
      raise ValueError(msg)
    if response_vector is None:
      msg = "response_vector parameter cannot be None"
      raise ValueError(msg)
    if est_theta is None:
      msg = "est_theta parameter cannot be None"
      raise ValueError(msg)

    self._calls += 1
    self._evaluations = 0

    summarized_answers = set(response_vector)

    # enter here if examinee has only answered correctly or incorrectly
    if len(summarized_answers) == 1:
      answer = summarized_answers.pop()

      # if the estimator was initialized with dodd = True,
      # use Dodd's estimation heuristic to return a theta value
      if self._dodd:
        candidate_theta = cat.dodd(est_theta, item_bank, answer)

      # otherwise, return positive or negative infinity,
      # in accordance with the definition of the MLE
      elif answer:
        candidate_theta = float("inf")
      else:
        candidate_theta = float("-inf")

      return candidate_theta

    if self.__search_method in {"ternary", "dichotomous"}:
      candidate_theta = self._solve_ternary_dichotomous(
        THETA_MAX_EXTENDED, THETA_MIN_EXTENDED, response_vector, item_bank.get_items(administered_items)
      )
    elif self.__search_method == "fibonacci":
      candidate_theta = self._solve_fibonacci(
        THETA_MAX_EXTENDED, THETA_MIN_EXTENDED, response_vector, item_bank.get_items(administered_items)
      )
    elif self.__search_method == "golden2":
      candidate_theta = self._solve_golden_section(
        THETA_MAX_EXTENDED, THETA_MIN_EXTENDED, response_vector, item_bank.get_items(administered_items)
      )
    elif self.__search_method in {"brent", "bounded", "golden"}:
      res = minimize_scalar(
        irt.negative_log_likelihood,
        bracket=(THETA_MIN_EXTENDED, THETA_MAX_EXTENDED),
        bounds=(THETA_MIN_EXTENDED, THETA_MAX_EXTENDED) if self.__search_method == "bounded" else None,
        method=self.__search_method,
        args=(response_vector, item_bank.get_items(administered_items)),
        tol=self._tol if self.__search_method != "bounded" else None,
      )
      self._evaluations = res.nfev
      candidate_theta = res.x

    if self._verbose:
      print(f"{self._evaluations} evaluations")

    return candidate_theta


  def _solve_ternary_dichotomous(
    self,
    b: float,
    a: float,
    response_vector: list[bool],
    item_params: npt.NDArray[numpy.floating],
  ) -> float:
    """Find the most likely ability using ternary or dichotomous search methods.

    Parameters
    ----------
    b : float
        The upper bound to search for the ability, in the ability/difficulty scale.
    a : float
        The lower bound to search for the ability, in the ability/difficulty scale.
    response_vector : list[bool]
        The responses given to the answered items.
    item_params : numpy.ndarray
        The parameter matrix of the answered items.

    Returns
    -------
    float
        The estimated ability.

    Raises
    ------
    ValueError
        If the search interval becomes invalid during iteration.
    """
    error = float("inf")
    while error >= self._tol:
      self._evaluations += 2

      if self.__search_method == "ternary":
        c = (b + 2 * a) / 3
        d = (2 * b + a) / 3
      elif self.__search_method == "dichotomous":
        m = (a + b) / 2
        c = m - (self._tol / 2)
        d = m + (self._tol / 2)

      left_side_ll = irt.log_likelihood(c, response_vector, item_params)
      right_side_ll = irt.log_likelihood(d, response_vector, item_params)

      if left_side_ll >= right_side_ll:
        b = d
      else:
        a = c

      if not (a <= c <= d <= b):
        msg = f"Invalid interval: a={a}, c={c}, d={d}, b={b}"
        raise ValueError(msg)

      candidate_theta = (b + a) / 2

      error = abs(b - a)
      if self.__search_method == "dichotomous":
        error /= 2

      if self._verbose:
        print(f"\t\tTheta: {candidate_theta}, LL: {max(left_side_ll, right_side_ll)}")
    return candidate_theta

  def _solve_fibonacci(
    self,
    b: float,
    a: float,
    response_vector: list[bool],
    item_params: npt.NDArray[numpy.floating],
  ) -> float:
    """Find the most likely ability using the Fibonacci search method.

    Parameters
    ----------
    b : float
        The upper bound to search for the ability, in the ability/difficulty scale.
    a : float
        The lower bound to search for the ability, in the ability/difficulty scale.
    response_vector : list[bool]
        The responses given to the answered items.
    item_params : numpy.ndarray
        The parameter matrix of the answered items.

    Returns
    -------
    float
        The estimated ability.
    """
    fib = [1, 1]
    n = 1

    # while (upper_bound - lower_bound) / fib[-1] > .001:
    while (b - a) / fib[-1] > self._tol:
      n += 1
      fib.append(fib[-1] + fib[-2])

    c = a + (fib[n - 2] / fib[n]) * (b - a)
    d = a + (fib[n - 1] / fib[n]) * (b - a)

    left_side_ll = irt.log_likelihood(c, response_vector, item_params)
    right_side_ll = irt.log_likelihood(d, response_vector, item_params)
    self._evaluations += 2

    while n != 2:  # noqa: PLR2004
      self._evaluations += 1

      n -= 1

      if left_side_ll >= right_side_ll:
        b = d
        d = c
        c = a + (fib[n - 2] / fib[n]) * (b - a)

        right_side_ll = left_side_ll
        left_side_ll = irt.log_likelihood(c, response_vector, item_params)
      else:
        a = c
        c = d
        d = a + (fib[n - 1] / fib[n]) * (b - a)

        left_side_ll = right_side_ll
        right_side_ll = irt.log_likelihood(d, response_vector, item_params)

      # assert a <= c <= d <= b

      if self._verbose:
        print(f"\t\tTheta: {(b + a) / 2}, LL: {max(left_side_ll, right_side_ll)}")
    return (b + a) / 2

  def _solve_golden_section(
    self,
    b: float,
    a: float,
    response_vector: list[bool],
    item_params: npt.NDArray[numpy.floating],
  ) -> float:
    """Find the most likely ability using the golden-section search method.

    Parameters
    ----------
    b : float
        The upper bound to search for the ability, in the ability/difficulty scale.
    a : float
        The lower bound to search for the ability, in the ability/difficulty scale.
    response_vector : list[bool]
        The responses given to the answered items.
    item_params : numpy.ndarray
        The parameter matrix of the answered items.

    Returns
    -------
    float
        The estimated ability.

    Raises
    ------
    ValueError
        If the golden section interval becomes invalid during iteration.
    """
    c = b + (a - b) / GOLDEN_RATIO
    d = a + (b - a) / GOLDEN_RATIO

    left_side_ll = irt.log_likelihood(c, response_vector, item_params)
    right_side_ll = irt.log_likelihood(d, response_vector, item_params)

    while abs(b - a) > self._tol:
      self._evaluations += 1

      if left_side_ll >= right_side_ll:
        b = d
        d = c
        c = b + (a - b) / GOLDEN_RATIO

        right_side_ll = left_side_ll
        left_side_ll = irt.log_likelihood(c, response_vector, item_params)
      else:
        a = c
        c = d
        d = a + (b - a) / GOLDEN_RATIO

        left_side_ll = right_side_ll
        right_side_ll = irt.log_likelihood(d, response_vector, item_params)

      if not (a < c <= d < b):
        msg = f"Invalid golden section interval: a={a}, c={c}, d={d}, b={b}"
        raise ValueError(msg)

      if self._verbose:
        print(f"\t\tTheta: {(b + a) / 2}, LL: {max(left_side_ll, right_side_ll)}")
    return (b + a) / 2

  @property
  def dodd(self) -> bool:
    """Whether Dodd's estimation heuristic [Dod90]_ will be used by the estimator.

    Dodd's method is used when the response vector is composed solely of correct or
    incorrect answers, to prevent maximum likelihood methods from returning -infinity
    or +infinity.

    Returns
    -------
    bool
        Boolean value indicating if Dodd's method will be used or not.

    See Also
    --------
    catsim.cat.dodd : Implementation of Dodd's estimation heuristic.
    """
    return self._dodd

  @property
  def method(self) -> str:
    """Get the estimator search method selected during instantiation.

    Returns
    -------
    str
        The search method ('ternary', 'dichotomous', 'fibonacci', 'golden',
        'brent', 'bounded', or 'golden2').
    """
    return self.__search_method