Source code for skcriteria.cmp.ranks_cmp

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# License: BSD-3 (https://tldrlegal.com/license/bsd-3-clause-license-(revised))
# Copyright (c) 2016-2021, Cabral, Juan; Luczywo, Nadia
# Copyright (c) 2022, 2023, 2024 QuatroPe
# All rights reserved.

# =============================================================================
# DOCS
# =============================================================================

"""Ranking comparison routines."""

# =============================================================================
# IMPORTS
# =============================================================================

import itertools as it
from collections import defaultdict
from collections.abc import Sequence

import matplotlib.pyplot as plt

import methodtools

import pandas as pd

from scipy.spatial import distance

import seaborn as sns

from sklearn import metrics as _skl_metrics

from ..agg import RankResult
from ..utils import (
    AccessorABC,
    Bunch,
    DiffEqualityMixin,
    diff,
    doc_inherit,
    unique_names,
)


# =============================================================================
# CONSTANTS
# =============================================================================

# Axis / color-bar labels used by the plotter, keyed by the boolean
# ``untied`` flag (looked up as ``RANKS_LABELS[untied]``).
RANKS_LABELS = {
    True: "Untied ranks (lower is better)",
    False: "Ranks (lower is better)",
}


# =============================================================================
# COMPARATOR
# =============================================================================


class RanksComparator(Sequence, DiffEqualityMixin):
    """Rankings comparator object.

    This class is intended to contain a collection of rankings on which you
    want to do comparative analysis.

    All rankings must have exactly the same alternatives, although their
    order may vary.

    All methods support the ``untied`` parameter, which serves to untie
    rankings in case there are results that can assign more than one
    alternative to the same position (e.g. ``ELECTRE2``).

    Parameters
    ----------
    ranks : list
        List of (name, ranking) tuples of ``skcriteria.agg.RankResult``
        with the same alternatives.

    See Also
    --------
    skcriteria.cmp.mkrank_cmp : Convenience function for simplified
        ranks comparator construction.

    """

    _skcriteria_dm_type = "ranks_comparator"
    _skcriteria_parameters = ["ranks"]

    def __init__(self, ranks):
        # Materialize first: ``ranks`` may be any iterable (e.g. a zip).
        ranks = list(ranks)
        self._validate_ranks(ranks)
        self._ranks = ranks

    # INTERNALS ===============================================================

    def _validate_ranks(self, ranks):
        """Check that ``ranks`` is a valid list of (name, ranking) pairs.

        Raises
        ------
        ValueError
            If fewer than two rankings are given, if a name is not a
            ``str``, if a name is duplicated, or if a ranking does not have
            exactly the same alternatives as the first one.
        TypeError
            If a ranking is not a ``RankResult`` instance.

        """
        if len(ranks) <= 1:
            raise ValueError("Please provide more than one ranking")

        used_names = set()

        # The reference set is taken from the first ranking, but only after
        # that ranking has passed the type check below; otherwise an invalid
        # first element would fail with an unrelated AttributeError.
        first_alternatives = None

        for name, part in ranks:
            if not isinstance(name, str):
                raise ValueError("'name' must be instance of str")

            if not isinstance(part, RankResult):
                raise TypeError("ranks must be instances of 'madm.RankResult'")

            if name in used_names:
                raise ValueError(f"Duplicated name {name!r}")
            used_names.add(name)

            if first_alternatives is None:
                first_alternatives = set(part.alternatives)
                continue

            mismatch = first_alternatives.symmetric_difference(
                part.alternatives
            )
            if mismatch:
                # Alternatives are not necessarily strings, so convert them
                # before joining; sort to keep the message deterministic.
                miss_str = ", ".join(sorted(map(str, mismatch)))
                raise ValueError(
                    f"Some ranks miss the alternative/s: {miss_str!r}"
                )

    # PROPERTIES ==============================================================

    @property
    def ranks(self):
        """List of ranks in the comparator."""
        # A new list is returned so callers cannot alter the internal one.
        return list(self._ranks)

    @property
    def named_ranks(self):
        """Dictionary-like object, with the following attributes.

        Read-only attribute to access any rank parameter by user given name.
        Keys are ranks names and values are ranks parameters.

        """
        return Bunch("ranks", dict(self.ranks))

    # DIFF! ===================================================================

    @doc_inherit(DiffEqualityMixin.diff)
    def diff(
        self, other, rtol=1e-05, atol=1e-08, equal_nan=True, check_dtypes=False
    ):
        # No docstring on purpose: it is provided by ``doc_inherit``.

        def rank_allclose(ranks_a, ranks_b):
            # Two rank lists match when they have the same length and every
            # positional pair shares the name and shows no differences.
            if len(ranks_a) != len(ranks_b):
                return False

            for (ra_name, ra), (rb_name, rb) in zip(ranks_a, ranks_b):
                if ra_name != rb_name:
                    return False

                radiff = ra.diff(
                    rb,
                    rtol=rtol,
                    atol=atol,
                    equal_nan=equal_nan,
                    check_dtypes=check_dtypes,
                )
                if radiff.has_differences:
                    return False

            return True

        # ``diff`` here is the module-level helper, not this method.
        members = {"ranks": rank_allclose}
        the_diff = diff(self, other, **members)
        return the_diff

    # MAGIC! ==================================================================

    def __repr__(self):
        """x.__repr__() <==> repr(x)."""
        name = type(self).__name__
        ranks_names = [rn for rn, _ in self._ranks]
        return f"<{name} [ranks={ranks_names!r}]>"

    def __len__(self):
        """Return the number of rankings to compare."""
        return len(self._ranks)

    def __getitem__(self, ind):
        """Return a sub-comparator or a single ranking.

        - An ``int`` returns the ranking stored at that position.
        - A ``str`` returns the ranking registered under that name.
        - A ``slice`` (only a step of 1 is supported) returns a new
          comparator of the same class built from the selected
          (name, ranking) pairs. The copy is shallow: the ranking objects
          are shared with the original comparator.

        Raises
        ------
        ValueError
            If the slice step is not 1 (or ``None``), or if the slice
            selects fewer than two rankings.
        KeyError
            If ``ind`` is not a slice, an int or a str.

        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                cname = type(self).__qualname__
                raise ValueError(f"{cname} slicing only supports a step of 1")
            return self.__class__(self.ranks[ind])
        elif isinstance(ind, int):
            # each entry is a (name, ranking) pair; return the ranking
            return self._ranks[ind][-1]
        elif isinstance(ind, str):
            return self.named_ranks[ind]
        raise KeyError(ind)

    def __hash__(self):
        """x.__hash__() <==> hash(x)."""
        return id(self)

    # TO DATA =================================================================

    def to_dataframe(self, *, untied=False):
        """Convert the entire RanksComparator into a dataframe.

        The alternatives are the rows, and the different rankings are the
        columns.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.

        Returns
        -------
        :py:class:`pd.DataFrame`
            A RanksComparator as pandas DataFrame.

        """
        columns = {
            rank_name: rank.to_series(untied=untied)
            for rank_name, rank in self._ranks
        }

        df = pd.DataFrame.from_dict(columns)
        df.columns.name = "Method"

        return df

    def corr(self, *, untied=False, **kwargs):
        """Compute pairwise correlation of rankings, excluding NA/null values.

        By default the pearson correlation coefficient is used.

        Please check the full documentation of a ``pandas.DataFrame.corr()``
        method for details about the implementation.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the
            ``pandas.DataFrame.corr()`` method.

        Returns
        -------
        :py:class:`pd.DataFrame`
            A DataFrame with the correlation between rankings.

        """
        return self.to_dataframe(untied=untied).corr(**kwargs)

    def cov(self, *, untied=False, **kwargs):
        """Compute pairwise covariance of rankings, excluding NA/null values.

        Please check the full documentation of a ``pandas.DataFrame.cov()``
        method for details about the implementation.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the
            ``pandas.DataFrame.cov()`` method.

        Returns
        -------
        :py:class:`pd.DataFrame`
            A DataFrame with the covariance between rankings.

        """
        return self.to_dataframe(untied=untied).cov(**kwargs)

    def r2_score(self, *, untied=False, **kwargs):
        """Compute pairwise coefficient of determination regression score \
        function of rankings, excluding NA/null values.

        Best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse).

        Please check the full documentation of a ``sklearn.metrics.r2_score``
        function for details about the implementation and the behaviour.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the
            ``sklearn.metrics.r2_score()`` function.

        Returns
        -------
        :py:class:`pd.DataFrame`
            A DataFrame with the coefficient of determination between
            rankings.

        """
        df = self.to_dataframe(untied=untied)

        # here we are going to create a dict of dict
        rows = defaultdict(dict)

        # combine the methods pairwise
        for r0, r1 in it.combinations(df.columns, 2):
            # NOTE: the score is computed once per pair (``r0`` is passed as
            # the first argument) and the same value is stored in both cells.
            score = _skl_metrics.r2_score(df[r0], df[r1], **kwargs)

            # add the metrics in both directions
            rows[r0][r1] = score
            rows[r1][r0] = score

        # create the dataframe and change the nan for 1 (perfect R2);
        # the missing cells are the diagonal (a ranking against itself)
        r2_df = pd.DataFrame.from_dict(rows).fillna(1)

        # restore the original method order on both axes
        r2_df = r2_df[df.columns].loc[df.columns]

        r2_df.index.name = "Method"
        r2_df.columns.name = "Method"

        return r2_df

    def distance(self, *, untied=False, metric="hamming", **kwargs):
        """Compute pairwise distance between rankings.

        By default the 'hamming' distance is used, which is simply the
        proportion of disagreeing components in two rankings.

        Please check the full documentation of a
        ``scipy.spatial.distance.pdist`` function for details about the
        implementation and the behaviour.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        metric: str or function, default ``"hamming"``
            The distance metric to use. The distance function can
            be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
            'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
            'jaccard', 'jensenshannon', 'kulczynski1',
            'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
            'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
            'sqeuclidean', 'yule'.
        kwargs:
            Other keyword arguments are passed to the
            ``scipy.spatial.distance.pdist()`` function.

        Returns
        -------
        :py:class:`pd.DataFrame`
            A DataFrame with the distance between rankings.

        """
        # transpose: pdist compares rows, and we want one row per ranking
        df = self.to_dataframe(untied=untied).T
        dis_array = distance.pdist(df, metric=metric, **kwargs)

        # expand the condensed distance vector into a square matrix
        dis_mtx = distance.squareform(dis_array)
        dis_df = pd.DataFrame(
            dis_mtx, columns=df.index.copy(), index=df.index.copy()
        )
        return dis_df

    # ACCESSORS (YES, WE USE CACHED PROPERTIES IS THE EASIEST WAY) ============

    @methodtools.lru_cache(maxsize=None)
    @property
    def plot(self):
        """Plot accessor."""
        return RanksComparatorPlotter(self)
# ============================================================================= # PLOTTER # =============================================================================
class RanksComparatorPlotter(AccessorABC):
    """RanksComparator plot utilities.

    Kind of plot to produce:

    - 'flow' : Changes in the rankings of the alternatives as flow lines.
    - 'reg' : Pairwise rankings data and a linear regression model fit plot.
    - 'heatmap' : Rankings as a color-encoded matrix.
    - 'corr' : Pairwise correlation of rankings as a color-encoded matrix.
    - 'cov' : Pairwise covariance of rankings as a color-encoded matrix.
    - 'r2_score' : Pairwise coefficient of determination regression score \
      function of rankings as a color-encoded matrix.
    - 'distance' : Pairwise distance between rankings as a color-encoded \
      matrix.
    - 'box' : Box-plot of rankings with respect to alternatives (default).
    - 'bar' : Ranking of alternatives by method with vertical bars.
    - 'barh' : Ranking of alternatives by method with horizontal bars.

    """

    _default_kind = "box"

    def __init__(self, ranks_cmp):
        self._ranks_cmp = ranks_cmp

    # MANUAL MADE PLOT ========================================================
    # These plots have a much more manually orchestrated code.

    def flow(self, *, untied=False, grid_kws=None, **kwargs):
        """Represents changes in the rankings of the alternatives as lines \
        flowing through the ranking-methods.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        grid_kws: dict or None
            Dict with keyword arguments passed to
            ``matplotlib.axes.Axes.grid``. If ``"alpha"`` is not
            provided, ``0.3`` is used.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.lineplot()``
            function. except for data, estimator and sort.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)

        ax = sns.lineplot(data=df.T, estimator=None, sort=False, **kwargs)

        # Work on a copy so the default below never leaks into the dict
        # object owned by the caller.
        grid_kws = {} if grid_kws is None else dict(grid_kws)
        grid_kws.setdefault("alpha", 0.3)
        ax.grid(**grid_kws)

        ax.set_ylabel(RANKS_LABELS[untied])

        return ax

    def reg(
        self,
        *,
        untied=False,
        r2=True,
        palette=None,
        legend=True,
        r2_fmt=".2g",
        r2_kws=None,
        **kwargs,
    ):
        """Plot a pairwise rankings data and a linear regression model fit.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        r2 : bool, default ``True``
            If True, the coefficient of determination results are added to
            the regression legend.
        palette: matplotlib/seaborn color palette, default ``None``
            Set of colors for mapping the hue variable.
        legend: bool, default ``True``
            If False, suppress the legend for semantic variables.
        r2_fmt: str, default ``".2g"``
            String formatting code to use when adding the coefficient of
            determination.
        r2_kws: dict or None
            Dict with keywords arguments passed to
            ``sklearn.metrics.r2_score()`` function.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.regplot()``
            function. ``ax``, if given, is used as the axes for every pair;
            ``color`` is not accepted.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        Raises
        ------
        TypeError
            If ``color`` is provided as keyword argument.

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)

        # Just to ensure that no manual color reaches regplot
        if "color" in kwargs:
            cls_name = type(self).__qualname__
            raise TypeError(
                f"{cls_name}.reg() got an unexpected keyword argument 'color'"
            )

        # if there is a custom axis, we take it out
        ax = kwargs.pop("ax", None)

        # the R2 is only needed when it is going to be shown in the legend
        show_r2 = legend and r2
        if show_r2:
            r2_kws = {} if r2_kws is None else r2_kws
            r2_df = self._ranks_cmp.r2_score(untied=untied, **r2_kws)

        # we create the infinite cycle of colors for the palette,
        # so we take out as we need
        colors = it.cycle(sns.color_palette(palette=palette))

        # pairwise ranks iteration
        for x, y in it.combinations(df.columns, 2):
            color = next(colors)

            # The r2 correlation index
            r2_label = ""
            if show_r2:
                r2_score = format(r2_df[x][y], r2_fmt)
                r2_label = f" - $R^2={r2_score}$"

            label = "x={x}, y={y}{r2}".format(x=x, y=y, r2=r2_label)

            # every pair is drawn on the same axes
            ax = sns.regplot(
                x=x, y=y, data=df, ax=ax, label=label, color=color, **kwargs
            )

        ranks_label = RANKS_LABELS[untied]
        ax.set(xlabel=f"'x' {ranks_label}", ylabel=f"'y' {ranks_label}")

        if legend:
            ax.legend()

        return ax

    # SEABORN BASED ===========================================================
    # Thin wrapper around seaborn plots

    def heatmap(self, *, untied=False, **kwargs):
        """Plot the rankings as a color-encoded matrix.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.heatmap()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)
        kwargs.setdefault("annot", True)
        kwargs.setdefault("cbar_kws", {"label": RANKS_LABELS[untied]})
        return sns.heatmap(data=df, **kwargs)

    def corr(self, *, untied=False, corr_kws=None, **kwargs):
        """Plot the pairwise correlation of rankings as a color-encoded matrix.

        By default the pearson correlation coefficient is used.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        corr_kws: dict or None
            Dict with keywords arguments passed the
            ``pandas.DataFrame.corr()`` method.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.heatmap()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        corr_kws = {} if corr_kws is None else corr_kws
        corr = self._ranks_cmp.corr(untied=untied, **corr_kws)

        kwargs.setdefault("annot", True)
        kwargs.setdefault("cbar_kws", {"label": "Correlation"})
        return sns.heatmap(data=corr, **kwargs)

    def cov(self, *, untied=False, cov_kws=None, **kwargs):
        """Plot the pairwise covariance of rankings as a color-encoded matrix.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        cov_kws: dict or None
            Dict with keywords arguments passed the
            ``pandas.DataFrame.cov()`` method.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.heatmap()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        cov_kws = {} if cov_kws is None else cov_kws
        cov = self._ranks_cmp.cov(untied=untied, **cov_kws)

        kwargs.setdefault("annot", True)
        kwargs.setdefault("cbar_kws", {"label": "Covariance"})
        return sns.heatmap(data=cov, **kwargs)

    def r2_score(self, untied=False, r2_kws=None, **kwargs):
        """Plot the pairwise coefficient of determination regression score \
        function of rankings as a color-encoded matrix.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        r2_kws: dict or None
            Dict with keywords arguments passed the
            ``sklearn.metrics.r2_score()`` function.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.heatmap()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        r2_kws = {} if r2_kws is None else r2_kws
        r2 = self._ranks_cmp.r2_score(untied=untied, **r2_kws)

        kwargs.setdefault("annot", True)
        kwargs.setdefault("cbar_kws", {"label": "$R^2$"})
        return sns.heatmap(data=r2, **kwargs)

    def distance(
        self, *, untied=False, metric="hamming", distance_kws=None, **kwargs
    ):
        """Plot the pairwise distance between rankings as a color-encoded \
        matrix.

        By default the 'hamming' distance is used, which is simply the
        proportion of disagreeing components in two rankings.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        metric: str or function, default ``"hamming"``
            The distance metric to use. The distance function can
            be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
            'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
            'jaccard', 'jensenshannon', 'kulczynski1',
            'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
            'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
            'sqeuclidean', 'yule'.
        distance_kws: dict or None
            Dict with keywords arguments passed the
            ``scipy.spatial.distance.pdist`` function
        kwargs:
            Other keyword arguments are passed to the ``seaborn.heatmap()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        distance_kws = {} if distance_kws is None else distance_kws
        dis = self._ranks_cmp.distance(
            untied=untied, metric=metric, **distance_kws
        )

        kwargs.setdefault("annot", True)
        kwargs.setdefault(
            "cbar_kws", {"label": f"{metric} distance".capitalize()}
        )
        return sns.heatmap(data=dis, **kwargs)

    def box(self, *, untied=False, **kwargs):
        """Draw a boxplot to show rankings with respect to alternatives.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the ``seaborn.boxplot()``
            function.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)
        ax = sns.boxplot(data=df.T, **kwargs)

        ranks_label = RANKS_LABELS[untied]

        # The ranks go on the value axis. seaborn documents ``orient="x"`` as
        # equivalent to ``"v"`` (and ``"y"`` to ``"h"``), so "x" is treated
        # as a vertical plot too.
        if kwargs.get("orient") in (None, "v", "x"):
            ax.set_ylabel(ranks_label)
        else:
            ax.set_xlabel(ranks_label)

        return ax

    # DATAFRAME BASED  ========================================================
    # Thin wrapper around pandas.DataFrame.plot

    def bar(self, *, untied=False, **kwargs):
        """Draw plot that presents ranking of alternatives by method with \
        vertical bars.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the
            ``pandas.Dataframe.plot.bar()`` method. If ``ax`` is missing or
            ``None`` the current matplotlib axes is used.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)

        # Explicit ``None`` check: ``ax`` must never be evaluated for
        # truthiness (an array of axes has no single truth value).
        if kwargs.get("ax") is None:
            kwargs["ax"] = plt.gca()

        ax = df.plot.bar(**kwargs)
        ax.set_ylabel(RANKS_LABELS[untied])
        return ax

    def barh(self, *, untied=False, **kwargs):
        """Draw plot that presents ranking of alternatives by method with \
        horizontal bars.

        Parameters
        ----------
        untied: bool, default ``False``
            If it is ``True`` and any ranking has ties, the
            ``RankResult.untied_rank_`` property is used to assign each
            alternative a single ranked order. On the other hand, if it is
            ``False`` the rankings are used as they are.
        kwargs:
            Other keyword arguments are passed to the
            ``pandas.Dataframe.plot.barh()`` method. If ``ax`` is missing or
            ``None`` the current matplotlib axes is used.

        Returns
        -------
        matplotlib.axes.Axes or numpy.ndarray of them

        """
        df = self._ranks_cmp.to_dataframe(untied=untied)

        # Explicit ``None`` check: ``ax`` must never be evaluated for
        # truthiness (an array of axes has no single truth value).
        if kwargs.get("ax") is None:
            kwargs["ax"] = plt.gca()

        ax = df.plot.barh(**kwargs)
        ax.set_xlabel(RANKS_LABELS[untied])
        return ax
# ============================================================================= # FACTORY # =============================================================================
def mkrank_cmp(*ranks):
    """Build a RanksComparator out of the given rankings.

    Shorthand for the ``RanksComparator`` constructor that neither requires
    nor allows naming the rankings: each name is derived from the ``method``
    attribute of the corresponding ranking.

    Parameters
    ----------
    *ranks: list of RankResult objects
        The scikit-criteria RankResult objects to compare.

    Returns
    -------
    rcmp : RanksComparator
        Returns a scikit-criteria :class:`RanksComparator` object.

    """
    # ``unique_names`` pairs every ranking with a name built from its method
    method_names = [rank.method for rank in ranks]
    return RanksComparator(unique_names(names=method_names, elements=ranks))