Source code for netcdf_scm.normalisation.running_mean

"""
Module for the normaliser which calculates anomalies from a running mean in the pre-industrial control run
"""
import logging

from scmdata import ScmRun

from ..cli_utils import _check_timesteps_are_monthly
from .base import Normaliser

LOGGER = logging.getLogger(__name__)


[docs]class NormaliserRunningMean(Normaliser):
    """
    Normaliser which calculates anomalies from a running mean in the pre-industrial control run

    Each normalisation value is an n-year mean, centred on the equivalent point in the
    pre-industrial control simulation. If there is insufficient data to create a full n-year
    window at the edge of the simulation then a linear extrapolation of the running-mean is used
    to extend the normalisation values to cover the required full range.
    """

    def __init__(self, nyears=21):
        """
        Intialise

        Parameters
        ----------
        nyears : int
            Number of years to use in the running-mean.
        """
        self._nyears = nyears
        self._method_name = "{}-yr-running-mean".format(nyears)

    def _get_reference_values(self, indata, picontrol, picontrol_branching_time):
        running_mean_n_years = self._nyears
        months_per_year = 12
        window_width = running_mean_n_years * months_per_year
        window_roll_back = window_width // 2
        # the window extends one step fewer forward because the centre time point
        # is also included
        window_roll_forward = window_roll_back - 1

        branch_year = picontrol_branching_time.year
        branch_month = picontrol_branching_time.month
        picontrol_branch_col = picontrol.filter(year=branch_year, month=branch_month)[
            "time"
        ][0]
        picontrol_time = picontrol["time"]
        picontrol_branch_idx = (picontrol_time == picontrol_branch_col).idxmax()

        # relevant times go from the branch point minus the number of steps each
        # rolling mean window extends backward up to the branch point plus the
        # length of ``indata`` plus the number of steps the rolling mean window
        # extends forward
        relevant_time_points = picontrol_time[
            max(picontrol_branch_idx - window_roll_back, 0) : picontrol_branch_idx
            + indata["time"].shape[0]
            + window_roll_forward
        ].tolist()

        _check_timesteps_are_monthly(picontrol)  # quick sanity check

        # super slow option below, would be good to have direct functionality in scmdata
        # normalise_ts = picontrol.filter(time=relevant_time_points.tolist()).timeseries()

        picontrol_times = (picontrol["time"] >= relevant_time_points[0]) & (
            picontrol["time"] <= relevant_time_points[-1]
        )
        normalise_ts = picontrol.timeseries().iloc[:, picontrol_times.values]

        normalise_df = normalise_ts.rolling(
            window=window_width, center=True, axis="columns",
        ).mean()

        keep_time_points = picontrol_time[
            picontrol_branch_idx : picontrol_branch_idx + indata["time"].shape[0]
        ].tolist()

        raise_error = False
        try:
            normalise_df = normalise_df[keep_time_points]
        except KeyError:
            raise_error = True

        if normalise_df.isnull().any().any():
            LOGGER.info(
                "Filling gaps in running mean (where not enough values were available to create a full "
                "window) with linear extrapolations."
            )

            target_times = normalise_df.columns
            normalise_df = (
                ScmRun(normalise_df)
                .interpolate(target_times=target_times, extrapolation_type="linear")
                .timeseries()
            )

        if normalise_df.isnull().any().any():
            # if there are any remaining gaps, raise
            raise_error = True

        try:
            normalise_df.columns = indata["time"]
        except ValueError:
            raise_error = True

        if raise_error:
            error_msg = (
                "Only `{:04d}{:02d}` to `{:04d}{:02d}` is available in the "
                "{} data. Given the branching time, `{:04d}{:02d}`, we need "
                "data from ~`{:04d}` to `{:04d}`. {} data in {}".format(
                    picontrol["time"].min().year,
                    picontrol["time"].min().month,
                    picontrol["time"].max().year,
                    picontrol["time"].max().month,
                    picontrol.metadata["experiment_id"],
                    picontrol_branching_time.year,
                    picontrol_branching_time.month,
                    picontrol_branching_time.year - 11,
                    picontrol_branching_time.year
                    + indata["year"].max()
                    - indata["year"].min()
                    + 11,
                    picontrol.metadata["experiment_id"],
                    picontrol.metadata["netcdf-scm crunched file"],
                )
            )
            raise ValueError(error_msg)

        return normalise_df