Source code for mdtools.statistics

# This file is part of MDTools.
# Copyright (C) 2021, The MDTools Development Team and all contributors
# listed in the file AUTHORS.rst
#
# MDTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# MDTools is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with MDTools.  If not, see <http://www.gnu.org/licenses/>.


"""Functions for the statistical analysis of data."""


# Standard libraries
import warnings

# Third-party libraries
import numpy as np
from scipy import constants, special
from scipy.special import comb
from scipy.stats import norm

# First-party libraries
import mdtools as mdt


def acf(x, axis=None, dt=1, dtau=1, tau_max=None, center=True, unbiased=False): r""" Calculate the autocorrelation function of an array. Parameters ---------- x : array_like The input array. axis : int or None, optional The axis along which to calculate the ACF. By default, the flattened input array is used. dt : int or None, optional Distance between restarting times (actually restarting indices) :math:`t_0`. If ``None`` is given, `dt` is set to the current lag time :math:`\tau` which results in independent sampling windows. dtau : int, optional Distance between evaluated lag times (actually lag indices) :math:`\tau`. tau_max : int or None, optional Maximum lag time :math:`\tau` up to which to compute the ACF. Because the first lag time is 0, the last evaluated lag time is actually ``tau_max - 1``. By default, `tau_max` is set to ``x.size`` if `axis` is ``None`` and to ``x.shape[axis]`` otherwise. `tau_max` must lie within the interval ``[0, len(x)]``. center : bool, optional If ``True``, center the input array around its mean, i.e. subtract the sample mean :math:`\bar{X}` when calculating the variance and covariance. unbiased : bool, optional If ``True``, the covariance :math:`\text{Cov}[X_{t_0}, X_{t_0 + \tau}]` is normalized by the actual number of sample points :math:`t_0` (which depends on the lag time :math:`\tau`). If ``False``, the covariance is for all lag times normalized by the number of sampling points at :math:`t_0 = 0` (which is equal to the length of the input array `x`). Note that setting `unbiased` to ``True`` might result in values of the ACF that lie outside the interval [-1, 1], especially for lag times larger than ``len(x) // 2`` if `center` is ``True``. Returns ------- ac : numpy.ndarray Autocorrelation function of `x`. If `axis` is ``None``, the output array has shape ``(n,)`` where ``n`` is ``int(np.ceil(tau_max / dtau))``. If a specific axis is given, `ac` has the same shape as `x` except along the given axis where the length is given by ``n``. Raises ------ ValueError : If the first element of `ac` is not unity or one or more elements of `ac` fall outside the interval [-1, 1]. See Also -------- :func:`mdtools.statistics.acf_np` : Different implementation of the ACF using :func:`numpy.correlate` :func:`mdtools.statistics.acf_se` : Calculate the standard errors of an autocorrelation function :func:`mdtools.statistics.acf_confint` : Calculate the confidence intervals of an autocorrelation function Notes ----- This function computes the autocorrelation function (ACF) of the input array `x` assuming that `x` describes a `weakly stationary stochastic process`_. That means, the mean does not vary with time, the variance is finite for all times and the autocovariance does not depend on the starting time :math:`t_0` but only on the lag time :math:`\tau`. In this case the ACF is given by: [1]_:sup:`,` [2]_ .. math:: C(\tau) = \frac{\text{Cov}[X_{t_0}, X_{t_0 + \tau}]}{\text{Var}[X]} = \frac{\langle (X_{t_0} - \mu) \cdot (X_{t_0 + \tau} - \mu) \rangle}{\langle (X - \mu)^2 \rangle} The ACF strictly fulfills :math:`C(\tau) \in [-1, 1] \text{ } \forall \text{ } \tau \in \mathbb{R}`. Numerically, the ACF is calculated using the following formula: [3]_:sup:`,` [4]_ ..
math:: C_\tau = \frac{\frac{1}{T^*} \sum_{t_0=0}^{T-\tau} (X_{t_0} - \bar{X}) (X_{t_0 + \tau} - \bar{X})}{\frac{1}{T} \sum_{t_0=0}^T (X_{t_0} - \bar{X})^2} = \frac{\frac{1}{T^*} \sum_{t_0=0}^{T-\tau} (X_{t_0} - \bar{X}) (X_{t_0 + \tau} - \bar{X})}{C_0} The increment of the sums in the formula of :math:`C_\tau` is given by `dt`. :math:`\bar{X} = \frac{1}{T} \sum_{t=1}^T X_t` is the sample mean. For the sample mean, the increment of the sum is always 1, independent of the choice of `dt`. Note that **the sample mean is only subtracted when** `center` **is** ``True``. `T` is always the length of the input array `x`, independent of the choice of `tau_max`. :math:`T^*` is either :math:`T - \tau` if `unbiased` is ``True`` or simply :math:`T` if `unbiased` is ``False``. For the sake of completeness, the ACF of a non-stationary stochastic process is given by: [1]_:sup:`,` [2]_ .. math:: C(t_1, t_2) = \frac{\text{Cov}[X_{t_1}, X_{t_2}]}{\sqrt{\text{Var}[X_{t_1}]} \cdot \sqrt{\text{Var}[X_{t_2}]}} = \frac{\langle (X_{t_1} - \mu_{t_1}) \cdot (X_{t_2} - \mu_{t_2}) \rangle}{\sqrt{\langle (X_{t_1} - \mu_{t_1})^2 \rangle} \cdot \sqrt{\langle (X_{t_2} - \mu_{t_2})^2 \rangle}} .. _weakly stationary stochastic process: https://en.wikipedia.org/wiki/Stationary_process#Weak_or_wide-sense_stationarity References ---------- .. [1] Wikipedia `Autocorrelation <https://en.wikipedia.org/wiki/Autocorrelation#Normalization>`_ .. [2] Kun Il Park, `Fundamentals of Probability and Stochastic Processes with Applications to Communications <https://doi.org/10.1007/978-3-319-68075-0>`_, Springer, 2018, p. 169, Holmdel, New Jersey, USA, ISBN 978-3-319-68074-3 .. [3] Julius O. Smith, `Unbiased Cross-Correlation <https://www.dsprelated.com/freebooks/mdft/Unbiased_Cross_Correlation.html>`_. In Mathematics of the Discrete Fourier Transform (DFT), 2002, p. 189, Stanford, California, USA .. [4] Wikipedia `Correlogram <https://en.wikipedia.org/wiki/Correlogram#Estimation_of_autocorrelations>`_ Examples -------- >>> a = np.array([-2, 2, -2, 2, -2]) >>> mdt.stats.acf(a) array([ 1. , -0.8 , 0.56666667, -0.4 , 0.13333333]) >>> mdt.stats.acf(a, dt=None) array([ 1. , -0.8 , 0.26666667, -0.2 , 0.13333333]) >>> mdt.stats.acf(a, unbiased=True) array([ 1. , -1. , 0.94444444, -1. , 0.66666667]) >>> mdt.stats.acf(a, unbiased=True, dt=None) array([ 1. , -1. , 0.66666667, -1. , 0.66666667]) >>> mdt.stats.acf(a, center=False) array([ 1. , -0.8, 0.6, -0.4, 0.2]) >>> mdt.stats.acf(a, center=False, dtau=2) array([1. , 0.6, 0.2]) >>> mdt.stats.acf(a, center=False, tau_max=len(a)//2) array([ 1. , -0.8]) >>> mdt.stats.acf(a, center=False, dt=None) array([ 1. , -0.8, 0.4, -0.2, 0.2]) >>> mdt.stats.acf(a, center=False, unbiased=True) array([ 1., -1., 1., -1., 1.]) >>> mdt.stats.acf(a, center=False, unbiased=True, dt=None) array([ 1., -1., 1., -1., 1.]) >>> a = np.array([-2, -2, -2, 2, 2]) >>> mdt.stats.acf(a) array([ 1. , 0.36666667, -0.26666667, -0.4 , -0.2 ]) >>> mdt.stats.acf(a, dt=None) array([ 1. , 0.36666667, -0.06666667, -0.2 , -0.2 ]) >>> mdt.stats.acf(a, unbiased=True) array([ 1. , 0.45833333, -0.44444444, -1. , -1. ]) >>> mdt.stats.acf(a, unbiased=True, dt=None) array([ 1. , 0.45833333, -0.16666667, -1. , -1. ]) >>> mdt.stats.acf(a, center=False) array([ 1. , 0.4, -0.2, -0.4, -0.2]) >>> mdt.stats.acf(a, center=False, dt=None) array([ 1. , 0.4, 0. , -0.2, -0.2]) >>> mdt.stats.acf(a, center=False, unbiased=True) array([ 1. , 0.5 , -0.33333333, -1. , -1. ]) >>> mdt.stats.acf(a, center=False, unbiased=True, dt=None) array([ 1. , 0.5, 0. , -1. , -1. 
]) >>> a = np.array([[-2, 2, -2, 2, -2], ... [-2, 2, -2, 2, -2], ... [-2, -2, -2, 2, 2], ... [-2, -2, -2, 2, 2]]) >>> mdt.stats.acf(a, center=False) array([ 1. , -0.15, 0. , -0.25, -0.2 , 0.55, 0. , 0.15, -0.1 , -0.25, 0.1 , 0.05, 0.1 , 0.05, -0.2 , 0.05, 0. , 0.05, 0. , -0.05]) >>> mdt.stats.acf(a, center=False, axis=0) array([[ 1. , 1. , 1. , 1. , 1. ], [ 0.75, 0.25, 0.75, 0.75, 0.25], [ 0.5 , -0.5 , 0.5 , 0.5 , -0.5 ], [ 0.25, -0.25, 0.25, 0.25, -0.25]]) >>> mdt.stats.acf(a, center=False, axis=0, unbiased=True) array([[ 1. , 1. , 1. , 1. , 1. ], [ 1. , 0.33333333, 1. , 1. , 0.33333333], [ 1. , -1. , 1. , 1. , -1. ], [ 1. , -1. , 1. , 1. , -1. ]]) >>> mdt.stats.acf(a, center=False, axis=1) array([[ 1. , -0.8, 0.6, -0.4, 0.2], [ 1. , -0.8, 0.6, -0.4, 0.2], [ 1. , 0.4, -0.2, -0.4, -0.2], [ 1. , 0.4, -0.2, -0.4, -0.2]]) >>> mdt.stats.acf(a, center=False, axis=1, unbiased=True) array([[ 1. , -1. , 1. , -1. , 1. ], [ 1. , -1. , 1. , -1. , 1. ], [ 1. , 0.5 , -0.33333333, -1. , -1. ], [ 1. , 0.5 , -0.33333333, -1. , -1. ]]) >>> a = np.array([[[-2, 2, -2, 2, -2], ... [-2, 2, -2, 2, -2]], ... ... [[-2, -2, -2, 2, 2], ... [-2, -2, -2, 2, 2]]]) >>> mdt.stats.acf(a, center=False) array([ 1. , -0.15, 0. , -0.25, -0.2 , 0.55, 0. , 0.15, -0.1 , -0.25, 0.1 , 0.05, 0.1 , 0.05, -0.2 , 0.05, 0. , 0.05, 0. , -0.05]) >>> mdt.stats.acf(a, center=False, axis=0) array([[[ 1. , 1. , 1. , 1. , 1. ], [ 1. , 1. , 1. , 1. , 1. ]], <BLANKLINE> [[ 0.5, -0.5, 0.5, 0.5, -0.5], [ 0.5, -0.5, 0.5, 0.5, -0.5]]]) >>> mdt.stats.acf(a, center=False, axis=0, unbiased=True) array([[[ 1., 1., 1., 1., 1.], [ 1., 1., 1., 1., 1.]], <BLANKLINE> [[ 1., -1., 1., 1., -1.], [ 1., -1., 1., 1., -1.]]]) >>> mdt.stats.acf(a, center=False, axis=1) array([[[1. , 1. , 1. , 1. , 1. ], [0.5, 0.5, 0.5, 0.5, 0.5]], <BLANKLINE> [[1. , 1. , 1. , 1. , 1. ], [0.5, 0.5, 0.5, 0.5, 0.5]]]) >>> mdt.stats.acf(a, center=False, axis=1, unbiased=True) array([[[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.]], <BLANKLINE> [[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.]]]) >>> mdt.stats.acf(a, center=False, axis=2) array([[[ 1. , -0.8, 0.6, -0.4, 0.2], [ 1. , -0.8, 0.6, -0.4, 0.2]], <BLANKLINE> [[ 1. , 0.4, -0.2, -0.4, -0.2], [ 1. , 0.4, -0.2, -0.4, -0.2]]]) >>> mdt.stats.acf(a, center=False, axis=2, unbiased=True) array([[[ 1. , -1. , 1. , -1. , 1. ], [ 1. , -1. , 1. , -1. , 1. ]], <BLANKLINE> [[ 1. , 0.5 , -0.33333333, -1. , -1. ], [ 1. , 0.5 , -0.33333333, -1. , -1. ]]]) """ # noqa: E501, W505 x = np.array(x, dtype=np.float64, copy=center) if center: x = mdt.stats.center(x, axis=axis, dtype=np.float64, inplace=True) # Maximum possible tau value. 
if axis is None: tau_max_real = x.size else: if abs(axis) >= x.ndim: raise np.AxisError(axis=axis, ndim=x.ndim) tau_max_real = x.shape[axis] if dt is not None and dt < 1: raise ValueError( "'dt' ({}) must be a positive integer or None".format(dt) ) if dtau < 1: raise ValueError( "'dtau' ({}) must be a positive integer or None".format(dtau) ) if tau_max is None: tau_max = tau_max_real elif tau_max < 0 or tau_max > len(x): raise ValueError( "'tau_max' ({}) must lie within the interval" " [0, {}]".format(tau_max, len(x)) ) ac = [] for tau in range(0, tau_max, dtau): if dt is None: t_step = max(1, tau) # Slice step cannot be zero else: t_step = dt t0_max = tau_max_real - tau cov = np.copy(mdt.nph.take(x, stop=t0_max, step=t_step, axis=axis)) cov *= mdt.nph.take(x, start=tau, step=t_step, axis=axis) ac.append(np.sum(cov, axis=axis)) if unbiased: if axis is None: ac[-1] /= cov.size else: ac[-1] /= cov.shape[axis] if axis is None: ac = np.concatenate(ac, axis=axis) else: ac = np.stack(ac, axis=axis) ac0 = mdt.nph.take(ac, start=0, stop=1, axis=axis) ac /= ac0 # Consistency check. if axis is None: shape = (int(np.ceil(tau_max / dtau)),) else: shape = list(x.shape) shape[axis] = int(np.ceil(tau_max / dtau)) shape = tuple(shape) if ac.shape != shape: raise ValueError( "'ac' has shape {} but should have shape {}. This should not have" " happened".format(ac.shape, shape) ) if not np.allclose( mdt.nph.take(ac, start=0, stop=1, axis=axis), 1, rtol=0 ): raise ValueError( "The first element of the ACF is not unity but {}. This should" " not have" " happened".format(mdt.nph.take(ac, start=0, stop=1, axis=axis)) ) if np.any(np.abs(ac) > 1): raise ValueError( "At least one element of the ACF is absolutely seen greater than" " unity. This should not have happened. min(ACF) = {}. max(ACF)" " = {}".format(np.min(ac), np.max(ac)) ) return ac
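# Usage sketch (added for illustration; not part of the original module;
# run in a separate script after ``import numpy as np`` and
# ``import mdtools as mdt``).  For an uncorrelated (white-noise) series,
# the ACF drops from unity at lag 0 to values scattered around zero.
# `rng` and `noise` are hypothetical example names.
rng = np.random.default_rng(seed=0)
noise = rng.normal(size=1000)
ac = mdt.stats.acf(noise)
assert np.isclose(ac[0], 1, rtol=0)  # Lag 0 is always unity.
assert np.all(np.abs(ac) <= 1)       # Guaranteed by the internal checks.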
def acf_np(x, center=True, unbiased=False): r""" Calculate the autocorrelation function of a 1-dimensional array. Parameters ---------- x : array_like 1-dimensional input array. center : bool, optional If ``True``, center the input array around its mean, i.e. subtract the sample mean :math:`\bar{X}` when calculating the variance and covariance. unbiased : bool, optional If ``True``, the covariance :math:`\text{Cov}[X_{t_0}, X_{t_0 + \tau}]` is normalized by the actual number of sample points :math:`t_0` (which depends on the lag time :math:`\tau`). If ``False``, the covariance is for all lag times normalized by the number of sampling points at :math:`t_0 = 0` (which is equal to the length of the input array `x`). Note that setting `unbiased` to ``True`` might result in values of the ACF that lie outside the interval [-1, 1], especially for lag times larger than ``len(x) // 2`` if `center` is ``True``. Returns ------- ac : numpy.ndarray Array of the same shape as `x` containing the autocorrelation function of `x`. Raises ------ ValueError : If the first element of `ac` is not unity or one or more elements of `ac` fall outside the interval [-1, 1]. See Also -------- :func:`mdtools.statistics.acf` : Different implementation of the ACF using an explicit for loop :func:`mdtools.statistics.acf_se` : Calculate the standard errors of an autocorrelation function :func:`mdtools.statistics.acf_confint` : Calculate the confidence intervals of an autocorrelation function Notes ----- See :func:`mdtools.statistics.acf` for the mathematical details of computing the autocorrelation function (ACF). This function uses :func:`numpy.correlate` to calculate the autocorrelation function whereas :func:`mdtools.statistics.acf` uses an explicit for loop. Therefore, :func:`mdtools.statistics.acf` can handle arbitrarily shaped input arrays and offers more options. Examples -------- >>> a = np.random.normal(loc=3, scale=7, size=11) >>> np.allclose(mdt.stats.acf_np(a), mdt.stats.acf(a), rtol=0) True >>> np.allclose( ... mdt.stats.acf_np(a, center=False), ... mdt.stats.acf(a, center=False), ... rtol=0 ... ) True """ x = np.array(x, dtype=np.float64, copy=center) if x.ndim != 1: raise ValueError( "'x' must be 1-dimensional but is {}-dimensional".format(x.ndim) ) if center: x -= np.mean(x, dtype=np.float64) ac = np.correlate(x, x, mode="full") ac = ac[-len(x) :] if unbiased: ac /= np.arange(len(ac), 0, -1) ac /= ac[0] if not np.isclose(ac[0], 1, rtol=0): raise ValueError( "The first element of the ACF is not unity but {}. This should" " not have happened".format(ac[0]) ) if np.any(np.abs(ac) > 1): raise ValueError( "At least one element of the ACF is absolutely seen greater than" " unity. This should not have happened. min(ACF) = {}. max(ACF)" " = {}".format(np.min(ac), np.max(ac)) ) return ac
def acf_se(x, axis=None, n=None): r""" Calculate the standard errors of an autocorrelation function. Parameters ---------- x : array_like The values of the ACF. Intermediate lag times must not be missing. That means if you used e.g. :func:`mdtools.statistics.acf` to compute the ACF, the argument `dtau` must have been ``1``. axis : int or None, optional The axis along which to calculate the SE. By default, the flattened input array is used. n : int, optional Sample size (i.e. number of recorded time points) of the underlying time series. By default, `n` is set to the number of lag times :math:`\tau` in the given ACF. This is valid for ACFs that were computed at all possible lag times from the underlying time series. That means if you used e.g. :func:`mdtools.statistics.acf` to compute the ACF, the argument `dtau` must have been ``1`` and `tau_max` must have been ``None`` or the length of `x` along the given axis. Returns ------- se : numpy.ndarray Array of the same shape as `x` containing the standard error of the ACF at each lag time. See Also -------- :func:`mdtools.statistics.acf` : Calculate the autocorrelation function of an array :func:`mdtools.statistics.acf_confint` : Calculate the confidence intervals of an autocorrelation function Notes ----- The standard error (SE) of the autocorrelation function (ACF) :math:`C_\tau` at lag time :math:`\tau` is estimated according to Bartlett's formula for MA(l) processes: [#]_:sup:`,` [#]_ .. math:: SE(C_\tau) = \sqrt{\frac{1 + 2 \sum_{\tau^{'} = 1}^{\tau - 1} C_{\tau^{'}}^2}{N}} for :math:`\tau \gt 1`. For :math:`\tau = 1` the standard error is estimated by :math:`SE(C_1) = \frac{1}{\sqrt{N}}` and for :math:`\tau = 0` the standard error is :math:`SE(C_0) = 0`. References ---------- .. [#] Wikipedia `Correlogram <https://en.wikipedia.org/wiki/Correlogram#Statistical_inference_with_correlograms>`_ .. [#] Wikipedia `Autoregressive-moving-average model <https://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model#Moving-average_model>`_ Examples -------- >>> mdt.stats.acf_se([3]) array([0.]) >>> a = np.arange(4) >>> mdt.stats.acf_se(a) array([0. , 0.5 , 0.8660254, 1.6583124]) >>> b = np.column_stack([a, a]) >>> mdt.stats.acf_se(b, axis=0) array([[0. , 0. ], [0.5 , 0.5 ], [0.8660254, 0.8660254], [1.6583124, 1.6583124]]) >>> b = np.row_stack([a, a]) >>> mdt.stats.acf_se(b, axis=1) array([[0. , 0.5 , 0.8660254, 1.6583124], [0. , 0.5 , 0.8660254, 1.6583124]]) >>> c = np.array([b, b]) >>> mdt.stats.acf_se(c, axis=2) array([[[0. , 0.5 , 0.8660254, 1.6583124], [0. , 0.5 , 0.8660254, 1.6583124]], <BLANKLINE> [[0. , 0.5 , 0.8660254, 1.6583124], [0. , 0.5 , 0.8660254, 1.6583124]]]) """ # noqa: E501, W505 x = np.asarray(x) if n is None and axis is None: n = x.size elif n is None: # and axis is not None if abs(axis) >= x.ndim: raise np.AxisError(axis=axis, ndim=x.ndim) n = x.shape[axis] elif n <= 0: raise ValueError("'n' ({}) must be a positive integer".format(n)) # SE(lag) = \sqrt{(1 + 2 * \sum_{i=1}^{lag-1} ACF(i)^2) / N}. # Sum starts at lag=1. acf = mdt.nph.take(x, start=1, axis=axis) se = np.cumsum(acf**2, axis=axis, dtype=np.float64) # The sum must only run until lag-1 but now runs until lag => Shift # the array by 1 to the right. se = np.roll(se, shift=1, axis=axis) se *= 2 se += 1 se /= n # SE(lag=1) = 1/\sqrt{N}. se1 = mdt.nph.take(se, start=0, stop=1, axis=axis) se1[:] = 1 / n se = np.sqrt(se, out=se) # SE(lag=0) = 0 because ACF(lag=0) is always 1 and has zero error.
se = np.insert(se, 0, 0, axis=axis) # Consistency check if se.shape != x.shape: raise ValueError( "'se' has shape {} but should have shape {}. This should not have" " happened".format(se.shape, x.shape) ) if np.any(mdt.nph.take(se, start=0, stop=1, axis=axis) != 0): raise ValueError( "The first element of the standard error of the ACF is not zero" " but {}. This should not have" " happened".format(mdt.nph.take(se, start=0, stop=1, axis=axis)) ) if not np.allclose( mdt.nph.take(se, start=1, stop=2, axis=axis), 1 / np.sqrt(n), rtol=0 ): raise ValueError( "The second element of the standard error of the ACF is not" " 1/sqrt(n) = {} but {}. This should not have happened".format( 1 / np.sqrt(n), mdt.nph.take(se, start=1, stop=2, axis=axis), ) ) if (axis is None and np.any(np.diff(se) < 0)) or ( axis is not None and np.any(np.diff(se, axis=axis) < 0) ): raise ValueError("'se' is not monotonically increasing") return se
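# Usage sketch (added for illustration; not part of the original module;
# assumes NumPy and MDTools are imported as above).  The SE is exactly
# zero at lag 0, equals 1/sqrt(N) at lag 1 and grows monotonically, as
# enforced by the consistency checks at the end of `acf_se`.
ac = mdt.stats.acf(np.sin(np.linspace(0, 10, 100)))
se = mdt.stats.acf_se(ac)
assert se[0] == 0
assert np.isclose(se[1], 1 / np.sqrt(ac.size), rtol=0)
assert np.all(np.diff(se) >= 0)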
def acf_confint(x, axis=None, alpha=0.05, n=None): r""" Calculate the confidence intervals of an autocorrelation function. Parameters ---------- x : array_like The values of the ACF. Intermediate lag times must not be missing. That means if you used e.g. :func:`mdtools.statistics.acf` to compute the ACF, the argument `dtau` must have been ``1``. axis : int or None, optional The axis along which to calculate the confidence interval. By default, the flattened input array is used. alpha : scalar, optional The significance level (also known as probability of error). The significance level is the maximum probability for rejecting the null hypothesis although it is true (Type 1 error). Here, the null hypothesis is that the underlying time series has no autocorrelation. Typical values for the significance level are 0.01 or 0.05. The smaller the significance level (probability for Type 1 error), the higher the probability of a Type 2 error (i.e. the null hypothesis is not rejected although it is wrong). n : int, optional Sample size (i.e. number of recorded time points) of the underlying time series. By default, `n` is set to the number of lag times :math:`\tau` in the given ACF. This is valid for ACFs that were computed at all possible lag times from the underlying time series. That means if you used e.g. :func:`mdtools.statistics.acf` to compute the ACF, the argument `dtau` must have been ``1`` and `tau_max` must have been ``None`` or the length of `x` along the given axis. Returns ------- confint : numpy.ndarray Array of the same shape as `x` containing the upper limit of the confidence interval of the ACF at each lag time. The lower limit is simply given by ``-confint``. The confidence interval is centered at zero. To get the confidence interval around the actual ACF values compute ``x + confint`` and ``x - confint``. See Also -------- :func:`mdtools.statistics.acf` : Calculate the autocorrelation function of an array :func:`mdtools.statistics.acf_se` : Calculate the standard errors of an autocorrelation function Notes ----- The confidence interval of the autocorrelation function (ACF) :math:`C_\tau` at lag time :math:`\tau` is estimated according to: [1]_ .. math:: B(C_\tau) = z_{1-\alpha/2} SE(C_\tau) with the significance level :math:`\alpha`, the quantile function :math:`z_p` of the *standard* normal distribution (:math:`\sigma = 1`) and the standard error :math:`SE(C_\tau)`. The quantile function :math:`z_p` is also known as the inverse cumulative distribution function :math:`F^{-1}(p)`. The cumulative distribution function :math:`F(X)` returns the probability :math:`p` to gain a value below or equal to :math:`X`. Hence, the inverse cumulative distribution function :math:`F^{-1}(p)` returns the value :math:`X` such that the probability of gaining a value below or equal to :math:`X` is given by :math:`p`. Thus, the probability for gaining a value higher than :math:`X` is :math:`1-p`. A normal random variable :math:`X` will exceed the value :math:`\mu + z_p \sigma` with probability :math:`1-p` and will lie outside the interval :math:`\mu \pm z_p \sigma` with probability :math:`2(1-p)` (for :math:`p \ge 0.5` or :math:`2p` for :math:`p \le 0.5`). In particular, the quantile :math:`z_{0.975}` is :math:`1.96`. Therefore, a normal random variable will lie outside the interval :math:`\mu \pm 1.96\sigma` in only 5% of cases. [2]_ Here :math:`\mu` is the mean of the random variable and :math:`\sigma` is its standard deviation.
Note that for the normal distribution :math:`z_p - \mu = -(z_{1-p} - \mu)` holds, because it is symmetric around the mean. The standard error of the ACF is estimated according to Bartlett's formula for MA(l) processes: [1]_:sup:`,` [3]_ .. math:: SE(C_\tau) = \sqrt{\frac{1 + 2 \sum_{\tau^{'} = 1}^{\tau - 1} C_{\tau^{'}}^2}{N}} for :math:`\tau \gt 1`. For :math:`\tau = 1` the standard error is estimated by :math:`SE(C_1) = \frac{1}{\sqrt{N}}` and for :math:`\tau = 0` the standard error is :math:`SE(C_0) = 0`. If the ACF :math:`C_\tau` is zero within the confidence interval, the null hypothesis that there is no autocorrelation at the given lag time :math:`\tau` is rejected with a significance level of :math:`\alpha`. This is an approximate test that assumes that the underlying time series is normally distributed. [1]_ References ---------- .. [1] Wikipedia `Correlogram <https://en.wikipedia.org/wiki/Correlogram#Statistical_inference_with_correlograms>`_ .. [2] Wikipedia `Normal distribution <https://en.wikipedia.org/wiki/Normal_distribution#Quantile_function>`_ .. [3] Wikipedia `Autoregressive-moving-average model <https://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model#Moving-average_model>`_ Examples -------- >>> mdt.stats.acf_confint([3]) array([0.]) >>> a = np.arange(4) >>> se = mdt.stats.acf_se(a) >>> ci = mdt.stats.acf_confint(a) >>> ci array([0. , 0.97998199, 1.6973786 , 3.25023257]) >>> np.allclose(ci, 1.959960*se, rtol=0, atol=1e-5) True >>> ci = mdt.stats.acf_confint(a, alpha=0.01) >>> ci array([0. , 1.28791465, 2.23073361, 4.27152966]) >>> np.allclose(ci, 2.575830*se, rtol=0, atol=1e-5) True >>> b = np.column_stack([a, a]) >>> mdt.stats.acf_confint(b, axis=0) array([[0. , 0. ], [0.97998199, 0.97998199], [1.6973786 , 1.6973786 ], [3.25023257, 3.25023257]]) >>> b = np.row_stack([a, a]) >>> mdt.stats.acf_confint(b, axis=1) array([[0. , 0.97998199, 1.6973786 , 3.25023257], [0. , 0.97998199, 1.6973786 , 3.25023257]]) >>> c = np.array([b, b]) >>> mdt.stats.acf_confint(c, axis=2) array([[[0. , 0.97998199, 1.6973786 , 3.25023257], [0. , 0.97998199, 1.6973786 , 3.25023257]], <BLANKLINE> [[0. , 0.97998199, 1.6973786 , 3.25023257], [0. , 0.97998199, 1.6973786 , 3.25023257]]]) """ # noqa: E501, W505 rv = norm(loc=0, scale=1) quantile = rv.ppf(1 - alpha / 2) confint = mdt.stats.acf_se(x, axis=axis, n=n) confint *= quantile return confint
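# Usage sketch (added for illustration; not part of the original
# module).  For a white-noise series, very roughly a fraction ``alpha``
# of all lag times should show an ACF value outside the confidence
# band, so the null hypothesis of no autocorrelation is rarely rejected.
rng = np.random.default_rng(seed=123)  # Hypothetical example data.
ac = mdt.stats.acf(rng.normal(size=500))
ci = mdt.stats.acf_confint(ac, alpha=0.05)
frac_outside = np.mean(np.abs(ac[1:]) > ci[1:])  # Skip the trivial lag 0.
print(frac_outside)  # Expected to be of the order of 0.05.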
[docs] def center(x, axis=None, dtype=None, inplace=False): """ Center a distribution around its sample mean. Parameters ---------- x : array_like Input array. axis : int or None, optional The axis along which to compute the sample mean. By default, the flattened input array is used. dtype : data-type, optional The dtype of the output array. This dtype is also used to compute the mean. Note that computing means can be inaccurate when using the floating-point dtype ``numpy.float32`` or less. See :func:`numpy.mean` for more details. inplace : bool, optional If ``True``, change the input array inplace if possible (the given dtype must be castable to the dtype of `x` and `x` must already be a :class:`numpy.ndarray`). Returns ------- x_centered : numpy.ndarray The centered input distribution: ``x - np.mean(x)``. See Also -------- :func:`numpy.mean` : Compute the arithmetic mean along the specified axis :func:`mdtools.statistics.standardize` : Standardize a distribution Examples -------- >>> a = np.arange(24).reshape(2,3,4) >>> mdt.stats.center(a) array([[[-11.5, -10.5, -9.5, -8.5], [ -7.5, -6.5, -5.5, -4.5], [ -3.5, -2.5, -1.5, -0.5]], <BLANKLINE> [[ 0.5, 1.5, 2.5, 3.5], [ 4.5, 5.5, 6.5, 7.5], [ 8.5, 9.5, 10.5, 11.5]]]) >>> mdt.stats.center(a, axis=0) array([[[-6., -6., -6., -6.], [-6., -6., -6., -6.], [-6., -6., -6., -6.]], <BLANKLINE> [[ 6., 6., 6., 6.], [ 6., 6., 6., 6.], [ 6., 6., 6., 6.]]]) >>> mdt.stats.center(a, axis=1) array([[[-4., -4., -4., -4.], [ 0., 0., 0., 0.], [ 4., 4., 4., 4.]], <BLANKLINE> [[-4., -4., -4., -4.], [ 0., 0., 0., 0.], [ 4., 4., 4., 4.]]]) >>> mdt.stats.center(a, axis=2) array([[[-1.5, -0.5, 0.5, 1.5], [-1.5, -0.5, 0.5, 1.5], [-1.5, -0.5, 0.5, 1.5]], <BLANKLINE> [[-1.5, -0.5, 0.5, 1.5], [-1.5, -0.5, 0.5, 1.5], [-1.5, -0.5, 0.5, 1.5]]]) >>> a = np.ones(3, dtype=np.float64) >>> a array([1., 1., 1.]) >>> b = mdt.stats.center(a, inplace=True) >>> b array([0., 0., 0.]) >>> b is a True >>> b[0] = 1 >>> a array([1., 0., 0.]) """ x = np.asarray(x, dtype=dtype) mean = np.mean(x, axis=axis, dtype=dtype) if axis is not None: mean = np.expand_dims(mean, axis=axis) if inplace: x -= mean else: x = x - mean return x
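# Minimal check (added for illustration; not part of the original
# module): after centering, the sample mean along the chosen axis
# vanishes up to floating-point error.
a_demo = np.random.default_rng(seed=1).random((4, 5))  # Hypothetical data.
assert np.allclose(mdt.stats.center(a_demo, axis=1).mean(axis=1), 0)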
[docs] def standardize(x, axis=None, dtype=None, inplace=False, ddof=0): """ Standardize a distribution. Standardize a distribution such that its mean is zero and its standard deviation is one. Parameters ---------- x : array_like Input array. axis : int or None, optional The axis along which to compute the sample mean and standard deviation. By default, the flattened input array is used. dtype : data-type, optional The dtype of the output array. This dtype is also used to compute the mean and standard deviation. Note that computing means and standard deviations can be inaccurate when using the floating-point dtype ``numpy.float32`` or less. See :func:`numpy.mean` for more details. inplace : bool, optional If ``True``, change the input array inplace if possible (the given dtype must be castable to the dtype of `x` and `x` must already be a :class:`numpy.ndarray`). ddof : int, optional Delta Degrees of Freedom. When calculating the standard deviation, the used divisor is ``N - ddof``, where ``N`` represents the number of elements. See :func:`numpy.std` for more details. Returns ------- x_standardized : numpy.ndarray The standardized input distribution: ``(x - np.mean(x)) / np.std(x)``. See Also -------- :func:`numpy.mean` : Compute the arithmetic mean along the specified axis :func:`numpy.std` : Compute the standard deviation along the specified axis :func:`mdtools.statistics.center` : Center a distribution around its sample mean Examples -------- >>> a = np.arange(24).reshape(2,3,4) >>> mdt.stats.standardize(a) array([[[-1.66132477, -1.51686175, -1.37239873, -1.2279357 ], [-1.08347268, -0.93900965, -0.79454663, -0.65008361], [-0.50562058, -0.36115756, -0.21669454, -0.07223151]], <BLANKLINE> [[ 0.07223151, 0.21669454, 0.36115756, 0.50562058], [ 0.65008361, 0.79454663, 0.93900965, 1.08347268], [ 1.2279357 , 1.37239873, 1.51686175, 1.66132477]]]) >>> mdt.stats.standardize(a, axis=0) array([[[-1., -1., -1., -1.], [-1., -1., -1., -1.], [-1., -1., -1., -1.]], <BLANKLINE> [[ 1., 1., 1., 1.], [ 1., 1., 1., 1.], [ 1., 1., 1., 1.]]]) >>> mdt.stats.standardize(a, axis=1) array([[[-1.22474487, -1.22474487, -1.22474487, -1.22474487], [ 0. , 0. , 0. , 0. ], [ 1.22474487, 1.22474487, 1.22474487, 1.22474487]], <BLANKLINE> [[-1.22474487, -1.22474487, -1.22474487, -1.22474487], [ 0. , 0. , 0. , 0. ], [ 1.22474487, 1.22474487, 1.22474487, 1.22474487]]]) >>> mdt.stats.standardize(a, axis=2) array([[[-1.34164079, -0.4472136 , 0.4472136 , 1.34164079], [-1.34164079, -0.4472136 , 0.4472136 , 1.34164079], [-1.34164079, -0.4472136 , 0.4472136 , 1.34164079]], <BLANKLINE> [[-1.34164079, -0.4472136 , 0.4472136 , 1.34164079], [-1.34164079, -0.4472136 , 0.4472136 , 1.34164079], [-1.34164079, -0.4472136 , 0.4472136 , 1.34164079]]]) >>> a = np.arange(3, dtype=np.float64) >>> a array([0., 1., 2.]) >>> b = mdt.stats.standardize(a, inplace=True) >>> b array([-1.22474487, 0. , 1.22474487]) >>> b is a True >>> b[0] = 1 >>> a array([1. , 0. , 1.22474487]) """ x = np.asarray(x, dtype=dtype) x = mdt.stats.center(x, axis=axis, dtype=dtype, inplace=inplace) std = np.std(x, axis=axis, dtype=dtype, ddof=ddof) if axis is not None: std = np.expand_dims(std, axis=axis) if inplace: x /= std else: x = x / std return x
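# Minimal check (added for illustration; not part of the original
# module): a standardized distribution has zero mean and unit standard
# deviation along the chosen axis (for the same ``ddof``).
z_demo = mdt.stats.standardize(
    np.random.default_rng(seed=2).random((4, 5)), axis=0
)
assert np.allclose(z_demo.mean(axis=0), 0)
assert np.allclose(z_demo.std(axis=0), 1)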
[docs] def gaussian(x, mu=0, sigma=1): r""" Gaussian distribution. .. math:: f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} Parameters ---------- x : scalar or array_like Array of :math:`x` values for which to evaluate :math:`f(x)`. mu : scalar or array_like, optional Mean of the distribution. If an array of means is given, it must be broadcastable to a common shape with `x`. sigma : scalar or array_like, optional Standard deviation of the distribution. If an array of standard deviations is given, it must be broadcastable to a common shape with `x`. Returns ------- f : scalar or array_like The function values :math:`f(x)` at the given :math:`x` values. See Also -------- :obj:`scipy.stats.norm` : Normal continuous random variable. Examples -------- >>> from scipy.stats import norm >>> x = np.linspace(-1, 3, 5) >>> y1 = mdt.stats.gaussian(x, mu=1, sigma=1) >>> y2 = norm.pdf(x, loc=1, scale=1) >>> np.allclose(y1, y2, rtol=0) True >>> y1 array([0.05399097, 0.24197072, 0.39894228, 0.24197072, 0.05399097]) """ # noqa: E501, W505 s2 = 2 * sigma**2 return np.exp(-((x - mu) ** 2) / s2) / np.sqrt(np.pi * s2)
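# Quick sanity check (added for illustration; not part of the original
# module): the density integrates to one over a sufficiently wide
# interval (trapezoidal quadrature).
x_demo = np.linspace(-15, 15, 3001)
area = np.trapz(mdt.stats.gaussian(x_demo, mu=0, sigma=2), x_demo)
assert np.isclose(area, 1, rtol=0, atol=1e-4)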
def ngp(x, axis=None, d=1, center=False, is_squared=False): r""" Compute the non-Gaussian parameter of a distribution of random variables. Parameters ---------- x : array_like Input array. axis : None or int or tuple of ints, optional Axis or axes of `x` along which to compute the NGP. By default, the flattened input array is used. d : int, optional Dimensionality of the random variables (this is *not* the dimensionality of the input array). This option is primarily intended for computing the NGP of a distribution of particle displacements :math:`\Delta\mathbf{r}` calculated in one-, two- or three-dimensional euclidean space. center : bool, optional If ``True``, center the input distribution around its mean, i.e. use central moments :math:`\langle (X - \bar{X})^n \rangle` for computing the NGP (with :math:`\bar{X} = \frac{1}{N} \sum_{i=1}^N X_i` being the sample mean). If ``False``, use the raw moments :math:`\langle X^n \rangle` without subtracting the sample mean. Note that the central moments of the sample are biased estimators for the central moments of the population, whereas the raw moments of the sample are unbiased estimators for the raw moments of the population. See Wikipedia `Moment (mathematics) § Sample moments <https://en.wikipedia.org/wiki/Moment_(mathematics)#Sample_moments>`_. `center` must not be used together with `is_squared`, because we cannot estimate the original sample mean if `x` is already squared. is_squared : bool, optional If ``True``, `x` is assumed to be already squared. If ``False`` (default), `x` will be squared before calculating the moments. Setting `is_squared` to ``True`` is for example useful if `x` is the distribution of squared particle displacements :math:`\Delta\mathbf{r}^2(\tau)` of an ensemble of particles at a given lag time :math:`\tau`. `is_squared` must not be used together with `center`. Returns ------- a : scalar The non-Gaussian parameter :math:`\alpha_{2,d}`. Notes ----- The non-Gaussian parameter (NGP) was first introduced by Rahman, Singwi, and Sjölander. [#]_:sup:`,` [#]_ Since then it has become a standard test to check for Gaussian diffusion, i.e. whether the particle displacements :math:`\Delta\mathbf{r}(\tau)` follow a Gaussian distribution. For a Gaussian distribution the NGP is zero. This function uses the definition of the NGP in :math:`d`-dimensional euclidean space given in the text on page 12735 of Reference: [#]_ .. math:: \alpha_{2,d} = \frac{1}{1+\frac{2}{d}} \frac{\langle X^4 \rangle}{\langle X^2 \rangle^2} - 1 A more general definition of the NGP :math:`\alpha_{n,d}` in :math:`d`-dimensional euclidean space was given by Huang, Wang and Yu in Equation (14) of Reference: [#]_ .. math:: \alpha_{n,d} = \frac{(d-2)!! d^n}{(2n + d - 2)!!} \frac{\langle X^{2n} \rangle}{\langle X^2 \rangle^n} - 1 References ---------- .. [#] A. Rahman, K. S. Singwi, A. Sjölander, `Theory of Slow Neutron Scattering by Liquids. I <https://doi.org/10.1103/PhysRev.126.986>`_, Physical Review, 1962, 126, 986. .. [#] A. Rahman, `Correlations in the Motion of Atoms in Liquid Argon <https://doi.org/10.1103/PhysRev.136.A405>`_, Physical Review, 1964, 136, A405. .. [#] Sanggeun Song, Seong Jun Park, Minjung Kim, Jun Soo Kim, Bong June Sung, Sangyoub Lee, Ji-Hyun Kim, Jaeyoung Sung, `Transport dynamics of complex fluids <https://doi.org/10.1073/pnas.1900239116>`_, Proceedings of the National Academy of Sciences, 2019, 116, 26, 12733-12742 ..
[#] Zihan Huang, Gaoming Wang, Zhao Yu, `Non-Gaussian Parameter in k-Dimensional Euclidean Space <https://doi.org/10.48550/arXiv.1511.06672>`_, arXiv preprint arXiv:1511.06672v1, 2015. Examples -------- >>> mdt.stats.ngp(np.array([-2, -1, -1, 0, 0, 0, 1, 1, 2])) -0.25 >>> a = mdt.stats.gaussian(np.linspace(-5, 5, 11)) >>> mdt.stats.ngp(a) 0.48352183005980653 >>> a = np.arange(24).reshape(2,3,4) >>> mdt.stats.ngp(a) -0.3876040703052728 >>> mdt.stats.ngp(a, center=True) -0.401391304347826 >>> mdt.stats.ngp(a, axis=0) array([[-0.33333333, -0.34113033, -0.35946667, -0.382643 ], [-0.4071511 , -0.43103845, -0.45333333, -0.47363871], [-0.49187475, -0.50812525, -0.52254957, -0.53533412]]) >>> mdt.stats.ngp(a, axis=0, center=True) array([[-0.66666667, -0.66666667, -0.66666667, -0.66666667], [-0.66666667, -0.66666667, -0.66666667, -0.66666667], [-0.66666667, -0.66666667, -0.66666667, -0.66666667]]) >>> mdt.stats.ngp(a, axis=1) array([[-0.32 , -0.37225959, -0.42285714, -0.46559096], [-0.6152 , -0.62068471, -0.62535515, -0.62936154]]) >>> mdt.stats.ngp(a, axis=1, center=True) array([[-0.5, -0.5, -0.5, -0.5], [-0.5, -0.5, -0.5, -0.5]]) >>> mdt.stats.ngp(a, axis=2) array([[-0.33333333, -0.61552028, -0.64866075], [-0.65763599, -0.66126512, -0.66307898]]) >>> mdt.stats.ngp(a, axis=2, center=True) array([[-0.45333333, -0.45333333, -0.45333333], [-0.45333333, -0.45333333, -0.45333333]]) """ # noqa: E501, W505 if is_squared and center: raise ValueError( "'center' must not be used together with 'is_squared'" ) x2 = np.array(x, dtype=np.float64, copy=not is_squared) if not is_squared: if center: x2 = mdt.stats.center( x2, axis=axis, dtype=np.float64, inplace=True ) x2 **= 2 x4_mean = np.mean(x2**2, axis=axis, dtype=np.float64) x2_mean = np.mean(x2, axis=axis, dtype=np.float64) return x4_mean / ((1 + 2 / d) * x2_mean**2) - 1
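# Usage sketch (added for illustration; not part of the original
# module): for displacements drawn from a 3-dimensional Gaussian, the
# NGP vanishes up to sampling noise.
rng = np.random.default_rng(seed=42)       # Hypothetical example data.
displ = rng.normal(size=(100000, 3))       # 3-dimensional displacements.
r2 = np.sum(displ**2, axis=1)              # Squared magnitudes.
alpha_2 = mdt.stats.ngp(r2, d=3, is_squared=True)
assert abs(alpha_2) < 0.05                 # ~0 for Gaussian displacements.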
[docs] def exp_dist(x, rate=1): r""" Exponential distribution. .. math:: f(x) = \lambda e^{-\lambda x} The mean and standard deviation of an exponential distribution are given by :math:`1/\lambda`. Parameters ---------- x : scalar or array_like Array of :math:`x` values for which to evaluate :math:`f(x)`. rate : scalar or array_like, optional The value(s) for :math:`\lambda`. If an array is provided, it must be broadcastable to a common shape with `x`. Returns ------- f : scalar or array_like The function values :math:`f(x)` at the given :math:`x` values. See Also -------- :obj:`scipy.stats.expon` : Exponential continuous random variable Examples -------- >>> from scipy.stats import expon >>> x = np.linspace(0, 4, 5) >>> y1 = mdt.stats.exp_dist(x, rate=2) >>> y2 = expon.pdf(x, loc=0, scale=1/2) >>> np.allclose(y1, y2, rtol=0) True >>> y1 array([2.00000000e+00, 2.70670566e-01, 3.66312778e-02, 4.95750435e-03, 6.70925256e-04]) """ # noqa: W505 return rate * np.exp(-rate * x)
[docs] def exp_dist_log(x, rate): r""" Logarithm of the exponential distribution. .. math:: f(x) = \ln(\lambda) - \lambda x Parameters ---------- x : scalar or array_like Array of :math:`x` values for which to evaluate :math:`f(x)`. rate : scalar or array_like, optional The value(s) for :math:`\lambda`. If an array is provided, it must be broadcastable to a common shape with `x`. Returns ------- f : scalar or array_like The function values :math:`f(x)` at the given :math:`x` values. See Also -------- :func:`exp_dist` : Exponential distribution Examples -------- >>> x = np.linspace(0, 4, 5) >>> y1 = mdt.stats.exp_dist_log(x, rate=2) >>> y2 = mdt.stats.exp_dist(x, rate=2) >>> np.allclose(y1, np.log(y2), rtol=0) True >>> y1 array([ 0.69314718, -1.30685282, -3.30685282, -5.30685282, -7.30685282]) """ # noqa: W505 return np.log(rate) - rate * x
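# Consistency sketch for `exp_dist` and `exp_dist_log` (added for
# illustration; not part of the original module): exponentiating the
# log-density recovers the density, and the density integrates to one.
x_demo = np.linspace(0, 50, 5001)
pdf = mdt.stats.exp_dist(x_demo, rate=0.5)
assert np.allclose(np.exp(mdt.stats.exp_dist_log(x_demo, rate=0.5)), pdf)
assert np.isclose(np.trapz(pdf, x_demo), 1, rtol=0, atol=1e-3)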
[docs] def mb_dist(v, temp=None, mass=None, var=None, drift=0, d=3): r""" Maxwell-Boltzmann speed distribution in :math:`d`-dimensional space. [#]_ .. math:: p(v) = S_d \cdot (v - u)^{d-1} \left(\frac{1}{2\pi\sigma^2}\right)^{\frac{d}{2}} \exp{\left[-\frac{(v - u)^2}{2\sigma^2} \right]} With the speed .. math:: v = \sqrt{\sum_{i=1}^d v_i^2}, a drift speed :math:`u`, the variance of the gaussian part (note that this is not the variance of :math:`p(v)`) .. math:: \sigma^2 = \frac{k_B T}{m} and the surface area of a :math:`d`-dimensional unit sphere [#]_ .. math:: S_d = \frac{2\pi^{\frac{d}{2}}}{\Gamma\left(\frac{d}{2}\right)} = \begin{cases} 2 , & d = 1\\ 2\pi , & d = 2\\ 4\pi , & d = 3\\ 2\pi^2 , & d = 4\\ \vdots & \end{cases} where :math:`\Gamma` is the `gamma function <https://en.wikipedia.org/wiki/Gamma_function>`_. Parameters ---------- v : array_like Array of speed values for which to evaluate :math:`p(v)`. temp : scalar or None, optional Temperature of the system. Must be provided if `var` is ``None``. mass : scalar or None, optional Mass of the particles. Must be provided if `var` is ``None``. var : scalar or None, optional Variance :math:`\sigma^2` of the Maxwell-Boltzmann distribution. If ``None``, it is calculated as :math:`\sigma^2 = k_B T / m`, where :math:`k_B` is the Boltzmann constant, :math:`T` is the temperature and :math:`m` is the mass of the particles. If both, `temp` and `mass`, are provided, `var` is ignored. drift : scalar, optional Drift speed of the particles. d : int, optional Number of spatial dimensions in which the particles can move. Returns ------- p : scalar or array_like The function values :math:`p(v)` at the given :math:`v` values. See Also -------- :func:`mdtools.statistics.ekin_dist` : Distribution of kinetic energies in :math:`d`-dimensional space Notes ----- Be aware of the units of your input values! The unit of the Boltzmann constant :math:`k_B` is J/K. Thus, temperatures must be given in K, masses in kg and velocities in m/s. If you parse `var` directly instead of `temp` and `mass`, ``v**2`` and `var` must have the same units. Note that all arguments should be positive to get a physically meaningful output. However, it is also possible to parse negative values to all input arguments. Note that a (positive) drift velocity only leads to a shift of the distribution along the x axis (to higher values). Therefore, parts of the distribution that, without drift, would lie in the region of (unphysical) negative speeds now appear at positive speed values. The user is responsible to take care of this unphysical artifact. Generally, it's easiest to calculate the Maxwell-Boltzmann distribution without drift at positive speed values and afterwards shift the outcome by the drift speed. References ---------- .. [#] Wikipedia `Maxwell-Boltzmann distribution § In n-dimensional space <https://en.wikipedia.org/wiki/Maxwell%E2%80%93Boltzmann_distribution#In_n-dimensional_space>`_ .. [#] Wikipedia `n-sphere § Volume and area <https://en.wikipedia.org/wiki/N-sphere#Volume_and_area>`_ Examples -------- >>> x = np.linspace(0, 4, 5) >>> mdt.stats.mb_dist(x, var=1) array([0. , 0.48394145, 0.43192773, 0.07977327, 0.00428257]) >>> mdt.stats.mb_dist(x, var=2) array([0. , 0.21969564, 0.4151075 , 0.26759315, 0.08266794]) >>> mdt.stats.mb_dist(x, var=1, d=1) array([7.97884561e-01, 4.83941449e-01, 1.07981933e-01, 8.86369682e-03, 2.67660452e-04]) >>> mdt.stats.mb_dist(x, var=1, d=2) array([0. 
, 0.60653066, 0.27067057, 0.03332699, 0.00134185]) >>> mdt.stats.mb_dist(x, var=1, d=4) array([0. , 0.30326533, 0.54134113, 0.14997145, 0.0107348 ]) >>> mdt.stats.mb_dist(x, var=1, drift=1) array([0.48394145, 0. , 0.48394145, 0.43192773, 0.07977327]) >>> np.allclose( ... mdt.stats.mb_dist(x, var=1)[:-1], ... mdt.stats.mb_dist(x, var=1, drift=1)[1:], ... rtol=0, ... ) True >>> from scipy import constants >>> temp, mass = 273.15, 1.660e-27 >>> var = constants.k * temp / mass >>> np.allclose( ... mdt.stats.mb_dist(x, temp=temp, mass=mass), ... mdt.stats.mb_dist(x, var=var), ... rtol=0, ... ) True >>> np.allclose( ... mdt.stats.mb_dist(x, temp=temp, mass=mass, d=2), ... mdt.stats.mb_dist(x, var=var, d=2), ... rtol=0, ... ) True >>> np.allclose( ... mdt.stats.mb_dist(x, temp=temp, mass=mass, drift=2), ... mdt.stats.mb_dist(x, var=var, drift=2), ... rtol=0, ... ) True The area below the Maxwell-Boltzmann distribution is one: >>> x = np.linspace(0, 20, 200) >>> for d in range(1, 6): ... np.isclose( ... np.trapz(mdt.stats.mb_dist(x, var=1, d=d), x), ... 1, ... rtol=0, ... atol=1e-3, ... ) True True True True True """ # noqa: E501, W505 if var is None: if temp is None or mass is None: raise ValueError( "'mass' and 'temp' must be provided if 'var' is None" ) var = constants.k * temp / mass else: if temp is not None and mass is not None: warnings.warn( "'var' ({}) is ignored because 'temp' and 'mass' are" " given".format(var), UserWarning, ) dist = norm.pdf(v, loc=drift, scale=np.sqrt(var)) dist *= (v - drift) ** (d - 1) # Remaining factor: S_d * [1/(2*pi*sigma^2)]**(d/2), but # [1/(2*pi*sigma^2)]**(1/2) is already contained in `norm.pdf` # => The remaining factor is actually # S_d * [1/(2*pi*sigma^2)]**((d-1)/2) sd = 2 * np.pi ** (d / 2) sd /= special.gamma(d / 2) dist *= sd dist /= (2 * np.pi * var) ** ((d - 1) / 2) return dist
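# Usage sketch (added for illustration; not part of the original
# module): in three dimensions the mean speed of the Maxwell-Boltzmann
# distribution is <v> = sqrt(8 * sigma^2 / pi) with sigma^2 = kT/m.
v_demo = np.linspace(0, 30, 3001)
pdf = mdt.stats.mb_dist(v_demo, var=4)
v_mean = np.trapz(v_demo * pdf, v_demo)
assert np.isclose(v_mean, np.sqrt(8 * 4 / np.pi), rtol=0, atol=1e-3)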
def ekin_dist(ekin, temp, d=3): r""" Distribution of kinetic energies in :math:`d`-dimensional space. [#]_ Maxwell-Boltzmann distribution of the kinetic energies of the particles in a canonical (:math:`NVT`) ensemble: .. math:: p(E) = \frac{S_d}{2} (\pi k_B T)^{-\frac{d}{2}} E^{\frac{d-2}{2}} \exp{\left( -\frac{E}{k_B T} \right)} With the kinetic energy :math:`E`, the Boltzmann constant :math:`k_B`, the temperature of the system :math:`T` and the surface area of a :math:`d`-dimensional unit sphere [#]_ .. math:: S_d = \frac{2\pi^{\frac{d}{2}}}{\Gamma\left(\frac{d}{2}\right)} = \begin{cases} 2 , & d = 1\\ 2\pi , & d = 2\\ 4\pi , & d = 3\\ 2\pi^2 , & d = 4\\ \vdots & \end{cases} where :math:`\Gamma` is the `gamma function <https://en.wikipedia.org/wiki/Gamma_function>`_. Parameters ---------- ekin : array_like Array of kinetic energy values :math:`E` for which to evaluate :math:`p(E)`. temp : scalar Temperature of the system. d : int, optional Number of spatial dimensions in which the particles can move. Returns ------- p : scalar or array_like The function values :math:`p(E)` at the given :math:`E` values. See Also -------- :func:`mdtools.statistics.mb_dist` : Maxwell-Boltzmann speed distribution in :math:`d`-dimensional space Notes ----- Be aware of the units of your input values! The unit of the Boltzmann constant :math:`k_B` is J/K. Thus, temperatures must be given in K and energy values must be given in J. Note that all arguments should be positive to get a physically meaningful output. However, it is also possible to pass negative values to all input arguments. The distribution of the kinetic energies :math:`p(E) \text{ d}E` can be derived from the Maxwell-Boltzmann speed distribution .. math:: p(v) \text{ d}v = S_d v^{d-1} \left( \frac{m}{2 \pi k_B T} \right)^{\frac{d}{2}} \exp{\left( -\frac{mv^2}{2k_B T} \right)} \text{ d}v by substituting :math:`E = \frac{1}{2} mv^2`. References ---------- .. [#] Wikipedia `Maxwell-Boltzmann distribution § Distribution for the energy <https://en.wikipedia.org/wiki/Maxwell%E2%80%93Boltzmann_distribution#Distribution_for_the_energy>`_ .. [#] Wikipedia `n-sphere § Volume and area <https://en.wikipedia.org/wiki/N-sphere#Volume_and_area>`_ Examples -------- >>> from scipy import constants >>> temp = 1 / constants.k # Set thermal energy kT to 1 >>> x = np.linspace(0, 3, 5) >>> mdt.stats.ekin_dist(x, temp) array([0. , 0.46159897, 0.30836066, 0.17839543, 0.09730435]) >>> mdt.stats.ekin_dist(x, temp, d=1) array([ inf, 0.30773265, 0.10278689, 0.03964343, 0.01621739]) >>> mdt.stats.ekin_dist(x, temp, d=2) array([1. , 0.47236655, 0.22313016, 0.10539922, 0.04978707]) >>> mdt.stats.ekin_dist(x, temp, d=4) array([0. , 0.35427491, 0.33469524, 0.23714826, 0.14936121]) The area below the distribution is one: >>> x = np.linspace(0, 20, 200) >>> # In the 1-dimensional case, the distribution goes to infinity >>> # for E->0 and therefore cannot be integrated numerically. >>> for d in range(2, 6): ... dist = mdt.stats.ekin_dist(x, temp, d) ... integral = np.trapz(dist, x) ... np.isclose(integral, 1, rtol=0, atol=1e-2) True True True True """ # Surface area of a d-dimensional unit sphere divided by 2. sd = np.pi ** (d / 2) sd /= special.gamma(d / 2) # Pre-factor pre = sd * (np.pi * constants.k * temp) ** (-d / 2) # Distribution dist = ekin ** ((d - 2) / 2) dist *= np.exp(-ekin / (constants.k * temp)) dist *= pre return dist
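# Usage sketch (added for illustration; not part of the original
# module): the mean of the kinetic-energy distribution reproduces the
# equipartition result <E> = (d/2) * kT.
from scipy import constants
temp_demo = 1 / constants.k              # Choose units such that kT = 1.
e_demo = np.linspace(0, 40, 4001)
pdf = mdt.stats.ekin_dist(e_demo, temp_demo, d=3)
assert np.isclose(np.trapz(e_demo * pdf, e_demo), 1.5, rtol=0, atol=1e-2)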
def var_weighted(a, weights=None, axis=None, return_mean=False): r""" Compute the weighted variance along a given axis. Parameters ---------- a : array_like Input array. weights : array_like or None, optional An array of weights associated with the values in `a`. Each value in `a` contributes to the variance according to its associated weight. The weights array can either be 1-dimensional (in which case its length must be the size of `a` along the given axis) or of the same shape as `a`. If `weights` is ``None``, then all data in `a` are assumed to have a weight equal to one. The only constraint on `weights` is that ``np.sum(weights)`` must not be 0 along the given axis. axis : None or int or tuple of ints, optional Axis or axes along which the variance is computed. The default is to compute the variance of the flattened array. return_mean : bool, optional If ``True``, also return the weighted sample mean :math:`\bar{X}` used to calculate the weighted variance. Returns ------- var : scalar Weighted variance. mean : scalar Weighted sample mean. Only returned if `return_mean` is ``True``. See Also -------- :func:`numpy.average` : Compute the weighted average along the specified axis :func:`numpy.var` : Compute the variance along the specified axis :func:`mdtools.statistics.std_weighted` : Compute the weighted standard deviation along a given axis Notes ----- The weighted variance is given by .. math:: \sigma^2 = \sum_{i=1}^N (X_i - \bar{X})^2 \cdot p(X_i) with the weighted sample mean .. math:: \bar{X} = \sum_{i=1}^N X_i \cdot p(X_i) the probability of element :math:`X_i` .. math:: p(X_i) = \frac{w_i}{\sum_{j=1}^N w_j} and the weight :math:`w_i` of the :math:`i`-th element. """ mean = np.average(a, axis=axis, weights=weights) if axis is not None: # Make the mean broadcastable against `a` (cf. `center`). mean_bc = np.expand_dims(mean, axis=axis) else: mean_bc = mean var = np.average(np.abs(a - mean_bc) ** 2, axis=axis, weights=weights) if return_mean: return var, mean else: return var
def std_weighted(a, weights=None, axis=None, return_mean=False): r""" Compute the weighted standard deviation along a given axis. Parameters ---------- a : array_like Input array. weights : array_like or None, optional An array of weights associated with the values in `a`. Each value in `a` contributes to the standard deviation according to its associated weight. The weights array can either be 1-dimensional (in which case its length must be the size of `a` along the given axis) or of the same shape as `a`. If `weights` is ``None``, then all data in `a` are assumed to have a weight equal to one. The only constraint on `weights` is that ``np.sum(weights)`` must not be 0 along the given axis. axis : None or int or tuple of ints, optional Axis or axes along which the standard deviation is computed. The default is to compute the standard deviation of the flattened array. return_mean : bool, optional If ``True``, also return the weighted sample mean :math:`\bar{X}` used to calculate the weighted standard deviation. Returns ------- std : scalar Weighted standard deviation. mean : scalar Weighted sample mean. Only returned if `return_mean` is ``True``. See Also -------- :func:`numpy.average` : Compute the weighted average along the specified axis :func:`numpy.std` : Compute the standard deviation along the specified axis :func:`mdtools.statistics.var_weighted` : Compute the weighted variance along a given axis Notes ----- The weighted standard deviation is given by .. math:: \sigma = \sqrt{\sum_{i=1}^N (X_i - \bar{X})^2 \cdot p(X_i)} with the weighted sample mean .. math:: \bar{X} = \sum_{i=1}^N X_i \cdot p(X_i) the probability of element :math:`X_i` .. math:: p(X_i) = \frac{w_i}{\sum_{j=1}^N w_j} and the weight :math:`w_i` of the :math:`i`-th element. """ var, mean = mdt.stats.var_weighted( a, weights=weights, axis=axis, return_mean=True ) std = np.sqrt(var) if return_mean: return std, mean else: return std
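# Equivalence sketch for `var_weighted` and `std_weighted` (added for
# illustration; not part of the original module): integer weights act
# like repeating the corresponding values, and the weighted standard
# deviation is the square root of the weighted variance.
a_demo = np.array([1.0, 2.0, 4.0])
w_demo = np.array([1, 2, 1])
var_w = mdt.stats.var_weighted(a_demo, weights=w_demo)
assert np.isclose(var_w, np.var([1.0, 2.0, 2.0, 4.0]))
assert np.isclose(mdt.stats.std_weighted(a_demo, weights=w_demo), np.sqrt(var_w))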
def cumav(a, axis=None, out=None): r""" Calculate the cumulative average. Parameters ---------- a : array_like Array of values for which to calculate the cumulative average. The dtype is converted to ``numpy.float64``. axis : None or int, optional Axis along which the cumulative average is computed. The default is to compute the cumulative average of the flattened array. out : array_like Alternative output array in which to place the result. It must have the same shape and buffer length as ``a.flatten()`` (if `axis` is ``None``) or `a` (if `axis` is not ``None``), but the type will be cast if necessary. See :func:`numpy.cumsum` for more details. Note that the result will be cast to the dtype of `out`, even if this dtype cannot hold the resulting values (e.g. if the dtype of `out` is :class:`int` but the result contains floating point values, the fractional part of these floats is cut). So it is safest to pass an output array of dtype ``numpy.float64``, also with respect to the accuracy of the average calculation (compare notes of :func:`numpy.mean`). Returns ------- cav : numpy.ndarray The cumulative average along the specified axis. The result has the same size as `a`, and also the same shape as `a` if `axis` is not ``None`` or if `a` is a 1-d array. See Also -------- :func:`numpy.cumsum` : Return the cumulative sum of the elements along a given axis :func:`mdtools.statistics.movav` : Calculate a moving average Notes ----- The cumulative average at the :math:`n`-th position is given by .. math:: \mu_n = \frac{1}{n+1} \sum_{i=0}^{n} a_i Examples -------- >>> a = np.arange(6) >>> mdt.stats.cumav(a) array([0. , 0.5, 1. , 1.5, 2. , 2.5]) >>> a = np.arange(6).reshape(2,3) >>> mdt.stats.cumav(a) array([0. , 0.5, 1. , 1.5, 2. , 2.5]) >>> mdt.stats.cumav(a, axis=0) array([[0. , 1. , 2. ], [1.5, 2.5, 3.5]]) >>> mdt.stats.cumav(a, axis=1) array([[0. , 0.5, 1. ], [3. , 3.5, 4. ]]) >>> a = np.arange(12).reshape(2,2,3) >>> mdt.stats.cumav(a) array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]) >>> mdt.stats.cumav(a, axis=0) array([[[0., 1., 2.], [3., 4., 5.]], <BLANKLINE> [[3., 4., 5.], [6., 7., 8.]]]) >>> mdt.stats.cumav(a, axis=1) array([[[0. , 1. , 2. ], [1.5, 2.5, 3.5]], <BLANKLINE> [[6. , 7. , 8. ], [7.5, 8.5, 9.5]]]) >>> mdt.stats.cumav(a, axis=2) array([[[ 0. , 0.5, 1. ], [ 3. , 3.5, 4. ]], <BLANKLINE> [[ 6. , 6.5, 7. ], [ 9. , 9.5, 10. ]]]) """ a = np.asarray(a, dtype=np.float64) cav = np.cumsum(a, axis=axis, out=out) if axis is None: norm = np.arange(1, a.size + 1, dtype=np.uint32) else: s = [1 for i in range(a.ndim)] s[axis] = a.shape[axis] norm = np.arange(1, a.shape[axis] + 1, dtype=np.uint32).reshape(s) cav /= norm return cav
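# Minimal check (added for illustration; not part of the original
# module): the n-th element of the cumulative average is the mean of
# the first n + 1 elements.
a_demo = np.arange(1.0, 7.0)
ref = np.array([a_demo[: i + 1].mean() for i in range(a_demo.size)])
assert np.allclose(mdt.stats.cumav(a_demo), ref)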
[docs] def movav(a, wlen, axis=None, out=None): r""" Calculate the moving average. Also known as rolling average or running average. Parameters ---------- a : array_like Array of values for which to calculate the moving average. The dtype is converted to ``numpy.float64``. wlen : int Length of the averaging window (corresponds to :math:`k` in the formula below). Must be at least one and not greater than the size of `a` (if `axis` is ``None``) or the length of `a` along the given axis (if `axis` is not ``None``). axis : None or int, optional Axis along which to compute the moving average. The default is to compute the moving average of the flattened array. out : array_like Alternative output array in which to place the result. The first ``wlen - 1`` elements along the given axis must be discarded after receiving the result. `out` must have the same shape and buffer length as ``a.flatten()`` (if `axis` is ``None``) or `a` (if `axis` is not ``None``), but the type will be cast if necessary. See :func:`numpy.cumsum` for more details. Note that the result will be cast to the dtype of `out`, even if this dtype cannot hold the resulting values (e.g. if the dtype of `out` is :class:`int` but the result contains floating point values, the fractional part of these floats is cut). So it's safest, to parse an output array of dtype ``numpy.float64``, also with respect to the accuracy of the average calculation (compare notes of :func:`numpy.mean`). Returns ------- mav : numpy.ndarray The moving average along the specified axis. The result has the same shape as `a` except along the given axis where the length is smaller by ``wlen - 1``. See Also -------- :func:`mdtools.statistics.cumav` : Calculate the cumulative average. Notes ----- The moving average at the :math:`m`-th position with a window size of :math:`k` is given by .. math:: \mu_m^k = \frac{1}{k} \sum_{i=m}^{m+k-1} a_i This function implements the moving average according to the following formula using :func:`numpy.cumsum`: .. math:: \mu_m^k = \frac{1}{k} \left( \sum_{i=0}^{m+k-1} a_i - \sum_{i=0}^{m-1} a_i \right) Examples -------- >>> a = np.arange(6) >>> mdt.stats.movav(a, wlen=3) array([1., 2., 3., 4.]) >>> mdt.stats.movav(a, wlen=4) array([1.5, 2.5, 3.5]) >>> a = np.arange(12).reshape(3,4) >>> mdt.stats.movav(a, wlen=5) array([2., 3., 4., 5., 6., 7., 8., 9.]) >>> mdt.stats.movav(a, wlen=2, axis=0) array([[2., 3., 4., 5.], [6., 7., 8., 9.]]) >>> mdt.stats.movav(a, wlen=2, axis=1) array([[ 0.5, 1.5, 2.5], [ 4.5, 5.5, 6.5], [ 8.5, 9.5, 10.5]]) >>> a = np.arange(36).reshape(3,3,4) >>> mdt.stats.movav(a, wlen=29) array([14., 15., 16., 17., 18., 19., 20., 21.]) >>> mdt.stats.movav(a, wlen=2, axis=0) array([[[ 6., 7., 8., 9.], [10., 11., 12., 13.], [14., 15., 16., 17.]], <BLANKLINE> [[18., 19., 20., 21.], [22., 23., 24., 25.], [26., 27., 28., 29.]]]) >>> mdt.stats.movav(a, wlen=2, axis=1) array([[[ 2., 3., 4., 5.], [ 6., 7., 8., 9.]], <BLANKLINE> [[14., 15., 16., 17.], [18., 19., 20., 21.]], <BLANKLINE> [[26., 27., 28., 29.], [30., 31., 32., 33.]]]) >>> mdt.stats.movav(a, wlen=3, axis=2) array([[[ 1., 2.], [ 5., 6.], [ 9., 10.]], <BLANKLINE> [[13., 14.], [17., 18.], [21., 22.]], <BLANKLINE> [[25., 26.], [29., 30.], [33., 34.]]]) """ a = np.asarray(a, dtype=np.float64) mav = np.cumsum(a, axis=axis, out=out) if wlen < 1: raise ValueError( "The window length must be at least 1 but you gave" " {}.".format(wlen) ) # Call `a.shape[axis]` only after `np.cumsum` to get a proper # `AxisError` instead of an `IndexError` if the axis does not exist. 
elif axis is None and wlen > a.size: raise ValueError( "The window length ({}) must not be greater than the size of the" " input array {}.".format(wlen, a.size) ) elif axis is not None and wlen > a.shape[axis]: raise ValueError( "The window length ({}) must not be greater than the length of the" " input array along the given axis {}.".format(wlen, a.shape[axis]) ) # `mav_full_sum` is the first sum in the round brackets of the # implementation formula from the above docstring. # Note that `mdt.nph.take` always creates a view, not a copy. mav_full_sum = mdt.nph.take(mav, start=wlen, axis=axis) # Don't do an in-place subtraction here (`mav_windows -= ...`), # because then array elements will already be changed before the # subtraction process has finished. mav_full_sum[:] = mav_full_sum - mdt.nph.take(mav, stop=-wlen, axis=axis) mav_windows = mdt.nph.take(mav, start=wlen - 1, axis=axis) mav_windows /= wlen return mav_windows
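# Equivalence sketch (added for illustration; not part of the original
# module): the moving average agrees with a "valid"-mode convolution
# with a normalized rectangular window.
a_demo = np.arange(10.0)
assert np.allclose(
    mdt.stats.movav(a_demo, wlen=3),
    np.convolve(a_demo, np.ones(3) / 3, mode="valid"),
)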
[docs] def block_average(data, axis=0, ddof=0, dtype=np.float64): """ Calculate the sample means and their standard deviations over multiple series of measurement. Parameters ---------- data : array_like The array that holds the measured values. Must have at least two dimensions, one dimension containing the different series of measurement and at least one other dimension containing the measured values per series. axis : int, optional Axis along which the means and their standard deviations are computed. ddof : int, optional Delta Degrees of Freedom. The divisor used in calculating the standard deviation is ``N-ddof``, where ``N`` is the number of measurements. dtype : type, optional The data type of the output arrays and the data type to use in computing the means and standard deviations. Note: Computing means and standard deviation using ``numpy.float32`` or lower can be inaccurate. See :func:`numpy.mean` for more details. Returns ------- mean : numpy.ndarray The mean values of the measurements. sd : numpy.ndarray The standard deviations of the mean values. See Also -------- :func:`numpy.mean` : Compute the arithmetic mean along the specified axis :func:`numpy.std` : Compute the standard deviation along the specified axis """ data = np.asarray(data) num_measurements = data.shape[axis] mean = np.mean(data, axis=axis, dtype=dtype) sd = np.std(data, axis=axis, ddof=ddof, dtype=dtype) # Standard deviation of the mean value after n series of # measurement: sd_n = sd / sqrt(n). The mean value remains always # the same inside the error region. np.divide(sd, np.sqrt(num_measurements), out=sd, dtype=dtype) return mean, sd
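# Usage sketch (added for illustration; not part of the original
# module): averaging over independent series shrinks the standard
# deviation of the mean by 1/sqrt(number of series).
rng = np.random.default_rng(seed=10)                      # Hypothetical data.
data_demo = rng.normal(loc=5, scale=2, size=(10, 1000))   # 10 series.
mean, sd = mdt.stats.block_average(data_demo, axis=0)
# `sd` is the per-point standard deviation divided by sqrt(10), i.e.
# roughly 2 / sqrt(10) ~ 0.63 here.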
[docs] def moment_raw2cen(rm, axis=-1): r""" Calculate the :math:`n`-th central moment :math:`\mu_n` from the first :math:`n` raw moments :math:`m_n`. [#]_ .. math:: \mu_n = \langle (x - \mu)^n \rangle = \sum_{k=0}^{n} {n \choose k} (-1)^{n-k} m_k \mu^{n-k} with the :math:`k`-th raw moment :math:`m_k = \langle x^k \rangle` and the mean :math:`\mu = m_1`. Parameters ---------- rm : array_like Array containing the first :math:`n` raw moments. The raw moments must be ordered along the given axis by the raw moments' order (i.e. 1st raw moment first, 2nd raw moment second, ...). The array must not contain the :math:`0`-th raw moment (which is anyway always 1). axis : int, optional The axis along which the raw moments are ordered. Returns ------- cm : scalar or numpy.ndarray The :math:`n`-th central moment, where :math:`n` is given by ``numpy.asarray(rm).shape[-1]``. Notes ----- This function cannot return the :math:`0`-th central moment. Anyway, this is always 1. References ---------- .. [#] Wikipedia `Central moment § Relation to moments about the origin <https://en.wikipedia.org/wiki/Central_moment#Relation_to_moments_about_the_origin>`_ Examples -------- 1-dimensional input arrays: >>> # 1st central moment (is always zero). >>> rm = np.array([3]) >>> mdt.stats.moment_raw2cen(rm) 0 >>> # 2nd central moment (<x^2> - <x>^2 = variance) >>> rm = np.array([ 3, 11]) >>> mdt.stats.moment_raw2cen(rm) 2 >>> # 3rd central moment >>> rm = np.array([ 3, 11, 49]) >>> mdt.stats.moment_raw2cen(rm) 4 >>> # 4th central moment >>> rm = np.array([ 3, 11, 49, 253]) >>> mdt.stats.moment_raw2cen(rm) 16 >>> # The first central moment is always zero. >>> [mdt.stats.moment_raw2cen([rm]) for rm in [0, 1/3, 4, 7/3]] [0, 0.0, 0, 0.0] 2-dimensional input arrays: >>> rm = np.array([[ 3, 11, 49]]) >>> mdt.stats.moment_raw2cen(rm) array([4]) >>> ax = 0 >>> rm = np.array([[ 3, 11, 49], ... [ 13, 121, 2403]]) >>> mdt.stats.moment_raw2cen(rm, axis=ax) array([4, 0, 2]) >>> ax = 1 >>> mdt.stats.moment_raw2cen(rm, axis=ax) array([ 4, 2078]) 3-dimensional input arrays: >>> rm = np.array([[[ 3, 11, 49]]]) >>> mdt.stats.moment_raw2cen(rm) array([[4]]) >>> ax = 0 >>> rm = np.array([[[ 3, 11, 49], ... [ 13, 121, 2403]], ... ... [[ 1, 9, 51], ... [ 2, 87, 2605]]]) >>> mdt.stats.moment_raw2cen(rm, axis=ax) array([[ -8, -112, -2350], [ -167, -14554, -5771804]]) >>> ax = 1 >>> mdt.stats.moment_raw2cen(rm, axis=ax) array([[4, 0, 2], [1, 6, 4]]) >>> ax = 2 >>> mdt.stats.moment_raw2cen(rm, axis=ax) array([[ 4, 2078], [ 26, 2099]]) """ rm = np.asarray(rm) if rm.ndim == 0: raise ValueError("`rm` must be at least 1-dimensional") n = rm.shape[axis] # Prepend the 0-th raw moment to the array of raw moments. # 0-th raw moment is always <x^0> = 1. shape = list(rm.shape) shape[axis] = 1 rm0 = np.ones(shape, dtype=rm.dtype) rm = np.concatenate([rm0, rm], axis=axis) # Mean = 1st raw moment. mean = np.squeeze(mdt.nph.take(rm, start=1, stop=2, axis=axis)) # Powers of the mean: (-1)^{n-k} * \mu^{n-k} = (-\mu)^{n-k}. mean_pows = np.stack([(-mean) ** (n - k) for k in range(n + 1)], axis=axis) # Binomial coefficients: {n \choose k}. coeffs = np.array([comb(n, k, exact=True) for k in range(n + 1)]) shape = [1 for dim in rm.shape] shape[axis] = rm.shape[axis] coeffs = coeffs.reshape(shape) return np.sum(coeffs * mean_pows * rm, axis=axis)
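# Cross-check sketch (added for illustration; not part of the original
# module): the third central moment obtained from the first three raw
# sample moments matches the directly computed central moment.
x_demo = np.random.default_rng(seed=3).exponential(size=1000)
rm_demo = np.array([np.mean(x_demo**k) for k in range(1, 4)])
cm3 = mdt.stats.moment_raw2cen(rm_demo)
assert np.isclose(cm3, np.mean((x_demo - x_demo.mean()) ** 3))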