Source code for contact_hist

#!/usr/bin/env python3

# This file is part of MDTools.
# Copyright (C) 2021  The MDTools Development Team and all contributors
# listed in the file AUTHORS.rst
#
# MDTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# MDTools is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with MDTools.  If not, see <http://www.gnu.org/licenses/>.


"""
Calculate the number of contacts between two MDAnalysis
:class:`AtomGroups <MDAnalysis.core.groups.AtomGroup>`.

.. todo::

    Finish docstring.

Contacts are binned into histograms according to how many different
selection :class:`Atoms <MDAnalysis.core.groups.Atom>`/compounds have
contact with a given reference
:class:`~MDAnalysis.core.groups.Atom`/compound and according to how many
contacts exist between a given reference-selection pair.

Options
-------
-f          Trajectory file.  See |supported_coordinate_formats| of
            MDAnalysis.
-s          Topology file.  See |supported_topology_formats| of
            MDAnalysis.
-o          Output filename.
-b          First frame to read from the trajectory.  Frame numbering
            starts at zero.  Default: ``0``.
-e          Last frame to read from the trajectory.  This is exclusive,
            i.e. the last frame read is actually ``END - 1``.  A value
            of ``-1`` means to read the very last frame.  Default:
            ``-1``.
--every     Read every n-th frame from the trajectory.  Default: ``1``.
--ref       Selection string to select the reference group.  See
            MDAnalysis' |selection_syntax| for possible choices.
--sel       Selection string to select the selection group.  See
            MDAnalysis' |selection_syntax| for possible choices.
-c          Cutoff distance in Angstrom.  A reference and selection atom
            are considered to be in contact, if their distance is less
            than or equal to this cutoff.
--refcmp    {'segments', 'residues', 'fragments'}

            The compounds of the reference group to use for calculating
            the contact histograms.  Contact histograms are always
            calculated for single atom contacts, but also for contacts
            between entire compounds.  Specify here, which compounds to
            use for the reference group.  Compounds can be 'segments',
            'residues' or 'fragments'.  Refer to the MDAanalysis user
            guide for an |explanation_of_these_terms|.  Note that in any
            case, even if ``REFCMP`` is e.g. 'residues', only the atoms
            belonging to the reference group are taken into account,
            even if the compound might comprise additional atoms that
            are not contained in the reference group.  Default:
            ``'residues'``
--selcmp    {'segments', 'residues', 'fragments'}

            Same for the selection group.  Default: ``'residues'``
--updating-ref
            Use an :class:`~MDAnalysis.core.groups.UpdatingAtomGroup`
            for the reference group.  Selection expressions of
            :class:`UpdatingAtomGroups <MDAnalysis.core.groups.UpdatingAtomGroup>`
            are re-evaluated every
            :attr:`time step <MDAnalysis.coordinates.base.Timestep.dt>`.
            This is e.g. useful for position-based selections like
            'type Li and prop z <= 2.0'.
--updating-sel
            Use an :class:`~MDAnalysis.core.groups.UpdatingAtomGroup`
            for the selection group.

See Also
--------
:mod:`lig_change_at_pos_change_blocks` :
    Compare the coordination environment of reference compounds before
    and after they have changed their position

Notes
-----
TODO

Examples
--------
TODO
"""


__author__ = "Andreas Thum"


# Standard libraries
import sys
import os
import warnings
import argparse
from datetime import datetime, timedelta
# Third party libraries
import psutil
import numpy as np
# Local application/library specific imports
import mdtools as mdt



[docs]
def cmp_contact_count_matrices(
        cm, natms_per_refcmp=1, natms_per_selcmp=1, dtype=int):
    """
    Take a contact matrix and return all possible compound contact
    **count** matrices.

    A compound is usually a chemically meaningfull subgroup of an
    :class:`~MDAnalysis.core.groups.AtomGroup`.  This can e.g. be a
    :class:`~MDAnalysis.core.groups.Segment`,
    :class:`~MDAnalysis.core.groups.Residue`,
    :attr:`fragment <MDAnalysis.core.groups.AtomGroup.fragments>` or
    a single :class:`~MDAnalysis.core.groups.Atom`.
    Refer to the MDAnalysis' user guide for an
    |explanation_of_these_terms|.  Note that in any case, only
    :class:`Atoms <MDAnalysis.core.groups.Atom>` belonging to the
    original :class:`~MDAnalysis.core.groups.AtomGroup` are taken into
    account, even if the compound might comprise additional
    :class:`Atoms <MDAnalysis.core.groups.Atom>` that are not contained
    in the original :class:`~MDAnalysis.core.groups.AtomGroup`.

    Parameters
    ----------
    cm : array_like
        (Boolean) contact matrix of shape ``(m, n)`` as e.g. generated
        by :func:`mdtools.structure.contact_matrix`, where ``m`` is the
        number of reference :class:`Atoms <MDAnalysis.core.groups.Atom>`
        and ``n`` is the number of selection
        :class:`Atoms <MDAnalysis.core.groups.Atom>`.
    natms_per_refcmp : int or array_like, optional
        Number of :class:`Atoms <MDAnalysis.core.groups.Atom>` per
        reference compound.  Can be a single integer or an array of
        integers.  If `natms_per_refcmp` is a single integer, all
        reference compounds are assumed to contain the same number of
        :class:`Atoms <MDAnalysis.core.groups.Atom>`.  In this case,
        `natms_per_refcmp` must be an integer divisor of ``cm.shape[0]``.
        If `natms_per_refcmp` is an array of integers, it must contain
        the number of reference
        :class:`Atoms <MDAnalysis.core.groups.Atom>` for each single
        reference compound.  In this case,
        ``numpy.sum(natms_per_refcmp)`` must be equal to ``cm.shape[0]``.
    natms_per_selcmp : int or array_like, optional
        Same for selection compounds (`natms_per_selcmp` is checked
        against ``cm.shape[1]``).
    dtype : dtype, optional
        Data type of the output arrays.

    Returns
    -------
    ccm_refatm_selcmp : numpy.ndarray
        Contact **count** matrix indicating how many contacts exsit
        between a reference :class:`~MDAnalysis.core.groups.Atom` and a
        selection compound.
    ccm_refcmp_selatm : numpy.ndarray
        Same for reference compounds and selection
        :class:`Atoms <MDAnalysis.core.groups.Atom>`.
    ccm_refcmp_selcmp : numpy.ndarray
        Same for reference and selection compounds.

    See Also
    --------
    :func:`mdtools.structure.cmp_contact_count_matrix` :
        Take an :class:`~MDAnalysis.core.groups.Atom` contact matrix and
        sum the contacts of all
        :class:`Atoms <MDAnalysis.core.groups.Atom>` belonging to the
        same compound.
    """
    ccm_refatm_selcmp = mdt.strc.cmp_contact_count_matrix(
        cm=cm,
        natms_per_refcmp=1,
        natms_per_selcmp=natms_per_selcmp,
        dtype=dtype
    )
    ccm_refcmp_selatm = mdt.strc.cmp_contact_count_matrix(
        cm=cm,
        natms_per_refcmp=natms_per_refcmp,
        natms_per_selcmp=1,
        dtype=dtype
    )
    ccm_refcmp_selcmp = mdt.strc.cmp_contact_count_matrix(
        cm=ccm_refcmp_selatm,
        natms_per_refcmp=1,
        natms_per_selcmp=natms_per_selcmp,
        dtype=dtype
    )
    return (ccm_refatm_selcmp, ccm_refcmp_selatm, ccm_refcmp_selcmp)



if __name__ == '__main__':
    timer_tot = datetime.now()
    proc = psutil.Process()
    proc.cpu_percent()  # Initiate monitoring of CPU usage
    parser = argparse.ArgumentParser(
        description=(
            "Calculate the number of contacts between two MDAnalysis"
            " AtomGroups.  For more information, refer to the"
            " documetation of this script."
        )
    )
    parser.add_argument(
        '-f',
        dest='TRJFILE',
        type=str,
        required=True,
        help="Trajectory file."
    )
    parser.add_argument(
        '-s',
        dest='TOPFILE',
        type=str,
        required=True,
        help="Topology file."
    )
    parser.add_argument(
        '-o',
        dest='OUTFILE',
        type=str,
        required=True,
        help="Output filename."
    )
    parser.add_argument(
        '-b',
        dest='BEGIN',
        type=int,
        required=False,
        default=0,
        help="First frame to read from the trajectory.  Frame numbering"
             " starts at zero.  Default: %(default)s."
    )
    parser.add_argument(
        '-e',
        dest='END',
        type=int,
        required=False,
        default=-1,
        help="Last frame to read from the trajectory (exclusive)."
             "  Default: %(default)s."
    )
    parser.add_argument(
        '--every',
        dest='EVERY',
        type=int,
        required=False,
        default=1,
        help="Read every n-th frame from the trajectory.  Default:"
             " %(default)s."
    )
    parser.add_argument(
        '--ref',
        dest='REF',
        type=str,
        nargs='+',
        required=True,
        help="Selection string to select the reference group"
    )
    parser.add_argument(
        '--sel',
        dest='SEL',
        type=str,
        nargs='+',
        required=True,
        help="Selection string to select the selection group."
    )
    parser.add_argument(
        '-c',
        dest='CUTOFF',
        type=float,
        required=True,
        help="Cutoff distance in Angstrom."
    )
    parser.add_argument(
        '--refcmp',
        dest='REFCMP',
        type=str,
        required=False,
        choices=('segments', 'residues', 'fragments'),
        default='residues',
        help="Reference compound.  Default: %(default)s"
    )
    parser.add_argument(
        '--selcmp',
        dest='SELCMP',
        type=str,
        required=False,
        choices=('segments', 'residues', 'fragments'),
        default='residues',
        help="Selection compound.  Default: %(default)s"
    )
    parser.add_argument(
        '--updating-ref',
        dest='UPDATING_REF',
        required=False,
        default=False,
        action='store_true',
        help="Use an UpdatingAtomGroup for the reference group."
    )
    parser.add_argument(
        '--updating-sel',
        dest='UPDATING_SEL',
        required=False,
        default=False,
        action='store_true',
        help="Use an UpdatingAtomGroup for the selection group."
    )
    args = parser.parse_args()
    print(mdt.rti.run_time_info_str())
    if args.CUTOFF <= 0:
        raise ValueError("-c ({}) must be positive".format(args.CUTOFF))
    if mdt.rti.get_num_CPUs() > 1:
        mdabackend = 'OpenMP'
    else:
        mdabackend = 'serial'

    print("\n")
    u = mdt.select.universe(top=args.TOPFILE, trj=args.TRJFILE)
    print("\n")
    print("Creating selections...")
    timer = datetime.now()
    ref = u.select_atoms(' '.join(args.REF), updating=args.UPDATING_REF)
    sel = u.select_atoms(' '.join(args.SEL), updating=args.UPDATING_SEL)
    if ref.n_atoms == 0 and not args.UPDATING_REF:
        raise ValueError("The reference group contains no atoms")
    if sel.n_atoms == 0 and not args.UPDATING_SEL:
        raise ValueError("The selection group contains no atoms")
    print("Reference group: '{}'".format(' '.join(args.REF)))
    print(mdt.rti.ag_info_str(ag=ref, indent=2))
    print("Selection group: '{}'".format(' '.join(args.SEL)))
    print(mdt.rti.ag_info_str(ag=sel, indent=2))
    print("Elapsed time:         {}".format(datetime.now() - timer))
    print("Current memory usage: {:.2f} MiB"
          .format(proc.memory_info().rss / 2**20))
    print("\n")
    BEGIN, END, EVERY, N_FRAMES = mdt.check.frame_slicing(
        start=args.BEGIN,
        stop=args.END,
        step=args.EVERY,
        n_frames_tot=u.trajectory.n_frames
    )

    if not args.UPDATING_REF:
        natms_per_refcmp = mdt.strc.natms_per_cmp(
            ag=ref, cmp=args.REFCMP, check_contiguous=True
        )
    if not args.UPDATING_SEL:
        natms_per_selcmp = mdt.strc.natms_per_cmp(
            ag=sel, cmp=args.SELCMP, check_contiguous=True
        )
    if not args.UPDATING_REF and not args.UPDATING_SEL:
        dist_array_tmp = np.full(
            (ref.n_atoms, sel.n_atoms), np.nan, dtype=np.float64
        )
    else:
        dist_array_tmp = None

    expected_max_contacts = 16  # Will be increased if necessary
    # Histogram for refatm_selatm:
    hist_refatm_selatm = np.zeros(expected_max_contacts, dtype=np.uint32)
    # Histograms for refatm_selcmp, refcmp_selatm, refcmp_selcmp:
    hists_cmp = [np.zeros((4, expected_max_contacts), dtype=np.uint32)
                 for i in range(3)]
    n_atms = np.zeros(2, dtype=int)  # refatms and selatms
    n_cmps = np.zeros_like(n_atms)   # refcmps and selcmps
    n_pairs = np.zeros(len(hists_cmp), dtype=np.uint32)

    print("\n")
    print("Reading trajectory...")
    print("Total number of frames: {:>8d}".format(u.trajectory.n_frames))
    print("Frames to read:         {:>8d}".format(N_FRAMES))
    print("First frame to read:    {:>8d}".format(BEGIN))
    print("Last frame to read:     {:>8d}".format(END - 1))
    print("Read every n-th frame:  {:>8d}".format(EVERY))
    print("Time first frame:       {:>12.3f} (ps)"
          .format(u.trajectory[BEGIN].time))
    print("Time last frame:        {:>12.3f} (ps)"
          .format(u.trajectory[END - 1].time))
    print("Time step first frame:  {:>12.3f} (ps)"
          .format(u.trajectory[BEGIN].dt))
    print("Time step last frame:   {:>12.3f} (ps)"
          .format(u.trajectory[END - 1].dt))
    timer = datetime.now()
    trj = mdt.rti.ProgressBar(u.trajectory[BEGIN:END:EVERY])
    for ts in trj:
        if args.UPDATING_REF:
            natms_per_refcmp = mdt.strc.natms_per_cmp(
                ag=ref, cmp=args.REFCMP, check_contiguous=True
            )
        if args.UPDATING_SEL:
            natms_per_selcmp = mdt.strc.natms_per_cmp(
                ag=sel, cmp=args.SELCMP, check_contiguous=True
            )
        # Generate contact matrices:
        cm_refatm_selatm = mdt.strc.contact_matrix(
            ref=ref,
            sel=sel,
            cutoff=args.CUTOFF,
            box=ts.dimensions,
            result=dist_array_tmp,
            mdabackend=mdabackend,
        )
        n_atms += cm_refatm_selatm.shape
        cmp_ccms = cmp_contact_count_matrices(
            cm=cm_refatm_selatm,
            natms_per_refcmp=natms_per_refcmp,
            natms_per_selcmp=natms_per_selcmp,
            dtype=np.uint32
        )
        n_cmps += cmp_ccms[-1].shape
        if cm_refatm_selatm.shape[0] == 0:
            # Number of reference atoms (and compounds) is zero
            continue
        # Pure atom-atom contact histogram (refatm_selatm):
        hists_tmp = mdt.strc.contact_hist_refcmp_diff_selcmp(
            cm=cm_refatm_selatm,
            minlength=hist_refatm_selatm.shape[-1],
            dtype=np.uint32
        )
        hist_refatm_selatm, hists_tmp = mdt.nph.match_shape(
            hist_refatm_selatm,
            hists_tmp
        )
        hist_refatm_selatm += hists_tmp
        # Compound histograms:
        # (refatm_selcmp, refcmp_selatm, refcmp_selcmp)
        for i, cmp_ccm in enumerate(cmp_ccms):
            hists_tmp = np.asarray(mdt.strc.contact_hists(
                cm=cmp_ccm,
                minlength=hists_cmp[i].shape[-1],
                dtype=np.uint32
            ))
            hists_cmp[i], hists_tmp = mdt.nph.match_shape(hists_cmp[i],
                                                          hists_tmp)
            n_pairs[i] += np.sum(hists_tmp[-1])
            hists_cmp[i] += hists_tmp
        # ProgressBar update:
        progress_bar_mem = proc.memory_info().rss / 2**20
        trj.set_postfix_str("{:>7.2f}MiB".format(progress_bar_mem),
                            refresh=False)
    trj.close()
    del cm_refatm_selatm, cmp_ccms, dist_array_tmp
    print("Elapsed time:         {}".format(datetime.now() - timer))
    print("Current memory usage: {:.2f} MiB"
          .format(proc.memory_info().rss / 2**20))

    if np.any(n_atms == 0):
        warnings.warn("The total number of reference or selection atoms"
                      " is zero. The output will be meaningless",
                      RuntimeWarning)
    if np.any(n_pairs == 0):
        warnings.warn("At least one of the pair histograms is void."
                      " This histogram will be meaningless",
                      RuntimeWarning)

    # Average contact numbers:
    n_refatms_bound = np.sum(hist_refatm_selatm[1:])
    n_refcmps_bound = np.sum(hists_cmp[1][0][1:])  # refcmp_diff_selatm
    # Averages of pure atom-atom histogram (refatm_selatm):
    avs_refatm_selatm = np.zeros(2, dtype=np.float64)
    tot_contacts = np.sum(hist_refatm_selatm *
                          np.arange(len(hist_refatm_selatm)))
    # normalized by the total number of reference atoms
    avs_refatm_selatm[0] = tot_contacts / n_atms[0]
    # normalized by the number of refatms that are bound to a selatm
    avs_refatm_selatm[1] = tot_contacts / n_refatms_bound
    # Averages of compound histograms:
    # (refatm_selcmp, refcmp_selatm, refcmp_selcmp)
    avs_cmp = np.zeros((len(hists_cmp), len(hists_cmp[0][:-1]), 2),
                       dtype=np.float64)
    for i, hists in enumerate(hists_cmp):
        for j, hist in enumerate(hists[:-1]):
            tot_contacts = np.sum(hist * np.arange(len(hist)))
            if i == 0:  # refatm_selcmp
                avs_cmp[i][j][0] = tot_contacts / n_atms[0]
                avs_cmp[i][j][1] = tot_contacts / n_refatms_bound
            else:  # refcmp_selatm, refcmp_selcmp
                avs_cmp[i][j][0] = tot_contacts / n_cmps[0]
                avs_cmp[i][j][1] = tot_contacts / n_refcmps_bound
    # Pair histograms (refatm_selcmp, refcmp_selatm, refcmp_selcmp):
    avs_pair = np.zeros(len(n_pairs), dtype=np.float64)
    for i, n in enumerate(n_pairs):
        tot_contacts = np.sum(hists_cmp[i][-1] *
                              np.arange(len(hists_cmp[i][-1])))
        avs_pair[i] = tot_contacts / n

    # Normalization:
    hist_refatm_selatm = hist_refatm_selatm / n_atms[0]
    for i in range(len(hists_cmp)):
        hists_cmp[i] = hists_cmp[i].astype(np.float64)
        if i == 0:  # refatm_selcmp
            hists_cmp[i][:-1] /= n_atms[0]
        else:  # refcmp_selatm, refcmp_selcmp
            hists_cmp[i][:-1] /= n_cmps[0]
    for i, n in enumerate(n_pairs):
        hists_cmp[i][-1] /= n

    # Bring all histograms to the same length:
    max_length = len(hist_refatm_selatm)
    for hists in hists_cmp:
        max_length = max(max_length, hists.shape[-1])
    hist_refatm_selatm = mdt.nph.extend(hist_refatm_selatm, max_length)
    for i, hists in enumerate(hists_cmp):
        hists_cmp[i] = mdt.nph.extend(hists, max_length, axis=-1)

    # Find the last non-zero value up to which to write the histograms
    # to file:
    if np.any(hist_refatm_selatm != 0):
        last_nonzero = np.flatnonzero(hist_refatm_selatm)[-1]
    else:
        last_nonzero = 0
    for hists in hists_cmp:
        if np.any(hists != 0):
            last_nonzero = max(last_nonzero,
                               np.max(np.nonzero(hists)[1]))

    print("\n")
    print("Creating output...")
    timer = datetime.now()
    mdt.fh.write_header(args.OUTFILE)
    with mdt.fh.xopen(args.OUTFILE, 'a') as outfile:
        outfile.write("# \n")
        outfile.write("# \n")
        outfile.write("# Reference: '{}'\n".format(' '.join(args.REF)))
        outfile.write(mdt.fh.indent(text=mdt.rti.ag_info_str(ag=ref),
                                    amount=1,
                                    char="#   ")
                      + "\n")
        outfile.write("# \n")
        outfile.write("# Selection: '{}'\n".format(' '.join(args.SEL)))
        outfile.write(mdt.fh.indent(text=mdt.rti.ag_info_str(ag=sel),
                                    amount=1,
                                    char="#   ")
                      + "\n")
        outfile.write("# \n")
        outfile.write("# \n")
        outfile.write("# Contact histograms\n")
        outfile.write("# Cutoff (Angstrom): {}\n".format(args.CUTOFF))
        outfile.write("# Reference compound = '{}'\n".format(args.REFCMP))
        outfile.write("# Selection compound = '{}'\n".format(args.SELCMP))
        outfile.write("# Number of frames:                             {:>12d}\n".format(N_FRAMES))
        outfile.write("# Total No. of reference atoms     (per frame): {:>12d}  ({:>12.3f})\n".format(n_atms[0], n_atms[0] / N_FRAMES))
        outfile.write("# Total No. of reference compounds (per frame): {:>12d}  ({:>12.3f})\n".format(n_cmps[0], n_cmps[0] / N_FRAMES))
        outfile.write("# Total No. of selection atoms     (per frame): {:>12d}  ({:>12.3f})\n".format(n_atms[1], n_atms[1] / N_FRAMES))
        outfile.write("# Total No. of selection compounds (per frame): {:>12d}  ({:>12.3f})\n".format(n_cmps[1], n_cmps[1] / N_FRAMES))
        outfile.write("# Total No. of refatm-selcmp pairs (per frame): {:>12d}  ({:>12.3f})\n".format(n_pairs[0], n_pairs[0] / N_FRAMES))
        outfile.write("# Total No. of refcmp-selatm pairs (per frame): {:>12d}  ({:>12.3f})\n".format(n_pairs[1], n_pairs[1] / N_FRAMES))
        outfile.write("# Total No. of refcmp-selcmp pairs (per frame): {:>12d}  ({:>12.3f})\n".format(n_pairs[2], n_pairs[2] / N_FRAMES))
        outfile.write("# \n")
        outfile.write("# \n")
        outfile.write("# Histogram averages:\n")
        outfile.write("# Percentage of refatms bound to at least one selatm:       {:10.4e}\n".format(1 - hist_refatm_selatm[0]))
        outfile.write("# Percentage of refcmps bound to at least one selatm:       {:10.4e}\n".format(1 - hists_cmp[1][0][0]))  # refcmp_diff_selatm
        outfile.write("# \n")
        outfile.write("#  (2)* Av.       selatm coordination No. of all   refatms: {:10.4e}  (Every refatm                  has on average           contact  with this many           selatms)\n".format(avs_refatm_selatm[0]))
        outfile.write("#  (2)' Av.       selatm coordination No. of bound refatms: {:10.4e}  (Every refatm bound to selatms has on average           contact  with this many           selatms)\n".format(avs_refatm_selatm[1]))
        outfile.write("# \n")
        outfile.write("#  (3)* Av.       selcmp coordination No. of all   refatms: {:10.4e}  (Every refatm                  has on average           contact  with this many different selcmps)\n".format(avs_cmp[0][0][0]))
        outfile.write("#  (3)' Av.       selcmp coordination No. of bound refatms: {:10.4e}  (Every refatm bound to selatms has on average           contact  with this many different selcmps)\n".format(avs_cmp[0][0][1]))
        outfile.write("#  (5)* Av. total selcmp coordination No. of all   refatms: {:10.4e}  (Every refatm                  has on average this many contacts with                     selcmps)\n".format(avs_cmp[0][2][0]))
        outfile.write("#  (5)' Av. total selcmp coordination No. of bound refatms: {:10.4e}  (Every refatm bound to selatms has on average this many contacts with                     selcmps)\n".format(avs_cmp[0][2][1]))
        outfile.write("#  (6)° Av. No. of 'bonds' between refatm-selcmp pairs:     {:10.4e}  (Every refatm-selcmp pair      is  on average           connected via this many 'bonds')\n".format(avs_pair[0]))
        outfile.write("# \n")
        outfile.write("#  (7)* Av.       selatm coordination No. of all   refcmps: {:10.4e}  (Every refcmp                  has on average           contact  with this many different selatms)\n".format(avs_cmp[1][0][0]))
        outfile.write("#  (7)' Av.       selatm coordination No. of bound refcmps: {:10.4e}  (Every refcmp bound to selatms has on average           contact  with this many different selatms)\n".format(avs_cmp[1][0][1]))
        outfile.write("#  (9)* Av. total selatm coordination No. of all   refcmps: {:10.4e}  (Every refcmp                  has on average this many contacts with                     selatms)\n".format(avs_cmp[1][2][0]))
        outfile.write("#  (9)' Av. total selatm coordination No. of bound refcmps: {:10.4e}  (Every refcmp bound to selatms has on average this many contacts with                     selatms)\n".format(avs_cmp[1][2][1]))
        outfile.write("# (10)° Av. No. of 'bonds' between refcmp-selatm pairs:     {:10.4e}  (Every refcmp-selatm pair      is  on average           connected via this many 'bonds')\n".format(avs_pair[1]))
        outfile.write("# \n")
        outfile.write("# (11)* Av.       selcmp coordination No. of all   refcmps: {:10.4e}  (Every refcmp                  has on average           contact  with this many different selcmps)\n".format(avs_cmp[2][0][0]))
        outfile.write("# (11)' Av.       selcmp coordination No. of bound refcmps: {:10.4e}  (Every refcmp bound to selatms has on average           contact  with this many different selcmps)\n".format(avs_cmp[2][0][1]))
        outfile.write("# (13)* Av. total selcmp coordination No. of all   refcmps: {:10.4e}  (Every refcmp                  has on average this many contacts with                     selcmps)\n".format(avs_cmp[2][2][0]))
        outfile.write("# (13)' Av. total selcmp coordination No. of bound refcmps: {:10.4e}  (Every refcmp bound to selatms has on average this many contacts with                     selcmps)\n".format(avs_cmp[2][2][1]))
        outfile.write("# (14)° Av. No. of 'bonds' between refcmp-selcmp pairs:     {:10.4e}  (Every refcmp-selcmp pair      is  on average           connected via this many 'bonds')\n".format(avs_pair[2]))
        outfile.write("# \n")
        outfile.write("# *) Normalized by the total number of       reference atoms/compounds\n")
        outfile.write("# ') Normalized by the       number of bound reference atoms/compounds\n")
        outfile.write("# °) Normalized by the       number of pairs\n")
        outfile.write("# \n")
        outfile.write("# \n")
        outfile.write("# The columns contain:\n")
        outfile.write("#    1 N:                  Number of contacts between the reference and selection group\n")
        outfile.write("#    2 refatm_selatm:      % of refatms that have   contact  with N different selatms (multiple contacts with the same selatm discounted) [refatm_diff_selatm]\n")
        outfile.write("#                       [= % of refatms that have N contacts with             selatms (multiple contacts with the same selatm    counted)  refatm_selatm_tot]\n")
        outfile.write("# \n")
        outfile.write("#    3 refatm_diff_selcmp: % of refatms that have   contact  with N different selcmps (multiple contacts with the same selcmp discounted)\n")
        outfile.write("#    4 refatm_same_selcmp: % of refatms that have N contacts with   the same  selcmp  (multiple contacts with the same selcmp    counted, multiple connections to different selcmps via the same number of 'bonds' discounted)\n")
        outfile.write("#    5 refatm_selcmp_tot:  % of refatms that have N contacts with             selcmps (multiple contacts with the same selcmp    counted)\n")
        outfile.write("#    6 refatm_selcmp_pair: % of refatm-selcmp pairs connected via N 'bonds'           (first element is meaningless)\n")
        outfile.write("# \n")
        outfile.write("#    7 refcmp_diff_selatm: % of refcmps that have   contact  with N different selatms (multiple contacts with the same selatm discounted)\n")
        outfile.write("#    8 refcmp_same_selatm: % of refcmps that have N contacts with   the same  selatm  (multiple contacts with the same selatm    counted, multiple connections to different selatms via the same number of 'bonds' discounted)\n")
        outfile.write("#    9 refcmp_selatm_tot:  % of refcmps that have N contacts with             selatms (multiple contacts with the same selatm    counted)\n")
        outfile.write("#   10 refcmp_selatm_pair: % of refcmp-selatm pairs connected via N 'bonds'           (first element is meaningless)\n")
        outfile.write("# \n")
        outfile.write("#   11 refcmp_diff_selcmp: % of refcmps that have   contact  with N different selcmps (multiple contacts with the same selcmp discounted)\n")
        outfile.write("#   12 refcmp_same_selcmp: % of refcmps that have N contacts with   the same  selcmp  (multiple contacts with the same selcmp    counted, multiple connections to different selcmps via the same number of 'bonds' discounted)\n")
        outfile.write("#   13 refcmp_selcmp_tot:  % of refcmps that have N contacts with             selcmps (multiple contacts with the same selcmp    counted)\n")
        outfile.write("#   14 refcmp_selcmp_pair: % of refcmp-selcmp pairs connected via N 'bonds'           (first element is meaningless)\n")
        outfile.write("# \n")
        outfile.write("# Note that refatm_selcmp_tot (5) = refatm_selatm_tot (1) (= refatm_selatm) and refcmp_selcmp_tot (13) = refcmp_selatm_tot (9)\n")
        outfile.write("# \n")
        outfile.write('# Column number:\n')
        outfile.write("# {:3d}   {:16d}   {:16d} {:16d} {:16d} {:16d}   {:16d} {:16d} {:16d} {:16d}   {:16d} {:16d} {:16d} {:16d}\n"
                      .format(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14))
        outfile.write("# {:>3s}   {:>16s}   {:>16s} {:>16s} {:>16s} {:>16s}   {:>16s} {:>16s} {:>16s} {:>16s}   {:>16s} {:>16s} {:>16s} {:>16s}\n"
                      .format("N",                 # 01  \vline
                              "refatm_selatm",     # 02  \vline
                              "refatm_d_selcmp",   # 03
                              "refatm_s_selcmp",   # 04
                              "refatm_selcmp_t",   # 05
                              "refatm_selcmp_p",   # 06  \vline
                              "refcmp_d_selatm",   # 07
                              "refcmp_s_selatm",   # 08
                              "refcmp_selatm_t",   # 09
                              "refcmp_selatm_p",   # 10  \vline
                              "refcmp_d_selcmp",   # 11
                              "refcmp_s_selcmp",   # 12
                              "refcmp_selcmp_t",   # 13
                              "refcmp_selcmp_p"))  # 14
        for i in range(last_nonzero + 1):
            outfile.write("  {:3d}   {:16.9e}"
                          .format(i, hist_refatm_selatm[i]))
            for hists in hists_cmp:
                outfile.write(2 * " ")
                for hist in hists:
                    outfile.write(" {:16.9e}".format(hist[i]))
            outfile.write("\n")
        outfile.write("\n")
        outfile.write("\n")
        outfile.write("# Sums\n")
        outfile.write("  {:3d}   {:16.9e}"
                      .format(np.sum(np.arange(last_nonzero + 1)),
                              np.sum(hist_refatm_selatm)))
        for hists in hists_cmp:
            outfile.write(2 * " ")
            for hist in hists:
                outfile.write(" {:16.9e}".format(np.sum(hist)))
        outfile.write("\n")
        outfile.flush()
    print("Created {}".format(args.OUTFILE))
    print("Elapsed time:         {}".format(datetime.now() - timer))
    print("Current memory usage: {:.2f} MiB"
          .format(proc.memory_info().rss / 2**20))

    print("\n")
    print("Checking output for consistency...")
    timer = datetime.now()
    tol = 1e-4
    # Check histogram sums:
    if not np.isclose(np.sum(hist_refatm_selatm), 1, rtol=0, atol=tol):
        raise ValueError("The sum over 'hist_refatm_selatm' ({}) is not"
                         " one".format(np.sum(hist_refatm_selatm)))
    for i, hists in enumerate(hists_cmp):
        for j, hist in enumerate(hists):
            if j == 1:  # refcmp_same_selcmp
                if np.sum(hist) < 1:
                    raise ValueError("The sum over 'hists_cmp[{}][{}]'"
                                     " ({}) is less than one"
                                     .format(i, j, np.sum(hist)))
                if np.any(hist) > 1:
                    raise ValueError("At least one element of"
                                     " 'hists_cmp[{}][{}]' is greater"
                                     " than one".format(i, j))
            else:  # refcmp_diff_selcmp, refcmp_selcmp_tot, recmp_selcmp_pair
                if not np.isclose(np.sum(hist), 1, rtol=0, atol=tol):
                    raise ValueError("The sum over 'hists_cmp[{}][{}]'"
                                     " ({}) is not one"
                                     .format(i, j, np.sum(hist)))
    # Check first element of each histogram:
    for i, hists in enumerate(hists_cmp):
        for j, hist in enumerate(hists[:-1]):
            if i == 0:  # refatm_selcmp
                if not np.isclose(hist[0], hist_refatm_selatm[0],
                                  rtol=0, atol=tol):
                    raise ValueError(
                        "The percentage of refatms having no contact"
                        " with any selatm is not the same in"
                        " 'hists_cmp[{}][{}]' ({}) and"
                        " 'hist_refatm_selatm' ({})"
                        .format(i, j, hist[0], hist_refatm_selatm[0]))
            else:  # refcmp_selatm, refcmp_selcmp
                if not np.isclose(hist[0], hists_cmp[1][0][0],
                                  rtol=0, atol=tol):
                    raise ValueError(
                        "The percentage of refcmps having no contact"
                        " with any selatm is not the same in"
                        " 'hists_cmp[{}][{}]' ({}) and"
                        " 'hists_cmp[1][0]' ({})"
                        .format(i, j, hist[0], hists_cmp[1][0][0]))
    for i in range(len(n_pairs)):
        if not np.isclose(hists_cmp[i][-1][0], 0, rtol=0, atol=tol):
            raise ValueError("The first element of"
                             " 'hists_cmp[{}][-1][0]' ({}) is not zero"
                             .format(i, hists_cmp[i][-1][0]))
    # Check if refatm_selcmp_tot == refatm_selatm:
    if not np.allclose(hists_cmp[0][2], hist_refatm_selatm,
                       rtol=0, atol=tol, equal_nan=True):
        raise ValueError("'hists_cmp[0][2]' != 'hist_refatm_selatm'")
    # Check if refcmp_selcmp_tot == refcmp_selatm_tot:
    if not np.allclose(hists_cmp[2][2], hists_cmp[1][2],
                       rtol=0, atol=tol, equal_nan=True):
        raise ValueError("'hists_cmp[2][2]' != 'hists_cmp[1][2]'")
    # Check averages:
    for i, av_refatm_selatm in enumerate(avs_refatm_selatm):
        if not np.isclose(avs_cmp[0][2][i], av_refatm_selatm):
            # average(refatm_selcmp_tot) != average(refatm_selatm)
            raise ValueError("'avs_cmp[0][2][{}]' ({}) !="
                             " 'avs_refatm_selatm[{}]' ({})"
                             .format(i, avs_cmp[0][2][i],
                                     i, av_refatm_selatm))
    for i, av_refcmp_selatm_tot in enumerate(avs_cmp[1][2]):
        if not np.isclose(avs_cmp[2][2][i], av_refcmp_selatm_tot):
            # average(refcmp_selcmp_tot) != average(refcmp_selatm_tot)
            raise ValueError("'avs_cmp[0][2][{}]' ({}) !="
                             " 'avs_cmp[1][2][{}]' ({})"
                             .format(i, avs_cmp[2][2][i],
                                     i, av_refcmp_selatm_tot))
    for i, av_pair in enumerate(avs_pair):
        for k in range(2):
            if not np.isclose(avs_cmp[i][0][k] * av_pair, avs_cmp[i][2][k],
                              rtol=0, atol=tol):
                # av(refcmp_diff_selcmp)*av(refcmp_selcmp_pair) !=
                # av(refcmp_selcmp_tot)
                raise ValueError(
                    "'avs_cmp[{}][0][{}]*avs_pair[{}]' ({}) !="
                    " 'avs_cmp[{}][2][{}]' ({})"
                    .format(i, k, i, avs_cmp[i][0][k] * av_pair,
                            i, k, avs_cmp[i][2][k])
                )
    if np.isclose(hist_refatm_selatm[0], 0, rtol=0, atol=tol):
        # All refatms are bound to selatms
        if not np.isclose(avs_refatm_selatm[0], avs_refatm_selatm[1],
                          rtol=0, atol=tol):
            raise ValueError("All refatms are bound to selatms, but"
                             "'avs_refatm_selatm[0]' ({}) !="
                             " 'avs_refatm_selatm[1]' ({})"
                             .format(avs_refatm_selatm[0],
                                     avs_refatm_selatm[1]))
        for j, av in enumerate(avs_cmp[0][:-1]):  # refatm_selcmp
            if not np.isclose(av[0], av[1], rtol=0, atol=tol):
                raise ValueError("All refatms are bound to selatms, but"
                                 "'avs_cmp[0][{}][0]' ({}) !="
                                 " 'avs_cmp[0][{}][1]' ({})"
                                 .format(j, av[0], j, av[1]))
    if np.isclose(hists_cmp[1][0][0], 0, rtol=0, atol=tol):  # refcmp_diff_selatm
        # All refcmps are bound to selatms
        for i, avs in enumerate(avs_cmp[1:]):  # refcmp_selatm, refcmp_selcmp
            for j, av in enumerate(avs[:-1]):
                if not np.isclose(av[0], av[1], rtol=0, atol=tol):
                    raise ValueError("All refcmps are bound to selatms,"
                                     " but 'avs_cmp[{}][{}][0]' ({}) !="
                                     " 'avs_cmp[{}][{}][1]' ({})"
                                     .format(i, j, av[0], i, j, av[1]))

    hist_refatm_same_selatm = np.array([hist_refatm_selatm[0],
                                        1 - hist_refatm_selatm[0]])
    hist_refatm_same_selatm = mdt.nph.extend(hist_refatm_same_selatm,
                                             len(hists_cmp[1][1]))
    hist_refatm_selatm_pair = np.array([0, 1])
    hist_refatm_selatm_pair = mdt.nph.extend(hist_refatm_selatm_pair,
                                             len(hists_cmp[1][-1]))
    if not args.UPDATING_REF and np.all(np.equal(natms_per_refcmp, 1)):
        # refcmp == refatm
        # Check if refcmp_selatm == refatm_selatm:
        for j in (0, 2):  # refcmp_diff_selatm, refcmp_selatm_tot
            if not np.allclose(hists_cmp[1][j], hist_refatm_selatm,
                               rtol=0, atol=tol, equal_nan=True):
                raise ValueError("refcmp = refatm, but"
                                 " 'hists_cmp[1][{}]' !="
                                 " 'hist_refatm_selatm'".format(j))
        if not np.allclose(hists_cmp[1][1], hist_refatm_same_selatm,
                           rtol=0, atol=tol, equal_nan=True):
            # refcmp_same_selatm != refatm_same_selatm
            raise ValueError("refcmp = refatm, but 'hists_cmp[1][1]' !="
                             " 'hist_refatm_same_selatm'")
        if not np.allclose(hists_cmp[1][-1], hist_refatm_selatm_pair,
                           rtol=0, atol=tol, equal_nan=True):
            # refcmp_selatm_pair != refatm_selatm_pair
            raise ValueError("refcmp = refatm, but 'hists_cmp[1][-1]' !="
                             " 'hist_refatm_selatm_pair'")
        # Check if refcmp_selcmp == refatm_selcmp:
        for j, hist in enumerate(hists_cmp[2]):
            if not np.allclose(hist, hists_cmp[0][j],
                               rtol=0, atol=tol, equal_nan=True):
                raise ValueError("refcmp = refatm, but hists_cmp[2][{}]"
                                 " != hists_cmp[0][{}]".format(j, j))
    if not args.UPDATING_SEL and np.all(np.equal(natms_per_selcmp, 1)):
        # selcmp == selatm
        # Check if refatm_selcmp == refatm_selatm:
        for j in (0, 2):  # refatm_diff_selcmp, refatm_selcmp_tot
            if not np.allclose(hists_cmp[0][j], hist_refatm_selatm,
                               rtol=0, atol=tol, equal_nan=True):
                raise ValueError("selcmp = selatm, but"
                                 " 'hists_cmp[0][{}]' !="
                                 " 'hist_refatm_selatm'".format(j))
        if not np.allclose(hists_cmp[0][1], hist_refatm_same_selatm,
                           rtol=0, atol=tol, equal_nan=True):
            # refatm_same_selcmp != refatm_same_selatm
            raise ValueError("selcmp = selatm, but 'hists_cmp[0][1]' !="
                             " 'hist_refatm_same_selatm'")
        if not np.allclose(hists_cmp[0][-1], hist_refatm_selatm_pair,
                           rtol=0, atol=tol, equal_nan=True):
            # refatm_selcmp_pair != refatm_selatm_pair
            raise ValueError("selcmp = selatm, but 'hists_cmp[0][-1]' !="
                             " 'hist_refatm_selatm_pair'")
        # Check if refcmp_selcmp == refcmp_selatm:
        for j, hist in enumerate(hists_cmp[2]):
            if not np.allclose(hist, hists_cmp[1][j],
                               rtol=0, atol=tol, equal_nan=True):
                raise ValueError("selcmp = selatm, but hists_cmp[2][{}]"
                                 " != hists_cmp[1][{}]".format(j, j))
    print("Elapsed time:         {}".format(datetime.now() - timer))
    print("Current memory usage: {:.2f} MiB"
          .format(proc.memory_info().rss / 2**20))

    print("\n")
    print("{} done".format(os.path.basename(sys.argv[0])))
    print("Totally elapsed time: {}".format(datetime.now() - timer_tot))
    print("CPU time:             {}"
          .format(timedelta(seconds=sum(proc.cpu_times()[:4]))))
    print("CPU usage:            {:.2f} %".format(proc.cpu_percent()))
    print("Current memory usage: {:.2f} MiB"
          .format(proc.memory_info().rss / 2**20))