Source code for funpack.loadtables

#!/usr/bin/env python
#
# loadtables.py - Functions which load the variable, data coding, processing,
#                 and category tables used by funpack.
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module provides functions and logic to load the variable, data coding,
type, processing, and category tables used by funpack.

The variable table is a ``pandas.DataFrame`` which contains metadata about all
UK Biobank variables in the input data files, along with cleaning rules. The
data coding and type tables contain the same information about all UK Biobank
data codings and types - these are merged into the variable table after
being loaded. All of these tables are loaded by the :func:`loadVariableTable`
function.

The processing table contains an ordered list of processing functions to be
applied to the input data.

The category table contains named groups of variables; it allows the user to
select these groups by name, rather than having to list variable IDs.

.. autosummary::
   :nosignatures:

   loadTables
   loadVariableTable
   addNewVariable
   loadProcessingTable
   loadCategoryTable
   categoryVariables
   columnTypes
"""


import itertools as it
import functools as ft
import os.path   as op
import              re
import              logging
import              warnings
import              collections

from typing import Tuple, Sequence, Union, Dict, List, Type, Any, Callable

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

import numpy  as np
import pandas as pd

import funpack.util       as util
import funpack.fileinfo   as finfo
import funpack.datatable  as datatable
import funpack.parsing    as parsing
import funpack.schema     as schema


log = logging.getLogger(__name__)


def convert_type(val : str) -> util.CTYPES:
    """Convert a string containing a UK BioBank type into a numerical
    identifier for that type - see :attr:`funpack.util.CTYPES`.
    """
    valmap = {
        'sequence'                            : util.CTYPES.sequence,
        'integer'                             : util.CTYPES.integer,
        'continuous'                          : util.CTYPES.continuous,
        'categorical (single)'                : util.CTYPES.categorical_single,
        'categorical (single non-numeric)'    :
            util.CTYPES.categorical_single_non_numeric,
        'categorical (multiple)'              :
            util.CTYPES.categorical_multiple,
        'categorical (multiple non-numeric)'  :
            util.CTYPES.categorical_multiple_non_numeric,
        'time'                                : util.CTYPES.time,
        'date'                                : util.CTYPES.date,
        'text'                                : util.CTYPES.text,
        'compound'                            : util.CTYPES.compound,
        'unknown'                             : util.CTYPES.unknown,
    }

    return valmap.get(val.lower(), util.CTYPES.unknown)

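# A minimal usage sketch (the input strings are hypothetical); matching
# is case-insensitive, and any unrecognised string falls back to
# CTYPES.unknown:
#
#     assert convert_type('Categorical (single)') is \
#            util.CTYPES.categorical_single
#     assert convert_type('not-a-real-type') is util.CTYPES.unknown
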
def convert_dtype(val : str) -> Union[np.dtype, Literal[np.nan]]:
    """Convert a string containing a ``numpy.dtype`` (e.g. ``'float32'``)
    into a ``dtype`` object.
    """
    if val == '':
        return np.nan

    dtype = getattr(np, val, None)

    if dtype not in np.ScalarType:
        raise ValueError('Invalid numpy dtype: {}'.format(val))

    return dtype

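# A minimal sketch (assuming a NumPy version which still provides
# np.ScalarType, as used above):
#
#     assert convert_dtype('float32') is np.float32
#     assert np.isnan(convert_dtype(''))
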
def convert_comma_sep_text(val : str) -> Union[List[str], Literal[np.nan]]:
    """Convert a string containing comma-separated text into a list. """
    if val.strip() == '':
        return np.nan

    words = val.split(',')
    return [w.strip() for w in words]

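# A minimal sketch (hypothetical input); surrounding whitespace is
# stripped, and an empty/whitespace-only string yields np.nan rather
# than an empty list:
#
#     assert convert_comma_sep_text(' a, b ,c ') == ['a', 'b', 'c']
#     assert np.isnan(convert_comma_sep_text('   '))
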
def convert_comma_sep_numbers(
        val : str
) -> Union[np.ndarray, Literal[np.nan]]:
    """Convert a string containing comma-separated numbers into a ``numpy``
    array.
    """
    if val.strip() == '':
        return np.nan

    return np.fromstring(val, sep=',', dtype=np.float64)

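# A minimal sketch (hypothetical input) - the result is always a
# floating point array:
#
#     vals = convert_comma_sep_numbers('1, 2.5, 3')
#     assert np.all(vals == np.array([1.0, 2.5, 3.0]))
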
def convert_ParentValues(
        val : str
) -> Union[List[parsing.VariableExpression], Literal[np.nan]]:
    """Convert a string containing a sequence of comma-separated
    ``ParentValue`` expressions into a sequence of
    :class:`.VariableExpression` objects.
    """
    if val.strip() == '':
        return np.nan

    return [parsing.VariableExpression(e) for e in val.split(',')]

def convert_Process_Variable(
        val      : str,
        cattable : pd.DataFrame = None
) -> Tuple[str, List[int]]:
    """Convert a string containing a process variable specification.

    A process variable specification comprises one or more comma-separated:

      - integer variable IDs,
      - MATLAB-style ``start:step:stop`` ranges denoting a range of
        variable IDs
      - Category IDs, denoted as ``'cat<ID>'``, e.g. ``'cat25'``.

    A specification may be preceded by one of:

      - ``'all'``, indicating that the process is to be applied to all
        variables simultaneously (this is the default)

      - ``'independent,'``, followed by one or more comma-separated variable
        IDs, indicating that the process is to be applied to the specified
        variables independently.

      - ``'all_independent'``, indicating that the process is to be applied
        to all variables independently

      - ``'all_except,'``, followed by one or more comma-separated
        MATLAB-style ranges, indicating that the process is to be applied to
        all variables simultaneously, except for the specified variables.

      - ``'all_independent_except,'``, followed by one or more
        comma-separated MATLAB-style ranges, indicating that the process is
        to be applied to all variables independently, except for the
        specified variables.

    :returns: A tuple containing:

               - The process variable type - one of ``'all'``,
                 ``'all_independent'``, ``'all_except'``,
                 ``'all_independent_except'``, ``'independent'``, or
                 ``'vids'``

               - A list of variable IDs (empty if the process variable type
                 is ``'all'`` or ``'all_independent'``).
    """

    tokens = convert_comma_sep_text(val)

    if len(tokens) == 1 and \
       tokens[0] in ('all',
                     'all_independent',
                     'all_except',
                     'all_independent_except'):
        return np.array((tokens[0], []), dtype=object)

    if tokens[0] in ('independent', 'all_except', 'all_independent_except'):
        ptype  = tokens[0]
        tokens = tokens[1:]
    else:
        ptype = 'vids'

    vids = []

    for token in tokens:
        if token.startswith('cat'):
            if cattable is None:
                raise RuntimeError(
                    'Cannot parse process variable specification '
                    f'without category table: {val}')
            cid = int(token[3:])
            vids.extend(categoryVariables(cattable, [cid]))
        else:
            vids.extend(util.parseMatlabRange(token))

    return np.array((ptype, vids), dtype=object)

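# Illustrative sketches (hypothetical specifications), assuming that
# util.parseMatlabRange follows MATLAB semantics, i.e. '1:3' expands to
# [1, 2, 3]:
#
#     ptype, vids = convert_Process_Variable('1:3,5')
#     # ptype == 'vids', vids == [1, 2, 3, 5]
#
#     ptype, vids = convert_Process_Variable('all_except,1:3')
#     # ptype == 'all_except', vids == [1, 2, 3]
#
#     ptype, vids = convert_Process_Variable('all')
#     # ptype == 'all', vids == []
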
def convert_Process(
        ptype : str,
        val   : str
) -> Dict[str, parsing.Process]:
    """Convert a string containing a sequence of comma-separated ``Process``
    or ``Clean`` expressions into an ``OrderedDict`` of :class:`.Process`
    objects (with the process names used as dictionary keys).
    """
    if val.strip() == '':
        return np.nan

    procs = parsing.parseProcesses(val, ptype)

    return collections.OrderedDict([(p.name, p) for p in procs])

def convert_category_variables(val : str) -> List[int]:
    """Convert a string containing a sequence of comma-separated variable
    IDs or ranges into a list of variable IDs.

    Variables may be specified as integer IDs, or via a MATLAB-style
    ``start:step:stop`` range. See :func:`.util.parseMatlabRange`.
    """
    ranges    = convert_comma_sep_text(val)
    variables = list(it.chain(*[util.parseMatlabRange(r) for r in ranges]))
    return variables

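# A minimal sketch (hypothetical specification), again assuming
# MATLAB-style inclusive ranges:
#
#     assert convert_category_variables('1,2,10:12') == [1, 2, 10, 11, 12]
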
VARTABLE_COLUMNS = [
    'ID',
    'Type',
    'InternalType',
    'Description',
    'DataCoding',
    'Instancing',
    'NAValues',
    'RawLevels',
    'NewLevels',
    'ParentValues',
    'ChildValues',
    'Clean']
"""The columns that must be in a variable table file. """


DCTABLE_COLUMNS = [
    'ID',
    'NAValues',
    'RawLevels',
    'NewLevels']
"""The columns that must be in a datacoding table file. """


TYPETABLE_COLUMNS = [
    'Type',
    'Clean']
"""The columns that must be in a type table file. """


PROCTABLE_COLUMNS = [
    'Variable',
    'Process']
"""The columns that must be in a processing table file. """


CATTABLE_COLUMNS = [
    'ID',
    'Category',
    'Variables']
"""The columns that must be in a category table file. """


VARTABLE_DTYPES = {
    'ID'           : np.uint32,
    'Description'  : object,
    'Type'         : object,

    # We can't use an integer for the data
    # coding, because not all variables
    # have a data coding, and pandas uses
    # np.nan to represent missing data.
    'DataCoding'   : np.float32,
    'Instancing'   : np.float32,
    'NAValues'     : object,
    'RawLevels'    : object,
    'NewLevels'    : object,
    'ParentValues' : object,
    'ChildValues'  : object,
    'Clean'        : object,
}
"""Types to use for some columns in the variable table. """


VARTABLE_CONVERTERS = {
    'Type'         : convert_type,
    'InternalType' : convert_dtype,
    'ParentValues' : convert_ParentValues,
    'Clean'        : ft.partial(convert_Process, 'cleaner'),
}
"""Custom converter functions to use for some columns in the variable
table. """


DCTABLE_DTYPES = {
    'ID'        : np.uint32,
    'NAValues'  : object,
    'RawLevels' : object,
    'NewLevels' : object,
}
"""Types to use for some columns in the data coding table. """


TYPETABLE_DTYPES = {
    'Type'  : object,
    'Clean' : object,
}
"""Types to use for some columns in the types table. """


TYPETABLE_CONVERTERS = {
    'Type'  : convert_type,
    'Clean' : ft.partial(convert_Process, 'cleaner'),
}
"""Custom converter functions to use for some columns in the type table. """


PROCTABLE_CONVERTERS = {
    'Process' : ft.partial(convert_Process, 'processor'),
}
"""Custom converter functions to use for some columns in the processing
table. """


CATTABLE_DTYPES = {
    'ID' : np.int32,
}
"""Types to use for some columns in the category table. """


CATTABLE_CONVERTERS = {
    'Variables' : convert_category_variables
}
"""Custom converter functions to use for some columns in the category
table. """


IMPLICIT_CATEGORIES = {
    'unknown'       : -1,
    'uncategorised' : -2,
}
"""This dict contains some categories which are automatically/implicitly
added to the category table by the :func:`loadTables` function (via a call
to :func:`addImplicitCategories`).
"""

def loadTables(
        fileinfo : finfo.FileInfo,
        varfiles : Sequence[str] = None,
        dcfiles  : Sequence[str] = None,
        typefile : str           = None,
        procfile : str           = None,
        catfile  : str           = None,
        **kw
) -> Tuple[pd.DataFrame,
           pd.DataFrame,
           pd.DataFrame,
           List[datatable.Column],
           List[datatable.Column]]:
    """Loads the data tables used to run ``funpack``.

    :arg fileinfo: :class:`.FileInfo` object describing the input data files.
    :arg varfiles: Path to one or more partial variable table files
    :arg dcfiles:  Path to one or more partial data coding table files
    :arg typefile: Path to the type table file
    :arg procfile: Path to the processing table file
    :arg catfile:  Path to the category table file

    All other arguments are passed through to the :func:`loadVariableTable`
    and :func:`loadProcessingTable` functions.

    :returns: A tuple containing:

               - The variable table
               - The processing table
               - The category table
               - List of :class:`.Column` objects representing columns
                 which were in the data file(s), but not in the variable
                 table.
               - List of :class:`.Column` objects representing columns
                 which are uncategorised.
    """

    vartable, unk, _ = loadVariableTable(fileinfo,
                                         varfiles,
                                         dcfiles,
                                         typefile,
                                         **kw)
    cattable  = loadCategoryTable(catfile)
    proctable = loadProcessingTable(procfile, cattable=cattable, **kw)

    unc = identifyUncategorisedVariables(fileinfo, cattable)

    # uncategorised should not contain unknown
    unc = [c for c in unc if c not in unk]

    # Add categories for unknown/
    # uncategorised variables
    addImplicitCategories(cattable, unk, unc)

    return vartable, proctable, cattable, unk, unc

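# A typical invocation (a minimal sketch - 'data.tsv' and
# 'custom_variables.tsv' are hypothetical paths, and FileInfo is assumed
# to accept a list of input data files):
#
#     fileinfo = finfo.FileInfo(['data.tsv'])
#     vartable, proctable, cattable, unknown, uncat = loadTables(
#         fileinfo, varfiles=['custom_variables.tsv'])
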
def loadVariableTable(
        fileinfo    : finfo.FileInfo,
        varfiles    : Sequence[str]              = None,
        dcfiles     : Sequence[str]              = None,
        typefile    : str                        = None,
        noBuiltins  : bool                       = False,
        naValues    : Dict[int, str]             = None,
        childValues : Dict[int, Tuple[str, str]] = None,
        recoding    : Dict[int, Tuple[str, str]] = None,
        clean       : Dict[int, str]             = None,
        typeClean   : Dict[util.CTYPES, str]     = None,
        globalClean : str                        = None,
        dropAbsent  : bool                       = True,
        **kwargs  # pylint: disable=unused-argument
) -> Tuple[pd.DataFrame,
           Sequence[datatable.Column],
           Sequence[datatable.Column]]:
    """Given variable table and datacoding table file names, builds and
    returns the variable table.

    :arg fileinfo:    :class:`.FileInfo` object describing the input data
                      files.

    :arg varfiles:    Path(s) to one or more variable files

    :arg dcfiles:     Path(s) to one or more data coding files

    :arg typefile:    Path to the type file

    :arg noBuiltins:  If provided, the built-in variable and datacoding base
                      tables are not loaded.

    :arg naValues:    Dictionary of ``{vid : values}`` mappings, specifying
                      values which should be replaced with NA. The values
                      are expected to be strings of comma-separated values.

    :arg childValues: Dictionary of ``{vid : (exprs, values)}`` mappings,
                      specifying parent value expressions, and corresponding
                      child values. The expressions and values are expected
                      to be strings of comma-separated values of the same
                      length.

    :arg recoding:    Dictionary of ``{vid : (rawlevel, newlevel)}``
                      mappings. The raw and new levels are expected to be
                      strings of comma-separated values of the same length.

    :arg clean:       Dictionary of ``{vid : expr}`` mappings containing
                      cleaning functions to apply - this will override any
                      cleaning specified in the variable file, and any
                      cleaning specified in ``typeClean``. The expressions
                      are expected to be strings.

    :arg typeClean:   Dictionary of ``{type : expr}`` mappings containing
                      cleaning functions to apply to all variables of a
                      specific type - this will override any cleaning
                      specified in the type file. The expressions are
                      expected to be strings.

    :arg globalClean: Expression containing cleaning functions to apply to
                      every variable - this will be performed after
                      variable-specific cleaning in the variable table, or
                      specified via ``clean`` or ``typeClean``. The
                      expressions are expected to be strings.

    :arg dropAbsent:  If ``True`` (the default), remove all variables from
                      the variable table which are not present in the data
                      file(s).

    All other keyword arguments are ignored.

    :returns: A tuple containing:

               - A ``pandas.DataFrame`` containing the variable table

               - A sequence of :class:`.Column` objects representing
                 variables which were present in the data files, but not in
                 the variable table, but were added to the variable table.

               - A sequence of :class:`.Column` objects representing
                 variables which were present in the data files and in the
                 variable table, but which did not have any cleaning rules
                 specified.
    """

    if noBuiltins:
        varbase, dcbase = None, None
    else:
        varbase, dcbase = loadTableBases()

    vartable = mergeTableFiles(varbase,
                               varfiles,
                               'variable',
                               VARTABLE_DTYPES,
                               VARTABLE_CONVERTERS,
                               VARTABLE_COLUMNS)
    dctable  = mergeTableFiles(dcbase,
                               dcfiles,
                               'data coding',
                               DCTABLE_DTYPES,
                               {},
                               DCTABLE_COLUMNS)
    tytable  = mergeTableFiles(None,
                               [typefile],
                               'type',
                               TYPETABLE_DTYPES,
                               TYPETABLE_CONVERTERS,
                               TYPETABLE_COLUMNS)

    # Make sure data types are aligned,
    # otherwise we may run into problems
    # when merging them together.
    vartable = vartable.astype(
        {c : t for c, t in VARTABLE_DTYPES .items() if c != 'ID'})
    dctable  = dctable .astype(
        {c : t for c, t in DCTABLE_DTYPES  .items() if c != 'ID'})
    tytable  = tytable .astype(
        {c : t for c, t in TYPETABLE_DTYPES.items() if c != 'Type'})
    vartable.index = vartable.index.astype(VARTABLE_DTYPES[ 'ID'])
    dctable .index = dctable .index.astype(DCTABLE_DTYPES[  'ID'])
    tytable .index = tytable .index.astype(TYPETABLE_DTYPES['Type'])

    # Build a list of all columns in the input
    # data files, with the index column(s)
    # from each file dropped (index columns
    # are assigned a VID of 0)
    cols = []
    for df in fileinfo.datafiles:
        dfcols = fileinfo.columns(df)
        cols.extend([c for c in dfcols if c.vid != 0])

    # Make sure the variable table
    # contains an entry for every
    # variable in the input data.
    unknownVars = sanitiseVariableTable(vartable, cols, dropAbsent)

    # Merge data coding specific NAValues,
    # RawLevels, and NewLevels from the data
    # coding table into the variable table.
    mergeDataCodingTable(vartable, dctable)

    # Merge provided naValues, recodings,
    # and childValues into the variable
    # table (overriding whatever was specified
    # in the datacoding/variable tables)
    if naValues is not None:
        mergeIntoVariableTable(vartable, 'NAValues', naValues)

    if recoding is not None:
        mergeIntoVariableTable(vartable,
                               ['RawLevels', 'NewLevels'],
                               recoding)

    if childValues is not None:
        childValues = {vid : (convert_ParentValues(expr), values)
                       for vid, (expr, values)
                       in  childValues.items()}
        mergeIntoVariableTable(vartable,
                               ['ParentValues', 'ChildValues'],
                               childValues)

    # navalues, raw/new levels and child values
    # are all still comma-separated strings -
    # convert them to types appropriate to the
    # datafield/variable
    def convert(rowvalues, column):
        val = rowvalues[column]
        if pd.isna(val):
            return np.nan
        return parsing.parseValueExpressions(val, rowvalues['Type'])

    # Make sure the series type stays as "object" - if
    # no rules are specified on any vid, the convert
    # function will return all nans, and pandas will
    # coerce the series type to float64, which may
    # interfere with subsequent rule assignments (as
    # these series may contain numpy arrays or
    # list-likes)
    if len(vartable) > 0:
        for col in ('NAValues', 'RawLevels', 'NewLevels', 'ChildValues'):
            vals          = vartable.apply(convert, axis=1, args=(col,))
            vartable[col] = vals.astype(object)

    # Before merging the cleaning functions
    # in, we generate a list of variables
    # which are "uncleaned", i.e. have not
    # had any cleaning specified, as this
    # may indicate that a variable has been
    # overlooked.
    #
    # If a variable has indirectly had NA
    # value insertion or recoding applied
    # via its data coding, it is not included
    # in this list.
    if clean is not None: ucmask = ~vartable.index.isin(clean.keys())
    else:                 ucmask =  vartable.index.notna()

    ucmask = (ucmask                          &
              vartable['NAValues']    .isna() &
              vartable['RawLevels']   .isna() &
              vartable['ParentValues'].isna() &
              vartable['Clean']       .isna())
    ucmask = ucmask[ucmask]

    uncleanVars = [c for c in cols
                   if (c.vid in ucmask.index and
                       c not in unknownVars)]

    # Merge clean options into variable table
    mergeCleanFunctions(vartable, tytable, clean, typeClean, globalClean)

    # Check, where we can, that the
    # vartable contains valid rules
    def checkLengths(col1, col2, row):
        val1  = row[col1]
        val2  = row[col2]
        isna1 = pd.isna(val1)
        isna2 = pd.isna(val2)

        # ugh. if the value is a sequence, isna
        # will return a sequence of bools
        if not isinstance(isna1, bool): isna1 = False
        if not isinstance(isna2, bool): isna2 = False

        if isna1 and isna2:
            return

        if isna1 or isna2 or (len(val1) != len(val2)):
            raise ValueError('Columns don\'t match [len({}) != '
                             'len({})]: {}'.format(val1, val2, row.name))

    checkRecoding     = ft.partial(checkLengths, 'RawLevels',
                                   'NewLevels')
    checkParentValues = ft.partial(checkLengths, 'ParentValues',
                                   'ChildValues')

    vartable.apply(checkRecoding,     axis=1)
    vartable.apply(checkParentValues, axis=1)

    return vartable, unknownVars, uncleanVars

def loadTableBases() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Loads the UK Biobank variable and data coding schema files.

    This function is called by :func:`loadVariableTable`. It loads the UK
    Biobank variable and data coding schema files (available from the UK
    Biobank data showcase web site), and returns the information contained
    within as two ``pandas.DataFrame`` objects. These dataframes are then
    used as bases for the ``funpack`` variable table.

    Information in the base tables is loaded from the following files:

      - ``field.txt``: A list of all UKB variables
      - ``encoding.txt``: A list of all UKB data codings
      - ``type.txt``: A list of ``vid : type`` mappings for certain
        variables, where ``type`` is the name of a ``numpy`` data type
        (e.g. ``float32``).

    :returns: A tuple containing:

               - a ``pandas.DataFrame`` to be used as the base for the
                 variable table
               - a ``pandas.DataFrame`` to be used as the base for the
                 datacoding table
    """

    # Here we load these files, both obtained
    # from the UK Biobank showcase website:
    #
    #   - field.txt    - describes all UK biobank variables
    #   - encoding.txt - describes all data codings
    #
    # And we also load type.txt, which contains
    # the internal type to use for some variables

    # This dict contains all possible combinations
    # of (value_type, base_type) from field.txt
    typecodes = {
        (11,  0)  : util.CTYPES.integer,
        (31,  0)  : util.CTYPES.continuous,
        (21,  11) : util.CTYPES.categorical_single,
        (21,  41) : util.CTYPES.categorical_single_non_numeric,
        (22,  11) : util.CTYPES.categorical_multiple,
        (22,  41) : util.CTYPES.categorical_multiple_non_numeric,
        (61,  0)  : util.CTYPES.time,
        (51,  0)  : util.CTYPES.date,
        (41,  0)  : util.CTYPES.text,
        (101, 0)  : util.CTYPES.compound,
    }

    # We need pandas >=0.24 to support enums here
    def settype(valtype, basetype):
        return typecodes[valtype, basetype]

    datadir   = op.join(op.dirname(__file__), 'schema')
    fields    = schema.loadFieldProperties()
    encodings = schema.loadEncodingDictionaries()
    types     = pd.read_csv(op.join(datadir, 'type.txt'),
                            delimiter='\t',
                            index_col=0,
                            converters={'Type' : convert_dtype})

    dcbase  = pd.DataFrame({'ID' : encodings['encoding_id']}).set_index('ID')
    varbase = pd.DataFrame({
        'ID'          : fields['field_id'].astype(np.uint64),
        'Type'        : fields['value_type'].combine(fields['base_type'],
                                                     settype),
        'Description' : fields['title'],
        'DataCoding'  : fields['encoding_id'],
        'Instancing'  : fields['instance_id'],
    }).set_index('ID')

    types.rename({'Type' : 'InternalType'}, axis=1, inplace=True)
    varbase = pd.concat((varbase, types), axis=1, join='outer')

    return varbase, dcbase

def mergeTableFiles(
        base       : pd.DataFrame,
        fnames     : List[str],
        what       : str,
        dtypes     : Dict[str, Type],
        converters : Dict[str, Callable],
        columns    : List[str]
) -> pd.DataFrame:
    """Load and merge one or more table files.

    This function is called by :func:`loadVariableTable` to load the
    variable, data coding, and type table files.

    The variable and datacoding tables can be loaded from multiple files,
    with each file containing part of the full table. All provided files
    are merged into one table. The final table for a given set of files is
    the outer join on the index column (assumed to be the first column in
    each file), where non-na values in overlapping columns from later files
    will overwrite the values in earlier files.

    :arg base:       Table containing base information - used for the
                     variable and datacoding tables (see
                     :func:`loadTableBases`).

    :arg fnames:     List of files to load. If ``None``, the ``base`` table
                     is returned (or an empty table if a ``base`` is not
                     given).

    :arg what:       Name of the table files being loaded - used solely for
                     log messages

    :arg dtypes:     Dict containing ``{column : datatype}`` mappings

    :arg converters: Dict containing ``{column : convertfunc}`` mappings

    :arg columns:    Expected column names
    """

    idcol   = columns[0]
    columns = columns[1:]

    if fnames is None:
        fnames = []

    fnames = [f for f in fnames if f is not None]

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=pd.errors.ParserWarning)

        for f in fnames:
            log.debug('Loading %s table from %s', what, f)

            table = pd.read_csv(f, sep='\t',
                                index_col=0,
                                dtype=dtypes,
                                converters=converters)

            if base is None:
                base = table
                continue

            # Merge each file with an outer
            # join, so we retain all IDs and
            # columns defined across the
            # entire set of files.
            merged = base.merge(table, how='outer', on=idcol, sort=True,
                                suffixes=('_x', '_y'))

            # Now we merge overlapping columns -
            # non-na values in later files take
            # precedence.
            for c in [c[:-2] for c in merged.columns if c.endswith('_x')]:
                bname     = c + '_x'
                tname     = c + '_y'
                merged[c] = merged[bname]
                notna     = merged[tname].notna()

                merged.loc[notna, c] = merged.loc[notna, tname]
                merged = merged.drop(columns=[bname, tname])

            base = merged

    # no base, and no files
    if base is None:
        base = pd.DataFrame()

    # error if we have any
    # unrecognised columns
    for col in base.columns:
        if col not in columns:
            raise ValueError('Unrecognised column in table file {} - '
                             'should be {}, but file contained {}.'.format(
                                 fnames, columns, base.columns))

    # in-fill any columns that
    # were not provided
    for col in columns:
        if col not in base.columns:
            base[col] = pd.Series(dtype=np.float64)

    return base

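# A sketch of the merge semantics (vars1.tsv and vars2.tsv are
# hypothetical file names): if vars1.tsv defines NAValues for variable
# 1, and vars2.tsv defines NAValues for variables 1 and 2, the merged
# table contains both variables, with variable 1's NAValues taken from
# vars2.tsv, as non-na values in later files take precedence:
#
#     merged = mergeTableFiles(None, ['vars1.tsv', 'vars2.tsv'],
#                              'variable', VARTABLE_DTYPES,
#                              VARTABLE_CONVERTERS, VARTABLE_COLUMNS)
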
def sanitiseVariableTable(
        vartable   : pd.DataFrame,
        cols       : Sequence[datatable.Column],
        dropAbsent : bool
) -> List[datatable.Column]:
    """Ensures that the variable table contains an entry for every variable
    in the input data.

    Called by :func:`loadVariableTable`.

    :arg vartable:   ``pandas.DataFrame`` containing the variable table.

    :arg cols:       Sequence of :class:`.Column` objects representing the
                     columns in the input data.

    :arg dropAbsent: If ``True``, entries in the table for variables which
                     are not in ``cols`` will be removed.

    :return:         A list of unknown :class:`.Column` objects, i.e.
                     representing variables which were not in the variable
                     table.
    """

    unknownVars = []

    # Make sure a placeholder entry is
    # present for any variables which are
    # not in the variable table, but which
    # are in the data file(s).
    for col in cols:
        vid  = col.vid
        name = col.name

        if vid in vartable.index:
            continue

        unknownVars.append(col)
        addNewVariable(vartable, vid, name)

    # And the inverse - we can drop any
    # variables from the variable table
    # that are not in the data.
    if dropAbsent:
        vids = [c.vid for c in cols]
        vartable.drop([v for v in vartable.index if v not in vids],
                      inplace=True)

    return unknownVars

def mergeIntoVariableTable(
        vartable : pd.DataFrame,
        cols     : Union[str, Sequence[str]],
        mapping  : Dict[int, Any]):
    """Merge data from ``mapping`` into the variable table.

    Called by :func:`loadVariableTable`.

    :arg vartable: The variable table

    :arg cols:     Names of columns in the variable table

    :arg mapping:  Dict of ``{vid : values}`` mappings containing the data
                   to copy in.
    """

    onecol = isinstance(cols, str)
    if onecol:
        cols = [cols]

    # Ignore any variables that
    # are not in variable table
    vids = list(mapping.keys())
    vin  = pd.Series(vids).isin(vartable.index)
    vids = [v for i, v in enumerate(vids) if vin[i]]

    for vid in vids:
        vals = mapping[vid]

        if onecol:
            vals = [vals]

        for col, val in zip(cols, vals):
            vartable.at[vid, col] = val

def mergeDataCodingTable(
        vartable : pd.DataFrame,
        dctable  : pd.DataFrame):
    """Merges information from the data coding table into the variable
    table.

    Called by :func:`loadVariableTable`.

    :arg vartable: The variable table.

    :arg dctable:  The data coding table.
    """

    with_datacoding = vartable['DataCoding'].notna()

    for field in ['NAValues', 'RawLevels', 'NewLevels']:
        mask    = vartable[field].isna() & with_datacoding
        newvals = vartable.loc[mask].merge(
            dctable,
            left_on='DataCoding',
            right_index=True,
            suffixes=('_v', '_dc'),
            copy=False)['{}_dc'.format(field)]
        vartable.loc[mask, field] = newvals

def mergeCleanFunctions(
        vartable    : pd.DataFrame,
        tytable     : pd.DataFrame,
        clean       : Dict[int, str],
        typeClean   : Dict[str, str],
        globalClean : str):
    """Merges custom clean functions into the variable table.

    Called by :func:`loadVariableTable`.

    :arg vartable:    The variable table.

    :arg tytable:     The type table

    :arg clean:       Dictionary of ``{vid : expr}`` mappings containing
                      cleaning functions to apply - this will override any
                      cleaning specified in the variable file, and any
                      cleaning specified in ``typeClean``.

    :arg typeClean:   Dictionary of ``{type : expr}`` mappings containing
                      cleaning functions to apply to all variables of a
                      specific type - this will override any cleaning
                      specified in the type file.

    :arg globalClean: Expression containing cleaning functions to apply to
                      every variable - this will be performed after
                      variable-specific cleaning in the variable table, or
                      specified via ``clean`` or ``typeClean``.
    """

    # Merge type-specific Clean
    # from the type table into
    # the variable table.
    for vid in vartable.index:

        if vid == 0:
            continue

        vtype = vartable.loc[vid, 'Type']
        pp    = vartable.loc[vid, 'Clean']

        # Override with typeClean if necessary
        if typeClean is not None and vtype in typeClean:
            tpp = convert_Process('cleaner', typeClean[vtype])
        elif vtype in tytable.index:
            tpp = collections.OrderedDict(tytable.loc[vtype, 'Clean'])
        else:
            continue

        # type cleaning is applied after
        # variable-specific cleaning
        if pd.isnull(pp): vartable.loc[[vid], 'Clean'] = [tpp]
        else:             vartable.loc[ vid,  'Clean'].update(tpp)

    # Override cleaning with expressions
    # that have been passed on the command line
    if clean is not None:
        clean = {vid : convert_Process('cleaner', expr)
                 for vid, expr in clean.items()}
        mergeIntoVariableTable(vartable, 'Clean', clean)

    # Add global cleaning to all variables
    if globalClean is not None:
        for vid in vartable.index:

            if vid == 0:
                continue

            pp  = vartable.loc[vid, 'Clean']
            gpp = convert_Process('cleaner', globalClean)

            # global cleaning is applied
            # after all other cleaning
            if pd.isnull(pp): vartable.loc[[vid], 'Clean'] = [gpp]
            else:             vartable.loc[ vid,  'Clean'].update(gpp)

def addNewVariable(
        vartable   : pd.DataFrame,
        vid        : int,
        name       : str,
        dtype      : np.dtype = None,
        instancing : int      = None):
    """Add a new row to the variable table.

    The ``instancing`` argument defines the meaning of the
    :attr:`.Column.visit` field for columns associated with this variable.
    The default value is ``2``, meaning that this variable may be associated
    with columns corresponding to measurements acquired at different UK
    BioBank assessments and imaging visits. See
    https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi?id=9 and
    https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi?id=10 for more details.

    .. note:: If an entry for the specified ``vid`` already exists in
              ``vartable``, the ``name``, ``dtype`` and ``instancing``
              arguments are ignored and the existing information in
              ``vartable`` will take precedence.

    :arg vartable:   The variable table

    :arg vid:        Integer variable ID

    :arg name:       Variable name - used as the description

    :arg dtype:      ``numpy`` data type. If ``None``, the variable type
                     is set to :attr:`.util.CTYPES.unknown`.

    :arg instancing: Instancing code for the new variable - defaults to
                     ``2``
    """

    # If an entry already exists
    # in the variable table, it
    # takes precedence
    if vid in vartable.index:
        dtype      = vartable.at[vid, 'Type']
        name       = vartable.at[vid, 'Description']
        instancing = vartable.at[vid, 'Instancing']

    else:
        # set dtype to something which
        # will cause the conditionals
        # to fall through
        if dtype is None: dtype = object
        else:             dtype = dtype.type

        # Assume that new variables
        # are associated with visits
        if instancing is None:
            instancing = 2

        if   issubclass(dtype, int):   dtype = util.CTYPES.integer
        elif issubclass(dtype, float): dtype = util.CTYPES.continuous
        else:                          dtype = util.CTYPES.unknown

    vartable.loc[vid, 'Description'] = name
    vartable.loc[vid, 'Type']        = dtype
    vartable.loc[vid, 'Instancing']  = instancing

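# A minimal sketch (hypothetical VID and name). Note that, of the
# floating point types, only np.float64 subclasses the Python ``float``,
# so e.g. a float32 dtype would fall through to CTYPES.unknown:
#
#     addNewVariable(vartable, 99999, 'my_custom_var',
#                    dtype=np.dtype(np.float64))
#     # vartable.loc[99999, 'Type'] is now util.CTYPES.continuous
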
def loadProcessingTable(
        procfile       : str  = None,
        skipProcessing : bool = False,
        prependProcess : Sequence[Tuple[List[int], str]] = None,
        appendProcess  : Sequence[Tuple[List[int], str]] = None,
        cattable       : pd.DataFrame = None,
        **kwargs  # pylint: disable=unused-argument
) -> pd.DataFrame:
    """Loads the processing table from the given file.

    :arg procfile:       Path to the processing table file.

    :arg skipProcessing: If ``True``, the processing table is not loaded
                         from ``procfile``. The ``prependProcess`` and
                         ``appendProcess`` arguments are still applied.

    :arg prependProcess: Sequence of ``(varids, procstr)`` mappings
                         specifying processes to prepend to the beginning
                         of the processing table.

    :arg appendProcess:  Sequence of ``(varids, procstr)`` mappings
                         specifying processes to append to the end of the
                         processing table.

    :arg cattable:       ``pandas.DataFrame`` containing variable
                         categories. If not provided, any processing rules
                         which refer to categories cannot be parsed, and
                         will result in an error.

    All other keyword arguments are ignored.
    """

    if prependProcess is None: prependProcess = []
    if appendProcess  is None: appendProcess  = []

    converters             = dict(PROCTABLE_CONVERTERS)
    converters['Variable'] = ft.partial(convert_Process_Variable,
                                        cattable=cattable)

    if (procfile is not None) and (not skipProcessing):
        log.debug('Loading processing table from %s', procfile)
        proctable = pd.read_csv(procfile, sep='\t',
                                index_col=False,
                                skip_blank_lines=True,
                                comment='#',
                                converters=converters)
    else:
        proctable = pd.DataFrame(columns=PROCTABLE_COLUMNS)

    # prepend/append custom
    # processes to the table
    proctable.index += len(prependProcess)

    for i, (vids, procs) in enumerate(prependProcess):
        vids  = convert_Process_Variable(vids, cattable)
        procs = convert_Process('processor', procs)
        proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
        proctable.at[i, 'Process']  = procs

    for i, (vids, procs) in enumerate(appendProcess, len(proctable.index)):
        vids  = convert_Process_Variable(vids, cattable)
        procs = convert_Process('processor', procs)
        proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
        proctable.at[i, 'Process']  = procs

    proctable.sort_index(inplace=True)

    return proctable

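# A minimal sketch (the process expression is hypothetical, and assumes
# a processing function named removeIfSparse exists), appending a rule
# which is applied to all variables:
#
#     proctable = loadProcessingTable(
#         appendProcess=[('all', 'removeIfSparse(minpres=100)')])
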
def loadCategoryTable(catfile : str = None) -> pd.DataFrame:
    """Loads the category table from the given file.

    :arg catfile: Path to the category file.
    """
    if catfile is not None:
        log.debug('Loading category table from %s', catfile)
        cattable = pd.read_csv(catfile, sep='\t',
                               index_col=0,
                               dtype=CATTABLE_DTYPES,
                               converters=CATTABLE_CONVERTERS)
    else:
        cattable            = pd.DataFrame(columns=CATTABLE_COLUMNS[1:])
        cattable.index.name = CATTABLE_COLUMNS[0]

    return cattable

def categoryVariables(
        cattable   : pd.DataFrame,
        categories : Sequence[Union[int, str]]
) -> List[int]:
    """Returns a list of variable IDs from ``cattable`` which correspond to
    the strings in ``categories``.

    :arg cattable:   The category table.

    :arg categories: Sequence of integer category IDs or label sub-strings
                     specifying the categories to return.

    :returns:        A list of variable IDs.
    """

    allvars = []

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for cat in categories:

            catpat  = re.compile('({})'.format(cat), re.IGNORECASE)
            idmask  = cattable.index.isin([cat])
            lblmask = cattable['Category'].str.contains(catpat)
            catvars = cattable.loc[idmask | lblmask, 'Variables']

            if len(catvars) == 0:
                continue

            for c in catvars.iloc[0]:
                if c not in allvars:
                    allvars.append(c)

    return allvars

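# A minimal sketch over a hand-built category table (the category and
# variable IDs are hypothetical) - categories may be selected by integer
# ID or by a case-insensitive label sub-string:
#
#     cattable = pd.DataFrame(
#         {'ID'        : [1],
#          'Category'  : ['brain mri'],
#          'Variables' : [[25000, 25001]]}).set_index('ID')
#     assert categoryVariables(cattable, ['MRI']) == [25000, 25001]
#     assert categoryVariables(cattable, [1])     == [25000, 25001]
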
def variableCategories(
        cattable : pd.DataFrame,
        vids     : Sequence[int]
) -> Dict[int, List[str]]:
    """Identify the categories for each variable in ``vids``.

    :arg cattable: The category table

    :arg vids:     Sequence of variable/datafield IDs

    :returns:      A dict of ``{vid : [category]}`` mappings
    """
    categories = collections.defaultdict(list)

    for v in vids:
        for i in cattable.index:
            if v in cattable.loc[i, 'Variables']:
                categories[v].append(cattable.loc[i, 'Category'])

    return categories

def addImplicitCategories(
        cattable : pd.DataFrame,
        unknown  : Sequence[datatable.Column],
        uncat    : Sequence[datatable.Column]):
    """Adds some implicit/automatic categories to the category table.

    The following implicit categories are added:

      - ``unknown``: Variables which are not present in the variable
        table - this comprises non-UKB variables, or new UKB variables
        which are not described by the internal FUNPACK variable
        information in ``funpack/schema/``.

      - ``uncategorised``: Variables which are not present in any other
        category.

    :arg cattable: The category table.

    :arg unknown:  Sequence of :class:`.Column` objects representing
                   variables to add to an "unknown" category.

    :arg uncat:    Sequence of :class:`.Column` objects representing
                   variables to add to an "uncategorised" category.
    """

    for cols, label in zip((unknown, uncat), ('unknown', 'uncategorised')):

        if cols is None:
            continue

        vids = list(sorted({c.vid for c in cols}))

        # label already in table?
        umask = cattable['Category'] == label

        if np.any(umask):
            idx  = np.where(umask)[0][0]
            idx  = cattable.index[idx]
            vids = cattable.loc[idx, 'Variables'] + vids
        else:
            idx = IMPLICIT_CATEGORIES[label]

        cattable.loc[idx, 'Category']  = label
        cattable.loc[idx, 'Variables'] = list(vids)

def columnTypes(
        vartable : pd.DataFrame,
        columns  : Sequence[datatable.Column]
) -> Tuple[List[util.CTYPES], Dict[str, np.dtype]]:
    """Retrieves the type of each column in ``columns`` as listed in
    ``vartable``. Also identifies a suitable internal data type to use for
    each column where possible.

    :arg vartable: The variable table.

    :arg columns:  List of :class:`.Column` objects.

    :returns:      A tuple containing:

                    - A list containing the type for each column in
                      ``columns`` - an identifier from the
                      :attr:`.util.CTYPES` enum. Columns corresponding to a
                      variable which is not in the variable table are given
                      a type of ``None``.

                    - A dict of ``{ column_name : dtype }`` mappings
                      containing a suitable internal data type to use for
                      some columns.
    """

    vttypes = []
    dtypes  = {}

    for col in columns:

        vid  = col.vid
        name = col.name

        if vid not in vartable.index:
            vttypes.append(None)
            continue

        vttype = vartable.loc[vid, 'Type']
        dtype  = vartable.loc[vid, 'InternalType']

        if pd.isna([dtype])[0]:
            dtype = util.DATA_TYPES.get(vttype, None)

        vttypes.append(vttype)

        if dtype is not None:
            dtypes[name] = dtype

    return vttypes, dtypes

def identifyUncategorisedVariables(
        fileinfo : finfo.FileInfo,
        cattable : pd.DataFrame
) -> List[datatable.Column]:
    """Called by :func:`loadTables`. Identifies all variables which are in
    the data file(s), but which are uncategorised (not present in any
    categories in the category table).

    Such variables might have been overlooked, so the user may need to be
    warned about them.

    :arg fileinfo: :class:`.FileInfo` object.

    :arg cattable: Category table

    :returns:      A list of :class:`.Column` objects associated with
                   variables that are uncategorised.
    """

    def isCategorised(col):
        def inCategory(catvars):
            return col.vid in catvars
        return cattable['Variables'].apply(inCategory).any()

    uncategorised = []

    for datafile in fileinfo.datafiles:
        cols = fileinfo.columns(datafile)
        cols = [c for c in cols
                if (c.vid != 0) and (not isCategorised(c))]
        uncategorised.extend(cols)

    return uncategorised