#!/usr/bin/env python
#
# loadtables.py - Functions which load the variable, data coding, processing,
# and category tables used by funpack.
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module provides functions and logic to load the variable, data coding,
type, processing, and category tables used by funpack.
The variable table is a ``pandas.DataFrame`` which contains metadata about all
UK Biobank variables in the input data files, along with cleaning rules. The
data coding and type tables contain analogous metadata about all UK Biobank
data codings and types - these are merged into the variable table after
being loaded. All of these tables are loaded by the :func:`loadVariableTable`
function.
The processing table contains an ordered list of processing functions to be
applied to the input data.
The category table contains collections of variable groupings; it is used to
allow the user to select these groups by name, rather than having to
use variable IDs.
.. autosummary::
:nosignatures:
loadTables
loadVariableTable
addNewVariable
loadProcessingTable
loadCategoryTable
categoryVariables
columnTypes
"""
import itertools as it
import functools as ft
import os.path as op
import re
import logging
import warnings
import collections
from typing import Tuple, Sequence, Union, Dict, List, Type, Any, Callable
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
import numpy as np
import pandas as pd
import funpack.util as util
import funpack.fileinfo as finfo
import funpack.datatable as datatable
import funpack.parsing as parsing
import funpack.schema as schema
log = logging.getLogger(__name__)
def convert_type(val : str) -> util.CTYPES:
"""Convert a string containing a UK BioBank type into a numerical
identifier for that type - see :attr:`funpack.util.CTYPES`.
"""
valmap = {
'sequence' : util.CTYPES.sequence,
'integer' : util.CTYPES.integer,
'continuous' : util.CTYPES.continuous,
'categorical (single)' : util.CTYPES.categorical_single,
'categorical (single non-numeric)' : util.CTYPES.categorical_single_non_numeric,
'categorical (multiple)' : util.CTYPES.categorical_multiple,
'categorical (multiple non-numeric)' : util.CTYPES.categorical_multiple_non_numeric,
'time' : util.CTYPES.time,
'date' : util.CTYPES.date,
'text' : util.CTYPES.text,
'compound' : util.CTYPES.compound,
'unknown' : util.CTYPES.unknown,
}
return valmap.get(val.lower(), util.CTYPES.unknown)
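# Illustrative usage (a sketch; assumes the util.CTYPES members named in
# the mapping above):
#
#     convert_type('Integer')       # -> util.CTYPES.integer
#     convert_type('Date')          # -> util.CTYPES.date
#     convert_type('no such type')  # -> util.CTYPES.unknown (the fallback)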
def convert_dtype(val : str) -> Union[np.dtype, Literal[np.nan]]:
"""Convert a string containing a ``numpy.dtype`` (e.g. ``'float32'``)
into a ``dtype`` object.
"""
if val == '':
return np.nan
dtype = getattr(np, val, None)
if dtype not in np.ScalarType:
raise ValueError('Invalid numpy dtype: {}'.format(val))
return dtype
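# Illustrative usage (a sketch; an empty string means "no internal type",
# and maps to np.nan):
#
#     convert_dtype('float32')    # -> np.float32
#     convert_dtype('')           # -> nan
#     convert_dtype('notatype')   # raises ValueError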
def convert_comma_sep_text(val : str) -> Union[List[str], Literal[np.nan]]:
"""Convert a string containing comma-separated text into a list. """
if val.strip() == '':
return np.nan
words = val.split(',')
return [w.strip() for w in words]
def convert_comma_sep_numbers(val : str) -> Union[np.ndarray, Literal[np.nan]]:
"""Convert a string containing comma-separated numbers into a ``numpy``
array.
"""
if val.strip() == '':
return np.nan
return np.fromstring(val, sep=',', dtype=np.float64)
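# Illustrative usage of the comma-separated-value converters (a sketch):
#
#     convert_comma_sep_text('a, b, c')     # -> ['a', 'b', 'c']
#     convert_comma_sep_numbers('1, 2, 3')  # -> array([1., 2., 3.])
#     convert_comma_sep_text('  ')          # -> nan (nothing specified)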
def convert_ParentValues(
val : str
) -> Union[List[parsing.VariableExpression], Literal[np.nan]]:
"""Convert a string containing a sequence of comma-separated
``ParentValue`` expressions into a sequence of :class:`.VariableExpression`
objects.
"""
if val.strip() == '':
return np.nan
return [parsing.VariableExpression(e) for e in val.split(',')]
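# Illustrative usage (a sketch; the exact expression syntax accepted here
# is defined by parsing.VariableExpression - 'v123 == 0' is shown as a
# plausible example):
#
#     exprs = convert_ParentValues('v123 == 0, v456 >= 2')
#     # -> [VariableExpression('v123 == 0'), VariableExpression('v456 >= 2')]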
def convert_Process_Variable(
val : str,
cattable : pd.DataFrame = None
) -> Tuple[str, List[int]]:
"""Convert a string containing a process variable specification.
A process variable specification comprises one or more comma-separated:
- integer variable IDs,
- MATLAB-style ``start:step:stop`` ranges denoting a range of
variable IDs,
- category IDs, denoted as ``'cat<ID>'``, e.g. ``'cat25'``.
A specification may be preceded by one of:
- ``'all'``, indicating that the process is to be applied to all
variables simultaneously (this is the default)
- ``'independent,'``, followed by one or more comma-separated variable
IDs, indicating that the process is to be applied to the specified
variables independently.
- ``'all_independent'``, indicating that the process is to be applied
to all variables independently
- ``'all_except,'``, followed by one or more comma-separated MATLAB-style
ranges, indicating that the process is to be applied to all variables
simultaneously, except for the specified variables.
- ``'all_independent_except,'``, followed by one or more comma-separated
MATLAB-style ranges, indicating that the process is to be applied to
all variables independently, except for the specified variables.
:returns: A tuple containing:
- The process variable type - one of ``'all'``,
``'all_independent'``, ``'all_except'``,
``'all_independent_except'``, ``'independent'``, or
``'vids'``
- A list of variable IDs (empty if the process variable type
is ``'all'`` or ``'all_independent'``).
"""
tokens = convert_comma_sep_text(val)
if len(tokens) == 1 and \
tokens[0] in ('all', 'all_independent',
'all_except', 'all_independent_except'):
return np.array((tokens[0], []), dtype=object)
if tokens[0] in ('independent', 'all_except', 'all_independent_except'):
ptype = tokens[0]
tokens = tokens[1:]
else:
ptype = 'vids'
vids = []
for token in tokens:
if token.startswith('cat'):
if cattable is None:
raise RuntimeError(
'Cannot parse process variable specification '
f'without category table: {val}')
cid = int(token[3:])
vids.extend(categoryVariables(cattable, [cid]))
else:
vids.extend(util.parseMatlabRange(token))
return np.array((ptype, vids), dtype=object)
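# Illustrative usage (a sketch - results are shown here as (type, vids)
# pairs, although they are returned as two-element object arrays; 'cat10'
# assumes a category table containing a category with ID 10):
#
#     convert_Process_Variable('1:3,10')           # -> ('vids', [1, 2, 3, 10])
#     convert_Process_Variable('all')              # -> ('all', [])
#     convert_Process_Variable('all_except,1:3')   # -> ('all_except', [1, 2, 3])
#     convert_Process_Variable('cat10', cattable)  # -> ('vids', <VIDs in category 10>)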
def convert_Process(
ptype : str,
val : str
) -> Dict[str, parsing.Process]:
"""Convert a string containing a sequence of comma-separated ``Process`` or
``Clean`` expressions into an ``OrderedDict`` of :class:`.Process`
objects (with the process names used as dictionary keys).
"""
if val.strip() == '':
return np.nan
procs = parsing.parseProcesses(val, ptype)
return collections.OrderedDict([(p.name, p) for p in procs])
def convert_category_variables(val : str) -> List[int]:
"""Convert a string containing a sequence of comma-separated variable IDs
or ranges into a list of variable IDs. Variables may be specified as
integer IDs, or via a MATLAB-style ``start:step:stop`` range. See
:func:`.util.parseMatlabRange`.
"""
ranges = convert_comma_sep_text(val)
variables = list(it.chain(*[util.parseMatlabRange(r) for r in ranges]))
return variables
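# Illustrative usage (a sketch):
#
#     convert_category_variables('1:3,10,20:2:24')
#     # -> [1, 2, 3, 10, 20, 22, 24]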
VARTABLE_COLUMNS = [
'ID',
'Type',
'InternalType',
'Description',
'DataCoding',
'Instancing',
'NAValues',
'RawLevels',
'NewLevels',
'ParentValues',
'ChildValues',
'Clean']
"""The columns that must be in a variable table file. """
DCTABLE_COLUMNS = [
'ID',
'NAValues',
'RawLevels',
'NewLevels']
"""The columns that must be in a datacoding table file. """
TYPETABLE_COLUMNS = [
'Type',
'Clean']
"""The columns that must be in a type table file. """
PROCTABLE_COLUMNS = [
'Variable',
'Process']
"""The columns that must be in a processing table file. """
CATTABLE_COLUMNS = [
'ID',
'Category',
'Variables']
"""The columns that must be in a category table file. """
VARTABLE_DTYPES = {
'ID' : np.uint32,
'Description' : object,
'Type' : object,
# We can't use an integer for the data
# coding, because not all variables
# have a data coding, and pandas uses
# np.nan to represent missing data.
'DataCoding' : np.float32,
'Instancing' : np.float32,
'NAValues' : object,
'RawLevels' : object,
'NewLevels' : object,
'ParentValues' : object,
'ChildValues' : object,
'Clean' : object,
}
"""Types to use for some columns in the variable table. """
VARTABLE_CONVERTERS = {
'Type' : convert_type,
'InternalType' : convert_dtype,
'ParentValues' : convert_ParentValues,
'Clean' : ft.partial(convert_Process, 'cleaner'),
}
"""Custom converter functinos to use for some columns in the variable
table.
"""
DCTABLE_DTYPES = {
'ID' : np.uint32,
'NAValues' : object,
'RawLevels' : object,
'NewLevels' : object,
}
"""Types to use for some columns in the data coding table. """
TYPETABLE_DTYPES = {
'Type' : object,
'Clean' : object,
}
"""Types to use for some columns in the types table. """
TYPETABLE_CONVERTERS = {
'Type' : convert_type,
'Clean' : ft.partial(convert_Process, 'cleaner'),
}
"""Custom converter functinos to use for some columns in the type trable. """
PROCTABLE_CONVERTERS = {
'Process' : ft.partial(convert_Process, 'processor'),
}
"""Custom converter functinos to use for some columns in the processing
table.
"""
CATTABLE_DTYPES = {
'ID' : np.int32,
}
"""Types to use for some columns in the category table. """
CATTABLE_CONVERTERS = {
'Variables' : convert_category_variables
}
"""Custom converter functinos to use for some columns in the category
table.
"""
IMPLICIT_CATEGORIES = {
'unknown' : -1,
'uncategorised' : -2,
}
"""This dict contains some categories which are automatically/implicitly
added to the category table by the :func:`loadTables` function (via a
call to :func:`addImplicitCategories`).
"""
def loadTables(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
procfile : str = None,
catfile : str = None,
**kw
) -> Tuple[pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
List[datatable.Column],
List[datatable.Column]]:
"""Loads the data tables used to run ``funpack``.
:arg fileinfo: :class:`.FileInfo` object describing the input data files.
:arg varfiles: Path to one or more partial variable table files
:arg dcfiles: Path to one or more partial data coding table files
:arg typefile: Path to the type table file
:arg procfile: Path to the processing table file
:arg catfile: Path to the category table file
All other arguments are passed through to the :func:`loadVariableTable`
and :func:`loadProcessingTable` functions.
:returns: A tuple containing:
- The variable table
- The processing table
- The category table
- List of :class:`.Column` objects representing columns
which were in the data file(s), but not in the variable
table.
- List of :class:`.Column` objects representing columns
which are uncategorised.
"""
vartable, unk, _ = loadVariableTable(fileinfo,
varfiles,
dcfiles,
typefile,
**kw)
cattable = loadCategoryTable(catfile)
proctable = loadProcessingTable(procfile, cattable=cattable, **kw)
unc = identifyUncategorisedVariables(fileinfo, cattable)
# uncategorised should not contain unknown
unc = [c for c in unc if c not in unk]
# Add categories for unknown/
# uncategorised variables
addImplicitCategories(cattable, unk, unc)
return vartable, proctable, cattable, unk, unc
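# Illustrative usage (a sketch; the file names are hypothetical, and it is
# assumed that a FileInfo can be constructed directly from a list of input
# data files):
#
#     fileinfo = finfo.FileInfo(['data.tsv'])
#     vartable, proctable, cattable, unk, unc = loadTables(
#         fileinfo,
#         varfiles=['variables.tsv'],
#         procfile='processing.tsv',
#         catfile='categories.tsv')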
def loadVariableTable(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
noBuiltins : bool = False,
naValues : Dict[int, str] = None,
childValues : Dict[int, Tuple[str, str]] = None,
recoding : Dict[int, Tuple[str, str]] = None,
clean : Dict[int, str] = None,
typeClean : Dict[util.CTYPES, str] = None,
globalClean : str = None,
dropAbsent : bool = True,
**kwargs # pylint: disable=unused-argument
) -> Tuple[pd.DataFrame,
Sequence[datatable.Column],
Sequence[datatable.Column]]:
"""Given variable table and datacoding table file names, builds and returns
the variable table.
:arg fileinfo: :class:`.FileInfo` object describing the input data
files.
:arg varfiles: Path(s) to one or more variable files
:arg dcfiles: Path(s) to one or more data coding files
:arg typefile: Path to the type file
:arg noBuiltins: If ``True``, the built-in variable and datacoding base
tables are not loaded.
:arg naValues: Dictionary of ``{vid : values}`` mappings, specifying
values which should be replaced with NA. The values
are expected to be strings of comma-separated values.
:arg childValues: Dictionary of ``{vid : (exprs, values)}`` mappings,
specifying parent value expressions, and corresponding
child values. The expressions and values
are expected to be strings of comma-separated values
of the same length.
:arg recoding: Dictionary of ``{vid : (rawlevel, newlevel)}``
mappings. The raw and new levels are expected to be
strings of comma-separated values of the same length.
:arg clean: Dictionary of ``{vid : expr}`` mappings containing
cleaning functions to apply - this will override
any cleaning specified in the variable file, and
any cleaning specified in ``typeClean``. The expressions
are expected to be strings.
:arg typeClean: Dictionary of ``{type : expr}`` mappings containing
cleaning functions to apply to all variables of a
specific type - this will override any cleaning
specified in the type file. The expressions
are expected to be strings.
:arg globalClean: Expression containing cleaning functions to
apply to every variable - this will be performed after
variable-specific cleaning in the variable table,
or specified via ``clean`` or ``typeClean``. The
expressions are expected to be strings.
:arg dropAbsent: If ``True`` (the default), remove all variables from the
variable table which are not present in the data
file(s).
All other keyword arguments are ignored.
:returns: A tuple containing:
- A ``pandas.DataFrame`` containing the variable table
- A sequence of :class:`.Column` objects representing variables
which were present in the data files but not in the variable
table, and which were therefore added to it.
- A sequence of :class:`.Column` objects representing variables
which were present in the data files and in the variable
table, but which did not have any cleaning rules specified.
"""
if noBuiltins: varbase, dcbase = None, None
else: varbase, dcbase = loadTableBases()
vartable = mergeTableFiles(varbase,
varfiles,
'variable',
VARTABLE_DTYPES,
VARTABLE_CONVERTERS,
VARTABLE_COLUMNS)
dctable = mergeTableFiles(dcbase,
dcfiles,
'data coding',
DCTABLE_DTYPES,
{},
DCTABLE_COLUMNS)
tytable = mergeTableFiles(None,
[typefile],
'type',
TYPETABLE_DTYPES,
TYPETABLE_CONVERTERS,
TYPETABLE_COLUMNS)
# Make sure data types are aligned,
# otherwise we may run into problems
# when merging them together.
vartable = vartable.astype(
{c : t for c, t in VARTABLE_DTYPES .items() if c != 'ID'})
dctable = dctable .astype(
{c : t for c, t in DCTABLE_DTYPES .items() if c != 'ID'})
tytable = tytable .astype(
{c : t for c, t in TYPETABLE_DTYPES.items() if c != 'Type'})
vartable.index = vartable.index.astype(VARTABLE_DTYPES[ 'ID'])
dctable .index = dctable .index.astype(DCTABLE_DTYPES[ 'ID'])
tytable .index = tytable .index.astype(TYPETABLE_DTYPES['Type'])
# Build a list of all columns in the input
# data files, with the index column(s)
# from each file dropped (index columns
# are assigned a VID of 0)
cols = []
for df in fileinfo.datafiles:
dfcols = fileinfo.columns(df)
cols.extend([c for c in dfcols if c.vid != 0])
# Make sure the variable table
# contains an entry for every
# variable in the input data.
unknownVars = sanitiseVariableTable(vartable, cols, dropAbsent)
# Merge data coding specific NAValues,
# RawLevels, and NewLevels from the data
# coding table into the variable table.
mergeDataCodingTable(vartable, dctable)
# Merge provided naValues, recodings,
# and childValues into the variable
# table (overriding whatever was specified
# in the datacoding/variable tables)
if naValues is not None:
mergeIntoVariableTable(
vartable,
'NAValues',
naValues)
if recoding is not None:
mergeIntoVariableTable(
vartable,
['RawLevels', 'NewLevels'],
recoding)
if childValues is not None:
childValues = {vid : (convert_ParentValues(expr), values)
for vid, (expr, values) in childValues.items()}
mergeIntoVariableTable(
vartable,
['ParentValues', 'ChildValues'],
childValues)
# navalues, raw/new levels and child values
# are all still comma-separated strings -
# convert them to types appropriate to the
# datafield/variable
def convert(rowvalues, column):
val = rowvalues[column]
if pd.isna(val):
return np.nan
return parsing.parseValueExpressions(val, rowvalues['Type'])
# Make sure the series type stays as "object" - if
# no rules are specified on any vid, the convert
# function will return all nans, and pandas will
# coerce the series type to float64, which may
# interfere with subsequent rule assignments (as
# these series may contain numpy arrays or
# list-likes)
if len(vartable) > 0:
for col in ('NAValues', 'RawLevels', 'NewLevels', 'ChildValues'):
vals = vartable.apply(convert, axis=1, args=(col,))
vartable[col] = vals.astype(object)
# Before merging the cleaning functions
# in, we generate a list of variables
# which are "uncleaned", i.e. have not
# had any cleaning specified, as this
# may indicate that a variable has been
# overlooked.
#
# If a variable has indirectly had NA
# value insertion or recoding applied
# via its data coding, it is not included
# in this list.
if clean is not None: ucmask = ~vartable.index.isin(clean.keys())
else: ucmask = vartable.index.notna()
ucmask = (ucmask &
vartable['NAValues'] .isna() &
vartable['RawLevels'] .isna() &
vartable['ParentValues'].isna() &
vartable['Clean'] .isna())
ucmask = ucmask[ucmask]
uncleanVars = [c for c in cols
if (c.vid in ucmask.index and
c not in unknownVars)]
# Merge clean options into variable table
mergeCleanFunctions(vartable, tytable, clean, typeClean, globalClean)
# Check where we can that the
# vartable contains valid rules
def checkLengths(col1, col2, row):
val1 = row[col1]
val2 = row[col2]
isna1 = pd.isna(val1)
isna2 = pd.isna(val2)
# ugh. if the value is a sequence, isna
# will return a sequence of bools
if not isinstance(isna1, bool): isna1 = False
if not isinstance(isna2, bool): isna2 = False
if isna1 and isna2:
return
if isna1 or isna2 or (len(val1) != len(val2)):
raise ValueError('Columns don\'t match [len({}) != '
'len({})]: {}'.format(val1, val2, row.name))
checkRecoding = ft.partial(checkLengths, 'RawLevels', 'NewLevels')
checkParentValues = ft.partial(checkLengths, 'ParentValues', 'ChildValues')
vartable.apply(checkRecoding, axis=1)
vartable.apply(checkParentValues, axis=1)
return vartable, unknownVars, uncleanVars
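# Illustrative usage (a sketch; the VIDs and rules are hypothetical, and
# follow the comma-separated string formats described in the docstring):
#
#     # Replace the values 555 and 444 with NA for variable 123, and recode
#     # the levels of variable 456, overriding anything specified in the
#     # variable/data coding tables:
#     vartable, unknown, unclean = loadVariableTable(
#         fileinfo,
#         naValues={123 : '555,444'},
#         recoding={456 : ('1,2,3', '3,2,1')})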
def loadTableBases() -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Loads the UK Biobank variable and data coding schema files.
This function is called by :func:`loadVariableTable`. It loads the UK
Biobank variable and data coding schema files (available from the UK
Biobank data showcase web site), and returns the information contained
within as two ``pandas.Dataframe`` objects. These dataframes are then
used as bases for the ``funpack`` variable table.
Information in the base tables is loaded from the following files:
- ``field.txt``: A list of all UKB variables
- ``encoding.txt``: A list of all UKB data codings
- ``type.txt``: A list of ``vid : type`` mappings for certain
variables, where ``type`` is the name of a
``numpy`` data type (e.g. ``float32``).
:returns: A tuple containing:
- a ``pandas.DataFrame`` to be used as the base for the
variable table
- a ``pandas.DataFrame`` to be used as the base for the
datacoding table
"""
# Here we load these files, both obtained
# from the UK Biobank showcase website:
#
# - field.txt - describes all UK biobank variables
# - encoding.txt - describes all data codings
#
# And we also load type.txt, which contains
# the internal type to use for some variables
# This dict contains all possible combinations
# of (value_type, base_type) from field.txt
typecodes = {
(11, 0) : util.CTYPES.integer,
(31, 0) : util.CTYPES.continuous,
(21, 11) : util.CTYPES.categorical_single,
(21, 41) : util.CTYPES.categorical_single_non_numeric,
(22, 11) : util.CTYPES.categorical_multiple,
(22, 41) : util.CTYPES.categorical_multiple_non_numeric,
(61, 0) : util.CTYPES.time,
(51, 0) : util.CTYPES.date,
(41, 0) : util.CTYPES.text,
(101, 0) : util.CTYPES.compound,
}
# We need pandas >=0.24 to support enums here
def settype(valtype, basetype):
return typecodes[valtype, basetype]
datadir = op.join(op.dirname(__file__), 'schema')
fields = schema.loadFieldProperties()
encodings = schema.loadEncodingDictionaries()
types = pd.read_csv(op.join(datadir, 'type.txt'),
delimiter='\t',
index_col=0,
converters={'Type' : convert_dtype})
dcbase = pd.DataFrame({'ID' : encodings['encoding_id']}).set_index('ID')
varbase = pd.DataFrame({
'ID' : fields['field_id'].astype(np.uint64),
'Type' : fields['value_type'].combine(fields['base_type'],
settype),
'Description' : fields['title'],
'DataCoding' : fields['encoding_id'],
'Instancing' : fields['instance_id'],
}).set_index('ID')
types.rename({'Type' : 'InternalType'}, axis=1, inplace=True)
varbase = pd.concat((varbase, types), axis=1, join='outer')
return varbase, dcbase
def mergeTableFiles(
base : pd.DataFrame,
fnames : List[str],
what : str,
dtypes : Dict[str, Type],
converters : Dict[str, Callable],
columns : List[str]
) -> pd.DataFrame:
"""Load and merge one or more table files.
This function is called by :func:`loadVariableTable` to load the variable,
data coding, and type table files.
The variable and datacoding tables can be loaded from multiple files, with
each file containing part of the full table. All provided files are merged
into one table. The final table for a given set of files is the outer join
on the index column (assumed to be the first column in each file), where
non-na values in overlapping columns from later files will overwrite the
values in earlier files.
:arg base: Table containing base information - used for the variable
and datacoding tables (see :func:`loadTableBases`).
:arg fnames: List of files to load. If ``None``, the ``base`` table
is returned (or an empty table if a ``base`` is not
given).
:arg what: Name of the table files being loaded - used solely for log
messages
:arg dtypes: Dict containing ``{column : datatype}`` mappings
:arg converters: Dict containing ``{column : convertfunc}`` mappings
:arg columns: Expected column names
"""
idcol = columns[0]
columns = columns[1:]
if fnames is None:
fnames = []
fnames = [f for f in fnames if f is not None]
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=pd.errors.ParserWarning)
for f in fnames:
log.debug('Loading %s table from %s', what, f)
table = pd.read_csv(f,
sep='\t',
index_col=0,
dtype=dtypes,
converters=converters)
if base is None:
base = table
continue
# Merge each file with an outer
# join, so we retain all IDs and
# columns defined across the
# entire set of files.
merged = base.merge(table,
how='outer',
on=idcol,
sort=True,
suffixes=('_x', '_y'))
# Now we merge overlapping columns -
# non-na values in later files take
# precedence.
for c in [c[:-2] for c in merged.columns if c.endswith('_x')]:
bname = c + '_x'
tname = c + '_y'
merged[c] = merged[bname]
notna = merged[tname].notna()
merged.loc[notna, c] = merged.loc[notna, tname]
merged = merged.drop(columns=[bname, tname])
base = merged
# no base, and no files
if base is None:
base = pd.DataFrame()
# error if we have any
# unrecognised columns
for col in base.columns:
if col not in columns:
raise ValueError('Unrecognised column in table file {} - '
'should be {}, but file contained {}.'.format(
fnames, columns, base.columns))
# in-fill any columns that
# were not provided
for col in columns:
if col not in base.columns:
base[col] = pd.Series(dtype=np.float64)
return base
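# The overlapping-column merge performed above behaves like this minimal
# stand-alone pandas sketch, where non-na values from a later file take
# precedence over values from an earlier one:
#
#     base   = pd.DataFrame({'ID' : [1, 2], 'A' : ['x', None]}).set_index('ID')
#     later  = pd.DataFrame({'ID' : [2, 3], 'A' : ['y', 'z']}).set_index('ID')
#     merged = base.merge(later, how='outer', on='ID', suffixes=('_x', '_y'))
#     notna  = merged['A_y'].notna()
#     merged['A'] = merged['A_x']
#     merged.loc[notna, 'A'] = merged.loc[notna, 'A_y']
#     # ID 1 keeps 'x', ID 2 is overwritten with 'y', ID 3 gains 'z'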
def sanitiseVariableTable(
vartable : pd.DataFrame,
cols : Sequence[datatable.Column],
dropAbsent : bool
) -> List[datatable.Column]:
"""Ensures that the variable table contains an entry for every
variable in the input data.
Called by :func:`loadVariableTable`.
:arg vartable: ``pandas.DataFrame`` containing the variable table.
:arg cols: Sequence of :class:`.Column` objects representing
the columns in the input data.
:arg dropAbsent: If ``True``, entries in the table for variables which are
not in ``cols`` will be removed.
:return: A list of unknown :class:`.Column` objects, i.e.
representing variables which were not in the variable
table.
"""
unknownVars = []
# Make sure a placeholder entry is
# present for any variables which are
# not in the variable table, but which
# are in the data file(s).
for col in cols:
vid = col.vid
name = col.name
if vid in vartable.index:
continue
unknownVars.append(col)
addNewVariable(vartable, vid, name)
# And the inverse - we can drop any
# variables from the variable table
# that are not in the data.
if dropAbsent:
vids = [c.vid for c in cols]
vartable.drop([v for v in vartable.index if v not in vids],
inplace=True)
return unknownVars
def mergeIntoVariableTable(
vartable : pd.DataFrame,
cols : Sequence[str],
mapping : Union[str, Dict[int, Any]]):
"""Merge data from ``mapping`` into the variable table.
Called by :func:`loadVariableTable`.
:arg vartable: The variable table
:arg cols: Names of columns in the variable table
:arg mapping: Dict of ``{vid : values}`` mappings containing the
data to copy in.
"""
onecol = isinstance(cols, str)
if onecol:
cols = [cols]
# Ignore any variables that
# are not in variable table
vids = list(mapping.keys())
vin = pd.Series(vids).isin(vartable.index)
vids = [v for i, v in enumerate(vids) if vin[i]]
for vid in vids:
vals = mapping[vid]
if onecol:
vals = [vals]
for col, val in zip(cols, vals):
vartable.at[vid, col] = val
def mergeDataCodingTable(
vartable : pd.DataFrame,
dctable : pd.DataFrame):
"""Merges information from the data coding table into the variable
table.
Called by :func:`loadVariableTable`.
:arg vartable: The variable table.
:arg dctable: The data coding table.
"""
with_datacoding = vartable['DataCoding'].notna()
for field in ['NAValues', 'RawLevels', 'NewLevels']:
mask = vartable[field].isna() & with_datacoding
newvals = vartable.loc[mask].merge(dctable,
left_on='DataCoding',
right_index=True,
suffixes=('_v', '_dc'),
copy=False)['{}_dc'.format(field)]
vartable.loc[mask, field] = newvals
def mergeCleanFunctions(
vartable : pd.DataFrame,
tytable : pd.DataFrame,
clean : Dict[int, str],
typeClean : Dict[str, str],
globalClean : str):
"""Merges custom clean functions into the variable table.
Called by :func:`loadVariableTable`.
:arg vartable: The variable table.
:arg tytable: The type table
:arg clean: Dictionary of ``{vid : expr}`` mappings containing
cleaning functions to apply - this will override
any cleaning specified in the variable file, and
any cleaning specified in ``typeClean``.
:arg typeClean: Dictionary of ``{type : expr}`` mappings containing
cleaning functions to apply to all variables of a
specific type - this will override any cleaning
specified in the type file.
:arg globalClean: Expression containing cleaning functions to
apply to every variable - this will be performed after
variable-specific cleaning in the variable table,
or specified via ``clean`` or ``typeClean``.
"""
# Merge type-specific Clean
# from the type table into
# the variable table.
for vid in vartable.index:
if vid == 0:
continue
vtype = vartable.loc[vid, 'Type']
pp = vartable.loc[vid, 'Clean']
# Override with typeClean if necessary
if typeClean is not None and vtype in typeClean:
tpp = convert_Process('cleaner', typeClean[vtype])
elif vtype in tytable.index:
tpp = collections.OrderedDict((tytable.loc[vtype, 'Clean']))
else:
continue
# type cleaning is applied after
# variable-specific cleaning
if pd.isnull(pp): vartable.loc[[vid], 'Clean'] = [tpp]
else: vartable.loc[ vid, 'Clean'].update(tpp)
# Override cleaning with expressions
# that have been passed on the command line
if clean is not None:
clean = {vid : convert_Process('cleaner', expr)
for vid, expr in clean.items()}
mergeIntoVariableTable(vartable, 'Clean', clean)
# Add global cleaning to all variables
if globalClean is not None:
for vid in vartable.index:
if vid == 0:
continue
pp = vartable.loc[vid, 'Clean']
gpp = convert_Process('cleaner', globalClean)
# global cleaning is applied
# after all other cleaning
if pd.isnull(pp): vartable.loc[[vid], 'Clean'] = [gpp]
else: vartable.loc[ vid, 'Clean'].update(gpp)
def addNewVariable(
vartable : pd.DataFrame,
vid : int,
name : str,
dtype : np.dtype = None,
instancing : int = None):
"""Add a new row to the variable table.
The ``instancing`` argument defines the meaning of the
:attr:`.Column.visit` field for columns associated with this variable.
The default value is ``2``, meaning that this variable may be associated
with columns corresponding to measurements acquired at different UK
Biobank assessments and imaging visits. See
https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi?id=9 and
https://biobank.ctsu.ox.ac.uk/crystal/schema.cgi?id=10 for more details.
.. note:: If an entry for the specified ``vid`` already exists in
``vartable``, the ``name``, ``dtype`` and ``instancing``
arguments are ignored and the existing information in
``vartable`` will take precedence.
:arg vartable: The variable table
:arg vid: Integer variable ID
:arg name: Variable name - used as the description
:arg dtype: ``numpy`` data type. If ``None``, the variable type
is set to :attr:`.util.CTYPES.unknown`.
:arg instancing: Instancing code for the new variable - defaults to ``2``
"""
# If an entry already exists
# in the variable table, it
# takes precedence
if vid in vartable.index:
dtype = vartable.at[vid, 'Type']
name = vartable.at[vid, 'Description']
instancing = vartable.at[vid, 'Instancing']
else:
# set dtype to something which
# will cause the conditionals
# to fall through
if dtype is None: dtype = object
else: dtype = dtype.type
# Assume that new variables
# are associated with visits
if instancing is None:
instancing = 2
if issubclass(dtype, int): dtype = util.CTYPES.integer
elif issubclass(dtype, float): dtype = util.CTYPES.continuous
else: dtype = util.CTYPES.unknown
vartable.loc[vid, 'Description'] = name
vartable.loc[vid, 'Type'] = dtype
vartable.loc[vid, 'Instancing'] = instancing
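# Illustrative usage (a sketch; VID 99999 is hypothetical). Note that, of
# the numpy scalar types, only np.float64 is a sub-class of the built-in
# float, so e.g. a float32 dtype would fall through to CTYPES.unknown:
#
#     addNewVariable(vartable, 99999, 'my_custom_variable',
#                    dtype=np.dtype(np.float64))
#     # -> adds a row with Type == util.CTYPES.continuous, Instancing == 2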
def loadProcessingTable(
procfile : str = None,
skipProcessing : bool = False,
prependProcess : Sequence[Tuple[List[int], str]] = None,
appendProcess : Sequence[Tuple[List[int], str]] = None,
cattable : pd.DataFrame = None,
**kwargs # pylint: disable=unused-argument
) -> pd.DataFrame:
"""Loads the processing table from the given file.
:arg procfile: Path to the processing table file.
:arg skipProcessing: If ``True``, the processing table is not loaded from
``procfile``. The ``prependProcess`` and
``appendProcess`` arguments are still applied.
:arg prependProcess: Sequence of ``(varids, procstr)`` mappings specifying
processes to prepend to the beginning of the
processing table.
:arg appendProcess: Sequence of ``(varids, procstr)`` mappings specifying
processes to append to the end of the processing
table.
:arg cattable: ``pandas.DataFrame`` containing variable categories.
If not provided, any processing rules which refer to
categories cannot be parsed, and will
result in an error.
All other keyword arguments are ignored.
"""
if prependProcess is None: prependProcess = []
if appendProcess is None: appendProcess = []
converters = dict(PROCTABLE_CONVERTERS)
converters['Variable'] = ft.partial(convert_Process_Variable,
cattable=cattable)
if (procfile is not None) and (not skipProcessing):
log.debug('Loading processing table from %s', procfile)
proctable = pd.read_csv(procfile,
sep='\t',
index_col=False,
skip_blank_lines=True,
comment='#',
converters=converters)
else:
proctable = pd.DataFrame(columns=PROCTABLE_COLUMNS)
# prepend/append custom
# processes to the table
proctable.index += len(prependProcess)
for i, (vids, procs) in enumerate(prependProcess):
vids = convert_Process_Variable(vids, cattable)
procs = convert_Process('processor', procs)
proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
proctable.at[i, 'Process'] = procs
for i, (vids, procs) in enumerate(appendProcess, len(proctable.index)):
vids = convert_Process_Variable(vids, cattable)
procs = convert_Process('processor', procs)
proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
proctable.at[i, 'Process'] = procs
proctable.sort_index(inplace=True)
return proctable
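# Illustrative usage (a sketch; 'removeIfSparse' names one of the processing
# functions distributed with funpack, and the file name and arguments shown
# are hypothetical):
#
#     proctable = loadProcessingTable(
#         'processing.tsv',
#         appendProcess=[('1:5', 'removeIfSparse(minpres=100)')])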
def loadCategoryTable(catfile : str = None) -> pd.DataFrame:
"""Loads the category table from the given file.
:arg catfile: Path to the category file.
"""
if catfile is not None:
log.debug('Loading category table from %s', catfile)
cattable = pd.read_csv(catfile,
sep='\t',
index_col=0,
dtype=CATTABLE_DTYPES,
converters=CATTABLE_CONVERTERS)
else:
cattable = pd.DataFrame(columns=CATTABLE_COLUMNS[1:])
cattable.index.name = CATTABLE_COLUMNS[0]
return cattable
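# A category table file is a tab-separated file along these lines (an
# illustrative sketch - the IDs, labels, and variable lists are
# hypothetical):
#
#     ID    Category      Variables
#     1     ages          21003,21022
#     2     early life    52,129:130,1677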
def categoryVariables(
cattable : pd.DataFrame,
categories : Sequence[Union[int, str]]
) -> List[int]:
"""Returns a list of variable IDs from ``cattable`` which correspond to
the strings in ``categories``.
:arg cattable: The category table.
:arg categories: Sequence of integer category IDs or label sub-strings
specifying the categories to return.
:returns: A list of variable IDs.
"""
allvars = []
with warnings.catch_warnings():
warnings.simplefilter('ignore')
for cat in categories:
catpat = re.compile('({})'.format(cat), re.IGNORECASE)
idmask = cattable.index.isin([cat])
lblmask = cattable['Category'].str.contains(catpat)
catvars = cattable.loc[idmask | lblmask, 'Variables']
if len(catvars) == 0:
continue
for c in catvars.iloc[0]:
if c not in allvars:
allvars.append(c)
return allvars
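# Illustrative usage (a sketch, using the hypothetical category table
# sketched after loadCategoryTable above):
#
#     categoryVariables(cattable, [1])             # -> [21003, 21022]
#     categoryVariables(cattable, ['early life'])  # -> [52, 129, 130, 1677]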
def variableCategories(
cattable : pd.DataFrame,
vids : Sequence[int]
) -> Dict[int, List[str]]:
"""Identify the categories for each variable in vids.
:arg cattable: The category table
:arg vids: Sequence of variable/datafield IDs
:returns: A dict of ``{vid : [category]}`` mappings
"""
categories = collections.defaultdict(list)
for v in vids:
for i in cattable.index:
if v in cattable.loc[i, 'Variables']:
categories[v].append(cattable.loc[i, 'Category'])
return categories
def addImplicitCategories(
cattable : pd.DataFrame,
unknown : Sequence[datatable.Column],
uncat : Sequence[datatable.Column]):
"""Adds some implicit/automatic categories to the category table.
The following implicit categories are added:
- ``unknown``: Variables which are not present in the variable
table - this comprises non-UKB variables, or new UKB
variables which are not described by the internal
FUNPACK variable information in ``funpack/schema/``.
- ``uncategorised``: Variables which are not present in any other
category.
:arg cattable: The category table.
:arg unknown: Sequence of :class:`.Column` objects representing
variables to add to an "unknown" category.
:arg uncat: Sequence of :class:`.Column` objects representing
variables to add to an "uncategorised" category.
"""
for cols, label in zip((unknown, uncat), ('unknown', 'uncategorised')):
if cols is None:
continue
vids = list(sorted({c.vid for c in cols}))
# label already in table?
umask = cattable['Category'] == label
if np.any(umask):
idx = np.where(umask)[0][0]
idx = cattable.index[idx]
vids = cattable.loc[idx, 'Variables'] + vids
else:
idx = IMPLICIT_CATEGORIES[label]
cattable.loc[idx, 'Category'] = label
cattable.loc[idx, 'Variables'] = list(vids)
def columnTypes(
vartable : pd.DataFrame,
columns : Sequence[datatable.Column]
) -> Tuple[List[util.CTYPES], Dict[str, np.dtype]]:
"""Retrieves the type of each column in ``cols`` as listed in ``vartable``.
Also identifies a suitable internal data type to use for each column where
possible.
:arg vartable: The variable table.
:arg columns: List of :class:`.Column` objects.
:returns: A tuple containing:
- A list containing the type of each column in ``columns`` -
an identifier from the :attr:`.util.CTYPES` enum.
Columns corresponding to a variable which is not in
the variable table are given a type of ``None``.
- A dict of ``{ column_name : dtype }`` mappings containing
a suitable internal data type to use for some columns.
"""
vttypes = []
dtypes = {}
for col in columns:
vid = col.vid
name = col.name
if vid not in vartable.index:
vttypes.append(None)
continue
vttype = vartable.loc[vid, 'Type']
dtype = vartable.loc[vid, 'InternalType']
if pd.isna([dtype])[0]:
dtype = util.DATA_TYPES.get(vttype, None)
vttypes.append(vttype)
if dtype is not None:
dtypes[name] = dtype
return vttypes, dtypes
def identifyUncategorisedVariables(
fileinfo : finfo.FileInfo,
cattable : pd.DataFrame
) -> List[datatable.Column]:
"""Called by :func:`loadTables`. Identifies all variables which are in the
data file(s), but which are uncategorised (not present in any categories
in the category table).
Such variables might have been overlooked, so the user may need to be
warned about them.
:arg fileinfo: :class:`.FileInfo` object.
:arg cattable: Category table
:returns: A list of :class:`.Column` objects associated with
variables that are uncategorised.
"""
def isCategorised(col):
def inCategory(catvars):
return col.vid in catvars
return cattable['Variables'].apply(inCategory).any()
uncategorised = []
for datafile in fileinfo.datafiles:
cols = fileinfo.columns(datafile)
cols = [c for c in cols if (c.vid != 0) and (not isCategorised(c))]
uncategorised.extend(cols)
return uncategorised