Source code for h5flow.data.lib

import h5py
import numpy as np
import numpy.ma as ma
import numpy.lib.recfunctions as rfn
import logging

ref_region_dtype = np.dtype([('start','i4'), ('stop','i4')])

[docs]def print_ref(grp):
    '''
        Print out all references in file (or group)
    '''
    l = list()
    grp.visititems(lambda n,d: l.append((n,d))\
        if n.endswith('/ref') and isinstance(d,h5py.Dataset)
        else None
        )
    if not len(l):
        return
    max_length = max([len(n) for n,d in l])
    for n,d in l:
        print(n+' '*(max_length-len(n))+' '+str(d))

[docs]def print_data(grp):
    '''
        Print out all datasets in file (or group)

    '''
    l = list()
    grp.visititems(lambda n,d: l.append((n,d))\
        if n.endswith('/data') and isinstance(d,h5py.Dataset)
        else None
        )
    if not len(l):
        return
    max_length = max([len(n) for n,d in l])
    for n,d in l:
        print(n+' '*(max_length-len(n))+' '+str(d))

[docs]def print_attr(grp):
    '''
        Print out all attributes in file (or group)

    '''
    l = list()
    grp.visititems(lambda n,d: l.append((n,d.attrs))\
        if len(d.attrs) and not (n.endswith('/ref') or n.endswith('/ref_region'))\
        else None
        )
    if not len(l):
        return
    max_length = max([len(k) for n,d in l for k in d])
    for n,d in l:
        print(n)
        for k,v in d.items():
            print('\t'+k+':'+' '*(max_length-len(k))+' '+str(v))

[docs]def dereference_chain(sel, refs, data=None, regions=None, mask=None, ref_directions=None, indices_only=False):
    '''
        Load a "chain" of references. Allows traversal of multiple layers of references,
        e.g. for three datasets ``A``, ``B``, and ``C`` linked ``A->B->C``. One
        can use a selection in ``A`` and load the ``C`` data associated with it.

        Example usage::

            sel = slice(0,100)
            refs = [f['A/ref/B/ref'], f['C/ref/B/ref']]
            ref_dirs = [(0,1), (1,0)]
            data = f['C/data']
            regions = [f['A/ref/B/ref_region'], f['B/ref/C/ref_region']]
            mask = np.r_[sel] > 50

            c_data = dereference_chain(sel, refs, data, regions=regions, mask=mask, ref_directions=ref_dirs)
            c_data.shape # (100, max_a2b_assoc, max_b2c_assoc)

        :param sel: iterable of indices, a slice, or an integer, see ``sel`` argument in ``dereference``

        :param refs: a list of reference datasets to load, in order, see ``ref`` argument in ``dereference``

        :param data: a dataset to load dereferenced data from, optional if ``indices_only=True``

        :param regions: lookup table into ``refs`` for each selection, see ``region`` argument in ``dereference``

        :param mask: a boolean mask into the first selection, true will not load the entry

        :param ref_directions: intepretation of reference datasets, see ``ref_direction`` argument in ``dereference``

        :param indices_only: flag to skip loading the data and instead just return indices into the final dataset

    '''
    sel = np.r_[sel]
    mask = np.zeros_like(sel, dtype=bool) | (mask if mask is not None else False)
    sel = ma.array(sel, mask=mask, shrink=False)
    shape = (len(sel),)
    dref = None

    nsteps = len(refs)
    for i in range(nsteps):
        dset = data if i == nsteps-1 else None
        ref = refs[i]
        ref_dir = ref_directions[i] if ref_directions else (0,1) # default to (0,1)
        reg = regions[i] if regions else None

        dref = dereference(sel.data.ravel(), ref,
            data=dset, region=reg,
            mask=mask.ravel(), ref_direction=ref_dir,
            indices_only=True if i != nsteps-1 else indices_only)
        shape += dref.shape[-1:]

        mask = np.expand_dims(mask, axis=-1) | \
            (rfn.structured_to_unstructured(dref.mask).any(axis=-1).reshape(shape) \
            if dref.mask.dtype.kind == 'V' else dref.mask.reshape(shape))
        dref = ma.array(dref.data.reshape(shape), mask=mask, shrink=False)

        if i != nsteps-1:
            sel = dref

    return dref


[docs]def dereference(sel, ref, data=None, region=None, mask=None, ref_direction=(0,1), indices_only=False, as_masked=True):
    '''
        Load ``data`` referred to by ``ref`` that corresponds to the desired
        positions specified in ``sel``.

        :param sel: iterable of indices, an index, or a ``slice`` to match against ``ref[:,ref_direction[0]]``. Return value will have same first dimension as ``sel``, e.g. ``dereference(slice(100), ref, data).shape[0] == 100``

        :param ref: a shape (N,2) ``h5py.Dataset`` or array of pairs of indices linking ``sel`` and ``data``

        :param data: a ``h5py.Dataset`` or array to load dereferenced data from, can be omitted if ``indices_only==True``

        :param region: a 1D ``h5py.Dataset`` or array with a structured array type of [('start','i8'), ('stop','i8')]; 'start' defines the earliest index within the ``ref`` dataset for each value in ``sel``, and 'stop' defines the last index + 1 within the ``ref`` dataset (optional). If a ``h5py.Dataset`` is used, the ``sel`` spec will be used to load data from the dataset (i.e. ``region[sel]``), otherwise ``len(sel) == len(region)`` and a 1:1 correspondence is assumed

        :param mask: mask off specific items in selection (boolean, True == don't dereference selection), len(mask) == len(np.r_[sel])

        :param ref_direction: defines how to interpret second dimension of ``ref``. ``ref[:,ref_direction[0]]`` are matched against items in ``sel``, and ``ref[:,ref_direction[1]]`` are indices into the ``data`` array (``default=(0,1)``). So for a simple example: ``dereference([0,1,2], [[1,0], [2,1]], ['A','B','C','D'], ref_direction=(0,1))`` returns an array equivalent to ``[[],['A'],['B']]`` and ``dereference([0,1,2], [[1,0], [2,1]], ['A','B','C','D'], ref_direction=(1,0))`` returns an array equivalent to ``[['B'],['C'],[]]``

        :param indices_only: if ``True``, only returns the indices into ``data``, does not fetch data from ``data``

        :returns: ``numpy`` masked array (or if ``as_masked=False`` a ``list``) of length equivalent to ``sel``
    '''

    # set up selection
    sel_mask = mask
    sel_idcs = np.r_[sel][~sel_mask] if sel_mask is not None else np.r_[sel]
    n_elem = len(sel_idcs) if sel_mask is None else len(sel_mask)

    return_dtype = data.dtype if not indices_only else ref.dtype

    if not len(sel_idcs) and n_elem:
        # special case for if there is nothing selected in the mask
        if as_masked:
            return ma.array(np.empty((n_elem,1), dtype=return_dtype), mask=True, shrink=False)
        else:
            return [np.empty(0, data.dtype) for _ in range(n_elem)]
    elif not len(sel_idcs):
        if as_masked:
            return ma.array(np.empty((0,1), dtype=return_dtype), mask=True, shrink=False)
        else:
            return []

    # load fast region lookup
    if region is not None:
        if isinstance(region, h5py.Dataset):
            if isinstance(sel, slice):
                region = region[sel] # load parent reference region information
            else:
                region_offset = np.min(sel_idcs)
                region_sel = slice(region_offset, int(np.max(sel_idcs)+1))
                region = region[region_sel][sel_idcs - region_offset]
        else:
            region = region[sel_idcs]

    # load relevant references
    region_valid = region['start'] != region['stop'] if region is not None else None
    if not region is None and np.count_nonzero(region_valid) == 0:
        # special case for if there are no valid references
        if as_masked:
            return ma.array(np.empty((n_elem,1), dtype=return_dtype), mask=True)
        else:
            return [np.empty(0, return_dtype) for _ in range(n_elem)]
    ref_offset = np.min(region[region_valid]['start']) if region is not None else 0
    ref_sel = slice(ref_offset, int(np.max(region[region_valid]['stop']))) if region is not None else slice(ref_offset,len(ref))
    ref = ref[ref_sel]

    # if no valid references, return
    if len(ref) == 0:
        if as_masked:
            return ma.array(np.empty((n_elem,1), dtype=return_dtype), mask=True)
        else:
            return [np.empty(0, return_dtype) for _ in range(n_elem)]

    # load relevant data
    dset_offset = np.min(ref[:,ref_direction[1]])
    dset_sel = slice(dset_offset, int(np.max(ref[:,ref_direction[1]])+1))
    dset = data[dset_sel] if not indices_only else None # load child dataset region

    # create a region array, if one was not given
    if region is None:
        region = np.zeros(len(sel_idcs), dtype=ref_region_dtype)
        region['start'] = ref_sel.start
        region['stop'] = ref_sel.stop

    if not as_masked:
        # dump into list using subregion masks
        if indices_only:
            indices = [
                    ref[st:sp,ref_direction[1]][ (ref[st:sp,ref_direction[0]] == i) ]
                    for i,st,sp in zip(sel_idcs, region['start']-ref_offset, region['stop']-ref_offset)
                ]
            return indices
        else:
            data = [
                    dset[ref[st:sp,ref_direction[1]][ (ref[st:sp,ref_direction[0]] == i) ] - dset_offset]
                    for i,st,sp in zip(sel_idcs, region['start']-ref_offset, region['stop']-ref_offset)
                ]
            return data


    # the rest of this is index manipulation to convert from sel -> ref -> data
    # first using only the unique references and then casting it back into the
    # original selection

    # first get mapping from unique selection back into the selection
    uniq_sel, uniq_inv = np.unique(sel_idcs, return_inverse=True)

    # only use references that are relevant to the selection
    ref_mask = np.isin(ref[:,ref_direction[0]], uniq_sel)
    if not np.any(ref_mask):
        # special case if no valid references for selection
        return ma.array(np.empty((n_elem,1), dtype=return_dtype), mask=True, shrink=False)

    # get the number of references per parent and rearrange so that references are in ordered by parent
    uniq, counts = np.unique(ref[ref_mask,ref_direction[0]], return_counts=True)
    reordering = np.argsort(uniq)
    uniq, counts = uniq[reordering], counts[reordering]
    max_counts = np.max(counts)

    # now, we'll fill a subarray consisting of unique elements that were requested (shape: (len(uniq_sel), max_counts) )
    # get a mapping from the unique selection into the unique reference parents
    _,uniq_sel_idcs,uniq2uniq_sel_idcs = np.intersect1d(uniq_sel, uniq, assume_unique=False, return_indices=True)

    # set up subarrays for unique selection
    shape = (len(uniq_sel), max_counts)
    condensed_data = np.zeros(shape, dtype=return_dtype)
    condensed_mask = np.zeros(shape, dtype=bool)

    # block off and prepare slots for unique selection
    condensed_mask[uniq_sel_idcs] = np.arange(condensed_data.shape[1]).reshape(1,-1) < counts[uniq2uniq_sel_idcs].reshape(-1,1)
    view_dtype = np.dtype([('ref0',ref.dtype),('ref1',ref.dtype)])
    sort_ref = np.argsort(ref[ref_mask].view(view_dtype), axis=0,
        order=[view_dtype.names[ref_direction[0]], view_dtype.names[ref_direction[1]]]
        ) # arrange by parent (then by child)
    # and fill slots
    if indices_only:
        np.place(condensed_data, mask=condensed_mask, vals=ref[ref_mask,ref_direction[1]][sort_ref])
    else:
        np.place(condensed_data, mask=condensed_mask, vals=dset[ref[ref_mask,ref_direction[1]][sort_ref] - dset_offset])

    # then cast unique selections into full set of elements that were requested (shape: (len(sel), max_counts) )
    mask = np.zeros((len(sel_mask), max_counts), dtype=bool) if sel_mask is not None \
        else condensed_mask[uniq_inv]
    data = np.zeros((len(sel_mask), max_counts), dtype=condensed_data.dtype) if sel_mask is not None \
        else condensed_data[uniq_inv]
    if sel_mask is not None:
        mask[~sel_mask] = condensed_mask[uniq_inv]
        data[~sel_mask] = condensed_data[uniq_inv]
    return ma.array(data, mask=~mask, shrink=False)