# Source code for qnd.pdbf

"""Pure python QnD wrapper for PDB files.

Information on the PDB file format is somewhat hard to come by.  
Try the SILO github repo https://github.com/LLNL/Silo
esp. src/pdb/ and src/score/ dirs.
Also the end of the QND file pdbparse.py has a long comment with a detailed description.

Note that yorick-generated PDB files are version II, not version III.  Also,
yorick pointers are written (by default) in a yorick-specific format.  Since
yorick readability has held back many application codes (the LEOS library
and LLNL rad-hydro codes used for ICF design), most of the dwindling legacy
PDB files are version II.  Hence, this implementation focuses on the
version III format, and the version I format is supported only for reading.

Furthermore, this implementation only supports IEEE 754 4 and 8 byte
floating point formats, since those are the only unambiguous floating
point formats supported by numpy.  Fortunately, this covers all modern
PDB files likely to show up in practice, so we have no significant
incentive to do the work required to support exotic formats.

"""
from __future__ import absolute_import

import sys
import weakref
from numbers import Integral
from collections import OrderedDict
from warnings import warn

from numpy import (zeros, arange, fromfile, prod, array, ascontiguousarray,
                   dtype as npdtype)

from .frontend import QGroup, QnDList
from .generic import opener
from .pdbparse import parser, PDBChart
from .pdbdump import flusher_for, initializer_for
from .utils import leading_args

__all__ = ['openpdb']

# True when running under Python 2; this module supports both 2 and 3.
PY2 = sys.version_info < (3,)
if PY2:
    # Use the lazy iterator variant of range under python 2.
    range = xrange


def openpdb(filename, mode='r', auto=1, **kwargs):
    """Open PDB file or family, and wrap it in a QnD QGroup.

    Parameters
    ----------
    filename : str
       Name of file to open.  See notes below for file family.
    mode : str
       One of 'r' (default, read-only), 'r+' (read-write, must exist),
       'a' (read-write, create if does not exist), 'w' (create, clobber
       if exists), 'w-' (create, fail if exists).
    auto : int
       The initial state of auto-read mode.  If the QGroup handle
       returned by openpdb is `f`, then ``f.varname`` reads an array
       variable, but not a subgroup when auto=1, the default.  With
       auto=0, the variable reference reads neither (permitting later
       partial reads in the case of array variables).  With auto=2, a
       variable reference recursively reads subgroups, bringing a whole
       tree into memory.
    **kwargs
       Other keywords.  The `maxsize` keyword sets the size of files in
       a family generated in ``recording==1`` mode; a new file will
       begin when the first item in a new record would begin beyond
       `maxsize`.  The default maxsize is 128 MiB (134 MB).  The `order`
       keyword can be '>' or '<' to force the byte order in a new file;
       by default the byte order is the native order.  File families
       always have the same order for every file, so `order` is ignored
       if any files exist.

    Returns
    -------
    f : QGroup
       A file handle implementing the QnD interface.

    Notes
    -----
    The `filename` may be an iterable, one string per file in order.
    The sequence may extend beyond the files which actually exist for
    'r+', 'a', 'w', or 'w-' modes.

    Alternatively `filename` specifies a family if it contains shell
    globbing wildcard characters.  Existing matching files are sorted
    first by length, then alphabetically (ensuring that 'file100' comes
    after 'file99', for example).  If there is only a single wildcard
    group, it also serves to define a sequence of future family names
    beyond those currently existing for 'r+', 'a', 'w', or 'w-' modes.
    A '?' pattern is treated the same as a '[0-9]' pattern if all its
    matches are digits or if the pattern matches no existing files.
    Similarly, a '*' acts like the minimum number of all-digit matches,
    or three digits if there are no matches.

    """
    maxsize = kwargs.pop('maxsize', 134217728)
    order = kwargs.pop('order', None)
    if order:
        if order not in '<>':
            raise ValueError("order must be either > or <")
        # PDB encodes byte order as 1 for big-endian, 2 for little-endian.
        order = 1 if order == '>' else 2
    kwargs['nextaddr_mode'] = 1  # tell opener to initialize nextaddr to 0
    handle, n = opener(filename, mode, **kwargs)
    root = PDBGroup(handle, maxsize)
    for i in range(n):
        try:
            parser(handle, root, i)
        except IOError:
            # Something went terribly wrong.  If this is first file, we die.
            name = handle.filename(i)
            if not i:
                # BUG FIX: original message lacked the {} placeholder, so
                # the offending filename was never interpolated.
                raise IOError("Fatal errors opening PDB file {}"
                              "".format(name))
            # Fall back to the last good file and stop scanning the family;
            # without the break we would keep parsing files beyond the one
            # we just declared incompatible.
            handle.open(i-1)
            warn("file family stopped by incompatible {}".format(name))
            break
    if not n and order:
        root.chart.byteorder = order
    # If file was freshly created, setting initializer calls it.
    handle.callbacks(flusher_for(root), initializer_for(root))
    # If any files exist, parser has set nextaddr to the chart address
    # of the last existing file in the family.  If there are no record
    # variables, we let this stand.  However, if there are record variables,
    # and the family is writable, we set nextaddr to the zero address of
    # the next file beyond all existing files.  This causes any new records
    # to be placed in a new file, leaving all existing files in the
    # family undisturbed.
    mode = mode.lower()
    if ((mode.startswith('a') or mode.startswith('r+'))
            and handle.state[4] is not None and _has_records(root)):
        handle.declared(handle.zero_address(len(handle.state[2])), None, 0)
    return QGroup(root, auto=auto)
def _has_records(root):
    """Return True if any variable in the tree rooted at `root` has blocks.

    A leaf with ``islist() == 1`` is a record (block) variable.  Recurses
    into subgroups, but not into lists or objects of any type (groups
    containing a '__class__' member).
    """
    for name in root:
        item = root.items[name]
        if item.islist() == 1:
            return True
        # Recurse into groups, but not into lists or objects of any type.
        if item.isgroup() and '__class__' not in item and _has_records(item):
            return True
    return False


class PDBGroup(object):
    """A directory in a PDB file, or a whole file or file family.

    """
    def __init__(self, parent, maxsize=134217728):
        if not isinstance(parent, PDBGroup):
            # this is root group; parent is the low-level file handle
            self.root = weakref.ref(self)
            self.maxsize = maxsize
            self.maxblocks = 0
            self.handle = parent
            self.chart = PDBChart(self)
        else:
            # non-root groups share the root weakref of their parent
            self.root = parent.root
        self.items = OrderedDict()  # name --> PDBGroup, PDBLeaf, or QnDList
        self.attrs = None  # lazily created PDBAttrs, see attget/attset

    # QnD type predicates: a PDBGroup is a group, never a list or leaf.
    @staticmethod
    def isgroup():
        return 1

    @staticmethod
    def islist():
        return 0

    isleaf = islist

    def close(self):
        self.root().handle.close()

    def flush(self):
        self.root().handle.flush()

    def __len__(self):
        return len(self.items)

    def __iter__(self):
        return iter(self.items)

    def lookup(self, name):
        # Return the named item, wrapping groups that represent QLists.
        item = self.items.get(name)
        if isinstance(item, PDBGroup):
            item = QnDList.fromgroup(item)
        return item

    def declare(self, name, dtype, shape, unlim=None, addr=-1):
        """Declare a subgroup (dtype=dict), list (dtype=list), or leaf.

        Re-declaring an existing group as a group (or list as a list)
        returns the existing item; any other redeclaration raises
        KeyError.  `unlim` marks a record (block) variable; `addr` is
        the file address, -1 meaning "allocate at next free address".
        """
        current = self.items.get(name)
        if dtype == dict:
            if current is not None:
                if current.isgroup():
                    return current
                raise KeyError("already a non-group item {}".format(name))
            item = PDBGroup(self)
        elif dtype == list:
            if current is not None:
                if current.islist() == 2:
                    return current
                raise KeyError("already a non-list item {}".format(name))
            item = QnDList(PDBGroup(self), 1)
        else:
            if current is not None:
                raise KeyError("attempt to redeclare {}".format(name))
            if dtype is None and name == '_':
                # Assume we are creating a QList.
                dtype = npdtype('u1')
            elif isinstance(dtype, npdtype) and dtype.kind == 'S':
                # frontend never passes 'U' dtype
                # store strings as a trailing axis of single characters
                shape = shape + (dtype.itemsize,)
                dtype = npdtype('S1')
            item = PDBLeaf(self, addr, dtype, shape, unlim)
            if unlim:
                item = QnDList(item, None if hasattr(addr, '__iter__')
                               or addr != -1 else 1)
        self.items[name] = item
        return item

    # This is used in pdbparse._endparse to declare or check symbols
    # against declarations from previous files in a family.
    def _leaf_declare(self, name, dtype, shape, addr):
        item = self.items.get(name)
        unlim = isinstance(addr, list)  # list of addresses --> block variable
        if item is not None:
            tsa = None
            if not isinstance(item, PDBLeaf):
                # unwrap QnDList to its PDBLeaf; a PDBGroup is incompatible
                item = None if isinstance(item, PDBGroup) else item.parent()
            if item is not None:
                tsa = item.tsa
                if (unlim != isinstance(tsa[2], list)
                        or tsa[:2] != (dtype, shape)):
                    # type/shape/record-ness mismatch with earlier file
                    item = None
                elif unlim:
                    # same record variable: append this file's block addresses
                    tsa[2].extend(addr)
            if item is None:
                raise IOError("incompatible redeclaration of {}".format(name))
            return
        self.declare(name, dtype, shape, unlim, addr)

    def attget(self, vname):
        # Return (creating if needed) attribute dict for vname, or for
        # this group itself when vname is blank.
        item = self.lookup(vname) if vname else self
        if isinstance(item, QnDList):
            item = item.parent()
        attrs = item.attrs
        if attrs is None:
            item.attrs = attrs = PDBAttrs()
        return attrs

    def attset(self, vname, aname, dtype, shape, value):
        # Set attribute aname for variable vname (or for this group when
        # vname is blank), coercing value to the given dtype and shape.
        item = self.lookup(vname) if vname else self
        if isinstance(item, QnDList):
            item = item.parent()
        attrs = item.attrs
        if attrs is None:
            item.attrs = attrs = PDBAttrs()
        if value.dtype != dtype or value.shape != shape:
            v = zeros(shape, dtype)
            v[()] = value
            value = v
        attrs[aname] = value


class PDBLeaf(object):
    """An ndarray in a PDB file.

    (Eventual stretch goal is to implement None and zero-length arrays.)

    """
    def __init__(self, parent, addr, dtype, shape, unlim):
        if dtype is None or (shape and not all(shape)):
            raise NotImplementedError("None or zero length array")
        root = parent.root()
        if not isinstance(dtype, tuple):
            # Construct full data type: (dtype, stype, align, typename)
            dtype = (dtype,) + root.chart.find_or_create(dtype)
        self.parent = weakref.ref(parent)
        self.attrs = None
        if hasattr(addr, '__iter__'):
            # Explicit list of block addresses implies a record variable.
            unlim = 1
            if not isinstance(addr, list):
                addr = list(addr)
        elif addr == -1:
            # Allocate space at the next properly aligned file address.
            stype, align = dtype[1:3]
            handle = root.handle
            addr = _align(handle.next_address(), align)
            handle.declared(addr, stype, prod(shape) if shape else 1)
            if unlim:
                addr = [addr]
        elif unlim:
            addr = [int(addr)]
        else:
            addr = int(addr)
        # tsa = (full dtype tuple, shape, address or list of block addresses)
        self.tsa = dtype, shape, addr

    # QnD type predicates: a PDBLeaf is a leaf, never a group or list.
    @staticmethod
    def isleaf():
        return 1

    @staticmethod
    def isgroup():
        return 0

    islist = isgroup

    def root(self):
        return self.parent().root()

    def query(self):
        # return dtype, shape, sshape
        dtype, shape, addr = self.tsa
        if isinstance(addr, list):
            # Do this for consistency with treatment of h5py chunked data.
            shape = (len(addr),) + shape
        return dtype[0], shape, shape

    def read(self, args=()):
        """Read this array (or the part selected by index `args`)."""
        dtype, shape, addr = self.tsa
        dtype, stype, _, typename = dtype
        istext = typename == b'text'
        if isinstance(addr, list):
            # Record variable: first index selects the block.
            arg0 = args[0] if args else slice(None)
            args = args[1:]
            if not isinstance(arg0, Integral):
                arg0 = arange(len(addr))[arg0]
                if arg0.ndim == 1:
                    # Multiple blocks: read each and stack the results.
                    return array([self.read((a,) + args) for a in arg0],
                                 dtype)
                elif arg0.ndim:
                    raise TypeError("block variable leading index "
                                    "too complex")
            addr = addr[arg0]
        root = self.root()
        chart = root.chart
        # nopartial is None for ordinary types; for special (pointer-like)
        # types it tells whether partial reads are impossible.
        nopartial = chart.nopartial(typename)
        if nopartial is None:
            typename = None
        if typename and nopartial:
            offset = 0
        else:
            # Convert leading indices to a file offset when possible.
            args, shape, offset = leading_args(args, shape)
        if offset:
            addr += dtype.itemsize * offset
        f = root.handle.seek(addr)
        value = fromfile(f, stype, prod(shape) if shape else 1)
        if not nopartial:
            value = value.reshape(shape)[args]
        if typename:
            # Special type: convert raw values (e.g. follow pointers).
            value = chart.read_special(f, typename, value)
            stype = dtype = value.dtype
            if nopartial:
                value = value.reshape(shape)[args]
        if istext and value.shape:
            # Present trailing S1 character axis as a single string.
            return value.view('S' + str(value.shape[-1]))[..., 0]
        return value if stype is dtype else value.astype(dtype)

    def write(self, value, args=()):
        """Write `value` to this array (or the part selected by `args`)."""
        dtype, shape, addr = self.tsa
        dtype, stype, align, typename = dtype[:4]
        arg0 = args[0] if args else slice(None)
        args = args[1:]
        root = self.root()
        handle = root.handle
        if root.chart.nopartial(typename) is not None:
            raise TypeError("write to pointer type {} unsupported"
                            "".format(typename.decode('latin1')))
        if isinstance(addr, list):
            # This variable has blocks.
            if not isinstance(arg0, Integral):
                arg0 = arange(len(addr))[arg0]
                if arg0.size > 1:
                    raise TypeError("can only write block variables one "
                                    "block at a time")
                arg0 = arg0.reshape(())
            newfile = arg0 == len(addr)
            if newfile:
                # This is a new block for this variable, but not first block.
                # TODO: Should prevent partial writes here?
                selfaddr = addr
                addr, faddr = handle.next_address(both=1)
                if addr is None:
                    pass  # TODO: issue warning here and below?
                if faddr >= root.maxsize and arg0 >= root.maxblocks:
                    # Current file full: try to start the next family member.
                    a = handle.next_address(newfile=1)
                    if a is not None:
                        addr = a  # Next file in family has been created.
                    else:
                        # No next filename, and current file exceeds maxsize.
                        pass  # TODO: issue warning here and above?
                addr = _align(addr, align)
                selfaddr.append(addr)
                handle.declared(addr, stype, prod(shape) if shape else 1)
            else:
                addr = addr[arg0]
        else:
            newfile = False
        args, shape, offset = leading_args(args, shape)
        if offset:
            addr += dtype.itemsize * offset
        seeker = handle.seek
        f = seeker(addr)
        if args:
            # Must do read-modify-write for potentially non-contiguous write.
            v = fromfile(f, stype, prod(shape) if shape else 1).reshape(shape)
            v[args] = value
            value = v
            f = seeker(addr)
        else:
            if stype.kind == 'S' and shape:
                # Expand strings into the trailing S1 character axis.
                value = value.astype('S' + str(shape[-1]))
                value = value.reshape(value.shape + (1,)).view('S1')
            else:
                value = ascontiguousarray(value, stype)
            if value.shape != shape:
                # Avoid the recent (numpy 1.10) broadcast_to function.
                v = zeros(shape, stype)
                v[()] = value
                value = v
        value.tofile(f)

    def shifted_copy(self, delta):
        # Special helper for copying non-record variables for first file
        # to later files in a family.
        dtype, shape, addr = self.tsa
        if isinstance(addr, list):
            raise TypeError("cannot make shifted copy of record variable")
        parent = self.parent()
        # Address high bits encode the family file index; non-zero means
        # the variable does not live in the first file.
        if array(addr, 'u8') >> array(parent.root().handle.abits, 'u8'):
            raise TypeError("expecting non-record vars to be in first file")
        return PDBLeaf(self.parent(), addr+delta, dtype, shape, 0)


def _align(addr, align):
    # Round addr up to the next multiple of align (a power of two).
    if align > 1:
        rem = addr & (align - 1)
        if rem:
            addr += align - rem
    return addr


class PDBAttrs(dict):
    """Variable attributes are not a standard feature of PDB.

    We implement a poor man's version here as follows:  Attributes are
    held in memory until the metadata is flushed, at which point they
    are written with name 'variable_path:attribute_name' immediately
    before the metadata.  If the file is extended, new data overwrites
    old attributes, which are rewritten just before the metadata once
    again.  Hence, in memory, a dict suffices.

    """
    __slots__ = ()

    # qnd.QAttribute uses only __iter__, get, items, __len__, __contains__
    # PDBGroup uses __setitem__
    # Only thing that needs fixing is mapping items to iteritems for python2.
    if PY2:
        def items(self):
            return self.iteritems()
    else:
        pass