# Source code for qnd.pdbf

"""Pure python QnD wrapper for PDB files.

Information on the PDB file format is somewhat hard to come by.  
Try the SILO github repo https://github.com/LLNL/Silo
esp. src/pdb/ and src/score/ dirs.
Also the end of the QND file pdbparse.py has a long comment with a detailed description.

Note that yorick-generated PDB files are version II, not version III.  Also,
yorick pointers are written (by default) in a yorick-specific format.  Since
yorick readability has held back many application codes (the LEOS library
and LLNL rad-hydro codes used for ICF design), most of the dwindling legacy
PDB files are version II.  Hence, this implementation focuses on the
version III format, and the version I format is supported only for reading.

Furthermore, this implementation only supports IEEE 754 4 and 8 byte
floating point formats, since those are the only unambiguous floating
point formats supported by numpy.  Fortunately, this covers all modern
PDB files likely to show up in practice, so we have no significant
incentive to do the work required to support exotic formats.

"""
from __future__ import absolute_import

import sys
import weakref
from numbers import Integral
from collections import OrderedDict
from warnings import warn

from numpy import (zeros, arange, fromfile, prod, array, ascontiguousarray,
                   dtype as npdtype)

from .frontend import QGroup, QnDList
from .generic import opener
from .pdbparse import parser, PDBChart
from .pdbdump import flusher_for, initializer_for
from .utils import leading_args

__all__ = ['openpdb']

# True when running under Python 2; this module supports both 2 and 3.
PY2 = sys.version_info < (3,)
if PY2:
    # Use the lazy iterator variant of range under python 2.
    range = xrange


def openpdb(filename, mode='r', auto=1, **kwargs):
    """Open PDB file or family, and wrap it in a QnD QGroup.

    Parameters
    ----------
    filename : str
       Name of file to open.  See notes below for file family.
    mode : str
       One of 'r' (default, read-only), 'r+' (read-write, must exist),
       'a' (read-write, create if does not exist), 'w' (create, clobber
       if exists), 'w-' (create, fail if exists).
    auto : int
       The initial state of auto-read mode.  If the QGroup handle
       returned by openpdb is `f`, then ``f.varname`` reads an array
       variable, but not a subgroup when auto=1, the default.  With
       auto=0, the variable reference reads neither (permitting later
       partial reads in the case of array variables).  With auto=2, a
       variable reference recursively reads subgroups, bringing a whole
       tree into memory.
    **kwargs
       Other keywords.  The `maxsize` keyword sets the size of files in
       a family generated in ``recording==1`` mode; a new file will
       begin when the first item in a new record would begin beyond
       `maxsize`.  The default maxsize is 128 MiB (134 MB).  The `order`
       keyword can be '>' or '<' to force the byte order in a new file;
       by default the byte order is the native order.  File families
       always have the same order for every file, so `order` is ignored
       if any files exist.

    Returns
    -------
    f : QGroup
       A file handle implementing the QnD interface.

    Notes
    -----
    The `filename` may be an iterable, one string per file in order.
    The sequence may extend beyond the files which actually exist for
    'r+', 'a', 'w', or 'w-' modes.

    Alternatively `filename` specifies a family if it contains shell
    globbing wildcard characters.  Existing matching files are sorted
    first by length, then alphabetically (ensuring that 'file100' comes
    after 'file99', for example).  If there is only a single wildcard
    group, it also serves to define a sequence of future family names
    beyond those currently existing for 'r+', 'a', 'w', or 'w-' modes.
    A '?' pattern is treated the same as a '[0-9]' pattern if all its
    matches are digits or if the pattern matches no existing files.
    Similarly, a '*' acts like the minimum number of all-digit matches,
    or three digits if there are no matches.

    """
    maxsize = kwargs.pop('maxsize', 134217728)
    order = kwargs.pop('order', None)
    if order:
        if order not in '<>':
            raise ValueError("order must be either > or <")
        # PDB encodes byte order as 1 for big-endian, 2 for little-endian.
        order = 1 if order == '>' else 2
    kwargs['nextaddr_mode'] = 1  # tell opener to initialize nextaddr to 0
    handle, n = opener(filename, mode, **kwargs)
    root = PDBGroup(handle, maxsize)
    for i in range(n):
        try:
            parser(handle, root, i)
        except IOError:
            # Something went terribly wrong.  If this is first file, we die.
            name = handle.filename(i)
            if not i:
                # BUG FIX: original message lacked the {} placeholder, so
                # the offending filename was never interpolated.
                raise IOError("Fatal errors opening PDB file {}"
                              "".format(name))
            # Fall back to the last good file and stop scanning the family;
            # without the break we would keep parsing files beyond the one
            # we just declared incompatible.
            handle.open(i-1)
            warn("file family stopped by incompatible {}".format(name))
            break
    if not n and order:
        root.chart.byteorder = order
    # If file was freshly created, setting initializer calls it.
    handle.callbacks(flusher_for(root), initializer_for(root))
    # If any files exist, parser has set nextaddr to the chart address
    # of the last existing file in the family.  If there are no record
    # variables, we let this stand.  However, if there are record variables,
    # and the family is writable, we set nextaddr to the zero address of
    # the next file beyond all existing files.  This causes any new records
    # to be placed in a new file, leaving all existing files in the
    # family undisturbed.
    mode = mode.lower()
    if ((mode.startswith('a') or mode.startswith('r+'))
            and handle.state[4] is not None and _has_records(root)):
        handle.declared(handle.zero_address(len(handle.state[2])), None, 0)
    return QGroup(root, auto=auto)
def _has_records(root):
    """Return True if any variable in the tree rooted at `root` has blocks.

    A leaf with ``islist() == 1`` is a record (block) variable.  Recurses
    into subgroups, but not into lists or objects of any type (groups
    containing a '__class__' member).
    """
    for name in root:
        item = root.items[name]
        if item.islist() == 1:
            return True
        # Recurse into groups, but not into lists or objects of any type.
        if item.isgroup() and '__class__' not in item and _has_records(item):
            return True
    return False


class PDBGroup(object):
    """A directory in a PDB file, or a whole file or file family.

    """
    def __init__(self, parent, maxsize=134217728):
        if not isinstance(parent, PDBGroup):
            # this is root group; parent is the low-level file handle
            self.root = weakref.ref(self)
            self.maxsize = maxsize
            self.maxblocks = 0
            self.handle = parent
            self.chart = PDBChart(self)
        else:
            # non-root groups share the root weakref of their parent
            self.root = parent.root
        self.items = OrderedDict()  # name --> PDBGroup, PDBLeaf, or QnDList
        self.attrs = None  # lazily created PDBAttrs, see attget/attset

    # QnD type predicates: a PDBGroup is a group, never a list or leaf.
    @staticmethod
    def isgroup():
        return 1

    @staticmethod
    def islist():
        return 0

    isleaf = islist

    def close(self):
        self.root().handle.close()

    def flush(self):
        self.root().handle.flush()

    def __len__(self):
        return len(self.items)

    def __iter__(self):
        return iter(self.items)

    def lookup(self, name):
        # Return the named item, wrapping groups that represent QLists.
        item = self.items.get(name)
        if isinstance(item, PDBGroup):
            item = QnDList.fromgroup(item)
        return item

    def declare(self, name, dtype, shape, unlim=None, addr=-1):
        """Declare a subgroup (dtype=dict), list (dtype=list), or leaf.

        Re-declaring an existing group as a group (or list as a list)
        returns the existing item; any other redeclaration raises
        KeyError.  `unlim` marks a record (block) variable; `addr` is
        the file address, -1 meaning "allocate at next free address".
        """
        current = self.items.get(name)
        if dtype == dict:
            if current is not None:
                if current.isgroup():
                    return current
                raise KeyError("already a non-group item {}".format(name))
            item = PDBGroup(self)
        elif dtype == list:
            if current is not None:
                if current.islist() == 2:
                    return current
                raise KeyError("already a non-list item {}".format(name))
            item = QnDList(PDBGroup(self), 1)
        else:
            if current is not None:
                raise KeyError("attempt to redeclare {}".format(name))
            if dtype is None and name == '_':
                # Assume we are creating a QList.
                dtype = npdtype('u1')
            elif isinstance(dtype, npdtype) and dtype.kind == 'S':
                # frontend never passes 'U' dtype
                # store strings as a trailing axis of single characters
                shape = shape + (dtype.itemsize,)
                dtype = npdtype('S1')
            item = PDBLeaf(self, addr, dtype, shape, unlim)
            if unlim:
                item = QnDList(item, None if hasattr(addr, '__iter__')
                               or addr != -1 else 1)
        self.items[name] = item
        return item

    # This is used in pdbparse._endparse to declare or check symbols
    # against declarations from previous files in a family.
    def _leaf_declare(self, name, dtype, shape, addr):
        item = self.items.get(name)
        unlim = isinstance(addr, list)  # list of addresses --> block variable
        if item is not None:
            tsa = None
            if not isinstance(item, PDBLeaf):
                # unwrap QnDList to its PDBLeaf; a PDBGroup is incompatible
                item = None if isinstance(item, PDBGroup) else item.parent()
            if item is not None:
                tsa = item.tsa
                if (unlim != isinstance(tsa[2], list)
                        or tsa[:2] != (dtype, shape)):
                    # type/shape/record-ness mismatch with earlier file
                    item = None
                elif unlim:
                    # same record variable: append this file's block addresses
                    tsa[2].extend(addr)
            if item is None:
                raise IOError("incompatible redeclaration of {}".format(name))
            return
        self.declare(name, dtype, shape, unlim, addr)

    def attget(self, vname):
        # Return (creating if needed) attribute dict for vname, or for
        # this group itself when vname is blank.
        item = self.lookup(vname) if vname else self
        if isinstance(item, QnDList):
            item = item.parent()
        attrs = item.attrs
        if attrs is None:
            item.attrs = attrs = PDBAttrs()
        return attrs

    def attset(self, vname, aname, dtype, shape, value):
        # Set attribute aname for variable vname (or for this group when
        # vname is blank), coercing value to the given dtype and shape.
        item = self.lookup(vname) if vname else self
        if isinstance(item, QnDList):
            item = item.parent()
        attrs = item.attrs
        if attrs is None:
            item.attrs = attrs = PDBAttrs()
        if value.dtype != dtype or value.shape != shape:
            v = zeros(shape, dtype)
            v[()] = value
            value = v
        attrs[aname] = value


class PDBLeaf(object):
    """An ndarray in a PDB file.

    (Eventual stretch goal is to implement None and zero-length arrays.)

    """
    def __init__(self, parent, addr, dtype, shape, unlim):
        if dtype is None or (shape and not all(shape)):
            raise NotImplementedError("None or zero length array")
        root = parent.root()
        if not isinstance(dtype, tuple):
            # Construct full data type: (dtype, stype, align, typename)
            dtype = (dtype,) + root.chart.find_or_create(dtype)
        self.parent = weakref.ref(parent)
        self.attrs = None
        if hasattr(addr, '__iter__'):
            # Explicit list of block addresses implies a record variable.
            unlim = 1
            if not isinstance(addr, list):
                addr = list(addr)
        elif addr == -1:
            # Allocate space at the next properly aligned file address.
            stype, align = dtype[1:3]
            handle = root.handle
            addr = _align(handle.next_address(), align)
            handle.declared(addr, stype, prod(shape) if shape else 1)
            if unlim:
                addr = [addr]
        elif unlim:
            addr = [int(addr)]
        else:
            addr = int(addr)
        # tsa = (full dtype tuple, shape, address or list of block addresses)
        self.tsa = dtype, shape, addr

    # QnD type predicates: a PDBLeaf is a leaf, never a group or list.
    @staticmethod
    def isleaf():
        return 1

    @staticmethod
    def isgroup():
        return 0

    islist = isgroup

    def root(self):
        return self.parent().root()

    def query(self):
        # return dtype, shape, sshape
        dtype, shape, addr = self.tsa
        if isinstance(addr, list):
            # Do this for consistency with treatment of h5py chunked data.
            shape = (len(addr),) + shape
        return dtype[0], shape, shape

    def read(self, args=()):
        """Read this array (or the part selected by index `args`)."""
        dtype, shape, addr = self.tsa
        dtype, stype, _, typename = dtype
        istext = typename == b'text'
        if isinstance(addr, list):
            # Record variable: first index selects the block.
            arg0 = args[0] if args else slice(None)
            args = args[1:]
            if not isinstance(arg0, Integral):
                arg0 = arange(len(addr))[arg0]
                if arg0.ndim == 1:
                    # Multiple blocks: read each and stack the results.
                    return array([self.read((a,) + args) for a in arg0],
                                 dtype)
                elif arg0.ndim:
                    raise TypeError("block variable leading index "
                                    "too complex")
            addr = addr[arg0]
        root = self.root()
        chart = root.chart
        # nopartial is None for ordinary types; for special (pointer-like)
        # types it tells whether partial reads are impossible.
        nopartial = chart.nopartial(typename)
        if nopartial is None:
            typename = None
        if typename and nopartial:
            offset = 0
        else:
            # Convert leading indices to a file offset when possible.
            args, shape, offset = leading_args(args, shape)
        if offset:
            addr += dtype.itemsize * offset
        f = root.handle.seek(addr)
        value = fromfile(f, stype, prod(shape) if shape else 1)
        if not nopartial:
            value = value.reshape(shape)[args]
        if typename:
            # Special type: convert raw values (e.g. follow pointers).
            value = chart.read_special(f, typename, value)
            stype = dtype = value.dtype
            if nopartial:
                value = value.reshape(shape)[args]
        if istext and value.shape:
            # Present trailing S1 character axis as a single string.
            return value.view('S' + str(value.shape[-1]))[..., 0]
        return value if stype is dtype else value.astype(dtype)

    def write(self, value, args=()):
        """Write `value` to this array (or the part selected by `args`)."""
        dtype, shape, addr = self.tsa
        dtype, stype, align, typename = dtype[:4]
        arg0 = args[0] if args else slice(None)
        args = args[1:]
        root = self.root()
        handle = root.handle
        if root.chart.nopartial(typename) is not None:
            raise TypeError("write to pointer type {} unsupported"
                            "".format(typename.decode('latin1')))
        if isinstance(addr, list):
            # This variable has blocks.
            if not isinstance(arg0, Integral):
                arg0 = arange(len(addr))[arg0]
                if arg0.size > 1:
                    raise TypeError("can only write block variables one "
                                    "block at a time")
                arg0 = arg0.reshape(())
            newfile = arg0 == len(addr)
            if newfile:
                # This is a new block for this variable, but not first block.
                # TODO: Should prevent partial writes here?
                selfaddr = addr
                addr, faddr = handle.next_address(both=1)
                if addr is None:
                    pass  # TODO: issue warning here and below?
                if faddr >= root.maxsize and arg0 >= root.maxblocks:
                    # Current file full: try to start the next family member.
                    a = handle.next_address(newfile=1)
                    if a is not None:
                        addr = a  # Next file in family has been created.
                    else:
                        # No next filename, and current file exceeds maxsize.
                        pass  # TODO: issue warning here and above?
                addr = _align(addr, align)
                selfaddr.append(addr)
                handle.declared(addr, stype, prod(shape) if shape else 1)
            else:
                addr = addr[arg0]
        else:
            newfile = False
        args, shape, offset = leading_args(args, shape)
        if offset:
            addr += dtype.itemsize * offset
        seeker = handle.seek
        f = seeker(addr)
        if args:
            # Must do read-modify-write for potentially non-contiguous write.
            v = fromfile(f, stype, prod(shape) if shape else 1).reshape(shape)
            v[args] = value
            value = v
            f = seeker(addr)
        else:
            if stype.kind == 'S' and shape:
                # Expand strings into the trailing S1 character axis.
                value = value.astype('S' + str(shape[-1]))
                value = value.reshape(value.shape + (1,)).view('S1')
            else:
                value = ascontiguousarray(value, stype)
            if value.shape != shape:
                # Avoid the recent (numpy 1.10) broadcast_to function.
                v = zeros(shape, stype)
                v[()] = value
                value = v
        value.tofile(f)

    def shifted_copy(self, delta):
        # Special helper for copying non-record variables for first file
        # to later files in a family.
        dtype, shape, addr = self.tsa
        if isinstance(addr, list):
            raise TypeError("cannot make shifted copy of record variable")
        parent = self.parent()
        # Address high bits encode the family file index; non-zero means
        # the variable does not live in the first file.
        if array(addr, 'u8') >> array(parent.root().handle.abits, 'u8'):
            raise TypeError("expecting non-record vars to be in first file")
        return PDBLeaf(self.parent(), addr+delta, dtype, shape, 0)


def _align(addr, align):
    # Round addr up to the next multiple of align (a power of two).
    if align > 1:
        rem = addr & (align - 1)
        if rem:
            addr += align - rem
    return addr


class PDBAttrs(dict):
    """Variable attributes are not a standard feature of PDB.

    We implement a poor man's version here as follows:  Attributes are
    held in memory until the metadata is flushed, at which point they
    are written with name 'variable_path:attribute_name' immediately
    before the metadata.  If the file is extended, new data overwrites
    old attributes, which are rewritten just before the metadata once
    again.  Hence, in memory, a dict suffices.

    """
    __slots__ = ()

    # qnd.QAttribute uses only __iter__, get, items, __len__, __contains__
    # PDBGroup uses __setitem__
    # Only thing that needs fixing is mapping items to iteritems for python2.
    if PY2:
        def items(self):
            return self.iteritems()
    else:
        pass