# Source code for qnd.pdbparse

"""Pure python PDB file format parsing.

Parse PDB metadata.  The PDB file format was designed by Stewart Brown
at Lawrence Livermore National Laboratory in the 1990s.

PDB is widely used at LLNL for restart and post-processing data
produced by large radiation-hydrodynamics simulation codes.  The PDB
format can describe and store named arrays of any data type
representable in the C programming language that is derived from one
of the primitive C data types char, short, int, long, float, or double
or pointer to a representable type.  The format was extended to handle
primitive integer or floating point numbers of any size, such as long
long or long double or 16 bit floating point data.  The format was
also extended to organize the named arrays into groups like HDF5.  The
metadata is text with a few embedded ASCII control characters, written
at the end of the file, and intended to be parsed and held in memory
when the file is opened.

"""
from __future__ import absolute_import

import re
import sys
from collections import OrderedDict
from warnings import warn
import weakref

from numpy import (prod, array, arange, concatenate, zeros, dtype as npdtype,
                   fromfile, int64)

PY2 = sys.version_info < (3,)
if PY2:
    from itertools import izip as zip
    range = xrange

    def itemsof(d): return d.iteritems()  # noqa
else:
    from numpy.core.defchararray import decode as npdecode  # noqa
    basestring = str

    def itemsof(d): return d.items()  # noqa


class PDBChart(object):
    """PDB structure chart contains all information about data types."""
    # stype --> dtype on disk
    # All typename and membername values are bytes, not unicode (str).
    # The assumed codec is latin1; legacy PDB files could support UTF-8,
    # but without any explicit mandate, latin1 is the safer option for
    # reading files of unknown origin, since unlike UTF-8 there are no
    # undefined bytes.
    #
    # hasint64   True if any int primitive is 8 bytes, else False.
    #            False means that i8 dtypes will be mapped to i4.
    # byteorder  Determines new stype corresponding to new primitive dtype.
    # structal   Initial struct alignment, before adding any members.
    # csaddr     Chart and symtab address in header.
    # haspointers  Pointer types used: 0 none, 1 PDB style, 2 yorick style
    # primitives [name] --> stype, dtype, align, fpbits or None
    #            fpbits present only if stype is Vn for non-IEEE float.
    # structs    [name] --> stype, dtype, align, members
    #            members is OrderedDict membername --> typename, shape
    # by_dtype   [dtype] --> typename, table (either primitives or structs)
    #            This maps dtype to a PDB typename for interpreting newly
    #            declared variables.  The names _anon_0, _anon_1, ... are
    #            created for dtypes which are previously undeclared.
    # nanons     Count of _anon_X typenames already used.
    # pointers   [typename] --> otype corresponding to stype, dtype
    # yortypes   list of primitives+structs typenames
    #
    # Strategy for converting pointers:
    # 1. Convert stype --> dtype as usual; yorick can do partial reads
    # 2. Create final otype = dtype('O') variant of dtype
    # 3. Read the stype array.
    # 4. Create empty otype array and assign stype array.  In the case of
    #    yorick pointers, numpy converts the pointer values to type 'O'.
    # 5. Iterate through the otype one array item, then one struct item
    #    at a time, converting the 'O' members from pointer to pointee.
    #    Recurse during this loop to handle any pointers encountered in
    #    the pointees.

    def __init__(self, root):
        self.root = weakref.ref(root)  # root() is top level PDBGroup
        self.hasint64 = False
        self.structal = self.byteorder = self.csaddr = None
        self.haspointers = 0
        self.primitives = OrderedDict()
        self.structs = OrderedDict()
        self.by_dtype = {}
        self.nanons = 0
        self.pointers = {}
        self.yortypes = []

    # Map the integer byte order code (1 big-endian, 2 little-endian) to the
    # numpy byte order character.  add_primitive looks codes up here and
    # falls back to '' for any code without an entry.
    _orders = {1: '>', 2: '<'}
    # Create distinguishable pointer and string dtypes.
    # _dtypeo is stored in the pointers table for pointer types and _dtypes
    # for yorick strings; the readers tell them apart with identity tests
    # ("dtype is self._dtypes"), which is presumably why _dtypes is made as
    # a separate copy of the object dtype.
    _dtypeo = npdtype('O')
    _dtypes = npdtype('O', copy=1)

[docs]    def find_or_create(self, dtype):
        """Find or create stype --> (stype, align, typename)."""
        dtype = self._saveable(dtype)
        typename, table = self.by_dtype.get(dtype, (None, None))
        if typename is not None:
            stype, _, align, _ = table[typename]
        else:
            stype, align, typename = self._add_dtype(dtype)
        return stype, align, typename

    def nopartial(self, typename):
        if self.pointers.get(typename) is not None:
            return bool(self.haspointers == 1)
        return None

[docs]    def use(self, dtype):
        """Find or create stype --> (stype, align, typename, nopartial)."""
        stype, align, typename = self.find_or_create(dtype)
        # The native PDB pointer types require special treatment; the
        # entire variable and all of its indirect references must be
        # read at once (no partial reads).
        # The yorick string and pointer types require special post-processing
        # after a read in order to interpret the pointer values.
        # Any struct which has a member requiring special treatment itself
        # requires special treatment.
        return stype, align, typename, self.nopartial()

    def _saveable(self, dtype):
        if dtype.shape:
            raise TypeError("cannot store subarray dtype in PDB file")
        if not dtype.isnative:
            dtype = dtype.newbyteorder('=')
        if dtype.kind in 'ui' and dtype.itemsize > 4:
            # If file does not have int64 type, map int64 to int32.
            if not self.hasint64:
                dtype = npdtype(dtype.kind + '4')
        return dtype

    def _add_dtype(self, dtype):
        """Declare a chart type for dtype, returning (stype, align, name).

        Three cases are distinguished:

        * A one byte 'S' dtype becomes the primitive b'text'.
        * A complex dtype becomes a struct with float members 're' and
          'im', named b'complex' (8 byte parts), b'fcomplex' (4 byte
          parts), or b'complexN' with N the total size in bits.
        * Anything else gets the next unused b'_anon_N' name and becomes a
          primitive if the dtype has no fields, otherwise a struct.

        Raises NotImplementedError for a float dtype without fields whose
        size is not 4 or 8 bytes.
        """
        by_dtype, nonenone = self.by_dtype, (None, None)
        # Need to declare anonymous type now.
        size = dtype.itemsize
        if size == 1 and dtype.kind == 'S':
            # Declare QnD-specific text primitive.
            stype = self.add_primitive(b'text', (1, 0, 0))
            return stype, 0, b'text'
        if dtype.kind == 'c':
            # Describe the complex number as two floats of half the size,
            # the imaginary part immediately following the real part.
            fsize = size // 2
            ffmt = npdtype('f{}'.format(fsize))
            fields = dict(re=(ffmt, 0), im=(ffmt, fsize))
            names = ['re', 'im']
            if fsize == 8:
                name = b'complex'
            elif fsize == 4:
                name = b'fcomplex'
            else:
                name = 'complex{}'.format(size*8).encode('latin1')
        else:
            fields = dtype.fields
            names = dtype.names
            # Claim the anonymous name before any recursion below, so that
            # nested member types receive later numbers.
            n = self.nanons
            self.nanons += 1
            name = '_anon_{}'.format(n).encode('latin1')
        if not names:
            # No fields, so this is a primitive.
            order = self.byteorder
            if not order:
                # Fall back to the byte order character of the dtype
                # itself, as a 1 (big), 2 (little), or 0 (other) code.
                order = dtype.str[0]
                order = 1 if order == '>' else (2 if order == '<' else 0)
            align = size  # WAG, could create align=1 record dtype to check
            desc = size, 0 if size == 1 else order, align
            adder = self.add_primitive
            if dtype.kind == 'f':
                # _binary32 and _binary64 are defined elsewhere in this
                # module; presumably the IEEE 754 fpbits descriptions.
                if size == 4:
                    desc += (_binary32,)
                elif size == 8:
                    desc += (_binary64,)
                else:
                    raise NotImplementedError(
                        "only f4 and f8 floating point dtypes supported")
        else:
            # This is a struct; its alignment is the largest of the initial
            # struct alignment and the alignments of its members.
            align = self.structal
            saveable = self._saveable
            desc = OrderedDict()  # really members list
            _dtype_shape = self._dtype_shape
            for nm in names:
                dtyp, shape = _dtype_shape(fields[nm])
                dtyp = saveable(dtyp)
                typename, table = by_dtype.get(dtyp, nonenone)
                if typename is None:
                    # Even though we are just building the members list here,
                    # we need to recursively create any member stypes which
                    # have not yet been encountered.  Note that the recursion
                    # produces the anonymous types in order of dependency.
                    _, al, typename = self._add_dtype(dtyp)
                else:
                    al = table[typename][2]
                if al > align:
                    align = al
                desc[nm.encode('latin1')] = typename, shape
            # add_struct expects (size, members).
            desc = size, desc
            adder = self.add_struct
        stype = adder(name, desc)
        return stype, align, name

    @staticmethod
    def _dtype_shape(field):
        dtype = field[0]
        dtype, shape = dtype.base, dtype.shape
        s = dtype.shape
        while s:  # numpy does not automatically condense subarrays
            shape += s
            dtype, s = dtype.base, dtype.shape
        return dtype, shape

[docs]    def add_primitive(self, name, desc):
        """desc is (size, order, align) or (size, order, align, fpbits)"""
        ispdbptr = name == b'*'
        if ispdbptr:
            name = b'char *'
        size, order, align = desc[:3]
        if not order:
            order = 0
        elif not hasattr(order, '__getitem__'):
            order = int(order)
        else:
            perm = array(order)
            if perm[0] == 1:
                order = 1
            elif perm[-1] == 1:
                order = 2
                perm = perm[::-1]
            if perm.size != size or (arange(1, 1+size) != perm).any():
                order = 0
        fpbits = desc[3] if len(desc) > 3 else None
        if align & (align - 1):
            # Error if alignment is not a power of 2.
            raise IOError("alignment {} for primitive {} not power of 2"
                          "".format(align, name.decode('latin1')))
        order = self._orders.get(order, '')
        if not order and size > 1:
            stype = npdtype('V{}'.format(size))
        elif fpbits:
            if size == 4 and fpbits == _binary32:
                stype, fpbits = npdtype(order + 'f4'), None
            elif size == 8 and fpbits == _binary64:
                stype, fpbits = npdtype(order + 'f8'), None
            else:
                stype = npdtype('V{}'.format(size))
        else:
            lname = name.lower()
            if lname.startswith(b'u') or name == b'char':
                kind = order + 'u{}'
            elif size == 1:
                if lname in (b'bool', b'boolean'):
                    kind = order + 'b{}'
                elif name == b'text':
                    kind = 'S{}'
                else:
                    kind = 'i{}'
            else:
                kind = order + 'i{}'
            if size in (1, 2, 4, 8):
                stype = npdtype(kind.format(size))
                if size == 8:
                    self.hasint64 = True
            else:
                stype = npdtype('V{}'.format(size))
        if stype.isnative:
            dtype = stype
        else:
            dtype = stype.newbyteorder('=')
        primitives = self.primitives
        current = primitives.get(name)
        if current:
            if current == (stype, dtype, align, fpbits):
                return current[0]
            else:
                return None
        primitives[name] = stype, dtype, align, fpbits
        self.by_dtype.setdefault(dtype, (name, primitives))
        if ispdbptr:
            dtype = npdtype('V0')
            primitives[b'char*'] = dtype, dtype, 0, None
            pointers = self.pointers
            pointers[b'char *'] = pointers[b'char*'] = self._dtypeo
        elif self.haspointers == 2 and name == b'double':
            slong, dlong, along, _ = primitives[b'long']
            primitives[b'string'] = slong, dlong, along, None
            primitives[b'pointer'] = slong, dlong, along, None
            pointers = self.pointers
            pointers[b'string'] = self._dtypes
            pointers[b'pointer'] = self._dtypeo
        return stype

[docs]    def add_struct(self, name, members):
        """members is OrderedDict, name --> typename, shape"""
        if not isinstance(name, bytes):
            name = name.encode('latin1')
        primitives, structs = self.primitives, self.structs
        align = self.structal
        pointers = self.pointers
        size, members = members
        names, formats, oformats = [], [], []
        offsets, off, special = [], 0, False
        for mname, (tname, shape) in itemsof(members):
            # mname must be bytes, str_mname must be str
            # hash(dtype) fails if mname is unicode in PY2
            if isinstance(mname, bytes):
                str_mname = mname if PY2 else mname.decode('latin1')
            elif PY2:
                str_mname = mname = mname.encode('latin1')
            else:
                str_mname, mname = mname, mname.encode('latin1')
            if not isinstance(tname, bytes):
                tname = tname.encode('latin1')
            if tname.endswith(b'*'):
                tname = b'char *'
            stype = primitives.get(tname)
            isstruct = stype is None
            if isstruct:
                stype = structs.get(tname)
                if stype is None:
                    raise IOError("struct {} refers to undefined type {}"
                                  "".format(name.decode('latin1'),
                                            tname.decode('latin1')))
            stype, dtype, al, _ = stype
            if stype.shape:
                shape = stype.shape + shape
                stype = stype.base
            if al > 1:
                rem = off & (al - 1)
                if rem:
                    off += al - rem
            if al > align:
                align = al
            offsets.append(off)
            formats.append(npdtype((stype, shape)) if shape else stype)
            names.append(str_mname)
            off += stype.itemsize * (prod(shape) if shape else 1)
            ptrtype = pointers.get(tname)
            if ptrtype is not None:
                dtype, special = ptrtype, True
            oformats.append(npdtype((dtype, shape)) if shape else dtype)
        # if align > 1:
        #     rem = off & (align - 1)
        #     if rem:
        #         off += align - rem
        lname = name.lower()
        iscomplex = b'complex' in lname or b'plx' in lname and len(names) == 2
        if iscomplex and formats[0] == formats[1] and formats[0].kind == 'f':
            re, im = [n.lower().replace('part', '').replace('_', '')
                      for n in names]
            iscomplex = (re, im) in _complex_members
        if iscomplex:
            stype = npdtype(formats[0].byteorder + 'c{}'.format(size))
        else:
            stype = npdtype(dict(names=names, formats=formats,
                                 offsets=offsets, itemsize=size))
        dtype = stype if stype.isnative else stype.newbyteorder('=')
        current = structs.get(name)
        if current:
            if current == (stype, dtype, align, members):
                return current[0]
            else:
                return None  # Attempt to redefine struct type.
        if special:
            pointers[name] = npdtype(dict(names=names, formats=oformats))
        structs[name] = stype, dtype, align, members
        self.by_dtype.setdefault(dtype, (name, structs))
        return stype

[docs]    def read_special(self, f, typename, value):
        """Recursively read pointer values."""
        # Convert value from as-stored stype to final caller dtype.
        # This will fill all the dtype('O') fields corresponding to pointers
        # with whatever was stored in the corresponding slot in the stype.
        # (In the case of either yorick or PDB itag pointers, this is
        # a scalar int value.)
        # value can be non-ndarray in v[mname][0] call a few lines down:
        shape = value.shape if hasattr(value, 'shape') else ()
        v = zeros(shape, self.pointers[typename])
        v[()] = value  # works even for PDB-II char* 0-length void value
        value = v
        by_address = {}
        dtype = value.dtype
        names = dtype.names
        # Iterate through each individual pointer in the order which the
        # native PDB pointer mechanism would process them.  This isn't
        # terribly efficient, but reading the individual pointees is
        # likely to be even more expensive than all these nested lookups.
        if names:
            read_special = self.read_special
            pointers = self.pointers
            members = self.structs[typename][3]
            for v in value.reshape(value.size, 1):
                for mname, (tname, shape) in itemsof(members):
                    if tname in pointers:
                        mname = mname.decode('latin1')
                        v[mname][0] = read_special(f, tname, v[mname][0])
                        # If the member has record dtype, it will not become
                        # a recarray here.  This is consistent with the
                        # behavior of non-pointer members.
                        # An additional problem is that the numpy interface
                        # does not handle O scalars as you might wish - when
                        # you extract such a member it remains a scalar
                        # object array, rather than the object itself.
        else:
            haspointers = self.haspointers
            reader = (self.read_yorickptr if haspointers == 2 else
                      self.read_pdbptr)
            for v in value.reshape(value.size, 1):
                v[0] = reader(f, v, by_address)
            if dtype is self._dtypes:
                # Convert from dtype O to string or dtype S
                value = array(value.tolist())
                if not PY2:
                    try:
                        value = npdecode(value, 'utf8')
                    except UnicodeDecodeError:
                        value = npdecode(value, 'latin1')
            if not value.shape:
                value = value[()]
        return value

    def read_pdbptr(self, f, value, by_address):
        """Read one PDB style pointee whose header begins at f.tell().

        f is the open file, positioned at the pointee header line.  value
        is not examined here, only passed along when the pointee has to be
        read from another address.  by_address maps the header address of
        each pointee already read to its value, and is updated.

        Returns the pointee, or None for a NULL pointer.  On return the
        file is positioned after the header, or after the pointee data if
        the data was read here.  Raises IOError if the header cannot be
        interpreted or names an undeclared type.
        """
        addr0 = f.tell()
        # Read 1024 byte blocks until one contains the end of the header
        # line, accumulating everything read so far in header.
        header = block = f.read(1024)
        nprev = 0  # number of header bytes which precede the current block
        while True:
            m = _line_re.search(block)
            if m or len(block) < 1024:
                break
            nprev += len(block)
            block = f.read(1024)
            header += block
        # nhead is the index of the final byte of the header line.  This
        # assumes that a _line_re match extends through the newline which
        # terminates the line -- TODO confirm against its definition.
        nhead = nprev + m.end() - 1 if m else len(header) - 1
        addr1 = addr0 + nhead + 1
        f.seek(addr1)  # position file to byte after header
        m = _ptrheader.match(header)
        if m:
            nitems, typename, addr, here = m.group(1, 2, 3, 4)
            addr = int64(-1) if addr is None else int64(addr)
            if addr == -1:
                return None  # This is a NULL pointer.
            if here is not None and not int64(here):
                # We need to get this pointee from elsewhere.
                v = by_address.get(addr, Ellipsis)
                if v is not Ellipsis:
                    return v  # already read and cached this pointee
                # Read pointee from elsewhere, then snap back to here.
                f.seek(addr)
                v = self.read_pdbptr(f, value, by_address)
                f.seek(addr1)
                return v
            # Data is here at addr0, read it now.
            # Unknown what happens if addr != addr0 -- if actual header
            # address isn't used when data not here, this algorithm will
            # fail (e.g.- if addr for one not here points to another not
            # here in a chain).
            nitems = int64(nitems)
            typename = typename.strip()
            # Use get so that a struct pointee reaches the structs table
            # and an undeclared type reaches the IOError below.
            stype = self.primitives.get(typename)
            if stype is None:
                stype = self.structs.get(typename)
            if stype is not None:
                stype, dtype = stype[:2]
                v = fromfile(f, stype, nitems)
                if typename in self.pointers:
                    # Recurse because pointee contains pointers.
                    v = self.read_special(f, typename, v)
                elif stype is not dtype:
                    v = v.astype(dtype)
                if nitems == 1:
                    # Presume intent is a scalar, PDB offers no way to find
                    # the original shape of a pointee.
                    v = v.reshape(())
                by_address[addr0] = v
                # File is positioned immediately after block of pointees
                # which has just been read.
                return v
        raise IOError("Unable to track PDB pointees at {}".format(addr0))

    def read_yorickptr(self, f, value, by_address):
        addr = int64(value[0])
        v = by_address.get(addr, Ellipsis)
        if v is not Ellipsis:
            return v
        yortypes, primitives = self.yortypes, self.primitives
        if not yortypes:
            # Create yorick type list now.
            yortypes = list(primitives) + list(self.structs)
            self.yortypes = yortypes
        null = addr < 0
        if not null:
            f.seek(addr)
        longstype = primitives[b'long'][0]
        if value.dtype is self._dtypes:
            # This is a yorick string.
            n = 0 if null else fromfile(f, longstype, 1).astype(int64)[0]
            return fromfile(f, npdtype('S{}'.format(n)), 1)[0] if n else b''
        # This is a general yorick pointee.
        if null:
            return None
        ytype, ndim = fromfile(f, longstype, 2).astype(int64)
        if ytype < 0 or ytype >= len(yortypes) or ndim > 10:
            IOError("bad yorick pointee header at {}".format(addr))
        if ndim:
            shape = tuple(fromfile(f, longstype, ndim).astype(int64)[::-1])
            size = prod(shape)
        else:
            shape = ()
            size = 1
        typename = yortypes[ytype]
        if ytype < len(primitives):
            stype, dtype, align = primitives[typename][:3]
        else:
            stype, dtype, align = self.structs[typename][:3]
        addr += (2 + ndim) * longstype.itemsize
        if align > 1:
            rem = addr & (align - 1)
            if rem:
                addr += align - rem
                f.seek(addr)
        v = fromfile(f, stype, size).reshape(shape)
        if typename in self.pointers:
            # Recurse because pointee contains more pointers.
            v = self.read_special(f, typename, v)
        elif stype is not dtype:
            v = v.astype(dtype)
        return v


# Member name pairs (real, imaginary) which PDBChart.add_struct accepts as
# the two parts of a complex number.  The names are compared after being
# lowercased and having 'part' and '_' removed.
_complex_members = (('r', 'i'), ('re', 'im'), ('real', 'imag'),
                    ('real', 'imaginary'))


def parser(handle, root, index=0):
    """Parse PDB file with the given MultiFile handle and PDBGroup group."""
    f = handle.open(index)

    # Begin by reading header (or trailer for version 3).
    header = f.read(1024)
    m = _magic2.match(header)
    if m:
        # Most common version 2 PDB format.
        root.pdb_version = version = 2
        n = 13 + (ord(header[13]) if PY2 else header[13])
        priminfo = map(ord, header[14:n]) if PY2 else [h for h in header[14:n]]
        ma = _magic2a.match(header[n:])
        if ma:  # save address of chart and symtab addresses for pdbdump
            root.chart.csaddr = n + ma.end(2) + 2
        try:
            nums = list(map(int, ma.group(1, 2, 3, 4)))
        except (AttributeError, ValueError):
            nums = [-1]
        if any(n < 0 for n in nums):
            raise IOError("PDB version 2 header illegible")
        bias = nums[:2]  # float, double exponent bias
        chart, symtab = nums[2:]
        # Unpack tentative primitive information from header.
        psz, ssz, isz, lsz, fsz, dsz = priminfo[0:6]
        sord, iord, lord = priminfo[6:9]  # 1 for big, 2 for little endian
        i = 9 + fsz
        j = i + dsz
        ford = tuple(priminfo[9:i])  # 1...sz permutation, position as in data,
        dord = tuple(priminfo[i:j])  # ...value is position in big endian order
        fbits = priminfo[j:j+7]      # [ N  e#  s#  -&  e&  s&  1? ]
        dbits = priminfo[j+7:j+14]
        # (itemsize, order, align)
        # order == 1 for big-endian, 2 for little-endian, 0 unknown
        primtypes = ((b'char', (1, 0, 0)),
                     (b'short', (ssz, sord, 0)),
                     (b'integer', (isz, iord, 0)),
                     (b'long', (lsz, lord, 0)),
                     (b'float', (fsz, ford, 0, tuple(fbits+bias[0:1]))),
                     (b'double', (dsz, dord, 0, tuple(dbits+bias[1:2]))),
                     (b'*', (psz, lord, 0)))

    else:
        m = _magic1.match(header)
        if not m:
            try:
                f.seek(-4096, 2)
            except IOError:
                f.seek(0)  # assume file shorter than 4096 bytes
            trailer = f.read(4096)
            i = trailer.rfind(b'StructureChartAddress:')
            m = _magic3.match(trailer[i:])
            if not m:
                raise IOError("file is not any version PDB file")
            # Version III PDB format, not legible by yorick.
            root.pdb_version = version = 3
            try:
                chart, symtab = map(int64, m.group(1, 2))
            except ValueError:
                chart, symtab = -1, -1
            if chart <= 0 or symtab <= 0:
                raise ValueError("PDB version 3 trailer illegible")
        else:
            # Legacy version 1 PDB format, no support for writing.
            root.pdb_version = version = 1
            try:
                nums = map(int64, m.group(1, 2, 3))
            except (AttributeError, ValueError):
                nums = [-1]
            if any(n < 0 for n in nums):
                raise IOError("PDB version 1 header illegible")
            platform, chart, symtab = nums
            primtypes = _1layouts.get(platform)
            if primtypes is None:
                primtypes = _1layouts[5]

    # Next, read chart and symbol table from end of file.
    if symtab < chart:
        # PDBLib (any version) fails unless symbol table immediately after
        # structure chart -- effectively symtab just gives length of chart.
        raise IOError("PDB file chart must precede symtab")
    f.seek(chart)
    handle.declared(handle.zero_address() | int64(chart), None, 0)
    chart_contents = f.read(symtab - chart)  # "chart" is PDB type table
    symtab_contents = f.read()
    if version == 3:
        _parse3(f, root, chart_contents, symtab_contents)
        return
    errors = []

    # Parse chart_contents into an OrderedDict of compounds,
    # typename --> size, members
    #   members is an OrderedDict, membername --> typename, shape
    structs = OrderedDict()
    pointertypes = set()
    chartprims, dundertype = [], None
    for line in _line_re.finditer(chart_contents):
        line = line.group(1)
        if line == b'\002':
            break
        members = line.split(b'\001')[:-1]  # must end with \001
        try:
            name = members[0].strip()
            if (not name) or (name in structs):
                errors.append("empty or repeated type name: {}"
                              "".format(name.decode('latin1')))
                raise ValueError
            size = int64(members[1])
            members = members[2:]
            if members:
                mlist, members = members, OrderedDict()
                for mem in mlist:
                    m = _memberdef.match(mem)
                    typ, ind, nm, dims = m.group(1, 2, 3, 4)
                    if nm in members:
                        errors.append("repeated member name: {}"
                                      "".format(nm.decode('latin1')))
                        raise AttributeError
                    if ind:
                        typ += b'*' * ind.count(b'*')
                        pointertypes.add(name)
                    elif typ in pointertypes:
                        pointertypes.add(name)
                    shape = _parse_shape(dims)
                    members[nm] = typ, shape
                structs[name] = size, members
                if name == b'__':
                    dundertype = members
            else:
                # Ultimately we will ignore these, but go ahead and
                # collect them here for informational purposes.
                chartprims.append((name, (size, 0, 0)))
        except (AttributeError, ValueError, IndexError):
            continue

    # Parse the symtab_contents into an OrderedDict,
    # symbolname --> address, typename, shape
    symtab = []
    haspointers = maybe_pointers = 0
    has_dirs = False
    recordsym = None
    basisnames, basismap, basisrecs = set(), {}, set()
    symtab_contents = _line_re.finditer(symtab_contents)
    for line in symtab_contents:
        line = line.group(1)
        if not line:
            break
        sym = line.split(b'\001')
        name = sym[0].strip()
        try:
            lens = list(map(int64, sym[2:-1]))  # always ends with ''
            count, addr = lens[0:2]
        except (IndexError, ValueError):
            errors.append("{} has bad symtab entry"
                          "".format(name.decode('latin1')))
            continue
        shape = lens[3::2]  # ignore index origins
        if not shape and count > 1:
            shape = [count]
        if prod(shape) == count:
            typ = sym[1].strip()
            if typ.endswith(b'*'):
                # Remove optional whitespace between pointer * characters.
                typ = sym[1].replace(b' ', b'').replace(b'\t', b'')
                haspointers |= 1
            elif (dundertype and typ == b'__' and name.endswith(b'__@history')
                  and not shape):
                recordsym = name
            elif name.count(b'@') == 1:
                bname, pkg = name.split(b'@')
                if pkg in (b'macro', b'funct'):
                    continue  # skip basis macros or functions
                # Strip @pkg if no bname conflict.
                if bname not in basisnames:
                    basisnames.add(bname)
                    basismap[name] = bname
                    name = bname
                if pkg == b'history':
                    basisrecs.add(name)
            if not has_dirs:
                has_dirs |= typ == b'Directory' or name.startswith(b'/')
            # Begin workaround of yorick bug that sometimes forgot to write
            # string (and maybe pointer?) to Primitive-Types extra.
            if not haspointers:
                if typ == b'string':
                    maybe_pointers |= 1
                if typ == b'pointer':
                    maybe_pointers |= 2
            symtab.append((name, (addr, typ, tuple(shape))))
        else:
            errors.append("{} has count mismatch"
                          "".format(name.decode('latin1')))
    # Adjust symbol names to avoid conflicts between yorick record variables
    # and static variables.
    if recordsym:
        names = set(dundertype)
        names.update(s[0] for s in symtab)
        if len(names) < len(dundertype) + len(symtab):
            for i, (name, desc) in enumerate(symtab):
                if name not in dundertype:
                    continue
                # Append number to end of name, incrementing until unused.
                prefix, j = name, 0
                while True:
                    name = prefix + str(j).encode()
                    if name not in names:
                        break
                    j += 1
                symtab[i] = name, desc
                names.add(name)
    # PDBlib (but not yorick) outputs symtab in random hash order.  Sort it
    # into order of increasing address, which is probably declaration order.
    symtab.sort(key=lambda x: x[1][0])
    symtab = OrderedDict(symtab)

    # Continue parsing symtab_contents to extract extras metadata.
    eob = (b'', b'\002')
    extras = {}
    name = body = datebug = None
    nerrs = 0
    for line in symtab_contents:
        line = line.group(1)
        if (not line) and (not name):
            if datebug:
                datebug = False
                # Some files were written with a bug that included the
                # '\n' at the end of the date from the POSIX ctime call.
                # This causes a \n\n at the end of the Version: extra.
                continue
            break
        if line in eob:
            if name:
                extras[name.strip()] = body
            name = body = None
        elif not name:
            parts = line.split(b':', 1)
            if len(parts) > 1:
                name, body = parts
                if body:
                    name = name.strip()
                    extras[name] = [body]
                    datebug = name == 'Version'
                    name = body = None
                else:
                    body = []
            else:
                nerrs += 1
        else:
            body.append(line)
    if nerrs:
        errors.append("{} missing extra block name(s)".format(nerrs))

    # Handle Alignment extra.
    # The alignment extra may be overridden later by the Primitive-Types
    # extra, so interpret it first.
    palign = extras.pop(b'Alignment', [b''])[0]
    if len(palign) == 7:
        # These override values in types argument (from file header).
        palign = map(ord, palign) if PY2 else [p for p in palign]
        # reorder to match primitives in primtypes
        palign = palign[:1] + palign[2:] + palign[1:2]
        ptypes = []
        for (name, soad), a in zip(primtypes, palign):
            soad = list(soad)
            soad[2] = a
            ptypes.append((name, tuple(soad)))
        primtypes = tuple(ptypes)
    elif version == 2:
        errors.append("bad or missing Alignment extra")

    # Major-Order   (MajorOrder in PDB-3)
    # Struct-Alignment   (StructAlignment in PDB-3)
    # Offset   (DefaultIndexOffset in PDB-3)
    if extras.pop(b'Major-Order', [b'101'])[0].strip() in (b'102', b'column'):
        # Little endian shapes, reverse all shape lists.
        _flip_shapes(structs, symtab)
    extras.pop(b'Offset', None)  # ignore all minimum index values
    structal = extras.pop(b'Struct-Align', [b'0'])[0]
    try:
        structal = int(structal)
    except ValueError:
        structal = -1
    if structal < 0:
        errors.append("bad Struct-Alignment extra")
        structal = 0
    if not structal:
        structal = 1
    # Other potentially interesting extras:
    # Dynamic Spaces: <n>\n      (ia_<n> is next pointer?)
    # Use Itags: 1 or 0\n
    # Previous-File: <name>\n

    # Parse Blocks extra.
    garbled = False
    block_lines = iter(extras.pop(b'Blocks', ()))
    name = None
    # Some block addresses may not be computable until after all data type
    # item sizes are known.
    deferred = {}
    for block in block_lines:  # block_lines may increment in loop body
        # Blocks:\n
        # <name>\001<nblocks>\x<addr> <nitems>...\n    where \x = \n or space
        # \002\n
        #   -- nitems is multiple of declared dimensions except slowest
        name = block.split(b'\001', 1)
        if len(name) != 2:
            if not garbled:
                errors.append("garbled line(s) in Blocks extra")
                garbled = True
            continue
        name, n = name
        try:
            n = list(map(int64, n.split()))
        except ValueError:
            errors.append("bad block count for {}"
                          "".format(name.decode('latin1')))
            continue
        n, addcnt = n[0], n[1:]
        n += n
        if n > len(addcnt):
            for line in block_lines:  # increment block_lines iterator
                try:
                    addcnt += list(map(int64, line.split()))
                except ValueError:
                    errors.append("bad block address for {}"
                                  "".format(name.decode('latin1')))
                    break
                if len(addcnt) >= n:
                    break
            else:
                break
        if n != len(addcnt):
            errors.append("block count address mismatch for {}"
                          "".format(name.decode('latin1')))
            continue
        addcnt = array(addcnt)
        addr, count = addcnt[0::2], addcnt[1::2]
        name = basismap.get(name, name)
        basisrecs.discard(name)
        sym = symtab.get(name)
        if not sym:
            errors.append("block for non-existent variable {}"
                          "".format(name.decode('latin1')))
            continue
        shape = sym[2]
        if not shape:
            shape = (1,)
        # Without if len test, chunk and later count become floats.
        chunk = prod(shape[1:]) if len(shape) > 1 else 1
        if ((count % chunk).any() or (count[0] < shape[0]*chunk)
                or (addr[0] != sym[0])):
            errors.append("block shape disagreement for {}"
                          "".format(name.decode('latin1')))
            continue
        count //= chunk  # number of slowest index positions
        # QnD interface wants simple list of block addresses.
        if (count == 1).all():
            # Address of each block listed separately, so addr list
            # can be used as-is.
            # OrderedDict guarantees that replacing item value does not
            # change its position in the sequence.
            symtab[name] = addr.tolist(), sym[1], shape[1:]
        else:
            # Otherwise, we need to defer converting symtab[name] to
            # blocks until we know the number of bytes per chunk.
            deferred[name] = addr, chunk, count
        name = None
    if name is not None:
        errors.append("block count address mismatch for {}"
                      "".format(name.decode('latin1')))
    # Treat basis variables marked with @history as record variables
    # even if they have no blocks.
    for name in basisrecs:
        sym = symtab.get(name)
        if sym is None:
            continue  # Impossible to get here?
        addr, typ, shape = sym
        if not isinstance(addr, list):  # Impossible to fail this test?
            addr = [addr]
        shape = shape[1:] if shape and shape[0] == 1 else shape
        symtab[name] = addr, typ, shape

    # Parse Primitive-Types extra to an OrderedDict:
    # typename --> size, order, align
    #          or  size, order, align, fpformat
    ptypes = OrderedDict()
    garbled = False
    for line in extras.pop(b'Primitive-Types', []):
        line = line.split(b'\001')[:-1]   # must end with \001
        name = line[0].strip()
        try:
            if not name or len(line) < 5:
                if not garbled:
                    errors.append("garbled line(s) in Primitive-Types extra")
                garbled = True
                raise ValueError
            size, align, order = list(map(int, line[1:4]))
            flag = line[4]
            i = 5
            if flag == b'ORDER':
                i += size
                if len(line) < i:
                    errors.append("bad ORDER in primitive {}"
                                  "".format(name.decode('latin1')))
                    raise ValueError
                order = tuple(map(int, line[5:i]))
            elif flag == b'DEFORDER':
                order = tuple(range(1, size+1) if order == 1 else
                              range(size, 0, -1))
            else:
                errors.append("unknown ORDER in primitive {}"
                              "".format(name.decode('latin1')))
                raise ValueError
            flag, line = line[i], line[i+1:]
            if flag == b'FLOAT':
                if len(line) < 8:
                    errors.append("bad FLOAT in primitive {}"
                                  "".format(name.decode('latin1')))
                    raise ValueError
                fbits = tuple(map(int, line[:8]))
                ptypes[name] = size, order, align, fbits
                line = line[8:]
            elif (flag in (b'FIX', b'NO-CONV')):
                if flag == b'NO-CONV':
                    order = ()
                ptypes[name] = size, order, align
            else:
                errors.append("unrecognized primitive type {}"
                              "".format(name.decode('latin1')))
                raise ValueError
            # line may still contain UNSGNED and ONESCMP items
            # also things like (TUPLE, float_complex, 2, -1) ?!
        except (ValueError, IndexError):
            errors.append("garbled primitive type {}"
                          "".format(name.decode('latin1')))

    # Yorick writes idiosyncratic chart and PrimitiveTypes, which is
    # important for how it interprets its own brand of pointers.
    # Yorick first writes seven primitives in order:
    #   * short integer long float double char
    # Yorick writes no other primitives in the chart; rather they are
    # written to the Primitive-Types extra.
    # This differs from the order of the standard type indices it uses
    # when writing pointer values:
    #   char short int long float double string pointer
    # Two additional yorick internal virtual primitives come after these:
    #   char * (PDB member pointer) and  char* (PDB symbol pointer)
    # These 10 types have predefined type indices 0-9 in yorick pointer
    # data.  The remaining type indices come first from the Primitive-Types
    # extra, if and only if referenced (Directory first if used), then
    # from the order of non-primitive compound types in the chart.
    # (Note that yorick's string and pointer primitives appear only if
    # used in the file.  They are aliases for the long type primitive.)
    # Yorick does not write the standard primitives in the chart, PDBLib does.
    # Yorick writes complex data as struct complex {double re, im;}.

    # We have primtypes (char, short, int long, float, double, *),
    # chartprims, and ptypes (from Primitive-Types extra).  The chartprims
    # are nearly useless, since they have no order or alignment information.

    # Yorick assumes that all chartprims are mentioned either as one of
    # the standard primtypes or in the ptypes from the Primitive-Types extra,
    # so we simply ignore them here (in yorick they can only generate errors).
    # Convert primtypes to an OrderedDict and append ptypes to it, keeping
    # the original primtypes at the beginning, and adding any additional
    # primitives from the extras in order afterwards.  The Primitive-Types
    # extra supersedes the original primtypes.
    primtypes = OrderedDict(primtypes)
    primtypes.update(ptypes)
    if not haspointers:
        string, pointer = ptypes.get(b'string'), ptypes.get(b'pointer')
        if not string and maybe_pointers & 1 and b'string' not in structs:
            primtypes[b'string'] = string = primtypes[b'long']
        if not pointer and maybe_pointers & 2 and b'pointer' not in structs:
            primtypes[b'pointer'] = pointer = primtypes[b'long']
        desc = string or pointer
        if string and pointer and string != pointer:
            desc = None
        if desc:
            # At least one of string or pointer primitive present and
            # both the same if both present.  Check that this common type
            # is identical to long primitive.
            ldesc = primtypes[b'long']
            if ldesc[0::2] == desc[0::2]:
                # Orders hard to compare, may be either permutation or flag.
                lord, pord = ldesc[1], desc[1]
                if lord == pord:
                    haspointers = 2
                elif isinstance(lord, tuple) and not isinstance(pord, tuple):
                    sord = tuple(range(1, len(lord)+1))
                    if pord == 2:
                        sord = sord[::-1]
                    elif pord != 1:
                        sord = ()
                    if sord == lord:
                        haspointers = 2
                elif isinstance(pord, tuple) and not isinstance(lord, tuple):
                    sord = tuple(range(1, len(pord)+1))
                    if lord == 2:
                        sord = sord[::-1]
                    elif lord != 1:
                        sord = ()
                    if sord == pord:
                        haspointers = 2

    _endparse(root, structal, haspointers, primtypes, structs, symtab, errors,
              deferred, recordsym)


def _endparse(root, structal, haspointers, primtypes, structs, symtab, errors,
              deferred, recordsym=None):
    """Build the structure chart and declare every symbol under root.

    This is the common final stage shared by the version-specific parsers.

    Parameters
    ----------
    root
        Root group of the file.  Its ``chart`` attribute is filled in,
        ``root.handle.zero_address()`` supplies the offset added to every
        address, and leaves are declared through ``_leaf_declare``.
    structal
        Stored unchanged as ``chart.structal``.
    haspointers
        Stored unchanged as ``chart.haspointers``.
    primtypes
        Ordered mapping typename --> primitive descriptor.  It must contain
        ``b'long'``, whose order entry becomes ``chart.byteorder``.
    structs
        Ordered mapping typename --> struct descriptor.  When `recordsym`
        is given, the ``b'__'`` entry is popped from it.
    symtab
        Ordered mapping symbolname --> (address, typename, shape).  When
        `recordsym` is given, that entry is replaced in place by one entry
        per record member.
    errors
        List of error message strings; more may be appended here.  If it is
        non-empty at the end a single warning is issued.
    deferred
        Mapping symbolname --> (addr, chunk, count) for block variables
        whose address list needs the item size of the data type.
    recordsym
        Optional name of the symtab entry holding yorick-style records.

    Raises
    ------
    IOError
        If ``chart.add_primitive`` or ``chart.add_struct`` returns None
        for any type.
    KeyError
        If `primtypes` has no ``b'long'`` entry (see TODO below).
    """
    chart = root.chart
    chart.structal = structal
    chart.haspointers = haspointers
    # TODO: In PDB-3 it is possible that primtypes does not contain a
    # definition for int, or indeed for any primitive not used in symtab.
    chart.byteorder = primtypes[b'long'][1]  # of long

    # Make sure that Directory primitive goes first if present.
    # This ensures yorick pointer type index will correspond to
    # position in chart.primitives OrderedDict.
    dirdesc = primtypes.get(b'Directory')
    if dirdesc is not None:
        chart.add_primitive(b'Directory', dirdesc)
    type_mismatch = False
    for name, desc in itemsof(primtypes):
        if chart.add_primitive(name, desc) is None:
            type_mismatch = True
    if recordsym:
        dundertype = structs.pop(b'__')  # make sure this is last
    for name, desc in itemsof(structs):
        if chart.add_struct(name, desc) is None:
            type_mismatch = True
    if recordsym:
        if chart.add_struct(b'__', dundertype) is None:
            type_mismatch = True
        else:
            # Keep the converted record type for the symtab rewrite below,
            # but do not leave it in the chart.
            dundertype = chart.structs.pop(b'__')
    if type_mismatch:
        raise IOError("data type mismatch building PDB structure chart")

    if recordsym:
        # Convert yorick default __@history style records to less
        # efficient native PDB blocks style record variables.
        raddr = symtab.pop(recordsym)[0]
        stype, members = dundertype[0], dundertype[3]
        # Names in symtab were adjusted if necessary to not conflict with
        # field names in stype.
        fields = stype.fields
        offsets = [fields[nm][1] for nm in stype.names]
        raddr = array(raddr)
        for off, (sname, (tname, shape)) in zip(offsets, itemsof(members)):
            symtab[sname] = (raddr + off).tolist(), tname, shape

    # From here on, primitives and structs are the chart's own tables.
    primitives, structs = chart.primitives, chart.structs
    undefined = set()
    groups = {b'': root}
    addr0 = root.handle.zero_address()
    for name, (addr, tname, shape) in itemsof(symtab):
        defblock = deferred.get(name)
        if name.startswith(b'/'):
            dirname, name = name.rsplit(b'/', 1)
            grp = groups.get(dirname)
            if grp is None:
                grp = _declare_group(groups, dirname)
        else:
            # Keep dirname bound so the error message below can never use
            # a stale or undefined value.
            dirname, grp = b'', root
        if tname == b'Directory':
            continue
        if not name:
            errors.append("{}/ not type Directory"
                          "".format(dirname.decode('latin1')))
            name = b'?'  # soldier on with bogus name
        dtype = primitives.get(tname)
        if dtype is None:
            dtype = structs.get(tname)
        if dtype is None and tname in (b'string', b'pointer'):
            # Work around yorick bug that sometimes forgets to write
            # Primitive-Types extras: yorick's string and pointer
            # primitives are aliases for the long primitive.
            dtype = primitives.get(b'long')
        if dtype is None:
            undefined.add(tname)
            continue
        if defblock:
            # Complicated Blocks: extra could not be processed until now
            # that we know the number of bytes per item.
            # NOTE(review): shape is not trimmed here, unlike the
            # non-deferred block case in the parsers -- confirm intended.
            addr = _make_simple_list(dtype[0].itemsize, *defblock)
        unlim = hasattr(addr, '__iter__')  # match test in PDBLeaf.__init__
        if unlim:
            addr = (array(addr) + addr0).tolist()
        else:
            addr = addr + addr0
        if not PY2:
            name = name.decode('latin1')
        stype, dtype, align, _ = dtype  # from primitives[] or structs[]
        dtype = dtype, stype, align, tname
        grp._leaf_declare(name, dtype, shape, addr)
    if undefined:
        errors.append("undefined types: {}".format(undefined))

    if errors:
        # Can filter this by module name.
        warn("{} errors parsing PDB metadata".format(len(errors)))


def _declare_group(groups, dirname):
    """Declare the group at path dirname, creating missing ancestors.

    groups maps each already declared directory path (bytes, without a
    trailing slash) to its group object; the new group is recorded there
    under dirname and returned.
    """
    parent_path, leaf = dirname.rsplit(b'/', 1)
    parent = groups.get(parent_path)
    if parent is None:
        # Parent not declared yet: declare it (and its ancestors) first.
        parent = _declare_group(groups, parent_path)
    label = leaf if PY2 else leaf.decode('latin1')
    group = parent.declare(label, dict, None)
    groups[dirname] = group
    return group


# Signatures identifying each PDB format version.  In every pattern a line
# may be terminated by \r, \n, or \037.

# PDB-I header: the magic string followed by three decimal numbers, one per
# line (platform, chart address, symtab address per the summary at the end
# of this file).
_magic1 = re.compile(br'!<><PDB><>!\s*([0-9]+)\s*[\r\n\037]'
                     br'\s*([0-9]+)\s*[\r\n\037]\s*([0-9]+)\s*[\r\n\037]')

# PDB-II header: the magic line followed by one captured byte (the count of
# single byte values that follow, per the summary at the end of this file).
_magic2 = re.compile(br'!<<PDB:II>>![\r\n\037](.)')
# Two lines, each holding two decimal numbers terminated by \001.
# NOTE(review): the meaning of the four numbers is not shown here; see the
# code that applies this pattern.
_magic2a = re.compile(br'\s*([0-9]+)\001\s*([0-9]+)\001[\r\n\037]'
                      br'\s*([0-9]+)\001\s*([0-9]+)\001[\r\n\037]')

# PDB-3: captures the structure chart and symbol table addresses, which
# precede the !<<PDB:3>>! marker.
_magic3 = re.compile(br'StructureChartAddress:\s*([0-9]+)\s*[\r\n\037]+'
                     br'SymbolTableAddress:\s*([0-9]+)\s*[\r\n\037]+'
                     br'!<<PDB:3>>![\r\n\037]+')

# Floating point format descriptors, eight integers each.
# NOTE(review): judging from the values, the fields appear to be
#   (total bits, exponent bits, mantissa bits, sign bit position,
#    exponent start bit, mantissa start bit, explicit leading mantissa
#    bit flag, exponent bias)
# -- confirm against the code that consumes these tuples.

# IEEE 754-2008 float, double, and quad precision formats
_binary32 =  ( 32,  8,  23, 0, 1,  9, 0,   127)  # noqa IEEE 754 float
_binary64 =  ( 64, 11,  52, 0, 1, 12, 0,  1023)  # noqa IEEE 754 double
_binary128 = (128, 15, 112, 0, 1, 16, 0, 16383)  # noqa IEEE 754 quadruple
# Intel 80 bit format is an example of IEEE 754-2008 binary64ext
_intel80 =   ( 80, 15,  64, 0, 1, 16, 1, 16383)  # noqa Intel 80 bit registers
_oldmac96 =  ( 96, 15,  64, 0, 1, 32, 1, 16382)  # noqa ??
# Remaining formats have different rules for Inf, Nan, and denormal
_cray64 =    ( 64, 15,  48, 0, 1, 16, 1, 16384)  # noqa Cray 1, XMP, YMP
_vaxf32 =    ( 32,  8,  23, 0, 1,  9, 0,   129)  # noqa VAX F float
_vaxd64 =    ( 64,  8,  55, 0, 1,  9, 0,   129)  # noqa VAX D double
_vaxg64 =    ( 64, 11,  52, 0, 1, 12, 0,  1025)  # noqa VAX G double
_vaxh128 =   (128, 15, 112, 0, 1, 16, 0, 16385)  # noqa VAX H quad

# legacy primitive types for PDB version 1
# Keyed by the platform number from the PDB-I header.  Each value is a
# tuple of (typename, descriptor) pairs for char, short, integer, long,
# float, double and pointer (b'*'), in that order.  Integer descriptors
# have three entries and floating point descriptors add a format tuple
# from above as a fourth.
# NOTE(review): the three leading entries look like (size, order, align),
# with order either a flag or an explicit byte permutation as in the vax
# float and double rows -- confirm against the consumer.
_1layouts = {1: ((b'char',    (1, 0, 0)), (b'short', (2, 1, 2)),   # sun3
                 (b'integer', (4, 1, 2)), (b'long',  (4, 1, 2)),
                 (b'float',  (4, 1, 2, _binary32)),
                 (b'double', (8, 1, 2, _binary64)), (b'*', (4, 1, 4))),
             2: ((b'char',    (1, 0, 0)), (b'short', (2, 2, 2)),  # ibmpc
                 (b'integer', (2, 2, 2)), (b'long',  (4, 2, 2)),
                 (b'float',  (4, 2, 2, _binary32)),
                 (b'double', (8, 2, 2, _binary64)), (b'*', (4, 2, 4))),
             3: ((b'char',    (1, 0, 0)), (b'short', (8, 1, 2)),  # cray
                 (b'integer', (8, 1, 8)), (b'long',  (8, 1, 8)),  # hybrid cc
                 (b'float',  (8, 1, 8, _cray64)),
                 (b'double', (8, 1, 8, _cray64)), (b'*', (8, 1, 8))),
             4: ((b'char',    (1, 0, 0)), (b'short', (2, 2, 1)),  # vax
                 (b'integer', (4, 2, 1)), (b'long',  (4, 2, 1)),
                 (b'float',  (4, (2, 1, 4, 3), 1, _vaxf32)),
                 (b'double', (8, (2, 1, 4, 3, 6, 5, 8, 7), 1, _vaxg64)),
                 (b'*', (4, 2, 1))),
             5: ((b'char',    (1, 0, 0)), (b'short', (2, 1, 4)),  # PDB default
                 (b'integer', (4, 1, 4)), (b'long',  (4, 1, 4)),
                 (b'float',  (4, 1, 4, _binary32)),
                 (b'double', (8, 1, 4, _binary64)), (b'*', (4, 1, 4))),
             6: ((b'char',    (1, 0, 0)), (b'short', (2, 1, 2)),  # mac2
                 (b'integer', (2, 1, 2)), (b'long',  (4, 1, 2)),
                 (b'float',  (4, 1, 2, _binary32)),
                 (b'double', (12, 1, 2, _oldmac96)), (b'*', (4, 1, 2)))}

# Though probably never actually used, PDBlib source code will accept
# ASCII 037 = US = unit separator as a newline character.
# Curiously, PDB treats DOS \r\n as a blank line following the line,
# which would break the PDB metadata.
# Group 1 is the text of one line without its terminator.
_line_re = re.compile(br'([^\r\n\037]*)[\r\n\037]')

# dim is a temporary pattern fragment (deleted below) matching an optional
# bracketed or parenthesized dimension list; its single capture group is
# the list text without the enclosing brackets.
dim = br'\s*(\d+(?::\d+)?(?:\s*,\s*\d+(?::\d+)?)*)\s*'  # n[:n][,n[:n]]*
dim = br'\s*(?:[\[(]' + dim + br'[])]\s*)?'
# Member definition: groups are type, run of '*', member name, dimensions.
_memberdef = re.compile(br'\s*([^[(* \t]+)\s((?:\s*\*)*)'
                        br'([^[(* \t]+)' + dim + br'\s*$')
# Line of \001-terminated fields: a number, a string, then an optional
# signed number and an optional unsigned number.
_ptrheader = re.compile(br'(\d+)\001([^\001]+)\001'
                        br'(?:([-+]?\d+)\001)?(?:(\d+)\001)?[\r\n\037]')

# PDB-3 metadata patterns.
# "tag: [value]" section header line; the value (group 2) is optional.
_pdb3_tagline = re.compile(br'^\s*(\S+)\s*:\s*(\S+)?\s*$')
# "name number number attributes;" -- groups 1 to 4 in that order.
_pdb3_prim = re.compile(br'^\s*(\S+)\s+(\d+)\s+(\d+)\s+([^;]+);\s*$')
# One primitive attribute.  Group 1 is the whole attribute, group 2 the
# TYPEDEF argument, group 3 the ORDER argument.  The FLOAT arguments have
# no capture group of their own.
_pdb3_pattr = re.compile(br'\s*(FIX|NO-CONV|UNSGNED|ONESCMP|'
                         br'TYPEDEF\s*\(\s*(\S+)\s*\)|'
                         br'ORDER\s*\(\s*(big|little|\d+(?:\s*,\s*\d+)*)\s*\)|'
                         br'FLOAT\s*\(\d+(?:\s*,\s*\d+){7}\s*\))')
# "name (number)" line.
_pdb3_struct = re.compile(br'^\s*(\S+)\s*\(\s*(\d+)\s*\)\s*$')
# Struct member line.  Groups: 1 opening brace, 2 type, 3 run of '*',
# 4 member name, 5 dimensions, 6 closing brace.
_pdb3_memb = re.compile(br'\s*(\{)?\s*([^ \t*]+)\s*(\*(?:\s*\*)*)?\s*'
                        br'([^[( \t<;]+)' + dim + br'(?:<-\s*\S+\s*)?'
                        br';\s*(\}\s*;\s*)?$')
# Symbol table line.  Groups: 1 type, 2 run of '*', 3 symbol name,
# 4 dimensions, 5 number after '@', 6 number in parentheses.
_pdb3_symb = re.compile(br'\s*([^ \t*]+)\s*(\*(?:\s*\*)*)?\s*([^ \t<;]+)' +
                        dim + br'@\s*(\d+)\s*\(\s*(\d+)\s*\)\s*;\s*$')
del dim
# "token number" line.
_pdb3_block = re.compile(br'^\s*(\S+)\s*(\d+)\s*$')


def _parse_shape(shape):
    if not shape:
        return ()
    dims = []
    for d in (s.split(b':') for s in shape.split(b',')):
        v, d = int(d[0]), d[1:]
        dims.append(max(int(d[0]) - v + 1, 0) if d else v)
    return tuple(dims)


def _flip_shapes(chart, symtab):
    for name in chart:
        members = chart[name][1]
        if members:
            for mname in members:
                typ, shape = members[mname]
                members[mname] = typ, shape[::-1]
    for name in symtab:
        addr, typ, shape = symtab[name]
        symtab[name] = addr, typ, shape[::-1]


def _make_simple_list(itemsize, addrs, chunk, count):
    # Convert counted list of chunk addresses into a simple address list
    # with one chunk per element.
    # The crucial feature of this algorithm is that there is no explicit
    # interpreted loop; it is entirely numpy array operations, so the
    # number of chunks can be quite large without a significant performance
    # impact.  Equivalent to:
    #   output = []
    #   for a, c in zip(addrs, count):
    #       output.extend(a + arange(c)*chunk)
    chunk *= itemsize
    n, na = count.sum(), addrs.size
    if na < n:
        indx = concatenate(([0], count))[:-1].cumsum()
        ibcast = zeros(n, int64)
        ibcast[indx] = 1
        ibcast = ibcast.cumsum() - 1
        icount = arange(n) - indx[ibcast]
        addrs = addrs[ibcast] + chunk*icount
    return addrs.tolist()


def _parse3(f, root, chart_contents, symtab_contents):
    """Parse PDB version 3 metadata and declare its contents under root.

    Parameters
    ----------
    f
        Not used by this function.
    root
        Root group of the file, passed through to ``_endparse``.
    chart_contents : bytes
        Text of the structure chart, with PrimitiveTypes, StructAlignment,
        DefaultIndexOffset, MajorOrder and CompoundTypes sections.
    symtab_contents : bytes
        Text of the symbol table, with SymbolTable and Blocks sections;
        parsing stops at a StructureChartAddress tag.

    Problems found in the metadata are collected as message strings and
    handed to ``_endparse``; malformed lines are counted and skipped
    rather than raising.
    """
    chart_contents = _line_re.finditer(chart_contents)
    symtab_contents = _line_re.finditer(symtab_contents)
    errors = []

    # Parse the version 3 structure chart.
    ptypes = OrderedDict()
    structs = OrderedDict()
    pointertypes = set()  # names of structs that contain pointers
    structal = skippedprim = skippedcomp = 0
    fortran = False
    openstruct = tag = members = None
    for line in chart_contents:
        # finditer yields match objects; group 1 is the text of the line.
        line = line.group(1)
        match = _pdb3_tagline.match(line)
        if match:
            if openstruct:
                errors.append("struct {} definition incomplete"
                              "".format(openstruct.decode('latin1')))
                openstruct = None
            tag = match.group(1)
            # Each tag is a chart section.  They should appear in this order,
            # although we do not enforce the order here.
            if tag == b'PrimitiveTypes':
                if ptypes:
                    errors.append("multiple PrimitiveTypes sections in chart")
                tag = b':'
            elif tag == b'StructAlignment':
                # The value is optional in the tag line pattern.
                structal = match.group(2) or b''
                if structal.isdigit():
                    structal = int(structal)
                else:
                    errors.append("bad StructAlignment value")
                    structal = 0
                tag = None
            elif tag == b'DefaultIndexOffset':
                # Ignore all index origin suggestions.
                tag = None
            elif tag == b'MajorOrder':
                fortran = match.group(2) == b'column'
                tag = None
            elif tag == b'CompoundTypes':
                if structs:
                    errors.append("multiple CompoundTypes sections in chart")
                tag = b''
            else:
                tag = None
            continue
        if tag is None:
            continue

        if tag == b':':
            # Parse PrimitiveTypes section.
            # typename --> size, order, align
            #          or  size, order, align, fpformat
            match = _pdb3_prim.match(line)
            if not match:
                if line.strip():
                    skippedprim += 1
                continue
            name, size, align, attribs = match.group(1, 2, 3, 4)
            size, align = int(size), int(align)
            fpbits = None
            order = 0
            isfix = False
            for attr in attribs.split(b'|'):
                match = _pdb3_pattr.match(attr)
                if not match:
                    continue  # just ignore malformed attributes
                a = match.group(1)
                if a.startswith(b'FL'):
                    # The FLOAT alternative has no capture group of its
                    # own, so take the text between the parentheses.
                    fpbits = a[a.index(b'(') + 1:a.rindex(b')')]
                    fpbits = tuple(int(p) for p in fpbits.split(b','))
                elif a.startswith(b'OR'):  # not b'O', which matches ONESCMP
                    order = match.group(3)
                    # Same flags as PDB-II: 1 is big-endian (identity
                    # permutation), 2 is little-endian (reversed).
                    if order.startswith(b'b'):
                        order = 1
                    elif order.startswith(b'l'):
                        order = 2
                    else:
                        order = tuple(int(p) for p in order.split(b','))
                        if 0 in order:  # PDB-II permutations are 1-origin
                            order = tuple(p+1 for p in order)
                elif a.startswith(b'FI'):
                    isfix = True
                # NO-CONV, UNSGNED, ONESCMP and TYPEDEF are recognized by
                # the pattern but not used here.
            if isfix and fpbits:
                fpbits = None
                errors.append("{} declared both fix and float in chart"
                              "".format(name.decode('latin1')))
            if (isfix or fpbits) and size > 1 and not order:
                errors.append("{} does not specify byte order in chart"
                              "".format(name.decode('latin1')))
            if fpbits:
                ptypes[name] = size, order, align, fpbits
            else:
                ptypes[name] = size, order, align
            continue

        # Parse CompoundTypes section.
        # typename --> size, members
        #   members is an OrderedDict, membername --> typename, shape
        if not openstruct:
            match = _pdb3_struct.match(line)
            if not match:
                if line.strip():
                    skippedcomp += 1
                continue
            openstruct, size = match.group(1, 2)
            size = int64(size)
            members = OrderedDict()
            continue
        match = _pdb3_memb.match(line)
        if not match:
            # Not a member line; count it if non-blank and keep going.
            if line.strip():
                skippedcomp += 1
            continue
        if match.group(1):  # has opening brace
            if members:
                errors.append("{} has multiple opening braces"
                              "".format(openstruct.decode('latin1')))
        elif not members:
            errors.append("{} missing open brace"
                          "".format(openstruct.decode('latin1')))
        mtype, ind, mname, shape = match.group(2, 3, 4, 5)
        if ind:
            mtype += b'*' * ind.count(b'*')
            pointertypes.add(openstruct)
        elif mtype in pointertypes:
            pointertypes.add(openstruct)
        shape = _parse_shape(shape)
        members[mname] = mtype, shape
        if match.group(6):  # has closing brace
            structs[openstruct] = size, members
            openstruct = members = None
    if openstruct:
        errors.append("struct {} definition incomplete"
                      "".format(openstruct.decode('latin1')))
    if skippedprim:
        errors.append("skipped {} lines parsing PrimitiveTypes"
                      "".format(skippedprim))
    if skippedcomp:
        errors.append("skipped {} lines parsing CompoundTypes"
                      "".format(skippedcomp))

    # Parse the symtab_contents into an OrderedDict,
    # symbolname --> address, typename, shape
    symtab = OrderedDict()
    haspointers = garbage = 0
    # Shapes are still in file order here (they are flipped at the end for
    # column major files), so the slowest index is last for fortran.
    slowest = -1 if fortran else 0
    tag = None
    openblock = blocks = nbspecs = None
    deferred = {}
    for line in symtab_contents:
        line = line.group(1)
        match = _pdb3_tagline.match(line)
        if match:
            if openblock:
                errors.append("blocks for {} incomplete"
                              "".format(openblock.decode('latin1')))
                openblock = blocks = None
            tag = match.group(1)
            if tag == b'StructureChartAddress':
                break
            if tag == b'SymbolTable':
                if symtab:
                    errors.append("multiple SymbolTable sections in symtab")
                tag = b':'
            elif tag == b'Blocks':
                if symtab:
                    tag = b'b:'
                else:
                    errors.append("Blocks before SymbolTable in symtab")
                    tag = None
            elif tag == b'Checksums':
                tag = None  # Ignore checksums for now.
            # DynamicSpaces: <n>\n      (ia_<n> is next pointer?)
            # UseItags: 1 or 0\n
            # PreviousFile: <name>\n
            else:
                tag = None
            continue
        if tag is None:
            continue

        if tag == b':':
            # Parse SymbolTable entry
            match = _pdb3_symb.match(line)
            if not match:
                if line.strip():
                    garbage += 1
                continue
            typ, ind, sname, shape, addr, size = match.group(1, 2, 3, 4, 5, 6)
            addr, size = int64(addr), int64(size)
            if not shape and size > 1:
                shape = (size,)
            else:
                shape = _parse_shape(shape)
            if prod(shape) == size:
                if ind:
                    typ += b'*' * ind.count(b'*')
                    haspointers |= 1
                elif typ in pointertypes:
                    haspointers |= 1
                symtab[sname] = addr, typ, shape
            else:
                errors.append("{} has count mismatch"
                              "".format(sname.decode('latin1')))

        elif tag == b'b:':
            # Parse next line of blocks table.
            match = _pdb3_block.match(line)
            if not match:
                if line.strip():
                    garbage += 1
                continue
            addr, count = match.group(1, 2)
            count = int64(count)
            if not openblock:
                # First line for a variable: its name and number of blocks.
                if count:
                    openblock, nbspecs = addr, count
                    blocks = []
                continue
            if not addr.isdigit():
                errors.append("garbled blocks for {}"
                              "".format(openblock.decode('latin1')))
                # Attempt to resynchronize, but probably hopelessly lost.
                openblock, nbspecs = addr, count
                blocks = []  # discard blocks of the garbled variable
                continue
            addr = int64(addr)
            blocks.append((addr, count))
            nbspecs -= 1
            if not nbspecs:
                name, openblock = openblock, None
                addr, count = array(blocks).T
                sym = symtab.get(name)
                shape = sym[2] if sym else ()
                if not shape:
                    errors.append("block shape mismatch for {}"
                                  "".format(name.decode('latin1')))
                    continue
                # All dimensions except the slowest one.
                rest = shape[:-1] if fortran else shape[1:]
                # Without the emptiness test, chunk and later count
                # become floats.
                chunk = prod(rest) if rest else 1
                if ((count % chunk).any() or (count[0] < shape[slowest]*chunk)
                        or (addr[0] != sym[0])):
                    errors.append("block shape disagreement for {}"
                                  "".format(name.decode('latin1')))
                    continue
                count //= chunk  # number of slowest index positions
                # QnD interface wants simple list of block addresses.
                if (count == 1).all():
                    # All chunk addresses given, convert symtab entry.
                    # OrderedDict guarantees that replacing item value does
                    # not change its position in the sequence.
                    symtab[name] = addr.tolist(), sym[1], rest
                else:
                    # Needs the item size, known only in _endparse.
                    deferred[name] = addr, chunk, count
    if openblock:
        errors.append("blocks for {} incomplete"
                      "".format(openblock.decode('latin1')))
    if garbage:
        errors.append("skipped {} lines parsing symtab".format(garbage))

    # PDBlib outputs symtab in random hash order.  Sort it into order of
    # increasing address, which is probably declaration order.
    symtab = list(itemsof(symtab))
    symtab.sort(key=_addrkey)
    symtab = OrderedDict(symtab)

    if fortran:
        # Shapes are in little-endian order, reverse them.
        _flip_shapes(structs, symtab)

    _endparse(root, structal, haspointers, ptypes, structs, symtab, errors,
              deferred)


def _addrkey(item):
    """Return the sort key for one ``(name, entry)`` symtab item.

    The key is the first element of ``entry``, i.e. the address slot.
    When that slot holds a list of addresses (a blocked variable whose
    entry was converted to a list of block addresses), the first address
    in the list is used; otherwise the slot value itself is the key.
    """
    location = item[1][0]
    if isinstance(location, list):
        return location[0]
    return location


# ----------------------------------------------------------------------------
# standard numpy types, present on all modern machines:

#   Array protocol (array interface) type strings:
#   (> or < depending on platform, except |i1, |u1)
# i1   i2   i4   i8
# u1   u2   u4   u8
# f4   f8
# c8   c16
#   all i and u are two's complement
#     f4 = IEEE 754-2008 binary32 format
#     f8 = IEEE 754-2008 binary64 format

#   Additionally, an unspecified binary64ext extended precision format
#   will be present, either f12 or f16, usually the 80-bit intel format
#   with 2 or 6 unused bytes.  On some non-intel platforms, the f16
#   format will be the specific IEEE 754-2008 binary128 format.
#   The only portable way to discover this type is as
# numpy.longdouble
#   On 32-bit intel machines, this will be f12, on 64 bit machines f16.


# ------------------------------------------------------- PDB metadata summary
# In PDB metadata, in all versions, newline \n may be \n, \r, or \037
# (ASCII US, the unit separator).

# PDB-I header:
# Begin with 11 bytes:
#   header[:11] = '!<><PDB><>!'
# Following this are three lines containing one decimal number each:
#   platform \n chart_address \n symtab_address \n
# The platform numbers are detailed in the _1layouts global variable above;
# all are obsolete machines unlikely to match any modern machines.

# PDB-II header:
# Header format, newline \n may be \n, \r, or \037 (ASCII US):
# Begin with 13 bytes
#   header[:13] = '!<<PDB:II>>!\n'
# Followed by single byte values as follows:
#   N = ord(header[13]) = count of single byte values
#   sP, sS, sI, sL, sF, sD = ord(header[14:20]) = type byte sizes
#   oS, oI, oL = ord(header[20:23]) = integer type byte orders
#   pF = ord(header[23:23+sF]) = float byte permutation
#   pD = ord(header[23+sF:23+sF+sD]) = double byte permutation
#   fF = ord(header[23+sF+sD:30+sF+sD])
#   fD = ord(header[30+sF+sD:37+sF+sD])
# Notes:
#   N = 24 + sF + sD
#   oS, oI, oL are 1 for big-endian, 2 for little-endian order
#   pF, pD are 1-origin big-endian byte positions
#     e.g.- pF = [4, 3, 2, 1] for little-endian, [1, 2, 3, 4] for big
#     The type alignments are not specified in this header;
#     the extras later override this header primitive specification.
#   fF, fD are floating point format [ N  e#  s#  -&  e&  s&  1? ]
# Bytes header[37+sF+sD:] are four ASCII decimal numbers organized
# in two lines (with same options for newline as header[12]):
#   biasF\001 biasD\001\n chart_address\001 symtab_address\001\n
# The biases complete the floating point formats.
# The addresses are byte addresses; the symtab always immediately
# follows the chart, and the symtab and extras table extend to the
# end of the file.
#
# PDB-II chart:
# <type>\001<nbytes>\001<type> <name> [<dimlist>]\001 ...\001\n
# ... note: primitives first, including *, with no members
# \002\n
#
# PDB-II symtab:
# <name>\001<type>\001<nitems>\001<address>\001[<origin>\001<length>\001]*\n
# \n    so \n\n marks end of symtab, beginning of extras
#
# PDB-II extras related to type interpretation
#   (primitives also listed at front of version II chart):
# Primitive-Types:\n
# <type>\001<nbytes>\001<alignment>\001<order>\001<moreorder>\001<more>\001\n
#   <order> = 1 MSB first, 2 LSB first, -1 NO-CONV or FLOAT
#   <moreorder> = DEFORDER  or  ORDER\0010\0011\0012\0013
#   <more> = NO-CONV or FIX or FLOAT\001nbits\001...bias
# \002\n
# Alignment:cpsilfd\n    alignments as 7 single bytes
# StructAlignment: <n>\n
# Offset:<n>\n    default index origin (ignore)
# MajorOrder: 101 or 102\n   (row or column = first slowest, first fastest)
# Longlong-Format-Alignment:soa\n    nbytes, order, alignment as bytes
#    order = 1 big-endian, 2 little-endian, 3 text(?), 4 external, 0 none
#  When directories initialized, creates / and /&ptrs/
#  and NO-CONV 1 byte 0 align no order Directory primitive.
#
# more PDB-II extras
# Has-Directories: 1 or 0 \n
# Version:<11 for yorick>|<date>\n     bug in some files- <date> contains \n
# Blocks:\n
# <name>\001<nblocks>\x<addr> <nitems>...\n    where \x either \n or space
# \002\n
#   -- note that nitems is multiple of declared dimensions except slowest
# Checksums:
#   ...
# Dynamic Spaces: <n>\n      count of itags (ia_<n> is next pointer?)
# Use Itags: 1 or 0\n
# Previous-File: <name>\n
#
# in PDB-II, \002 is optional at end of block
# extra-name: blah-blah \n
# extra-name:\n
# blah-blah\n
# \n
# extra-name:\n
# blah-blah\n
# \002 blah-blah \n
#
# PDB-II pointer data:
# Symbols which are pointers take no space; instead the pointee data begins
# at the symbol address.  For symbols which are arrays of structs containing
# pointers, the pointer data begins at the address immediately following the
# declared struct array, proceeding in depth-first order (that is, marching
# through the top level symbol, but interrupting with the pointees of any
# pointers encountered in the top-level pointees, and so on).  Each pointee
# is preceded by a bytes header:
# "%d\001%s\001%d\001%d\001\n", nitems, full_type, address, dataHere
#     -- a NULL pointer is represented by nitems==0, address==-1, dataHere==0
#        -- if address is -1, nitems is ignored (NULL pointer)
#        -- address is the address of the pointee header with dataHere!=0
#           not the address of the data itself (this nitems and full_type
#           are ignored, strangely)
#     -- dataHere may be missing, treated as if dataHere==1
#     -- address and dataHere may be missing, which causes header to be
#        treated as if NULL, again ignoring nitems
# If dataHere non-zero, the full_type[nitems] data begins at the byte after \n
# followed by any pointee data it may refer to.
#
# PDB-II yorick pointers:
# Pointer itself (string or general pointer) is a long containing address
# of the header.  An address <0 (-1 preferred) is a NULL pointer.
# string: Header is long byte count n, followed immediately by n data bytes.
# pointer: Header is 2+ndims longs; first two are typenumber, ndims (<=10).
#          Data follows at next data-aligned address after header.
# Typenumber is tricky; always begins with standard types char (0), short, int,
# long, float, double, followed by string, pointer, followed by char *, char*,
# followed by PrimitiveTypes actually used (beginning with Directory if used),
# followed by compound types in chart order.

# PDB-3 has no header, marked near EOF with
# StructureChartAddress: <address>\n
# SymbolTableAddress: <address>\n
# !<<PDB:3>>!\n
#
# PDB-3 chart
# PrimitiveTypes:                primitive types section first, not optional
# <type> <nbytes> <alignment> <attr1>[|<attr2>]*;\n
#   ORDER(big little or 0,1,2,3)  or  NO-CONV
#   FIX  or  FLOAT(nbits,nexp,nmant,osign,oexp,omant,leadbit,bias)
#   UNSGNED   ONESCMP  TYPEDEF(<type>)
#
# Directory 1 0 NO-CONV|FIX;        in particular
#
# StructAlignment: <n>\n
# DefaultIndexOffset: <n>\n
# MajorOrder: row or column\n   (C or Fortran, respectively)
#
# CompoundTypes:                 compound types section last, not optional
# <type> (<nbytes>)\n
# {<type> <name> [<dimlist>] [<- <cast>];\n
#  <type> <name> [<dimlist>] [<- <cast>];\n
#  <type> <name> [<dimlist>] [<- <cast>];};\n
#
# PDB-3 symtab
# SymbolTable:
# <type> <name>[<dimensions>] @ <address> (<nitems>);\n
# Directory / @ 0 (1);
# Directory /&ptrs/ @ 1 (1);
# double /&ptrs/ia_1 @ 10 (10);
#
# PDB-3 extras
# Version: <n> [(date)]\n       date is file creation date, not version date
# Version: 19 (Fri Apr 28 14:04:36 2006)
#    version 24 appeared dated 02/03/2010
# Blocks:
# <name> <nblocks>\n
#   <addr> <nitems>\n
#   ...
# Checksums:
#   ...
# DynamicSpaces: <n>\n      count of itags (ia_<n> is next pointer?)
# UseItags: 1 or 0\n
# PreviousFile: <name>\n
Source code for qnd.pdbparse

qnd

Navigation

Related Topics