Source code for cngi.conversion.convert_image

#  CASA Next Generation Infrastructure
#  Copyright (C) 2021 AUI, Inc. Washington DC, USA
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
this module will be included in the api
"""


##########################################
[docs]def convert_image(infile, outfile=None, artifacts=[], compressor=None, chunks=(-1, -1, 1, 1)):
    """
    Convert legacy CASA or FITS format Image to xarray Image Dataset and zarr storage format

    This function requires CASA6 casatools module.

    Parameters
    ----------
    infile : str
        Input image filename (.image or .fits format). If taylor terms are present, they should be in the form of filename.image.tt0 and
        this infile string should be filename.image
    outfile : str
        Output zarr filename. If None, will use infile name with .img.zarr extension
    artifacts : list of str
        List of other image artifacts to include if present with infile. Use None for just the specified infile.
        Default [] uses ``['mask','model','pb','psf','residual','sumwt','weight']``
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None the zstd compression algorithm used with compression level 2.
    chunks: 4-D tuple of ints
        Shape of desired chunking in the form of (l, m, channels, polarization), use -1 for entire axis in one chunk. Default is (-1, -1, 1, 1)
        Note: chunk size is the product of the four numbers (up to the actual size of the dimension)

    Returns
    -------
    xarray.core.dataset.Dataset
        new xarray Datasets of Image data contents
    """
    from casatools import image as ia
    from casatools import quanta as qa
    from cngi._utils._table_conversion import convert_simple_table, convert_time
    import numpy as np
    from itertools import cycle
    import importlib_metadata
    import xarray
    from numcodecs import Blosc
    import time, os, warnings, re
    warnings.simplefilter("ignore", category=FutureWarning)  # suppress noisy warnings about bool types

    # TODO - find and save projection type

    infile = os.path.expanduser(infile[:-1]) if infile.endswith('/') else os.path.expanduser(infile)
    infile = infile if infile.startswith('/') else './' + infile
    prefix = infile[:infile.rindex('.')]
    suffix = infile[infile.rindex('.') + 1:]
    srcdir = infile[:infile.rindex('/') + 1]
    if outfile == None: outfile = prefix + '.img.zarr'
    outfile = os.path.expanduser(outfile)

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    tmp = os.system("rm -fr " + outfile)
    tmp = os.system("mkdir " + outfile)

    IA = ia()
    QA = qa()
    begin = time.time()
    
    # all image artifacts will go in same zarr file and share common dimensions if possible
    # check for meta data compatibility
    # store necessary coordinate conversion data
    if artifacts is None: artifacts = [suffix]
    elif len(artifacts) == 0: artifacts = ['image', 'pb', 'psf', 'residual', 'mask', 'model', 'sumwt', 'weight', 'image.pbcor']
    if suffix not in artifacts: artifacts = [suffix] + artifacts
    diftypes, mxds, artifact_dims, artifact_masks = [], xarray.Dataset(), {}, {}
    ttcount = 0

    # for each image artifact, determine what image files in the source directory are compatible with each other
    # extract the metadata from each
    # if taylor terms are present for the artifact, process metadata for first one only
    print("converting Image...")
    dirlist = sorted([srcdir+ff for ff in os.listdir(srcdir) if (srcdir+ff).startswith(prefix)])
    for imtype in artifacts:
        imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s$'%(prefix, imtype), ff))>0]
        if len(imagelist)==0: imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s\.tt\d+$'%(prefix, imtype), ff))>0]
        if (len(imagelist)==0) and (len(imtype.split('.'))>1):
            imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s\.tt\d\.%s$'%(prefix, imtype.split('.')[0], imtype.split('.')[1]), ff))>0]
        if len(imagelist) == 0: continue

        # find number of taylor terms for this artifact and update count for total set if necessary
        ttcount = len(imagelist) if ttcount == 0 else ttcount

        rc = IA.open(imagelist[0])
        csys = IA.coordsys()
        summary = IA.summary(list=False)  # imhead would be better but chokes on big images
        ims = IA.shape()  # image shape
        coord_names = [ss.replace(' ', '_').lower().replace('stokes', 'pol').replace('frequency', 'chan') for ss in summary['axisnames']]
        
        # compute world coordinates for spherical dimensions
        sphr_dims = [dd for dd in range(len(ims)) if QA.isangle(summary['axisunits'][dd])]
        coord_idxs = np.mgrid[[range(ims[dd]) if dd in sphr_dims else range(1) for dd in range(len(ims))]].reshape(len(ims), -1)
        coord_world = csys.toworldmany(coord_idxs.astype(float))['numeric'][sphr_dims].reshape((-1,)+tuple(ims[sphr_dims]))
        coords = dict([(coord_names[dd], (['l','m'], coord_world[di])) for di, dd in enumerate(sphr_dims)])
        if imtype == 'sumwt': coords = {}   # special case, force sumwt to only cartesian coords (chan, pol)
        
        # compute world coordinates for cartesian dimensions
        cart_dims = [dd for dd in range(len(ims)) if dd not in sphr_dims]
        coord_idxs = np.mgrid[[range(ims[dd]) if dd in cart_dims else range(1) for dd in range(len(ims))]].reshape(len(ims), -1)
        coord_world = csys.toworldmany(coord_idxs.astype(float))['numeric'][cart_dims].reshape((-1,)+tuple(ims[cart_dims]))
        for dd, cs in enumerate(list(coord_world)):
            spi = tuple([slice(None) if di == dd else slice(1) for di in range(cs.ndim)])
            coords.update(dict([(coord_names[cart_dims[dd]], cs[spi].reshape(-1))]))

        # compute the time coordinate
        dtime = csys.torecord()['obsdate']['m0']['value']
        if csys.torecord()['obsdate']['m0']['unit'] == 'd': dtime = dtime * 86400
        coords['time'] = convert_time([dtime])

        # assign values to l, m coords based on incr and refpix in metadata
        if ('incr' in summary) and ('refpix' in summary) and ('shape' in summary) and (imtype != 'sumwt'):
            coords['l'] = np.arange(-summary['refpix'][0], summary['shape'][0]-summary['refpix'][0]) * summary['incr'][0]
            coords['m'] = np.arange(-summary['refpix'][1], summary['shape'][1]-summary['refpix'][1]) * summary['incr'][1]

        # check to see if this image artifact is of a compatible shape to be part of the image artifact dataset
        try:  # easiest to try to merge and let xarray figure it out
            mxds = mxds.merge(xarray.Dataset(coords=coords), compat='equals')
        except Exception:
            diftypes += [imtype]
            continue

        # store rest of image metadata as attributes (if not already in the xds
        omits = list(mxds.attrs.keys())
        omits += ['hasmask', 'masks', 'defaultmask', 'ndim', 'refpix', 'refval', 'shape', 'tileshape', 'messages']
        nested = [kk for kk in summary.keys() if isinstance(summary[kk], dict)]
        mxds = mxds.assign_attrs(dict([(kk.lower(), summary[kk]) for kk in summary.keys() if kk not in omits + nested]))
        artifact_dims[imtype] = [ss.replace('right_ascension', 'l').replace('declination', 'm') for ss in coord_names]
        artifact_masks[imtype] = summary['masks']
        
        # manually swap known meta attrs that deal with dim/coord units
        if ('axisnames' in summary) and ('axisnames' not in omits):
            mxds = mxds.assign_attrs({'axisnames':list(mxds.axisnames[:2]) + ['Time'] + list(mxds.axisnames[2:][::-1])})
        if ('axisunits' in summary) and ('axisunits' not in omits):
            mxds = mxds.assign_attrs({'axisunits':list(mxds.axisunits[:2]) + ['datetime64[ns]'] + list(mxds.axisunits[2:][::-1])})

        # check for common and restoring beams
        rb = IA.restoringbeam()
        if (len(rb) > 0) and ('restoringbeam' not in mxds.attrs):
            # if there is a restoring beam, this should work
            cb = IA.commonbeam()
            mxds = mxds.assign_attrs({'commonbeam': [cb['major']['value'], cb['minor']['value'], cb['pa']['value']]})
            mxds = mxds.assign_attrs({'commonbeam_units': [cb['major']['unit'], cb['minor']['unit'], cb['pa']['unit']]})
            mxds = mxds.assign_attrs({'restoringbeam': [cb['major']['value'], cb['minor']['value'], cb['pa']['value']]})
            if 'beams' in rb:
                beams = np.array([[rbs['major']['value'], rbs['minor']['value'], rbs['positionangle']['value']]
                                                          for rbc in rb['beams'].values() for rbs in rbc.values()])
                mxds = mxds.assign_attrs({'perplanebeams':beams.reshape(len(rb['beams']),-1,3)})
                
        # parse messages for additional keys, drop duplicate info
        omits = list(mxds.attrs.keys()) + ['image_name', 'image_type', 'image_quantity', 'pixel_mask(s)', 'region(s)', 'image_units']
        for msg in summary['messages']:
            line = [tuple(kk.split(':')) for kk in msg.lower().split('\n') if ': ' in kk]
            line = [(kk[0].strip().replace(' ', '_'), kk[1].strip()) for kk in line]
            line = [ll for ll in line if ll[0] not in omits]
            mxds = mxds.assign_attrs(dict(line))
                
        rc = csys.done()
        rc = IA.close()

    if len(diftypes) > 0: print('incompatible components: ', diftypes)

    # if taylor terms are present, the chan axis must be expanded to the length of the terms
    if ttcount > len(mxds.chan): mxds = mxds.pad({'chan': (0, ttcount-len(mxds.chan))}, mode='edge')
   
    
    chunk_dict = dict(zip(['l','m','time','chan','pol'], chunks[:2]+(1,)+chunks[2:]))
    mxds = mxds.chunk(chunk_dict)

    # for each artifact, convert the legacy format and add to the new image set
    # masks may be stored within each image, so they will need to be handled like subtables
    for ac, imtype in enumerate(list(artifact_dims.keys())):
        for ec, ext in enumerate([''] + ['/'+ff for ff in list(artifact_masks[imtype])]):
            imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s$' % (prefix, imtype), ff)) > 0]
            if len(imagelist) == 0: imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s\.tt\d+$' % (prefix, imtype), ff)) > 0]
            if (len(imagelist) == 0) and (len(imtype.split('.')) > 1):
                imagelist = [ff for ff in dirlist if len(re.findall('%s\.%s\.tt\d\.%s$' % (prefix, imtype.split('.')[0], imtype.split('.')[1]), ff)) > 0]
            if len(imagelist) == 0: continue

            dimorder = ['time'] + list(reversed(artifact_dims[imtype]))
            chunkorder = [chunk_dict[vv] for vv in dimorder]
            ixds = convert_simple_table(imagelist[0]+ext, outfile+'.temp', dimnames=dimorder, compressor=compressor, chunks=chunkorder)

            # if the image set has taylor terms, loop through any for this artifact and concat together
            # pad the chan dim as necessary to fill remaining elements if not enough taylor terms in this artifact
            for ii in range(1, ttcount):
                if (ii < len(imagelist)) and os.path.exists(imagelist[ii] + ext):
                    txds = convert_simple_table(imagelist[ii]+ext, outfile + '.temp', dimnames=dimorder, chunks=chunkorder, nofile=True)
                    ixds = xarray.concat([ixds, txds], dim='chan')
                else:
                    ixds = ixds.pad({'chan': (0, 1)}, constant_values=np.nan)

            ixds = ixds.rename({list(ixds.data_vars)[0]:(imtype.replace('.','_')+ext.replace('/','_')).upper()}).transpose('l','m','time','chan','pol')
            if imtype == 'sumwt': ixds = ixds.squeeze(['l','m'], drop=True)
            if imtype == 'mask': ixds = ixds.rename({'MASK':'AUTOMASK'})  # rename mask

            encoding = dict(zip(list(ixds.data_vars), cycle([{'compressor': compressor}])))
            ixds.to_zarr(outfile, mode='w' if (ac==0) and (ec==0) else 'a', encoding=encoding, compute=True, consolidated=True)

    tmp = os.system("rm -fr " + outfile+'.temp')
    print('processed image in %s seconds' % str(np.float32(time.time() - begin)))

    # add attributes from metadata and version tag file
    mxds.to_zarr(outfile, mode='a', compute=True, consolidated=True)
    try:
        version = importlib_metadata.version('cngi-prototype')
    except:
        version = '0.0.0'

    with open(outfile + '/.version', 'w') as fid:   # write sw version that did this conversion to zarr directory
        fid.write('cngi-protoype ' + version + '\n')

    return xarray.open_zarr(outfile)