# Source code for cngi.conversion.convert_table

#  CASA Next Generation Infrastructure
#  Copyright (C) 2021 AUI, Inc. Washington DC, USA
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
this module will be included in the api
"""



def convert_table(infile, outfile=None, subtable=None, keys=None, timecols=None, ignorecols=None, compressor=None, chunks=(10000, -1), append=False, nofile=False):
    """
    Convert casacore table format to xarray Dataset and zarr storage format.

    This function requires CASA6 casatools module. Table rows may be renamed or expanded to n-dim arrays based on column values specified in keys.

    Parameters
    ----------
    infile : str
        Input table filename
    outfile : str
        Output zarr filename. If None, the extension of infile (if any) is replaced with .tbl.zarr
    subtable : str
        Name of the subtable to process. If None, main table will be used
    keys : dict or str
        Source column mappings to dimensions. A dict maps source columns to target dims, use a tuple when combining cols
        (ie {('ANTENNA1','ANTENNA2'):'baseline'}) and selects the expanded-table conversion.  None or a string selects the
        simple-table conversion; note that a string value is currently not forwarded to the converter.  Default is None
    timecols : list
        list of strings specifying column names to convert to datetime format from casacore time.  Default is None
    ignorecols : list
        list of column names to ignore. This is useful if a particular column is causing errors.  Default is None
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, Blosc(cname='zstd', clevel=2, shuffle=0) is used.
    chunks : tuple of int
        Shape of desired chunking in the form of (dim0, dim1, ..., dimN), use -1 for entire axis in one chunk. Default is (10000, -1).
        Chunking is applied per column / data variable.  If too few dimensions are specified, last chunk size is reused as necessary.
        Note: chunk size is the product of the chunk dimensions, and data is batch processed by the first axis, so that will drive memory needed for conversion.
    append : bool
        Append an xarray dataset as a new partition to an existing zarr directory.  False will overwrite zarr directory with a single new partition
    nofile : bool
        Allows legacy table to be directly read without file conversion. If set to true, no output file will be written and entire table will be held in memory.
        Requires ~4x the memory of the table size.  Default is False

    Returns
    -------
    New xarray.core.dataset.Dataset
      New xarray Dataset of table data contents, as returned by the underlying table converter.
    """
    import os
    import shutil
    from numcodecs import Blosc
    import importlib_metadata
    import cngi._utils._table_conversion as tblconv

    # parse filename to use
    infile = os.path.expanduser(infile)
    if outfile is None:
        # Drop any trailing separator first so splitext sees the table name itself;
        # splitext only looks at the last path component and tolerates a missing extension.
        outfile = os.path.splitext(infile.rstrip('/'))[0] + '.tbl.zarr'
    outfile = os.path.expanduser(outfile)
    if not infile.endswith('/'):
        infile = infile + '/'
    if not outfile.endswith('/'):
        outfile = outfile + '/'
    if subtable is None:
        subtable = ''
    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    print('processing %s to %s' % (infile+subtable, outfile+subtable))

    # need to manually remove existing zarr file (if any)
    # done with shutil/os rather than a shell so paths with spaces or shell
    # metacharacters are handled literally and failures raise instead of being ignored
    if (not nofile) and (not append):
        if os.path.isdir(outfile):
            shutil.rmtree(outfile)
        os.makedirs(outfile, exist_ok=True)

    # the converters expect lists, not None
    timecols = [] if timecols is None else timecols
    ignorecols = [] if ignorecols is None else ignorecols

    if (keys is None) or isinstance(keys, str):
        # NOTE(review): the row-dimension rename (rowdim='d0' if keys is None else keys)
        # is disabled, so a string value of keys has no effect here.
        xds = tblconv.convert_simple_table(infile, outfile,
                                           subtable=subtable,
                                           timecols=timecols,
                                           ignore=ignorecols,
                                           compressor=compressor,
                                           chunks=chunks,
                                           nofile=nofile)
    else:
        xds = tblconv.convert_expanded_table(infile, outfile,
                                             keys=keys,
                                             subtable=subtable,
                                             subsel=None,
                                             timecols=timecols,
                                             dimnames={},
                                             ignore=ignorecols,
                                             compressor=compressor,
                                             chunks=chunks,
                                             nofile=nofile)

    # write sw version that did this conversion to zarr directory
    # skipped for nofile: nothing is to be written and the directory may not exist
    if not nofile:
        # NOTE(review): 'cngi-protoype' looks like a typo of 'cngi-prototype'; the
        # written text is left unchanged in case existing readers match on it.
        with open(os.path.join(outfile, '.version'), 'w') as fid:
            fid.write('cngi-protoype ' + importlib_metadata.version('cngi-prototype') + '\n')

    return xds