# Source code for cngi._utils._mxds_ops

#  CASA Next Generation Infrastructure
#  Copyright (C) 2021 AUI, Inc. Washington DC, USA
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.

#################################
# Helper File
#
# Not exposed in API
#
#################################

import xarray as xr
import typing

def get_subtable_primary_key_names(xds: xr.Dataset, subtable_name: str) -> typing.List[str]:
    """Find the coordinates that are probably primary keys in the given subtable.

    A coordinate is considered a probable primary key when its name contains
    both the subtable name and the substring "_id". Both checks are
    case-insensitive.

    Parameters
    ----------
    xds: xarray.Dataset
        The subtable whose coordinates are inspected.
    subtable_name: str
        The name of the subtable.

    Returns
    -------
    list[str]
        The matching coordinate names, in the order they appear in ``xds.coords``.
    """
    # The lowered subtable name does not change between coordinates.
    needle = subtable_name.lower()
    return [
        coord_name
        for coord_name in xds.coords
        if needle in coord_name.lower() and "_id" in coord_name.lower()
    ]

def get_subtables(mxds: xr.Dataset) -> typing.List[str]:
    """Find the attributes that are probably subtables in the given mxds.

    An attribute is reported as a subtable when its name does not contain
    "xds" (those are treated as visibility tables) and its value is an
    ``xarray.Dataset``.

    Parameters
    ----------
    mxds: xarray.Dataset
        The multi-visibility dataset whose ``attrs`` are inspected.

    Returns
    -------
    list[str]
        The names of the subtable attributes, in ``mxds.attrs`` order.
    """
    return [
        name
        for name, value in mxds.attrs.items()
        # skip visibility tables, then anything that isn't a dataset
        if "xds" not in name and isinstance(value, xr.Dataset)
    ]

def assign_dimensions_for_primary_coordinates(sub0: xr.Dataset, sub1: xr.Dataset, subtable_name: str) -> typing.Tuple[xr.Dataset, xr.Dataset]:
    """Assign the primary key coordinates as dimensions for those that don't have dimension coordinates.

    The primary key coordinates shared by both subtables (and not already a
    dimension in either) are validated against ``sub0``. Those that pass have
    their underlying dimension renamed to the coordinate name in both subtables.
    Rejected coordinates are reported with a printed warning, except for the
    expected non-unique ``feed_id`` of the "FEED" subtable, which is skipped
    silently.

    Parameters
    ----------
    sub0: xarray.Dataset
        The first subtable.
    sub1: xarray.Dataset
        The second subtable.
    subtable_name: str
        The name of the subtable, used to find the primary key coordinates.

    Returns
    -------
    tuple[xarray.Dataset, xarray.Dataset]
        The (possibly rebuilt) versions of ``sub0`` and ``sub1``, in that order.
    """
    import re
    import numpy as np

    func_name = assign_dimensions_for_primary_coordinates.__name__

    def _warn(coord, reason):
        # Single place for the warning text so all rejections read the same.
        print(f"Warning: can't assign primary key coordinate {coord} as dimension coordinate in {func_name}: {reason}")

    # get the shared primary key coordinates that aren't dimension coordinates
    prime0 = get_subtable_primary_key_names(sub0, subtable_name)
    prime1 = set(get_subtable_primary_key_names(sub1, subtable_name))
    shared_primes = [
        name
        for name in prime0  # keeps the order of sub0's coordinates
        if name in prime1 and name not in sub0.dims and name not in sub1.dims
    ]

    # verify that the primary key coordinates have a single dimension
    # NOTE(review): the anonymous-dimension and uniqueness checks below only
    # inspect sub0; sub1 is assumed to have the same layout -- confirm with callers.
    anon = re.compile("d[0-9]")
    good_shared_primes = []
    for coord in shared_primes:
        if (len(sub0[coord].dims) != 1) or (len(sub1[coord].dims) != 1):
            _warn(coord, "too many dimensions.")
            continue
        first_dim = sub0[coord].dims[0]
        if not anon.match(first_dim):
            _warn(coord, f"first dimension {first_dim} is not anonymous.")
            continue
        values = sub0[coord].values
        if len(np.unique(values)) != len(values):
            if (subtable_name != "FEED") or (coord != "feed_id"): # feed_id in most cases should NOT be unique
                _warn(coord, "coordinate does not have all-unique values.")
            continue
        good_shared_primes.append(coord)

    def _promote(sub):
        # Rename the dimension under each validated primary key to the key's name.
        for coord in good_shared_primes:
            first_dim = sub[coord].dims[0]
            if (coord in sub.dims) or (first_dim in sub.coords):
                continue
            sub = sub.rename({first_dim: coord}) # replace the non-dimensional coordinate with the primary key name
            # Renaming a dimension to the same name as a coordinate does not do whatever happens internally to mark that coordinate as a dimensional coordinate.
            # You wouldn't know this based off of print(sub), because the print statement adds a "*" next to the coordinate name.
            # Symptom: sub0.broadcast_like(sub1) with different values in their "coord" coordinate will not broadcast.
            # Example: change the values of "source_id" in the "SOURCE" subtable after renaming, here, then try to broadcast against each other.
            # Workaround: create a new dataset with all the same values. This corrects whatever issues there are internally.
            # TODO find encapsulated use case and submit bug report to xarray
            sub = xr.Dataset(coords=sub.coords, data_vars=sub.data_vars, attrs=sub.attrs)
        return sub

    # assign dimension coordinates
    return (_promote(sub0), _promote(sub1))

def check_mxds_subtable_ref_ids(mxds: xr.Dataset, onerror: str = "print_warning") -> typing.List[str]:
    """!!!COMPUTES!!! verifies that the references to primary keys between subtables exist.

    For every pair of distinct subtables (sub0, sub1), each coordinate or data
    variable of sub1 whose name looks like a reference to sub0 has its values
    compared against the matching primary key coordinate of sub0.

    Parameters
    ----------
    mxds: xarray.Dataset
        The multi-visibility dataset containing subtables to check.
    onerror: str
        What to do when bad values are found. Options are "print_warning", "raise", or
        any other value to do nothing but return the subtable in the returned list.

    Returns
    -------
    list[str]
        The sorted list of subtables with bad reference values. Empty if no bad values are found.

    Raises
    ------
    ValueError
        On the first bad reference value found, if ``onerror`` is "raise".
    """
    import numpy as np
    bad_subtables = set()

    subtable_names = get_subtables(mxds)
    for subtable_name0 in subtable_names:
        # name fragment expected inside reference names ("spectral_window" is abbreviated to "spw")
        # NOTE(review): the primary keys below are looked up with the unabbreviated
        # subtable name -- confirm the two conventions agree for SPECTRAL_WINDOW.
        sub_pkname0 = subtable_name0.lower().replace("spectral_window", "spw")
        sub0 = mxds.attrs[subtable_name0]

        # get the subtable primary "key" id(s)
        primary_key_names = get_subtable_primary_key_names(sub0, subtable_name0)

        # check against references in other subtables
        for subtable_name1 in subtable_names:
            # don't compare to itself
            if subtable_name0 == subtable_name1:
                continue
            sub1 = mxds.attrs[subtable_name1]

            # check coordinates and data_vars for references
            cdnames = list(sub1.coords) + list(sub1.data_vars) # type: typing.List[str]
            for cdname in cdnames:
                cdlower = cdname.lower()
                # only match against primary "key" references
                if (sub_pkname0 not in cdlower) or ("_id" not in cdlower):
                    continue

                # unique referenced values; computed lazily and at most once per cdname
                ref_vals = None

                # compare the values in sub1[cdname] to the similar primary key from sub0
                for pkname in primary_key_names:
                    pklower = pkname.lower()
                    if (pklower not in cdlower) and (cdlower not in pklower):
                        continue

                    if ref_vals is None:
                        ref_vals = np.unique(sub1[cdname].values)

                    # check all values for existence in one vectorized pass
                    pk_vals = sub0.coords[pkname].values
                    missing = ref_vals[~np.isin(ref_vals, pk_vals)]
                    for v in missing:
                        # value is not in the primary key: not ok
                        bad_subtables.add(subtable_name1)
                        msg = f"Warning: reference value {v} in subtable {subtable_name1} does not exist in {subtable_name0}.{pkname}!"
                        if onerror == "print_warning":
                            print(msg)
                        elif onerror == "raise":
                            raise ValueError(msg)

    # done checking reference values, return the results
    return sorted(bad_subtables)