Source code for cngi._utils._mxds_ops

#  CASA Next Generation Infrastructure
#  Copyright (C) 2021 AUI, Inc. Washington DC, USA
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.

#################################
# Helper File
#
# Not exposed in API
#
#################################

import xarray as xr
import typing

def get_subtable_primary_key_names(xds: xr.Dataset, subtable_name : str) -> typing.List[str]:
    """Find the coordinates that are probably primary keys in the given subtable.

    A coordinate is reported when its name, compared case-insensitively,
    contains both the subtable name and the substring ``"_id"``.

    Parameters
    ----------
    xds: xarray.Dataset
        The subtable whose coordinate names are inspected.
    subtable_name: str
        The name of the subtable (matched case-insensitively).

    Returns
    -------
    list[str]
        The matching coordinate names, in the order ``xds.coords`` yields them.
        Empty if nothing matches.
    """
    wanted = subtable_name.lower()
    return [
        name
        for name in xds.coords
        if wanted in name.lower() and "_id" in name.lower()
    ]
def get_subtables(mxds : xr.Dataset) -> typing.List[str]:
    """Find the attributes that are probably subtables in the given mxds.

    An attribute is reported when its name does not contain ``"xds"``
    (those are treated as visibility tables) and its value is an
    ``xarray.Dataset``.

    Parameters
    ----------
    mxds: xarray.Dataset
        The multi-visibility dataset whose ``attrs`` are inspected.

    Returns
    -------
    list[str]
        The names of the subtable attributes, in ``mxds.attrs`` order.
    """
    return [
        attr_name
        for attr_name, attr_value in mxds.attrs.items()
        # name test first: visibility tables are rejected before the type check
        if "xds" not in attr_name and isinstance(attr_value, xr.Dataset)
    ]
def assign_dimensions_for_primary_coordinates(sub0 : xr.Dataset, sub1 : xr.Dataset, subtable_name : str) -> typing.Tuple[xr.Dataset, xr.Dataset]:
    """Assign the shared primary key coordinates as dimension coordinates.

    Every primary key coordinate (as found by
    ``get_subtable_primary_key_names``) that exists in both datasets and is
    not yet a dimension in either one is a candidate. For each accepted
    candidate, the single dimension it lies on is renamed to the
    coordinate's own name, in each dataset.

    A candidate is rejected when:

    * it does not have exactly one dimension in ``sub0`` or in ``sub1``;
    * its dimension in ``sub0`` does not start with ``d<digit>``
      (i.e. does not look like an anonymous dimension);
    * its values in ``sub0`` are not all unique.

    Each rejection prints a warning, except the non-unique case for the
    ``feed_id`` coordinate of the ``FEED`` subtable.

    Parameters
    ----------
    sub0: xarray.Dataset
        First version of the subtable.
    sub1: xarray.Dataset
        Second version of the subtable.
    subtable_name: str
        Name of the subtable, used to recognise primary key coordinates.

    Returns
    -------
    tuple[xarray.Dataset, xarray.Dataset]
        ``(sub0, sub1)`` with the accepted coordinates promoted to dimensions.
    """
    import re
    import numpy as np

    def _warn(coord_name, reason):
        print(f"Warning: can't assign primary key coordinate {coord_name} as dimension coordinate in {assign_dimensions_for_primary_coordinates.__name__}: {reason}")

    # primary keys present in both subtables that are not already dimensions
    keys0 = get_subtable_primary_key_names(sub0, subtable_name)
    keys1 = get_subtable_primary_key_names(sub1, subtable_name)
    candidates = [
        key
        for key in keys0
        if key in keys1 and key not in sub0.dims and key not in sub1.dims
    ]

    # keep only the candidates that can safely become a dimension
    anonymous_dim = re.compile("d[0-9]")
    promotable = []
    for coord in candidates:
        if (len(sub0[coord].dims) != 1) or (len(sub1[coord].dims) != 1):
            _warn(coord, "too many dimensions.")
            continue

        first_dim = sub0[coord].dims[0]
        if anonymous_dim.match(first_dim) is None:
            _warn(coord, f"first dimension {first_dim} is not anonymous.")
            continue

        # NOTE(review): only sub0's values are checked for uniqueness.
        values0 = sub0[coord].values
        if len(np.unique(values0)) != len(values0):
            # feed_id in most cases should NOT be unique, so stay quiet for it.
            # NOTE(review): duplicates always disqualify the coordinate; the
            # FEED/feed_id exemption only silences the warning -- confirm.
            if not (subtable_name == "FEED" and coord == "feed_id"):
                _warn(coord, "coordinate does not have all-unique values.")
            continue

        promotable.append(coord)

    # rename the anonymous dimension to the primary key name in each dataset
    promoted = []
    for sub in (sub0, sub1):
        for coord in promotable:
            first_dim = sub[coord].dims[0]
            if (coord in sub.dims) or (first_dim in sub.coords):
                continue

            # replace the non-dimensional coordinate with the primary key name
            sub = sub.rename({first_dim: coord})

            # Renaming a dimension to the name of an existing coordinate does
            # not fully mark that coordinate as a dimension coordinate, even
            # though print(sub) shows a "*" next to it.
            # Symptom: sub0.broadcast_like(sub1) does not broadcast when the
            # two datasets hold different values in that coordinate.
            # Workaround: rebuild the dataset from its own parts.
            # TODO find encapsulated use case and submit bug report to xarray
            # NOTE(review): rebuild is assumed to belong to the rename branch.
            sub = xr.Dataset(coords=sub.coords, data_vars=sub.data_vars, attrs=sub.attrs)
        promoted.append(sub)

    return (promoted[0], promoted[1])
def check_mxds_subtable_ref_ids(mxds : xr.Dataset, onerror="print_warning"):
    """!!!COMPUTES!!! Verify that the references to primary keys between subtables exist.

    For every pair of distinct subtables ``(A, B)``, each coordinate or data
    variable of ``B`` whose name contains ``"_id"`` and the (lower-cased,
    ``spectral_window`` -> ``spw``) name of ``A`` is treated as a reference
    into ``A``. Its unique values are compared against every primary key of
    ``A`` whose name contains, or is contained in, the reference name.

    Parameters
    ----------
    mxds: xarray.Dataset
        The multi-visibility dataset containing subtables to check.
    onerror: str
        What to do when bad values are found. Options are "print_warning",
        "raise", or any other value to do nothing but return the subtable in
        the returned list.

    Returns
    -------
    list[str]
        The sorted, de-duplicated names of the subtables holding bad
        reference values. Empty if no bad values are found.

    Raises
    ------
    ValueError
        On the first bad reference value, when ``onerror == "raise"``.
    """
    import numpy as np

    bad_subtables = set()
    subtable_names = get_subtables(mxds)

    for subtable_name0 in subtable_names:
        # NOTE(review): the "spw" alias is applied to the reference-name filter
        # only; the primary key lookup and the name comparison below use the
        # un-aliased names, so "spw_id"-style references presumably never get
        # compared to a "spectral_window_*" primary key -- confirm intent.
        sub_pkname0 = subtable_name0.lower().replace("spectral_window", "spw")
        sub0 = mxds.attrs[subtable_name0]

        # get the subtable primary "key" id(s)
        primary_key_names = get_subtable_primary_key_names(sub0, subtable_name0)
        if not primary_key_names:
            # nothing can reference this subtable
            continue

        # values of each primary key, loaded at most once and only when needed
        pk_values = {}

        # check against references in other subtables
        for subtable_name1 in subtable_names:
            # don't compare to itself
            if subtable_name0 == subtable_name1:
                continue
            sub1 = mxds.attrs[subtable_name1]

            # check coordinates and data_vars for references
            cdnames = list(sub1.coords) + list(sub1.data_vars)  # type: typing.List[str]
            for cdname in cdnames:
                cdname_lower = cdname.lower()

                # only match against primary "key" references
                if sub_pkname0 not in cdname_lower or "_id" not in cdname_lower:
                    continue

                # unique reference values, computed on the first matching key
                vals = None

                # compare the values in sub1[cdname] to the similar primary key from sub0
                for pkname in primary_key_names:
                    pkname_lower = pkname.lower()
                    if (pkname_lower not in cdname_lower) and (cdname_lower not in pkname_lower):
                        continue

                    if vals is None:
                        source = sub1.coords if cdname in sub1.coords else sub1.data_vars
                        vals = np.unique(source[cdname].values)
                    if pkname not in pk_values:
                        pk_values[pkname] = sub0.coords[pkname].values
                    known = pk_values[pkname]

                    # check all values for existence
                    for v in vals:
                        if v in known:
                            # value is in the primary key: ok
                            continue

                        # value is not in the primary key: not ok
                        bad_subtables.add(subtable_name1)
                        msg = f"Warning: reference value {v} in subtable {subtable_name1} does not exist in {subtable_name0}.{pkname}!"
                        if onerror == "print_warning":
                            print(msg)
                        elif onerror == "raise":
                            raise ValueError(msg)

    # done checking reference values, return the results as plain Python strings
    return sorted(bad_subtables)