Source code for pysatModels.utils.match

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2022, pysat development team
# Full license can be found in License.md
# -----------------------------------------------------------------------------
"""Routines to match modelled and observational data."""

import datetime as dt
import numpy as np
import pandas as pds

import pysat

import pysatModels
from pysatModels.utils.convert import load_model_xarray
from pysatModels.utils import extract


def collect_inst_model_pairs(start, stop, tinc, inst, inst_download_kwargs=None,
                             model_load_rout=load_model_xarray,
                             model_load_kwargs=None, inst_clean_rout=None,
                             inst_lon_name=None, mod_lon_name=None,
                             lon_pos='end', inst_name=None, mod_name=None,
                             mod_datetime_name=None, mod_time_name=None,
                             mod_units=None, sel_name=None, time_method='min',
                             pair_method='closest', method='linear',
                             model_label='model', comp_clean='clean'):
    """Pair instrument and model data.

    Parameters
    ----------
    start : dt.datetime
        Starting datetime
    stop : dt.datetime
        Ending datetime
    tinc : dt.timedelta
        Time increment for model files
    inst : pysat.Instrument
        Instrument object for which modelled data will be extracted
    inst_download_kwargs : dict or NoneType
        Optional keyword arguments for downloading instrument data
        (default=None)
    model_load_rout : func
        Routine to load model data into an xarray using datetime as an
        argument input and other necessary data as keyword arguments.  If the
        routine requires a time-dependent filename, ensure that the load
        routine uses the datetime input to construct the correct filename, as
        done in load_model_xarray. (default=load_model_xarray)
    model_load_kwargs : dict or NoneType
        Keyword arguments for the model loading routine. (default=None)
    inst_clean_rout : func
        Routine to clean the instrument data. (default=None)
    inst_lon_name : str
        Variable name for instrument longitude
    mod_lon_name : str
        Variable name for model longitude
    lon_pos : str or int
        Accepts a zero-offset integer for list order or 'end' (default='end')
    inst_name : list or NoneType
        List of names of the data series to use for determining instrument
        location. (default=None)
    mod_name : list or NoneType
        List of names of the data series to use for determining model
        locations, in the same order as inst_name.  These must make up a
        regular grid. (default=None)
    mod_datetime_name : str
        Name of the data series in the model Dataset containing datetime info
    mod_time_name : str
        Name of the time coordinate in the model Dataset
    mod_units : list or NoneType
        Units for each of the mod_name location attributes.  Currently
        supports: rad/radian(s), deg/degree(s), h/hr(s)/hour(s), m, km, and
        cm. (default=None)
    sel_name : list or NoneType
        List of names of modelled data indices to append to the instrument
        object, or None to append all modelled data (default=None)
    time_method : str
        Pair data using the larger ('max') or smaller ('min') of the smallest
        instrument/model time increments (default='min')
    pair_method : str
        Find all relevant pairs ('all') or just the closest pairs ('closest').
        (default='closest')
    method : str
        Interpolation method.  Supported are 'linear', 'nearest', and
        'splinef2d'.  The last is only supported for 2D data and is not
        recommended here. (default='linear')
    model_label : str
        Name of model, used to identify interpolated data values in the
        instrument (default="model")
    comp_clean : str
        Clean level for the comparison data ('clean', 'dusty', 'dirty',
        'none') (default='clean')

    Returns
    -------
    matched_inst : pysat.Instrument
        Instrument object with observational data from `inst` and paired
        modelled data.

    Raises
    ------
    ValueError
        If input is incorrect

    Note
    ----
    Perform the data cleaning after finding the times and locations where the
    observations and model align.

    """
    # Initialize the output
    matched_inst = None

    # Test the input
    if inst_lon_name is None:
        raise ValueError('Need longitude name for instrument data')

    if mod_lon_name is None:
        raise ValueError('Need longitude name for model data')

    if mod_datetime_name is None:
        raise ValueError('Need datetime coordinate name for model data')

    if mod_time_name is None:
        raise ValueError('Need time coordinate name for model data')

    if inst_name is None or len(inst_name) == 0:
        estr = 'Must provide instrument location attribute names as a list'
        raise ValueError(estr)

    if mod_name is None:
        estr = 'Must provide model location attribute names as a list'
        raise ValueError(estr)

    if mod_units is None:
        raise ValueError('Must provide model units as a list')

    if len(inst_name) != len(mod_name):
        estr = ''.join(['Must provide the same number of instrument and ',
                        'model location attribute names as a list'])
        raise ValueError(estr)

    if len(mod_name) != len(mod_units):
        raise ValueError(''.join(['Must provide units for each model location',
                                  ' attribute']))

    if inst_clean_rout is None:
        raise ValueError('Need routine to clean the instrument data')

    if inst_download_kwargs is None:
        inst_download_kwargs = {}

    if model_load_kwargs is None:
        model_load_kwargs = {}

    skip_download = False
    if "skip_download" in inst_download_kwargs.keys():
        skip_download = inst_download_kwargs['skip_download']
        del inst_download_kwargs['skip_download']

    # Download the instrument data, if needed and wanted
    if not skip_download and (stop
                              - start).days != len(inst.files[start:stop]):
        missing_times = [tt for tt in pds.date_range(start, stop, freq='1D',
                                                     closed='left')
                         if tt not in inst.files[start:stop].index]
        for tt in missing_times:
            inst.download(start=tt, stop=tt + pds.DateOffset(days=1),
                          **inst_download_kwargs)

    # Cycle through the times, loading the model and instrument data as needed
    istart = start
    inst_lon_adjust = True
    inst_dims = []
    while start < stop:
        # Load the model data for each time
        try:
            mdata = model_load_rout(start, **model_load_kwargs)
        except (IOError, ValueError) as err:
            pysatModels.logger.info(
                'unable to load model data at {:}\n{:}'.format(start, err))
            mdata = None

        if mdata is not None:
            # Get the range for model longitude, if it has not already been set
            if inst_lon_adjust:
                if mod_lon_name in mdata.coords:
                    lon_high = float(mdata.coords[mod_lon_name].max())
                    lon_low = float(mdata.coords[mod_lon_name].min())
                elif mod_lon_name in mdata.data_vars:
                    lon_high = float(np.nanmax(mdata.data_vars[mod_lon_name]))
                    lon_low = float(np.nanmin(mdata.data_vars[mod_lon_name]))
                else:
                    raise ValueError("".join(["unknown name for model ",
                                              "longitude: ", mod_lon_name]))

                if lon_high > 180.0 and lon_low < 0.0:
                    raise ValueError("unexpected longitude range")
                elif lon_high > 180.0 or lon_low >= 0.0:
                    lon_low = 0.0
                    lon_high = 360.0
                else:
                    lon_low = -180.0
                    lon_high = 180.0

                # Set the range of the instrument longitude
                inst.custom_attach(pysat.utils.coords.update_longitude,
                                   kwargs={'low': lon_low,
                                           'lon_name': inst_lon_name,
                                           'high': lon_high})
                inst.load(date=istart)

                # Set flag to false now that the range has been set
                inst_lon_adjust = False

            # Load the instrument data, if needed
            if inst.empty or inst.index[-1] < istart:
                inst.load(date=istart)

            if not inst.empty and np.any(inst.index >= istart):
                added_names = extract.extract_modelled_observations(
                    inst=inst, model=mdata, inst_name=inst_name,
                    mod_name=mod_name, mod_datetime_name=mod_datetime_name,
                    mod_time_name=mod_time_name, mod_units=mod_units,
                    sel_name=sel_name, time_method=time_method,
                    method=method, pair_method=pair_method,
                    model_label=model_label)

                if len(added_names) > 0:
                    # Clean the instrument data
                    inst.clean_level = comp_clean
                    inst_clean_rout(inst)

                    check_name = "_".join([model_label, mod_datetime_name])
                    im = list()
                    imbase = None
                    for aname in added_names:
                        if aname == check_name:
                            # There is a baseline for the names
                            imbase = np.where(
                                np.isfinite(inst[check_name].values))

                        # Determine the number of good points for this data
                        imnew = np.where(np.isfinite(inst[aname].values))

                        # Some data types are higher dimensions than others,
                        # make sure we end up choosing a high dimension one
                        # so that we don't accidentally throw away paired data
                        if len(im) == 0 or len(im[0]) < len(imnew[0]):
                            im = imnew

                    # Check the data against the baseline
                    if imbase is not None:
                        if len(im[0]) > len(imbase[0]):
                            ikeep = [i for i, ind in enumerate(im[0])
                                     if ind in imbase[0]]
                            im = [imnew[ikeep] for imnew in list(im)]

                    # If the data is 1D, save it as a list instead of a tuple
                    if len(im) == 1:
                        im = im[0]
                    else:
                        # If the dimension data hasn't been set yet, do it here
                        if len(inst_dims) == 0:
                            inst_dims = [inst.index.name]
                            inst_dims.extend([dd for dd
                                              in inst.data.dims.keys()
                                              if dd != inst.index.name])
                        im = {kk: np.unique(im[i])
                              for i, kk in enumerate(inst_dims)}

                    # Save the clean, matched data
                    if matched_inst is None:
                        matched_inst = inst.copy()
                        matched_inst.data = inst[im]
                    else:
                        matched_inst.concat_data(inst[im])

                    # Reset the clean flag
                    inst.clean_level = 'none'

        # Cycle the times
        if tinc.total_seconds() <= 86400.0:
            start += tinc
            if start + tinc > istart + dt.timedelta(days=1):
                istart += dt.timedelta(days=1)
        else:
            if start + tinc >= istart + dt.timedelta(days=1):
                istart += dt.timedelta(days=1)

            if istart >= start + tinc:
                start += tinc

    return matched_inst
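
A minimal usage sketch follows; it is not part of the module source. It assumes an instrument already registered with pysat, a cleaning routine, and a model file whose name load_model_xarray can build from the requested datetime (the 'filename' keyword and file pattern are assumptions). The platform, coordinate names, and units below are hypothetical placeholders.

    import datetime as dt

    import pysat

    from pysatModels.utils.match import collect_inst_model_pairs

    # Hypothetical instrument; substitute a platform/name registered with pysat
    inst = pysat.Instrument(platform='platform', name='name')

    # Placeholder cleaning routine; a real one should act on the Instrument data
    def clean_inst(inst):
        return

    matched = collect_inst_model_pairs(
        start=dt.datetime(2020, 1, 1), stop=dt.datetime(2020, 1, 3),
        tinc=dt.timedelta(days=1), inst=inst,
        model_load_kwargs={'filename': 'model_%Y%j.nc'},  # assumed file pattern
        inst_clean_rout=clean_inst,
        inst_lon_name='glon', mod_lon_name='glon',
        inst_name=['glon', 'glat', 'alt'], mod_name=['glon', 'glat', 'alt'],
        mod_datetime_name='datetime', mod_time_name='time',
        mod_units=['deg', 'deg', 'km'], model_label='model')

The returned matched Instrument holds the observational data at the paired times plus the interpolated model values, each labelled with the model_label prefix.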