# Source code for pyjams.ncio.netcdfio

#!/usr/bin/env python
"""
netCDF4 functions to copy a netcdf file while doing some transformations on
variables and dimensions.

This module was written by Matthias Cuntz while at Institut National de
Recherche pour l'Agriculture, l'Alimentation et l'Environnement (INRAE), Nancy,
France.

It borrows the idea of get_variable_definition from the netcdf4 thin layer of
David Schaefer.

:copyright: Copyright 2020-2022 Matthias Cuntz, see AUTHORS.rst for details.
:license: MIT License, see LICENSE for details.

.. moduleauthor:: Matthias Cuntz

The following functions are provided

.. autosummary::
   copy_dimensions
   copy_file
   copy_global_attributes
   copy_variables
   create_new_variable
   create_variables
   get_fill_value_for_dtype
   get_variable_definition
   set_output_filename

History
    * Written Apr 2020 by Matthias Cuntz (mc (at) macu (dot) de)
    * Added get_fill_value_for_dtype, Mar 2021, Matthias Cuntz
    * Added create_new_variable, Mar 2021, Matthias Cuntz
    * flake8 compatible, Mar 2021, Matthias Cuntz
    * Remove keyword in copy_global_attributes, Mar 2021, Matthias Cuntz
    * Add timedim in create_variables, Dec 2021, Matthias Cuntz
    * Rename functions, e.g. create_dimensions -> copy_dimensions,
      May 2022, Matthias Cuntz
    * Add copy_variables, May 2022, Matthias Cuntz
    * Delete unnecessary HDF5 filters in variable definition for compatibility
      with netcdf4 > 1.6.0, Jun 2022, Matthias Cuntz
    * Make get_variable_definition public, Apr 2023, Matthias Cuntz
    * get_variable_definition returns _FillValue and only then missing_value,
      Jul 2023, Matthias Cuntz
    * create_variables sets missing_value attribute if present even if used for
      _FillValue, Jul 2023, Matthias Cuntz
    * addglobalatt in copy_file, Dec 2023, Matthias Cuntz

"""
import numpy as np
import netCDF4 as nc
from warnings import warn, filterwarnings
filterwarnings("default", category=DeprecationWarning)


__all__ = ['copy_dimensions', 'copy_file', 'copy_global_attributes',
           'copy_variables', 'create_new_variable', 'create_variables',
           'get_fill_value_for_dtype', 'get_variable_definition',
           'set_output_filename']


def _tolist(arg):
    """
    Assure that *arg* is a list, e.g. if string or None are given.

    Parameters
    ----------
    arg :
        Argument to make list

    Returns
    -------
    list
        list(arg)

    Examples
    --------
    >>> _tolist('string')
    ['string']
    >>> _tolist([1,2,3])
    [1, 2, 3]
    >>> _tolist(None)
    [None]

    """
    if isinstance(arg, str):
        return [arg]
    try:
        return list(arg)
    except TypeError:
        return [arg]


def get_variable_definition(ncvar):
    """
    Collect information on input variable.

    Parameters
    ----------
    ncvar : netcdf4 variable
        Variable of input file

    Returns
    -------
    dict
        Containing information on input variable with key/value pairs.
        The following keys are always returned:
        'name', 'dtype', 'dimensions', 'fill_value', 'chunksizes'.
        'fill_value' is the attribute `_FillValue` if present, otherwise the
        attribute `missing_value` if present, otherwise None.
        'chunksizes' is the list returned by ``ncvar.chunking()`` or None if
        the variable has no chunking or ``chunking()`` returns a string.
        The entries of ``ncvar.filters()`` are added as well: 'zlib' is
        kept whenever it is reported, all other filter entries only if their
        value is truthy.

    Examples
    --------
    .. code-block:: python

       get_variable_definition(fi.variables['GPP'])

    """
    # Ask for the filters only once; the result can be None or empty.
    filters = ncvar.filters()
    if not filters:
        filters = {}
    # Drop HDF5 filters that are False, e.g. because the plugin is not
    # available. zlib is always available.
    # Necessary for netcdf4 > 1.6.0 because it gives all possible filters,
    # installed or not, i.e. 'zlib', 'szip', 'zstd', 'bzip2', 'blosc',
    # 'shuffle', 'complevel', 'fletcher32'
    # A new dictionary is built so that the object returned by filters()
    # is not altered.
    out = {key: value for key, value in filters.items()
           if (key == 'zlib') or value}

    # names of methods and attributes of the variable, determined only once
    names = dir(ncvar)

    # chunksizes
    chunks = None
    if "chunking" in names:
        chunking = ncvar.chunking()
        # chunking() can give a string instead of the sizes, which cannot
        # be used as chunksizes
        if (chunking is not None) and (not isinstance(chunking, str)):
            chunks = list(chunking)

    # missing value: _FillValue has precedence over missing_value
    if "_FillValue" in names:
        ifill = ncvar._FillValue
    elif "missing_value" in names:
        ifill = ncvar.missing_value
    else:
        ifill = None

    # output variable
    out.update({
        "name": ncvar.name,
        "dtype": ncvar.dtype,
        "dimensions": list(ncvar.dimensions),
        "fill_value": ifill,
        "chunksizes": chunks,
    })

    return out
def copy_file(ifile, ofile, timedim='time',
              removevar=[], renamevar={}, replacevar={}, replaceatt={},
              addglobalatt={}, noclose=False):
    """
    Copy variables from input file into output file.

    Parameters
    ----------
    ifile : str
        File name of netcdf input file
    ofile : str
        File name of netcdf output file
    timedim : str, optional
        Name of time dimension in input file (default: 'time').
    removevar : list of str, optional
        Do not copy variables given in *removevar* to output file.
    renamevar : dict, optional
        Copy variables from input file with different name in output file.
        Variable names in input file are given as dictionary keys,
        corresponding variable names of output file are given as dictionary
        values.
    replacevar : dict, optional
        Replace existing variables with variables in dictionary.
        Variable names in input file are given as dictionary keys,
        dictionary values are also dictionaries where keys are the output
        variable name and values are the output variable values.
    replaceatt : dict, optional
        Replace or set attributes of variables in dictionary keys
        (case sensitive). Dictionary values are also dictionaries with
        {'attribute_name': attribute_value}. Dictionary keys are the names
        of the output variables after renaming and replacing.
    addglobalatt : dict, optional
        Create or add to global file attributes. dict values will be given
        to attributes given in dict keys. Attributes will be created if they
        do not exist yet. A 'history' entry is added if the dictionary has
        none.
    noclose : bool, optional
        Return file handle of opened output file for further manipulation
        if True (default: False)

    Returns
    -------
    nothing or file_handle
        The output file will have the altered or unaltered variables
        copied from the input file.

    Notes
    -----
    The dictionaries and lists passed as arguments are not modified.
    The input file is closed in any case. The output file is closed as well
    if an error occurs while it is written.

    Examples
    --------
    .. code-block:: python

       ovar = np.arange(100)
       copy_file('in.nc', 'out.nc',
                 renamevar={'lon': 'longitude'},
                 replacevar={'var1': {'arange': ovar}},
                 replaceatt={'arange': {'long_name': 'A range',
                                        'unit': '-'}})

    """
    import time as ptime

    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    fi = nc.Dataset(ifile, 'r')
    try:
        if 'file_format' in dir(fi):
            fo = nc.Dataset(ofile, 'w', format=fi.file_format)
        else:  # pragma: no cover
            fo = nc.Dataset(ofile, 'w', format='NETCDF4')
        try:
            # meta data
            # Work on a copy: adding 'history' to the argument itself would
            # alter the dictionary of the caller or, even worse, the shared
            # default dictionary so that later calls would reuse the
            # history of the first call.
            globalatt = dict(addglobalatt)
            if 'history' not in globalatt:
                # NOTE(review): the entry names copy_variables although it
                # is written by copy_file; left unchanged so that the
                # content of output files stays the same
                globalatt['history'] = (ptime.asctime() + ': '
                                        + 'copy_variables(' + ifile + ', '
                                        + ofile + ')')
            copy_global_attributes(fi, fo, add=globalatt)

            # copy dimensions
            copy_dimensions(fi, fo)

            # create variables
            # rename replace variables as well:
            # xreplacevar is {name in input file: name in output file}
            xreplacevar = {}
            for rr in replacevar:
                # popitem on a copy so that replacevar stays intact
                xreplacevar[rr] = replacevar[rr].copy().popitem()[0]
            arenamevar = renamevar.copy()
            arenamevar.update(xreplacevar)
            # create static variables (independent of time)
            create_variables(fi, fo, time=False, timedim=timedim, fill=True,
                             removevar=removevar, renamevar=arenamevar)
            # create dynamic variables (time dependent)
            create_variables(fi, fo, time=True, timedim=timedim, fill=True,
                             removevar=removevar, renamevar=arenamevar)

            # set extra attributes
            for rr in replaceatt:
                ovar = fo.variables[rr]
                att = replaceatt[rr]
                for aa in att:
                    ovar.setncattr(aa, att[aa])

            # copy variables
            # do not copy replace variables
            aremovevar = _tolist(removevar)
            aremovevar.extend(xreplacevar.keys())
            copy_variables(fi, fo, time=False, timedim=timedim,
                           removevar=aremovevar, renamevar=renamevar)
            copy_variables(fi, fo, time=True, timedim=timedim,
                           removevar=aremovevar, renamevar=renamevar)

            # set replace variables
            for rr in replacevar:
                oname, oval = replacevar[rr].copy().popitem()
                ovar = fo.variables[oname]
                ovar[:] = oval
        except BaseException:
            # do not leave a dangling handle on a half-written output file
            fo.close()
            raise
    finally:
        fi.close()

    if noclose:
        return fo
    fo.close()
    return
def copy_dimensions(fi, fo, removedim=[], renamedim={}, changedim={},
                    adddim={}):
    """
    Create dimensions in output file from dimensions in input file.

    Parameters
    ----------
    fi : file_handle
        File handle of opened netcdf input file
    fo : file_handle
        File handle of opened netcdf output file
    removedim : list of str, optional
        Do not create dimensions given in *removedim* in output file.
    renamedim : dict, optional
        Rename dimensions in output file compared to input file.
        Dimension names in input file are given as dictionary keys,
        corresponding dimension names of output file are given as dictionary
        values.
    changedim : dict, optional
        Change the size of the output dimension compared to the input file.
        Dimension names (as in the input file) are given as dictionary keys,
        corresponding dimension sizes are given as dictionary values.
    adddim : dict, optional
        Add dimension to output file.
        New dimension names are given as dictionary keys and new dimension
        sizes are given as dictionary values. Names that already exist in
        the output file are skipped.

    Returns
    -------
    nothing
        The output file will have the altered and unaltered dimensions
        of the input file.

    Examples
    --------
    .. code-block:: python

       copy_dimensions(fi, fo, removedim=['patch'],
                       renamedim={'x': 'lon', 'y': 'lat'},
                       changedim={'mland': 1})

    """
    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    skip = _tolist(removedim)
    for dim in fi.dimensions.values():
        if dim.name in skip:
            continue
        # size: explicit new size, unlimited (None), or size in input file
        if dim.name in changedim:
            size = changedim[dim.name]
        elif dim.isunlimited():
            size = None
        else:
            size = len(dim)
        # name in output file, possibly renamed
        fo.createDimension(renamedim.get(dim.name, dim.name), size)

    # extra dimensions that are not yet in the output file
    for newname, newsize in adddim.items():
        if newname not in fo.dimensions:
            fo.createDimension(newname, newsize)

    return
def copy_global_attributes(fi, fo, add={}, remove=[]):
    """
    Create global output file attributes from input global file attributes.

    Parameters
    ----------
    fi : file_handle
        File handle of opened netcdf input file
    fo : file_handle
        File handle of opened netcdf output file
    add : dict, optional
        dict values will be given to attributes given in dict keys.
        If the attribute exists in the input file, the value is appended to
        the existing attribute after a newline. Attributes will be created
        if they do not exist yet in the input file.
    remove : list, optional
        Do not create global attributes given in *remove* in the output file.

    Returns
    -------
    nothing
        Output will have global file attributes

    Examples
    --------
    .. code-block:: python

       copy_global_attributes(
           fi, fo, add={'history': time.asctime()+': '+' '.join(sys.argv)})

    """
    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    # attributes of the input file, possibly extended
    for key in fi.ncattrs():
        if key in remove:
            continue
        value = fi.getncattr(key)
        if key in add:
            value += '\n' + add[key]
        fo.setncattr(key, value)

    # attributes that are unknown to the input file
    for key, value in add.items():
        if key not in fi.ncattrs():
            fo.setncattr(key, value)

    return
def copy_variables(fi, fo, time=None, timedim='time', removevar=[],
                   renamevar={}):
    """
    Copy variables from input file into output file.

    The variables must already exist in the output file under their output
    names; only the values are copied here.

    Parameters
    ----------
    fi : file_handle
        File handle of opened netcdf input file
    fo : file_handle
        File handle of opened netcdf output file
    time : None or bool, optional
        None:  copy all variables (default).

        True:  copy only variables having dimension *timedim*.

        False: copy only variables that do not have dimension *timedim*.
    timedim : str, optional
        Name of time dimension (default: 'time').
        If the input file has no dimension *timedim*, *time* is taken as
        False, i.e. all variables are treated as not time-dependent.
    removevar : list of str, optional
        Do not copy variables given in *removevar* to output file.
    renamevar : dict, optional
        Copy variables from input file with different name in output file.
        Variable names in input file are given as dictionary keys,
        corresponding variable names of output file are given as dictionary
        values.

    Returns
    -------
    nothing
        The output file will have the altered or unaltered variables
        copied from the input file.

    Examples
    --------
    .. code-block:: python

       copy_variables(fi, fo, renamevar={'lon': 'longitude'})

    """
    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    removevar = _tolist(removevar)

    # just copy all variables if no time dimension or
    # time dimension not known
    # Could be improved by looking for the unlimited dimension
    if timedim in fi.dimensions:
        ntime = fi.dimensions[timedim].size
    else:
        time = False
        ntime = 0

    # which kinds of variables are requested
    want_dynamic = bool(time) or (time is None)
    want_static = not time

    # sort variables into time-dependent and time-independent ones
    dynamic = []
    static = []
    for ivar in fi.variables.values():
        if ivar.name in removevar:
            continue
        if timedim in ivar.dimensions:
            if want_dynamic:
                dynamic.append(ivar)
        elif want_static:
            static.append(ivar)

    # copy static variables (not time-dependent) in one go
    for ivar in static:
        ovar = fo.variables[renamevar.get(ivar.name, ivar.name)]
        ovar[:] = ivar[:]

    # copy dynamic variables (time-dependent) index by index along the
    # first axis
    # NOTE(review): this presumes that timedim is the first dimension of
    # these variables - confirm for the files at hand
    for tt in range(ntime):
        for ivar in dynamic:
            ovar = fo.variables[renamevar.get(ivar.name, ivar.name)]
            if ivar.ndim == 1:
                ovar[tt] = ivar[tt]
            else:
                ovar[tt, ...] = ivar[tt, ...]

    return
def create_new_variable(invardef, fo, izip=False, fill=None, chunksizes=True):
    """
    Create variable in output file from dictionary with variable attributes.

    Parameters
    ----------
    invardef : dict
        Dictionary with name and dtype plus further attributes used in
        netCDF4.Dataset.createVariable; all other entries are set as variable
        attributes:
        'dimensions', 'zlib', 'complevel', 'shuffle', 'fletcher32',
        'contiguous', 'chunksizes', 'endian', 'least_significant_digit',
        'fill_value', 'chunk_cache'.
        The dictionary is not modified so that it can be reused.
    fo : file_handle
        File handle of opened netcdf output file
    izip : bool, optional
        True: the data will be compressed in the netCDF file using gzip
        compression independent of 'zlib' entry in input dictionary
        *invardef* (default: False).
    fill : float, bool or None, optional
        Determine the behaviour if variable has no _FillValue or
        missing_value, i.e. if *invardef* has no 'fill_value' or it is None.

        If None or False: no _FillValue will be set.

        If True: _FillValue will be set to default value of the Python
        netCDF4 package for this type.

        If number: _FillValue will be set to number.
    chunksizes : bool, optional
        True: include possible chunksizes in output file (default).

        False: do not include chunksize information from input file in
        output file, even if given in input dictionary *invardef*.

    Returns
    -------
    variable handle
        Handle to newly created variable in output file.

    Raises
    ------
    AssertionError
        If 'name' or 'dtype' are missing in *invardef*.

    Examples
    --------
    .. code-block:: python

       nvar = {'name': 'new_field',
               'dtype': np.dtype(np.float64),
               'dimensions': ('time', 'y', 'x'),
               'units': 'kg/m2/s',
               }
       ovar = create_new_variable(nvar, fo, fill=True, izip=True)

    """
    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    assert 'name' in invardef, 'name not in input dictionary'
    assert 'dtype' in invardef, 'dtype not in input dictionary'

    # Entries are popped from a shallow copy so that the dictionary of the
    # caller keeps 'name', 'dtype', etc. and can be used again.
    attributes = dict(invardef)
    varname = attributes.pop("name")
    datatype = attributes.pop("dtype")

    # keyword arguments of createVariable; what remains in attributes
    # afterwards are the variable attributes
    nckwargs = ['dimensions', 'zlib', 'complevel', 'shuffle', 'fletcher32',
                'contiguous', 'chunksizes', 'endian',
                'least_significant_digit', 'fill_value', 'chunk_cache']
    ncdict = {nn: attributes.pop(nn) for nn in nckwargs if nn in attributes}
    if izip:
        ncdict['zlib'] = True
    if not chunksizes:
        ncdict.pop('chunksizes', None)
    ncdict.setdefault('fill_value', None)

    # set _FillValue if None
    # NOTE(review): a falsy number such as fill=0 is treated like False here,
    # i.e. no _FillValue is set
    if (ncdict['fill_value'] is None) and fill:
        if isinstance(fill, bool):
            ncdict['fill_value'] = get_fill_value_for_dtype(datatype)
        else:
            ncdict['fill_value'] = fill

    # create variable
    ovar = fo.createVariable(varname, datatype, **ncdict)

    # all other dict entries are attributes
    for k in attributes:
        ovar.setncattr(k, attributes[k])

    return ovar
def create_variables(fi, fo, time=None, timedim='time', izip=False,
                     fill=None, chunksizes=True, removevar=[],
                     renamevar={}, removedim=[], renamedim={},
                     replacedim={}):
    """
    Create variables in output from variables in input file.

    Only the variable definitions and their attributes are created; no
    values are written.

    Parameters
    ----------
    fi : file_handle
        File handle of opened netcdf input file
    fo : file_handle
        File handle of opened netcdf output file
    time : None or bool, optional
        None:  create all variables (default).

        True:  create only variables having dimension *timedim*.

        False: create only variables that do not have dimension *timedim*.
    timedim : str, optional
        Name of time dimension (default: 'time').
    izip : bool, optional
        True: the data will be compressed in the netCDF file using gzip
        compression (default: False).
    fill : float, bool or None, optional
        Determine the behaviour if variable have no _FillValue or
        missing_value.

        If None or False: no _FillValue will be set.

        If True: _FillValue will be set to default value of the Python
        package netCDF4 for this type.

        If number: _FillValue will be set to number.
    chunksizes : bool, optional
        True: include possible chunksizes in output file (default).

        False: do not include chunksize information from input file in
        output file. Set to False, for example, if dimension size gets
        changed because the chunksize on a dimension can not be greater
        than the dimension size.
    removevar : list of str, optional
        Do not create variables given in *removevar* in output file.
    renamevar : dict, optional
        Rename variables in output file compared to input file.
        Variable names in input file are given as dictionary keys,
        corresponding variable names of output file are given as dictionary
        values.
    removedim : list of str, optional
        Remove dimensions from variable definitions in output file.
    renamedim : dict, optional
        Rename dimensions for variables in output file.
        Dimension names in input file are given as dictionary keys,
        corresponding dimension names of output file are given as dictionary
        values.
    replacedim : dict, optional
        Replace dimensions for variables in output file.
        Dimension names in input file are given as dictionary keys,
        corresponding dimension names of output file are given as dictionary
        values. The output names can be tuples or lists to extend dimensions
        of a variable.

    Returns
    -------
    nothing
        The output file will have the altered or unaltered variables
        of the input file defined.

    Examples
    --------
    .. code-block:: python

       create_variables(fi, fo, fill=True, izip=True, removedim=['patch'],
                        renamevar={'lon': 'longitude'},
                        replacedim={'land': ('y', 'x')})

    """
    warn('The netcdf functions of pyjams were transferred to a standalone'
         ' repository "ncio". ncio is hence deprecated in pyjams.',
         category=DeprecationWarning)

    removevar = _tolist(removevar)
    removedim = _tolist(removedim)

    for ivar in fi.variables.values():
        # skip variable if in removevar
        if ivar.name in removevar:
            continue

        # skip variable if its time dependence is not the requested one
        if time is None:
            selected = True
        elif time:
            selected = timedim in ivar.dimensions
        else:
            selected = timedim not in ivar.dimensions
        if not selected:
            continue

        vardef = get_variable_definition(ivar)
        if izip:
            vardef['zlib'] = True

        # rename variable if in renamevar
        if ivar.name in renamevar:
            vardef['name'] = renamevar[ivar.name]

        dims = vardef['dimensions']
        chunks = vardef['chunksizes']

        # remove dimension if in removedim,
        # together with its entry in the chunk sizes
        for dd in removedim:
            if dd in dims:
                ip = dims.index(dd)
                dims.remove(dd)
                if chunks:
                    _ = chunks.pop(ip)

        # rename dimension if in renamedim
        for dd in renamedim:
            if dd in dims:
                dims[dims.index(dd)] = renamedim[dd]

        # replace dimensions if in replacedim;
        # one dimension can be replaced by several dimensions
        for dd in replacedim:
            if dd in dims:
                rdim = _tolist(replacedim[dd])
                ip = dims.index(dd)
                dims = dims[:ip] + rdim + dims[ip + 1:]
                if chunks:
                    # use fo not fi because new dims perhaps
                    # not yet in fi
                    rchunk = [len(fo.dimensions[cc]) for cc in rdim]
                    chunks = chunks[:ip] + rchunk + chunks[ip + 1:]

        vardef['dimensions'] = dims
        vardef['chunksizes'] = chunks

        # set _FillValue if None
        if (vardef['fill_value'] is None) and fill:
            if isinstance(fill, bool):
                vardef['fill_value'] = get_fill_value_for_dtype(
                    vardef['dtype'])
            else:
                vardef['fill_value'] = fill

        # exclude chunksizes
        if not chunksizes:
            _ = vardef.pop('chunksizes')

        # everything left in vardef is a keyword of createVariable
        oname = vardef.pop("name")
        otype = vardef.pop("dtype")
        ovar = fo.createVariable(oname, otype, **vardef)

        # copy the attributes; _FillValue was already given at creation
        for key in ivar.ncattrs():
            value = ivar.getncattr(key)
            if key != '_FillValue':
                ovar.setncattr(key, value)

    return
def get_fill_value_for_dtype(dtype):
    """
    Get default _FillValue of netCDF4 for the given data type.

    Signed and unsigned integers of 1, 2, 4 and 8 bytes as well as floats
    of 4 and 8 bytes are known. A warning is issued for any other type.

    Parameters
    ----------
    dtype : np.dtype
        numpy data type

    Returns
    -------
    default _FillValue of given numpy data type,
    None if the data type is not known

    Examples
    --------
    .. code-block:: python

       fill_value = get_fill_value_for_dtype(var.dtype)

    """
    # numpy type and the corresponding key in netCDF4.default_fillvals
    known = ((np.int8, 'i1'), (np.int16, 'i2'),
             (np.int32, 'i4'), (np.int64, 'i8'),
             (np.uint8, 'u1'), (np.uint16, 'u2'),
             (np.uint32, 'u4'), (np.uint64, 'u8'),
             (np.float32, 'f4'), (np.float64, 'f8'))
    for nptype, code in known:
        if dtype == np.dtype(nptype):
            return nc.default_fillvals[code]

    warn("data type unknown: " + str(dtype))
    return None
def set_output_filename(ifile, ext):
    """
    Create output file name from input file name by adding *ext* before
    the file suffix.

    The suffix is the part of the file name (without directories) from its
    last dot onwards; a leading dot of the file name does not start a
    suffix. *ext* is appended to the end if the file name has no suffix.

    Parameters
    ----------
    ifile : str
        input file name
    ext : str
        string to add before file suffix

    Returns
    -------
    str
        output filename with ext before file suffix

    Examples
    --------
    >>> set_output_filename('in.nc', '-no_patch')
    'in-no_patch.nc'
    >>> set_output_filename('in.nc', '.nop')
    'in.nop.nc'
    >>> set_output_filename('run.v1/in', '-no_patch')
    'run.v1/in-no_patch'

    """
    import os

    # splitext looks only at the last path component so that dots in
    # directory names are left alone and a missing suffix is no error
    root, suffix = os.path.splitext(ifile)
    return root + ext + suffix
if __name__ == '__main__':
    from doctest import NORMALIZE_WHITESPACE, testmod

    # run the examples in the docstrings of this module
    testmod(optionflags=NORMALIZE_WHITESPACE)