# Source code for py21cmfast.cache_tools

"""A set of tools for reading/writing/querying the in-built cache."""
import glob
import h5py
import logging
import os
import re
from os import path

from . import outputs, wrapper
from ._cfg import config
from .wrapper import global_params

# Module-level logger. It is registered under the fixed name "21cmFAST"
# rather than ``__name__``.
logger = logging.getLogger("21cmFAST")


def readbox(
    *,
    direc=None,
    fname=None,
    hsh=None,
    kind=None,
    seed=None,
    redshift=None,
    load_data=True,
):
    """
    Read in a data set and return an appropriate object for it.

    Parameters
    ----------
    direc : str, optional
        The directory in which to search for the boxes. By default, this is the
        centrally-managed directory, given by the ``config.yml`` in ``~/.21cmfast/``.
    fname: str, optional
        The filename (without directory) of the data set. If given, this will be
        preferentially used, and must exist.
    hsh: str, optional
        The md5 hsh of the object desired to be read. Required if `fname` not given.
    kind: str, optional
        The kind of dataset, eg. "InitialConditions". Will be the name of a class
        defined in :mod:`~wrapper`. Required if `fname` not given.
    seed: str or int, optional
        The random seed of the data set to be read. If not given, and filename not
        given, then a box will be read if it matches the kind and hsh, with an
        arbitrary seed. A seed of ``0`` is a valid, explicit seed.
    redshift: float, optional
        The redshift of the data set. Only used when `fname` is not given, in which
        case it is inserted into the constructed filename as ``z{redshift:.4f}``.
    load_data: bool, optional
        Whether to read in the data in the data set. Otherwise, only its defining
        parameters are read.

    Returns
    -------
    dataset :
        An output object, whose type depends on the kind of data set being read.

    Raises
    ------
    IOError :
        If no files exist of the given kind and hsh.
    ValueError :
        If either ``fname`` is not supplied, or both ``kind`` and ``hsh`` are not
        supplied.
    """
    direc = path.expanduser(direc or config["direc"])

    if not (fname or (hsh and kind)):
        raise ValueError("Either fname must be supplied, or kind and hsh")

    if not fname:
        zstr = f"z{redshift:.4f}_" if redshift is not None else ""
        stem = kind + "_" + zstr + hsh + "_r"

        # Only a missing seed triggers the wildcard search; ``seed=0`` is a
        # legitimate seed and must select exactly that file.
        if seed is None or seed == "":
            # Escape the fixed part of the path so that glob metacharacters in
            # the cache directory are matched literally; only the seed is wild.
            files = glob.glob(glob.escape(path.join(direc, stem)) + "*.h5")
            if not files:
                raise OSError("No files exist with that kind and hsh.")
            fname = files[0]
        else:
            fname = stem + str(seed) + ".h5"

    # The kind encoded in the filename decides which output class is used,
    # even if ``kind`` was passed explicitly alongside ``fname``.
    kind = _parse_fname(fname)["kind"]
    cls = getattr(outputs, kind)

    if hasattr(cls, "from_file"):
        inst = cls.from_file(fname, direc=direc, load_data=load_data)
    else:
        inst = cls.read(fname, direc=direc)

    return inst
def _parse_fname(fname): patterns = ( r"(?P<kind>\w+)_(?P<hash>\w{32})_r(?P<seed>\d+).h5$", r"(?P<kind>\w+)_z(?P<redshift>\d+.\d{1,4})_(?P<hash>\w{32})_r(?P<seed>\d+).h5$", ) for pattern in patterns: match = re.match(pattern, os.path.basename(fname)) if match: break if not match: raise ValueError( "filename {} does not have correct format for a cached output.".format( fname ) ) return match.groupdict()
def list_datasets(*, direc=None, kind=None, hsh=None, seed=None, redshift=None):
    """Yield all datasets which match a given set of filters.

    Can be used to determine parameters of all cached datasets, in conjunction with
    :func:`readbox`.

    Parameters
    ----------
    direc : str, optional
        The directory in which to search for the boxes. By default, this is the
        centrally-managed directory, given by the ``config.yml`` in ``.21cmfast``.
    kind: str, optional
        Filter by this kind (one of {"InitialConditions", "PerturbedField",
        "IonizedBox", "TsBox", "BrightnessTemp"}
    hsh: str, optional
        Filter by this hsh.
    seed: str or int, optional
        Filter by this seed. A seed of ``0`` is a valid filter.
    redshift: float, optional
        Filter by this redshift, compared via its ``z{redshift:.4f}`` filename
        representation. If not given, files with or without a redshift part match.

    Yields
    ------
    fname: str
        The filename of the dataset (without directory).
    """
    direc = path.expanduser(direc or config["direc"])

    # User-supplied filters are escaped so they are matched literally (notably
    # the decimal point of the redshift); absent filters become wildcards.
    kind_part = re.escape(kind) if kind else r"(?P<kind>[a-zA-Z]+)"
    z_part = re.escape(f"_z{redshift:.4f}") if redshift is not None else "(.*)"
    hash_part = re.escape(hsh) if hsh else r"(?P<hash>\w{32})"
    # ``is not None`` rather than truthiness, so that ``seed=0`` filters.
    if seed is not None and seed != "":
        seed_part = re.escape(str(seed))
    else:
        seed_part = r"(?P<seed>\d+)"

    # Anchored at the end with a literal ".h5" so that stray files such as
    # "*.h5.bak" are not reported as datasets.
    matcher = re.compile(
        "{}{}_{}_r{}".format(kind_part, z_part, hash_part, seed_part) + r"\.h5$"
    )

    for fl in os.listdir(direc):
        if matcher.match(fl):
            yield fl
def query_cache(
    *, direc=None, kind=None, hsh=None, seed=None, redshift=None, show=True
):
    """Get or print datasets in the cache.

    Iterates over every cached file selected by :func:`list_datasets` with the
    given filters, reads each one through :func:`readbox` with
    ``load_data=False``, and yields it, optionally printing its string
    representation to screen first.

    Parameters
    ----------
    direc : str, optional
        The directory in which to search for the boxes. By default, this is the
        centrally-managed directory, given by the ``config.yml`` in ``~/.21cmfast``.
    kind: str, optional
        Filter by this kind. Must be one of "InitialConditions", "PerturbedField",
        "IonizedBox", "TsBox" or "BrightnessTemp".
    hsh: str, optional
        Filter by this hsh.
    seed: str, optional
        Filter by this seed.
    redshift: float, optional
        Filter by this redshift; passed through to :func:`list_datasets`.
    show: bool, optional
        Whether to print out a repr of each object that exists.

    Yields
    ------
    fname: str
        The filename of the dataset, as produced by :func:`list_datasets`.
    obj:
        The output object read from that file with ``load_data=False``.
    """
    filters = {"kind": kind, "hsh": hsh, "seed": seed, "redshift": redshift}

    for fname in list_datasets(direc=direc, **filters):
        obj = readbox(direc=direc, fname=fname, load_data=False)
        if show:
            print(f"{fname}: {obj!s}")  # noqa: T
        yield fname, obj
def clear_cache(**kwargs):
    """Delete datasets in the cache.

    Walks through the cache, with given filters, and deletes every matching
    dataset file. Unless ``show`` is passed explicitly, it defaults to False
    here, so nothing is printed and individual removals are not logged.

    Parameters
    ----------
    kwargs :
        All options passed through to :func:`query_cache`.
    """
    kwargs.setdefault("show", False)

    # Resolve the directory the same way the search does: fall back to the
    # configured directory when ``direc`` is absent *or* None, and expand "~",
    # so the path handed to ``os.remove`` is the one that was searched.
    direc = path.expanduser(kwargs.get("direc") or config["direc"])

    number = 0
    for fname, _ in query_cache(**kwargs):
        if kwargs["show"]:
            logger.info("Removing %s", fname)
        os.remove(path.join(direc, fname))
        number += 1

    logger.info("Removed %s files from cache.", number)