Source code for pyiron_base.maintenance.generic

import importlib
import os
import pkgutil
import sys
import warnings

import pandas

import pyiron_base.storage.hdfio
from pyiron_base.maintenance.databaseperformance import get_database_statistics
from pyiron_base.maintenance.update.pyiron_base_03x_to_04x import (
    pyiron_base_03x_to_04x,
)
from pyiron_base.state import state

# We sometimes move classes between modules.  This would break HDF storage,
# because stored objects record the module path from which their class can be
# imported.  To work around this we keep an explicit map from old module paths
# (keys) to new module paths (values); _to_object can use it to find the new
# modules, and LocalMaintenance.update_hdf_types uses it to update the HDF5
# files.
_MODULE_CONVERSION_DICT = {
    "pyiron_base.generic.datacontainer": "pyiron_base.storage.datacontainer",
    "pyiron_base.generic.inputlist": "pyiron_base.storage.inputlist",
    "pyiron_base.generic.flattenedstorage": "pyiron_base.storage.flattenedstorage",
    "pyiron_base.table.datamining": "pyiron_base.jobs.datamining",
}


def add_module_conversion(old: str, new: str):
    """
    Register that objects formerly defined in module `old` now live in `new`.

    Registering a conversion only updates the in-memory conversion map.  Call
    :meth:`.Project.maintenance.local.update_hdf_types` afterwards to rewrite
    the HDF5 files to make this change and allow loading of previously saved
    objects.

    Args:
        old (str): path to module that previously defined objects in storage
        new (str): path to module that should be imported instead

    Raises:
        ValueError: if an entry for `old` already exists and does not point to
            `new`.
    """
    # setdefault inserts `new` only when `old` is unknown and otherwise hands
    # back the existing target, so one lookup covers both cases.
    registered = _MODULE_CONVERSION_DICT.setdefault(old, new)
    if registered != new:
        raise ValueError(
            f"Module path '{old}' already found in conversion dict, pointing to '{_MODULE_CONVERSION_DICT[old]}'!"
        )
class Maintenance:
    """
    The purpose of the maintenance class is to provide some measures of
    performance for pyiron, whether local to the project or global (describing
    the status of pyiron on the running machine).

    The individual tools are grouped into helper objects that are reachable
    through the :attr:`global_status`, :attr:`update` and :attr:`local`
    properties.
    """

    def __init__(self, project):
        """
        Args:
            project: pyiron project to do maintenance on
        """
        self._project = project
        self._global = GlobalMaintenance()
        self._update = UpdateMaintenance(self._project)
        self._local = LocalMaintenance(self._project)

    @property
    def global_status(self):
        """GlobalMaintenance: the helper created in ``__init__``."""
        return self._global

    @property
    def update(self):
        """UpdateMaintenance: the update helper bound to this project."""
        return self._update

    @property
    def local(self):
        """LocalMaintenance: the local helper bound to this project."""
        return self._local

    @staticmethod
    def get_repository_status() -> pandas.DataFrame:
        """
        Finds the hashes and versions for every `pyiron` module available.

        Every top-level module whose name starts with "pyiron" is imported and
        the directory two levels above its ``__file__`` is opened as a git
        repository.

        Returns:
            pandas.DataFrame: The name of each module and the hash and version
                for its current git head.  The hash column holds "Not a repo"
                when that directory is not a git repository and "Error while
                resolving sha" when the head commit cannot be resolved; the
                version column holds "not defined" for modules without a
                ``__version__`` attribute.
        """
        # Imported lazily so that GitPython is only needed when this report is
        # actually requested.
        from git import InvalidGitRepositoryError, Repo

        module_names = [
            name for _, name, _ in pkgutil.iter_modules() if name.startswith("pyiron")
        ]

        report = pandas.DataFrame(
            columns=["Module", "Git head", "Version"], index=range(len(module_names))
        )
        for i, name in enumerate(module_names):
            module = importlib.import_module(name)
            # For a package, __file__ is <root>/<package>/__init__.py, so two
            # dirname() calls give the directory containing the package.
            repo_dir = os.path.dirname(os.path.dirname(module.__file__))
            try:
                repo = Repo(repo_dir)
            except InvalidGitRepositoryError:
                hash_ = "Not a repo"
            else:
                try:
                    hash_ = repo.head.reference.commit.hexsha
                except (ValueError, TypeError):
                    hash_ = "Error while resolving sha"
                finally:
                    # Release the resources held by the Repo object instead of
                    # leaving one open repository per inspected module.
                    repo.close()
            version = getattr(module, "__version__", "not defined")
            report.loc[i] = [name, hash_, version]

        return report
class LocalMaintenance:
    """Maintenance operations that act on the jobs and data of one project."""

    def __init__(self, project):
        """
        Args:
            project: pyiron project whose jobs are maintained
        """
        self._project = project

    def defragment_storage(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite the hdf5 files of jobs.  This can free up unused space.

        By default the jobs within the current project are visited
        recursively.  Use `recursive` and `kwargs` to restrict the selection.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar
                to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching
                the project database column name (eg. status="finished").
                Asterisk can be used to denote a wildcard, for zero or more
                instances of any character
        """
        selected_jobs = self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        )
        for job in selected_jobs:
            job.project_hdf5.rewrite_hdf5(job.name)

    def update_hdf_types(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite TYPE fields in hdf5 files for renamed modules.

        New module conversions can be added with
        :func:`.add_module_conversion(old, new)`.  This method will then
        consider all objects previously imported from `old` to be imported
        from `new`.

        The job files are processed first, then the project data of this
        project and of each group yielded by ``iter_groups()``, and finally
        :meth:`update_pyiron_tables` is called with the same arguments.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar
                to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching
                the project database column name (eg. status="finished").
                Asterisk can be used to denote a wildcard, for zero or more
                instances of any character
        """

        def _retype(node):
            # Depth first: handle every sub group before the node itself.
            listing = node.list_all()
            for child in listing["groups"]:
                _retype(node[child])
            if "TYPE" not in listing["nodes"]:
                return
            (
                old_module,
                cls_name,
            ) = pyiron_base.storage.hdfio._extract_module_class_name(node["TYPE"])
            if old_module not in _MODULE_CONVERSION_DICT:
                return
            target_module = _MODULE_CONVERSION_DICT[old_module]
            node["TYPE"] = f"<class '{target_module}.{cls_name}'>"

        def _retype_project_data(pr):
            try:
                data_hdf = pr.create_hdf(pr.path, "project_data")["../data"]
                _retype(data_hdf)
            except ValueError:
                # in case project data does not exist yet
                pass

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        ):
            _retype(job.project_hdf5)

        _retype_project_data(self._project)
        for sub in self._project.iter_groups():
            _retype_project_data(sub)

        self.update_pyiron_tables(recursive=recursive, progress=progress, **kwargs)

    def update_pyiron_tables(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Write every job selected with ``hamilton="PyironTable"`` back to HDF5.

        Before iterating, each old module path in the module conversion map is
        aliased in ``sys.modules`` to its new module, so that importing an old
        path yields the new module.  Each selected job is then saved with
        ``job.to_hdf()``.

        Note that a ``hamilton`` entry passed in `kwargs` is overwritten, and
        that the ``sys.modules`` aliases are not removed afterwards.

        Args:
            recursive (bool): forwarded to ``iter_jobs`` - True by default
            progress (bool): forwarded to ``iter_jobs`` - True by default
            **kwargs (dict): additional filters forwarded to ``iter_jobs``
        """
        kwargs.update(hamilton="PyironTable")
        for legacy_path, current_path in _MODULE_CONVERSION_DICT.items():
            sys.modules[legacy_path] = importlib.import_module(current_path)

        table_jobs = self._project.iter_jobs(
            recursive=recursive, progress=progress, **kwargs
        )
        for job in table_jobs:
            job.to_hdf()
class UpdateMaintenance:
    """Update scripts that migrate stored data between pyiron_base versions."""

    def __init__(self, project):
        """
        Args:
            project: pyiron project used as the default target of the updates
        """
        self._project = project

    def base_to_current(self, start_version: str, project=None):
        """Runs all updates for pyiron_base to reach the current version.

        Args:
            start_version(str): Version of pyiron_base in the
                major.minor[.patch] format from which to start applying the
                updates.
            project(None/project/list/str): The project(s) to be converted from
                0.3 to 0.4 ; default: current project
                One may provide a pyiron Project, a list of pyiron Projects, or
                a string containing "all" or a valid path. If "all" is
                provided, pyiron tries to find all projects using the
                PROJECT_PATHS defined in the configuration.

        Raises:
            ValueError: if `start_version` has no minor component, is not
                numeric, or has a major version other than 0.
        """
        parts = start_version.split(".")
        if len(parts) < 2:
            # Fail with a readable message instead of an unpacking error.
            raise ValueError(
                f"start_version '{start_version}' is not in the "
                "major.minor[.patch] format."
            )
        major, minor = parts[:2]
        if int(major) != 0:
            raise ValueError("Updates to version >0.x.y is not possible.")
        if int(minor) < 4:
            self.base_v0_3_to_v0_4(project)

    def base_v0_3_to_v0_4(self, project=None):
        """Update hdf files written with pyiron_base-0.3.x to pyiron_base-0.4.x

        pyiron_base<=0.3.9 has a bug that writes all arrays with dtype=object
        even numeric ones.  As a fix pyiron_base=0.4.0 introduces a conversion
        when reading such arrays, but does not automatically save them.  This
        conversion script simply goes over all jobs and rewrites their HDF5
        files, since it's read with the correct dtype, this then writes this
        correct dtype.

        A project whose conversion raises a ValueError is reported on stdout
        and skipped; the remaining projects are still processed.

        Args:
            project(None/project/list/str): The project(s) to be converted from
                0.3 to 0.4 ; default: current project
                One may provide a pyiron Project, a list of pyiron Projects, or
                a string containing "all" or a valid path. If "all" is
                provided, pyiron tries to find all projects using the
                PROJECT_PATHS defined in the configuration.

        Raises:
            ValueError: if `project` is a string that is neither "all" nor an
                existing directory.
        """
        if project is None:
            projects = [self._project]
        elif isinstance(project, list):
            projects = project
        elif project == "all":
            projects = [
                self._project.__class__(path)
                for path in state.settings.configuration["project_paths"]
            ]
        elif isinstance(project, str):
            if os.path.isdir(project):
                projects = [self._project.__class__(project)]
            else:
                raise ValueError(
                    f"{project} is a str but neither 'all' nor a directory."
                )
        else:
            projects = [project]

        if not projects:
            warnings.warn(
                f"Provided project {project} lead to 0 projects to be converted."
            )
        for pr in projects:
            try:
                pyiron_base_03x_to_04x(pr)
            except ValueError as e:
                # Name the project that actually failed, not the `project`
                # argument, which may be None, "all" or a whole list.
                failed = getattr(pr, "path", pr)
                print(f"WARNING: Updating project {failed} failed with {e}!")
class GlobalMaintenance:
    """Maintenance information that depends on the configured database."""

    def __init__(self):
        """
        initialize the flag self._check_postgres, to control whether pyiron is
        set to communicate with a postgres database.

        A warning is emitted when the connection string does not contain
        "postgresql".
        """
        connection_string = state.database.sql_connection_string
        self._check_postgres = "postgresql" in connection_string
        if not self._check_postgres:
            warnings.warn(
                "The database statistics is only available for a Postgresql database"
            )

    def get_database_statistics(self):
        """
        Return the result of
        :func:`pyiron_base.maintenance.databaseperformance.get_database_statistics`.

        Raises:
            RuntimeError: if the flag set in ``__init__`` says the database is
                not a Postgresql database.
        """
        if not self._check_postgres:
            raise RuntimeError(
                "The database statistics is only available for a Postgresql database"
            )
        # Inside a method this name resolves to the module-level function, not
        # to this method.
        return get_database_statistics()