Source code for pyiron_base.maintenance.generic

import importlib
import os
import pkgutil
import sys
import warnings

import pandas

import pyiron_base.storage.hdfio
from pyiron_base.maintenance.databaseperformance import get_database_statistics
from pyiron_base.maintenance.update.pyiron_base_03x_to_04x import (
    pyiron_base_03x_to_04x,
)
from pyiron_base.state import state

# We sometimes move classes between modules.  This would break HDF storage,
# because stored objects record the module path from which their class can be
# imported.  We work around this with an explicit map from the old module path
# (key) to the new module path (value) that _to_object can use to find the new
# modules and that is used to update the HDF5 files.  Entries can be added at
# runtime with add_module_conversion().
_MODULE_CONVERSION_DICT = {
    "pyiron_base.generic.datacontainer": "pyiron_base.storage.datacontainer",
    "pyiron_base.generic.inputlist": "pyiron_base.storage.inputlist",
    "pyiron_base.generic.flattenedstorage": "pyiron_base.storage.flattenedstorage",
    "pyiron_base.table.datamining": "pyiron_base.jobs.datamining",
}



[docs]
def add_module_conversion(old: str, new: str):
    """
    Register that objects stored under module path `old` are now defined in `new`.

    Registering a conversion does not touch any file by itself.  Afterwards call
    :meth:`.Project.maintenance.local.update_hdf_types` to rewrite the HDF5 files and allow loading of previously
    saved objects.  Registering an identical pair a second time is a no-op.

    Args:
        old (str): path to module that previously defined objects in storage
        new (str): path to module that should be imported instead

    Raises:
        ValueError: if an entry for `old` already exists and does not point to `new`.
    """
    # setdefault stores `new` only if `old` is still unknown and otherwise
    # hands back the target that is already registered.
    registered = _MODULE_CONVERSION_DICT.setdefault(old, new)
    if registered != new:
        raise ValueError(
            f"Module path '{old}' already found in conversion dict, pointing to '{registered}'!"
        )




[docs]
class Maintenance:
    """
    The purpose of the maintenance class is to provide
    some measures of performance for pyiron, whether local to the project
    or global (describing the status of pyiron on the running machine).
    """


[docs]
    def __init__(self, project):
        """
        Args:
            project: pyiron project to do maintenance on
        """
        self._project = project
        # Helper objects handed out by the `global_status`, `update` and
        # `local` properties; only the global one is independent of `project`.
        self._global = GlobalMaintenance()
        self._update = UpdateMaintenance(self._project)
        self._local = LocalMaintenance(self._project)


    @property
    def global_status(self) -> "GlobalMaintenance":
        """The :class:`GlobalMaintenance` helper created when this object was initialized."""
        return self._global

    @property
    def update(self) -> "UpdateMaintenance":
        """The :class:`UpdateMaintenance` helper bound to the project given at initialization."""
        return self._update

    @property
    def local(self) -> "LocalMaintenance":
        """The :class:`LocalMaintenance` helper bound to the project given at initialization."""
        return self._local


[docs]
    @staticmethod
    def get_repository_status() -> pandas.DataFrame:
        """
        Report the git head and the version of every available `pyiron` module.

        Every top-level module whose name starts with "pyiron" is imported.  The parent of the directory that
        contains the module's file is then opened as a git repository to read the hash of the commit its head
        reference points to.

        Returns:
            pandas.DataFrame: One row per module with the columns "Module", "Git head" and "Version".  "Git head"
                is "Not a repo" if that directory is not a git repository and "Error while resolving sha" if
                resolving the head commit raises a ValueError or TypeError.  "Version" is "not defined" for
                modules without a `__version__` attribute.
        """
        from git import InvalidGitRepositoryError, Repo

        pyiron_modules = []
        for _, candidate, _ in pkgutil.iter_modules():
            if candidate.startswith("pyiron"):
                pyiron_modules.append(candidate)

        report = pandas.DataFrame(
            columns=["Module", "Git head", "Version"],
            index=range(len(pyiron_modules)),
        )
        for row, module_name in enumerate(pyiron_modules):
            module = importlib.import_module(module_name)
            checkout_dir = os.path.dirname(os.path.dirname(module.__file__))

            try:
                repo = Repo(checkout_dir)
            except InvalidGitRepositoryError:
                repo = None

            if repo is None:
                git_head = "Not a repo"
            else:
                # Resolving the commit is guarded separately so that only
                # failures of this lookup are reported as a sha error.
                try:
                    git_head = repo.head.reference.commit.hexsha
                except (ValueError, TypeError):
                    git_head = "Error while resolving sha"

            version = getattr(module, "__version__", "not defined")
            report.loc[row] = [module_name, git_head, version]

        return report





[docs]
class LocalMaintenance:
    """
    Maintenance operations that act on the jobs and the stored data of a single project.
    """

[docs]
    def __init__(self, project):
        """
        Args:
            project: pyiron project whose jobs and data the maintenance methods operate on
        """
        self._project = project



[docs]
    def defragment_storage(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite the hdf5 files of jobs.  This can free up unused space.

        By default iterate recursively over the jobs within the current
        project.  This can be controlled with `recursive` and `kwargs`.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching the project database column name
                            (eg. status="finished"). Asterisk can be used to denote a wildcard, for zero or more
                            instances of any character
        """
        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        ):
            hdf = job.project_hdf5
            hdf.rewrite_hdf5(job.name)



[docs]
    def update_hdf_types(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite the TYPE fields stored in hdf5 files for modules that were renamed.

        Every TYPE node whose module path is a key of the module conversion
        dict is rewritten to point to the mapped module.  Register further
        conversions beforehand with :func:`.add_module_conversion(old, new)`;
        objects previously imported from `old` are then treated as imported
        from `new`.

        The selected jobs are handled first, then the project data of this
        project and of the groups returned by its `iter_groups()`.  Finally
        :meth:`update_pyiron_tables` is called with the same arguments.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching the project database column name
                            (eg. status="finished"). Asterisk can be used to denote a wildcard, for zero or more
                            instances of any character
        """

        def rewrite_types(hdf):
            # Depth first: descend into every sub group before looking at the
            # TYPE node of the current group.
            listing = hdf.list_all()
            for group_name in listing["groups"]:
                rewrite_types(hdf[group_name])
            if "TYPE" not in listing["nodes"]:
                return
            module_path, class_name = (
                pyiron_base.storage.hdfio._extract_module_class_name(hdf["TYPE"])
            )
            if module_path not in _MODULE_CONVERSION_DICT:
                return
            target_module = _MODULE_CONVERSION_DICT[module_path]
            hdf["TYPE"] = f"<class '{target_module}.{class_name}'>"

        def rewrite_project_data(pr):
            try:
                project_data = pr.create_hdf(pr.path, "project_data")["../data"]
                rewrite_types(project_data)
            except ValueError:
                # in case project data does not exist yet
                pass

        selected_jobs = self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        )
        for job in selected_jobs:
            rewrite_types(job.project_hdf5)

        rewrite_project_data(self._project)
        for sub_project in self._project.iter_groups():
            rewrite_project_data(sub_project)

        self.update_pyiron_tables(recursive=recursive, progress=progress, **kwargs)


    def update_pyiron_tables(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        kwargs["hamilton"] = "PyironTable"
        for old, new in _MODULE_CONVERSION_DICT.items():
            sys.modules[old] = importlib.import_module(new)

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, **kwargs
        ):
            job.to_hdf()




[docs]
class UpdateMaintenance:
    """
    Updates that convert the stored data of projects written with older pyiron_base versions.
    """

[docs]
    def __init__(self, project):
        """
        Args:
            project: pyiron project that is updated when no other project is passed to the update methods
        """
        self._project = project



[docs]
    def base_to_current(self, start_version: str, project=None):
        """Runs all updates for pyiron_base to reach the current version.

        Args:
            start_version(str): Version of pyiron_base in the mayor.minor[.patch] format from which to start applying
                the updates.
            project(None/project/list/str): The project(s) to be converted from 0.3 to 0.4 ; default: current project
                One may provide a pyiron Project, a list of pyiron Projects, or a string containing "all" or a valid
                path.
                If "all" is provided, pyiron tries to find all projects using the PROJECT_PATHS defined in the
                configuration.
        """
        mayor, minor = start_version.split(".")[0:2]
        if int(mayor) != 0:
            raise ValueError("Updates to version >0.x.y is not possible.")
        if int(minor) < 4:
            self.base_v0_3_to_v0_4(project)



[docs]
    def base_v0_3_to_v0_4(self, project=None):
        """Update hdf files written with pyiron_base-0.3.x to pyiron_base-0.4.x

        pyiron_base<=0.3.9 has a bug that writes all arrays with dtype=object even
        numeric ones.  As a fix pyiron_base=0.4.0 introduces a conversion when reading
        such arrays, but does not automatically save them.  This conversion script
        simply goes over all jobs and rewrites their HDF5 files, since it's read with
        the correct dtype, this then writes this correct dtype.

        Args:
            project(None/project/list/str): The project(s) to be converted from 0.3 to 0.4 ; default: current project
                One may provide a pyiron Project, a list of pyiron Projects, or a string containing "all" or a valid
                path.
                If "all" is provided, pyiron tries to find all projects using the PROJECT_PATHS defined in the
                configuration.
        """
        if project is None:
            projects = [self._project]
        elif isinstance(project, list):
            projects = project
        elif project == "all":
            projects = [
                self._project.__class__(path)
                for path in state.settings.configuration["project_paths"]
            ]
        elif isinstance(project, str):
            if os.path.isdir(project):
                projects = [self._project.__class__(project)]
            else:
                raise ValueError(
                    f"{project} is a str but neither 'all' nor a directory."
                )
        else:
            projects = [project]

        if len(projects) == 0:
            warnings.warn(
                f"Provided project {project} lead to 0 projects to be converted."
            )

        for pr in projects:
            try:
                pyiron_base_03x_to_04x(pr)
            except ValueError as e:
                print(f"WARNING: Updating project {project} failed with {e}!")





[docs]
class GlobalMaintenance:
    """
    Maintenance information that is not tied to a single project; provides the statistics of a Postgresql database.
    """

[docs]
    def __init__(self):
        """
        initialize the flag self._check_postgres, to control whether pyiron is
        set to communicate with a postgres database.
        """
        connection_string = state.database.sql_connection_string
        if "postgresql" not in connection_string:
            warnings.warn(
                "The database statistics is only available for a Postgresql database"
            )
            self._check_postgres = False
        else:
            self._check_postgres = True


    def get_database_statistics(self):
        if self._check_postgres:
            return get_database_statistics()
        else:
            raise RuntimeError(
                "The detabase statistics is only available for a Postgresql database"
            )