Source code for pyiron_base.storage.filedata

"""Generic File Object."""

# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

import json
import os
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import IO, Any, Callable, List, Union

import pandas
from pyiron_snippets.import_alarm import ImportAlarm

from pyiron_base.storage.hdfio import FileHDFio, ProjectHDFio

__author__ = "Niklas Siemer"
__copyright__ = (
    "Copyright 2020, Max-Planck-Institut für Eisenforschung GmbH - "
    "Computational Materials Design (CM) Department"
)
__version__ = "0.1"
__maintainer__ = "Niklas Siemer"
__email__ = "siemer@mpie.de"
__status__ = "development"
__date__ = "Feb 02, 2021"


_has_imported = {}
try:
    from PIL import Image

    _has_imported["PIL"] = True
    # For some reason I do not know this forces PIL to always be aware of all possible Image extensions.
    Image.registered_extensions()
except ImportError:
    _has_imported["PIL"] = False
try:
    import nbconvert
    import nbformat

    _has_imported["nbformat"] = True
except ImportError:
    _has_imported["nbformat"] = False

if all(_has_imported.values()):
    import_alarm = ImportAlarm()
else:
    import_alarm = ImportAlarm(
        "Reduced functionality, since "
        + str(
            [package for package in _has_imported.keys() if not _has_imported[package]]
        )
        + " could not be imported."
    )


def _load_txt(file: Union[str, IO]) -> List[str]:
    """
    Load a text file and return a list of lines.

    Args:
        file (str or file-like object): Path to the text file or file object.

    Returns:
        list: List of lines from the text file.
    """
    if isinstance(file, str):
        with open(file, encoding="utf8") as f:
            return f.readlines()
    else:
        return file.readlines()


def _load_json(file: Union[str, IO]) -> Any:
    """
    Load a JSON file and return the parsed data.

    Args:
        file (str or file-like object): Path to the JSON file or file object.

    Returns:
        Any: Parsed data from the JSON file.
    """
    if isinstance(file, str):
        with open(file) as f:
            return json.load(f)
    else:
        return json.load(file)



[docs]
class FileLoader:
    """Class for loading different file types."""

    _file_types = {
        ".json": _load_json,
        ".txt": _load_txt,
        ".csv": pandas.read_csv,
    }
    default_assumed_file_type = ".txt"


[docs]
    @classmethod
    def register(cls, file_type: str, load_callable: Callable) -> None:
        """Register a load function for a specific file type.

        Args:
            file_type (str): File extension to be registered, e.g. '.txt', '.csv'
            load_callable (callable): Function accepting a file or file-handle, returning an appropriate object for
                this file type.
        """
        cls._file_types[file_type] = load_callable



[docs]
    def load(self, file_type: str, file: Union[str, IO], *args, **kwargs) -> Any:
        """Load a file of a specific type.

        Args:
            file_type (str): File extension indicating the type of the file.
            file (str or file-like object): Path to the file or file object.

        Returns:
            Any: Object containing the loaded data.

        Raises:
            IOError: If the file could not be loaded.
        """
        if file_type in self._file_types:
            return self._file_types[file_type](file, *args, **kwargs)
        else:
            return self._load_default(file, *args, **kwargs)


    def _load_default(self, file: Union[str, IO], *args, **kwargs) -> Any:
        """Load a file using the default assumed file type.

        Args:
            file (str or file-like object): Path to the file or file object.

        Returns:
            Any: Object containing the loaded data.

        Raises:
            IOError: If the file could not be loaded.
        """
        try:
            return self._file_types[self.default_assumed_file_type](
                file, *args, **kwargs
            )
        except Exception as e:
            raise IOError("File could not be loaded.") from e



if _has_imported["PIL"]:
    for ext in Image.registered_extensions():
        FileLoader.register(ext, Image.open)


if _has_imported["nbformat"]:


[docs]
    class OwnNotebookNode(nbformat.NotebookNode):
        """Wrapper for nbformat.NotebookNode with some additional representation based on nbconvert."""

        def _repr_html_(self):
            """
            Generate HTML representation of the object.

            Returns:
                str: HTML representation of the object.
            """
            html_exporter = nbconvert.HTMLExporter()
            html_exporter.template_name = "classic"
            return html_exporter.from_notebook_node(self)[0]


    def _load_ipynb(file):
        return OwnNotebookNode(nbformat.read(file, as_version=4))

    FileLoader.register(".ipynb", _load_ipynb)

_file_loader = FileLoader()



[docs]
@import_alarm
def load_file(fp, filetype=None, project=None):
    """
    Load the file and return an appropriate object containing the data.

    Args:
        fp (str / file): path to the file or file object to be displayed.
        filetype (str/None): File extension, if given this overwrites the assumption based on the filename.
        project (pyiron-Project/None): Project calling this function, provided to all objects referring to such.

        Supported file types are:
        '.h5', '.hdf'
        '.json'
        '.txt'
        '.csv'
        '.ipynb'
        Image extensions supported by PIL

    Returns:
        :class:`FileHDFio`/:class:`ProjectHDFio`: pointing to the file of filetype = '.h5'
        dict/list: containing data from file of filetype = '.json'
        list: of all lines from file for filetype = '.txt'
        :class:`pandas.DataFrame`: containing data from file of filetype = '.csv'

    """

    def _resolve_filetype(file, _filetype):
        if _filetype is None and isinstance(file, str):
            _, _filetype = os.path.splitext(file)
        elif _filetype is None and hasattr(file, "name"):
            _, _filetype = os.path.splitext(file.name)
        elif _filetype is None:
            return None
        elif _filetype[0] != ".":
            _filetype = "." + _filetype
        return _filetype.lower()

    filetype = _resolve_filetype(fp, filetype)

    if filetype in [".h5", ".hdf"] and isinstance(fp, str):
        if project is None:
            return FileHDFio(file_name=fp)
        else:
            return ProjectHDFio(file_name=fp, project=project)
    else:
        return _file_loader.load(filetype, fp)




[docs]
class FileDataTemplate(ABC):
    @property
    @abstractmethod
    def data(self):
        """Return the associated data."""
        pass




[docs]
class FileData(FileDataTemplate):
    """FileData stores an instance of a data file, e.g. a single Image from a measurement."""


[docs]
    def __init__(
        self, file, data=None, metadata=None, filetype=None, pyiron_project=None
    ):
        """FileData class to store data and associated metadata.

        Args:
            file (str): path to the data file (if data is None) or filename associated with the data.
            data (object/None): object containing data
            metadata (dict/DataContainer): Dictionary of metadata associated with the data
            filetype (str): File extension associated with the type data,
                            If provided this overwrites the assumption based on the extension of the filename.
            pyiron_project(Project): Project this file belongs to, if any, used to load files with project awareness.
        """
        self._project = pyiron_project
        if data is None:
            self.filename = os.path.split(file)[1]
            self.source = file
            self._data = None
        else:
            self.filename = file
            self.source = None
            self._data = data
        if filetype is None:
            filetype = os.path.splitext(self.filename)[1]
            if filetype == "" or filetype == ".":
                self.filetype = None
            else:
                self.filetype = filetype[1:]
        else:
            self.filetype = filetype
        if metadata is None:
            self.metadata = {}
        else:
            self.metadata = metadata
        self._hasdata = True if self._data is not None else False


    @property
    @lru_cache()
    def data(self):
        """Return the associated data."""
        if self._hasdata:
            return self._data
        else:
            return load_file(self.source, filetype=self.filetype, project=self._project)