Source code for pyiron_base.storage.filedata

"""Generic File Object."""

# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

import json
import os
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import IO, Any, Callable, List, Union

import pandas
from pyiron_snippets.import_alarm import ImportAlarm

from pyiron_base.storage.hdfio import FileHDFio, ProjectHDFio

__author__ = "Niklas Siemer"
__copyright__ = (
    "Copyright 2020, Max-Planck-Institut für Eisenforschung GmbH - "
    "Computational Materials Design (CM) Department"
)
__version__ = "0.1"
__maintainer__ = "Niklas Siemer"
__email__ = "siemer@mpie.de"
__status__ = "development"
__date__ = "Feb 02, 2021"


# Availability flags for the optional third-party packages, keyed by package
# name. The flags are consulted further down in this module before any loader
# that depends on the corresponding package is registered.
_has_imported = {}
try:
    from PIL import Image

    _has_imported["PIL"] = True
    # The original author found that calling this once makes PIL aware of all
    # possible image extensions (presumably because the call triggers Pillow's
    # plugin initialisation - confirm against the Pillow documentation).
    Image.registered_extensions()
except ImportError:
    _has_imported["PIL"] = False
try:
    import nbconvert
    import nbformat

    _has_imported["nbformat"] = True
except ImportError:
    _has_imported["nbformat"] = False

# Build the alarm used to decorate ``load_file``: it carries a message listing
# the missing packages when at least one optional import failed, and is created
# without a message otherwise.
if not all(_has_imported.values()):
    import_alarm = ImportAlarm(
        "Reduced functionality, since "
        + str([name for name, found in _has_imported.items() if not found])
        + " could not be imported."
    )
else:
    import_alarm = ImportAlarm()


def _load_txt(file: Union[str, os.PathLike, IO]) -> List[str]:
    """
    Load a text file and return a list of lines.

    Args:
        file (str, os.PathLike or file-like object): Path to the text file or
            an already opened file object. Paths are opened as UTF-8 text;
            file objects are read as they are.

    Returns:
        list: List of lines from the text file (line endings are kept, as
            returned by ``readlines``).
    """
    # Anything path-like (plain strings as well as e.g. ``pathlib.Path``) is
    # opened here; everything else is expected to provide ``readlines``.
    if isinstance(file, (str, os.PathLike)):
        with open(file, encoding="utf8") as f:
            return f.readlines()
    return file.readlines()


def _load_json(file: Union[str, os.PathLike, IO]) -> Any:
    """
    Load a JSON file and return the parsed data.

    Args:
        file (str, os.PathLike or file-like object): Path to the JSON file or
            an already opened file object. Paths are opened as UTF-8 text;
            file objects are read as they are.

    Returns:
        Any: Parsed data from the JSON file.
    """
    # Open paths explicitly as UTF-8 (the encoding JSON documents are exchanged
    # in) instead of relying on the platform's locale default, which would
    # break on non-ASCII content on systems with a different default encoding.
    if isinstance(file, (str, os.PathLike)):
        with open(file, encoding="utf8") as f:
            return json.load(f)
    return json.load(file)


class FileLoader:
    """Dispatch the loading of a file to a callable selected by file extension.

    The mapping from extension to load callable lives on the class, so loaders
    added via :meth:`register` are visible to every instance.
    """

    # Extension (including the leading dot) -> callable taking a path or file
    # object as first argument.
    _file_types = {
        ".json": _load_json,
        ".txt": _load_txt,
        ".csv": pandas.read_csv,
    }
    # Extension whose loader is tried for every unregistered file type.
    default_assumed_file_type = ".txt"

    @classmethod
    def register(cls, file_type: str, load_callable: Callable) -> None:
        """Register a load function for a specific file type.

        An already registered loader for the same extension is replaced.

        Args:
            file_type (str): File extension to be registered, e.g. '.txt', '.csv'
            load_callable (callable): Function accepting a file or file-handle,
                returning an appropriate object for this file type.
        """
        cls._file_types.update({file_type: load_callable})

    def load(self, file_type: str, file: Union[str, IO], *args, **kwargs) -> Any:
        """Load a file of a specific type.

        Additional positional and keyword arguments are handed to the load
        callable unchanged.

        Args:
            file_type (str): File extension indicating the type of the file.
            file (str or file-like object): Path to the file or file object.

        Returns:
            Any: Object containing the loaded data.

        Raises:
            IOError: If the file type is not registered and the default loader
                could not load the file.
        """
        # Unknown extensions fall back to the default assumed file type.
        if file_type not in self._file_types:
            return self._load_default(file, *args, **kwargs)
        loader = self._file_types[file_type]
        return loader(file, *args, **kwargs)

    def _load_default(self, file: Union[str, IO], *args, **kwargs) -> Any:
        """Load a file using the default assumed file type.

        Args:
            file (str or file-like object): Path to the file or file object.

        Returns:
            Any: Object containing the loaded data.

        Raises:
            IOError: If the file could not be loaded; the original error is
                attached as the cause.
        """
        try:
            fallback_loader = self._file_types[self.default_assumed_file_type]
            return fallback_loader(file, *args, **kwargs)
        except Exception as err:
            raise IOError("File could not be loaded.") from err
# Register the optional loaders; each group is only set up when the package it
# relies on could be imported.
if _has_imported["PIL"]:
    # Every extension known to PIL is opened with Image.open.
    for ext in Image.registered_extensions():
        FileLoader.register(ext, Image.open)

if _has_imported["nbformat"]:

    class OwnNotebookNode(nbformat.NotebookNode):
        """Wrapper for nbformat.NotebookNode with some additional representation based on nbconvert."""

        def _repr_html_(self):
            """
            Render the notebook as HTML using nbconvert's "classic" template.

            Returns:
                str: HTML representation of the object.
            """
            exporter = nbconvert.HTMLExporter()
            exporter.template_name = "classic"
            # from_notebook_node returns (output, resources); only the output
            # is needed here.
            rendered, _ = exporter.from_notebook_node(self)
            return rendered

    def _load_ipynb(file):
        """Read a notebook (as nbformat version 4) and wrap it in OwnNotebookNode."""
        notebook = nbformat.read(file, as_version=4)
        return OwnNotebookNode(notebook)

    FileLoader.register(".ipynb", _load_ipynb)

# Module-wide loader instance used by ``load_file``.
_file_loader = FileLoader()
@import_alarm
def load_file(fp, filetype=None, project=None):
    """
    Load the file and return an appropriate object containing the data.

    Args:
        fp (str / os.PathLike / file): path to the file or file object to be displayed.
        filetype (str/None): File extension (with or without leading dot, case-insensitive), if given this
            overwrites the assumption based on the filename. ``None`` or an empty string means the extension is
            inferred from the file name.
        project (pyiron-Project/None): Project calling this function, provided to all objects referring to such.

    Supported file types are:
        '.h5', '.hdf'
        '.json'
        '.txt'
        '.csv'
        '.ipynb'
        Image extensions supported by PIL

    Any other (or undeterminable) file type is handed to the file loader, which falls back to its default
    assumed file type.

    Returns:
        :class:`FileHDFio`/:class:`ProjectHDFio`: pointing to the file of filetype = '.h5'
        dict/list: containing data from file of filetype = '.json'
        list: of all lines from file for filetype = '.txt'
        :class:`pandas.DataFrame`: containing data from file of filetype = '.csv'
    """

    def _resolve_filetype(file, _filetype):
        """Return the lower-cased extension (with leading dot), or None if it cannot be determined."""
        if not _filetype:
            # No usable extension given (None or ""): infer it from the file name.
            name = file if isinstance(file, str) else getattr(file, "name", None)
            # File objects may lack a name or expose a non-string one (e.g. the
            # integer file descriptor of an anonymous temporary file).
            if not isinstance(name, str):
                return None
            _filetype = os.path.splitext(name)[1]
        elif not _filetype.startswith("."):
            _filetype = "." + _filetype
        return _filetype.lower()

    # Normalise path objects to their string form so that they take the same
    # route as plain string paths (in particular the HDF branch below).
    if isinstance(fp, os.PathLike):
        fp = os.fspath(fp)

    filetype = _resolve_filetype(fp, filetype)
    # HDF files are only wrapped, not read; this needs a path, not a file object.
    if filetype in (".h5", ".hdf") and isinstance(fp, str):
        if project is None:
            return FileHDFio(file_name=fp)
        return ProjectHDFio(file_name=fp, project=project)
    return _file_loader.load(filetype, fp)
class FileDataTemplate(ABC):
    """Abstract base class for objects that expose their content via ``data``."""

    @property
    @abstractmethod
    def data(self):
        """Return the associated data."""
class FileData(FileDataTemplate):
    """FileData stores an instance of a data file, e.g. a single Image from a measurement.

    The data is either handed over directly on construction or loaded from
    ``source`` the first time :attr:`data` is accessed.
    """

    # Sentinel marking "nothing loaded from ``source`` yet". ``None`` is not
    # used for this because a loader may return ``None`` as valid data.
    _NOT_LOADED = object()
    # Per-instance cache of the object loaded from ``source``; the class-level
    # default means "not loaded" until an instance stores its own value.
    _loaded_data = _NOT_LOADED

    def __init__(
        self, file, data=None, metadata=None, filetype=None, pyiron_project=None
    ):
        """FileData class to store data and associated metadata.

        Args:
            file (str): path to the data file (if data is None) or filename associated with the data.
            data (object/None): object containing data
            metadata (dict/DataContainer): Dictionary of metadata associated with the data
            filetype (str): File extension associated with the type data,
                            If provided this overwrites the assumption based on the extension of the filename.
            pyiron_project(Project): Project this file belongs to, if any, used to load files with project awareness.
        """
        self._project = pyiron_project
        if data is None:
            # Only a path was given: remember where to load from later on.
            self.filename = os.path.split(file)[1]
            self.source = file
            self._data = None
        else:
            # Data was given directly: ``file`` is just the associated name.
            self.filename = file
            self.source = None
            self._data = data

        if filetype is None:
            extension = os.path.splitext(self.filename)[1]
            # splitext yields "" (no extension) or "." (name ends with a dot)
            # when there is nothing usable; otherwise drop the leading dot.
            self.filetype = None if extension in ("", ".") else extension[1:]
        else:
            self.filetype = filetype

        self.metadata = {} if metadata is None else metadata
        self._hasdata = self._data is not None

    @property
    def data(self):
        """Return the associated data.

        Data given on construction is returned as is. Otherwise the file at
        ``source`` is loaded on first access and the loaded object is cached on
        this instance, so it is released together with the instance.
        """
        if self._hasdata:
            return self._data
        if self._loaded_data is self._NOT_LOADED:
            self._loaded_data = load_file(
                self.source, filetype=self.filetype, project=self._project
            )
        return self._loaded_data