# Source code for pantea.datasets.runner

from __future__ import annotations

from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterator, List, Optional, TextIO

from pantea.atoms.structure import Structure
from pantea.types import Dtype, default_dtype
from pantea.utils.tokenize import tokenize


class RunnerDataSource:
    """
    Lazy reader for structure files written in the `RuNNer`_ input data format.

    The format consists of atomic attributes and simulation box information.
    Within each snapshot, there are two types of properties: `per-atom`
    properties and `collective` properties.

    The per-atom properties encompass various attributes like the element name,
    positions, energy, charge, force components, and more.

    On the other hand, the collective properties include attributes
    such as lattice parameters, total energy, and total charge.

    Nothing is cached: every length query, index access and iteration
    re-opens and scans the file.

    .. _RuNNer: https://www.uni-goettingen.de/de/560580.html
    """

    def __init__(
        self,
        filename: Path,
        dtype: Optional[Dtype] = None,
    ) -> None:
        """
        Create a `RuNNer`_ structure data by initializing it from an input file.

        The file is not opened here; it is only read when structures are requested.

        :param filename: input file name (converted with ``Path(filename)``)
        :type filename: Path
        :param dtype: precision for the structure data, defaults to None
            in which case ``default_dtype.FLOATX`` is used
        :type dtype: Optional[Dtype], optional

        .. _RuNNer: https://www.uni-goettingen.de/de/560580.html
        """
        self.filename = Path(filename)
        self.dtype = dtype if dtype is not None else default_dtype.FLOATX

    def __len__(self) -> int:
        """
        Return number of available structures.

        The file is scanned from the start and every line whose keyword
        is ``end`` counts as one structure.
        """
        num_structures: int = 0
        with self.filename.open("r") as file:
            while self._ignore_next_structure(file):
                num_structures += 1
        return num_structures

    def __getitem__(self, index: int) -> Structure:
        """
        Return i-th structure.

        This is a lazy call which means that only required section
        of data is loaded into the memory. Negative indices count from the
        end of the file (this needs one extra scan to obtain the length).

        :param index: position of the requested structure
        :type index: int
        :raises IndexError: if no structure exists at the given position
        :return: the structure built from the selected block
        :rtype: Structure
        """
        requested = index
        if index < 0:
            # range() of a negative number is empty, so without this shift a
            # negative index would silently return the first structure.
            index += len(self)
        if index >= 0:
            with self.filename.open("r") as file:
                for _ in range(index):
                    if not self._ignore_next_structure(file):
                        break  # end of file reached, nothing left to skip
                data = self._read_next_structure(file)
            if data:
                return self._to_structure(data)
        raise IndexError(
            f"The given index {requested} is out of bound (len={len(self)})"
        )

    def __iter__(self) -> Iterator[Structure]:
        """
        Iterate over all structures in a single pass over the file.

        Without this method Python falls back to calling ``__getitem__`` with
        0, 1, 2, ... which re-opens and re-scans the file for every item.
        """
        return self.read_structures()

    def read_structures(self) -> Iterator[Structure]:
        """
        Read structures consecutively.

        It must be noted that reading data in a consecutive manner from file is
        faster compared to indexing read. This can be used for performant preloading
        of structures into the memory, if needed.

        The file stays open until the generator is exhausted or closed.
        Iteration stops at the first block that yields no data.

        :return: Structure
        :rtype: Iterator[Structure]
        """
        with self.filename.open("r") as file:
            while True:
                data = self._read_next_structure(file)
                if not data:
                    break
                yield self._to_structure(data)

    @classmethod
    def _read_next_structure(cls, file: TextIO) -> Dict[str, List]:
        """
        Read next structure.

        Lines are consumed up to and including the next ``begin`` line, then
        parsed until an ``end`` line or the end of the file. Recognized
        keywords and the tokens taken from each line are:

        * ``atom``: three position components, element, charge, energy and
          three force components (in this order)
        * ``lattice``: three components of one lattice vector
        * ``energy``: total energy
        * ``charge``: total charge
        * ``comment``: the rest of the line, whitespace-normalized

        Lines with any other keyword are ignored.

        :param file: open text file positioned before the structure to read
        :type file: TextIO
        :return: collected values keyed by property name; empty if no
            ``begin`` line was found or the block held no recognized lines
        :rtype: Dict[str, List]
        """
        data: Dict[str, List] = defaultdict(list)

        # Advance to the opening line of the next block.
        for line in iter(file.readline, ""):
            keyword, _ = tokenize(line)
            if keyword == "begin":
                break
        else:
            # End of file reached without finding another block.
            return data

        for line in iter(file.readline, ""):
            keyword, tokens = tokenize(line)
            if keyword == "end":
                break
            if keyword == "atom":
                data["positions"].append([float(t) for t in tokens[:3]])
                data["elements"].append(tokens[3])
                data["charges"].append(float(tokens[4]))
                data["energies"].append(float(tokens[5]))
                data["forces"].append([float(t) for t in tokens[6:9]])
            elif keyword == "lattice":
                data["lattice"].append([float(t) for t in tokens[:3]])
            elif keyword == "energy":
                data["total_energy"].append(float(tokens[0]))
            elif keyword == "charge":
                data["total_charge"].append(float(tokens[0]))
            elif keyword == "comment":
                # Taken from the raw line rather than from the tokens.
                data["comment"].append(" ".join(line.split()[1:]))
        return data

    @classmethod
    def _ignore_next_structure(cls, file: TextIO) -> bool:
        """
        Skip the next structure without parsing its content.

        Lines are consumed up to and including the next ``end`` line.

        :param file: open text file positioned before the structure to skip
        :type file: TextIO
        :return: True if an ``end`` line was found, False if the end of the
            file was reached first
        :rtype: bool
        """
        for line in iter(file.readline, ""):
            keyword, _ = tokenize(line)
            if keyword == "end":
                return True
        return False

    def _to_structure(self, data: Dict[str, List]) -> Structure:
        """Build a structure from parsed data using the configured precision."""
        return Structure.from_dict(data, dtype=self.dtype)

    def __repr__(self) -> str:
        """Return a string showing the file name and the data type."""
        # Fall back to the object itself when it has no ``dtype`` attribute so
        # that repr() never raises.
        dtype = getattr(self.dtype, "dtype", self.dtype)
        return f"{self.__class__.__name__}(filename='{str(self.filename)}', dtype={dtype})"