Source code for pantea.datasets.runner

from __future__ import annotations

from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterator, List, Optional, TextIO

from pantea.atoms.structure import Structure
from pantea.types import Dtype, default_dtype
from pantea.utils.tokenize import tokenize


class RunnerDataSource:
    """
    Lazy reader for structure files written in the `RuNNer`_ input data format.

    The file is a sequence of snapshots, each delimited by a ``begin`` and an
    ``end`` keyword line. Within each snapshot there are two types of
    properties: `per-atom` properties and `collective` properties.
    The per-atom properties (``atom`` lines) hold the position, element name,
    charge, energy, and force components of one atom. The collective
    properties are the lattice vectors (``lattice``), the total energy
    (``energy``), the total charge (``charge``), and free-text ``comment``
    lines.

    Nothing is cached: every call to ``len()``, indexing, or
    :meth:`read_structures` re-opens the file and scans it from the start.

    .. _RuNNer: https://www.uni-goettingen.de/de/560580.html
    """

    def __init__(
        self,
        filename: Path,
        dtype: Optional[Dtype] = None,
    ) -> None:
        """
        Create a `RuNNer`_ structure data source bound to an input file.

        The file is not opened here; it is only read on demand.

        :param filename: input file name
        :type filename: Path
        :param dtype: precision for the structure data, defaults to None
            (in which case ``default_dtype.FLOATX`` is used)
        :type dtype: Optional[Dtype], optional

        .. _RuNNer: https://www.uni-goettingen.de/de/560580.html
        """
        self.filename = Path(filename)
        self.dtype = dtype if dtype is not None else default_dtype.FLOATX

    def __len__(self) -> int:
        """
        Return number of available structures.

        The count is the number of ``end`` keyword lines found by scanning
        the whole file, so a trailing snapshot that has no ``end`` line is
        not counted.
        """
        num_structures: int = 0
        with open(str(self.filename), "r") as file:
            while self._ignore_next_structure(file):
                num_structures += 1
        return num_structures

    def __getitem__(self, index: int) -> Structure:
        """
        Return i-th structure.

        This is a lazy call which means that only required section of data
        is loaded into the memory. Negative indices count from the end of
        the file, as for built-in sequences; resolving them needs one extra
        pass over the file to determine the number of structures.

        :param index: position of the requested structure
        :type index: int
        :raises IndexError: if the index is outside the available structures
        :return: the structure stored at the given position
        :rtype: Structure
        """
        position = index
        if index < 0:
            # A negative index can only be resolved once the total count is
            # known. Without this, ``range(index)`` below would be empty and
            # the first structure would be returned for any negative index.
            num_structures = len(self)
            position = index + num_structures
            if position < 0:
                raise IndexError(
                    f"The given index {index} is out of bound (len={num_structures})"
                )

        with open(str(self.filename), "r") as file:
            for _ in range(position):
                # Stop skipping as soon as the file is exhausted; the read
                # below then yields no data and triggers the IndexError.
                if not self._ignore_next_structure(file):
                    break
            data = self._read_next_structure(file)

        if not data:
            raise IndexError(
                f"The given index {index} is out of bound (len={len(self)})"
            )
        return self._to_structure(data)

    def read_structures(self) -> Iterator[Structure]:
        """
        Read structures consecutively.

        It must be noted that reading data in a consecutive manner from file
        is faster compared to indexing read, because indexing re-scans the
        file from its beginning for every item. This can be used for
        performant preloading of structures into the memory, if needed.

        :return: Structure
        :rtype: Iterator[Structure]
        """
        with open(str(self.filename), "r") as file:
            while True:
                data = self._read_next_structure(file)
                if not data:
                    # No further ``begin`` block (or an empty one): stop.
                    break
                yield self._to_structure(data)

    @classmethod
    def _read_next_structure(cls, file: TextIO) -> Dict[str, List]:
        """
        Read next structure.

        Lines are consumed from the current file position up to and including
        the next ``end`` keyword (or up to the end of file, whichever comes
        first).

        :param file: open text file positioned before the structure to read
        :type file: TextIO
        :return: collected values keyed by ``positions``, ``elements``,
            ``charges``, ``energies``, ``forces``, ``lattice``,
            ``total_energy``, ``total_charge`` and ``comment``; only keys
            that occurred in the block are present, and the mapping is empty
            when no ``begin`` keyword was found
        :rtype: Dict[str, List]
        """
        data = defaultdict(list)
        # readline() returns "" only at end of file, so this iterator yields
        # every remaining line exactly once and is shared by both loops.
        lines = iter(file.readline, "")

        # Skip everything up to the opening "begin" keyword.
        for line in lines:
            keyword, _ = tokenize(line)
            if keyword == "begin":
                break
        else:
            # End of file reached without finding a new structure.
            return data

        # Collect the block content until the closing "end" keyword.
        for line in lines:
            keyword, tokens = tokenize(line)
            if keyword == "atom":
                # Columns: x y z element charge energy fx fy fz
                data["positions"].append([float(t) for t in tokens[:3]])
                data["elements"].append(tokens[3])
                data["charges"].append(float(tokens[4]))
                data["energies"].append(float(tokens[5]))
                data["forces"].append([float(t) for t in tokens[6:9]])
            elif keyword == "lattice":
                data["lattice"].append([float(t) for t in tokens[:3]])
            elif keyword == "energy":
                data["total_energy"].append(float(tokens[0]))
            elif keyword == "charge":
                data["total_charge"].append(float(tokens[0]))
            elif keyword == "comment":
                # Taken from the raw line (not the tokens) with whitespace
                # runs collapsed to single spaces.
                data["comment"].append(" ".join(line.split()[1:]))
            elif keyword == "end":
                break
        return data

    @classmethod
    def _ignore_next_structure(cls, file: TextIO) -> bool:
        """
        Advance the file past the next ``end`` keyword without parsing data.

        :param file: open text file positioned before the structure to skip
        :type file: TextIO
        :return: True if an ``end`` keyword was found, False if the end of
            file was reached first
        :rtype: bool
        """
        for line in iter(file.readline, ""):
            keyword, _ = tokenize(line)
            if keyword == "end":
                return True
        return False

    def _to_structure(self, data: Dict[str, List]) -> Structure:
        """Build a Structure from the parsed data using this source's dtype."""
        return Structure.from_dict(data, dtype=self.dtype)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(filename='{str(self.filename)}', dtype={self.dtype.dtype})"