"""
Module for extracting information from ORCA output files. Can be used to read arbitrary
information from the output files, as well as gradient and hessian files generated by
the ORCA code package. Containes several predefined parsers.
"""
from typing import Optional, List, Dict, Union
import logging
import os
import numpy as np
from ase import Atoms, units
from tqdm import tqdm
import schnetpack as spk
from schnetpack import properties
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
__all__ = [
"ppm2au",
"OrcaParserException",
"OrcaParser",
"format_dipole_derivatives",
"format_polarizability_derivatives",
"OrcaOutputParser",
"OrcaFormatter",
"OrcaPropertyParser",
"OrcaMainFileParser",
"OrcaHessianFileParser",
]
# Conversion from ppm to atomic units. Alpha is the fine structure constant and 1e6 are
# the ppm
ppm2au = 2.0 / (units.alpha**2 * 1e6)
class OrcaParserException(Exception):
"""
Exception for OrcaParser class.
"""
pass
[docs]class OrcaParser:
"""
Main parsers utility for ORCA output files. Runs over a list of output files, extracts the data
and stores it into a formatted ASE database in SchNetPack format (:obj:`schnetpack.data.AtomsData`).
This class makes use of the :obj:`OrcaMainFileParser` and :obj:`OrcaHessianFileParser` defined below.
Args:
dbpath (str): Path to the target database.
target_properties (list): List of properties to extract from ORCA files.
filter (dict, optional): Dictionary giving the name of a property and a threshold value. Entries in
in the output files with values exceeding the threshold in magnitude are
discarded. This can be used to e.g. screen for numerical noise in implicit
solvent computations, etc.
mask_charges (bool, optional): If the ORCA calculation used external charges, these are removed from the
positions and atom types read by the parser.
"""
main_properties = [
properties.energy,
properties.forces,
properties.dipole_moment,
properties.polarizability,
properties.shielding,
]
hessian_properties = [
properties.hessian,
properties.dipole_derivatives,
properties.polarizability_derivatives,
]
molecular_properties = [properties.dipole_moment, properties.polarizability]
file_extensions = {properties.forces: ".engrad", properties.hessian: ".oinp.hess"}
atomistic = ["atoms", properties.forces, properties.shielding]
def __init__(
self,
dbpath: str,
target_properties: List,
filter: Optional[Dict[str, float]] = None,
mask_charges: bool = False,
property_units: Dict[str, Union[str, float]] = {},
distance_unit: Union[str, float] = 1.0,
):
self.dbpath = dbpath
main_properties = []
hessian_properties = []
dummy_properties = []
# Initialize property unit dict
self.property_units = property_units
for p in target_properties:
if p not in self.property_units:
self.property_units[p] = 1.0
for p in target_properties:
if p in self.main_properties:
main_properties.append(p)
elif p in self.hessian_properties:
hessian_properties.append(p)
else:
print("Unrecognized property {:s}".format(p))
if properties.electric_field in target_properties:
dummy_properties.append(properties.electric_field)
if properties.magnetic_field in target_properties:
dummy_properties.append(properties.magnetic_field)
all_properties = main_properties + hessian_properties + dummy_properties
self.all_properties = all_properties
self.atomsdata = spk.data.ASEAtomsData.create(
dbpath, distance_unit=distance_unit, property_unit_dict=self.property_units
)
# The main file parser is always needed
self.main_parser = OrcaMainFileParser(
target_properties=main_properties + ["atoms"]
)
if len(hessian_properties) > 0:
self.hessian_parser = OrcaHessianFileParser(
target_properties=hessian_properties
)
else:
self.hessian_parser = None
# Set up filter dictionary to e.g. remove numerically unstable solvent computations
self.filter = filter
# If requested, mask Q charges introduced by Orca
self.mask_charges = mask_charges
def parse_data(self, data_files: List[str], buffer_size: int = 10):
"""
Reads in a list of ORCA output files, extracts the data, performs reformatting and
then stores structures and properties to an ASE database.
Args:
data_files (list): List of the paths to the ORCA output files.
buffer_size (int, optional): Collects a certain number of molecules before writing them to
the database.
"""
atom_buffer = []
property_buffer = []
for file in tqdm(sorted(data_files), ncols=100):
if os.path.exists(file):
atoms, properties = self._parse_molecule(file)
if properties is not None:
# Filter properties for problematic values
if self.filter is not None:
filtered = False
for p in self.filter:
if np.linalg.norm(properties[p]) > self.filter[p]:
filtered = True
logging.info(f"Filtered output {file} due to {p}")
if not filtered:
atom_buffer.append(atoms)
property_buffer.append(properties)
else:
atom_buffer.append(atoms)
property_buffer.append(properties)
if len(atom_buffer) >= buffer_size:
self.atomsdata.add_systems(property_buffer, atom_buffer)
atom_buffer = []
property_buffer = []
# Collect leftovers
if len(atom_buffer) > 0:
self.atomsdata.add_systems(property_buffer, atom_buffer)
def _parse_molecule(self, datafile: str):
"""
Parser for a single molecule. This routine first checks for convergence
of the calculation and then extracts all properties as instructed in the
individual parsers.
Args:
datafile (str): Path to the datafile.
Returns:
(ase.Atoms, dict):
atoms:
ase.Atoms object holding atom types and positions.
properties:
Dictionary containing all extracted properties. Naming conventions are the
same as used in :obj:`schnetpack.Properties`.
"""
# check if computation converged
if not self._check_convergence(datafile):
return None, None
# Get main properties
self.main_parser.parse_file(datafile)
main_properties = self.main_parser.get_parsed()
if self.mask_charges:
main_properties = self._mask_charges(main_properties)
atoms = None
target_properties = {}
for p in main_properties:
if main_properties[p] is None:
print("Error parsers {:s}".format(p))
return None, None
elif p == "atoms":
atypes, coords = main_properties[p]
atoms = Atoms(atypes, coords)
else:
target_properties[p] = main_properties[p].astype(np.float64)
if self.hessian_parser is not None:
hessian_file = (
os.path.splitext(datafile)[0] + self.file_extensions["hessian"]
)
if not os.path.exists(hessian_file):
print("Could not open Hessian file {:s}".format(hessian_file))
return atoms, None
else:
self.hessian_parser.parse_file(hessian_file)
hessian_properties = self.hessian_parser.get_parsed()
for p in hessian_properties:
if p is None:
return atoms, None
elif p == properties.dipole_derivatives:
target_properties[p] = format_dipole_derivatives(
hessian_properties[p]
)
elif p == properties.polarizability_derivatives:
target_properties[p] = format_polarizability_derivatives(
hessian_properties[p]
)
else:
target_properties[p] = hessian_properties[p]
# Unsqueeze first dimension for new batch format
for p in target_properties:
if p in self.molecular_properties:
target_properties[p] = target_properties[p][None, ...]
# Add dummy fields if requested
if properties.electric_field in self.all_properties:
target_properties[properties.electric_field] = np.zeros((1, 3))
if properties.magnetic_field in self.all_properties:
target_properties[properties.magnetic_field] = np.zeros((1, 3))
return atoms, target_properties
@staticmethod
def _check_convergence(datafile: str):
"""
Check whether calculation has converged by searching for the
typical ORCA closing statement.
Args:
datafile (str): Path to the output file.
Returns:
bool: Indicator if computation finished successfully.
"""
flag = open(datafile).readlines()[-2].strip()
if not flag == "****ORCA TERMINATED NORMALLY****":
return False
else:
return True
def _mask_charges(self, main_properties: Dict[str, np.array]):
"""
Remove the external charges Q introduced in orca input file. This is
only necessary, if the charges are given in the input file. This in
turn is necessary to get the right shielding tensors, as Orca is buggy
in this case. All other properties need to be taken from a computation
with external charges given in a file, since external charges in the
input file are added to the potential energy and the dipole moment...
Args:
main_properties (dict): main property dictionary generated by the
:obj:`OrcaParser`.
Returns:
dict: Updated property dictionary with the structures, atom types
and atomistic properties masked.
"""
n_atoms = np.sum(main_properties["atoms"][0] != "Q")
for p in main_properties:
if p in self.atomistic:
if p == "atoms":
main_properties[p][0] = main_properties[p][0][:n_atoms]
main_properties[p][1] = main_properties[p][1][:n_atoms]
else:
main_properties[p] = main_properties[p][:n_atoms]
return main_properties
def format_dipole_derivatives(target_property: np.array):
"""
Reshape the extracted dipole derivatives to the correct
format. Format is Natoms x (dx dy dz) x (property x y z)
Args:
property (numpy.array):
Returns:
numpy.array: Reshaped array
"""
N, _ = target_property.shape
N = N // 3
target_property = target_property.reshape(N, 3, 3)
return target_property
def format_polarizability_derivatives(target_property: np.array):
"""
Reshape the extracted polarizability derivatives to the correct
format. Format is Natoms x (dx dy dz) x (property Tensor)
Args:
target_property (numpy.array):
Returns:
numpy.array: Reshaped array
"""
N, _ = target_property.shape
N = N // 3
target_property = target_property.reshape(N, 3, 6)
triu_idx = np.triu_indices(3)
reshaped = np.zeros((N, 3, 3, 3))
reshaped[:, :, triu_idx[0], triu_idx[1]] = target_property
reshaped[:, :, triu_idx[1], triu_idx[0]] = target_property
return reshaped
[docs]class OrcaPropertyParser:
"""
Basic property parser for ORCA output files. Takes a start flag and a stop flag/list of stop flags and collects
the data entries in between. If a :obj:`OrcaFormatter` is provided, the data is formatted accordingly upon
retrieval. Operates in a line-wise fashion.
Args:
start (str): begins to collect data starting from this string
stop (str/list(str)): stops data collection if any of these strings is encounteres
formatters (OrcaFormatter): OrcaFormatter to convert collected data
"""
def __init__(
self,
start: str,
stop: Union[str, List[str]],
formatters: Optional[Union[OrcaFormatter, List[OrcaFormatter]]] = None,
):
self.start = start
self.stop = stop
self.formatters = formatters
self.read = False
self.parsed = None
def parse_line(self, line: str):
"""
Parses a line in the output file and updates the main container.
Args:
line (str): line of Orca output file
"""
line = line.strip()
if line.startswith("---------") or len(line) == 0:
pass
# if line.startswith("*********") or len(line) == 0:
# pass
elif line.startswith(self.start):
# Avoid double reading and restart for multiple files and repeated instances of data.
self.parsed = []
self.read = True
# For single line output
if self.stop is None:
self.parsed.append(line)
self.read = False
elif self.read:
# Check for stops
if isinstance(self.stop, list):
for stop in self.stop:
if self.read and line.startswith(stop):
self.read = False
if self.read:
self.parsed.append(line)
else:
if line.startswith(self.stop):
self.read = False
else:
self.parsed.append(line)
def get_parsed(self):
"""
Returns data, if formatters are specified in the corresponding format.
Returns:
numpy.array: Formatted data.
"""
if self.formatters is None:
return self.parsed
elif hasattr(self.formatters, "__iter__"):
return [formatter.format(self.parsed) for formatter in self.formatters]
else:
return self.formatters.format(self.parsed)
def reset(self):
"""
Reset state of the parser.
"""
self.read = False
self.parsed = None
[docs]class OrcaOutputParser:
"""
Basic ORCA output parser class. Parses an Orca output file according to the parsers specified in the 'parsers'
dictionary. Parsed data is stored in an dictionary, using the same keys as the parsers. If a list of formatters is
provided to a parser, a list of the parsed entries is stored in the output dictionary.
Args:
parsers (dict[str->callable]): dictionary of :obj:`OrcaPropertyParser`,
each with their own :obj:`OrcaFormatter`.
"""
def __init__(self, parsers: Dict[str, OrcaPropertyParser]):
self.parsers = parsers
self.parsed = None
def parse_file(self, path: str):
"""
Open the file and iterate over its lines, applying all parsers. In the end, all data is collected in a
dictionary.
Args:
path (str): path to Orca output file.
"""
# Reset for new file
for parser in self.parsers:
self.parsers[parser].reset()
with open(path, "r") as f:
for line in f:
for parser in self.parsers:
self.parsers[parser].parse_line(line)
self.parsed = {}
for parser in self.parsers:
self.parsed[parser] = self.parsers[parser].get_parsed()
def get_parsed(self):
"""
Auxiliary routine to collect the data from the parser.
Returns:
dict[str->list]: Dictionary of data entries according to parser keys.
"""
return self.parsed
[docs]class OrcaMainFileParser(OrcaOutputParser):
"""
Predefined :obj:`OrcaOutputParser` for extracting data from the main ORCA output file.
Can read and format structure and atom types, as well as the energies, forces, dipole moments,
polarizabilities and nuclear shielding tensors.
"""
target_properties = [
"atoms",
properties.forces,
properties.energy,
properties.dipole_moment,
properties.polarizability,
properties.shielding,
]
starts = {
"atoms": "CARTESIAN COORDINATES (ANGSTROEM)",
properties.forces: "CARTESIAN GRADIENT",
properties.energy: "FINAL SINGLE POINT ENERGY",
properties.dipole_moment: "Total Dipole Moment",
properties.polarizability: "The raw cartesian tensor (atomic units):",
properties.shielding: "CHEMICAL SHIFTS",
}
stops = {
"atoms": "CARTESIAN COORDINATES (A.U.)",
properties.forces: "Difference to translation invariance",
properties.energy: None,
properties.dipole_moment: None,
properties.polarizability: "diagonalized tensor:",
properties.shielding: "CHEMICAL SHIELDING SUMMARY",
}
formatters = {
"atoms": (
OrcaFormatter(0, converter=str),
OrcaFormatter(1, stop=4, unit=1.0 / units.Bohr),
),
properties.energy: OrcaFormatter(4),
properties.forces: OrcaFormatter(3, stop=6, unit=-1.0),
properties.dipole_moment: OrcaFormatter(4, stop=7),
properties.polarizability: OrcaFormatter(0, stop=4),
properties.shielding: OrcaFormatter(0, datatype="shielding", unit=ppm2au),
}
def __init__(self, target_properties: Optional[List[str]] = None):
if target_properties is None:
to_parse = self.target_properties
else:
to_parse = []
for p in target_properties:
if p not in self.target_properties:
print("Cannot parse property {:s}".format(p))
else:
to_parse.append(p)
parsers = {
p: OrcaPropertyParser(
self.starts[p], self.stops[p], formatters=self.formatters[p]
)
for p in to_parse
}
super(OrcaMainFileParser, self).__init__(parsers)
[docs]class OrcaHessianFileParser(OrcaMainFileParser):
"""
Predefined :obj:`OrcaOutputParser` for extracting data from the hessian output file written
by ORCA if some higher order derivatives are requested. Can read and format Hessians, as well as
Cartesian dipole moment and polarizability derivatives.
"""
target_properties = [
properties.hessian,
properties.dipole_derivatives,
properties.polarizability_derivatives,
]
starts = {
properties.hessian: "$hessian",
properties.dipole_derivatives: "$dipole_derivatives",
properties.polarizability_derivatives: "$polarizability_derivatives",
}
stops = {
properties.hessian: "$vibrational_frequencies",
properties.dipole_derivatives: "#",
properties.polarizability_derivatives: "#",
}
formatters = {
properties.hessian: OrcaFormatter(0, datatype="matrix", skip_first=1),
properties.dipole_derivatives: OrcaFormatter(0, stop=4, skip_first=1),
properties.polarizability_derivatives: OrcaFormatter(0, stop=6, skip_first=1),
}
def __init__(self, target_properties: Optional[List[str]] = None):
super(OrcaHessianFileParser, self).__init__(target_properties)