Source code for stko._internal.optimizers.utilities

import gzip
import re
import shutil
from collections import abc, deque
from itertools import chain
from pathlib import Path

import rdkit.Chem.AllChem as rdkit  # noqa: N813
import stk
from rdkit.Geometry import Point3D

# This dictionary gives easy access to the rdkit bond types.
bond_dict = {
    "1": rdkit.rdchem.BondType.SINGLE,
    "am": rdkit.rdchem.BondType.SINGLE,
    "2": rdkit.rdchem.BondType.DOUBLE,
    "3": rdkit.rdchem.BondType.TRIPLE,
    "ar": rdkit.rdchem.BondType.AROMATIC,
}


[docs] class MAEExtractor: """Extracts the lowest energy conformer from a .maegz file. Macromodel conformer searches produce -out.maegz files containing all of the conformers found during the search and their energies and other data. Initializing this class with a :class:`.ConstructedMolecule` finds the ``-out.maegz`` file of that :class:`.ConstructedMolecule` and converts it to a ``.mae`` file. It then creates and additional ``.mae`` file holding only the lowest energy conformer found. """ maegz_path: Path """``-out.maegz`` file generated by the conformer search.""" mae_path: Path """``.mae`` file holding the conformers from the conformer search.""" content: str """The content of the ``.mae`` file holding all the conformers. This holds other data such as their energies too. """ energies: list[tuple[float | None, int]] """Energies of the lowest energy confromers and their id.""" min_energy: float | None """The minimum energy found in the ``.mae`` file.""" path: Path """``.mae`` file holding the extracted lowest energy conformer.""" def __init__(self, run_name: str, n: int = 1) -> None: self.maegz_path = Path(f"{run_name}-out.maegz") self.maegz_to_mae() self.extract_conformers(n)
[docs] def extract_conformers(self, n: int) -> None: """Creates ``.mae`` files holding the lowest energy conformers. Parameters: n: The number of conformers to extract. """ for i in range(n): # Get the id of the lowest energy conformer. num = self.lowest_energy_conformers(n)[i][1] # Get the structure block corresponding to the lowest # energy conformer. content = self.content.split("f_m_ct") new_mae = "f_m_ct".join([content[0], content[num]]) # Write the structure block in its own .mae file, named # after conformer extracted. if n == 1: # Write the structure block in its own .mae file, named # after conformer extracted. new_name = self.mae_path.with_name( f"{self.mae_path.stem}EXTRACTED_{num}.mae" ) else: new_name = self.mae_path.with_name( f"{self.mae_path.stem}EXTRACTED_{num}_conf_{i}.mae" ) with new_name.open("w") as mae_file: mae_file.write(new_mae) if i == 0: # Save the path of the newly created file. self.path = new_name
[docs] def extract_energy(self, block: str) -> float | None: """Extracts the energy value from a ``.mae`` energy data block. Parameters: block: An ``.mae`` energy data block. Returns: The energy value extracted from `block` or ``None`` if one is not found. """ block_list = block.split(":::") for name, value in zip( block_list[0].split("\n"), block_list[1].split("\n"), strict=False ): if "r_mmod_Potential_Energy" in name: return float(value) return None
[docs] def lowest_energy_conformers( self, n: int, ) -> list[tuple[float | None, int]]: """Returns the id and energy of the lowest energy conformers. Parameters: n: The number of lowest energy conformers to return. Returns: A :class:`list` of the form .. code-block:: returned = [(123.3, 23), (143.89, 1), (150.6, 12), ...] Where each :class:`tuple` holds the energy of the `n` lowest energy conformers and the id, respectively. """ # Open the .mae file holding all the conformers and load its # content. self.content = self.mae_path.read_text() # Split the content across curly braces. This divides the # various sections of the .mae file. content_split = re.split(r"[{}]", self.content) # Go through all the datablocks in the the .mae file. For each # energy block extract the energy and store it in the # `energies` list. Store the `index` (conformer id) along with # each extracted energy. self.energies = [] prev_block = deque([""], maxlen=1) index = 1 for block in content_split: if ( "f_m_ct" in prev_block[0] and "r_mmod_Potential_Energy" in block ): energy = self.extract_energy(block) self.energies.append((energy, index)) index += 1 prev_block.append(block) # Selecting the lowest energy n conformers confs = sorted(self.energies)[:n] # Define the energy of the lowest energy conformer self.min_energy = confs[0][0] # Return a list with id and energy of the lowest energy # conformers. return confs
[docs] def maegz_to_mae(self) -> None: """Converts the .maegz file to a .mae file.""" self.mae_path = self.maegz_path.with_suffix(".mae") with ( gzip.open(self.maegz_path, "r") as maegz_file, self.mae_path.open("wb") as mae_file, ): mae_file.write(maegz_file.read())
[docs] def mol_from_mae_file(mae_path: Path | str) -> rdkit.Mol: # noqa: PLR0912, C901 """Creates a ``rdkit`` molecule from a ``.mae`` file. Parameters: mol2_file: The full path of the ``.mae`` file from which an rdkit molecule should be instantiated. Returns: An ``rdkit`` instance of the molecule held in `mae_file`. """ mae_path = Path(mae_path) mol = rdkit.EditableMol(rdkit.Mol()) conf = rdkit.Conformer() content = re.split(r"[{}]", mae_path.read_text()) prev_block = deque([""], maxlen=1) for block in content: if "m_atom[" in prev_block[0]: atom_block = block if "m_bond[" in prev_block[0]: bond_block = block prev_block.append(block) labels, data_block, *_ = atom_block.split(":::") labels = [ label for label in labels.split("\n") if not label.isspace() and label != "" ] data_block = [ a.split() for a in data_block.split("\n") if not a.isspace() and a != "" ] for line in data_block: words = [word for word in line if word != '"'] if len(labels) != len(words): msg = ( "Number of labels does" " not match number of columns" " in .mae file." ) raise RuntimeError(msg) for label, data in zip(labels, words, strict=False): if "x_coord" in label: x = float(data) if "y_coord" in label: y = float(data) if "z_coord" in label: z = float(data) if "atomic_number" in label: atom_num = int(data) atom_sym = rdkit.GetPeriodicTable().GetElementSymbol(atom_num) atom_coord = Point3D(x, y, z) atom_id = mol.AddAtom(rdkit.Atom(atom_sym)) conf.SetAtomPosition(atom_id, atom_coord) labels, data_block, *_ = bond_block.split(":::") labels = [ label for label in labels.split("\n") if not label.isspace() and label != "" ] data_block = [ a.split() for a in data_block.split("\n") if not a.isspace() and a != "" ] for line in data_block: if len(labels) != len(line): msg = ( "Number of labels does" " not match number of " "columns in .mae file." ) raise RuntimeError(msg) for label, data in zip(labels, line, strict=False): if "from" in label: atom1 = int(data) - 1 if "to" in label: atom2 = int(data) - 1 if "order" in label: bond_order = str(int(data)) mol.AddBond(atom1, atom2, bond_dict[bond_order]) mol = mol.GetMol() mol.AddConformer(conf) return mol
[docs] def move_generated_macromodel_files( basename: str, output_dir: Path | str ) -> None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) for filename in Path().glob(f"{basename}*"): # Do not move the output_dir. if filename == output_dir: continue shutil.move(filename, output_dir / filename)
def has_h_atom(bond: stk.Bond) -> bool: """Check if a bond has a H atom. Parameters: bond: Bond to test if it has a H atom. Returns: Returns `True` if bond has H atom. """ if bond.get_atom1().get_atomic_number() == 1: return True return bond.get_atom2().get_atomic_number() == 1 def has_metal_atom(bond: stk.Bond, metal_atoms: list[stk.Atom]) -> bool: """Check if a bond has a metal atom. Parameters: bond: Bond to test if it has a metal atom. metal_atoms: List of metal atoms. Returns: Returns `True` if bond has metal atom. """ if bond.get_atom1() in metal_atoms: return True return bond.get_atom2() in metal_atoms def metal_atomic_numbers() -> abc.Iterable: return chain(range(21, 31), range(39, 49), range(72, 81))
[docs] def get_metal_atoms(mol: stk.Molecule) -> list[stk.Atom]: """Return a list of metal atoms in molecule.""" metals = set(metal_atomic_numbers()) return [ atom for atom in mol.get_atoms() if atom.get_atomic_number() in metals ]
def get_metal_bonds( mol: stk.Molecule, metal_atoms: list[stk.Atom], ) -> tuple[list, list]: """Return a list of bonds in molecule that contain metal atoms.""" metal_bonds = [] ids_to_metals = [] for bond in mol.get_bonds(): if bond.get_atom1() in metal_atoms: metal_bonds.append(bond) ids_to_metals.append(bond.get_atom2().get_id()) elif bond.get_atom2() in metal_atoms: metal_bonds.append(bond) ids_to_metals.append(bond.get_atom1().get_id()) return metal_bonds, ids_to_metals def to_rdkit_mol_without_metals( mol: stk.Molecule, metal_atoms: list[stk.Atom], metal_bonds: list[stk.Bond], ) -> rdkit.Mol: """Create :class:`rdkit.Mol` with metals replaced by H atoms. Parameters: mol: The molecule to be optimized. metal_atoms: List of metal atoms. metal_bonds: List of bonds including metal atoms. Returns: RDKit molecule with metal atoms replaced with H atoms. """ edit_mol = rdkit.EditableMol(rdkit.Mol()) for atom in mol.get_atoms(): if atom in metal_atoms: # In place of metals, add H's that will be constrained. # This allows the atom ids to not be changed. rdkit_atom = rdkit.Atom(1) rdkit_atom.SetFormalCharge(0) else: rdkit_atom = rdkit.Atom(atom.get_atomic_number()) rdkit_atom.SetFormalCharge(atom.get_charge()) edit_mol.AddAtom(rdkit_atom) for bond in mol.get_bonds(): if bond in metal_bonds: # Do not add bonds to metal atoms (replaced with H's). continue edit_mol.AddBond( beginAtomIdx=bond.get_atom1().get_id(), endAtomIdx=bond.get_atom2().get_id(), order=rdkit.BondType(bond.get_order()), ) edit_mol = edit_mol.GetMol() rdkit_conf = rdkit.Conformer(mol.get_num_atoms()) for atom_id, atom_coord in enumerate(mol.get_position_matrix()): rdkit_conf.SetAtomPosition(atom_id, atom_coord) edit_mol.GetAtomWithIdx(atom_id).SetNoImplicit(True) # noqa:FBT003 edit_mol.AddConformer(rdkit_conf) return edit_mol def get_long_bond_ids( mol: stk.ConstructedMolecule, reorder: bool = False, ) -> tuple[tuple[int, int], ...]: """Return tuple of long bond ids in a ConstructedMolecule.""" long_bond_ids = [] for bond_infos in mol.get_bond_infos(): if bond_infos.get_building_block() is None: if reorder: ba1 = bond_infos.get_bond().get_atom1().get_id() ba2 = bond_infos.get_bond().get_atom2().get_id() if ba1 < ba2: ids = ( bond_infos.get_bond().get_atom1().get_id(), bond_infos.get_bond().get_atom2().get_id(), ) else: ids = ( bond_infos.get_bond().get_atom2().get_id(), bond_infos.get_bond().get_atom1().get_id(), ) else: ids = ( bond_infos.get_bond().get_atom1().get_id(), bond_infos.get_bond().get_atom2().get_id(), ) long_bond_ids.append(ids) return tuple(long_bond_ids)