Source code for stko._internal.optimizers.utilities

import gzip
import re
import shutil
from collections import abc, deque
from itertools import chain
from pathlib import Path

import rdkit.Chem.AllChem as rdkit  # noqa: N813
import stk
from rdkit.Geometry import Point3D

# This dictionary gives easy access to the rdkit bond types.
bond_dict = {
    "1": rdkit.rdchem.BondType.SINGLE,
    "am": rdkit.rdchem.BondType.SINGLE,
    "2": rdkit.rdchem.BondType.DOUBLE,
    "3": rdkit.rdchem.BondType.TRIPLE,
    "ar": rdkit.rdchem.BondType.AROMATIC,
}



[docs]
class MAEExtractor:
    """Extracts the lowest energy conformer from a .maegz file.

    Macromodel conformer searches produce -out.maegz files containing
    all of the conformers found during the search and their energies
    and other data.

    Initializing this class with a :class:`.ConstructedMolecule` finds
    the ``-out.maegz`` file of that :class:`.ConstructedMolecule` and
    converts it to a ``.mae`` file. It then creates and additional
    ``.mae`` file holding only the lowest energy conformer found.

    """

    maegz_path: Path
    """``-out.maegz`` file generated by the conformer search."""

    mae_path: Path
    """``.mae`` file holding the conformers from the conformer search."""

    content: str
    """The content of the ``.mae`` file holding all the conformers.

    This holds other data such as their energies too.

    """

    energies: list[tuple[float | None, int]]
    """Energies of the lowest energy confromers and their id."""

    min_energy: float | None
    """The minimum energy found in the ``.mae`` file."""

    path: Path
    """``.mae`` file holding the extracted lowest energy conformer."""

    def __init__(self, run_name: str, n: int = 1) -> None:
        self.maegz_path = Path(f"{run_name}-out.maegz")
        self.maegz_to_mae()
        self.extract_conformers(n)


[docs]
    def extract_conformers(self, n: int) -> None:
        """Creates ``.mae`` files holding the lowest energy conformers.

        Parameters:
            n:
                The number of conformers to extract.

        """
        for i in range(n):
            # Get the id of the lowest energy conformer.
            num = self.lowest_energy_conformers(n)[i][1]
            # Get the structure block corresponding to the lowest
            # energy conformer.
            content = self.content.split("f_m_ct")
            new_mae = "f_m_ct".join([content[0], content[num]])

            # Write the structure block in its own .mae file, named
            # after conformer extracted.
            if n == 1:
                # Write the structure block in its own .mae file, named
                # after conformer extracted.
                new_name = self.mae_path.with_name(
                    f"{self.mae_path.stem}EXTRACTED_{num}.mae"
                )
            else:
                new_name = self.mae_path.with_name(
                    f"{self.mae_path.stem}EXTRACTED_{num}_conf_{i}.mae"
                )

            with new_name.open("w") as mae_file:
                mae_file.write(new_mae)

            if i == 0:
                # Save the path of the newly created file.
                self.path = new_name



[docs]
    def extract_energy(self, block: str) -> float | None:
        """Extracts the energy value from a ``.mae`` energy data block.

        Parameters:
            block:
                An ``.mae`` energy data block.

        Returns:
            The energy value extracted from `block` or ``None`` if
            one is not found.

        """
        block_list = block.split(":::")
        for name, value in zip(
            block_list[0].split("\n"), block_list[1].split("\n"), strict=False
        ):
            if "r_mmod_Potential_Energy" in name:
                return float(value)
        return None



[docs]
    def lowest_energy_conformers(
        self,
        n: int,
    ) -> list[tuple[float | None, int]]:
        """Returns the id and energy of the lowest energy conformers.

        Parameters:
            n:
                The number of lowest energy conformers to return.

        Returns:
            A :class:`list` of the form

            .. code-block::

                returned = [(123.3, 23), (143.89, 1), (150.6, 12), ...]

            Where each :class:`tuple` holds the energy of the
            `n` lowest energy conformers and the id, respectively.

        """
        # Open the .mae file holding all the conformers and load its
        # content.
        self.content = self.mae_path.read_text()
        # Split the content across curly braces. This divides the
        # various sections of the .mae file.
        content_split = re.split(r"[{}]", self.content)

        # Go through all the datablocks in the the .mae file. For each
        # energy block extract the energy and store it in the
        # `energies` list. Store the `index`  (conformer id) along with
        # each extracted energy.
        self.energies = []
        prev_block = deque([""], maxlen=1)
        index = 1
        for block in content_split:
            if (
                "f_m_ct" in prev_block[0]
                and "r_mmod_Potential_Energy" in block
            ):
                energy = self.extract_energy(block)
                self.energies.append((energy, index))
                index += 1

            prev_block.append(block)

        # Selecting the lowest energy n conformers
        confs = sorted(self.energies)[:n]
        # Define the energy of the lowest energy conformer
        self.min_energy = confs[0][0]
        # Return a list with id and energy of the lowest energy
        # conformers.
        return confs



[docs]
    def maegz_to_mae(self) -> None:
        """Converts the .maegz file to a .mae file."""
        self.mae_path = self.maegz_path.with_suffix(".mae")
        with (
            gzip.open(self.maegz_path, "r") as maegz_file,
            self.mae_path.open("wb") as mae_file,
        ):
            mae_file.write(maegz_file.read())





[docs]
def mol_from_mae_file(mae_path: Path | str) -> rdkit.Mol:  # noqa: PLR0912, C901
    """Creates a ``rdkit`` molecule from a ``.mae`` file.

    Parameters:
        mol2_file:
            The full path of the ``.mae`` file from which an rdkit molecule
            should be instantiated.

    Returns:
        An ``rdkit`` instance of the molecule held in `mae_file`.

    """
    mae_path = Path(mae_path)

    mol = rdkit.EditableMol(rdkit.Mol())
    conf = rdkit.Conformer()

    content = re.split(r"[{}]", mae_path.read_text())

    prev_block = deque([""], maxlen=1)
    for block in content:
        if "m_atom[" in prev_block[0]:
            atom_block = block
        if "m_bond[" in prev_block[0]:
            bond_block = block
        prev_block.append(block)

    labels, data_block, *_ = atom_block.split(":::")
    labels = [
        label
        for label in labels.split("\n")
        if not label.isspace() and label != ""
    ]

    data_block = [
        a.split()
        for a in data_block.split("\n")
        if not a.isspace() and a != ""
    ]

    for line in data_block:
        words = [word for word in line if word != '"']
        if len(labels) != len(words):
            msg = (
                "Number of labels does"
                " not match number of columns"
                " in .mae file."
            )
            raise RuntimeError(msg)

        for label, data in zip(labels, words, strict=False):
            if "x_coord" in label:
                x = float(data)
            if "y_coord" in label:
                y = float(data)
            if "z_coord" in label:
                z = float(data)
            if "atomic_number" in label:
                atom_num = int(data)

        atom_sym = rdkit.GetPeriodicTable().GetElementSymbol(atom_num)
        atom_coord = Point3D(x, y, z)
        atom_id = mol.AddAtom(rdkit.Atom(atom_sym))
        conf.SetAtomPosition(atom_id, atom_coord)

    labels, data_block, *_ = bond_block.split(":::")
    labels = [
        label
        for label in labels.split("\n")
        if not label.isspace() and label != ""
    ]
    data_block = [
        a.split()
        for a in data_block.split("\n")
        if not a.isspace() and a != ""
    ]

    for line in data_block:
        if len(labels) != len(line):
            msg = (
                "Number of labels does"
                " not match number of "
                "columns in .mae file."
            )
            raise RuntimeError(msg)

        for label, data in zip(labels, line, strict=False):
            if "from" in label:
                atom1 = int(data) - 1
            if "to" in label:
                atom2 = int(data) - 1
            if "order" in label:
                bond_order = str(int(data))
        mol.AddBond(atom1, atom2, bond_dict[bond_order])

    mol = mol.GetMol()
    mol.AddConformer(conf)
    return mol




[docs]
def move_generated_macromodel_files(
    basename: str, output_dir: Path | str
) -> None:
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for filename in Path().glob(f"{basename}*"):
        # Do not move the output_dir.
        if filename == output_dir:
            continue
        shutil.move(filename, output_dir / filename)



def has_h_atom(bond: stk.Bond) -> bool:
    """Check if a bond has a H atom.

    Parameters:
        bond:
            Bond to test if it has a H atom.

    Returns:
        Returns `True` if bond has H atom.

    """
    if bond.get_atom1().get_atomic_number() == 1:
        return True
    return bond.get_atom2().get_atomic_number() == 1


def has_metal_atom(bond: stk.Bond, metal_atoms: list[stk.Atom]) -> bool:
    """Check if a bond has a metal atom.

    Parameters:
        bond:
            Bond to test if it has a metal atom.

        metal_atoms:
            List of metal atoms.

    Returns:
        Returns `True` if bond has metal atom.

    """
    if bond.get_atom1() in metal_atoms:
        return True
    return bond.get_atom2() in metal_atoms


def metal_atomic_numbers() -> abc.Iterable:
    return chain(range(21, 31), range(39, 49), range(72, 81))



[docs]
def get_metal_atoms(mol: stk.Molecule) -> list[stk.Atom]:
    """Return a list of metal atoms in molecule."""
    metals = set(metal_atomic_numbers())
    return [
        atom for atom in mol.get_atoms() if atom.get_atomic_number() in metals
    ]



def get_metal_bonds(
    mol: stk.Molecule,
    metal_atoms: list[stk.Atom],
) -> tuple[list, list]:
    """Return a list of bonds in molecule that contain metal atoms."""
    metal_bonds = []
    ids_to_metals = []
    for bond in mol.get_bonds():
        if bond.get_atom1() in metal_atoms:
            metal_bonds.append(bond)
            ids_to_metals.append(bond.get_atom2().get_id())
        elif bond.get_atom2() in metal_atoms:
            metal_bonds.append(bond)
            ids_to_metals.append(bond.get_atom1().get_id())

    return metal_bonds, ids_to_metals


def to_rdkit_mol_without_metals(
    mol: stk.Molecule,
    metal_atoms: list[stk.Atom],
    metal_bonds: list[stk.Bond],
) -> rdkit.Mol:
    """Create :class:`rdkit.Mol` with metals replaced by H atoms.

    Parameters:
        mol:
            The molecule to be optimized.

        metal_atoms:
            List of metal atoms.

        metal_bonds:
            List of bonds including metal atoms.

    Returns:
        RDKit molecule with metal atoms replaced with H atoms.

    """
    edit_mol = rdkit.EditableMol(rdkit.Mol())
    for atom in mol.get_atoms():
        if atom in metal_atoms:
            # In place of metals, add H's that will be constrained.
            # This allows the atom ids to not be changed.
            rdkit_atom = rdkit.Atom(1)
            rdkit_atom.SetFormalCharge(0)
        else:
            rdkit_atom = rdkit.Atom(atom.get_atomic_number())
            rdkit_atom.SetFormalCharge(atom.get_charge())
        edit_mol.AddAtom(rdkit_atom)

    for bond in mol.get_bonds():
        if bond in metal_bonds:
            # Do not add bonds to metal atoms (replaced with H's).
            continue
        edit_mol.AddBond(
            beginAtomIdx=bond.get_atom1().get_id(),
            endAtomIdx=bond.get_atom2().get_id(),
            order=rdkit.BondType(bond.get_order()),
        )

    edit_mol = edit_mol.GetMol()
    rdkit_conf = rdkit.Conformer(mol.get_num_atoms())
    for atom_id, atom_coord in enumerate(mol.get_position_matrix()):
        rdkit_conf.SetAtomPosition(atom_id, atom_coord)
        edit_mol.GetAtomWithIdx(atom_id).SetNoImplicit(True)  # noqa:FBT003
    edit_mol.AddConformer(rdkit_conf)

    return edit_mol


def get_long_bond_ids(
    mol: stk.ConstructedMolecule,
    reorder: bool = False,
) -> tuple[tuple[int, int], ...]:
    """Return tuple of long bond ids in a ConstructedMolecule."""
    long_bond_ids = []
    for bond_infos in mol.get_bond_infos():
        if bond_infos.get_building_block() is None:
            if reorder:
                ba1 = bond_infos.get_bond().get_atom1().get_id()
                ba2 = bond_infos.get_bond().get_atom2().get_id()
                if ba1 < ba2:
                    ids = (
                        bond_infos.get_bond().get_atom1().get_id(),
                        bond_infos.get_bond().get_atom2().get_id(),
                    )
                else:
                    ids = (
                        bond_infos.get_bond().get_atom2().get_id(),
                        bond_infos.get_bond().get_atom1().get_id(),
                    )
            else:
                ids = (
                    bond_infos.get_bond().get_atom1().get_id(),
                    bond_infos.get_bond().get_atom2().get_id(),
                )
            long_bond_ids.append(ids)

    return tuple(long_bond_ids)