Source code for stko._internal.optimizers.collapser

import logging
import random
import shutil
from collections import abc, defaultdict
from itertools import combinations
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import stk
from scipy.spatial.distance import pdist
from stk import PdbWriter

from stko._internal.internal_types import ConstructedMoleculeT
from stko._internal.molecular.periodic.unitcell import UnitCell
from stko._internal.utilities.exceptions import InputError
from stko._internal.utilities.utilities import get_atom_distance

logger = logging.getLogger(__name__)


[docs] class Collapser: """Collapse stk.ConstructedMolecule to decrease enlarged bonds. It is recommended to use the MCHammer version of this code with :mod:`MCHammer` [1]_, where a much cleaner version is written. The utilities `get_long_bond_ids` will help generate sub units. This optimizer aims to bring extended bonds closer together for further optimisation. .. testcode:: collapser import stk import stko bb1 = stk.BuildingBlock('NCCN', [stk.PrimaryAminoFactory()]) bb2 = stk.BuildingBlock( smiles='O=CC(C=O)C=O', functional_groups=[stk.AldehydeFactory()], ) cage1 = stk.ConstructedMolecule( topology_graph=stk.cage.FourPlusSix((bb1, bb2)), ) # Perform collapser optimisation. optimizer = stko.Collapser( output_dir='test_coll', step_size=0.05, distance_cut=2.0, scale_steps=True, ) cage1 = optimizer.optimize(mol=cage1) .. testcleanup:: collapser import shutil shutil.rmtree('test_coll') Parameters: output_dir: The name of the directory into which files generated during the calculation are written. step_size: The relative size of the step to take during collapse. distance_cut: Distance between distinct building blocks to use as threshold for halting collapse in Angstrom. scale_steps: Whether to scale the step of each distict building block by their relative distance from the molecules centroid. Defaults to ``True`` References: .. [1] https://github.com/andrewtarzia/MCHammer """ def __init__( self, output_dir: Path | str, step_size: float, distance_cut: float, scale_steps: bool = True, ) -> None: self._output_dir = Path(output_dir) self._step_size = step_size self._distance_cut = distance_cut self._scale_steps = scale_steps def _get_inter_bb_distance( self, mol: stk.Molecule, ) -> abc.Iterable[float]: """Yield The distances between building blocks in mol. Ignores H atoms. """ position_matrix = mol.get_position_matrix() for atom1, atom2 in combinations( mol.get_atom_infos(), # type:ignore[attr-defined] 2, ): chk1 = atom1.get_atom().get_id() != atom2.get_atom().get_id() chk2 = ( atom1.get_atom().get_atomic_number() != 1 and atom2.get_atom().get_atomic_number() != 1 ) chk3 = ( atom1.get_building_block_id() != atom2.get_building_block_id() ) if chk1 and chk2 and chk3: dist = get_atom_distance( position_matrix=position_matrix, atom1_id=atom1.get_atom().get_id(), atom2_id=atom2.get_atom().get_id(), ) yield dist def _has_short_contacts(self, mol: stk.Molecule) -> bool: """Calculate if there are short contants in mol.""" return any( dist < self._distance_cut for dist in self._get_inter_bb_distance(mol) ) def _get_new_position_matrix( self, mol: stk.Molecule, step: float, vectors: dict[int, np.ndarray], scales: dict, ) -> np.ndarray: """Get the position matrix of the mol after translation.""" new_position_matrix = mol.get_position_matrix() for atom in mol.get_atom_infos(): # type:ignore[attr-defined] bb_id = atom.get_building_block_id() _id = atom.get_atom().get_id() pos = mol.get_position_matrix()[_id] new_position_matrix[_id] = ( pos - step * vectors[bb_id] * scales[bb_id] ) return new_position_matrix def _get_new_cell_vectors( self, unit_cell: UnitCell, step: float, scales: dict, ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Get the unit cell vectors after collapse step.""" vector_1 = unit_cell.get_vector_1() vector_2 = unit_cell.get_vector_2() vector_3 = unit_cell.get_vector_3() max_scale = max(list(scales.values())) vector_1 = vector_1 - vector_1 * step * max_scale vector_2 = vector_2 - vector_2 * step * max_scale vector_3 = vector_3 - vector_3 * step * max_scale return vector_1, vector_2, vector_3 def _get_bb_vectors( self, mol: stk.Molecule, bb_atom_ids: dict[int, list[int]], ) -> tuple[dict[int, np.ndarray], dict[int, np.floating]]: """Get the building block to COM vectors. Parameters: mol: The molecule to be optimized. bb_atom_ids: Dictionary mapping building block ids (keys) to a list of atom ids (values) in each distinct building block in the molecule. Returns: A tuple of: * Dictionary mapping building block ids (keys) to centroid vectors (values) of each distinct building block in the molecule. * Dictionary mapping building block ids (keys) to relative magnitude of centroid vectors (values) of each distinct building block in the molecule. """ cent = mol.get_centroid() # Get bb COM vector to molecule COM. bb_cent_vectors = { i: mol.get_centroid(atom_ids=bb_atom_ids[i]) - cent for i in bb_atom_ids } # Scale the step size based on the different distances of # bbs from the COM. Impacts anisotropic topologies. if self._scale_steps: norms = { i: np.linalg.norm(bb_cent_vectors[i]) for i in bb_cent_vectors } max_distance = max(list(norms.values())) # type: ignore[type-var] bb_cent_scales = {i: norms[i] / max_distance for i in norms} else: bb_cent_scales = {i: np.floating(1) for i in bb_cent_vectors} return bb_cent_vectors, bb_cent_scales
[docs] def optimize(self, mol: ConstructedMoleculeT) -> ConstructedMoleculeT: output_dir = self._output_dir.resolve() if output_dir.exists(): shutil.rmtree(output_dir) output_dir.mkdir(parents=True) bb_atom_ids = defaultdict(list) for i in mol.get_atom_infos(): bb_atom_ids[i.get_building_block_id()].append( i.get_atom().get_id() ) # Translate each building block along bb_COM_vectors by a # distance `step`. I.e. `step` is the proportion of the # bb_COM_vectors that the building block is moved. step_no = 0 step = self._step_size while not self._has_short_contacts(mol): # Update each step the building block vectors and distance. bb_cent_vectors, bb_cent_scales = self._get_bb_vectors( mol=mol, bb_atom_ids=bb_atom_ids, # type: ignore[arg-type] ) new_pos = self._get_new_position_matrix( mol=mol, step=step, vectors=bb_cent_vectors, scales=bb_cent_scales, ) step_no += 1 mol = mol.with_position_matrix(new_pos) mol.write(output_dir / f"collapsed_{step_no}.mol") bb_cent_vectors, bb_cent_scales = self._get_bb_vectors( mol=mol, bb_atom_ids=bb_atom_ids, # type: ignore[arg-type] ) # Check that we have not gone too far. min_dist = min(dist for dist in self._get_inter_bb_distance(mol)) if min_dist < self._distance_cut / 2: # Revert to half the previous step if we have. step = -(self._step_size / 2) new_pos = self._get_new_position_matrix( mol=mol, step=step, vectors=bb_cent_vectors, scales=bb_cent_scales, ) step_no += 1 mol = mol.with_position_matrix(new_pos) mol.write(output_dir / "collapsed_rev.mol") out_file = output_dir / "collapser.out" with out_file.open("w") as f: f.write( "Collapser algorithm.\n" "====================\n" f"Step size: {self._step_size}\n" f"Scale steps?: {self._scale_steps}\n" f"Distance cut: {self._distance_cut}\n" f"====================\n" f"Steps run: {step_no}\n" f"Minimum inter-bb distance: {min_dist}\n" f"====================\n" ) return mol
[docs] def p_optimize( self, mol: stk.Molecule, unit_cell: UnitCell, ) -> tuple[stk.Molecule, UnitCell]: """Optimize `mol`. Parameters: mol: The molecule to be optimized. unit_cell: The cell to be optimized. Returns: The optimized molecule and the optimized cell. """ if not isinstance(mol, stk.ConstructedMolecule): msg = f"{mol} needs to be a ConstructedMolecule for this optimizer" raise InputError(msg) output_dir = self._output_dir.resolve() if output_dir.exists(): shutil.rmtree(output_dir) output_dir.mkdir(parents=True) bb_atom_ids = defaultdict(list) for i in mol.get_atom_infos(): bb_atom_ids[i.get_building_block_id()].append( i.get_atom().get_id() ) # Translate each building block along bb_COM_vectors by a # distance `step`. I.e. `step` is the proportion of the # bb_COM_vectors that the building block is moved. step_no = 0 step = self._step_size while not self._has_short_contacts(mol): # Update each step the building block vectors and distance. ( bb_cent_vectors, bb_cent_scales, ) = self._get_bb_vectors( mol=mol, bb_atom_ids=bb_atom_ids, # type:ignore[arg-type] ) new_pos = self._get_new_position_matrix( mol=mol, step=step, vectors=bb_cent_vectors, scales=bb_cent_scales, ) step_no += 1 mol = mol.with_position_matrix(new_pos) ( new_vector_1, new_vector_2, new_vector_3, ) = self._get_new_cell_vectors( unit_cell=unit_cell, step=step, scales=bb_cent_scales, ) unit_cell = unit_cell.with_cell_from_vectors( vector_1=new_vector_1, vector_2=new_vector_2, vector_3=new_vector_3, ) PdbWriter().write( molecule=mol, path=output_dir / f"collapsed_{step_no}.pdb", periodic_info=unit_cell, ) ( bb_cent_vectors, bb_cent_scales, ) = self._get_bb_vectors( mol=mol, bb_atom_ids=bb_atom_ids, # type:ignore[arg-type] ) # Check that we have not gone too far. min_dist = min(dist for dist in self._get_inter_bb_distance(mol)) if min_dist < self._distance_cut / 2: # Revert to half the previous step if we have. step = -(self._step_size / 2) new_pos = self._get_new_position_matrix( mol=mol, step=step, vectors=bb_cent_vectors, scales=bb_cent_scales, ) step_no += 1 mol = mol.with_position_matrix(new_pos) ( new_vector_1, new_vector_2, new_vector_3, ) = self._get_new_cell_vectors( unit_cell=unit_cell, step=step, scales=bb_cent_scales, ) unit_cell = unit_cell.with_cell_from_vectors( vector_1=new_vector_1, vector_2=new_vector_2, vector_3=new_vector_3, ) PdbWriter().write( molecule=mol, path=output_dir / "collapsed_rev.pdb", periodic_info=unit_cell, ) out_file = output_dir / "collapser.out" with out_file.open("w") as f: f.write( "Collapser algorithm.\n" "====================\n" f"Step size: {self._step_size}\n" f"Scale steps?: {self._scale_steps}\n" f"Distance cut: {self._distance_cut}\n" f"====================\n" f"Steps run: {step_no}\n" f"Minimum inter-bb distance: {min_dist}\n" f"====================\n" ) return mol, unit_cell
[docs] class CollapserMC(Collapser): """Collapse molecule to decrease enlarged bonds using MC algorithm. It is recommended to use the MCHammer version of this code with :mod:`MCHammer` [#]_, where a much cleaner version is written. The utilities `get_long_bond_ids` will help generate sub units. Smarter optimisation than Collapser using simple Monte Carlo algorithm to perform rigid translations of building blocks. Parameters: output_dir: The name of the directory into which files generated during the calculation are written. step_size: The relative size of the step to take during step. target_bond_length: Target equilibrium bond length for long bonds to minimize to. num_steps: Number of MC moves to perform. bond_epsilon: Value of epsilon used in the bond potential in MC moves. Determines strength of the bond potential. Defaults to 50. nonbond_epsilon: Value of epsilon used in the nonbond potential in MC moves. Determines strength of the nonbond potential. Defaults to 20. nonbond_sigma: Value of sigma used in the nonbond potential in MC moves. Defaults to 1.2. nonbond_mu: Value of mu used in the nonbond potential in MC moves. Determines the steepness of the nonbond potential. Defaults to 3. beta: Value of beta used in the in MC moves. Beta takes the place of the inverse boltzmann temperature. Defaults to 2. random_seed: Random seed to use for MC algorithm. Should only be set if exactly reproducible results are required, otherwise a system-based random seed should be used for proper sampling. References: .. [#] https://github.com/andrewtarzia/MCHammer """ def __init__( # noqa: PLR0913 self, output_dir: Path | str, step_size: float, target_bond_length: float, num_steps: int, bond_epsilon: float = 50, nonbond_epsilon: float = 20, nonbond_sigma: float = 1.2, nonbond_mu: float = 3, beta: float = 2, random_seed: int | None = None, ) -> None: self._output_dir = Path(output_dir) self._step_size = step_size self._target_bond_length = target_bond_length self._num_steps = num_steps self._bond_epsilon = bond_epsilon self._nonbond_epsilon = nonbond_epsilon self._nonbond_sigma = nonbond_sigma self._nonbond_mu = nonbond_mu self._beta = beta if random_seed is None: random.seed() else: random.seed(random_seed) def _get_bb_atom_ids(self, mol: stk.Molecule) -> dict[int, list[int]]: bb_atom_ids = defaultdict(list) for i in mol.get_atom_infos(): # type: ignore[attr-defined] bb_atom_ids[i.get_building_block_id()].append( i.get_atom().get_id() ) return bb_atom_ids def _get_bond_length(self, mol: stk.Molecule, bond: stk.Bond) -> float: position_matrix = mol.get_position_matrix() return get_atom_distance( position_matrix=position_matrix, atom1_id=bond.get_atom1().get_id(), atom2_id=bond.get_atom2().get_id(), ) def _get_bond_vector( self, mol: stk.Molecule, bond: stk.Bond ) -> np.ndarray: position_matrix = mol.get_position_matrix() atom1_pos = position_matrix[bond.get_atom1().get_id()] atom2_pos = position_matrix[bond.get_atom2().get_id()] return atom2_pos - atom1_pos def _get_long_bond_infos( self, mol: stk.Molecule, ) -> dict[tuple[int, int], stk.BondInfo]: """Returns dict of long bond infos.""" long_bond_infos: dict[tuple[int, int], stk.BondInfo] = {} for bond_infos in mol.get_bond_infos(): # type: ignore[attr-defined] if bond_infos.get_building_block() is None: ids = ( bond_infos.get_bond().get_atom1().get_id(), bond_infos.get_bond().get_atom2().get_id(), ) long_bond_infos[ids] = bond_infos return long_bond_infos def _get_bb_centroids( self, mol: stk.Molecule, bb_atom_ids: dict[int, tuple[int, ...]], ) -> dict[int, np.ndarray]: """Returns dict of building block centroids.""" return { i: mol.get_centroid(atom_ids=bb_atom_ids[i]) for i in bb_atom_ids } def _get_cent_to_lb_vector( self, mol: stk.Molecule, bb_centroids: dict[int, np.ndarray], long_bond_infos: dict[tuple, stk.BondInfo], ) -> dict[tuple[int, int], tuple[float]]: """Returns dict of long bond atom to bb centroid vectors.""" position_matrix = mol.get_position_matrix() centroid_to_lb_vectors: dict[tuple[int, int], tuple[float]] = {} for bb, cent in bb_centroids.items(): for b_atom_ids in long_bond_infos: for atom_id in b_atom_ids: (atom_info,) = mol.get_atom_infos( # type: ignore[attr-defined] atom_id ) atom_pos = position_matrix[atom_id] if atom_info.get_building_block_id() == bb: centroid_to_lb_vectors[(bb, atom_id)] = ( atom_pos - cent, ) break return centroid_to_lb_vectors def _bond_potential(self, distance: float) -> float: """Define an arbitrary parabolic bond potential. This potential has no relation to an empircal forcefield. """ potential = (distance - self._target_bond_length) ** 2 return self._bond_epsilon * potential def _non_bond_potential(self, distance: float) -> float: """Define an arbitrary repulsive nonbonded potential. This potential has no relation to an empircal forcefield. """ return self._nonbond_epsilon * ( (self._nonbond_sigma / distance) ** self._nonbond_mu ) def _compute_non_bonded_potential(self, mol: stk.Molecule) -> float: # Get all pairwise distances. pair_dists = pdist(mol.get_position_matrix()) return np.sum(self._non_bond_potential(pair_dists)) def _compute_potential( self, mol: stk.Molecule, long_bond_infos: dict[tuple[int, int], stk.BondInfo], ) -> float: system_potential = self._compute_non_bonded_potential(mol) for long_bond in long_bond_infos.values(): system_potential += self._bond_potential( distance=self._get_bond_length( mol=mol, bond=long_bond.get_bond(), ) ) return system_potential def _translate_atoms_along_vector( self, mol: ConstructedMoleculeT, atom_ids: tuple[int, ...], vector: np.ndarray, ) -> ConstructedMoleculeT: new_position_matrix = mol.get_position_matrix() for atom in mol.get_atom_infos(atom_ids): # type: ignore[attr-defined] _id = atom.get_atom().get_id() pos = mol.get_position_matrix()[_id] new_position_matrix[_id] = pos - vector return mol.with_position_matrix(new_position_matrix) def _test_move(self, curr_pot: float, new_pot: float) -> bool: if new_pot < curr_pot: return True exp_term = np.exp(-self._beta * (new_pot - curr_pot)) rand_number = random.random() # noqa: S311 return exp_term > rand_number def _output_top_lines(self) -> str: return ( "====================================================\n" " Collapser optimisation \n" " ---------------------- \n" " \n" f" step size = {self._step_size} \n" f" target bond length = {self._target_bond_length} \n" f" num. steps = {self._num_steps} \n" f" bond epsilon = {self._bond_epsilon} \n" f" nonbond epsilon = {self._nonbond_epsilon} \n" f" nonbond sigma = {self._nonbond_sigma} \n" f" nonbond mu = {self._nonbond_mu} \n" f" beta = {self._beta} \n" "====================================================\n\n" ) def _plot_progess( self, steps: list, maxds: list, spots: list, npots: list, output_dir: Path, ) -> None: fig, ax = plt.subplots(figsize=(8, 5)) ax.plot(steps, maxds, c="k", lw=2) # Set number of ticks for x-axis ax.tick_params(axis="both", which="major", labelsize=16) ax.set_xlabel("step", fontsize=16) ax.set_ylabel("max long bond length [angstrom]", fontsize=16) ax.axhline(y=self._target_bond_length, c="r") fig.tight_layout() fig.savefig( output_dir / "maxd_vs_step.pdf", dpi=360, bbox_inches="tight", ) plt.close() # Plot energy vs timestep. fig, ax = plt.subplots(figsize=(8, 5)) ax.plot(steps, spots, c="k", lw=2, label="total") ax.plot(steps, npots, c="r", lw=2, label="non bonded") # Set number of ticks for x-axis ax.tick_params(axis="both", which="major", labelsize=16) ax.set_xlabel("step", fontsize=16) ax.set_ylabel("potential", fontsize=16) ax.legend(fontsize=16) fig.tight_layout() fig.savefig( output_dir / "pot_vs_step.pdf", dpi=360, bbox_inches="tight", ) plt.close()
[docs] def optimize(self, mol: ConstructedMoleculeT) -> ConstructedMoleculeT: # Handle output dir. output_dir = self._output_dir.resolve() if output_dir.exists(): shutil.rmtree(output_dir) output_dir.mkdir(parents=True) # Define long bonds to optimise. long_bond_infos = self._get_long_bond_infos(mol) # If no long bonds, then optimisation is done. if len(long_bond_infos) == 0: return mol system_potential = self._compute_potential( mol=mol, long_bond_infos=long_bond_infos, ) with output_dir.joinpath("coll.out").open("w") as f: f.write(self._output_top_lines()) mol.write(output_dir / "coll_0.mol") steps = [0] passed = [] spots = [system_potential] npots = [self._compute_non_bonded_potential(mol=mol)] maxds = [ max( [ self._get_bond_length( mol, long_bond_infos[i].get_bond() ) for i in long_bond_infos ] ) ] f.write( "Step system_potential nonbond_potential max_dist " "opt_bbs updated?\n" ) f.write(f"{steps[-1]} {spots[-1]} {npots[-1]} {maxds[-1]} -- --\n") for step in range(1, self._num_steps): # Randomly select a long bond. lb_ids = random.choice(list(long_bond_infos.keys())) # noqa: S311 lb_info = long_bond_infos[lb_ids] lb_vector = self._get_bond_vector( mol=mol, bond=lb_info.get_bond(), ) bb_id_1, bb_id_2 = ( i.get_building_block_id() for i in mol.get_atom_infos( # type: ignore[attr-defined] atom_ids=lb_ids ) ) # Choose bb to move out of the two randomly. moving_bb = random.choice([bb_id_1, bb_id_2]) # noqa: S311 moving_bb_atom_ids = tuple( i.get_atom().get_id() for i in mol.get_atom_infos() # type: ignore[attr-defined] if i.get_building_block_id() == moving_bb ) # Randomly choose between translation along long bond # vector or along BB-COM vector. # Random number from -1 to 1 rand = (random.random() - 0.5) * 2 # noqa: S311 # Define translation along long bond vector where # direction is from force, magnitude is randomly # scaled. long_bond_translation = -lb_vector * self._step_size * rand # Get bb COM vector to molecule COM. cent = mol.get_centroid() bb_cent_vector = ( mol.get_centroid(atom_ids=moving_bb_atom_ids) - cent ) com_translation = bb_cent_vector * self._step_size * rand translation_vector = random.choice( # noqa: S311 [ long_bond_translation, com_translation, ] ) # Translate building block. # Update atom position of building block. mol = self._translate_atoms_along_vector( mol=mol, atom_ids=moving_bb_atom_ids, vector=translation_vector, ) new_system_potential = self._compute_potential( mol=mol, long_bond_infos=long_bond_infos, ) if self._test_move(system_potential, new_system_potential): updated = "T" system_potential = new_system_potential passed.append(step) else: updated = "F" # Reverse move. mol = self._translate_atoms_along_vector( mol=mol, atom_ids=moving_bb_atom_ids, vector=-translation_vector, ) mol.write(output_dir / f"coll_{step}.xyz") steps.append(step) spots.append(system_potential) npots.append(self._compute_non_bonded_potential(mol=mol)) maxds.append( max( [ self._get_bond_length( mol, long_bond_infos[i].get_bond() ) for i in long_bond_infos ] ) ) f.write( f"{steps[-1]} {spots[-1]} " f"{npots[-1]} {maxds[-1]} {lb_ids} {updated}\n" ) f.write("\n============================================\n") f.write( "Optimisation done:\n" f"{len(passed)} steps passed: " f"{len(passed) / self._num_steps}" ) self._plot_progess(steps, maxds, spots, npots, output_dir) return mol