Source code for deepretro.algorithms.stability_checker

"""Heuristic stability checker for molecules in retrosynthetic pathways.

When an LLM proposes reactant molecules, some of them may be chemically
unstable — strained small rings, anti-aromatic systems, reactive
intermediates like carbocations or carbenes, etc.  This module catches
those problems by inspecting the molecular graph with RDKit descriptors
and SMARTS pattern matching.  No trained model is required.

Checks performed
----------------

1. **Strained small rings** — 3- or 4-membered rings that contain a
   heteroatom (N, O, S …) are flagged.  Aziridines and azetidines are
   significantly more strained than plain cyclopropane / cyclobutane.

2. **Anti-aromatic motifs** — rings whose π-electron count is a multiple
   of 4 (Hückel 4n rule) are thermodynamically destabilised.  Known
   patterns (cyclobutadiene, cyclooctatetraene, pentalene) are matched
   via SMARTS, and π electrons are also counted directly for rings of
   size 4, 8, 12, or 16.

3. **Fused small rings** — two rings of ≤ 4 atoms sharing atoms create
   extreme angle strain (e.g. bicyclo[1.1.0]butane).  These systems can
   be explosive.

4. **Large heterocycles** — rings with ≥ 7 atoms containing heteroatoms
   tend to be conformationally floppy and often unstable.  Very large
   heterocycles (> 10 atoms, ≥ 3 heteroatoms) get an extra penalty.

5. **Carbocations** — positively charged carbon centres are reactive
   intermediates, not isolable species.  The checker distinguishes:

   * *sp2* (``[C+;X3]``) — penalised unless stabilised by an adjacent
     aromatic ring, allylic double bond, or benzylic position.
   * *sp* (``[C+;X2]``) — always penalised heavily.
   * *Primary vs secondary* — primary is worse because fewer alkyl
     groups donate electron density.
   * *Adjacent to EWG* — a carbocation next to F, Cl, Br, I or charged
     N/S/O is the worst case.

6. **Carbenes** — a neutral carbon with two bonds and no hydrogens
   (``[C;X2;H0;+0]``) is extremely reactive.  Extra penalties if the
   carbene sits inside a 3- or 4-membered ring or is adjacent to an
   electron-withdrawing group.

7. **Fused cyclopentane + small hetero ring** — a 5-membered all-carbon
   ring sharing atoms with a 3- or 4-membered hetero ring creates
   significant ring strain.

8. **Physicochemical outliers** — extreme ``logP`` values with
   ``abs(logP) > 10`` or too many rotatable bonds (``> 15``) each
   incur a small penalty.

9. **Aromatic bonus** — aromatic rings stabilise a molecule, so each
   one adds a small bonus (capped at +15 total).

Scoring
-------
After all checks, the score is clamped to 0–100:

* **≥ 80** → ``"Likely stable"``
* **50–79** → ``"Moderately stable"``
* **< 50** → ``"Potentially unstable"``

Entry points
------------

* `check_molecule_stability` — analyse a single SMILES and return a
  0–100 stability score with an issue list.
* `is_valid_smiles` — quick check that a SMILES string parses.
"""

from __future__ import annotations

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import (
    CalcNumAliphaticCarbocycles,
    CalcNumAliphaticHeterocycles,
    CalcNumAliphaticRings,
    CalcNumAromaticCarbocycles,
    CalcNumAromaticHeterocycles,
    CalcNumAromaticRings,
    CalcNumBridgeheadAtoms,
)

from typing import Any

# Helpers


[docs] def is_valid_smiles(smiles: str) -> bool: """Check whether a SMILES string can be parsed by RDKit. Parameters ---------- smiles : str SMILES string to validate. Returns ------- bool ``True`` if RDKit can build a molecule from *smiles*. Examples -------- >>> is_valid_smiles("CCO") True >>> is_valid_smiles("not_a_molecule") False """ try: mol = Chem.MolFromSmiles(smiles) except Exception: return False return mol is not None
# Anti-aromatic / reactive-intermediate SMARTS (compiled once) _ANTI_AROMATIC_PATTERNS: list[tuple[Chem.Mol | None, str]] = [ (Chem.MolFromSmarts("c1ccc1"), "cyclobutadiene-like (anti-aromatic)"), ( Chem.MolFromSmarts("C1=CC=CC=CC=C1"), "cyclooctatetraene-like (potential anti-aromatic)", ), (Chem.MolFromSmarts("c1cc2cccc2c1"), "pentalene-like (potential anti-aromatic)"), ] _SP2_CARBOCATION = Chem.MolFromSmarts("[C+;X3]") _SP_CARBOCATION = Chem.MolFromSmarts("[C+;X2]") _UNSTABILISED_CARBOCATION = Chem.MolFromSmarts("[C+][F,Cl,Br,I,N+,S+,O+]") _PRIMARY_CARBOCATION = Chem.MolFromSmarts("[CH2+][#6]") _SECONDARY_CARBOCATION = Chem.MolFromSmarts("[CH+]([#6])[#6]") _ALLYLIC_CARBOCATION = Chem.MolFromSmarts("[C+]-[C]=[C]") _BENZYLIC_CARBOCATION = Chem.MolFromSmarts("[C+]-c") _SINGLET_CARBENE = Chem.MolFromSmarts("[C;X2;H0;+0]") _UNSTABLE_CARBENE = Chem.MolFromSmarts("[C;X2;H0;+0][F,Cl,Br,I,N+,S+,O+]") _RING3_CARBENE = Chem.MolFromSmarts("[C;X2;H0;+0]1[C,N,O][C,N,O]1") _RING4_CARBENE = Chem.MolFromSmarts("[C;X2;H0;+0]1[C,N,O][C,N,O][C,N,O]1") _CYCLOPENTANE = Chem.MolFromSmarts("C1CCCC1") # Core
[docs] def check_molecule_stability(smiles: str) -> dict[str, Any]: """Assess the stability of a molecule from its SMILES string. Parses the molecule with RDKit and runs nine heuristic checks (see the module docstring for a full description of each one): strained small rings, anti-aromatic motifs, fused small rings, large heterocycles, carbocations, carbenes, fused cyclopentane systems, physicochemical outliers, and an aromatic-ring bonus. Each problem subtracts from a base score of 100; the final score is clamped to 0–100. Parameters ---------- smiles : str SMILES string of the molecule to assess. Returns ------- dict[str, Any] Keys returned: * ``valid_structure`` (bool) — whether RDKit could parse the SMILES at all. * ``stability_score`` (int, 0–100) — overall stability rating. * ``issues`` (list[str]) — plain-English descriptions of every problem found (e.g. ``"Three-membered heterocycle (potentially unstable)"``). * ``metrics`` (dict) — molecular weight, logP, H-bond donors / acceptors, rotatable bonds. * ``ring_data`` (dict) — ring counts broken down by type (aliphatic / aromatic, carbocycle / heterocycle, bridgehead atoms, etc.). * ``atom_data`` (dict) — total atoms, bonds, heavy atoms, aromatic vs aliphatic counts. * ``assessment`` (str) — one of ``"Likely stable"``, ``"Moderately stable"``, or ``"Potentially unstable"``. Examples -------- >>> res = check_molecule_stability("c1ccccc1") # benzene >>> res["assessment"] 'Likely stable' >>> res = check_molecule_stability("[CH2+]C") # ethyl cation >>> res["assessment"] 'Potentially unstable' """ results: dict[str, Any] = { "valid_structure": False, "stability_score": 0, "issues": [], "metrics": {}, "ring_data": {}, "atom_data": {}, "assessment": "", } mol = Chem.MolFromSmiles(smiles) if mol is None: results["issues"].append("Invalid SMILES string or cannot be parsed") return results results["valid_structure"] = True # basic physicochemical properties mw = Descriptors.MolWt(mol) logp = Descriptors.MolLogP(mol) hbd = Descriptors.NumHDonors(mol) hba = Descriptors.NumHAcceptors(mol) rotatable_bonds = Descriptors.NumRotatableBonds(mol) results["metrics"] = { "molecular_weight": mw, "logP": logp, "h_bond_donors": hbd, "h_bond_acceptors": hba, "rotatable_bonds": rotatable_bonds, } # ring information ring_info = mol.GetRingInfo() atom_rings = ring_info.AtomRings() bond_rings = ring_info.BondRings() num_aromatic_atoms = len(mol.GetAromaticAtoms()) num_heavy_atoms = mol.GetNumHeavyAtoms() num_aliphatic_carbocycles = CalcNumAliphaticCarbocycles(mol) num_aliphatic_heterocycles = CalcNumAliphaticHeterocycles(mol) num_aliphatic_rings = CalcNumAliphaticRings(mol) num_aromatic_carbocycles = CalcNumAromaticCarbocycles(mol) num_aromatic_heterocycles = CalcNumAromaticHeterocycles(mol) num_aromatic_rings = CalcNumAromaticRings(mol) num_bridgehead_atoms = CalcNumBridgeheadAtoms(mol) results["ring_data"] = { "num_rings": len(atom_rings), "atom_rings": [list(r) for r in atom_rings], "bond_rings": [list(r) for r in bond_rings], "num_aliphatic_carbocycles": num_aliphatic_carbocycles, "num_aliphatic_heterocycles": num_aliphatic_heterocycles, "num_aliphatic_rings": num_aliphatic_rings, "num_aromatic_carbocycles": num_aromatic_carbocycles, "num_aromatic_heterocycles": num_aromatic_heterocycles, "num_aromatic_rings": num_aromatic_rings, "num_bridgehead_atoms": num_bridgehead_atoms, } results["atom_data"] = { "num_atoms": mol.GetNumAtoms(), "num_bonds": mol.GetNumBonds(), "num_heavy_atoms": num_heavy_atoms, "num_heavy_bonds": mol.GetNumBonds(), "num_aromatic_atoms": num_aromatic_atoms, "num_aliphatic_atoms": num_heavy_atoms - num_aromatic_atoms, } # strained small rings for ring in atom_rings: if len(ring) < 3: results["issues"].append(f"Highly strained ring of size {len(ring)}") elif len(ring) == 3 and any( mol.GetAtomWithIdx(i).GetSymbol() != "C" for i in ring ): results["issues"].append( "Three-membered heterocycle (potentially unstable)" ) elif len(ring) == 4 and any( mol.GetAtomWithIdx(i).GetSymbol() != "C" for i in ring ): results["issues"].append("Four-membered heterocycle (potentially unstable)") # anti-aromatic motifs for patt, name in _ANTI_AROMATIC_PATTERNS: if patt and mol.HasSubstructMatch(patt): results["issues"].append(f"Contains {name} motif") for ring in atom_rings: if len(ring) in (4, 8, 12, 16): double_bond_count = 0.0 ring_atoms_obj = [mol.GetAtomWithIdx(i) for i in ring] for atom in ring_atoms_obj: if atom.GetIsAromatic(): double_bond_count += 0.5 for bond in atom.GetBonds(): other = bond.GetOtherAtomIdx(atom.GetIdx()) if other in ring and bond.GetBondType() == Chem.BondType.DOUBLE: double_bond_count += 0.5 pi_electrons = int(double_bond_count * 2) if pi_electrons > 0 and pi_electrons % 4 == 0: results["issues"].append( f"{len(ring)}-membered ring with {pi_electrons} " f"\u03c0 electrons (potential anti-aromatic)" ) # fused small rings small_rings = [set(r) for r in atom_rings if len(r) <= 4] fused_small_rings_detected = False for i in range(len(small_rings)): for j in range(i + 1, len(small_rings)): if small_rings[i] & small_rings[j]: fused_small_rings_detected = True results["issues"].append( f"Fused {len(small_rings[i])} and " f"{len(small_rings[j])}-membered rings " f"(highly strained system)" ) if fused_small_rings_detected: results["issues"].append( "WARNING: Fused small rings create highly strained " "and potentially explosive compounds" ) # large heterocycles for ring in atom_rings: if len(ring) >= 7: ring_atoms_obj = [mol.GetAtomWithIdx(i) for i in ring] heteroatoms = [a for a in ring_atoms_obj if a.GetSymbol() not in ("C", "H")] if heteroatoms: symbols = {a.GetSymbol() for a in heteroatoms} results["issues"].append( f"Large ({len(ring)}-membered) heterocycle with " f"{', '.join(symbols)} (potentially unstable)" ) if len(ring) > 10 and len(heteroatoms) >= 3: results["issues"].append( "Very large heterocycle with multiple heteroatoms " "(significant stability concern)" ) # bridgehead strain if num_bridgehead_atoms > 0: if any(len(r) <= 4 for r in atom_rings) and num_bridgehead_atoms >= 2: results["issues"].append("Strained polycyclic system with bridgehead atoms") # scoring (base 100, subtract penalties) score = 100 score -= len(results["issues"]) * 15 # carbocations score = check_carbocations(mol, results, score) # carbenes score = check_carbenes(mol, results, score) # fused cyclopentane + small hetero ring score = check_fused_cyclopentane(mol, atom_rings, results, score) # physicochemical penalties if abs(logp) > 10: score -= 10 if rotatable_bonds > 15: score -= 10 # aromatic bonus if num_aromatic_rings > 0: score += min(num_aromatic_rings * 5, 15) # aliphatic heterocycle penalty if num_aliphatic_heterocycles > 0 and num_aliphatic_rings <= 3: score -= 5 * num_aliphatic_heterocycles # bridgehead penalty if num_bridgehead_atoms > 0 and any(len(r) <= 5 for r in atom_rings): score -= num_bridgehead_atoms * 5 if fused_small_rings_detected: score -= 30 # anti-aromatic penalty for issue in results["issues"]: if "anti-aromatic" in issue: score -= 25 elif "\u03c0 electrons" in issue: score -= 20 # large heterocycle penalty large_hetero = sum( 1 for iss in results["issues"] if "heterocycle" in iss and "large" in iss.lower() ) if large_hetero > 0: score -= large_hetero * 10 # clamp and label score = max(0, min(100, score)) results["stability_score"] = score if score >= 80: results["assessment"] = "Likely stable" elif score >= 50: results["assessment"] = "Moderately stable" else: results["assessment"] = "Potentially unstable" return results
[docs] def check_carbocations( mol: Chem.Mol, results: dict[str, Any], score: int, ) -> int: """Detect carbocation intermediates and apply score penalties. Carbocations are positively charged carbon centres i.e. reactive intermediates that cannot be bottled. The function uses SMARTS pattern matching to find them and then checks whether the charge is stabilised by resonance (allylic or benzylic position) or by neighbouring aromatic atoms. Unstabilised and primary carbocations get the heaviest penalties; stabilised ones get a small bonus. Parameters ---------- mol : Chem.Mol RDKit molecule object already parsed from SMILES. results : dict[str, Any] Accumulator — detected issues are appended to ``results["issues"]``. score : int Running stability score to adjust. Returns ------- int Updated stability score after carbocation penalties / bonuses. Examples -------- >>> from rdkit import Chem >>> from deepretro.algorithms import check_carbocations >>> mol = Chem.MolFromSmiles("[CH2+]C") # ethyl cation >>> results = {"issues": []} >>> new_score = check_carbocations(mol, results, 100) >>> "Contains primary carbocation (highly unstable)" in results["issues"] True >>> new_score < 100 True """ if _SP2_CARBOCATION and mol.HasSubstructMatch(_SP2_CARBOCATION): for match in mol.GetSubstructMatches(_SP2_CARBOCATION): carbon = mol.GetAtomWithIdx(match[0]) neighbours = [mol.GetAtomWithIdx(n.GetIdx()) for n in carbon.GetNeighbors()] is_allylic = _ALLYLIC_CARBOCATION and mol.HasSubstructMatch( _ALLYLIC_CARBOCATION ) is_benzylic = _BENZYLIC_CARBOCATION and mol.HasSubstructMatch( _BENZYLIC_CARBOCATION ) if any(a.GetIsAromatic() for a in neighbours) or is_allylic or is_benzylic: score += 10 else: results["issues"].append( "Contains non-stabilized sp2 carbocation " "(highly unstable intermediate)" ) score -= 25 if _SP_CARBOCATION and mol.HasSubstructMatch(_SP_CARBOCATION): results["issues"].append( "Contains sp carbocation (highly unstable intermediate)" ) score -= 30 if _UNSTABILISED_CARBOCATION and mol.HasSubstructMatch(_UNSTABILISED_CARBOCATION): results["issues"].append( "Contains carbocation adjacent to electron-withdrawing " "group (extremely unstable)" ) score -= 35 if _PRIMARY_CARBOCATION and mol.HasSubstructMatch(_PRIMARY_CARBOCATION): results["issues"].append("Contains primary carbocation (highly unstable)") score -= 30 elif _SECONDARY_CARBOCATION and mol.HasSubstructMatch(_SECONDARY_CARBOCATION): results["issues"].append( "Contains secondary carbocation (unstable intermediate)" ) score -= 25 return score
[docs] def check_carbenes( mol: Chem.Mol, results: dict[str, Any], score: int, ) -> int: """Detect carbene intermediates and apply score penalties. A carbene is a neutral carbon with only two bonds and no hydrogens i.e. an extremely reactive species that usually exists only as a fleeting intermediate. Additional penalties stack if the carbene is inside a strained 3- or 4-membered ring, or sits next to an electron-withdrawing group (halogens, charged heteroatoms). Parameters ---------- mol : Chem.Mol RDKit molecule object already parsed from SMILES. results : dict[str, Any] Accumulator — detected issues are appended to ``results["issues"]``. score : int Running stability score to adjust. Returns ------- int Updated stability score after carbene penalties. Examples -------- >>> from rdkit import Chem >>> from deepretro.algorithms import check_carbenes >>> mol = Chem.MolFromSmiles("[C]1CC1") # carbene in 3-membered ring >>> results = {"issues": []} >>> new_score = check_carbenes(mol, results, 100) >>> any("carbene" in i for i in results["issues"]) True >>> new_score < 100 True """ if _SINGLET_CARBENE and mol.HasSubstructMatch(_SINGLET_CARBENE): results["issues"].append("Contains carbene (highly reactive intermediate)") score -= 35 if _UNSTABLE_CARBENE and mol.HasSubstructMatch(_UNSTABLE_CARBENE): results["issues"].append( "Contains carbene adjacent to electron-withdrawing " "group (extremely unstable)" ) score -= 10 if _RING3_CARBENE and mol.HasSubstructMatch(_RING3_CARBENE): results["issues"].append( "Contains carbene in 3-membered ring (extremely unstable)" ) score -= 15 if _RING4_CARBENE and mol.HasSubstructMatch(_RING4_CARBENE): results["issues"].append( "Contains carbene in 4-membered ring (highly unstable)" ) score -= 10 return score
[docs] def check_fused_cyclopentane( mol: Chem.Mol, atom_rings: tuple, results: dict[str, Any], score: int, ) -> int: """Detect 5-membered carbon rings fused with small hetero rings. A cyclopentane ring sharing atoms with a 3- or 4-membered ring that contains a heteroatom (N, O, S …) creates significant angle strain, for example 1,2-epoxycyclopentane. Each such fusion incurs a heavy penalty (-40). Parameters ---------- mol : Chem.Mol RDKit molecule object already parsed from SMILES. atom_rings : tuple Ring atom-index tuples from ``mol.GetRingInfo().AtomRings()``. results : dict[str, Any] Accumulator — detected issues are appended to ``results["issues"]``. score : int Running stability score to adjust. Returns ------- int Updated stability score after fused-ring penalties. Examples -------- >>> from rdkit import Chem >>> from deepretro.algorithms import check_fused_cyclopentane >>> mol = Chem.MolFromSmiles("C1CC2OCC12") # epoxycyclopentane >>> rings = mol.GetRingInfo().AtomRings() >>> results = {"issues": []} >>> new_score = check_fused_cyclopentane(mol, rings, results, 100) >>> any("strained system" in i for i in results["issues"]) True >>> new_score < 100 True """ if not (_CYCLOPENTANE and mol.HasSubstructMatch(_CYCLOPENTANE)): return score five_mem = [ r for r in atom_rings if len(r) == 5 and all(mol.GetAtomWithIdx(i).GetSymbol() == "C" for i in r) ] small_hetero = [ r for r in atom_rings if len(r) in (3, 4) and any(mol.GetAtomWithIdx(i).GetSymbol() != "C" for i in r) ] for fr in five_mem: fr_set = set(fr) for sr in small_hetero: if fr_set & set(sr): hetero = { mol.GetAtomWithIdx(i).GetSymbol() for i in sr if mol.GetAtomWithIdx(i).GetSymbol() != "C" } results["issues"].append( f"5-membered carbon ring fused with " f"{len(sr)}-membered ring containing " f"{', '.join(hetero)} (strained system)" ) score -= 40 return score