Source code for deepretro.utils.utils_molecule

"""Molecule helpers for validation, filtering, and simple descriptors."""

from __future__ import annotations

from collections.abc import Sequence

import structlog
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

logger = structlog.get_logger()

_INVALID_SMILES_ERROR = "Invalid SMILES string provided."


def _parse_molecule(smiles: str, *, message: str | None = None):
    """Parse a SMILES string into an RDKit molecule."""

    try:
        molecule = Chem.MolFromSmiles(smiles)
    except Exception:
        molecule = None

    if molecule is None and message is not None:
        logger.warning(message, smiles=smiles)

    return molecule


[docs] def is_valid_smiles(smiles: str) -> bool: """Check whether a SMILES string can be parsed successfully. Parameters ---------- smiles : str SMILES string to validate. Returns ------- bool ``True`` when the SMILES string parses to an RDKit molecule, otherwise ``False``. Examples -------- >>> is_valid_smiles("CCO") True >>> is_valid_smiles("not_a_smiles") False """ return _parse_molecule(smiles) is not None
[docs] def substructure_matching(target_smiles: str, query_smiles: str) -> int: """Check whether a query molecule is a substructure of a target molecule. Parameters ---------- target_smiles : str SMILES string of the target molecule. query_smiles : str SMILES string of the query molecule. Returns ------- int ``1`` if the query is a substructure of the target, otherwise ``0``. Examples -------- >>> substructure_matching("CCc1ccccc1", "c1ccccc1") 1 >>> substructure_matching("CCO", "c1ccccc1") 0 """ target_molecule = _parse_molecule( target_smiles, message="Unable to parse target molecule for substructure matching", ) query_molecule = _parse_molecule( query_smiles, message="Unable to parse query molecule for substructure matching", ) if target_molecule is None or query_molecule is None: return 0 return int(target_molecule.HasSubstructMatch(query_molecule))
[docs] def validity_check( molecule: str, res_molecules: Sequence[Sequence[str] | str], res_explanations: Sequence[str], res_confidence: Sequence[float], ) -> tuple[list[list[str]], list[str], list[float]]: """Filter proposed retrosynthesis pathways down to valid precursor sets. Parameters ---------- molecule : str Target molecule for retrosynthesis. res_molecules : Sequence[Sequence[str] | str] Candidate precursor pathways returned by the model. res_explanations : Sequence[str] Explanation for each candidate pathway. res_confidence : Sequence[float] Confidence score for each candidate pathway. Returns ------- tuple[list[list[str]], list[str], list[float]] Valid precursor pathways, explanations, and confidence scores. A pathway is kept only when every precursor is valid, is not identical to the target molecule, and is not a substructure of the target. Examples -------- >>> original_logger = validity_check.__globals__["logger"] >>> class _SilentLogger: ... def info(self, *args, **kwargs): ... pass ... ... def warning(self, *args, **kwargs): ... pass >>> validity_check.__globals__["logger"] = _SilentLogger() >>> validity_check( ... molecule="c1ccccc1", ... res_molecules=[["CCO", "CCCl"]], ... res_explanations=["valid pathway"], ... res_confidence=[0.9], ... ) ([['CCO', 'CCCl']], ['valid pathway'], [0.9]) >>> validity_check.__globals__["logger"] = original_logger """ valid_pathways: list[list[str]] = [] valid_explanations: list[str] = [] valid_confidence: list[float] = [] for smiles_group, explanation, confidence in zip( res_molecules, res_explanations, res_confidence, ): candidate_pathway = ( [smiles_group] if isinstance(smiles_group, str) else list(smiles_group) ) valid_candidates: list[str] = [] for smiles in candidate_pathway: if not is_valid_smiles(smiles): logger.warning( "Invalid SMILES in pathway", molecule=molecule, smiles=smiles ) continue if are_molecules_same(molecule, smiles): logger.warning( "Molecule is same as target", molecule=molecule, smiles=smiles ) continue if substructure_matching(molecule, smiles): logger.warning( "Molecule is substructure of target", molecule=molecule, smiles=smiles, ) continue valid_candidates.append(smiles) if len(valid_candidates) == len(candidate_pathway): valid_pathways.append(valid_candidates) valid_explanations.append(explanation) valid_confidence.append(confidence) logger.info("Validity check complete", valid_pathways=len(valid_pathways)) return valid_pathways, valid_explanations, valid_confidence
[docs] def calc_mol_wt(mol: str) -> float: """Calculate the exact molecular weight for a SMILES string. Parameters ---------- mol : str SMILES string of the molecule. Returns ------- float Exact molecular weight. Returns ``0.0`` for invalid SMILES. Examples -------- >>> round(calc_mol_wt("CCO"), 3) 46.042 >>> round(calc_mol_wt("C"), 3) 16.031 """ molecule = _parse_molecule(mol) if molecule is None: logger.error("Error calculating molecular weight", smiles=mol) return 0.0 return ExactMolWt(molecule)
[docs] def calc_chemical_formula(mol: str) -> str: """Calculate the molecular formula for a SMILES string. Parameters ---------- mol : str SMILES string of the molecule. Returns ------- str Molecular formula. Returns ``"N/A"`` for invalid SMILES. Examples -------- >>> calc_chemical_formula("C") 'CH4' >>> calc_chemical_formula("CCO") 'C2H6O' """ molecule = _parse_molecule(mol) if molecule is None: logger.error("Error calculating chemical formula", smiles=mol) return "N/A" return CalcMolFormula(molecule)
[docs] def are_molecules_same(smiles1: str, smiles2: str) -> bool: """Check whether two SMILES strings describe the same molecule. Parameters ---------- smiles1 : str SMILES string of the first molecule. smiles2 : str SMILES string of the second molecule. Returns ------- bool ``True`` when the molecules are equivalent, otherwise ``False``. Raises ------ ValueError If either SMILES string is invalid. Examples -------- >>> are_molecules_same("CCO", "OCC") True >>> are_molecules_same("CCO", "c1ccccc1") False """ mol1 = _parse_molecule(smiles1) mol2 = _parse_molecule(smiles2) if mol1 is None or mol2 is None: raise ValueError(_INVALID_SMILES_ERROR) if Chem.MolToSmiles(mol1, canonical=True) == Chem.MolToSmiles( mol2, canonical=True, ): return True fingerprint1 = rdMolDescriptors.GetMorganFingerprintAsBitVect( mol1, radius=2, nBits=1024, ) fingerprint2 = rdMolDescriptors.GetMorganFingerprintAsBitVect( mol2, radius=2, nBits=1024, ) return fingerprint1 == fingerprint2
[docs] def compute_fingerprint( smiles: str, radius: int = 2, nBits: int = 2048, ) -> list[int] | None: """Compute a Morgan fingerprint for a molecule. Parameters ---------- smiles : str SMILES string of the molecule. radius : int, optional Fingerprint radius, by default ``2``. nBits : int, optional Number of bits in the fingerprint, by default ``2048``. Returns ------- list[int] | None Fingerprint bit vector as integers, or ``None`` when the SMILES is invalid. Examples -------- >>> fingerprint = compute_fingerprint("CCO", radius=2, nBits=16) >>> len(fingerprint) 16 >>> compute_fingerprint("not_a_smiles") is None True """ molecule = _parse_molecule(smiles) if molecule is None: return None fingerprint = AllChem.GetMorganFingerprintAsBitVect( molecule, radius, nBits=nBits, ) return list(fingerprint)
def _has_ring_of_size(smiles: str, ring_size: int) -> bool: """Check whether a molecule contains a ring of the requested size.""" molecule = _parse_molecule(smiles) if molecule is None: raise ValueError(_INVALID_SMILES_ERROR) return any(len(ring) == ring_size for ring in molecule.GetRingInfo().AtomRings())
[docs] def detect_seven_member_rings(smiles: str) -> bool: """Detect whether a molecule contains a seven-membered ring. Parameters ---------- smiles : str SMILES string of the molecule. Returns ------- bool ``True`` when at least one seven-membered ring is present. Raises ------ ValueError If the SMILES string is invalid. Examples -------- >>> detect_seven_member_rings("C1CCCCCC1") True >>> detect_seven_member_rings("C1CCCCC1") False """ return _has_ring_of_size(smiles, ring_size=7)
[docs] def detect_eight_member_rings(smiles: str) -> bool: """Detect whether a molecule contains an eight-membered ring. Parameters ---------- smiles : str SMILES string of the molecule. Returns ------- bool ``True`` when at least one eight-membered ring is present. Raises ------ ValueError If the SMILES string is invalid. Examples -------- >>> detect_eight_member_rings("C1CCCCCCC1") True >>> detect_eight_member_rings("C1CCCCCC1") False """ return _has_ring_of_size(smiles, ring_size=8)