thermo #

Train against thermodynamic properties.

Classes:

DataEntry –

Represents a single experimental data point.
SimulationKey –

A key used to identify a simulation.
SimulationConfig –

Configuration for a simulation to run.

Functions:

create_dataset –

Create a dataset from a list of existing data points.
create_from_evaluator –

Create a dataset from an evaluator PhysicalPropertyDataSet
extract_smiles –

Return a list of unique SMILES strings in the dataset.
default_config –

Return a default simulation configuration for the specified phase.
select_config –

A helper method to choose the simulation config based on the phase with the desired temperature and pressure.
predict –

Predict the properties in a dataset using molecular simulation, or by reweighting previous simulation data.
default_closure –

Return a default closure function for training against thermodynamic properties.

DataEntry #

Bases: TypedDict

Represents a single experimental data point.

Attributes:

type (DataType) –

The type of data point.
smiles_a (str) –

The SMILES definition of the first component.
x_a (float | None) –

The mole fraction of the first component. This must be set to 1.0 if the data point is for a pure (single-component) system.
smiles_b (str | None) –

The SMILES definition of the second component if present.
x_b (float | None) –

The mole fraction of the second component if present.
temperature (float) –

The temperature at which the data point was measured.
pressure (float) –

The pressure at which the data point was measured.
value (float) –

The value of the data point.
std (float | None) –

The standard deviation of the data point if available.
units (str) –

The units of the data point.
source (str) –

The source of the data point.

type `instance-attribute` #

type: DataType

The type of data point.

smiles_a `instance-attribute` #

smiles_a: str

The SMILES definition of the first component.

x_a `instance-attribute` #

x_a: float | None

The mole fraction of the first component. This must be set to 1.0 if the data point is for a pure (single-component) system.

smiles_b `instance-attribute` #

smiles_b: str | None

The SMILES definition of the second component if present.

x_b `instance-attribute` #

x_b: float | None

The mole fraction of the second component if present.

temperature `instance-attribute` #

temperature: float

The temperature at which the data point was measured.

pressure `instance-attribute` #

pressure: float

The pressure at which the data point was measured.

value `instance-attribute` #

value: float

The value of the data point.

std `instance-attribute` #

std: float | None

The standard deviation of the data point if available.

units `instance-attribute` #

units: str

The units of the data point.

source `instance-attribute` #

source: str

The source of the data point.

SimulationKey #

Bases: NamedTuple

A key used to identify a simulation.

Attributes:

smiles (tuple[str, ...]) –

The SMILES definitions of the components present in the system.
counts (tuple[int, ...]) –

The number of copies of each component present in the system.
temperature (float) –

The temperature [K] at which the simulation was run.
pressure (float | None) –

The pressure [atm] at which the simulation was run.

smiles `instance-attribute` #

smiles: tuple[str, ...]

The SMILES definitions of the components present in the system.

counts `instance-attribute` #

counts: tuple[int, ...]

The number of copies of each component present in the system.

temperature `instance-attribute` #

temperature: float

The temperature [K] at which the simulation was run.

pressure `instance-attribute` #

pressure: float | None

The pressure [atm] at which the simulation was run.

SimulationConfig `pydantic-model` #

Bases: BaseModel

Configuration for a simulation to run.

Fields:

max_mols (int)
gen_coords (GenerateCoordsConfig)
apply_hmr (bool)
equilibrate (list[MinimizationConfig | SimulationConfig])
production (SimulationConfig)
production_frequency (int)

max_mols `pydantic-field` #

max_mols: int

The maximum number of molecules to simulate.

gen_coords `pydantic-field` #

gen_coords: GenerateCoordsConfig

Configuration for generating initial coordinates.

apply_hmr `pydantic-field` #

apply_hmr: bool = False

Whether to apply hydrogen mass repartitioning.

equilibrate `pydantic-field` #

equilibrate: list[MinimizationConfig | SimulationConfig]

Configuration for equilibration simulations.

production `pydantic-field` #

production: SimulationConfig

Configuration for the production simulation.

production_frequency `pydantic-field` #

production_frequency: int

The frequency at which to write frames during production.

create_dataset #

create_dataset(*rows: DataEntry) -> Dataset

Create a dataset from a list of existing data points.

Parameters:

rows (DataEntry, default: () ) –

The data points to create the dataset from.

Returns:

Dataset –

The created dataset.

Source code in descent/targets/thermo.py

def create_dataset(*rows: DataEntry) -> datasets.Dataset:
    """Create a dataset from a list of existing data points.

    The SMILES pattern of each component is passed through
    ``descent.utils.molecule.map_smiles`` before being stored. The dictionaries
    passed in by the caller are not modified.

    Args:
        rows: The data points to create the dataset from.

    Returns:
        The created dataset.
    """

    mapped_rows = []

    for row in rows:
        # Work on a shallow copy so the caller's entries keep their original SMILES.
        mapped_row = {**row}
        mapped_row["smiles_a"] = descent.utils.molecule.map_smiles(row["smiles_a"])

        # The second component is optional and is left as ``None`` when absent.
        if row["smiles_b"] is not None:
            mapped_row["smiles_b"] = descent.utils.molecule.map_smiles(
                row["smiles_b"]
            )

        mapped_rows.append(mapped_row)

    # TODO: validate rows
    table = pyarrow.Table.from_pylist(mapped_rows, schema=DATA_SCHEMA)

    return datasets.Dataset(datasets.table.InMemoryTable(table))

create_from_evaluator #

create_from_evaluator(dataset_file: Path) -> Dataset

Create a dataset from an evaluator PhysicalPropertyDataSet

Parameters:

dataset_file (Path) –

The path to the evaluator dataset

Returns:

Dataset –

The created dataset

Source code in descent/targets/thermo.py

def create_from_evaluator(dataset_file: pathlib.Path) -> datasets.Dataset:
    """
    Create a dataset from an evaluator PhysicalPropertyDataSet

    Densities are stored in g/mL and enthalpies of mixing / vaporization in
    kcal/mol. Temperatures are converted to kelvin and pressures to atmospheres.

    Args:
        dataset_file: The path to the evaluator dataset

    Returns:
        The created dataset

    Raises:
        KeyError: If the dataset contains a property type that is not supported.
        NotImplementedError: If a property was measured for a substance with more
            than two components.
    """
    import json

    from openff.units import unit

    _evaluator_to_prop = {
        "openff.evaluator.properties.density.Density": "density",
        "openff.evaluator.properties.enthalpy.EnthalpyOfMixing": "hmix",
        "openff.evaluator.properties.enthalpy.EnthalpyOfVaporization": "hvap",
    }
    _prop_units = {"density": "g/mL", "hmix": "kcal/mol", "hvap": "kcal/mol"}

    def _to_quantity(entry):
        """Convert a serialized ``{"value": ..., "unit": ...}`` pair to a quantity."""
        return entry["value"] * getattr(unit, entry["unit"])

    properties: list[DataEntry] = []

    # Use a context manager so the file handle is closed as soon as it is parsed.
    with dataset_file.open() as file:
        property_data = json.load(file)

    for phys_prop in property_data["properties"]:
        try:
            prop_type = _evaluator_to_prop[phys_prop["@type"]]
        except KeyError:
            raise KeyError(f"{phys_prop['@type']} not currently supported.") from None

        substance = phys_prop["substance"]

        # Amounts are keyed by the SMILES followed by the component role in braces.
        smiles_and_role = [
            (comp["smiles"], comp["smiles"] + "{" + comp["role"]["value"] + "}")
            for comp in substance["components"]
        ]
        smiles_a, role_a = smiles_and_role[0]
        x_a = substance["amounts"][role_a][0]["value"]
        if len(smiles_and_role) == 1:
            smiles_b, x_b = None, None
        elif len(smiles_and_role) == 2:
            smiles_b, role_b = smiles_and_role[1]
            x_b = substance["amounts"][role_b][0]["value"]
        else:
            raise NotImplementedError("up to binary mixtures are currently supported")

        state = phys_prop["thermodynamic_state"]
        temp = _to_quantity(state["temperature"])
        pressure = _to_quantity(state["pressure"])

        default_units = getattr(unit, _prop_units[prop_type])
        value = _to_quantity(phys_prop["value"])

        # The uncertainty is optional: a data entry may carry ``std=None``.
        uncertainty = phys_prop.get("uncertainty")
        std = (
            None
            if uncertainty is None
            else _to_quantity(uncertainty).to(default_units).m
        )

        prop = {
            "type": prop_type,
            "smiles_a": smiles_a,
            "x_a": x_a,
            "smiles_b": smiles_b,
            "x_b": x_b,
            "temperature": temp.to(unit.kelvin).m,
            "pressure": pressure.to(unit.atm).m,
            "value": value.to(default_units).m,
            "units": _prop_units[prop_type],
            "std": std,
            "source": phys_prop["source"]["doi"],
        }
        properties.append(prop)

    return create_dataset(*properties)

extract_smiles #

extract_smiles(dataset: Dataset) -> list[str]

Return a list of unique SMILES strings in the dataset.

Parameters:

dataset (Dataset) –

The dataset to extract the SMILES strings from.

Returns:

list[str] –

The unique SMILES strings with full atom mapping.

Source code in descent/targets/thermo.py

def extract_smiles(dataset: datasets.Dataset) -> list[str]:
    """Collect every distinct SMILES pattern that appears in the dataset.

    Both the ``smiles_a`` and ``smiles_b`` columns are inspected, and missing
    (``None``) entries are ignored.

    Args:
        dataset: The dataset to extract the SMILES strings from.

    Returns:
        The unique SMILES strings, sorted, as stored in the dataset.
    """
    found: set[str] = set()

    for column in ("smiles_a", "smiles_b"):
        found.update(
            pattern for pattern in dataset.unique(column) if pattern is not None
        )

    return sorted(found)

default_config #

default_config(
    phase: Phase, temperature: float, pressure: float | None
) -> SimulationConfig

Return a default simulation configuration for the specified phase.

Parameters:

phase (Phase) –

The phase to return the default configuration for.
temperature (float) –

The temperature [K] at which to run the simulation.
pressure (float | None) –

The pressure [atm] at which to run the simulation.

Returns:

SimulationConfig –

The default simulation configuration.

Source code in descent/targets/thermo.py

def default_config(
    phase: Phase, temperature: float, pressure: float | None
) -> SimulationConfig:
    """Build the default simulation configuration for a phase.

    The phase name is matched case-insensitively against ``"bulk"`` and
    ``"vacuum"``.

    Args:
        phase: The phase to return the default configuration for.
        temperature: The temperature [K] at which to run the simulation.
        pressure: The pressure [atm] at which to run the simulation.

    Returns:
        The default simulation configuration.

    Raises:
        NotImplementedError: If the phase is neither bulk nor vacuum.
    """
    phase_name = phase.lower()

    if phase_name == "bulk":
        return _bulk_config(temperature, pressure)
    if phase_name == "vacuum":
        return _vacuum_config(temperature, pressure)

    raise NotImplementedError(phase)

select_config #

select_config(
    phase: Phase,
    temperature: float,
    pressure: float | None,
    custom_config: (
        dict[str, SimulationConfig] | None
    ) = None,
) -> SimulationConfig

A helper method to choose the simulation config based on the phase with the desired temperature and pressure. If a custom configuration is not available the default will be used.

Parameters:

phase (Phase) –

The phase of the simulation.
temperature (float) –

The temperature [K] at which to run the simulation.
pressure (float | None) –

The pressure [atm] at which to run the simulation
custom_config (dict[str, SimulationConfig] | None, default: None ) –

The custom simulation configuration for each phase.

Returns:

SimulationConfig –

The simulation configuration for the given phase.

Source code in descent/targets/thermo.py

def select_config(
    phase: Phase,
    temperature: float,
    pressure: float | None,
    custom_config: dict[str, SimulationConfig] | None = None,
) -> SimulationConfig:
    """
    A helper method to choose the simulation config based on the phase
        with the desired temperature and pressure.
    If a custom configuration is not available the default will be used.

    Note that a custom configuration is edited in place: the temperature and
    pressure of its simulation stages are overwritten and the same object is
    returned.

    Args:
        phase: The phase of the simulation.
        temperature: The temperature [K] at which to run the simulation.
        pressure: The pressure [atm] at which to run the simulation, or ``None``
            if no pressure should be set.
        custom_config: The custom simulation configuration for each phase.

    Returns:
        The simulation configuration for the given phase.
    """
    if custom_config is None:
        custom_config = {}

    # Only the lookup is guarded, so that a ``KeyError`` raised while editing the
    # configuration is not mistaken for a missing custom configuration.
    try:
        config = custom_config[phase]
    except KeyError:
        return default_config(phase=phase, temperature=temperature, pressure=pressure)

    # edit the config with the desired temperature and pressure
    temperature_quantity = temperature * openmm.unit.kelvin
    # A missing pressure stays ``None`` rather than being wrapped in a unit.
    pressure_quantity = (
        None if pressure is None else pressure * openmm.unit.atmosphere
    )

    for stage in config.equilibrate:
        # Minimization stages are skipped; only simulation stages are updated.
        if isinstance(stage, smee.mm.SimulationConfig):
            stage.temperature = temperature_quantity
            stage.pressure = pressure_quantity

    config.production.temperature = temperature_quantity
    config.production.pressure = pressure_quantity

    return config

predict #

predict(
    dataset: Dataset,
    force_field: TensorForceField,
    topologies: dict[str, TensorTopology],
    output_dir: Path,
    cached_dir: Path | None = None,
    per_type_scales: dict[DataType, float] | None = None,
    verbose: bool = False,
    simulation_config: (
        dict[str, SimulationConfig] | None
    ) = None,
) -> tuple[Tensor, Tensor, Tensor, Tensor]

Predict the properties in a dataset using molecular simulation, or by reweighting previous simulation data.

Parameters:

dataset (Dataset) –

The dataset to predict the properties of.
force_field (TensorForceField) –

The force field to use.
topologies (dict[str, TensorTopology]) –

The topologies of the molecules present in the dataset, with keys of mapped SMILES patterns.
output_dir (Path) –

The directory to write the simulation trajectories to.
cached_dir (Path | None, default: None ) –

The (optional) directory to read cached simulation trajectories from.
per_type_scales (dict[DataType, float] | None, default: None ) –

The scale factor to apply to each data type. A default of 1.0 will be used for any data type not specified.
verbose (bool, default: False ) –

Whether to log additional information.
simulation_config (dict[str, SimulationConfig] | None, default: None ) –

The (optional) simulation configuration. It should contain a config for each phase; if not provided, the default will be used.

Source code in descent/targets/thermo.py

def predict(
    dataset: datasets.Dataset,
    force_field: smee.TensorForceField,
    topologies: dict[str, smee.TensorTopology],
    output_dir: pathlib.Path,
    cached_dir: pathlib.Path | None = None,
    per_type_scales: dict[DataType, float] | None = None,
    verbose: bool = False,
    simulation_config: dict[str, SimulationConfig] | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Predict the properties in a dataset using molecular simulation, or by reweighting
    previous simulation data.

    Args:
        dataset: The dataset to predict the properties of.
        force_field: The force field to use.
        topologies: The topologies of the molecules present in the dataset, with keys
            of mapped SMILES patterns.
        output_dir: The directory to write the simulation trajectories to.
        cached_dir: The (optional) directory to read cached simulation trajectories
            from.
        per_type_scales: The scale factor to apply to each data type. A default of 1.0
            will be used for any data type not specified.
        verbose: Whether to log additional information.
        simulation_config: The (optional) simulation configuration. It should contain
            a config for each phase; if not provided, the default will be used.

    Returns:
        The reference values, the reference standard deviations, the predicted
        values and the predicted standard deviations, in that order. Values are
        multiplied by the per-type scale and standard deviations by its absolute
        value. Standard deviations that are not available are reported as NaN.
    """

    entries: list[DataEntry] = [*descent.utils.dataset.iter_dataset(dataset)]

    required_simulations, entry_to_simulation = _plan_simulations(
        entries, topologies, simulation_config
    )
    observables = {
        phase: {
            key: _compute_observables(
                phase,
                key,
                system,
                force_field,
                output_dir,
                cached_dir,
                simulation_config,
            )
            for key, system in systems.items()
        }
        for phase, systems in required_simulations.items()
    }

    predicted = []
    predicted_std = []
    reference = []
    reference_std = []

    verbose_rows = []

    per_type_scales = per_type_scales if per_type_scales is not None else {}

    for entry, keys in zip(entries, entry_to_simulation, strict=True):
        value, std = _predict(entry, keys, observables, required_simulations)

        type_scale = per_type_scales.get(entry["type"], 1.0)

        predicted.append(value * type_scale)
        # ``torch.stack`` only accepts tensors, so a missing standard deviation is
        # stored as a NaN tensor matching the predicted value.
        predicted_std.append(
            torch.full_like(value, torch.nan)
            if std is None
            else std * abs(type_scale)
        )

        reference.append(entry["value"] * type_scale)
        reference_std.append(
            torch.nan if entry["std"] is None else entry["std"] * abs(type_scale)
        )

        if verbose:
            std_ref = "" if entry["std"] is None else f" ± {float(entry['std']):.3f}"
            # The predicted standard deviation may also be missing.
            std_pred = "" if std is None else f" ± {float(std):.3f}"

            verbose_rows.append(
                {
                    "type": f"{entry['type']} [{entry['units']}]",
                    "smiles_a": descent.utils.molecule.unmap_smiles(entry["smiles_a"]),
                    "smiles_b": (
                        ""
                        if entry["smiles_b"] is None
                        else descent.utils.molecule.unmap_smiles(entry["smiles_b"])
                    ),
                    "pred": f"{float(value):.3f}{std_pred}",
                    "ref": f"{float(entry['value']):.3f}{std_ref}",
                }
            )

    if verbose:
        import pandas

        _LOGGER.info(f"predicted {len(entries)} properties")
        _LOGGER.info("\n" + pandas.DataFrame(verbose_rows).to_string(index=False))

    predicted = torch.stack(predicted)
    predicted_std = torch.stack(predicted_std)

    # Build the reference tensors to be compatible with the predicted ones.
    reference = smee.utils.tensor_like(reference, predicted)
    reference_std = smee.utils.tensor_like(reference_std, predicted_std)

    return reference, reference_std, predicted, predicted_std

default_closure #

default_closure(
    trainable: Trainable,
    topologies: dict[str, TensorTopology],
    dataset: Dataset,
    per_type_scales: dict[DataType, float] | None = None,
    verbose: bool = False,
    simulation_config: (
        dict[str, SimulationConfig] | None
    ) = None,
) -> ClosureFn

Return a default closure function for training against thermodynamic properties.

Parameters:

trainable (Trainable) –

The wrapper around trainable parameters.
topologies (dict[str, TensorTopology]) –

The topologies of the molecules present in the dataset, with keys of mapped SMILES patterns.
dataset (Dataset) –

The dataset to train against.
per_type_scales (dict[DataType, float] | None, default: None ) –

The scale factor to apply to each data type.
verbose (bool, default: False ) –

Whether to log additional information about predictions.
simulation_config (dict[str, SimulationConfig] | None, default: None ) –

The (optional) simulation configuration. It should contain a config for each phase; if not provided, the default will be used.

Returns:

ClosureFn –

The default closure function.

Source code in descent/targets/thermo.py

def default_closure(
    trainable: "descent.train.Trainable",
    topologies: dict[str, smee.TensorTopology],
    dataset: datasets.Dataset,
    per_type_scales: dict[DataType, float] | None = None,
    verbose: bool = False,
    simulation_config: dict[str, SimulationConfig] | None = None,
) -> descent.optim.ClosureFn:
    """Build the default closure used when training against thermodynamic
    properties.

    The returned closure predicts the dataset with the force field derived from
    the current parameters and evaluates a sum-of-squared-residuals loss.

    Args:
        trainable: The wrapper around trainable parameters.
        topologies: The topologies of the molecules present in the dataset, with keys
            of mapped SMILES patterns.
        dataset: The dataset to train against.
        per_type_scales: The scale factor to apply to each data type.
        verbose: Whether to log additional information about predictions.
        simulation_config: The (optional) simulation configuration, should contain
            a config for each phase if not provided the default will be used.

    Returns:
        The default closure function.
    """

    def closure_fn(
        x: torch.Tensor,
        compute_gradient: bool,
        compute_hessian: bool,
    ):
        current_ff = trainable.to_force_field(x)

        # Predictions are written to the current working directory and no cached
        # trajectories are read.
        reference, _, prediction, _ = descent.targets.thermo.predict(
            dataset,
            current_ff,
            topologies,
            pathlib.Path.cwd(),
            None,
            per_type_scales,
            verbose,
            simulation_config,
        )

        residuals = prediction - reference
        loss = (residuals**2).sum()

        # The hessian is evaluated before the gradient, as both reuse the graph.
        hessian = (
            descent.utils.loss.approximate_hessian(x, prediction)
            if compute_hessian
            else None
        )
        gradient = (
            torch.autograd.grad(loss, x, retain_graph=True)[0].detach()
            if compute_gradient
            else None
        )

        return loss.detach(), gradient, hessian

    return closure_fn

thermo #

DataEntry #

type instance-attribute #

smiles_a instance-attribute #

x_a instance-attribute #

smiles_b instance-attribute #

x_b instance-attribute #

temperature instance-attribute #

pressure instance-attribute #

value instance-attribute #

std instance-attribute #

units instance-attribute #

source instance-attribute #

SimulationKey #

smiles instance-attribute #

counts instance-attribute #

temperature instance-attribute #

pressure instance-attribute #

SimulationConfig pydantic-model #

max_mols pydantic-field #

gen_coords pydantic-field #

apply_hmr pydantic-field #

equilibrate pydantic-field #

production pydantic-field #

production_frequency pydantic-field #

create_dataset #

create_from_evaluator #

extract_smiles #

default_config #

select_config #

predict #

default_closure #

type `instance-attribute` #

smiles_a `instance-attribute` #

x_a `instance-attribute` #

smiles_b `instance-attribute` #

x_b `instance-attribute` #

temperature `instance-attribute` #

pressure `instance-attribute` #

value `instance-attribute` #

std `instance-attribute` #

units `instance-attribute` #

source `instance-attribute` #

smiles `instance-attribute` #

counts `instance-attribute` #

temperature `instance-attribute` #

pressure `instance-attribute` #

SimulationConfig `pydantic-model` #

max_mols `pydantic-field` #

gen_coords `pydantic-field` #

apply_hmr `pydantic-field` #

equilibrate `pydantic-field` #

production `pydantic-field` #

production_frequency `pydantic-field` #