QM9 Regression#

In this tutorial we provide a 3D example using the QM9 dataset. We use the 3D molecules in the dataset for featurization and train a random forest regressor to predict one of the target properties in the dataset (here: cv, the heat capacity).

To follow along, make sure to install additional dependencies: molflux[rdkit].

Loading the qm9 dataset#

Since the original dataset is quite big, for demonstration purposes we load a truncated version of the dataset stored on disk.

from molflux.datasets import load_dataset_from_store

dataset = load_dataset_from_store("gdb9_trunc.parquet")

print(dataset)
Dataset({
    features: ['mol_bytes', 'mol_id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom'],
    num_rows: 1000
})

Registering a custom featurization method#

Here we demonstrate how to add your own representation temporarily. This is a simple wrapper around one of the many 3D descriptors offered by RDKit.

from typing import Any, Dict


from molflux.features.catalogue import register_representation
from molflux.features.bases import RepresentationBase
from molflux.features.info import RepresentationInfo
from rdkit import Chem
from rdkit.Chem import Descriptors3D

_DESCRIPTION = """
Rdkit 3D descriptors.
"""


@register_representation(kind="custom", name="rdkit_descriptors_3d")
class RdkitDescriptors3D(RepresentationBase):
    """Custom representation wrapping RDKit's ``CalcAUTOCORR3D`` 3D descriptor."""

    def _info(self) -> RepresentationInfo:
        """Return metadata describing this representation."""
        return RepresentationInfo(
            description=_DESCRIPTION,
        )

    def _featurise(
        self,
        samples: Any,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Compute 3D autocorrelation descriptors for each sample.

        Args:
            samples: Iterable of binary RDKit molecule blobs (as stored in the
                dataset's ``mol_bytes`` column); each is deserialised with
                ``Chem.Mol``.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            A dict mapping this representation's tag to a list of descriptor
            vectors, one per input molecule.
        """
        # Call the descriptor function directly rather than looking it up via
        # getattr with a hard-coded string -- clearer, and visible to static
        # analysis / IDEs.
        descriptors = [
            Descriptors3D.rdMolDescriptors.CalcAUTOCORR3D(Chem.Mol(mol))
            for mol in samples
        ]
        return {self.tag: descriptors}

Featurising#

Now that we have registered the representation, we can load it as if it were part of the catalogue.

from molflux.datasets import featurise_dataset
from molflux.features import load_from_dicts as load_representations_from_dicts

# Load the custom representation registered earlier, by name, exactly as if it
# were part of the built-in catalogue.
featuriser = load_representations_from_dicts(
    [
        {"name": "rdkit_descriptors_3d"},
    ]
)

# Featurise the raw molecule blobs; the descriptors are stored in a new
# column named after the source column and the representation tag.
featurised_dataset = featurise_dataset(
    dataset, column="mol_bytes", representations=featuriser
)

In order to compare it to classic molecular descriptors, we create a new smiles column using the Hugging Face datasets mapping functionality.

# Now we can compare to 2D-feature, we need to first generate smiles with a small helper function


def _from_molbytes_to_smiles(example):
    """Add a canonical ``smiles`` entry derived from ``example["mol_bytes"]``.

    Molecules whose SMILES cannot be re-parsed by RDKit get ``None`` so they
    can be filtered out downstream.
    """
    raw_smiles = Chem.MolToSmiles(Chem.Mol(example["mol_bytes"]))
    # Round-trip through MolFromSmiles to catch molecules with which rdkit
    # struggles.
    reparsed = Chem.MolFromSmiles(raw_smiles)
    example["smiles"] = None if reparsed is None else Chem.MolToSmiles(reparsed)
    return example


featurised_dataset = featurised_dataset.map(_from_molbytes_to_smiles)
# Drop molecules for which RDKit could not produce a re-parseable SMILES.
featurised_dataset = featurised_dataset.filter(
    lambda example: example["smiles"] is not None
)

# Standard 2D fingerprints from the built-in catalogue, for comparison with
# the custom 3D descriptors.
featuriser = load_representations_from_dicts(
    [
        {"name": "morgan"},
        {"name": "maccs_rdkit"},
    ]
)

featurised_dataset = featurise_dataset(
    featurised_dataset, column="smiles", representations=featuriser
)
featurised_dataset
Dataset({
    features: ['mol_bytes', 'mol_id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom', 'mol_bytes::rdkit_descriptors3_d', 'smiles', 'smiles::morgan', 'smiles::maccs_rdkit'],
    num_rows: 991
})

Splitting#

from molflux.datasets import split_dataset
from molflux.splits import load_from_dict as load_split_from_dict

# Random 80/20 train/test split; no validation set is needed for this example.
shuffle_strategy = load_split_from_dict(
    {
        "name": "shuffle_split",
        "presets": {
            "train_fraction": 0.8,
            "validation_fraction": 0.0,
            "test_fraction": 0.2,
        },
    }
)

# split_dataset yields one split per fold; shuffle_split produces a single
# fold, so take the first.
split_featurised_dataset = next(split_dataset(featurised_dataset, shuffle_strategy))

split_featurised_dataset
DatasetDict({
    train: Dataset({
        features: ['mol_bytes', 'mol_id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom', 'mol_bytes::rdkit_descriptors3_d', 'smiles', 'smiles::morgan', 'smiles::maccs_rdkit'],
        num_rows: 792
    })
    validation: Dataset({
        features: ['mol_bytes', 'mol_id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom', 'mol_bytes::rdkit_descriptors3_d', 'smiles', 'smiles::morgan', 'smiles::maccs_rdkit'],
        num_rows: 0
    })
    test: Dataset({
        features: ['mol_bytes', 'mol_id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom', 'mol_bytes::rdkit_descriptors3_d', 'smiles', 'smiles::morgan', 'smiles::maccs_rdkit'],
        num_rows: 199
    })
})

Training using 3D-descriptors#

import json

from molflux.modelzoo import load_from_dict as load_model_from_dict
from molflux.metrics import load_suite

import matplotlib.pyplot as plt

# Target property to predict: cv (heat capacity).
y_feature = "cv"

# Random forest trained on the custom 3D descriptor column only.
model = load_model_from_dict(
    {
        "name": "random_forest_regressor",
        "config": {
            "x_features": ["mol_bytes::rdkit_descriptors3_d"],
            "y_features": [y_feature],
        },
    }
)

model.train(split_featurised_dataset["train"])

preds = model.predict(split_featurised_dataset["test"])

# Evaluate with the full suite of standard regression metrics on the test set.
regression_suite = load_suite("regression")

scores = regression_suite.compute(
    references=split_featurised_dataset["test"][y_feature],
    predictions=preds[f"random_forest_regressor::{y_feature}"],
)

print(json.dumps({k: round(v, 2) for k, v in scores.items()}, indent=4))

# Parity plot: predicted vs. true values on the test set.
plt.scatter(
    split_featurised_dataset["test"][y_feature],
    preds[f"random_forest_regressor::{y_feature}"],
)

plt.xlabel("True values")
plt.ylabel("Predicted values")
plt.title(f"Performance on predicting {y_feature} property")
plt.show()
{
    "explained_variance": 0.91,
    "max_error": 4.21,
    "mean_absolute_error": 0.98,
    "mean_squared_error": 1.55,
    "root_mean_squared_error": 1.24,
    "median_absolute_error": 0.81,
    "r2": 0.91,
    "spearman::correlation": 0.94,
    "spearman::p_value": 0.0,
    "pearson::correlation": 0.95,
    "pearson::p_value": 0.0
}
../../_images/fe68446079b8df5d402dc20f56be7a168975c299d8fbcf07e076a09a343a89ac.png

Training using 2D-descriptors#

# Same training/evaluation recipe, but using the 2D fingerprint features
# (Morgan + MACCS) instead of the 3D descriptors, for comparison.
model = load_model_from_dict(
    {
        "name": "random_forest_regressor",
        "config": {
            "x_features": ["smiles::morgan", "smiles::maccs_rdkit"],
            "y_features": [y_feature],
        },
    }
)

model.train(split_featurised_dataset["train"])

preds = model.predict(split_featurised_dataset["test"])

# Evaluate with the same regression metric suite as the 3D model.
regression_suite = load_suite("regression")

scores = regression_suite.compute(
    references=split_featurised_dataset["test"][y_feature],
    predictions=preds[f"random_forest_regressor::{y_feature}"],
)

print(json.dumps({k: round(v, 2) for k, v in scores.items()}, indent=4))

# Parity plot: predicted vs. true values on the test set.
plt.scatter(
    split_featurised_dataset["test"][y_feature],
    preds[f"random_forest_regressor::{y_feature}"],
)

plt.xlabel("True values")
plt.ylabel("Predicted values")
plt.title(f"Performance on predicting {y_feature} property")
plt.show()
{
    "explained_variance": 0.68,
    "max_error": 11.26,
    "mean_absolute_error": 1.74,
    "mean_squared_error": 5.41,
    "root_mean_squared_error": 2.33,
    "median_absolute_error": 1.45,
    "r2": 0.68,
    "spearman::correlation": 0.78,
    "spearman::p_value": 0.0,
    "pearson::correlation": 0.82,
    "pearson::p_value": 0.0
}
../../_images/aedeb2673616a89506dba2af38c9ef5a216c250e5b262e57463eeae5e7e696e4.png