Example configs

Realistic config

A realistic config for training a MACE model on the C-GAP-20U dataset:

# Single top-level constant, referenced elsewhere in this config as
# `=/CUTOFF` so that every cutoff radius stays in sync with one value.
CUTOFF: 3.7

general:
    # global random seed for reproducibility
    seed: 42
    # human-readable identifier for this training run
    run_id: mace-c-gap-20u

# The model is assembled from three components: a learnable energy
# offset, a short-range ZBL core repulsion, and a many-body MACE term.
# NOTE(review): `+Name` appears to instantiate the named class /
# call the named function — confirm against the graph-pes config docs.
model:
    offset: +LearnableOffset()
    core-repulsion:
        +ZBLCoreRepulsion:
            trainable: true
            # reference to the top-level CUTOFF constant above
            cutoff: =/CUTOFF
    many-body:
        +MACE:
            # C-GAP-20U is a carbon-only dataset
            elements: [C]
            cutoff: =/CUTOFF
            channels: 128
            # irreps specification for the hidden features
            # (e3nn-style string — verify against the MACE docs)
            hidden_irreps: 0e + 1o
            self_connection: true

# Load the C-GAP-20U dataset and split it into train/valid/test
# subsets of the given sizes.
data:
    +load_atoms_dataset:
        id: C-GAP-20U
        cutoff: =/CUTOFF
        n_train: 5000
        n_valid: 100
        n_test: 500
        # how structures are assigned to the three subsets
        split: random

# Two-term loss: per-atom energy plus a force-RMSE term weighted
# 10x relative to it.
loss:
    energy: +PerAtomEnergyLoss()
    forces:
        +ForceRMSE:
            weight: 10.0

fitting:
    # extra keyword arguments forwarded to the underlying trainer
    # NOTE(review): presumably a PyTorch Lightning Trainer — confirm
    trainer_kwargs:
        max_epochs: 1000
        accelerator: gpu

    callbacks:
        # periodically dump a copy of the model during training
        - +graph_pes.training.callbacks.DumpModel:
              every_n_val_checks: 10

    # stop early if the validation metric fails to improve for this
    # many checks (units — epochs vs. val checks — to be confirmed)
    early_stopping_patience: 50

    optimizer:
        +Optimizer:
            name: AdamW
            lr: 0.003

    # reduce the learning rate by `factor` whenever the monitored
    # metric plateaus for `patience` evaluations
    scheduler:
        +LRScheduler:
            name: ReduceLROnPlateau
            patience: 10
            factor: 0.8

    # keyword arguments forwarded to the data loader
    loader_kwargs:
        batch_size: 32
        num_workers: 4

Kitchen sink config

A “kitchen sink” config that attempts to specify every possible option:

# Single top-level constant, referenced below as `=/CUTOFF`.
CUTOFF: 3.7

model:
    # fixed (non-learnable) per-element energy offsets
    offset:
        +FixedOffset: { H: -123.4, C: -456.7 }
    core-repulsion:
        +ZBLCoreRepulsion:
            trainable: true
            # reference to the top-level CUTOFF constant above
            cutoff: =/CUTOFF
    many-body:
        +NequIP:
            elements: [C, H, O, N]
            cutoff: =/CUTOFF
            channels: 128
            # irreps specification for the hidden features
            # (e3nn-style string — verify against the NequIP docs)
            hidden_irreps: 0e + 1o
            self_connection: true

# Explicit train/valid/test files rather than an automatic split.
data:
    train:
        +file_dataset:
            path: training_data.xyz
            cutoff: =/CUTOFF
            # number of structures to take from the file
            n: 1000
            shuffle: true
            seed: 42
    valid:
        +file_dataset:
            path: validation_data.xyz
            cutoff: =/CUTOFF
    test:
        +file_dataset:
            path: test_data.xyz
            cutoff: =/CUTOFF

# Three-term loss: per-atom energy, force RMSE (weight 3.0) and
# stress RMSE (weight 10.0).
loss:
    energy: +PerAtomEnergyLoss()
    forces:
        +ForceRMSE:
            weight: 3.0
    stress:
        +PropertyLoss:
            property: stress
            metric: RMSE
            weight: 10.0

fitting:
    # pre-fit the model on up to `max_n_pre_fit` structures before
    # gradient-based training — exact behaviour to be confirmed
    # against the graph-pes docs
    pre_fit_model: true
    max_n_pre_fit: 1000
    # stop early after 50 checks with no validation improvement
    early_stopping_patience: 50

    # extra keyword arguments forwarded to the underlying trainer
    # NOTE(review): presumably a PyTorch Lightning Trainer — confirm
    trainer_kwargs:
        max_epochs: 1000
        accelerator: gpu
        # gradient accumulation across batches (increases the
        # effective batch size — confirm trainer semantics)
        accumulate_grad_batches: 4
        # fraction of an epoch between validation checks
        # (Lightning-style semantics — confirm)
        val_check_interval: 0.25

    optimizer:
        +Optimizer:
            name: AdamW
            lr: 0.003
            weight_decay: 0.01
            amsgrad: true

    # reduce the learning rate by `factor` whenever the monitored
    # metric plateaus for `patience` evaluations
    scheduler:
        +LRScheduler:
            name: ReduceLROnPlateau
            patience: 10
            factor: 0.8

    # stochastic weight averaging; `start: 0.8` presumably means
    # 80% of the way through training — confirm units
    swa:
        lr: 0.001
        start: 0.8
        anneal_epochs: 10
        strategy: linear

    # keyword arguments forwarded to the data loader
    loader_kwargs:
        batch_size: 32
        num_workers: 4
        shuffle: true
        persistent_workers: true

general:
    seed: 42
    # directory under which run outputs are written
    root_dir: /path/to/root
    run_id: kitchen-sink-run
    log_level: INFO
    # progress-bar style
    progress: rich

# Weights & Biases logging configuration.
wandb:
    project: my_project
    entity: my_entity
    tags: [kitchen-sink, test]
    name: kitchen-sink-run

Default config

For reference, here are the default config options used by `graph-pes-train`:

# Values applied when the user config omits the corresponding keys.
general:
    seed: 42
    root_dir: "graph-pes-results"
    # no run id by default — presumably one is auto-generated; confirm
    run_id: null
    log_level: "INFO"
    progress: rich
    torch:
        # default floating-point precision for training
        dtype: float32
        # NOTE(review): presumably passed to
        # torch.set_float32_matmul_precision — confirm
        float32_matmul_precision: high

fitting:
    pre_fit_model: true
    # cap on the number of structures used for pre-fitting
    max_n_pre_fit: 5_000

    # train for 100 epochs on the best device available
    trainer_kwargs:
        max_epochs: 100
        accelerator: auto
        enable_model_summary: false

    # keyword arguments forwarded to the data loader
    loader_kwargs:
        num_workers: 1
        persistent_workers: true
        batch_size: 4
        pin_memory: false

    # "fancy"/optional training options disabled
    callbacks: []
    early_stopping_patience: null
    scheduler: null
    swa: null

# Weights & Biases logging is disabled/unconfigured by default.
wandb: {}