Source code for load_atoms.database.database_entry

# explicitly not using future annotations since this is not supported
# by pydantic for python versions we want to target

from pathlib import Path
from typing import Dict, Literal, Optional, Union

import yaml
from pydantic import BaseModel, field_validator

from load_atoms.utils import BASE_REMOTE_URL

LICENSE_URLS = {
    "CC BY-NC-SA 4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en",
    "CC BY-NC 4.0": "https://creativecommons.org/licenses/by-nc/4.0/deed.en",
    "CC BY 4.0": "https://creativecommons.org/licenses/by/4.0/deed.en",
    "CC0": "https://creativecommons.org/publicdomain/zero/1.0/",
    "MIT": "https://opensource.org/licenses/MIT",
    "GPLv3": "https://www.gnu.org/licenses/gpl-3.0.html",
}
VALID_LICENSES = list(LICENSE_URLS.keys())

VALID_CATEGORIES = ["Benchmarks", "Potential Fitting", "Synthetic Data"]


class PropertyDescription(BaseModel):
    """
    Holds a description of a property, such that it can be automatically
    validated upon creation.
    """

    desc: str
    """A description of the property"""

    units: Optional[str] = None
    """The units of the property"""


class DatabaseEntry(BaseModel):
    """
    Holds all the required metadata for a named dataset, such that it can be
    automatically downloaded using :func:`~load_atoms.load_dataset`, and so
    that documentation can be automatically generated.
    """

    name: str
    """The name of the dataset"""

    year: int
    """The year the dataset was created"""

    description: str
    """A description of the dataset (in ``.rst`` format)"""

    category: str
    """
    The category of the dataset
    (e.g. ``"Potential Fitting"``, ``"Benchmarks"``)
    """

    format: Literal["lmdb", "memory"] = "memory"
    """The format of the dataset"""

    minimum_load_atoms_version: Union[str, None] = None
    """
    The minimum version of load-atoms that is required to load the dataset.
    """

    citation: Optional[str] = None
    """A citation for the dataset (in BibTeX format)"""

    license: Optional[str] = None
    """The license identifier of the dataset (e.g. ``"CC BY-NC-SA 4.0"``)"""

    representative_structure: Optional[int] = None
    """The index of a representative structure (for visualisation purposes)"""

    per_atom_properties: Optional[Dict[str, PropertyDescription]] = None
    """A mapping from per-atom properties to their descriptions"""

    per_structure_properties: Optional[Dict[str, PropertyDescription]] = None
    """A mapping from per-structure properties to their descriptions"""

    @field_validator("category")
    def validate_category(cls, v):
        if v not in VALID_CATEGORIES:
            raise ValueError(
                f"Invalid category: {v}. Must be one of {VALID_CATEGORIES}"
            )
        return v

    @field_validator("license")
    def validate_license(cls, v):
        if v not in VALID_LICENSES:
            raise ValueError(
                f"Invalid license: {v}. Must be one of {VALID_LICENSES}"
            )
        return v

    @field_validator("citation")
    def validate_citation(cls, v):
        v = v.strip()
        if v.startswith("@") and v.endswith("}"):
            return v
        raise ValueError(f"Invalid BibTeX: {v}")

    @field_validator("minimum_load_atoms_version", mode="before")
    def convert_minimum_version_to_str(cls, v):
        if v is None:
            return None
        return str(v)

    @classmethod
    def from_yaml_file(cls, path: Union[Path, str]) -> "DatabaseEntry":
        path = Path(path).resolve()
        with open(path) as f:
            data = yaml.safe_load(f)
        try:
            return cls(**data)
        except Exception as e:
            raise ValueError(
                f"Error loading dataset description from {path}. It may be "
                "that you have a stale version of this dataset's yaml file on "
                "disk. Please delete the file and try again:\n"
                f'    $ rm "{path}"'
            ) from e

    @classmethod
    def remote_url_for_yaml(cls, dataset_id: str) -> str:
        return BASE_REMOTE_URL + f"{dataset_id}/{dataset_id}.yaml"

    @classmethod
    def importer_file_stem(cls, dataset_id: str) -> str:
        return dataset_id.lower().replace("-", "_")

    @classmethod
    def remote_url_for_importer(cls, dataset_id: str) -> str:
        fname = DatabaseEntry.importer_file_stem(dataset_id)
        return BASE_REMOTE_URL + f"src/load_atoms/database/importers/{fname}.py"
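

if __name__ == "__main__":
    # Illustrative sketch only (not part of the original module): a rough idea
    # of how a DatabaseEntry might be constructed and queried. All field
    # values below are hypothetical and do not correspond to any entry in the
    # real load-atoms dataset registry.
    entry = DatabaseEntry(
        name="Example-Dataset",
        year=2024,
        description="A hypothetical dataset used purely for illustration.",
        category="Potential Fitting",
        license="CC BY 4.0",
        citation="@article{example2024, title={Example}}",
        per_atom_properties={
            "forces": PropertyDescription(desc="atomic forces", units="eV/Å"),
        },
        per_structure_properties={
            "energy": PropertyDescription(desc="total energy", units="eV"),
        },
    )
    print(entry.name, entry.format)  # format defaults to "memory"

    # The field validators reject unknown categories and licenses, and
    # malformed BibTeX strings:
    try:
        DatabaseEntry(
            name="Bad", year=2024, description="...", category="Not a category"
        )
    except ValueError as error:
        print(error)

    # The classmethod helpers build the remote URLs used when fetching a
    # dataset's yaml description and importer script:
    print(DatabaseEntry.remote_url_for_yaml("Example-Dataset"))
    print(DatabaseEntry.remote_url_for_importer("Example-Dataset"))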