from __future__ import annotations
from pathlib import Path
import ase
from ase.io import read
from load_atoms.utils import remove_calculator
from .atoms_dataset import AtomsDataset, InMemoryAtomsDataset
from .database import backend
from .visualisation import view
# Version string of this package.
__version__ = "0.3.9"
# Names exported by a star-import of this module: the loader defined below
# and ``view``, which is re-exported from ``.visualisation``.
__all__ = ["load_dataset", "view"]
def load_dataset(
    thing: str | list[ase.Atoms] | Path,
    root: str | Path | None = None,
) -> AtomsDataset:
    """
    Load a dataset by name or from a list of structures.

    Parameters
    ----------
    thing
        A dataset id, a list of structures, or a path to a file.
    root
        The root directory to use when loading a dataset by id. If not
        provided, the default root directory (:code:`~/.load-atoms`)
        will be used.

    Returns
    -------
    AtomsDataset
        An in-memory dataset when ``thing`` is a list of structures or a
        path to an existing file; otherwise whatever the database backend
        returns for the dataset id ``thing``.

    Raises
    ------
    TypeError
        If ``thing`` is neither a list of :class:`ase.Atoms`, a string,
        nor a :class:`~pathlib.Path`.
    ValueError
        If ``thing`` is a :class:`~pathlib.Path` that does not exist, or
        that exists but is not a regular file (e.g. a directory).

    Examples
    --------
    Load a dataset by id:

    >>> from load_atoms import load_dataset
    >>> dataset = load_dataset("QM9")
    ╭───────────────────────────────── QM9 ─────────────────────────────────╮
    │                                                                       │
    │ Downloading dsgdb9nsd.xyz.tar.bz2 ━━━━━━━━━━━━━━━━━━━━ 100% 00:09     │
    │ Extracting dsgdb9nsd.xyz.tar.bz2  ━━━━━━━━━━━━━━━━━━━━ 100% 00:18     │
    │ Processing files                  ━━━━━━━━━━━━━━━━━━━━ 100% 00:19     │
    │ Caching to disk                   ━━━━━━━━━━━━━━━━━━━━ 100% 00:02     │
    │                                                                       │
    │            The QM9 dataset is covered by the CC0 license.             │
    │        Please cite the QM9 dataset if you use it in your work.        │
    │          For more information about the QM9 dataset, visit:           │
    │                            load-atoms/QM9                             │
    ╰───────────────────────────────────────────────────────────────────────╯
    >>> dataset
    QM9:
        structures: 133,885
        atoms: 2,407,753
        species:
            H: 51.09%
            C: 35.16%
            O: 7.81%
            N: 5.80%
            F: 0.14%
        properties:
            per atom: (partial_charges)
            per structure: (
                A, B, C, Cv, G, H, U, U0, alpha,
                frequencies, gap, geometry, homo, inchi, index,
                lumo, mu, r2, smiles, zpve
            )

    Optionally save a dataset to an explicit root directory:

    >>> load_dataset("QM9", root="./my-datasets")

    Wrap a list of structures in a dataset:

    >>> load_dataset([Atoms("H2O"), Atoms("H2O2")])

    Load a dataset from a file:

    >>> load_dataset("path/to/file.xyz")

    .. note::

        As of ``ase==0.3.9``, the ``"energy"``, ``"forces"``, and ``"stress"``
        special keys are loaded into a
        :class:`~ase.calculators.singlepoint.SinglePointCalculator` object,
        and removed from the ``.info`` and ``.arrays`` dictionaries on the
        atoms object. We reverse this process when loading a dataset from file.
    """
    if isinstance(thing, list) and all(
        isinstance(s, ase.Atoms) for s in thing
    ):
        # thing is a list of structures: wrap it directly
        return InMemoryAtomsDataset(thing)

    if not isinstance(thing, (Path, str)):
        raise TypeError(
            f"Could not load dataset from {thing}. "
            "Please provide a string, a list of structures, "
            "or a path to a file."
        )

    path = Path(thing)
    # is_file() is already False for paths that do not exist, so a separate
    # exists() check is unnecessary here
    if path.is_file():
        # assume the file contains structures and load all of them
        structures = read(path, index=":")
        if isinstance(structures, ase.Atoms):
            structures = [structures]
        # undo the calculator wrapping described in the docstring note
        for s in structures:
            remove_calculator(s)
        return InMemoryAtomsDataset(structures)

    if isinstance(thing, Path):
        # An explicit Path is never treated as a dataset id. Report
        # accurately whether the path is missing or merely not a file.
        if path.exists():
            raise ValueError(f"The provided path is not a file. ({thing})")
        raise ValueError(f"The provided path does not exist. ({thing})")

    # thing is a string that is not an existing file:
    # assume it is a dataset id, and try to load it
    if root is None:
        root = Path.home() / ".load-atoms"
    return backend.load_dataset_by_id(thing, Path(root))