"""
The :code:`backend` module is responsible for down/loading datasets by name,
storing them locally, and serving them to :code:`load-atoms` via the
:func:`~load_atoms.load_dataset` function.
"""
from __future__ import annotations
import shutil
import tempfile
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Iterator
import ase.io
from ase import Atoms
from typing_extensions import override
from load_atoms.atoms_dataset import (
AtomsDataset,
get_file_extension_and_dataset_class,
)
from load_atoms.database.database_entry import (
LICENSE_URLS,
DatabaseEntry,
)
from load_atoms.database.internet import FileDownload, download, download_all
from load_atoms.progress import Progress, get_progress_for_dataset
from load_atoms.utils import (
UnknownDatasetException,
debug_mode,
frontend_url,
remove_calculator,
testing,
)
BASE_GITHUB_URL = "https://github.com/jla-gardner/load-atoms/raw/main/database"
[docs]def load_dataset_by_id(dataset_id: str, root: Path) -> AtomsDataset:
"""
Load the :class:`AtomsDataset` and corresponding :class:`DatabaseEntry` for
the given dataset id, saving the dataset to the given ``root`` directory.
Parameters
----------
name
The id of the dataset to load.
root
The root folder to save the structures to.
"""
# prepare local paths
yaml_file_path = root / "database-entries" / f"{dataset_id}.yaml"
yaml_file_path.parent.mkdir(parents=True, exist_ok=True)
with get_progress_for_dataset(dataset_id) as progress:
# down/load the dabase entry for the dataset
database_entry = get_database_entry(
dataset_id, yaml_file_path, progress
)
# get the file extension and dataset class for the dataset
extension, dataset_class = get_file_extension_and_dataset_class(
database_entry.format
)
data_file_path = root / f"{dataset_id}.{extension}"
# if the dataset already exists, load it from disk
if data_file_path.exists():
with progress.new_task("Reading from disk"):
dataset = dataset_class.load(data_file_path)
# otherwise, use the importer to get the structures
else:
# 1. get the Importer class from a suitably down/loaded file
importer_type: type[BaseImporter] = get_importer_type(
dataset_id, progress
)
# 2. download the files to an appropriate directory
download_dir_name = importer_type.permanent_download_dirname()
use_tmp_dir = (
download_dir_name is None and not debug_mode() and not testing()
)
if use_tmp_dir:
download_dir = Path(tempfile.mkdtemp())
else:
download_dir = (
root / "raw-downloads" / (download_dir_name or dataset_id)
)
download_all(
importer_type.files_to_download(), download_dir, progress
)
# 3. use the importer to get the structures (removing annoying calc)
def iterator():
for structure in importer_type.get_structures(
download_dir, progress
):
remove_calculator(structure)
yield structure
try:
dataset_class.save(data_file_path, iterator(), database_entry)
except Exception as e:
# remove the partially created dataset
if data_file_path.exists():
if data_file_path.is_dir():
shutil.rmtree(data_file_path)
else:
data_file_path.unlink()
raise ValueError(
"Failed to import dataset: please report an issue at "
"https://github.com/jla-gardner/load-atoms/issues if you "
"think this is a bug."
) from e
dataset = dataset_class.load(data_file_path)
# 4. clean up the temporary directory if necessary
if use_tmp_dir:
shutil.rmtree(download_dir)
# add the usage information to the progress bar
log_usage_information(database_entry, progress)
progress.refresh()
return dataset
class BaseImporter(ABC):
"""
Base class to inherit from to create new, dataset-specific importers.
Parameters
----------
files_to_download
A list of :class:`FileDownload` s
tmp_dirname
The name of the temporary directory to download the files to.
cleanup
Whether to clean up the temporary directory after processing.
"""
@classmethod
def files_to_download(cls) -> list[FileDownload]:
return []
@classmethod
@abstractmethod
def get_structures(
cls,
tmp_dir: Path,
progress: Progress,
) -> Iterator[Atoms]:
"""
Iterate over :class:`ase.Atoms` objects. All files passed
to the base class will have already been downloaded
and verified when this is called.
Parameters
----------
tmp_dir
The temporary directory where downloaded files are stored.
Yields
------
Atoms
An iterator of ASE Atoms objects processed from the downloaded files
"""
@classmethod
def permanent_download_dirname(cls) -> str | None:
"""
Get a path to the directory where the files should be saved.
If ``None`` (the default), is returned, the files will be downloaded
to a temporary directory, and removed after the dataset is imported.
"""
return None
class SingleFileImporter(BaseImporter):
@classmethod
@abstractmethod
def file_to_download(cls) -> FileDownload:
...
@classmethod
def files_to_download(cls) -> list[FileDownload]:
return [cls.file_to_download()]
@override
@classmethod
def get_structures(
cls, tmp_dir: Path, progress: Progress
) -> Iterator[Atoms]:
file_path = tmp_dir / Path(cls.files_to_download()[0].local_name)
with progress.new_task(f"Reading {file_path.resolve()}"):
for atoms in cls._read_file(file_path):
yield cls.process_atoms(atoms)
@classmethod
def process_atoms(cls, atoms: Atoms) -> Atoms:
return atoms
@classmethod
def _read_file(cls, file_path: Path) -> Iterator[Atoms]:
yield from ase.io.iread(file_path, index=":")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
def get_database_entry(
dataset_id: str,
yaml_file_path: Path,
progress: Progress,
) -> DatabaseEntry:
from load_atoms import __version__ as load_atoms_version
if not yaml_file_path.exists():
try:
download(
DatabaseEntry.remote_url_for_yaml(dataset_id),
yaml_file_path,
progress,
)
except Exception as e:
raise UnknownDatasetException(dataset_id) from e
db_entry = DatabaseEntry.from_yaml_file(yaml_file_path)
if (
db_entry.minimum_load_atoms_version is not None
and db_entry.minimum_load_atoms_version > load_atoms_version
):
raise Exception(
f"Dataset {dataset_id} requires load-atoms version "
f">={db_entry.minimum_load_atoms_version} "
f"(current version: {load_atoms_version}). "
"Please upgrade load-atoms to load this dataset "
"(e.g. `pip install --upgrade load-atoms`)."
)
return db_entry
def get_importer_type(
dataset_id: str,
progress: Progress,
) -> type[BaseImporter]:
importer_name = DatabaseEntry.importer_file_stem(dataset_id)
expected_importer_path = (
Path(__file__).parent / "importers" / f"{importer_name}.py"
)
if not expected_importer_path.exists():
try:
download(
DatabaseEntry.remote_url_for_importer(dataset_id),
expected_importer_path,
progress,
)
except Exception as e:
# couldn't download the importer:
raise UnknownDatasetException(dataset_id) from e
try:
return __import__(
f"load_atoms.database.importers.{importer_name}",
fromlist=["Importer"],
).Importer
except Exception as e:
raise Exception(
f"Unable to load dataset {dataset_id} due to a problem loading "
"the dataset's importer file. Please try updating load-atoms:\n"
" pip install --upgrade load-atoms\n"
"If the problem persists, please report an issue at:\n"
" https://github.com/jla-gardner/load-atoms/issues"
) from e
def log_usage_information(info: DatabaseEntry, progress: Progress):
progress.add_text("\n")
name = progress.bold(info.name)
if info.license is not None:
license = progress.link(info.license, LICENSE_URLS[info.license])
progress.add_text(
f"The {name} dataset is covered by the {license} license."
)
if info.citation is not None:
progress.add_text(
f"Please cite the {name} dataset if you use it in your work."
)
progress.add_text(f"For more information about the {name} dataset, visit:")
url = frontend_url(info)
progress.add_text(progress.link(f"load-atoms/{info.name}", url))
def unzip_file(file_path: Path, progress: Progress) -> Path:
"""Unzip a file and return the path to the extracted directory.
Parameters
----------
file_path
The path to the file to unzip.
progress
A :class:`Progress` object to track the unzip progress.
"""
extract_to = file_path.parent / f"{file_path.name}-extracted"
if not extract_to.exists():
with progress.new_task(
f"Unzipping {file_path.resolve()}",
):
shutil.unpack_archive(file_path, extract_dir=extract_to)
return extract_to
def rename(atoms: Atoms, mapping: dict[str, str]) -> Atoms:
"""Rename the properties of an Atoms object."""
for old_name, new_name in mapping.items():
if old_name in atoms.arrays:
atoms.arrays[new_name] = atoms.arrays.pop(old_name)
elif old_name in atoms.info:
atoms.info[new_name] = atoms.info.pop(old_name)
return atoms