From db88ecac3acbde8837ee2ba98aea2113c32630dd Mon Sep 17 00:00:00 2001
From: "Kevin J. Sung"
Date: Thu, 27 Jun 2024 18:14:50 -0400
Subject: [PATCH] Support JSON serialization for MolecularData (#258)

* add JSON serialization for MolecularData
* add orjson as dependency
* fix frozen bug in mp2
* support compression
* fix frozen bug in ccsd
* fix open shell for mp2 and ccsd
* fix open-shell fci
* use data.get
* support open shell
* fix test assert function
* mypy
* reorder functions
* mypy
* doc
---
Review notes. Everything from the "---" line above to the first
"diff --git" line is commentary; it is neither commit message nor diff.

* Layout: the patch had been flattened onto a few physical lines. It is
  restored to one header field / message bullet / diffstat row / hunk line
  per line. The added and context lines themselves are unchanged, and each
  hunk body matches its declared counts (1 + 6 + 93 + 3 + 27 + 47 = 177).
* to_json / from_json select the opener with `open_func[compression]`, so a
  value other than None, "gzip", "bz2" or "lzma" surfaces as a KeyError.
* from_json decodes mp2_t2, ccsd_t1 and ccsd_t2 as a single array when
  n_alpha == n_beta and as a tuple of arrays otherwise; nothing in the JSON
  payload itself records which form was stored.
* _assert_mol_data_equal makes no assertion for an optional array field
  when the loaded ("actual") value is None, so an array that was stored but
  came back as None would not fail the test. The tuple branch iterates with
  zip(), which stops at the shorter tuple.
* Both new tests write every compression variant to the same
  tmp_path / "test.json" path, overwriting the previous iteration's file.
* NOTE(review): the From: header carries no e-mail address as received, and
  whether the message bullets were separated by blank lines cannot be
  recovered from the flattened text - confirm against the original commit.

 pyproject.toml                      |  1 +
 python/ffsim/molecular_data.py      | 99 +++++++++++++++++++++++++++++
 tests/python/molecular_data_test.py | 77 ++++++++++++++++++++++
 3 files changed, 177 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index e70fa244c..4b4767d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
 dependencies = [
     "numpy",
     "opt_einsum",
+    "orjson",
     "pyscf >= 2.4",
     "qiskit >= 1.1",
     "scipy",
diff --git a/python/ffsim/molecular_data.py b/python/ffsim/molecular_data.py
index 883976c6f..d5365e004 100644
--- a/python/ffsim/molecular_data.py
+++ b/python/ffsim/molecular_data.py
@@ -12,10 +12,16 @@
 
 from __future__ import annotations
 
+import bz2
 import dataclasses
+import gzip
+import lzma
+import os
 from collections.abc import Iterable
+from typing import Callable
 
 import numpy as np
+import orjson
 import pyscf
 import pyscf.cc
 import pyscf.mcscf
@@ -281,3 +287,96 @@ def run_ccsd(
             self.ccsd_t1 = ccsd_t1
         if store_t2:
             self.ccsd_t2 = ccsd_t2
+
+    def to_json(
+        self, file: str | bytes | os.PathLike, compression: str | None = None
+    ) -> None:
+        """Serialize to JSON format, optionally compressed, and save to disk.
+
+        Args:
+            file: The file path to save to.
+            compression: The optional compression algorithm to use.
+                Options: ``"gzip"``, ``"bz2"``, ``"lzma"``.
+        """
+
+        def default(obj):
+            if isinstance(obj, np.ndarray):
+                return np.ascontiguousarray(obj)
+            raise TypeError
+
+        open_func: dict[str | None, Callable] = {
+            None: open,
+            "gzip": gzip.open,
+            "bz2": bz2.open,
+            "lzma": lzma.open,
+        }
+        with open_func[compression](file, "wb") as f:
+            f.write(
+                orjson.dumps(self, option=orjson.OPT_SERIALIZE_NUMPY, default=default)
+            )
+
+    @staticmethod
+    def from_json(
+        file: str | bytes | os.PathLike, compression: str | None = None
+    ) -> MolecularData:
+        """Load a MolecularData from a (possibly compressed) JSON file.
+
+        Args:
+            file: The file path to read from.
+            compression: The compression algorithm, if any, which was used to compress
+                the file.
+                Options: ``"gzip"``, ``"bz2"``, ``"lzma"``.
+
+        Returns: The MolecularData object.
+        """
+        open_func: dict[str | None, Callable] = {
+            None: open,
+            "gzip": gzip.open,
+            "bz2": bz2.open,
+            "lzma": lzma.open,
+        }
+        with open_func[compression](file, "rb") as f:
+            data = orjson.loads(f.read())
+
+        def as_array_or_none(val):
+            if val is None:
+                return None
+            return np.asarray(val)
+
+        def as_array_tuple_or_none(val):
+            if val is None:
+                return None
+            return tuple(np.asarray(arr) for arr in val)
+
+        nelec = tuple(data["nelec"])
+        n_alpha, n_beta = nelec
+        arrays_func = as_array_or_none if n_alpha == n_beta else as_array_tuple_or_none
+
+        return MolecularData(
+            atom=[
+                (element, tuple(coordinates)) for element, coordinates in data["atom"]
+            ],
+            basis=data["basis"],
+            spin=data["spin"],
+            symmetry=data["symmetry"],
+            norb=data["norb"],
+            nelec=nelec,
+            mo_coeff=np.asarray(data["mo_coeff"]),
+            mo_occ=np.asarray(data["mo_occ"]),
+            active_space=data["active_space"],
+            core_energy=data["core_energy"],
+            one_body_integrals=np.asarray(data["one_body_integrals"]),
+            two_body_integrals=np.asarray(data["two_body_integrals"]),
+            hf_energy=data.get("hf_energy"),
+            hf_mo_coeff=as_array_or_none(data.get("hf_mo_coeff")),
+            hf_mo_occ=as_array_or_none(data.get("hf_mo_occ")),
+            mp2_energy=data.get("mp2_energy"),
+            mp2_t2=arrays_func(data.get("mp2_t2")),
+            ccsd_energy=data.get("ccsd_energy"),
+            ccsd_t1=arrays_func(data.get("ccsd_t1")),
+            ccsd_t2=arrays_func(data.get("ccsd_t2")),
+            fci_energy=data.get("fci_energy"),
+            fci_vec=as_array_or_none(data.get("fci_vec")),
+            dipole_integrals=as_array_or_none(data.get("dipole_integrals")),
+            orbital_symmetries=data.get("orbital_symmetries"),
+        )
diff --git a/tests/python/molecular_data_test.py b/tests/python/molecular_data_test.py
index f8c3a83d4..8c3cccb01 100644
--- a/tests/python/molecular_data_test.py
+++ b/tests/python/molecular_data_test.py
@@ -8,6 +8,9 @@
 # copyright notice, and modified files need to carry a notice indicating
 # that they have been altered from the originals.
 
+import dataclasses
+import pathlib
+
 import numpy as np
 import pyscf
 import pyscf.data.elements
@@ -15,6 +18,33 @@
 import ffsim
 
 
+def _assert_mol_data_equal(
+    actual_mol_data: ffsim.MolecularData, expected_mol_data: ffsim.MolecularData
+):
+    for field in dataclasses.fields(actual_mol_data):
+        actual = getattr(actual_mol_data, field.name)
+        expected = getattr(expected_mol_data, field.name)
+        if field.type == "np.ndarray":
+            assert isinstance(actual, np.ndarray)
+            np.testing.assert_array_equal(actual, expected)
+        elif field.type in [
+            "np.ndarray | None",
+            "np.ndarray | tuple[np.ndarray, np.ndarray] | None",
+            "np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None",
+        ]:
+            if actual is not None:
+                if isinstance(actual, tuple):
+                    for actual_val, expected_val in zip(actual, expected):
+                        assert isinstance(actual_val, np.ndarray)
+                        assert isinstance(expected_val, np.ndarray)
+                        np.testing.assert_array_equal(actual_val, expected_val)
+                else:
+                    assert isinstance(actual, np.ndarray)
+                    np.testing.assert_array_equal(actual, expected)
+        else:
+            assert actual == expected
+
+
 def test_molecular_data_sym():
     # Build N2 molecule
     mol = pyscf.gto.Mole()
@@ -88,3 +118,50 @@ def test_molecular_data_run_methods():
     np.testing.assert_allclose(mol_data.mp2_energy, -108.58852784026)
     np.testing.assert_allclose(mol_data.fci_energy, -108.595987350986)
     np.testing.assert_allclose(mol_data.ccsd_energy, -108.5933309085008)
+
+
+def test_json_closed_shell(tmp_path: pathlib.Path):
+    """Test saving to and loading from JSON for a closed-shell molecule."""
+    mol = pyscf.gto.Mole()
+    mol.build(
+        atom=[("N", (0, 0, 0)), ("N", (1.0, 0, 0))],
+        basis="sto-6g",
+        symmetry="Dooh",
+    )
+    n_frozen = pyscf.data.elements.chemcore(mol)
+    active_space = range(n_frozen, mol.nao_nr())
+    scf = pyscf.scf.RHF(mol).run()
+    mol_data = ffsim.MolecularData.from_scf(scf, active_space=active_space)
+    mol_data.run_mp2(store_t2=True)
+    mol_data.run_ccsd(store_t1=True, store_t2=True)
+    mol_data.run_fci(store_fci_vec=True)
+
+    for compression in [None, "gzip", "bz2", "lzma"]:
+        mol_data.to_json(tmp_path / "test.json", compression=compression)
+        loaded_mol_data = ffsim.MolecularData.from_json(
+            tmp_path / "test.json", compression=compression
+        )
+        _assert_mol_data_equal(loaded_mol_data, mol_data)
+
+
+def test_json_open_shell(tmp_path: pathlib.Path):
+    """Test saving to and loading from JSON for an open-shell molecule."""
+    mol = pyscf.gto.Mole()
+    mol.build(
+        atom=[("H", (0, 0, 0)), ("O", (0, 0, 1.1))],
+        basis="6-31g",
+        spin=1,
+        symmetry="Coov",
+    )
+    scf = pyscf.scf.ROHF(mol).run()
+    mol_data = ffsim.MolecularData.from_scf(scf)
+    mol_data.run_mp2(store_t2=True)
+    mol_data.run_ccsd(store_t1=True, store_t2=True)
+    mol_data.run_fci(store_fci_vec=True)
+
+    for compression in [None, "gzip", "bz2", "lzma"]:
+        mol_data.to_json(tmp_path / "test.json", compression=compression)
+        loaded_mol_data = ffsim.MolecularData.from_json(
+            tmp_path / "test.json", compression=compression
+        )
+        _assert_mol_data_equal(loaded_mol_data, mol_data)