def read_mol(mol_path, smiles, subset, targets):
    """Load one geometry from a MOL file and pack it as a sample dict.

    Args:
        mol_path: Path of the ``.mol`` file to read.
        smiles: Identifier stored under ``name``.
        subset: Label stored under ``subset``.
        targets: Sequence of energies for this geometry, one per target.

    Returns:
        A dict with ``name``, ``subset``, ``energies`` (shape
        ``(1, n_targets)``), ``atomic_inputs`` (the output of
        ``get_atomic_number_and_charge`` concatenated with the conformer
        positions) and ``n_atoms``; or ``None`` when reading or converting
        the file fails, in which case the reason is printed.
    """
    try:
        with open(mol_path, "r") as handle:
            molecule = dm.read_molblock(handle.read(), remove_hs=False, fail_if_invalid=True)

        species = get_atomic_number_and_charge(molecule)
        coordinates = molecule.GetConformer().GetPositions()

        return {
            "name": np.array([smiles]),
            "subset": np.array([subset]),
            "energies": np.array(targets).astype(np.float32)[None, :],
            "atomic_inputs": np.concatenate((species, coordinates), axis=-1, dtype=np.float32),
            "n_atoms": np.array([species.shape[0]], dtype=np.int32),
        }
    except Exception as e:
        # Best effort: one unreadable geometry must not abort the whole import.
        print(f"Skipping: {mol_path} due to {e}")
        return None


class DESS(BaseDataset):
    """Dataset reader for the ``DES370K`` and ``DES5M`` partitions."""

    __name__ = "dess"
    __energy_methods__ = [
        "mp2_cc",
        "mp2_qz",
        "mp2_tz",
        "mp2_cbs",
        "ccsd(t)_cc",
        "ccsd(t)_cbs",
        "ccsd(t)_nn",
        "sapt",
    ]

    # CSV columns read as energies, in the same order as __energy_methods__.
    energy_target_names = [
        "cc_MP2_all",
        "qz_MP2_all",
        "tz_MP2_all",
        "cbs_MP2_all",
        "cc_CCSD(T)_all",
        "cbs_CCSD(T)_all",
        "nn_CCSD(T)_all",
        "sapt_all",
    ]
    # ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']

    # Energy in hartree, all zeros by default
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    partitions = ["DES370K", "DES5M"]

    def __init__(self) -> None:
        super().__init__()

    def _read_raw_(self, part):
        """Read every geometry listed in ``<root>/<part>.csv``.

        Returns:
            Whatever ``dm.parallelized_with_batches`` returns for the batch
            reader; entries are ``read_mol`` results and may be ``None``.
        """
        table = pd.read_csv(p_join(self.root, f"{part}.csv"))

        # A partition may lack some target columns; pad them with NaN so every
        # sample exposes the same energy layout.
        for column in self.energy_target_names:
            if column not in table.columns:
                table[column] = np.nan

        smiles = (table["smiles0"] + "." + table["smiles1"]).tolist()
        subsets = (f"{part}_" + table["group_orig"]).tolist()
        targets = table[self.energy_target_names].values
        geometry_dir = p_join(self.root, "geometries/")
        paths = geometry_dir + table["system_id"].astype(str) + f"/{part}_" + table["geom_id"].astype(str) + ".mol"

        inputs = []
        for idx in tqdm(range(len(smiles))):
            inputs.append(dict(smiles=smiles[idx], subset=subsets[idx], targets=targets[idx], mol_path=paths[idx]))

        def read_batch(batch):
            return [read_mol(**kwargs) for kwargs in batch]

        return dm.parallelized_with_batches(
            read_batch, inputs, n_jobs=-1, progress=True, batch_size=1024, scheduler="threads"
        )

    def read_raw_entries(self):
        """Return the samples of all partitions as one flat list."""
        merged = []
        for partition in self.partitions:
            merged = merged + self._read_raw_(partition)
        return merged
def flatten_dict(d, sep: str = "."):
    """Flatten a nested dict into a single level, joining keys with ``sep``.

    List values are left untouched; only nested dicts are expanded.
    """
    return pd.json_normalize(d, sep=sep).to_dict(orient="records")[0]


def read_content(f):
    """Parse one JSON record into a sample dict.

    Args:
        f: Readable file-like object containing a JSON document.

    Returns:
        A dict with ``name``, ``subset``, ``energies`` (shape ``(1, 1)``),
        ``atomic_inputs`` (``float32``, one row per atom: element number,
        core electrons, x, y, z) and ``n_atoms``; or ``None`` when the record
        cannot be read or lacks one of the expected keys.
    """
    try:
        r = flatten_dict(json.load(f))
        # JSON arrays arrive as plain Python lists, which support neither
        # ``[:, None]`` nor ``.reshape``; convert them to arrays first.
        numbers = np.asarray(r["atoms.elements.number"])
        core_electrons = np.asarray(r["atoms.core electrons"])
        coords = np.asarray(r["atoms.coords.3d"])
        x = np.concatenate(
            (
                numbers[:, None],
                core_electrons[:, None],
                coords.reshape(-1, 3),
            ),
            axis=-1,
        ).astype(np.float32)

        res = dict(
            name=np.array([r["smiles"]]),
            subset=np.array([r["formula"]]),
            # Look the energy up in the record; the key string itself is not
            # a number and cannot be cast to float32.
            energies=np.array([r["properties.energy.total"]]).astype(np.float32)[None, :],
            atomic_inputs=x,
            n_atoms=np.array([x.shape[0]], dtype=np.int32),
        )
    except Exception:
        # Deliberate best effort: archives also contain non-record members
        # (``extractfile`` yields ``None`` for those), so an unusable entry
        # is reported as ``None`` instead of aborting the whole archive.
        res = None

    return res


def read_archive(path):
    """Apply ``read_content`` to every member of the tar archive at ``path``.

    Returns:
        A list with one entry per archive member, in archive order; entries
        are ``None`` for members that could not be parsed.
    """
    with tarfile.open(path) as tar:
        res = [read_content(tar.extractfile(member)) for member in tar.getmembers()]
    return res
def read_mol(mol_h5, mol_name, energy_target_names, force_target_names):
    """Collect all conformers of one molecule into a single sample dict.

    Args:
        mol_h5: Mapping from conformer id to a mapping that provides
            ``"atNUM"``, ``"atXYZ"`` and every requested energy/force tag.
        mol_name: Name stored once per conformer under ``name``.
        energy_target_names: Tags whose first element is read as an energy.
        force_target_names: Tags read as per-atom force arrays.

    Returns:
        A dict with ``name`` and ``subset`` (one entry per conformer),
        ``energies`` (``(n_conformers, n_energy_tags)``), ``atomic_inputs``
        (one row per atom: atomic number, a zero column, then the ``atXYZ``
        columns), ``forces`` (per-atom arrays stacked on a trailing
        per-tag axis) and ``n_atoms`` (atoms per conformer).

    Raises:
        ValueError: If ``mol_h5`` holds no conformers (nothing to concatenate).
    """
    # Resolve each conformer group once instead of once per field; with an
    # HDF5-backed mapping every lookup builds a new group object.
    conformers = [mol_h5[cid] for cid in mol_h5.keys()]

    numbers = [np.asarray(conf["atNUM"]) for conf in conformers]
    n_atoms = np.array([len(z) for z in numbers], dtype=np.int32)
    n_confs = len(conformers)

    zs = np.concatenate(numbers, axis=0)
    xyz = np.concatenate([conf["atXYZ"] for conf in conformers], axis=0)
    a_inputs = np.concatenate([np.stack([zs, np.zeros_like(zs)], axis=-1), xyz], axis=-1)

    forces = np.concatenate(
        [np.stack([conf[tag] for tag in force_target_names], axis=-1) for conf in conformers],
        axis=0,
    )
    energies = np.stack(
        [np.array([conf[tag][0] for tag in energy_target_names]) for conf in conformers],
        axis=0,
    )

    return dict(
        name=np.array([mol_name] * n_confs),
        subset=np.array(["qm7x"] * n_confs),
        energies=energies.astype(np.float32),
        atomic_inputs=a_inputs.astype(np.float32),
        forces=forces.astype(np.float32),
        n_atoms=n_atoms,
    )
class SN2RXN(BaseDataset):
    """Dataset reader for the ``sn2_rxn.h5`` archive."""

    __name__ = "sn2_rxn"

    # Energy in hartree, all zeros by default
    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)

    # Method labels and the archive field names they are read from.
    __energy_methods__ = ["dsd-blyp-d3(bj)_tz"]
    energy_target_names = ["DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy"]

    __force_methods__ = ["dsd-blyp-d3(bj)_tz"]
    force_target_names = ["DSD-BLYP-D3(BJ):def2-TZVP Gradient"]

    def __init__(self) -> None:
        super().__init__()

    def read_raw_entries(self):
        """Return the samples read from ``<root>/sn2_rxn.h5``."""
        archive_path = p_join(self.root, "sn2_rxn.h5")
        return read_qc_archive_h5(
            archive_path,
            "sn2_rxn",
            self.energy_target_names,
            self.force_target_names,
        )
def content_to_xyz(content, e_map):
    """Convert one XYZ record (count line, comment line, atom lines) to a sample.

    The comment line is split on ``" | "``; the last space-separated token of
    its first field is used as the lookup code for ``e_map`` and the last
    token of its fourth field as the sample name.

    Args:
        content: Text of a single XYZ record.
        e_map: Mapping from record code to its energy.

    Returns:
        A dict with ``atomic_inputs``, ``name``, ``energies`` (shape
        ``(1, 1)``), ``n_atoms`` and ``subset``; or ``None`` (after printing
        the record) when the comment line does not have the expected fields.

    Raises:
        KeyError: If the record code is not present in ``e_map``.
    """
    try:
        header = content.split("\n")[1].split(" | ")
        code = header[0].split(" ")[-1]
        name = header[3].split(" ")[-1]
    except IndexError:
        # Missing comment line or fewer than four header fields.
        print(content)
        return None

    table = np.loadtxt(StringIO(content), skiprows=2, dtype="str")
    symbols, positions = table[:, 0], table[:, 1:].astype(np.float32)
    z = np.array([atom_table.GetAtomicNumber(symbol) for symbol in symbols])
    xs = np.stack((z, np.zeros_like(z)), axis=-1)
    e = e_map[code]

    conf = dict(
        atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
        name=np.array([name]),
        energies=np.array([e], dtype=np.float32)[:, None],
        n_atoms=np.array([positions.shape[0]], dtype=np.int32),
        subset=np.array(["tmqm"]),
    )

    return conf


def read_xyz(fname, e_map):
    """Read a multi-record XYZ file whose records are separated by blank lines.

    Returns:
        One ``content_to_xyz`` result per non-blank record, in file order.
    """
    with open(fname, "r") as f:
        # Splitting on blank lines leaves an empty chunk after the last
        # record; drop blank chunks so they do not become ``None`` samples.
        contents = [chunk for chunk in f.read().split("\n\n") if chunk.strip()]

    return [content_to_xyz(content, e_map) for content in tqdm(contents)]
"__main__": + for data_class in [TMQM]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index e69de29..eff78b1 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -0,0 +1,85 @@ +from io import StringIO +from os.path import join as p_join + +import numpy as np +from tqdm import tqdm + +from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.molecule import atom_table + + +def content_to_xyz(content, n_waters): + content = content.strip() + + try: + tmp = content.splitlines() + s = StringIO(content) + d = np.loadtxt(s, skiprows=2, dtype="str") + z, positions = d[:, 0], d[:, 1:].astype(np.float32) + z = np.array([atom_table.GetAtomicNumber(s) for s in z]) + xs = np.stack((z, np.zeros_like(z)), axis=-1) + e = float(tmp[1].strip().split(" ")[-1]) + except Exception: + print("Error in reading xyz file") + print(n_waters, content) + return None + + conf = dict( + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + name=np.array([f"water_{n_waters}"]), + energies=np.array([e], dtype=np.float32)[:, None], + n_atoms=np.array([positions.shape[0]], dtype=np.int32), + subset=np.array([f"water_{n_waters}"]), + ) + + return conf + + +def read_xyz(fname, n_waters): + s = 3 * n_waters + 2 + with open(fname, "r") as f: + lines = f.readlines() + contents = ["".join(lines[i : i + s]) for i in range(0, len(lines), s)] + + res = [content_to_xyz(content, n_waters) for content in tqdm(contents)] + return res + + +class WaterClusters(BaseDataset): + __name__ = "waterclusters3_30" + + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + 
+ __energy_methods__ = ["ttm2.1-f"] + + energy_target_names = ["TTM2.1-F Potential"] + + def __init__(self) -> None: + super().__init__() + + def read_raw_entries(self): + samples = [] + for i in range(3, 31): + raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz") + data = read_xyz( + raw_path, + i, + ) + samples += data + + return samples + + +if __name__ == "__main__": + for data_class in [WaterClusters]: + data = data_class() + n = len(data) + + for i in np.random.choice(n, 3, replace=False): + x = data[i] + print(x.name, x.subset, end=" ") + for k in x: + if x[k] is not None: + print(k, x[k].shape, end=" ") diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 86d1d33..87e7620 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -65,8 +65,8 @@ class DataConfigFactory: ) qm7x = dict( - dataset_name="qm7x", - links={f"{i}000.xz": "https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, + dataset_name="qm7x", # https://zenodo.org/record/4288677/files/1000.xz?download=1 + links={f"{i}000.xz": f"https://zenodo.org/record/4288677/files/{i}000.xz" for i in range(1, 9)}, ) qmugs = dict( @@ -82,6 +82,22 @@ class DataConfigFactory: links={"SPICE-1.1.4.hdf5": "https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5"}, ) + dess = dict( + dataset_name="dess5m", + links={ + "DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip", + "DESS370.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", + }, + ) + + tmqm = dict( + dataset_name="tmqm", + links={ + x: f"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}" + for x in ["tmQM_X1.xyz.gz", "tmQM_X2.xyz.gz", "tmQM_y.csv", "Benchmark2_TPSSh_Opt.xyz"] + }, + ) + misato = dict( dataset_name="misato", links={ diff --git a/src/openqdc/raws/fetch.py b/src/openqdc/raws/fetch.py index b7fc4e4..5aefa10 100644 --- a/src/openqdc/raws/fetch.py +++ b/src/openqdc/raws/fetch.py @@ -16,9 
def download_url(url, local_filename):
    """Download ``url`` into ``local_filename``.

    Google Drive links go through ``gdown``; ``raw.github`` links are fetched
    in one request and written at once; everything else is streamed to disk
    in 16 KiB chunks.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before the target file is opened, so an error page is
            never left behind to be mistaken for a finished download.
    """
    logger.info(f"Url: {url} File: {local_filename}")
    if "drive.google.com" in url:
        gdown.download(url, local_filename, quiet=False)
    elif "raw.github" in url:
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            f.write(r.content)
    else:
        r = requests.get(url, stream=True)
        r.raise_for_status()
        with fsspec.open(local_filename, "wb") as f:
            for chunk in tqdm.tqdm(r.iter_content(chunk_size=16384)):
                if chunk:
                    f.write(chunk)


def decompress_tar_gz(local_filename):
    """Extract a tar archive next to itself unless all members already exist."""
    parent = os.path.dirname(local_filename)
    with tarfile.open(local_filename) as tar:
        logger.info(f"Verifying archive extraction states: {local_filename}")
        all_extracted = all(os.path.exists(os.path.join(parent, name)) for name in tar.getnames())
        if not all_extracted:
            logger.info(f"Extracting archive: {local_filename}")
            # NOTE(review): extractall trusts member paths of a downloaded
            # archive; consider an extraction filter where the supported
            # Python versions provide one.
            tar.extractall(path=parent)
        else:
            logger.info(f"Archive already extracted: {local_filename}")


def decompress_zip(local_filename):
    """Extract a zip archive next to itself unless all members already exist."""
    parent = os.path.dirname(local_filename)

    logger.info(f"Verifying archive extraction states: {local_filename}")
    with zipfile.ZipFile(local_filename, "r") as zip_ref:
        all_extracted = all(os.path.exists(os.path.join(parent, name)) for name in zip_ref.namelist())
        if not all_extracted:
            logger.info(f"Extracting archive: {local_filename}")
            zip_ref.extractall(parent)
        else:
            logger.info(f"Archive already extracted: {local_filename}")


def decompress_gz(local_filename):
    """Gunzip ``local_filename`` next to itself unless the output exists.

    The output name is the input without its ``.gz`` suffix; a resulting
    ``hdf5`` ending is shortened to ``h5`` (``x.hdf5.gz`` becomes ``x.h5``).
    """
    logger.info(f"Verifying archive extraction states: {local_filename}")
    # Only touch the suffix: str.replace would also rewrite ".gz"/"hdf5"
    # occurring elsewhere in the path.
    out_filename = local_filename
    if out_filename.endswith(".gz"):
        out_filename = out_filename[: -len(".gz")]
    if out_filename.endswith("hdf5"):
        out_filename = out_filename[: -len("hdf5")] + "h5"

    if not os.path.exists(out_filename):
        logger.info(f"Extracting archive: {local_filename}")
        with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        logger.info(f"Archive already extracted: {local_filename}")
extraction states: {local_filename}") - out_filename = local_filename.replace("hdf5.gz", "h5") - all_extracted = os.path.exists(out_filename) - if not all_extracted: - logger.info(f"Extracting archive: {local_filename}") - with gzip.open(local_filename, "rb") as f_in, open(out_filename, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - else: - logger.info(f"Archive already extracted: {local_filename}") + decompress_zip(local_filename) + + elif local_filename.endswith(".gz"): + decompress_gz(local_filename) elif local_filename.endswith("xz"): logger.info(f"Extracting archive: {local_filename}") @@ -132,8 +159,9 @@ def from_name(self, name): if __name__ == "__main__": - dataset_names = DataConfigFactory.available_datasets - dataset_names = ["ani"] - for dataset_name in dataset_names: - dd = DataDownloader() - dd.from_name(dataset_name) + download_b3lyp_pm6() + # dataset_names = DataConfigFactory.available_datasets + # dataset_names = ["tmqm"] + # for dataset_name in dataset_names: + # dd = DataDownloader() + # dd.from_name(dataset_name) diff --git a/src/openqdc/raws/pubchemqc.py b/src/openqdc/raws/pubchemqc.py new file mode 100644 index 0000000..d756bd5 --- /dev/null +++ b/src/openqdc/raws/pubchemqc.py @@ -0,0 +1,118 @@ +import os + +import click +from tqdm import tqdm + +from openqdc.utils.io import get_local_cache + + +def download_b3lyp_pm6_item(i, method="b3lyp"): + try: + step_size = 25000 + start = str(i * step_size + 1).rjust(9, "0") + stop = str((i + 1) * step_size).rjust(9, "0") + + cmd_b3lyp = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: +text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image +/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +--header="Referer: 
https://chibakoudai.sharepoint.com/sites/stair02/Shared +%20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair02%2FShared%20 +Documents%2Fdata%2FPubChemQC%2FB3LYP%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2F +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz&viewid=f6d34767%2 +D64f0%2D480e%2Dab70%2Dd8524dbdc74e&parent=%2Fsites%2Fstair02%2FShared%20 +Documents%2Fdata%2FPubChemQC%2FB3LYP%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2F +json%2Fall" --header="Cookie: MicrosoftApplicationsTelemetryDeviceId=cec40b8a +-9870-4c4f-bb71-838a300c8685; MSFPC=GUID=511089efdbeb49d3923fdc7e6404bd9b& +HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; FedAuth= +77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+VjEzLDBoLmZ8bWV +tYmVyc2hpcHx1cm4lM2FzcG8lM2Fhbm9uIzI1YTJjOTIyMjVkMWNlNzlkOWVmN2NjYmRjNzc5Y +WI1MmJhMDY2N2E5NDRmZTg3NGFmZTFhZjRjMjQ0OGE3ZTUsMCMuZnxtZW1iZXJzaGlwfHVybiU +zYXNwbyUzYWFub24jMjVhMmM5MjIyNWQxY2U3OWQ5ZWY3Y2NiZGM3NzlhYjUyYmEwNjY3YTk0N +GZlODc0YWZlMWFmNGMyNDQ4YTdlNSwxMzM0MTM0MDE0OTAwMDAwMDAsMCwxMzM0MTYwOTY0NjE +yNDYxODcsMC4wLjAuMCwyNTgsNTIyZTlhNzYtYWNiZC00MDJiLWEyZmMtN2NmNjg5ZGRmNTkwL +CwsOTA3NGUyYTAtNDAyNS0yMDAwLWE2NDEtYjdiNDU2N2JlNzI5LDc2MjNlM2EwLWQwYzEtMjA +wMC1hNjQxLWI4MGEyYmU3YmExZSxSNFZHUmtMSXdrT3RETDI0alZUSm9RLDAsMCwwLCwsLDI2N +TA0Njc3NDM5OTk5OTk5OTksMCwsLCwsLCwwLCwxOTU2NzYsR0FkeFdYM3FnLXBsUDRlOVhCUDF +5MTZpZmpVLFN2QXdUYjI3b0MrM0RKa2hsODdRNnhkVFVpQ2l5U0tqU2RxZ3EzNUFsa2lOcmczQ +0NJZWplSmNCR1dteCtWRS8zL1lacmZFYVk3eGJGVDFSWHoxREhXVE5oK0dUSzhiQ0FYOUUxQ20 +yUXpPVG5jZm5MNDdpWUVOLzRzUzVTdnFpbnZ1eDh3L2FrQmZISW01Zlpqbk02c25KOWs5V294b +24wY1F1dUgvY1d0UUNOTkJ2WmtvRkVReitUVldBSmtQRmtxNUlibXFyL2hMUzcreGlqS3FWeXd +WZldIeGp3Q25iUTlzYitjcnhqcDlYR2szLzZ1YUFUeTMyVi9MVFBBdmM4am9wL2hRdjV4bXBnZ +k95M1cvSkljNXpPTlBlbmdQVkl2MXJtb0EwS0h6QVpCNjBnY3pEM1BaYWZVZHFsdGV6RndRTTV +xSFB3Q1hqelJ3SDRyL0Vsdz09PC9TUD4=" --header="Connection: keep-alive" "https +://chibakoudai.sharepoint.com/sites/stair02/_layouts/15/download.aspx? 
+SourceUrl=%2Fsites%2Fstair02%2FShared%20Documents%2Fdata%2FPubChemQC%2FB3LYP +%5FPM6%2Fb3lyp%5Fpm6%5Fver1%2E0%2E0%2Fjson%2Fall%2FCompound%5F{start}%5F +{stop}%2Etar%2Exz" -c -O 'Compound_{start}_{stop}.tar.xz' > /dev/null 2>&1""" + + cmd_pm6 = f"""wget --header="Host: chibakoudai.sharepoint.com" +--header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" --header="Accept: +text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/ +webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" +--header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7" +--header="Referer: https://chibakoudai.sharepoint.com/sites/stair01/ +Shared%20Documents/Forms/AllItems.aspx?ga=1&id=%2Fsites%2Fstair01%2F +Shared%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2F +json%2Fall%2FCompound%5F{start}%5F{stop}%2Etar%2Exz&viewid=2a7fb7f8 +%2Df3f8%2D4ad2%2D931e%2Dfc786e938ea8&parent=%2Fsites%2Fstair01%2FShared +%20Documents%2Fdata%2FPubChemQC%2FPM6%2Fpm6opt%5Fver2%2E0%2E0%2Fjson%2Fall" +--header="Cookie: MicrosoftApplicationsTelemetryDeviceId=cec40b8a-9870- +4c4f-bb71-838a300c8685; MSFPC=GUID=511089efdbeb49d3923fdc7e6404bd9b& +HASH=5110&LV=202303&V=4&LU=1678287635332; WSS_FullScreenMode=false; +FedAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+ +VjEzLDBoLmZ8bWVtYmVyc2hpcHx1cm4lM2FzcG8lM2Fhbm9uIzI1YTJjOTIyMjVkMWNl +NzlkOWVmN2NjYmRjNzc5YWI1MmJhMDY2N2E5NDRmZTg3NGFmZTFhZjRjMjQ0OGE3ZTUs +MCMuZnxtZW1iZXJzaGlwfHVybiUzYXNwbyUzYWFub24jMjVhMmM5MjIyNWQxY2U3OWQ5 +ZWY3Y2NiZGM3NzlhYjUyYmEwNjY3YTk0NGZlODc0YWZlMWFmNGMyNDQ4YTdlNSwxMzM0 +MTM0MDE0OTAwMDAwMDAsMCwxMzM0MTYyODU0MDg5NTI4NjAsMC4wLjAuMCwyNTgsNTIy +ZTlhNzYtYWNiZC00MDJiLWEyZmMtN2NmNjg5ZGRmNTkwLCwsOTA3NGUyYTAtNDAyNS0y +MDAwLWE2NDEtYjdiNDU2N2JlNzI5LDdiMzVlM2EwLWYwYmMtMjAwMC05ZmM0LWU4ODFi +NmM4NGNjZSxSNFZHUmtMSXdrT3RETDI0alZUSm9RLDAsMCwwLCwsLDI2NTA0Njc3NDM5 +OTk5OTk5OTksMCwsLCwsLCwwLCwxOTU2NzYsR0FkeFdYM3FnLXBsUDRlOVhCUDF5MTZp 
def download_b3lyp_pm6(start=0, stop=10000, method="b3lyp"):
    """Download archive chunks ``start`` to ``stop - 1`` for ``method``.

    Files are written into ``<local cache>/pubchemqc/<method>``. The item
    downloader writes into the current working directory, so the process
    changes into that folder for the duration of the download and restores
    the previous working directory afterwards, even if a download raises.

    Args:
        start: Index of the first chunk (inclusive).
        stop: Index one past the last chunk (exclusive).
        method: Sub-folder name and method selector passed to the item
            downloader.
    """
    path = os.path.join(get_local_cache(), "pubchemqc", method)
    os.makedirs(path, exist_ok=True)

    previous_cwd = os.getcwd()
    os.chdir(path)
    try:
        for i in tqdm(range(start, stop)):
            download_b3lyp_pm6_item(i, method=method)
    finally:
        # Do not leak the changed working directory to the caller.
        os.chdir(previous_cwd)


@click.command()
@click.option("--id", "-i", type=int, default=0, help="chunk id starting at 0")
@click.option("--chunk-size", "-s", type=int, default=50, help="Chunk size to divide and conquer.")
@click.option("--method", "-m", type=str, default="pm6", help="QM Method used for the calculations.")
def main(id, chunk_size, method):
    """Download the ``id``-th group of ``chunk_size`` consecutive chunks."""
    # ``id`` shadows the builtin, but click derives the parameter name from
    # the ``--id`` option, so it is kept as is.
    start = id * chunk_size
    stop = (id + 1) * chunk_size
    download_b3lyp_pm6(start=start, stop=stop, method=method)