diff --git a/0.1.2/404.html b/0.1.2/404.html new file mode 100644 index 0000000..f7c5080 --- /dev/null +++ b/0.1.2/404.html @@ -0,0 +1,1875 @@ + + + +
+ + + + + + + + + + + + + + + + +The BaseDataset defining shared functionality between all datasets.
+ + + +BaseDataset
+
+
+¶
+ Bases: DatasetPropertyMixIn
Base class for datasets in the openQDC package.
+ +openqdc/datasets/base.py
77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 
+477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 |
|
__force_methods__
+
+
+ property
+
+
+¶For backward compatibility. To be removed in the future.
+e0s_dispatcher: AtomEnergies
+
+
+ property
+
+
+¶Property to get the object that dispatched the isolated atom energies of the QM methods.
+ + +Returns:
+Type | +Description | +
---|---|
+ AtomEnergies
+ |
+
+
+
+ Object wrapping the isolated atom energies of the QM methods. + |
+
energy_methods: List[str]
+
+
+ property
+
+
+¶Return the string version of the energy methods
+__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 'linear', 'sub_sample': None, 'stride': 1})
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
energy_unit |
+
+ Optional[str]
+ |
+
+
+
+ Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] + |
+
+ None
+ |
+
distance_unit |
+
+ Optional[str]
+ |
+
+
+
+ Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] + |
+
+ None
+ |
+
array_format |
+
+ str
+ |
+
+
+
+ Format to return arrays in. Supported formats: ["numpy", "torch", "jax"] + |
+
+ 'numpy'
+ |
+
energy_type |
+
+ Optional[str]
+ |
+
+
+
+ Type of isolated atom energy to use for the dataset. Default: "formation" +Supported types: ["formation", "regression", "null", None] + |
+
+ 'formation'
+ |
+
overwrite_local_cache |
+
+ bool
+ |
+
+
+
+ Whether to overwrite the locally cached dataset. + |
+
+ False
+ |
+
cache_dir |
+
+ Optional[str]
+ |
+
+
+
+ Cache directory location. Defaults to "~/.cache/openqdc" + |
+
+ None
+ |
+
recompute_statistics |
+
+ bool
+ |
+
+
+
+ Whether to recompute the statistics of the dataset. + |
+
+ False
+ |
+
transform |
+
+ Optional[Callable]
+ |
+
+
+
+ transformation to apply to the getitem calls + |
+
+ None
+ |
+
regressor_kwargs |
+
+ Dict
+ |
+
+
+
+ Dictionary of keyword arguments to pass to the regressor. +Default: {"solver_type": "linear", "sub_sample": None, "stride": 1} +solver_type can be one of ["linear", "ridge"] + |
+
+ {'solver_type': 'linear', 'sub_sample': None, 'stride': 1}
+ |
+
openqdc/datasets/base.py
98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 |
|
__smiles_converter__(x)
+
+¶util function to convert string to smiles: useful if the smiles is +encoded in a different format than its display format
+ +openqdc/datasets/base.py
719 +720 +721 +722 +723 |
|
as_iter(atoms=False, energy_method=0)
+
+¶Return the dataset as an iterator.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
atoms |
+
+ bool
+ |
+
+
+
+ Whether to return the items as ASE atoms object, by default False + |
+
+ False
+ |
+
energy_method |
+
+ int
+ |
+
+
+
+ Index of the energy method to use + |
+
+ 0
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Iterable
+ |
+
+
+
+ Iterator of the dataset + |
+
openqdc/datasets/base.py
651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 |
|
calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)
+
+¶Compute the descriptors for the dataset.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
descriptor_name |
+
+ str
+ |
+
+
+
+ Name of the descriptor to use. Supported descriptors are ["soap"] + |
+
+ 'soap'
+ |
+
chemical_species |
+
+ Optional[List[str]]
+ |
+
+
+
+ List of chemical species to use for the descriptor computation, by default None. +If None, the chemical species of the dataset are used. + |
+
+ None
+ |
+
n_samples |
+
+ Optional[Union[List[int], int, float]]
+ |
+
+
+
+ Number of samples to use for the computation, by default None. +If None, all the dataset is used. +If a list of integers is provided, the descriptors are computed for +each of the specified idx of samples. + |
+
+ None
+ |
+
progress |
+
+ bool
+ |
+
+
+
+ Whether to show a progress bar, by default True. + |
+
+ True
+ |
+
**descriptor_kwargs |
+ + | +
+
+
+ dict +Keyword arguments to pass to the descriptor instantiation of the model. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Dict[str, ndarray]
+ |
+
+
+
+ Dictionary containing the following keys: +- values : np.ndarray of shape (N, M) containing the descriptors for the dataset +- idxs : np.ndarray of shape (N,) containing the indices of the samples used + |
+
openqdc/datasets/base.py
600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 |
|
collate_list(list_entries)
+
+¶Collate a list of entries into a single dictionary.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
list_entries |
+
+ List[Dict]
+ |
+
+
+
+ List of dictionaries containing the entries to collate. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Dict
+ |
+
+
+
+ Dictionary containing the collated entries. + |
+
openqdc/datasets/base.py
389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 |
|
get_ase_atoms(idx, energy_method=0, ext=True)
+
+¶Get the ASE atoms object for the entry at index idx.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
idx |
+
+ int
+ |
+
+
+
+ Index of the entry. + |
+ + required + | +
energy_method |
+
+ int
+ |
+
+
+
+ Index of the energy method to use + |
+
+ 0
+ |
+
ext |
+
+ bool
+ |
+
+
+
+ Whether to include additional information + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Atoms
+ |
+
+
+
+ ASE atoms object + |
+
openqdc/datasets/base.py
566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 |
|
get_statistics(return_none=True)
+
+¶Get the converted statistics of the dataset.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
return_none |
+ + | +
+
+
+ Whether to return None if the statistics for the forces are not available, by default True +Otherwise, the statistics for the forces are set to 0.0 + |
+
+ True
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Dict
+ |
+
+
+
+ Dictionary containing the statistics of the dataset + |
+
openqdc/datasets/base.py
674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 |
|
is_cached()
+
+¶Check if the dataset is cached locally.
+ + +Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the dataset is cached locally, False otherwise. + |
+
openqdc/datasets/base.py
480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 |
|
is_preprocessed()
+
+¶Check if the dataset is preprocessed and available online or locally.
+ + +Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the dataset is available remotely or locally, False otherwise. + |
+
openqdc/datasets/base.py
466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 |
|
no_init()
+
+
+ classmethod
+
+
+¶Class method to prevent the init method from being called when the class is instantiated. +Useful for debugging purposes or preprocessing data.
+ +openqdc/datasets/base.py
209 +210 +211 +212 +213 +214 +215 |
|
preprocess(upload=False, overwrite=True, as_zarr=True)
+
+¶Preprocess the dataset and save it.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
upload |
+
+ bool
+ |
+
+
+
+ Whether to upload the preprocessed data to the remote storage or only saving it locally. + |
+
+ False
+ |
+
overwrite |
+
+ bool
+ |
+
+
+
+ Whether to overwrite the preprocessed data if it already exists. +Only used if upload is True. Cache is always overwritten locally. + |
+
+ True
+ |
+
as_zarr |
+
+ bool
+ |
+
+
+
+ Whether to save the data as zarr files + |
+
+ True
+ |
+
openqdc/datasets/base.py
494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 |
|
read_raw_entries()
+
+¶Preprocess the raw data (i.e., as fetched from the source) into a list of dictionaries.
+ +openqdc/datasets/base.py
383 +384 +385 +386 +387 |
|
save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)
+
+¶Save the preprocessed data to the cache directory and optionally upload it to the remote storage.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_dict |
+
+ Dict[str, ndarray]
+ |
+
+
+
+ Dictionary containing the preprocessed data. + |
+ + required + | +
upload |
+
+ bool
+ |
+
+
+
+ Whether to upload the preprocessed data to the remote storage or only saving it locally. + |
+
+ False
+ |
+
overwrite |
+
+ bool
+ |
+
+
+
+ Whether to overwrite the preprocessed data if it already exists. +Only used if upload is True. Cache is always overwritten locally. + |
+
+ True
+ |
+
openqdc/datasets/base.py
410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 |
|
save_xyz(idx, energy_method=0, path=None, ext=True)
+
+¶Save a single entry at index idx as an extxyz file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
idx |
+
+ int
+ |
+
+
+
+ Index of the entry + |
+ + required + | +
energy_method |
+
+ int
+ |
+
+
+
+ Index of the energy method to use + |
+
+ 0
+ |
+
path |
+
+ Optional[str]
+ |
+
+
+
+ Path to save the xyz file. If None, the current working directory is used. + |
+
+ None
+ |
+
ext |
+
+ bool
+ |
+
+
+
+ Whether to include additional information like forces and other metadata (extxyz format) + |
+
+ True
+ |
+
openqdc/datasets/base.py
529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 |
|
set_distance_unit(value)
+
+¶Set a new distance unit for the dataset.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
value |
+
+ str
+ |
+
+
+
+ New distance unit to set. + |
+ + required + | +
openqdc/datasets/base.py
366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 |
|
set_energy_unit(value)
+
+¶Set a new energy unit for the dataset.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
value |
+
+ str
+ |
+
+
+
+ New energy unit to set. + |
+ + required + | +
openqdc/datasets/base.py
353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 |
|
to_xyz(energy_method=0, path=None)
+
+¶Save dataset as single xyz file (extended xyz format).
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
energy_method |
+
+ int
+ |
+
+
+
+ Index of the energy method to use + |
+
+ 0
+ |
+
path |
+
+ Optional[str]
+ |
+
+
+
+ Path to save the xyz file + |
+
+ None
+ |
+
openqdc/datasets/base.py
548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 |
|
upload(overwrite=False, as_zarr=False)
+
+¶Upload the preprocessed data to the remote storage. Must be called after preprocess and +requires write privileges.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
overwrite |
+
+ bool
+ |
+
+
+
+ Whether to overwrite the remote data if it already exists + |
+
+ False
+ |
+
as_zarr |
+
+ bool
+ |
+
+
+
+ Whether to upload the data as zarr files + |
+
+ False
+ |
+
openqdc/datasets/base.py
512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 |
|
Alchemy
+
+
+¶
+ Bases: BaseDataset
Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. +Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level +with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used +to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G +is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the +B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The +auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange +matrix.
+Usage: +
from openqdc.datasets import Alchemy
+dataset = Alchemy()
+
https://arxiv.org/abs/1906.09427 +https://alchemy.tencent.com/
+openqdc/datasets/potential/alchemy.py
46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 |
|
ANI1
+
+
+¶
+ Bases: BaseDataset
The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic +molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the +wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules +are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary +point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT +level.
+Usage: +
from openqdc.datasets import ANI1
+dataset = ANI1()
+
openqdc/datasets/potential/ani.py
40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 |
|
ANI1CCX
+
+
+¶
+ Bases: ANI1
ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active +learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.
+Usage: +
from openqdc.datasets import ANI1CCX
+dataset = ANI1CCX()
+
openqdc/datasets/potential/ani.py
160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 |
|
__smiles_converter__(x)
+
+¶util function to convert string to smiles: useful if the smiles is +encoded in a different format than its display format
+ +openqdc/datasets/potential/ani.py
197 +198 +199 +200 +201 |
|
ANI1CCX_V2
+
+
+¶
+ Bases: ANI1CCX
ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels +for each conformation.
+Usage: +
from openqdc.datasets import ANI1CCX_V2
+dataset = ANI1CCX_V2()
+
openqdc/datasets/potential/ani.py
204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 |
|
ANI1X
+
+
+¶
+ Bases: ANI1
The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to +a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, +generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the following techniques +is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and +(4) torsion sampling.
+Usage: +
from openqdc.datasets import ANI1X
+dataset = ANI1X()
+
openqdc/datasets/potential/ani.py
99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 |
|
ANI2X
+
+
+¶
+ Bases: ANI1
The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and S66x8. +It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized +using the LBFGS algorithm and labeled with ωB97X/6-31G*. The same sampling techniques as in ANI-1X are +used for generating geometries.
+Usage: +
from openqdc.datasets import ANI2X
+dataset = ANI2X()
+
openqdc/datasets/potential/ani.py
227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 |
|
COMP6
+
+
+¶
+ Bases: BaseDataset
COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the +ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and +Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using +the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and +molecular dipoles.
+ + +S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and
+mixed influence interactions.
+ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular
+
dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small +proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point +calculations are performed to calculate energies and forces.
+GDB7to9: Consists of 1500 molecules, 500 each with 7, 8 and 9 heavy atoms, subsampled from the GDB-11 dataset.
+
The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence +criteria. Normal modes/force constants are computed using the reference DFT model. Finally, diverse normal +mode sampling (DNMS) is carried out to generate non-equilibrium conformations.
+GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11
+
and 1000 molecules per 12 and 13 heavy atoms are subsampled from GDB-13. Non-equilibrium conformations are +generated via DNMS.
+Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.
+
+DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.
+
Structures are optimized similar to GDB7to9.
+Usage: +
from openqdc.datasets import COMP6
+dataset = COMP6()
+
https://aip.scitation.org/doi/abs/10.1063/1.5023802
+https://github.com/isayev/COMP6
+S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d
+GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/
+ + +openqdc/datasets/potential/comp6.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 |
|
__smiles_converter__(x)
+
+¶util function to convert string to smiles: useful if the smiles is +encoded in a different format than its display format
+ +openqdc/datasets/potential/comp6.py
81 +82 +83 +84 +85 |
|
DES370K
+
+
+¶
+ Bases: BaseInteractionDataset
, IDES
DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies +computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules +and ions) including water and functional groups found in proteins. Dimer geometries are generated using +QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.
+Usage: +
from openqdc.datasets import DES370K
+dataset = DES370K()
+
openqdc/datasets/interaction/des.py
75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 |
|
DES5M
+
+
+¶
+ Bases: DES370K
DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies +computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using +QM based optimization and MD simulations.
+Usage: +
from openqdc.datasets import DES5M
+dataset = DES5M()
+
openqdc/datasets/interaction/des.py
180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 |
|
DESS66
+
+
+¶
+ Bases: DES370K
DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS +dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. +The protocol for estimating energies is based on the DES370K paper.
+Usage: +
from openqdc.datasets import DESS66
+dataset = DESS66()
+
openqdc/datasets/interaction/des.py
254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 |
|
DESS66x8
+
+
+¶
+ Bases: DESS66
DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS +dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve +giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.
+Usage: +
from openqdc.datasets import DESS66x8
+dataset = DESS66x8()
+
openqdc/datasets/interaction/des.py
279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 |
|
GDML
+
+
+¶
+ Bases: BaseDataset
Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio +molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene +(627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin +(211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), +Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for +each conformation are computed using the PBE + vdW-TS electronic structure method.
+ + +Benzene: 627000 samples
+Uracil: 133000 samples
+Naphthalene: 326000 samples
+Aspirin: 211000 samples
+Salicylic Acid: 320000 samples
+Malonaldehyde: 993000 samples
+Ethanol: 555000 samples
+Toluene: 100000 samples
+Usage: +
from openqdc.datasets import GDML
+dataset = GDML()
+
openqdc/datasets/potential/gdml.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 |
|
+ Bases: BaseDataset
Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules +from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. +For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and +the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the +conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.
+Usage: +
from openqdc.datasets import GEOM
+dataset = GEOM()
+
https://www.nature.com/articles/s41597-022-01288-4
+https://github.com/learningmatter-mit/geom
+CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d
+openqdc/datasets/potential/geom.py
62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 |
|
ISO17
+
+
+¶
+ Bases: BaseDataset
ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of +atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing +5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics +trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient +approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der +Waals correction method.
+Usage: +
from openqdc.datasets import ISO17
+dataset = ISO17()
+
https://arxiv.org/abs/1706.08566
+https://arxiv.org/abs/1609.08259
+https://www.nature.com/articles/sdata201422
+ + +openqdc/datasets/potential/iso_17.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 |
|
__smiles_converter__(x)
+
+¶util function to convert string to smiles: useful if the smiles is +encoded in a different format than its display format
+ +openqdc/datasets/potential/iso_17.py
52 +53 +54 +55 +56 |
|
L7
+
+
+¶
+ Bases: YamlDataset
The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with +energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are +taken from crystal X-ray data and optimized with a DFT method specific to the complex.
+Usage: +
from openqdc.datasets import L7
+dataset = L7()
+
openqdc/datasets/interaction/l7.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 |
|
MD22
+
+
+¶
+ Bases: RevMD17
MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, +ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories +are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD +level of theory.
+Usage: +
from openqdc.datasets import MD22
+dataset = MD22()
+
openqdc/datasets/potential/md22.py
42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 |
|
Metcalf
+
+
+¶
+ Bases: BaseInteractionDataset
Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to +156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and +the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various +components.
+Usage: +
from openqdc.datasets import Metcalf
+dataset = Metcalf()
+
openqdc/datasets/interaction/metcalf.py
85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 |
|
Molecule3D
+
+
+¶
+ Bases: BaseDataset
Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the +B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing +molecules with invalid molecule files, with SMILES conversion error, RDKit warnings, sanitization problems, +or with damaged log files.
+Usage: +
from openqdc.datasets import Molecule3D
+dataset = Molecule3D()
+
openqdc/datasets/potential/molecule3d.py
68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 |
|
read_mol(mol, energy)
+
+¶Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies
+mol: Chem.rdchem.Mol + RDKit molecule +energy: float + Energy of the molecule
+res: dict + Dictionary containing the following keys: + - name: np.ndarray of shape (N,) containing the smiles of the molecule + - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions + - energies: np.ndarray of shape (1,) containing the energy of the conformer + - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer + - subset: np.ndarray of shape (1) containing "molecule3d"
+ +openqdc/datasets/potential/molecule3d.py
16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 |
|
MultixcQM9
+
+
+¶
+ Bases: BaseDataset
MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting +of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets +resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the +molecules are used directly from Kim et al. which uses G4MP2 method.
+Usage: +
from openqdc.datasets import MultixcQM9
+dataset = MultixcQM9()
+
https://www.nature.com/articles/s41597-023-02690-2
+ + +openqdc/datasets/potential/multixcqm9.py
38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 
+438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 |
|
NablaDFT
+
+
+¶
+ Bases: BaseDataset
NablaDFT is a dataset constructed from a subset of the +Molecular Sets (MOSES) dataset consisting of 1 million molecules +with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of +conformations is generated using RDKit. Second, using the Butina Clustering Method on conformations, clusters that +cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set. +This results in 1-62 conformations per molecule. For generating quantum properties, the Kohn-Sham method at the +wB97X-D/def2-SVP level is used to generate the energy.
+Usage: +
from openqdc.datasets import NablaDFT
+dataset = NablaDFT()
+
https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D
+ +openqdc/datasets/potential/nabladft.py
51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 |
|
OrbnetDenali
+
+
+¶
+ Bases: BaseDataset
Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range +of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and +counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. +First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer +generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using +normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of +theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of +theory.
+Usage: +
from openqdc.datasets import OrbnetDenali
+dataset = OrbnetDenali()
+
https://arxiv.org/abs/2107.00299
+https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
+openqdc/datasets/potential/orbnet_denali.py
37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 |
|
PCQM_B3LYP
+
+
+¶
+ Bases: PCQM_PM6
PubChemQC B3LYP/6-31G (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to +biomolecules. The geometries for the molecules are optimized using PM6. Using the optimized geometry, +the electronic structure and properties are calculated using the B3LYP/6-31G method.
+Usage: +
from openqdc.datasets import PCQM_B3LYP
+dataset = PCQM_B3LYP()
+
openqdc/datasets/potential/pcqm.py
178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 |
|
PCQM_PM6
+
+
+¶
+ Bases: BaseDataset
PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized +molecular geometries and electronic properties. To generate the dataset, only molecules with weights less +than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel +and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also +computed using the PM6 method.
+Usage: +
from openqdc.datasets import PCQM_PM6
+dataset = PCQM_PM6()
+
openqdc/datasets/potential/pcqm.py
68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 |
|
MDDataset
+
+
+¶
+ Bases: ProteinFragments
MDDataset is a subset of the proteinfragments dataset that +was generated from molecular dynamics with their model. +The sampling was done with Molecular Dynamics +at room temperature (300K) in various solvent phases:
+ + +Polyalanine: + All the polyalanine are sampled in gas phase. AceAla15Lys is + a polyalanine peptide capped with an N-terminal acetyl group + and a protonated lysine residue at the C-terminus, + Acela15nme is a polyalanine peptide capped with an N-terminal acetyl group + and a C-terminal N-methyl amide group
+Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)
+Usage: +
from openqdc.datasets import MDDataset
+dataset = MDDataset()
+
openqdc/datasets/potential/proteinfragments.py
161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 |
|
ProteinFragments
+
+
+¶
+ Bases: BaseDataset
ProteinFragments is a dataset of protein fragments; +the data was generated from a top-down and a bottom-up approach:
+ + +Fragments are generated by cutting out a spherical +region around an atom (including solvent molecules) +and saturating all dangling bonds. +Sampling was done with the Molecular Dynamics (MD) method from +conventional FF at room temperature.
+Fragments are generated by constructing chemical graphs +of one to eight nonhydrogen atoms. +Sampling of multiple conformers per fragments was done with +MD simulations at high temperatures or normal mode sampling.
+Usage: +
from openqdc.datasets import ProteinFragments
+dataset = ProteinFragments()
+
openqdc/datasets/potential/proteinfragments.py
91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 |
|
QM1B
+
+
+¶
+ Bases: BaseDataset
QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom +PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are +subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. +Electronic properties for each conformation are then calculated using the density functional B3LYP +and the basis set STO-3G.
+Usage: +
from openqdc.datasets import QM1B
+dataset = QM1B()
+
openqdc/datasets/potential/qm1b.py
79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 |
|
QM1B_SMALL
+
+
+¶
+ Bases: QM1B
QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.
+Usage: +
from openqdc.datasets import QM1B_SMALL
+dataset = QM1B_SMALL()
+
openqdc/datasets/potential/qm1b.py
145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 |
|
QM7X
+
+
+¶
+ Bases: BaseDataset
QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with +up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations, +OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta- +stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure +is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) +interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non +-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of +normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has +energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.
+Usage: +
from openqdc.datasets import QM7X
+dataset = QM7X()
+
openqdc/datasets/potential/qm7x.py
36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 |
|
QM7X_V2
+
+
+¶
+ Bases: QM7X
QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.
+Usage: +
from openqdc.datasets import QM7X_V2
+dataset = QM7X_V2()
+
openqdc/datasets/potential/qm7x.py
86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 |
|
QMugs
+
+
+¶
+ Bases: BaseDataset
The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules +extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB +method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical +method (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP).
+Usage: +
from openqdc.datasets import QMugs
+dataset = QMugs()
+
https://arxiv.org/abs/2107.00367
+https://www.nature.com/articles/s41597-022-01390-7#ethics
+https://www.research-collection.ethz.ch/handle/20.500.11850/482129
+openqdc/datasets/potential/qmugs.py
38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 |
|
QMugs_V2
+
+
+¶
+ Bases: QMugs
QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.
+Usage: +
from openqdc.datasets import QMugs_V2
+dataset = QMugs_V2()
+
openqdc/datasets/potential/qmugs.py
80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 |
|
QM7
+
+
+¶
+ Bases: QMX
QM7 is a dataset constructed from subsets of the GDB-13 database ( +stable and synthetically accessible organic molecules), +containing up to seven “heavy” atoms. +The molecular conformations are optimized using DFT at the +PBE0/def2-TZVP level of theory.
+ + +[C, N, O, S, H]
+Usage: +
from openqdc.datasets import QM7
+dataset = QM7()
+
openqdc/datasets/potential/qmx.py
79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 |
|
QM7b
+
+
+¶
+ Bases: QMX
QM7b is a dataset constructed from subsets of the GDB-13 database ( +stable and synthetically accessible organic molecules), +containing up to seven “heavy” atoms. +The molecular conformations are optimized using DFT at the +PBE0/def2-TZVP level of theory.
+ + +[C, N, O, S, Cl, H]
+Usage: +
from openqdc.datasets import QM7b
+dataset = QM7b()
+
openqdc/datasets/potential/qmx.py
189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 |
|
QM8
+
+
+¶
+ Bases: QMX
QM8 is the subset of QM9 used in a study on modeling quantum +mechanical calculations of electronic spectra and excited +state energy (an increase of energy from the ground state) of small molecules +up to eight heavy atoms. +Multiple methods were used, including +time-dependent density functional theories (TDDFT) and +second-order approximate coupled-cluster (CC2). +The molecular conformations are relaxed geometries computed using +the DFT B3LYP with basis set 6-31G(2df,p). +For more information about the sampling, check QM9 dataset.
+Usage: +
from openqdc.datasets import QM8
+dataset = QM8()
+
openqdc/datasets/potential/qmx.py
293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 |
|
QM9
+
+
+¶
+ Bases: QMX
QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database, +containing up to 9 “heavy” atoms. All molecular properties are calculated at B3LYP/6-31G(2df,p) +level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed +by relaxing geometries with quantum mechanical method B3LYP.
+Usage: +
from openqdc.datasets import QM9
+dataset = QM9()
+
openqdc/datasets/potential/qmx.py
361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 |
|
QMX
+
+
+¶
+ Bases: ABC
, BaseDataset
QMX dataset base abstract class
+ +openqdc/datasets/potential/qmx.py
36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 |
|
RevMD17
+
+
+¶
+ Bases: BaseDataset
Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original +dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies +are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and very dense DFT integration +grid. The dataset contains the following molecules: + Benzene: 627000 samples
+Uracil: 133000 samples
+
+Naphthalene: 326000 samples
+
+Aspirin: 211000 samples
+
+Salicylic Acid: 320000 samples
+
+Malonaldehyde: 993000 samples
+
+Ethanol: 555000 samples
+
+Toluene: 100000 samples
+
Usage: +
from openqdc.datasets import RevMD17
+dataset = RevMD17()
+
openqdc/datasets/potential/revmd17.py
55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 |
|
SN2RXN
+
+
+¶
+ Bases: BaseDataset
This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and +contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by +running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment +(ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and +for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset +contains 452,709 structures along with the energy, force and dipole moments.
+Usage: +
from openqdc.datasets import SN2RXN
+dataset = SN2RXN()
+
openqdc/datasets/potential/sn2_rxn.py
40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 |
|
SolvatedPeptides
+
+
+¶
+ Bases: BaseDataset
The solvated protein fragments dataset probes many-body intermolecular interactions between "protein fragments" +and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are +run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10 +steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.
+Usage: +
from openqdc.datasets import SolvatedPeptides
+dataset = SolvatedPeptides()
+
openqdc/datasets/potential/solvated_peptides.py
8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 |
|
__smiles_converter__(x)
+
+¶util function to convert string to smiles: useful if the smiles is +encoded in a different format than its display format
+ +openqdc/datasets/potential/solvated_peptides.py
49 +50 +51 +52 +53 |
|
Spice
+
+
+¶
+ Bases: BaseDataset
Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of +small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, +and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate +100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and +molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the +wB97M-D3(BJ)/def2-TZVPPD level of theory.
+Usage: +
from openqdc.datasets import Spice
+dataset = Spice()
+
openqdc/datasets/potential/spice.py
41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 |
|
SpiceV2
+
+
+¶
+ Bases: Spice
SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. +The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain +silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve +sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and +(5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.
+Usage: +
from openqdc.datasets import SpiceV2
+dataset = SpiceV2()
+
openqdc/datasets/potential/spice.py
100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 |
|
SpiceVL2
+
+
+¶
+ Bases: SpiceV2
SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.
+Usage: +
from openqdc.datasets import SpiceVL2
+dataset = SpiceVL2()
+
openqdc/datasets/potential/spice.py
156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 |
|
read_record(r, obj)
+
+¶Read record from hdf5 file. + r : hdf5 record + obj : Spice class object used to grab subset and names
+ +openqdc/datasets/potential/spice.py
13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 |
|
Splinter
+
+
+¶
+ Bases: BaseInteractionDataset
Splinter consists of 30,416 dimer pairs with over 1.5 million geometries. The geometries are generated +by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies +and the various components are computed using SAPT0/aug-cc-pV(D+d)Z method.
+Usage: +
from openqdc.datasets import Splinter
+dataset = Splinter()
+
openqdc/datasets/interaction/splinter.py
13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 |
|
TMQM
+
+
+¶
+ Bases: BaseDataset
tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of +organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated +at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database +and then optimized in gas phase with the extended tight-binding GFN2-xTB method.
+Usage: +
from openqdc.datasets import TMQM
+dataset = TMQM()
+
openqdc/datasets/potential/tmqm.py
48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 |
|
Transition1X
+
+
+¶
+ Bases: BaseDataset
Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy +and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and +the transition states are generated by running Nudged Elastic Band (NEB) with DFT.
+Usage: +
from openqdc.datasets import Transition1X
+dataset = Transition1X()
+
References: +- https://www.nature.com/articles/s41597-022-01870-w
+ + +openqdc/datasets/potential/transition1x.py
40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 |
|
VQM24
+
+
+¶
+ Bases: BaseDataset
Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical +properties calculated at wB97x-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional +isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and +relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.
+Usage: +
from openqdc.datasets import VQM24
+dataset = VQM24()
+
openqdc/datasets/potential/vqm24.py
42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 |
|
SCANWaterClusters
+
+
+¶
+ Bases: BaseDataset
The SCAN Water Clusters dataset contains conformations of +neutral water clusters containing up to 20 monomers, charged water clusters, +and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: +the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 +neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of +ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F−, Cl−, or Br−. +Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics +simulations using AMBER 9 and optimized; the +lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.
+ + +[H, O, Li, Na, K, F, Cl, Br]
+Usage: +
from openqdc.datasets import SCANWaterClusters
+dataset = SCANWaterClusters()
+
https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec
+https://github.com/esoteric-ephemera/water_cluster_density_errors
+openqdc/datasets/potential/waterclusters.py
103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 |
|
WaterClusters
+
+
+¶
+ Bases: BaseDataset
The WaterClusters dataset contains putative minima and low energy networks for water +clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with +the TTM2.1-F ab-initio based interaction potential for water. +It contains approximately 4.5 million structures. +Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.
+ + +["H", "O"]
+Usage: +
from openqdc.datasets import WaterClusters
+dataset = WaterClusters()
+
openqdc/datasets/potential/waterclusters3_30.py
50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 |
|
X40
+
+
+¶
+ Bases: YamlDataset
X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules +where the halogens participate in various interaction types such as electrostatic interactions, London +dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic +molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries +are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are +computed with CCSD(T)/CBS level of theory.
+Usage: +
from openqdc.datasets import X40
+dataset = X40()
+
openqdc/datasets/interaction/x40.py
9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 |
|
AtomEnergies
+
+
+¶Manager class that interfaces with the isolated atom energy classes +and provides the general functions to retrieve the data
+ +openqdc/datasets/energies.py
98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 |
|
e0s_dict: Dict[AtomSpecies, AtomEnergy]
+
+
+ property
+
+
+¶Return the isolated atom energies dictionary
+ + +Returns:
+Type | +Description | +
---|---|
+ Dict[AtomSpecies, AtomEnergy]
+ |
+
+
+
+ Dictionary with the isolated atom energies + |
+
e0s_matrix: np.ndarray
+
+
+ property
+
+
+¶Return the isolated atom energies matrix
+ + +Returns:
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Matrix Array with the isolated atom energies + |
+
__getitem__(item)
+
+¶Retrieve a key from the isolated atom dictionary. +Item can be written as tuple(Symbol, charge), +tuple(Chemical number, charge). If no charge is passed, +it will be automatically set to 0.
+ + +Examples:
+AtomEnergies[6], AtomEnergies[6,1],
+AtomEnergies["C",1], AtomEnergies[(6,1)],
+AtomEnergies[("C",1)]
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
item |
+
+ AtomSpecies
+ |
+
+
+
+ AtomSpecies object or tuple with the atom symbol and charge + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ AtomEnergy
+ |
+
+
+
+ AtomEnergy object with the isolated atom energy + |
+
openqdc/datasets/energies.py
134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 |
|
AtomEnergy
+
+
+
+ dataclass
+
+
+¶Datastructure to store isolated atom energies +and the std deviation associated to the value. +By default the std will be 1 if no value was calculated +or not available (formation energy case)
+ +openqdc/datasets/energies.py
74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 |
|
append(other)
+
+¶Append the mean and std of another atom energy
+ +openqdc/datasets/energies.py
90 +91 +92 +93 +94 +95 |
|
AtomSpecies
+
+
+
+ dataclass
+
+
+¶Structure that defines a tuple of chemical species and charge +and provides hashing and automatic conversion from atomic number to +chemical symbol
+ +openqdc/datasets/energies.py
48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 |
|
IsolatedEnergyInterface
+
+
+¶
+ Bases: ABC
Abstract class that defines the interface for the +different implementation of an isolated atom energy value
+ +openqdc/datasets/energies.py
166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 |
|
e0_dict: Dict
+
+
+ property
+
+
+¶Return the isolated atom energies dict
+ + +Returns:
+Type | +Description | +
---|---|
+ Dict
+ |
+
+
+
+ Dictionary with the isolated atom energies + |
+
e0_matrix: np.ndarray
+
+
+ property
+
+
+¶Return the isolated atom energies matrix
+ + +Returns:
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Matrix Array with the isolated atom energies + |
+
__init__(data, **kwargs)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data |
+ + | +
+
+
+ openqdc.datasets.Dataset +Dataset object that contains the information +about the isolated atom energies. Info will be passed +by reference + |
+ + required + | +
kwargs |
+ + | +
+
+
+ dict +Additional arguments that will be passed to the +selected energy class. Mostly used for regression +to pass the regressor_kwargs. + |
+
+ {}
+ |
+
openqdc/datasets/energies.py
172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 |
|
NullEnergy
+
+
+¶
+ Bases: IsolatedEnergyInterface
Class that returns a null (zeros) matrix for the isolated atom energies in case +no energies are available.
+ +openqdc/datasets/energies.py
252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 |
|
PhysicalEnergy
+
+
+¶
+ Bases: IsolatedEnergyInterface
Class that returns physical (SE, DFT, etc.) isolated atom energies.
+ +openqdc/datasets/energies.py
230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 |
|
RegressionEnergy
+
+
+¶
+ Bases: IsolatedEnergyInterface
Class that computes and returns the regressed isolated atom energies.
+ +openqdc/datasets/energies.py
275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 |
|
preprocess_path
+
+
+ property
+
+
+¶Return the path to the object pickle file.
+attempt_load()
+
+¶Try to load the regressed isolated atom energies from the +object pickle file and return the success of the operation.
+ +openqdc/datasets/energies.py
336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 |
|
save_e0s()
+
+¶Save the regressed isolated atom energies in a pickle file.
+ +openqdc/datasets/energies.py
330 +331 +332 +333 +334 |
|
dispatch_factory(data, **kwargs)
+
+¶Factory function that selects the correct +energy class for the fetching/calculation +of isolated atom energies.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data |
+ + | +
+
+
+ openqdc.datasets.Dataset +Dataset object that contains the information +about the isolated atom energies. Info will be passed +by reference + |
+ + required + | +
kwargs |
+ + | +
+
+
+ dict +Additional arguments that will be passed to the +selected energy class. Mostly used for regression +to pass the regressor_kwargs. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ IsolatedEnergyInterface
+ |
+
+
+
+ Initialized IsolatedEnergyInterface-like object + |
+
openqdc/datasets/energies.py
17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 |
|
GeneralStructure
+
+
+¶
+ Bases: ABC
Abstract Factory class for datasets type in the openQDC package.
+ +openqdc/datasets/structure.py
13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 |
|
load_fn: Callable
+
+
+ abstractmethod
+ property
+
+
+¶Function to use for loading the data. +Must be implemented by the child class.
+ + +Returns:
+Type | +Description | +
---|---|
+ Callable
+ |
+
+
+
+ the function to use for loading the data + |
+
add_extension(filename)
+
+¶Add the correct extension to a filename
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
filename |
+
+ str
+ |
+
+
+
+ the filename to add the extension to + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ the filename with the extension + |
+
openqdc/datasets/structure.py
37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 |
|
join_and_ext(path, filename)
+
+¶Join a path and a filename and add the correct extension.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
path |
+
+ Union[str, PathLike]
+ |
+
+
+
+ the path to join + |
+ + required + | +
filename |
+
+ str
+ |
+
+
+
+ the filename to join + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Union[str, PathLike]
+ |
+
+
+
+ the joined path with the correct extension + |
+
openqdc/datasets/structure.py
93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 |
|
load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)
+
+¶Main method to load the data from a filetype structure like memmap or zarr.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
preprocess_path |
+
+ Union[str, PathLike]
+ |
+
+
+
+ path to the preprocessed data file + |
+ + required + | +
data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the data file + |
+ + required + | +
data_types |
+
+ Dict[str, dtype]
+ |
+
+
+
+ dictionary of data types for each key + |
+ + required + | +
data_shapes |
+
+ Dict[str, Tuple[int, int]]
+ |
+
+
+
+ dictionary of shapes for each key + |
+ + required + | +
extra_data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the extra data file + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ whether to overwrite the local cache + |
+ + required + | +
openqdc/datasets/structure.py
106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 |
|
load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite)
+
+
+ abstractmethod
+
+
+¶Load extra files required to define other types of data. +Must be implemented by the child class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data |
+
+ Dict[str, ndarray]
+ |
+
+
+
+ dictionary of data to load + |
+ + required + | +
preprocess_path |
+
+ Union[str, PathLike]
+ |
+
+
+
+ path to the preprocessed data file + |
+ + required + | +
data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the data file + |
+ + required + | +
pkl_data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the extra files + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ whether to overwrite the local cache + |
+ + required + | +
openqdc/datasets/structure.py
71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 |
|
save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types)
+
+
+ abstractmethod
+
+
+¶Save the preprocessed data to the cache directory and optionally upload it to the remote storage. +Must be implemented by the child class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
preprocess_path |
+
+ Union[str, PathLike]
+ |
+
+
+
+ path to the preprocessed data file + |
+ + required + | +
data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the data file + |
+ + required + | +
data_dict |
+
+ Dict[str, ndarray]
+ |
+
+
+
+ dictionary of data to save + |
+ + required + | +
extra_data_keys |
+
+ List[str]
+ |
+
+
+
+ list of keys to load from the extra data file + |
+ + required + | +
extra_data_types |
+
+ Dict[str, type]
+ |
+
+
+
+ dictionary of data types for each key + |
+ + required + | +
openqdc/datasets/structure.py
49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 |
|
unpack(data)
+
+¶Unpack the data from the loaded file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data |
+
+ any
+ |
+
+
+
+ the data to unpack + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ any
+ |
+
+
+
+ the unpacked data + |
+
openqdc/datasets/structure.py
137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 |
|
MemMapDataset
+
+
+¶
+ Bases: GeneralStructure
Dataset structure for memory-mapped numpy arrays and props.pkl files.
+ +openqdc/datasets/structure.py
150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 |
|
ZarrDataset
+
+
+¶
+ Bases: GeneralStructure
Dataset structure for zarr files.
+ +openqdc/datasets/structure.py
205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 |
|
InteractionMethod
+
+
+¶
+ Bases: QmMethod
openqdc/methods/enums.py
546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 |
|
atom_energies_dict
+
+
+ property
+
+
+¶Get an empty atomization energy dictionary because Interaction methods don't require this
+PotentialMethod
+
+
+¶
+ Bases: QmMethod
openqdc/methods/enums.py
248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 |
|
atom_energies_dict
+
+
+ property
+
+
+¶Get the atomization energy dictionary
+QmMethod
+
+
+¶
+ Bases: Enum
openqdc/methods/enums.py
221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 |
|
to_e_matrix(atom_energies)
+
+¶Get the matrix of isolated atom energies for a dict of calculated non-null values
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
atom_energies |
+
+ Dict
+ |
+
+
+
+ Dict of energies computed for a given QM method. +Keys are pairs of (atom, charge) and values are energy values + |
+ + required + | +
np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Matrix containing the isolated atom energies for each atom and charge written in the form: +
|
+
openqdc/methods/atom_energies.py
21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 |
|
DatasetPropertyMixIn
+
+
+¶Mixin class for BaseDataset class to add +properties that are common to all datasets.
+ +openqdc/datasets/properties.py
10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 |
|
average_n_atoms: int
+
+
+ property
+
+
+¶Average number of atoms in a molecule in the dataset.
+ + +Returns:
+Type | +Description | +
---|---|
+ int
+ |
+
+
+
+ Average number of atoms in a molecule in the dataset. + |
+
charges: np.ndarray
+
+
+ property
+
+
+¶Unique charges in the dataset
+ + +Returns:
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Array of the unique charges in the dataset + |
+
chemical_species: np.ndarray
+
+
+ property
+
+
+¶Chemical symbols in the dataset
+ + +Returns:
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Array of the chemical symbols in the dataset + |
+
min_max_charges: Tuple[int, int]
+
+
+ property
+
+
+¶Minimum and maximum charges in the dataset
+ + +Returns:
+Type | +Description | +
---|---|
+ Tuple[int, int]
+ |
+
+
+
+ (min_charge, max_charge) + |
+
numbers: np.ndarray
+
+
+ property
+
+
+¶Unique atomic numbers in the dataset
+ + +Returns:
+Type | +Description | +
---|---|
+ ndarray
+ |
+
+
+
+ Array of the unique atomic numbers in the dataset + |
+
Linear Atom Energies regression utilities.
+ + + +LinearSolver
+
+
+¶
+ Bases: Solver
Linear regression solver.
+ + +No Uncertainty associated as it is quite small.
+openqdc/utils/regressor.py
216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 |
|
Regressor
+
+
+¶Regressor class for preparing and solving regression problem for isolated atom energies. +An isolated atom energy regression problem is defined as:
+X = [n_samples, n_species] (number of atoms of each species per sample)
+Y = [n_samples, ] (energies)
+The regression problem is solved by solving the linear system X E0 = Y.
+ + +For a system of 2 samples (H2O, CH4)
+n_species = 3, n_samples = 2
+
+H2O = 2H, 1O -> X = [2, 1, 0]
+
+CH4 = 1C, 4H -> X = [4, 0, 1]
+
+X = [[2, 1, 0],
+ [ 4, 0, 1]]
+
+Y = [[10, 20]]
+
+X E0 = Y
+
Linear system to solve
+[[2 eH, 1 eO, 0 eC],
+[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]
+
openqdc/utils/regressor.py
49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 |
|
__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)
+
+¶Regressor class for preparing and solving regression problem for isolated atom energies.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
energies |
+
+ ndarray
+ |
+
+
+
+ numpy array of energies in the shape (n_samples, n_energy_methods) + |
+ + required + | +
atomic_numbers |
+
+ ndarray
+ |
+
+
+
+ numpy array of atomic numbers in the shape (n_atoms,) + |
+ + required + | +
position_idx_range |
+
+ ndarray
+ |
+
+
+
+ array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset + |
+ + required + | +
solver_type |
+
+ str
+ |
+
+
+
+ Type of solver to use. ["linear", "ridge"] + |
+
+ 'linear'
+ |
+
stride |
+
+ int
+ |
+
+
+
+ Stride to use for the regression. + |
+
+ 1
+ |
+
subsample |
+
+ Optional[Union[float, int]]
+ |
+
+
+
+ Subsample the dataset. +If a float, it is interpreted as a fraction of the dataset to use. +If >1 it is interpreted as the number of samples to use. + |
+
+ None
+ |
+
remove_nan |
+
+ bool
+ |
+
+
+
+ Sanitize the dataset by removing energies samples with NaN values. + |
+
+ True
+ |
+
*args |
+
+ any
+ |
+
+
+
+ Additional arguments to be passed to the regressor. + |
+
+ ()
+ |
+
**kwargs |
+
+ any
+ |
+
+
+
+ Additional keyword arguments to be passed to the regressor. + |
+
+ {}
+ |
+
openqdc/utils/regressor.py
73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 |
|
from_openqdc_dataset(dataset, *args, **kwargs)
+
+
+ classmethod
+
+
+¶Initialize the regressor object from an openqdc dataset. This is the default method. +*args and **kwargs are passed to the __init__ method and depend on the specific regressor.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dataset |
+
+ any
+ |
+
+
+
+ openqdc dataset object. + |
+ + required + | +
*args |
+
+ any
+ |
+
+
+
+ Additional arguments to be passed to the regressor. + |
+
+ ()
+ |
+
**kwargs |
+
+ any
+ |
+
+
+
+ Additional keyword arguments to be passed to the regressor. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Regressor
+ |
+
+
+
+ Instance of the regressor class. + |
+
openqdc/utils/regressor.py
119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 |
|
solve()
+
+¶Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.
+ +openqdc/utils/regressor.py
180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 |
|
RidgeSolver
+
+
+¶
+ Bases: Solver
Ridge regression solver.
+ +openqdc/utils/regressor.py
233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 |
|
Solver
+
+
+¶
+ Bases: ABC
Abstract class for regression solvers.
+ +openqdc/utils/regressor.py
18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 |
|
solve(X, Y)
+
+
+ abstractmethod
+ staticmethod
+
+
+¶Main method to solve the regression problem. +Must be implemented in all the subclasses.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ ndarray
+ |
+
+
+
+ Input features of shape (n_samples, n_species) + |
+ + required + | +
Y |
+
+ ndarray
+ |
+
+
+
+ Target values of shape (n_samples,) (energy values for the regression) + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Tuple[ndarray, Optional[ndarray]]
+ |
+
+
+
+ Tuple of predicted values and the estimated uncertainty. + |
+
openqdc/utils/regressor.py
23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 |
|
atom_standardization(X, y)
+
+¶Standardize the energies and the atom counts. +This will make the calculated uncertainty more +meaningful.
+ +openqdc/utils/regressor.py
203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 |
|
non_nan_idxs(array)
+
+¶Return non nan indices of an array.
+ +openqdc/utils/regressor.py
11 +12 +13 +14 +15 |
|
AbstractStatsCalculator
+
+
+¶
+ Bases: ABC
Abstract class that defines the interface for all +the calculators object and the methods to +compute the statistics.
+ +openqdc/datasets/statistics.py
159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 |
|
root
+
+
+ property
+
+
+¶Path to the dataset folder
+__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, forces=None)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
name |
+ + | +
+
+
+ Name of the dataset for saving and loading. + |
+ + required + | +
energy_type |
+ + | +
+
+
+ Type of the energy for the computation of the statistics. Used for loading and saving. + |
+
+ None
+ |
+
force_recompute |
+ + | +
+
+
+ Flag to force the recomputation of the statistics + |
+
+ False
+ |
+
energies |
+ + | +
+
+
+ n +Energies of the dataset + |
+
+ None
+ |
+
n_atoms |
+ + | +
+
+
+ Number of atoms in the dataset + |
+
+ None
+ |
+
atom_species |
+ + | +
+
+
+ Atomic species of the dataset + |
+
+ None
+ |
+
position_idx_range |
+ + | +
+
+
+ n +Position index range of the dataset + |
+
+ None
+ |
+
e0_matrix |
+ + | +
+
+
+ Isolated atom energies matrix of the dataset + |
+
+ None
+ |
+
atom_charges |
+ + | +
+
+
+ Atomic charges of the dataset + |
+
+ None
+ |
+
forces |
+ + | +
+
+
+ Forces of the dataset + |
+
+ None
+ |
+
openqdc/datasets/statistics.py
170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 |
|
attempt_load()
+
+¶Load precomputed statistics file and return the success of the operation
+ +openqdc/datasets/statistics.py
271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 |
|
compute()
+
+
+ abstractmethod
+
+
+¶Abstract method to compute the statistics. +Must return a StatisticsResults object and be implemented +in all child classes
+ +openqdc/datasets/statistics.py
256 +257 +258 +259 +260 +261 +262 +263 |
|
from_openqdc_dataset(dataset, recompute=False)
+
+
+ classmethod
+
+
+¶Create a calculator object from a dataset object.
+ +openqdc/datasets/statistics.py
236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 |
|
run(state)
+
+¶Main method to run the calculator. +Setup the dependencies from the state dictionary +Check if the statistics are already computed and load them or +recompute them +Save the statistics in the correct folder
+ + +dictionary containing the state of the calculator
+openqdc/datasets/statistics.py
304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 |
|
save_statistics()
+
+¶Save statistics file to the dataset folder as a pkl file
+ +openqdc/datasets/statistics.py
265 +266 +267 +268 +269 |
|
write_state(update)
+
+¶Write/update the state dictionary with the update dictionary
+ + +dictionary containing the update to the state
+openqdc/datasets/statistics.py
295 +296 +297 +298 +299 +300 +301 +302 |
|
EnergyStatistics
+
+
+
+ dataclass
+
+
+¶
+ Bases: StatisticsResults
Dataclass for energy related statistics
+ +openqdc/datasets/statistics.py
41 +42 +43 +44 +45 +46 +47 +48 |
|
ForceStatistics
+
+
+
+ dataclass
+
+
+¶
+ Bases: StatisticsResults
Dataclass for force statistics
+ +openqdc/datasets/statistics.py
51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 |
|
ForcesCalculatorStats
+
+
+¶
+ Bases: AbstractStatsCalculator
Forces statistics calculator class
+ +openqdc/datasets/statistics.py
324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 |
|
FormationEnergyInterface
+
+
+¶
+ Bases: AbstractStatsCalculator
, ABC
Formation Energy interface calculator class. +Define the use of the dependency formation_energy in the +compute method
+ +openqdc/datasets/statistics.py
360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 |
|
FormationEnergyStats
+
+
+¶
+ Bases: FormationEnergyInterface
Formation Energy calculator class.
+ +openqdc/datasets/statistics.py
402 +403 +404 +405 +406 +407 +408 +409 +410 |
|
PerAtomFormationEnergyStats
+
+
+¶
+ Bases: FormationEnergyInterface
Per atom Formation Energy calculator class.
+ +openqdc/datasets/statistics.py
413 +414 +415 +416 +417 +418 +419 +420 +421 |
|
StatisticManager
+
+
+¶Manager class that automatically handles the shared state between +the statistic calculators
+ +openqdc/datasets/statistics.py
64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 |
|
state: Dict
+
+
+ property
+
+
+¶Return the dictionary state of the manager
+ + +Returns:
+Type | +Description | +
---|---|
+ Dict
+ |
+
+
+
+ State of the StatisticManager + |
+
__init__(dataset, recompute=False, *statistic_calculators)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dataset |
+ + | +
+
+
+ openqdc.datasets.base.BaseDataset +The dataset object to compute the statistics + |
+ + required + | +
recompute |
+
+ bool
+ |
+
+
+
+ Flag to recompute the statistics + |
+
+ False
+ |
+
*statistic_calculators |
+
+ AbstractStatsCalculator
+ |
+
+
+
+ List of statistic calculators to run + |
+
+ ()
+ |
+
openqdc/datasets/statistics.py
70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 |
|
get_results(as_dict=False)
+
+¶Aggregate results from all the calculators
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
as_dict |
+
+ bool
+ |
+
+
+
+ Flag to return the results as a dictionary + |
+
+ False
+ |
+
openqdc/datasets/statistics.py
136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 |
|
get_state(key=None)
+
+¶Return the value of the key in the state dictionary
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
key |
+
+ Optional[str]
+ |
+
+
+
+ str, default = None + |
+
+ None
+ |
+
Returns: + the value of the key in the state dictionary + or the whole state dictionary if key is None
+ +openqdc/datasets/statistics.py
109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 |
|
has_state(key)
+
+¶Check if the state has the key
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
key |
+
+ str
+ |
+
+
+
+ Key to check in the state dictionary + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the key is in the state dictionary + |
+
openqdc/datasets/statistics.py
123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 |
|
reset_results()
+
+¶Reset the results dictionary
+ +openqdc/datasets/statistics.py
103 +104 +105 +106 +107 |
|
reset_state()
+
+¶Reset the state dictionary
+ +openqdc/datasets/statistics.py
97 + 98 + 99 +100 +101 |
|
run_calculators()
+
+¶Run the saved calculators and save the results in the manager
+ +openqdc/datasets/statistics.py
149 +150 +151 +152 +153 +154 +155 +156 |
|
StatisticsResults
+
+
+¶Parent class to statistics results +to provide general methods.
+ +openqdc/datasets/statistics.py
13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 |
|
to_dict()
+
+¶Convert the class to a dictionary
+ + +Returns:
+Type | +Description | +
---|---|
+ Dict
+ |
+
+
+
+ Dictionary representation of the class + |
+
openqdc/datasets/statistics.py
19 +20 +21 +22 +23 +24 +25 +26 |
|
transform(func)
+
+¶Apply a function to all the attributes of the class
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
func |
+
+ Callable
+ |
+
+
+
+ Function to apply to the attributes + |
+ + required + | +
openqdc/datasets/statistics.py
28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 |
|
TotalEnergyStats
+
+
+¶
+ Bases: AbstractStatsCalculator
Total Energy statistics calculator class
+ +openqdc/datasets/statistics.py
348 +349 +350 +351 +352 +353 +354 +355 +356 +357 |
|
Units conversion utilities module.
+ + +["kcal/mol", "kj/mol", "hartree", "ev", "mev", "ryd"]
+["ang", "nm", "bohr"]
+Combinations between Energy and Distance units
+Conversion
+
+
+¶Conversion from one unit system to another defined by a name and a callable
+ +openqdc/utils/units.py
130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 |
|
__init__(in_unit, out_unit, func)
+
+¶Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
in_unit |
+
+ str
+ |
+
+
+
+ String defining the units of the current values + |
+ + required + | +
out_unit |
+
+ str
+ |
+
+
+
+ String defining the target units + |
+ + required + | +
func |
+
+ Callable[[float], float]
+ |
+
+
+
+ The callable to compute the conversion + |
+ + required + | +
openqdc/utils/units.py
135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 |
|
DistanceTypeConversion
+
+
+¶
+ Bases: ConversionEnum
, StrEnum
Define the possible distance units for conversion
+ +openqdc/utils/units.py
59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 |
|
to(distance, fraction=False)
+
+¶Get the conversion function to convert the distance to the desired units.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
distance |
+
+ DistanceTypeConversion
+ |
+
+
+
+ distance unit to convert to + |
+ + required + | +
fraction |
+
+ bool
+ |
+
+
+
+ whether it is distance^1 or distance^-1 + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Callable[[float], float]
+ |
+
+
+
+ callable to convert the distance to the desired units + |
+
openqdc/utils/units.py
69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 |
|
EnergyTypeConversion
+
+
+¶
+ Bases: ConversionEnum
, StrEnum
Define the possible energy units for conversion
+ +openqdc/utils/units.py
33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 |
|
to(energy)
+
+¶Get the conversion function to convert the energy to the desired units.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
energy |
+
+ EnergyTypeConversion
+ |
+
+
+
+ energy unit to convert to + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Callable[[float], float]
+ |
+
+
+
+ Callable to convert the energy to the desired units + |
+
openqdc/utils/units.py
46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 |
|
ForceTypeConversion
+
+
+¶
+ Bases: ConversionEnum
Define the possible force units for conversion
+ +openqdc/utils/units.py
83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 |
|
to(energy, distance)
+
+¶Get the conversion function to convert the force to the desired units.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
energy |
+
+ EnergyTypeConversion
+ |
+
+
+
+ energy unit to convert to + |
+ + required + | +
distance |
+
+ DistanceTypeConversion
+ |
+
+
+
+ distance unit to convert to + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Callable[[float], float]
+ |
+
+
+
+ callable to convert the force to the desired units + |
+
openqdc/utils/units.py
116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 |
|
get_conversion(in_unit, out_unit)
+
+¶Utility function to get the conversion function between two units.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
in_unit |
+ + | +
+
+
+ The input unit + |
+ + required + | +
out_unit |
+ + | +
+
+
+ The output unit + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Callable[[float], float]
+ |
+
+
+
+ The conversion function + |
+
openqdc/utils/units.py
156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 |
|
check_file(path)
+
+¶Checks whether the file is present locally
+ +openqdc/utils/io.py
123 +124 +125 |
|
create_hdf5_file(hdf5_file_path)
+
+¶Creates hdf5 file with fsspec
+ +openqdc/utils/io.py
179 +180 +181 +182 +183 +184 |
|
get_conversion(in_unit, out_unit)
+
+¶Utility function to get the conversion function between two units.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
in_unit |
+ + | +
+
+
+ The input unit + |
+ + required + | +
out_unit |
+ + | +
+
+
+ The output unit + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Callable[[float], float]
+ |
+
+
+
+ The conversion function + |
+
openqdc/utils/units.py
156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 |
|
get_local_cache()
+
+¶Returns the local cache directory. It creates it if it does not exist.
+ + +Returns:
+Name | Type | +Description | +
---|---|---|
str |
+ str
+ |
+
+
+
+ path to the local cache directory + |
+
openqdc/utils/io.py
48 +49 +50 +51 +52 +53 +54 +55 +56 +57 |
|
get_remote_cache(write_access=False)
+
+¶Returns the entry point based on the write access.
+ +openqdc/utils/io.py
60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 |
|
load_hdf5_file(hdf5_file_path)
+
+¶Loads hdf5 file with fsspec
+ +openqdc/utils/io.py
161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 |
|
load_json(path)
+
+¶Loads json file
+ +openqdc/utils/io.py
187 +188 +189 +190 |
|
load_pkl(path, check=True)
+
+¶Load pkl file
+ +openqdc/utils/io.py
151 +152 +153 +154 +155 +156 +157 +158 |
|
makedirs(path, exist_ok=True)
+
+¶Creates directory
+ +openqdc/utils/io.py
118 +119 +120 |
|
read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)
+
+¶Extracts data from the HDF5 archive file.
+ +openqdc/utils/io.py
288 +289 +290 +291 +292 +293 +294 +295 +296 +297 |
|
save_pkl(file, path)
+
+¶Saves pkl file
+ +openqdc/utils/io.py
134 +135 +136 +137 +138 |
|
set_cache_dir(d)
+
+¶Optionally set the _OPENQDC_CACHE_DIR directory.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
d |
+
+ str
+ |
+
+
+
+ path to a local folder. + |
+ + required + | +
openqdc/utils/io.py
35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 |
|