Skip to content

Commit

Permalink
Optimize hessian (pyscf#265)
Browse files Browse the repository at this point in the history
* New cuda kernel

* general case correct

* fixes

* Unroll ejk_ip2

* Bugfixes

* Optimize SR integrals for gradients and hessian

* Optimize CPHF solver in Hessian

* Control binary size

* Optimize CPHF solver

* lint

* Improve Hessian CPHF accuracy

* Assert cupy array

* Adjust dtypes in Hessian functions

---------

Co-authored-by: Qiming Sun <[email protected]>
  • Loading branch information
sunqm and Qiming Sun authored Dec 11, 2024
1 parent 65e3365 commit 010ca2d
Show file tree
Hide file tree
Showing 33 changed files with 46,226 additions and 53,540 deletions.
6 changes: 5 additions & 1 deletion gpu4pyscf/df/hessian/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
from gpu4pyscf.df.grad.rhf import _gen_metric_solver
from gpu4pyscf.gto.mole import sort_atoms

LINEAR_DEP_THR = df.LINEAR_DEP_THR
BLKSIZE = 128
Expand Down Expand Up @@ -430,7 +431,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,

def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mol = hessobj.mol
h1ao = [None] * mol.natm
natm = mol.natm
nocc = int(cupy.count_nonzero(mo_occ > 0))
nmo = len(mo_occ)
h1ao = cupy.empty((natm, 3, nmo, nocc))
for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
atmlst, verbose, True):
h1 += vj1 - vk1 * .5
Expand Down
10 changes: 7 additions & 3 deletions gpu4pyscf/df/hessian/uhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
from gpu4pyscf.df.grad.rhf import _gen_metric_solver
from gpu4pyscf.gto.mole import sort_atoms

LINEAR_DEP_THR = df.LINEAR_DEP_THR
BLKSIZE = 256
Expand Down Expand Up @@ -452,11 +453,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,

def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mol = hessobj.mol
natm = mol.natm
if atmlst is None:
atmlst = range(mol.natm)
atmlst = range(natm)

h1aoa = [None] * mol.natm
h1aob = [None] * mol.natm
nocca, noccb = hessobj.base.nelec
nmo = len(mo_occ[0])
h1aoa = cupy.empty((natm, 3, nmo, nocca))
h1aob = cupy.empty((natm, 3, nmo, noccb))
for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
atmlst, verbose, True):
h1a, h1b = h1
Expand Down
5 changes: 0 additions & 5 deletions gpu4pyscf/df/hessian/uks.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,3 @@ class Hessian(uks_hess.Hessian):
hess_elec = uhf_hess.hess_elec
kernel = rhf_hess.kernel
hess = kernel

def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
fx=None, atmlst=None, max_memory=4000, verbose=None):
return uhf_hess.solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
fx, atmlst, max_memory, verbose)
1 change: 0 additions & 1 deletion gpu4pyscf/dft/numint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1528,7 +1528,6 @@ def _sparse_index(mol, coords, l_ctr_offsets):
ctr_offsets_slice = cumsum[glob_ctr_offsets-1]
ctr_offsets_slice[0] = 0

from pyscf import gto
gto_type = 'cart' if mol.cart else 'sph'
non0shl_idx = non0shl_idx == 1
ao_loc_slice = gto.moleintor.make_loc(mol._bas[non0shl_idx,:], gto_type)
Expand Down
34 changes: 12 additions & 22 deletions gpu4pyscf/grad/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@
'Grad'
]

def _jk_energy_per_atom(mol, dm, vhfopt=None, with_j=True, with_k=True, verbose=None):
''' Computes the first-order derivatives of the energy contributions from J and K per atom.
def _jk_energy_per_atom(mol, dm, vhfopt=None,
j_factor=1., k_factor=1., verbose=None):
''' Computes the first-order derivatives of the energy per atom for
j_factor * J_derivatives - k_factor * K_derivatives
'''
assert mol.omega >= 0
log = logger.new_logger(mol, verbose)
cput0 = t1 = log.init_timer()
if vhfopt is None:
Expand All @@ -60,17 +61,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, with_j=True, with_k=True, verbose=
dms = cp.asarray(dms, order='C')
assert n_dm <= 2

vj = vk = None
vj_ptr = vk_ptr = lib.c_null_ptr()

assert with_j or with_k
if with_k:
vk = cp.zeros((mol.natm, 3))
vk_ptr = ctypes.cast(vk.data.ptr, ctypes.c_void_p)
if with_j:
vj = cp.zeros((mol.natm, 3))
vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p)

ejk = cp.zeros((mol.natm, 3))
init_constant(mol)
ao_loc = mol.ao_loc
dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
Expand Down Expand Up @@ -107,7 +98,9 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, with_j=True, with_k=True, verbose=
tile_kl_mapping = tile_mappings[k,l]
scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
err = kern(
vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
ctypes.c_double(j_factor), ctypes.c_double(k_factor),
ctypes.cast(dms.data.ptr, ctypes.c_void_p),
ctypes.c_int(n_dm), ctypes.c_int(nao),
vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
(ctypes.c_int*8)(*ij_shls, *kl_shls),
Expand Down Expand Up @@ -137,11 +130,8 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, with_j=True, with_k=True, verbose=
log.debug1('kernel launches %d', kern_counts)
for llll, t in timing_collection.items():
log.debug1('%s wall time %.2f', llll, t)

if with_j:
vj *= 2.
log.timer_debug1('grad jk energy', *cput0)
return vj, vk
return ejk

def _ejk_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
ls = l_ctr_pattern[:,0]
Expand All @@ -151,8 +141,9 @@ def _ejk_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
nps = l_ctr_pattern[:,1]
ij_prims = nps[0] * nps[1]
nroots = (order + 1) // 2 + 1

unit = nroots*2 + g_size*3 + ij_prims*4
if mol.omega < 0: # SR
unit += nroots * 2
counts = shm_size // (unit*8)
n = min(THREADS, _nearest_power2(counts))
gout_stride = THREADS // n
Expand Down Expand Up @@ -367,8 +358,7 @@ def get_veff(self, mol=None, dm=None, verbose=None):
if mol is None: mol = self.mol
if dm is None: dm = self.base.make_rdm1()
vhfopt = self.base._opt_gpu.get(None, None)
ej, ek = _jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)
return ej - ek * .5
return _jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)

Grad = Gradients

Expand Down
45 changes: 30 additions & 15 deletions gpu4pyscf/grad/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,22 +103,37 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None):
exc1_per_atom = [exc1_per_atom[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
exc1_per_atom = cupy.asarray(exc1_per_atom)

omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = ni.libxc.is_hybrid_xc(mf.xc)
vhfopt = mf._opt_gpu.get(None, None)
if not ni.libxc.is_hybrid_xc(mf.xc):
ej = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, with_k=False,
verbose=verbose)[0]
exc1_per_atom += ej
else:
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
ej, ek = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)
ek *= hyb
if abs(omega) > 1e-10: # For range separated Coulomb operator
vhfopt = mf._opt_gpu.get(omega, None)
with mol.with_range_coulomb(omega):
ek_lr = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, with_j=False,
verbose=verbose)[1]
ek += ek_lr * (alpha - hyb)
exc1_per_atom += ej - ek * .5
j_factor = 1.
k_factor = 0.
if with_k:
if omega == 0:
k_factor = hyb
elif alpha == 0: # LR=0, only SR exchange
pass
elif hyb == 0: # SR=0, only LR exchange
k_factor = alpha
else: # SR and LR exchange with different ratios
k_factor = alpha
ejk = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, j_factor, k_factor,
verbose=verbose)
exc1_per_atom += ejk
if with_k and omega != 0:
j_factor = 0.
omega = -omega # Prefer computing the SR part
if alpha == 0: # LR=0, only SR exchange
k_factor = hyb
elif hyb == 0: # SR=0, only LR exchange
# full range exchange was computed in the previous step
k_factor = -alpha
else: # SR and LR exchange with different ratios
k_factor = hyb - alpha # =beta
vhfopt = mf._opt_gpu.get(omega, None)
with mol.with_range_coulomb(omega):
exc1_per_atom += rhf_grad._jk_energy_per_atom(
mol, dm, vhfopt, j_factor, k_factor, verbose=verbose)
return tag_array(exc1_per_atom, exc1_grid=exc)

def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
Expand Down
6 changes: 3 additions & 3 deletions gpu4pyscf/grad/uhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):


class Gradients(rhf_grad.GradientsBase):

to_cpu = utils.to_cpu
to_gpu = utils.to_gpu
device = utils.device
Expand All @@ -120,8 +120,8 @@ def get_veff(self, mol, dm, verbose=None):
In the CPU version, get_veff returns the first order derivatives of Veff matrix.
'''
vhfopt = self.base._opt_gpu.get(None, None)
ej, ek = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)
return ej - ek
ejk = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)
return ejk

def make_rdm1e(self, mo_energy=None, mo_coeff=None, mo_occ=None):
if mo_energy is None: mo_energy = self.base.mo_energy
Expand Down
46 changes: 30 additions & 16 deletions gpu4pyscf/grad/uks.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,37 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None):
exc1_per_atom = [exc1_per_atom[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
exc1_per_atom = cupy.asarray(exc1_per_atom)

omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = ni.libxc.is_hybrid_xc(mf.xc)
vhfopt = mf._opt_gpu.get(None, None)
if not ni.libxc.is_hybrid_xc(mf.xc):
ej = rhf_grad._jk_energy_per_atom(
mol, dm[0]+dm[1], vhfopt, with_k=False, verbose=verbose)[0]
exc1_per_atom += ej
else:
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
ej, ek = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, verbose=verbose)
ek *= hyb
if omega != 0:
vhfopt = mf._opt_gpu.get(omega, None)
with mol.with_range_coulomb(omega):
ek_lr = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, with_j=False,
verbose=verbose)[1]
ek += ek_lr * (alpha - hyb)

exc1_per_atom += ej - ek
j_factor = 1.
k_factor = 0.
if with_k:
if omega == 0:
k_factor = hyb
elif alpha == 0: # LR=0, only SR exchange
pass
elif hyb == 0: # SR=0, only LR exchange
k_factor = alpha
else: # SR and LR exchange with different ratios
k_factor = alpha
ejk = rhf_grad._jk_energy_per_atom(mol, dm, vhfopt, j_factor, k_factor,
verbose=verbose)
exc1_per_atom += ejk
if with_k and omega != 0:
j_factor = 0.
omega = -omega # Prefer computing the SR part
if alpha == 0: # LR=0, only SR exchange
k_factor = hyb
elif hyb == 0: # SR=0, only LR exchange
# full range exchange was computed in the previous step
k_factor = -alpha
else: # SR and LR exchange with different ratios
k_factor = hyb - alpha # =beta
vhfopt = mf._opt_gpu.get(omega, None)
with mol.with_range_coulomb(omega):
exc1_per_atom += rhf_grad._jk_energy_per_atom(
mol, dm, vhfopt, j_factor, k_factor, verbose=verbose)

return tag_array(exc1_per_atom, exc1_grid=exc)

Expand Down
Loading

0 comments on commit 010ca2d

Please sign in to comment.