Skip to content

Commit

Permalink
Refactor test_random to minimize collective calls (#1677)
Browse files Browse the repository at this point in the history
* debugging

* fix misinterpretation of dtype

* debugging

* debugging

* debugging

* debugging

* debugging

* debugging

* debugging

* debugging

* replace numpy() calls with alternative checks

* debugging

* debugging

* debugging randint

* debugging

* cast ints to float in statistical ops

* bypass numpy call l. 197

* bypass more numpy calls, skip median checks

* bypass more numpy calls, skip median checks

* bypass numpy calls wherever possible

* reinstate median checks

* skip ht.median if split>0

* skip all ht.median

* Revert "skip all ht.median"

This reverts commit 1241454.

* Revert "skip ht.median if split>0"

This reverts commit 4da8c93.

* Revert "reinstate median checks"

This reverts commit bf50914.

(cherry picked from commit 4b3e570)
  • Loading branch information
ClaudiaComito authored and github-actions[bot] committed Oct 17, 2024
1 parent 863cb99 commit 20e7f11
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 104 deletions.
16 changes: 15 additions & 1 deletion heat/core/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,11 +982,18 @@ def reduce_means_elementwise(output_shape_i: torch.Tensor) -> DNDarray:
return mu_tot[0][0] if mu_tot[0].size == 1 else mu_tot[0]

# ----------------------------------------------------------------------------------------------
# sanitize dtype
if types.heat_type_is_exact(x.dtype):
if x.dtype is types.int64:
x = x.astype(types.float64)
else:
x = x.astype(types.float32)

if axis is None:
# full matrix calculation
if not x.is_distributed():
# if x is not distributed do a torch.mean on x
ret = torch.mean(x.larray.float())
ret = torch.mean(x.larray)
return DNDarray(
ret,
gshape=tuple(ret.shape),
Expand Down Expand Up @@ -1791,6 +1798,13 @@ def std(
>>> ht.std(a, 1)
DNDarray([1.2961, 0.3362, 1.0739, 0.9820], dtype=ht.float32, device=cpu:0, split=None)
"""
# sanitize dtype
if types.heat_type_is_exact(x.dtype):
if x.dtype is types.int64:
x = x.astype(types.float64)
else:
x = x.astype(types.float32)

if not isinstance(ddof, int):
raise TypeError(f"ddof must be integer, is {type(ddof)}")
# elif ddof > 1:
Expand Down
179 changes: 77 additions & 102 deletions heat/core/tests/test_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,28 +153,26 @@ def test_rand(self):

a = ht.random.rand(21, 16, 17, 21, dtype=ht.float32, split=2)
b = ht.random.rand(15, 11, 19, 31, dtype=ht.float32, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
c = np.concatenate((a, b))
a = a.flatten()
b = b.flatten()
c = ht.concatenate((a, b))

# Values should be spread evenly across the range [0, 1)
mean = np.mean(c)
median = np.median(c)
std = np.std(c)
mean = ht.mean(c)
# median = np.median(c)
std = ht.std(c)
self.assertTrue(0.49 < mean < 0.51)
self.assertTrue(0.49 < median < 0.51)
# self.assertTrue(0.49 < median < 0.51)
self.assertTrue(std < 0.3)
self.assertTrue(((0 <= c) & (c < 1)).all())

def test_randint(self):
# Checked that the random values are in the correct range
a = ht.random.randint(low=0, high=10, size=(10, 10), dtype=ht.int64)
self.assertEqual(a.dtype, ht.int64)
a = a.numpy()
self.assertTrue(((0 <= a) & (a < 10)).all())

a = ht.random.randint(low=100000, high=150000, size=(31, 25, 11), dtype=ht.int64, split=2)
a = a.numpy()
self.assertTrue(((100000 <= a) & (a < 150000)).all())

# For the range [0, 1) only the value 0 is allowed
Expand All @@ -194,20 +192,18 @@ def test_randint(self):
shape = (15, 13, 9, 21, 65)
ht.random.seed(13579)
a = ht.random.randint(10000, size=shape, split=2, dtype=ht.int64)
a = a.numpy()

ht.random.seed(13579)
b = ht.random.randint(low=0, high=10000, size=shape, split=2, dtype=ht.int64)
b = b.numpy()

self.assertTrue(np.array_equal(a, b))
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
self.assertTrue(ht.equal(a, b))
mean = ht.mean(a)
# median = ht.median(a)
std = ht.std(a)

# Mean and median should be in the center while the std is very high due to an even distribution
self.assertTrue(4900 < mean < 5100)
self.assertTrue(4900 < median < 5100)
# self.assertTrue(4900 < median < 5100)
self.assertTrue(std < 2900)

with self.assertRaises(ValueError):
Expand All @@ -226,31 +222,26 @@ def test_randint(self):
self.assertEqual(a.dtype, ht.int32)
self.assertEqual(a.larray.dtype, torch.int32)
self.assertEqual(b.dtype, ht.int32)
a = a.numpy()
b = b.numpy()
self.assertEqual(a.dtype, np.int32)
self.assertTrue(np.array_equal(a, b))
self.assertTrue(ht.equal(a, b))
self.assertTrue(((50 <= a) & (a < 1000)).all())
self.assertTrue(((50 <= b) & (b < 1000)).all())

c = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0)
c = c.numpy()
self.assertFalse(np.array_equal(a, c))
self.assertFalse(np.array_equal(b, c))
self.assertFalse(ht.equal(a, c))
self.assertFalse(ht.equal(b, c))
self.assertTrue(((50 <= c) & (c < 1000)).all())

ht.random.seed(0xFFFFFFF)
a = ht.random.randint(
10000, size=(123, 42, 13, 21), split=3, dtype=ht.int32, comm=ht.MPI_WORLD
)
a = a.numpy()
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
mean = ht.mean(a)
# median = np.median(a)
std = ht.std(a)

# Mean and median should be in the center while the std is very high due to an even distribution
self.assertTrue(4900 < mean < 5100)
self.assertTrue(4900 < median < 5100)
# self.assertTrue(4900 < median < 5100)
self.assertTrue(std < 2900)

# test aliases
Expand Down Expand Up @@ -297,23 +288,21 @@ def test_randn(self):
a = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertEqual(a.dtype, ht.float32)
self.assertEqual(a.larray[0, 0, 0].dtype, torch.float32)
a = a.numpy()
self.assertEqual(a.dtype, np.float32)
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
mean = ht.mean(a)
# median = np.median(a)
std = ht.std(a)
self.assertTrue(-0.02 < mean < 0.02)
self.assertTrue(-0.02 < median < 0.02)
# self.assertTrue(-0.02 < median < 0.02)
self.assertTrue(0.99 < std < 1.01)

ls = 272 + ht.MPI_WORLD.rank
ht.random.set_state(("Batchparallel", None, ls))
b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy()
self.assertTrue(np.allclose(a, b))
b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertTrue(ht.allclose(a, b))

c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy()
self.assertFalse(np.allclose(a, c))
self.assertFalse(np.allclose(b, c))
c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertFalse(ht.allclose(a, c))
self.assertFalse(ht.allclose(b, c))

# check wrong shapes
with self.assertRaises(ValueError):
Expand Down Expand Up @@ -539,10 +528,9 @@ def test_rand(self):
a = ht.random.rand(2, 3, 4, 5, split=0)
ht.random.set_state(("Threefry", seed, 0x10000000000000000))
b = ht.random.rand(2, 44, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
self.assertEqual(a.dtype, np.float32)
self.assertTrue(np.array_equal(a[32:], b))
a = a.flatten()
b = b.flatten()
self.assertTrue(ht.equal(a[32:], b))

# Check that random numbers don't repeat after first overflow
seed = 12345
Expand All @@ -557,9 +545,9 @@ def test_rand(self):
a = ht.random.rand(2, 34, split=0)
ht.random.set_state(("Threefry", seed, 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0))
b = ht.random.rand(2, 50, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
self.assertTrue(np.array_equal(a, b[32:]))
a = a.flatten()
b = b.flatten()
self.assertTrue(ht.equal(a, b[32:]))

# different split axis with resetting seed
ht.random.seed(seed)
Expand All @@ -573,9 +561,9 @@ def test_rand(self):
a = ht.random.rand(2, 50, split=0)
ht.random.seed(seed)
b = ht.random.rand(100, split=None)
a = a.numpy().flatten()
b = b.larray.cpu().numpy()
self.assertTrue(np.array_equal(a, b))
a = a.flatten()
b = ht.resplit(b, 0)
self.assertTrue(ht.equal(a, b))

# On different shape and split the same random values are used
ht.random.seed(seed)
Expand Down Expand Up @@ -632,37 +620,36 @@ def test_rand(self):

ht.random.seed(9876)
b = ht.random.rand(np.prod(shape), dtype=ht.float32)
a = a.numpy().flatten()
b = b.larray.cpu().numpy()
self.assertTrue(np.array_equal(a, b))
self.assertEqual(a.dtype, np.float32)
a = a.flatten()
b = ht.resplit(b, 0)
self.assertTrue(ht.equal(a, b))

a = ht.random.rand(21, 16, 17, 21, dtype=ht.float32, split=2)
b = ht.random.rand(15, 11, 19, 31, dtype=ht.float32, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
c = np.concatenate((a, b))
a = a.flatten()
b = b.flatten()
c = ht.concatenate((a, b))

# Values should be spread evenly across the range [0, 1)
mean = np.mean(c)
median = np.median(c)
std = np.std(c)
mean = ht.mean(c)
# median = np.median(c)
std = ht.std(c)
self.assertTrue(0.49 < mean < 0.51)
self.assertTrue(0.49 < median < 0.51)
# self.assertTrue(0.49 < median < 0.51)
self.assertTrue(std < 0.3)
self.assertTrue(((0 <= c) & (c < 1)).all())

ht.random.seed(11111)
a = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy()
a = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32)
# Overflow reached
ht.random.set_state(("Threefry", 11111, 0x10000000000000000))
b = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy()
self.assertTrue(np.array_equal(a, b))
b = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32)
self.assertTrue(ht.equal(a, b))

ht.random.set_state(("Threefry", 11111, 0x100000000))
c = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32).numpy()
self.assertFalse(np.array_equal(a, c))
self.assertFalse(np.array_equal(b, c))
c = ht.random.rand(12, 32, 44, split=1, dtype=ht.float32)
self.assertFalse(ht.equal(a, c))
self.assertFalse(ht.equal(b, c))

# To check working with large number of elements
ht.random.randn(6667, 3523, dtype=ht.float64, split=None)
Expand All @@ -675,11 +662,9 @@ def test_randint(self):
# Checked that the random values are in the correct range
a = ht.random.randint(low=0, high=10, size=(10, 10), dtype=ht.int64)
self.assertEqual(a.dtype, ht.int64)
a = a.numpy()
self.assertTrue(((0 <= a) & (a < 10)).all())

a = ht.random.randint(low=100000, high=150000, size=(31, 25, 11), dtype=ht.int64, split=2)
a = a.numpy()
self.assertTrue(((100000 <= a) & (a < 150000)).all())

# For the range [0, 1) only the value 0 is allowed
Expand All @@ -699,35 +684,32 @@ def test_randint(self):
ht.random.seed(13579)
shape = (15, 13, 9, 21, 65)
a = ht.random.randint(15, 100, size=shape, split=0, dtype=ht.int64)
a = a.numpy().flatten()
a = a.flatten()

ht.random.seed(13579)
elements = np.prod(shape)
b = ht.random.randint(low=15, high=100, size=(elements,), dtype=ht.int64)
b = b.numpy()
self.assertTrue(np.array_equal(a, b))
self.assertTrue(ht.equal(a, b))

# Two arrays with the same seed and shape have identical values
ht.random.seed(13579)
a = ht.random.randint(10000, size=shape, split=2, dtype=ht.int64)
a = a.numpy()

ht.random.seed(13579)
b = ht.random.randint(low=0, high=10000, size=shape, split=2, dtype=ht.int64)
b = b.numpy()

ht.random.seed(13579)
c = ht.random.randint(low=0, high=10000, dtype=ht.int64)
self.assertTrue(np.equal(b[0, 0, 0, 0, 0], c))
self.assertTrue(ht.equal(b[0, 0, 0, 0, 0], c))

self.assertTrue(np.array_equal(a, b))
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
self.assertTrue(ht.equal(a, b))
mean = ht.mean(a)
# median = np.median(a)
std = ht.std(a)

# Mean and median should be in the center while the std is very high due to an even distribution
self.assertTrue(4900 < mean < 5100)
self.assertTrue(4900 < median < 5100)
# self.assertTrue(4900 < median < 5100)
self.assertTrue(std < 2900)

with self.assertRaises(ValueError):
Expand All @@ -746,31 +728,26 @@ def test_randint(self):
self.assertEqual(a.dtype, ht.int32)
self.assertEqual(a.larray.dtype, torch.int32)
self.assertEqual(b.dtype, ht.int32)
a = a.numpy()
b = b.numpy()
self.assertEqual(a.dtype, np.int32)
self.assertTrue(np.array_equal(a, b))
self.assertTrue(ht.equal(a, b))
self.assertTrue(((50 <= a) & (a < 1000)).all())
self.assertTrue(((50 <= b) & (b < 1000)).all())

c = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0)
c = c.numpy()
self.assertFalse(np.array_equal(a, c))
self.assertFalse(np.array_equal(b, c))
self.assertFalse(ht.equal(a, c))
self.assertFalse(ht.equal(b, c))
self.assertTrue(((50 <= c) & (c < 1000)).all())

ht.random.seed(0xFFFFFFF)
a = ht.random.randint(
10000, size=(123, 42, 13, 21), split=3, dtype=ht.int32, comm=ht.MPI_WORLD
)
a = a.numpy()
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
mean = ht.mean(a)
# median = np.median(a)
std = ht.std(a)

# Mean and median should be in the center while the std is very high due to an even distribution
self.assertTrue(4900 < mean < 5100)
self.assertTrue(4900 < median < 5100)
# self.assertTrue(4900 < median < 5100)
self.assertTrue(std < 2900)

# test aliases
Expand Down Expand Up @@ -826,22 +803,20 @@ def test_randn(self):
a = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertEqual(a.dtype, ht.float32)
self.assertEqual(a.larray[0, 0, 0].dtype, torch.float32)
a = a.numpy()
self.assertEqual(a.dtype, np.float32)
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
mean = ht.mean(a)
# median = np.median(a)
std = ht.std(a)
self.assertTrue(-0.01 < mean < 0.01)
self.assertTrue(-0.01 < median < 0.01)
# self.assertTrue(-0.01 < median < 0.01)
self.assertTrue(0.99 < std < 1.01)

ht.random.set_state(("Threefry", 54321, 0x10000000000000000))
b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy()
self.assertTrue(np.allclose(a, b))
b = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertTrue(ht.allclose(a, b))

c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2).numpy()
self.assertFalse(np.allclose(a, c))
self.assertFalse(np.allclose(b, c))
c = ht.random.randn(30, 30, 30, dtype=ht.float32, split=2)
self.assertFalse(ht.allclose(a, c))
self.assertFalse(ht.allclose(b, c))

def test_randperm(self):
ht.random.set_state(("Threefry", 0, 0))
Expand Down
2 changes: 1 addition & 1 deletion heat/utils/data/matrixgallery.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def hermitian(
matrix = randn(n, n, dtype=real_dtype, split=split, device=device, comm=comm) + 1j * randn(
n, n, dtype=real_dtype, split=split, device=device, comm=comm
)
elif not heat_type_is_exact(dtype):
elif dtype in [core.float32, core.float64]:
matrix = randn(n, n, dtype=dtype, split=split, device=device, comm=comm)
else:
raise ValueError("dtype must be floating-point data-type but is ", dtype, ".")
Expand Down

0 comments on commit 20e7f11

Please sign in to comment.