-
Notifications
You must be signed in to change notification settings - Fork 1
/
plot_entropy.py
107 lines (94 loc) · 3.68 KB
/
plot_entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# ---------------------------------------------------------------------------
# Parse "entropy_last.log" into three families of series, one list per
# encoding/compression configuration (slots 0-5, see _series_index):
#   results  -> (entropy, compression ratio)
#   results2 -> (entropy, write throughput in MiB/s, truncated to int)
#   results3 -> (entropy, read throughput in MiB/s, truncated to int)
#
# Each line is split on single spaces; field 0 is a path whose base name
# carries the entropy as its second "_"-separated piece, field 1 and field 2
# are float timings and field 3 is an integer size.
# NOTE(review): fields 1/2 are presumably write/read times in seconds and
# field 3 the compressed size in bytes -- confirm against the benchmark that
# produces the log.
# ---------------------------------------------------------------------------
results = [[] for _ in range(6)]
results2 = [[] for _ in range(6)]
results3 = [[] for _ in range(6)]

# 64 MiB; used as the uncompressed size for both the ratio and the throughput.
sz = 67108864


def _series_index(line):
    """Return the result slot (0-5) a log line belongs to, or None if unknown.

    The checks are substring matches on the whole line and their order
    matters: the first matching rule wins.
    """
    is_zstd = "ZSTD" in line
    if "fp." in line and is_zstd:
        return 0
    if "dict" in line and is_zstd:
        return 1
    if "no_enc" in line and is_zstd:
        return 2
    if "dict" in line and "UNCOMPRESSED" in line:
        return 3
    if "fp_xor" in line:
        return 5
    if "fp_simd" in line:
        return 4
    return None


# The context manager guarantees the log file is closed once parsing is done.
with open("entropy_last.log", "r") as f:
    for line in f:
        # Blank lines carry no measurement; skip them instead of crashing
        # while trying to extract the entropy from an empty name.
        if not line.strip():
            continue

        # Classify before converting any field so that unrecognised lines
        # (e.g. headers) are echoed rather than raising a parse error.
        slot = _series_index(line)
        if slot is None:
            print(line)
            continue

        tokens = line.split(" ")
        name = tokens[0].split("/")[-1].split(".")[0]
        entropy = int(name.split("_")[1])
        write_time = float(tokens[1])
        read_time = float(tokens[2])
        compressed_size = int(tokens[3])

        results[slot].append((entropy, sz / compressed_size))
        results2[slot].append((entropy, int(sz / (1024 * 1024) / write_time)))
        results3[slot].append((entropy, int(sz / (1024 * 1024) / read_time)))
# Order every ratio series by entropy, then split the (entropy, ratio) pairs
# into parallel x / y lists, one pair of lists per configuration slot.
for slot in range(6):
    results[slot].sort(key=lambda point: point[0])
x = [[point[0] for point in results[slot]] for slot in range(6)]
y = [[point[1] for point in results[slot]] for slot in range(6)]

# Small legend font shared by all plots in this script.
fontP = FontProperties()
fontP.set_size('x-small')

# Compression ratio
fig = plt.figure()
plt.xlabel("Entropy (bits/element)")
plt.ylabel("Compression ratio")
labels = [
    "BYTE_STREAM_SPLIT + ZSTD",
    "DICTIONARY + ZSTD",
    "PLAIN + ZSTD",
    "DICTIONARY (Uncompressed)",
    "BYTE_STREAM_SPLIT (simd) + ZSTD",
]
markers = ['o', '.', '+', 'x', '^']
# Only the first four configurations are drawn on the ratio plot; slots 4 and
# 5 are parsed and sorted above but not plotted here.
for xs, ys, label, marker in zip(x[:4], y[:4], labels, markers):
    plt.plot(xs, ys, label=label, marker=marker)
plt.legend(prop=fontP)
plt.savefig("entropy_ratio.jpeg")
plt.close(fig)
# Compression speed
# Sort the first five write-throughput series by entropy and unzip them into
# x / y lists. A sixth, empty entry is kept so x and y still have six slots.
x = []
y = []
for slot in range(5):
    results2[slot].sort(key=lambda point: point[0])
    x.append([point[0] for point in results2[slot]])
    y.append([point[1] for point in results2[slot]])
x.append([])
y.append([])

fig = plt.figure()
plt.xlabel("Entropy (bits/element)")
plt.ylabel("Parquet write throughput (MiB/s)")
plt.ylim(0.0, 700.0)
markers = ['o', '.', '+', 'x', '^']
# One line per configuration that has a label (slots 0-4).
for xs, ys, label, marker in zip(x[:5], y[:5], labels, markers):
    plt.plot(xs, ys, label=label, marker=marker)
plt.legend(prop=fontP)
plt.savefig("entropy_write_speed.jpeg")
plt.close(fig)
# Decompression speed
# Sort the first five read-throughput series by entropy and split the
# (entropy, MiB/s) pairs into parallel x / y lists.
x = [[] for _ in range(5)]
y = [[] for _ in range(5)]
for i in range(5):
    results3[i] = sorted(results3[i], key=lambda u: u[0])
    x[i] = [u[0] for u in results3[i]]
    y[i] = [u[1] for u in results3[i]]

# Create the figure explicitly, as the other plots in this script do, rather
# than relying on pyplot to open an implicit one on the first plt.* call.
fig = plt.figure()
plt.xlabel("Entropy (bits/element)")
plt.ylabel("Parquet read throughput(MiB/s)")
plt.ylim(0.0, 1100.0)
markers = ['o', '.', '+', 'x', '^']
for u in range(5):
    plt.plot(x[u], y[u], label=labels[u], marker=markers[u])
plt.legend(prop=fontP)
plt.savefig("entropy_read_speed.jpeg")
# Release the figure once it has been written to disk.
plt.close(fig)