/
integer_hash_tables_benchmark.cpp
226 lines (208 loc) · 10.2 KB
/
integer_hash_tables_benchmark.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#include <exception>
#include <functional>
#include <iomanip>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include <sparsehash/dense_hash_map>
#include <sparsehash/sparse_hash_map>
#include <absl/container/flat_hash_map.h>
#include <Common/Stopwatch.h>
#include <base/types.h>
#include <IO/ReadBufferFromFile.h>
#include <Compression/CompressedReadBuffer.h>
#include <Common/HashTable/HashMap.h>
/// Counts occurrences of every key in [data, data + size) using a freshly constructed `Map`
/// and reports the elapsed time, throughput and final map size to stderr under the label `name`.
///
/// `init`, when non-empty, is invoked on the map before the first key is inserted
/// (used for tables that need extra setup before they can accept keys).
///
/// The stopwatch object is created before the map, so - assuming it starts on construction -
/// map construction and `init` are part of the measured interval.
template <typename Key, typename Map>
void NO_INLINE test(const Key * data, size_t size, const std::string & name, std::function<void(Map &)> init = {})
{
    Stopwatch watch;

    Map map;
    if (init)
        init(map);

    /// The aggregation loop itself: one operator[] lookup-or-insert plus increment per input key.
    const Key * const last = data + size;
    for (const Key * it = data; it != last; ++it)
        ++map[*it];

    watch.stop();

    /// The watch is stopped, so the elapsed value is read once and reused for both outputs.
    const auto seconds = watch.elapsedSeconds();

    std::cerr << name;
    std::cerr << ":\nElapsed: " << seconds;
    std::cerr << " (" << size / seconds << " elem/sec.)";
    std::cerr << ", map size: " << map.size() << "\n";
}
/// Reads `rows_size` values of type `Key` from stdin (through DB::CompressedReadBuffer)
/// and runs the aggregation benchmark for the hash table selected by `method`:
///   0 - CH HashMap
///   1 - Google dense_hash_map with absl::Hash (0 is registered as the empty key)
///   2 - absl::flat_hash_map with its default hash
///   3 - absl::flat_hash_map with DefaultHash
///   4 - std::unordered_map
/// Any other value prints an error to stderr; note that the input has already been read by then.
template <typename Key>
static void NO_INLINE testForType(size_t method, size_t rows_size)
{
    std::cerr << std::fixed << std::setprecision(3);

    std::vector<Key> keys(rows_size);

    {
        /// Buffers live only for the duration of the read and are destroyed before the benchmark starts.
        DB::ReadBufferFromFileDescriptor stdin_buf(STDIN_FILENO);
        DB::CompressedReadBuffer compressed_buf(stdin_buf);
        compressed_buf.readStrict(reinterpret_cast<char*>(keys.data()), sizeof(Key) * rows_size);
    }

    switch (method)
    {
        case 0:
        {
            using Table = HashMap<Key, UInt64, DefaultHash<Key>>;
            test<Key, Table>(keys.data(), keys.size(), "CH HashMap");
            break;
        }
        case 1:
        {
            using Table = ::google::dense_hash_map<Key, UInt64, absl::Hash<Key>>;
            /// dense_hash_map is given its empty key through the init callback before any insertion.
            test<Key, Table>(keys.data(), keys.size(), "Google DenseMap", [](auto & map){ map.set_empty_key(0); });
            break;
        }
        case 2:
        {
            using Table = ::absl::flat_hash_map<Key, UInt64>;
            test<Key, Table>(keys.data(), keys.size(), "Abseil HashMap");
            break;
        }
        case 3:
        {
            using Table = ::absl::flat_hash_map<Key, UInt64, DefaultHash<Key>>;
            test<Key, Table>(keys.data(), keys.size(), "Abseil HashMap with CH Hash");
            break;
        }
        case 4:
        {
            using Table = std::unordered_map<Key, UInt64>;
            test<Key, Table>(keys.data(), keys.size(), "std::unordered_map");
            break;
        }
        default:
        {
            std::cerr << "Unexpected method passed " << method << std::endl;
            break;
        }
    }
}
/** This benchmark does not test which hash table is fastest.
* It tests a simple aggregation scenario that is important for ClickHouse.
*
* Supporting bash script (it is important to rerun the program for each method):
* benchmark.sh
* # Usage benchmark.sh column_file_name.bin column_type
* echo File $1
* ./integer_hash_tables_benchmark 0 $2 100000000 < $1
* ./integer_hash_tables_benchmark 1 $2 100000000 < $1
* ./integer_hash_tables_benchmark 2 $2 100000000 < $1
* ./integer_hash_tables_benchmark 3 $2 100000000 < $1
* ./integer_hash_tables_benchmark 4 $2 100000000 < $1
*
* Results of this benchmark on hits_100m_obfuscated X86-64
*
* File hits_100m_obfuscated/201307_1_96_4/WatchID.bin
* CH HashMap: Elapsed: 7.416 (13484217.815 elem/sec.), map size: 99997493
* Google DenseMap: Elapsed: 10.303 (9706022.031 elem/sec.), map size: 99997493
* Abseil HashMap: Elapsed: 9.106 (10982139.229 elem/sec.), map size: 99997493
* Abseil HashMap with CH Hash: Elapsed: 9.221 (10845360.669 elem/sec.), map size: 99997493
* std::unordered_map: Elapsed: 45.213 (2211758.706 elem/sec.), map size: 99997493
*
* File hits_100m_obfuscated/201307_1_96_4/URLHash.bin
* CH HashMap: Elapsed: 2.620 (38168135.308 elem/sec.), map size: 20714865
* Google DenseMap: Elapsed: 3.426 (29189309.058 elem/sec.), map size: 20714865
* Abseil HashMap: Elapsed: 2.788 (35870495.097 elem/sec.), map size: 20714865
* Abseil HashMap with CH Hash: Elapsed: 2.991 (33428850.155 elem/sec.), map size: 20714865
* std::unordered_map: Elapsed: 8.503 (11760331.346 elem/sec.), map size: 20714865
*
* File hits_100m_obfuscated/201307_1_96_4/UserID.bin
* CH HashMap: Elapsed: 2.157 (46352039.753 elem/sec.), map size: 17630976
* Google DenseMap: Elapsed: 2.725 (36694226.782 elem/sec.), map size: 17630976
* Abseil HashMap: Elapsed: 2.590 (38604284.187 elem/sec.), map size: 17630976
* Abseil HashMap with CH Hash: Elapsed: 2.785 (35904856.137 elem/sec.), map size: 17630976
* std::unordered_map: Elapsed: 7.268 (13759557.609 elem/sec.), map size: 17630976
*
* File hits_100m_obfuscated/201307_1_96_4/RegionID.bin
* CH HashMap: Elapsed: 0.192 (521583315.810 elem/sec.), map size: 9040
* Google DenseMap: Elapsed: 0.297 (337081407.799 elem/sec.), map size: 9046
* Abseil HashMap: Elapsed: 0.295 (338805623.511 elem/sec.), map size: 9040
* Abseil HashMap with CH Hash: Elapsed: 0.331 (302155391.036 elem/sec.), map size: 9040
* std::unordered_map: Elapsed: 0.455 (219971555.390 elem/sec.), map size: 9040
*
* File hits_100m_obfuscated/201307_1_96_4/CounterID.bin
* CH HashMap: Elapsed: 0.217 (460216823.609 elem/sec.), map size: 6506
* Google DenseMap: Elapsed: 0.373 (267838665.098 elem/sec.), map size: 6506
* Abseil HashMap: Elapsed: 0.325 (308124728.989 elem/sec.), map size: 6506
* Abseil HashMap with CH Hash: Elapsed: 0.354 (282167144.801 elem/sec.), map size: 6506
* std::unordered_map: Elapsed: 0.390 (256573354.171 elem/sec.), map size: 6506
*
* File hits_100m_obfuscated/201307_1_96_4/TraficSourceID.bin
* CH HashMap: Elapsed: 0.246 (406714566.282 elem/sec.), map size: 10
* Google DenseMap: Elapsed: 0.760 (131615151.233 elem/sec.), map size: 1565609 /// Broken because there is 0 key in dataset
* Abseil HashMap: Elapsed: 0.309 (324068156.680 elem/sec.), map size: 10
* Abseil HashMap with CH Hash: Elapsed: 0.339 (295108223.814 elem/sec.), map size: 10
* std::unordered_map: Elapsed: 0.811 (123304031.195 elem/sec.), map size: 10
*
* File hits_100m_obfuscated/201307_1_96_4/AdvEngineID.bin
* CH HashMap: Elapsed: 0.155 (643245257.748 elem/sec.), map size: 19
* Google DenseMap: Elapsed: 1.629 (61395025.417 elem/sec.), map size: 32260732 // Broken because there is 0 key in dataset
* Abseil HashMap: Elapsed: 0.292 (342765027.204 elem/sec.), map size: 19
* Abseil HashMap with CH Hash: Elapsed: 0.330 (302822020.210 elem/sec.), map size: 19
* std::unordered_map: Elapsed: 0.308 (325059333.730 elem/sec.), map size: 19
*
*
* Results of this benchmark on hits_100m_obfuscated AARCH64
*
* File hits_100m_obfuscated/201307_1_96_4/WatchID.bin
* CH HashMap: Elapsed: 9.530 (10493528.533 elem/sec.), map size: 99997493
* Google DenseMap: Elapsed: 14.436 (6927091.135 elem/sec.), map size: 99997493
* Abseil HashMap: Elapsed: 16.671 (5998504.085 elem/sec.), map size: 99997493
* Abseil HashMap with CH Hash: Elapsed: 16.803 (5951365.711 elem/sec.), map size: 99997493
* std::unordered_map: Elapsed: 50.805 (1968305.658 elem/sec.), map size: 99997493
*
* File hits_100m_obfuscated/201307_1_96_4/URLHash.bin
* CH HashMap: Elapsed: 3.693 (27076878.092 elem/sec.), map size: 20714865
* Google DenseMap: Elapsed: 5.051 (19796401.694 elem/sec.), map size: 20714865
* Abseil HashMap: Elapsed: 5.617 (17804528.625 elem/sec.), map size: 20714865
* Abseil HashMap with CH Hash: Elapsed: 5.702 (17537013.639 elem/sec.), map size: 20714865
* std::unordered_map: Elapsed: 10.757 (9296040.953 elem/sec.), map size: 20714865
*
* File hits_100m_obfuscated/201307_1_96_4/UserID.bin
* CH HashMap: Elapsed: 2.982 (33535795.695 elem/sec.), map size: 17630976
* Google DenseMap: Elapsed: 3.940 (25381557.959 elem/sec.), map size: 17630976
* Abseil HashMap: Elapsed: 4.493 (22259078.458 elem/sec.), map size: 17630976
* Abseil HashMap with CH Hash: Elapsed: 4.596 (21759738.710 elem/sec.), map size: 17630976
* std::unordered_map: Elapsed: 9.035 (11067903.596 elem/sec.), map size: 17630976
*
* File hits_100m_obfuscated/201307_1_96_4/RegionID.bin
* CH HashMap: Elapsed: 0.302 (331026285.361 elem/sec.), map size: 9040
* Google DenseMap: Elapsed: 0.623 (160419421.840 elem/sec.), map size: 9046
* Abseil HashMap: Elapsed: 0.981 (101971186.758 elem/sec.), map size: 9040
* Abseil HashMap with CH Hash: Elapsed: 0.991 (100932993.199 elem/sec.), map size: 9040
* std::unordered_map: Elapsed: 0.809 (123541402.715 elem/sec.), map size: 9040
*
* File hits_100m_obfuscated/201307_1_96_4/CounterID.bin
* CH HashMap: Elapsed: 0.343 (291821742.078 elem/sec.), map size: 6506
* Google DenseMap: Elapsed: 0.718 (139191105.450 elem/sec.), map size: 6506
* Abseil HashMap: Elapsed: 1.019 (98148285.278 elem/sec.), map size: 6506
* Abseil HashMap with CH Hash: Elapsed: 1.048 (95446843.667 elem/sec.), map size: 6506
* std::unordered_map: Elapsed: 0.701 (142701070.085 elem/sec.), map size: 6506
*
* File hits_100m_obfuscated/201307_1_96_4/TraficSourceID.bin
* CH HashMap: Elapsed: 0.376 (265905243.103 elem/sec.), map size: 10
* Google DenseMap: Elapsed: 1.309 (76420707.298 elem/sec.), map size: 1565609 /// Broken because there is 0 key in dataset
* Abseil HashMap: Elapsed: 0.955 (104668109.775 elem/sec.), map size: 10
* Abseil HashMap with CH Hash: Elapsed: 0.967 (103456305.391 elem/sec.), map size: 10
* std::unordered_map: Elapsed: 1.241 (80591305.890 elem/sec.), map size: 10
*
* File hits_100m_obfuscated/201307_1_96_4/AdvEngineID.bin
* CH HashMap: Elapsed: 0.213 (470208130.105 elem/sec.), map size: 19
* Google DenseMap: Elapsed: 2.525 (39607131.523 elem/sec.), map size: 32260732 /// Broken because there is 0 key in dataset
* Abseil HashMap: Elapsed: 0.950 (105233678.618 elem/sec.), map size: 19
* Abseil HashMap with CH Hash: Elapsed: 0.962 (104001230.717 elem/sec.), map size: 19
* std::unordered_map: Elapsed: 0.585 (171059989.837 elem/sec.), map size: 19
*/
/// Entry point. Expected invocation:
///     program method column_type_name rows_count < input_column.bin
///
///  argv[1] - index of the hash table implementation to benchmark (interpreted by testForType);
///  argv[2] - name of the column's integer type: UInt8/16/32/64 or Int8/16/32/64;
///  argv[3] - number of values to read from stdin.
///
/// Returns 0 after running the benchmark and 1 when the arguments are missing,
/// cannot be parsed as unsigned integers, or name an unsupported type.
int main(int argc, char ** argv)
{
    if (argc < 4)
    {
        std::cerr << "Usage: program method column_type_name rows_count < input_column.bin \n";
        return 1;
    }

    size_t method = 0;
    size_t n = 0;

    /// std::stoull throws on non-numeric or out-of-range input; report that as a usage error
    /// instead of letting the exception escape from main and abort the process.
    try
    {
        method = std::stoull(argv[1]);
        n = std::stoull(argv[3]);
    }
    catch (const std::exception & e)
    {
        std::cerr << "Cannot parse method or rows_count as an unsigned integer: " << e.what() << "\n";
        std::cerr << "Usage: program method column_type_name rows_count < input_column.bin \n";
        return 1;
    }

    const std::string type_name = argv[2];

    if (type_name == "UInt8")
        testForType<UInt8>(method, n);
    else if (type_name == "UInt16")
        testForType<UInt16>(method, n);
    else if (type_name == "UInt32")
        testForType<UInt32>(method, n);
    else if (type_name == "UInt64")
        testForType<UInt64>(method, n);
    else if (type_name == "Int8")
        testForType<Int8>(method, n);
    else if (type_name == "Int16")
        testForType<Int16>(method, n);
    else if (type_name == "Int32")
        testForType<Int32>(method, n);
    else if (type_name == "Int64")
        testForType<Int64>(method, n);
    else
    {
        /// An unsupported type means nothing was benchmarked, so signal failure to the caller.
        std::cerr << "Unexpected type passed " << type_name << std::endl;
        return 1;
    }

    return 0;
}