-
Notifications
You must be signed in to change notification settings - Fork 151
/
LatencyTable.cpp
373 lines (340 loc) · 11.3 KB
/
LatencyTable.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
/*========================== begin_copyright_notice ============================
Copyright (C) 2019-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "LatencyTable.h"
#include "../G4_IR.hpp"
#include "LocalScheduler_G4IR.h"
#include <type_traits>
using namespace vISA;
// Stand-in for C++23 std::to_underlying: converts an enum (or enum class)
// value to its underlying integer type. Pure and compile-time evaluable,
// so it is constexpr/noexcept; [[nodiscard]] since discarding the result
// is always a bug.
template <typename Enum>
[[nodiscard]] constexpr std::underlying_type_t<Enum> value_of(Enum val) noexcept {
  return static_cast<std::underlying_type_t<Enum>>(val);
}
// Latency/occupancy table for pre-XE platform generations. Latency values
// come from the fixed LegacyFFLatency / LegacyLatencies tables.
class LatencyTableLegacy : public LatencyTable {
public:
  LatencyTableLegacy(const IR_Builder& builder) : LatencyTable(builder) {
    // This table must only be instantiated for pre-XE platforms.
    vASSERT(builder.getPlatformGeneration() < PlatformGen::XE);
  }
  uint16_t getLatency(const G4_INST *Inst) const override;
  uint16_t getOccupancy(const G4_INST *Inst) const override;
  // DPAS does not exist pre-XE; the implementation asserts unreachable.
  uint16_t getDPASLatency(uint8_t repeatCount) const override;
};
// Latency/occupancy table for XE and later platform generations.
template <PlatformGen Gen>
class LatencyTableXe: public LatencyTable {
  // Select latency information based on platform generation.
  using LI = typename std::conditional<Gen >= PlatformGen::XE,
                                       XELatencyInfo,
                                       void>::type;

public:
  LatencyTableXe(const IR_Builder& builder) : LatencyTable(builder) {
    static_assert(Gen >= PlatformGen::XE);
  }
  // General implementations to get latency and occupancy for the given
  // instruction based on heuristics. The implementation can be specialized if
  // needed.
  uint16_t getLatency(const G4_INST *Inst) const override;
  uint16_t getOccupancy(const G4_INST *Inst) const override;
  // The details of heuristics used to calculate the latency. The
  // implementation can be specialized if needed.
  uint16_t getDPASLatency(uint8_t repeatCount) const override;

private:
  // Per-category latency helpers, dispatched from getLatency().
  uint16_t getMsgLatency(const G4_INST *Inst) const;        // send messages
  uint16_t getMathLatency(const G4_INST *inst) const;       // math pipe
  uint16_t getBranchLatency(const G4_INST *inst) const;     // flow control
  uint16_t getIntrinsicLatency(const G4_INST *inst) const;  // intrinsics
  uint16_t getDPASLatency(const G4_InstDpas *dpas) const;   // dpas by inst
  uint16_t getARFAccessLatency(const G4_INST *inst) const;  // flag/addr dst
  uint16_t getArithmeticLatency(const G4_INST *inst) const; // FPU arithmetic
};
// Factory: returns the latency-table implementation matching the builder's
// platform generation (legacy for pre-XE, the XE table otherwise).
std::unique_ptr<LatencyTable>
LatencyTable::createLatencyTable(const IR_Builder &builder) {
  const auto generation = builder.getPlatformGeneration();
  if (generation < PlatformGen::XE)
    return std::make_unique<LatencyTableLegacy>(builder);
  return std::make_unique<LatencyTableXe<PlatformGen::XE>>(builder);
}
uint16_t LatencyTableLegacy::getLatency(const G4_INST *Inst) const {
if (Inst->isSend()) {
G4_SendDesc *MsgDesc = Inst->getMsgDesc();
int SFIDint = SFIDtoInt(MsgDesc->getSFID());
vASSERT(SFIDint < ARRAY_COUNT(LegacyFFLatency));
return LegacyFFLatency[SFIDint];
} else if (Inst->isMath()) {
if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV ||
Inst->asMathInst()->getMathCtrl() == MATH_POW)
return LegacyLatencies::EDGE_LATENCY_MATH_TYPE2;
return LegacyLatencies::EDGE_LATENCY_MATH;
}
return LegacyLatencies::IVB_PIPELINE_LENGTH;
}
// This calculates the node's pipeline occupancy (node delay)
uint16_t LatencyTableLegacy::getOccupancy(const G4_INST *Inst) const {
int divisor = 8;
int InstLatency = LegacyLatencies::UNCOMPR_LATENCY;
if (Inst->isFastHFInstruction()) {
divisor = 16;
}
// Number of n-wide passes in FPU0 or FPU1 (EM).
// "n" is:
// 16 for BDW+ HalfFloatDoublePerf instructions,
// 8 for other instructions.
int passes = std::max(1, Inst->getExecSize() / divisor);
// InstLatency is:
// 4 for EM/FPU1 POW and FDIV instructions ( HSW; for BDW+ it is 2 times
// higher ), 2 for other EM/FPU1 instructions ( HSW; for BDW+ it is 2
// times higher ), 2 for other instructions.
// Update DagNode latency for math.
G4_opcode opCode = Inst->opcode();
switch (opCode) {
case G4_math: {
// Use EdgeLatencyMathType2 for FDIV, FPOW functions.
if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV ||
Inst->asMathInst()->getMathCtrl() == MATH_POW) {
InstLatency = 4;
} else {
// Used EdgeLatencyMath for other functions.
InstLatency = 2;
}
// BDW+ platforms have lower math TPT and longer latency (all math
// functions).
InstLatency *= 2;
break;
}
case G4_bfe:
case G4_bfi1:
case G4_bfi2:
case G4_bfrev:
case G4_cbit:
case G4_dp2:
case G4_dp3:
case G4_dp4:
case G4_dph:
case G4_fbh:
case G4_fbl:
case G4_lrp:
case G4_mac:
case G4_mach:
case G4_pln:
InstLatency *= 2;
break;
case G4_label:
// Labels need special care. They should have a latency of 1.
// But their execSize is 255, which sets passes=31.
passes = 1;
InstLatency = 1;
break;
default:
break;
}
return uint16_t(passes * InstLatency);
}
// DPAS is not available on pre-XE platforms; reaching this is a caller bug.
uint16_t LatencyTableLegacy::getDPASLatency(uint8_t repeatCount) const {
  vISA_ASSERT_UNREACHABLE("DPAS is not supported");
  // Unreachable in asserting builds; keep release builds well-defined.
  return LegacyLatencies::UNKNOWN_LATENCY;
}
// General template implementations for XE+.
// Classify the instruction and delegate to the per-category helper; the
// checks run in order, so earlier categories take precedence.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getLatency(const G4_INST *Inst) const {
  if (Inst->isSend())
    return getMsgLatency(Inst);
  if (Inst->isMath())
    return getMathLatency(Inst);
  if (Inst->isFlowControl())
    return getBranchLatency(Inst);
  if (Inst->isIntrinsic())
    return getIntrinsicLatency(Inst);
  if (Inst->isDpas())
    return getDPASLatency(Inst->asDpasInst());
  // Flag writes and direct-A0 destinations use the ARF access latency.
  const bool touchesARF =
      Inst->writesFlag() || (Inst->getDst() && Inst->getDst()->isDirectA0());
  if (touchesARF)
    return getARFAccessLatency(Inst);
  if (Inst->isArithmetic())
    return getArithmeticLatency(Inst);
  // By default, use the FPU pipeline latency.
  return value_of(LI::FPU);
}
// Latency of a send instruction, classified from its message descriptor.
// LSC messages are handled first (SLM, fences, typed/untyped with caching),
// then legacy SLM/sampler/HDC/barrier messages.
template<PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getMsgLatency(const G4_INST *Inst) const {
  vASSERT(Inst->isSend());
  G4_SendDesc *MsgDesc = Inst->getMsgDesc();
  if (MsgDesc->isLSC()) {
    if (MsgDesc->getSFID() == SFID::SLM) {
      // LSC SLM: accesses wider than SIMD16 take the slower SLM32 latency.
      auto Sz = Inst->getExecSize();
      return MsgDesc->isFence()
                 ? value_of(LI::SLM_FENCE)
                 : ((Sz > g4::SIMD16) ? value_of(LI::SLM32)
                                      : value_of(LI::SLM16));
    } else if (MsgDesc->isFence()) {
      return MsgDesc->isTyped() ? value_of(LI::LSC_TYPED_FENCE)
                                : value_of(LI::LSC_UNTYPED_FENCE);
    } else {
      // Assume an L1 hit when L1 caching is explicitly CA, or when it is
      // not explicitly UC and the vISA_assumeL1Hit option is set.
      bool isCachedInL1 = MsgDesc->getCachingL1() == Caching::CA ||
                          (MsgDesc->getCachingL1() != Caching::UC &&
                           m_builder.getOption(vISA_assumeL1Hit));
      if (MsgDesc->isTyped()) {
        return isCachedInL1 ? value_of(LI::LSC_TYPED_L1)
                            : value_of(LI::LSC_TYPED_L3);
      } else {
        return isCachedInL1 ? value_of(LI::LSC_UNTYPED_L1)
                            : value_of(LI::LSC_UNTYPED_L3);
      }
    }
  }
  // Non-LSC (legacy) message paths.
  if (MsgDesc->isSLM())
    return Inst->asSendInst()->isFence() ? value_of(LI::SLM_FENCE)
                                         : value_of(LI::SLM16);
  if (MsgDesc->isSampler())
    return value_of(LI::SAMPLER_L3);
  if (MsgDesc->isHDC())
    return value_of(LI::DP_L3);
  if (MsgDesc->isBarrier())
    return value_of(LI::BARRIER);
  return value_of(LI::SEND_OTHERS);
}
// Generic XE+ math-pipe latency: a single base value for all math
// instructions (the XE specialization below scales it by exec size).
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getMathLatency(const G4_INST *inst) const {
  vASSERT(inst->isMath());
  return value_of(LI::MATH);
}
// Latency of flow-control instructions: one shared BRANCH value.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getBranchLatency(const G4_INST *inst) const {
  vASSERT(inst->isFlowControl());
  return value_of(LI::BRANCH);
}
// Latency of intrinsic pseudo-instructions: address-mov intrinsics have
// their own value; all others are treated as plain FPU ops.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getIntrinsicLatency(const G4_INST *inst) const {
  vASSERT(inst->isIntrinsic());
  return inst->isPseudoAddrMovIntrinsic() ? value_of(LI::ADDR_MOV)
                                          : value_of(LI::FPU);
}
// DPAS latency depends only on the instruction's repeat count; forward to
// the repeat-count overload.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getDPASLatency(const G4_InstDpas *dpas) const {
  const uint8_t repeat = dpas->getRepeatCount();
  return getDPASLatency(repeat);
}
// Latency for instructions writing architectural registers (a flag, or a
// direct address-register destination).
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getARFAccessLatency(const G4_INST *inst) const {
  const bool writesARF =
      inst->writesFlag() || (inst->getDst() && inst->getDst()->isDirectA0());
  vASSERT(writesARF);
  return value_of(LI::ARF);
}
// Generic XE+ arithmetic latency: accumulator destinations pay the
// FPU_ACC latency, everything else the plain FPU latency.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getArithmeticLatency(const G4_INST *inst) const {
  vASSERT(inst->isArithmetic());
  auto dst = inst->getDst();
  const bool accDst = dst && dst->isAccReg();
  return accDst ? value_of(LI::FPU_ACC) : value_of(LI::FPU);
}
// Pipeline occupancy for XE+: a scale factor derived from exec size
// relative to the platform's native exec size, applied to the per-class
// base occupancy.
template <PlatformGen Gen>
uint16_t LatencyTableXe<Gen>::getOccupancy(const G4_INST *Inst) const {
  const auto execSize = Inst->getExecSize();
  const auto nativeSize = m_builder.getNativeExecSize();
  // 1x up to native width, 2x at exactly double width, 4x beyond.
  uint16_t scale = 1;
  if (execSize > nativeSize)
    scale = (execSize == nativeSize * 2) ? 2 : 4;
  if (Inst->isMath())
    return value_of(LI::OC_MATH) * scale;
  if (Inst->isFastHFInstruction()) {
    // Fast-HF instructions tolerate double the native width per pass.
    scale = (execSize <= nativeSize * 2) ? 1 : 2;
  } else if (G4_DstRegRegion *dst = Inst->getDst()) {
    // 8-byte destination types use a narrower (half-native) threshold.
    if (dst->getTypeSize() == 8)
      scale = (execSize <= nativeSize / 2) ? 1 : 2;
  }
  return value_of(LI::OC_OTHERS) * scale;
}
// XE Specializations.
// DPAS latency per platform, keyed by the instruction's repeat count.
// DG2/ARL/Xe2 use explicit per-repeat-count cycle values; the other
// platforms derive the value from the LI::DPAS base.
template <>
uint16_t
LatencyTableXe<PlatformGen::XE>::getDPASLatency(uint8_t repeatCount) const {
  switch (m_builder.getPlatform()) {
  case Xe_XeHPSDV:
    // Base DPAS latency plus one cycle per repeat beyond the first.
    return value_of(LI::DPAS) + repeatCount - 1;
  case Xe_DG2:
    switch (repeatCount) {
    case 1:
      return 21;
    case 2:
      return 22;
    case 8:
      return 32;
    default:
      return 32;
    }
  case Xe_ARL:
    switch (repeatCount) {
    case 1:
      return 21;
    case 2:
      return 22;
    case 8: {
      // With a 4-deep systolic array, repeat-count-8 DPAS is cheaper.
      if (m_builder.has4DeepSystolic()) {
        return 32;
      }
      return 46;
    }
    default:
      return 22; // Conservative cycle
    }
  case Xe_PVC:
    return value_of(LI::DPAS) + repeatCount - 1;
  case Xe_PVCXT:
    // PVC-XT: one cycle more than PVC for the same repeat count.
    return value_of(LI::DPAS) + repeatCount;
  case Xe2:
    switch (repeatCount) {
    case 1:
      return 22;
    case 2:
      return 23;
    case 8:
      return 33;
    default:
      return 33;
    }
  default: // Not supported platform
    // TODO: Add vISA_ASSERT_UNREACHABLE.
    return 46;
  }
}
// XE math latency scales with exec size: base MATH latency plus 0, 1, or 3
// DELTA_MATH steps for SIMD<=8, SIMD16, and wider respectively.
template <>
uint16_t
LatencyTableXe<PlatformGen::XE>::getMathLatency(const G4_INST *Inst) const {
  vASSERT(Inst->isMath());
  int Sz = Inst->getExecSize();
  // Fix: was "int Scale = Scale = ...", a redundant self-assignment typo
  // that assigned to Scale inside its own initializer.
  int Scale = (Sz <= 8) ? 0 : (Sz == 16) ? 1 : 3;
  return value_of(LI::MATH) + value_of(LI::DELTA_MATH) * Scale;
}
// XE DPAS latency for a concrete instruction: delegate to the
// repeat-count-based overload.
template <>
uint16_t
LatencyTableXe<PlatformGen::XE>::getDPASLatency(const G4_InstDpas *dpas) const {
  const uint8_t repeat = dpas->getRepeatCount();
  return getDPASLatency(repeat);
}
// XE arithmetic latency: FPU (or FPU_ACC for accumulator destinations)
// plus an exec-size-dependent delta of 0, 1, or 3 DELTA steps for
// SIMD<=8, SIMD16, and wider respectively.
template <>
uint16_t LatencyTableXe<PlatformGen::XE>::getArithmeticLatency(
    const G4_INST *Inst) const {
  vASSERT(Inst->isArithmetic());
  int Sz = Inst->getExecSize();
  // Fix: was "int Scale = Scale = ...", a redundant self-assignment typo
  // that assigned to Scale inside its own initializer.
  int Scale = (Sz <= 8) ? 0 : (Sz == 16) ? 1 : 3;
  auto Delta = value_of(LI::DELTA) * Scale;
  auto Dst = Inst->getDst();
  if (Dst && Dst->isAccReg())
    return value_of(LI::FPU_ACC) + Delta;
  return value_of(LI::FPU) + Delta;
}
// TODO: Update PVC+ to consider native exec size as well so that the
// specialization can be removed.
// XE occupancy assumes a fixed SIMD8 native width: 1x up to SIMD8, 2x at
// SIMD16, 4x beyond, with adjustments for fast-HF and 8-byte destinations.
template <>
uint16_t
LatencyTableXe<PlatformGen::XE>::getOccupancy(const G4_INST *Inst) const {
  const int execSize = Inst->getExecSize();
  int scale = 4;
  if (execSize <= 8)
    scale = 1;
  else if (execSize == 16)
    scale = 2;
  if (Inst->isMath())
    return value_of(LI::OC_MATH) * scale;
  if (Inst->isFastHFInstruction()) {
    scale = (execSize <= 16) ? 1 : 2;
  } else if (G4_DstRegRegion *dst = Inst->getDst()) {
    if (dst->getTypeSize() == 8)
      scale = (execSize <= 4) ? 1 : 2;
  }
  return value_of(LI::OC_OTHERS) * scale;
}