nv50_ir_lowering_nvc0.cpp 57.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17
18
19
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
21
 */
22

23
24
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"
25

26
#include "codegen/nv50_ir_target_nvc0.h"
27
#include "codegen/nv50_ir_lowering_nvc0.h"
28

29
30
#include <limits>

31
32
33
34
35
36
37
namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

38
//             UL UR LL LR
39
#define QUADOP(q, r, s, t)                      \
40
41
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
97
98
}

99
100
101
102
void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
103
   assert(i->sType == TYPE_F32);
104
105
106
107
108
109
110
111
112
113
114
115
116
117

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

118
119
120
121
122
123
124
125
126
127
128
129
130
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
131
      if (i->sType == TYPE_F32) {
132
133
         if (prog->getType() != Program::TYPE_COMPUTE)
            handleFTZ(i);
134
         continue;
135
      }
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      default:
         break;
      }
   }
   return true;
}

153
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
154
155
156
   : rZero(NULL),
     carry(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
157
158
159
{
}

160
161
162
163
164
165
166
167
168
169
bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
170
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
171
                              Instruction *usei, const Instruction *texi)
172
173
174
175
176
177
178
179
180
181
182
183
184
185
{
   bool add = true;
   for (std::list<TexUse>::iterator it = uses.begin();
        it != uses.end();) {
      if (insnDominatedBy(usei, it->insn)) {
         add = false;
         break;
      }
      if (insnDominatedBy(it->insn, usei))
         it = uses.erase(it);
      else
         ++it;
   }
   if (add)
186
      uses.push_back(TexUse(usei, texi));
187
188
189
190
191
192
193
194
195
196
197
}

void
NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
                                        Instruction *insn,
                                        const BasicBlock *term,
                                        std::list<TexUse> &uses)
{
   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
      insn = insn->getSrc(0)->getUniqueInsn();

198
199
   // NOTE: the tex itself is, of course, not an overwriting definition
   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
      return;

   switch (insn->op) {
   /* Values not connected to the tex's definition through any of these should
    * not be conflicting.
    */
   case OP_SPLIT:
   case OP_MERGE:
   case OP_PHI:
   case OP_UNION:
      /* recurse again */
      for (int s = 0; insn->srcExists(s); ++s)
         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
                             uses);
      break;
   default:
      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
      addTexUse(uses, insn, texi);
      break;
   }
}

void
223
224
225
226
227
NVC0LegalizePostRA::findFirstUses(
      const Instruction *texi,
      const Instruction *insn,
      std::list<TexUse> &uses,
      std::tr1::unordered_set<const Instruction *>& visited)
228
229
230
231
232
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();
233

234
235
236
237
238
239
240
         // NOTE: In case of a loop that overwrites a value but never uses
         // it, it can happen that we have a cycle of uses that consists only
         // of phis and no-op moves and will thus cause an infinite loop here
         // since these are not considered actual uses.
         // The most obvious (and perhaps the only) way to prevent this is to
         // remember which instructions we've already visited.

241
242
243
244
245
         if (visited.find(usei) != visited.end())
            continue;

         visited.insert(usei);

246
         if (usei->op == OP_PHI || usei->op == OP_UNION) {
247
248
249
250
251
252
            // need a barrier before WAW cases, like:
            //   %r0 = tex
            //   if ...
            //     texbar <- is required or tex might replace x again
            //     %r1 = x <- overwriting def
            //   %r2 = phi %r0, %r1
253
254
255
256
257
258
259
            for (int s = 0; usei->srcExists(s); ++s) {
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

260
         if (usei->op == OP_SPLIT ||
261
             usei->op == OP_MERGE ||
262
263
264
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
265
            findFirstUses(texi, usei, uses, visited);
266
267
268
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
269
            findFirstUses(texi, usei, uses, visited);
270
         } else {
271
            addTexUse(uses, usei, texi);
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
         }
      }
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
325
326
327
328
   for (size_t i = 0; i < texes.size(); ++i) {
      std::tr1::unordered_set<const Instruction *> visited;
      findFirstUses(texes[i], texes[i], uses[i], visited);
   }
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

383
   if (fn->getProgram()->optLevel < 3)
384
385
386
387
388
389
390
391
392
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
393
   IteratorRef bi = fn->cfg.iteratorCFG();
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
453
               i = NULL;
454
455
456
457
458
459
460
461
462
463
464
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
465
         if (i && !i->isNop())
466
467
468
469
470
471
            prev = i;
      }
   }
   return true;
}

472
473
474
bool
NVC0LegalizePostRA::visit(Function *fn)
{
475
476
477
   if (needTexBar)
      insertTextureBarriers(fn);

478
   rZero = new_LValue(fn, FILE_GPR);
479
   carry = new_LValue(fn, FILE_FLAGS);
480

481
   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
482
   carry->reg.data.id = 0;
483

484
485
486
487
488
489
490
   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
491
492
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
493
494
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
495
         i->setSrc(s, rZero);
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
556
         if (i->src(0).getFile() == FILE_IMMEDIATE)
557
            i->setSrc(0, rZero); // initial value must be 0
558
         replaceZero(i);
559
560
561
562
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
563
564
565
566
567
568
569
570
571
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
588
   gMemBase = NULL;
589
590
591
592
593
594
595
596
597
598
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
599
600
601
602
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
603
604
605
606
607
608
609
610
611
612
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

613
614
615
616
617
618
619
620
621
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

622
623
624
625
// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
626
627
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
628
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
629
   const int chipset = prog->getTarget()->getChipset();
630

631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives
670

671
   if (chipset >= NVISA_GK104_CHIPSET) {
672
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
673
674
675
676
677
678
679
680
681
682
683
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         Value *hnd = loadTexHandle(
               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                          i->getIndirectR(), bld.mkImm(2)),
               i->tex.r);
         i->tex.r = 0xff;
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s) {
684
         i->tex.r += prog->driver->io.texBindBase / 4;
685
686
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
687
688
689
690
691
692
693
694
695
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
696
697
698
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
699
         Value *src = i->getSrc(lyr);
700
701
702
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
703
704
705
706
707
708
709
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
710
      }
711
      // Move the indirect reference to the first place
712
713
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
714
715
716
717
718
719
720
721
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
722
723
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
724
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
725
726
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

727
728
729
      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

730
      if (ticRel) {
731
         i->setSrc(i->tex.rIndirectSrc, NULL);
732
733
734
735
736
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
737
         i->setSrc(i->tex.sIndirectSrc, NULL);
738
739
740
741
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }
742

743
      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
744
745
746
747
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

748
749
750
751
752
      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
753
         bld.loadImm(src, 0);
754
      }
755

756
      if (ticRel)
757
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
758
      if (tscRel)
759
760
761
762
763
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

764
765
766
767
768
769
770
   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

771
   // offset is between lod and dc
772
773
   if (i->tex.useOffsets) {
      int n, c;
774
      int s = i->srcCount(0xff, true);
775
776
777
778
779
780
781
782
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
783
      if (i->op == OP_TXG) {
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  offs[n / 2] = i->offset[n][c].get();
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
803
      } else {
804
805
806
807
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
808
809
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
810
811
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
812
813
814
815
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
816
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
817
818
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
819
            if (i->tex.target.isArray()) {
820
               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
821
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
822
                         i->getSrc(s));
823
            } else {
824
825
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
826
827
828
829
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
830
      }
831
832
   }

833
   if (chipset >= NVISA_GK104_CHIPSET) {
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();
868
   const int array = i->tex.target.isArray();
869
870
871
872
873
874
875
876
877
878

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
879
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
880
881
882
883
884
885
886
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
887
      bld.insert(tex = cloneForward(func, i));
888
      for (c = 0; c < dim; ++c)
889
         tex->setSrc(c + array, crd[c]);
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim();
915
916
917
918
919
920
921
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
922
923
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
924
925
926
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
927
928
929
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
930
931
932
933
934
935
936
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow() ||
       txd->tex.target.isCube())
      txd->op = OP_TEX;
937
938

   handleTEX(txd);
939
   while (txd->srcExists(arg))
940
941
      ++arg;

942
   txd->tex.derivAll = true;
943
   if (txd->op == OP_TEX)
944
945
      return handleManualTXD(txd);

946
   assert(arg == expected_args);
947
   for (int c = 0; c < dim; ++c) {
948
949
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
950
951
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
952
953
954
955
   }
   return true;
}

956
957
958
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
959
   if (txq->tex.rIndirectSrc < 0)
960
961
962
963
964
965
966
967
      return true;

   Value *ticRel = txq->getIndirectR();
   const int chipset = prog->getTarget()->getChipset();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

968
969
   assert(ticRel);

970
971
972
   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

973
974
975
976
      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));
977

978
      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(
            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                       txq->getIndirectR(), bld.mkImm(2)),
            txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      if (chipset < NVISA_GM107_CHIPSET) {
         txq->setIndirectR(NULL);
         txq->moveSources(0, 1);
         txq->setSrc(0, hnd);
         txq->tex.rIndirectSrc = 0;
      } else {
         txq->setIndirectR(hnd);
      }
   }

1000
   return true;
For faster browsing, not all history is shown. View entire blame