File backport-llvm-r233689 of Package llvm
------------------------------------------------------------------------
r233689 | uweigand | 2015-03-31 14:56:33 +0200 (Tue, 31 Mar 2015) | 9 lines
[SystemZ] Use POPCNT instruction on z196
We already exploit a number of instructions specific to z196,
but not yet POPCNT. Add support for the population-count
facility, MC support for the POPCNT instruction, CodeGen
support for using POPCNT, and implement the getPopcntSupport
TargetTransformInfo hook.
------------------------------------------------------------------------
Index: test/MC/Disassembler/SystemZ/insns.txt
===================================================================
--- test/MC/Disassembler/SystemZ/insns.txt.orig
+++ test/MC/Disassembler/SystemZ/insns.txt
@@ -6334,6 +6334,18 @@
# CHECK: pfd 15, 0
0xe3 0xf0 0x00 0x00 0x00 0x36
+# CHECK: popcnt %r0, %r0
+0xb9 0xe1 0x00 0x00
+
+# CHECK: popcnt %r0, %r15
+0xb9 0xe1 0x00 0x0f
+
+# CHECK: popcnt %r15, %r0
+0xb9 0xe1 0x00 0xf0
+
+# CHECK: popcnt %r7, %r8
+0xb9 0xe1 0x00 0x78
+
# CHECK: risbg %r0, %r0, 0, 0, 0
0xec 0x00 0x00 0x00 0x00 0x55
Index: test/MC/SystemZ/insn-bad.s
===================================================================
--- test/MC/SystemZ/insn-bad.s.orig
+++ test/MC/SystemZ/insn-bad.s
@@ -2666,6 +2666,11 @@
pfdrl 1, 1
pfdrl 1, 0x100000000
+#CHECK: error: {{(instruction requires: population-count)?}}
+#CHECK: popcnt %r0, %r0
+
+ popcnt %r0, %r0
+
#CHECK: error: invalid operand
#CHECK: risbg %r0,%r0,0,0,-1
#CHECK: error: invalid operand
Index: test/MC/SystemZ/insn-good-z196.s
===================================================================
--- test/MC/SystemZ/insn-good-z196.s.orig
+++ test/MC/SystemZ/insn-good-z196.s
@@ -1021,6 +1021,16 @@
ork %r15,%r0,%r0
ork %r7,%r8,%r9
+#CHECK: popcnt %r0, %r0 # encoding: [0xb9,0xe1,0x00,0x00]
+#CHECK: popcnt %r0, %r15 # encoding: [0xb9,0xe1,0x00,0x0f]
+#CHECK: popcnt %r15, %r0 # encoding: [0xb9,0xe1,0x00,0xf0]
+#CHECK: popcnt %r7, %r8 # encoding: [0xb9,0xe1,0x00,0x78]
+
+ popcnt %r0,%r0
+ popcnt %r0,%r15
+ popcnt %r15,%r0
+ popcnt %r7,%r8
+
#CHECK: risbhg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x5d]
#CHECK: risbhg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x5d]
#CHECK: risbhg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x5d]
Index: test/CodeGen/SystemZ/ctpop-01.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/ctpop-01.ll
@@ -0,0 +1,96 @@
+; Test population-count instruction
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+declare i32 @llvm.ctpop.i32(i32 %a)
+declare i64 @llvm.ctpop.i64(i64 %a)
+
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: popcnt %r0, %r2
+; CHECK: sllk %r1, %r0, 16
+; CHECK: ar %r1, %r0
+; CHECK: sllk %r2, %r1, 8
+; CHECK: ar %r2, %r1
+; CHECK: srl %r2, 24
+; CHECK: br %r14
+
+ %popcnt = call i32 @llvm.ctpop.i32(i32 %a)
+ ret i32 %popcnt
+}
+
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llhr %r0, %r2
+; CHECK: popcnt %r0, %r0
+; CHECK: risblg %r2, %r0, 16, 151, 8
+; CHECK: ar %r2, %r0
+; CHECK: srl %r2, 8
+; CHECK: br %r14
+ %and = and i32 %a, 65535
+ %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+ ret i32 %popcnt
+}
+
+define i32 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: llcr %r0, %r2
+; CHECK: popcnt %r2, %r0
+; CHECK: br %r14
+ %and = and i32 %a, 255
+ %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+ ret i32 %popcnt
+}
+
+define i64 @f4(i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: popcnt %r0, %r2
+; CHECK: sllg %r1, %r0, 32
+; CHECK: agr %r1, %r0
+; CHECK: sllg %r0, %r1, 16
+; CHECK: agr %r0, %r1
+; CHECK: sllg %r1, %r0, 8
+; CHECK: agr %r1, %r0
+; CHECK: srlg %r2, %r1, 56
+; CHECK: br %r14
+ %popcnt = call i64 @llvm.ctpop.i64(i64 %a)
+ ret i64 %popcnt
+}
+
+define i64 @f5(i64 %a) {
+; CHECK-LABEL: f5:
+; CHECK: llgfr %r0, %r2
+; CHECK: popcnt %r0, %r0
+; CHECK: sllg %r1, %r0, 16
+; CHECK: algfr %r0, %r1
+; CHECK: sllg %r1, %r0, 8
+; CHECK: algfr %r0, %r1
+; CHECK: srlg %r2, %r0, 24
+ %and = and i64 %a, 4294967295
+ %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+ ret i64 %popcnt
+}
+
+define i64 @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: llghr %r0, %r2
+; CHECK: popcnt %r0, %r0
+; CHECK: risbg %r1, %r0, 48, 183, 8
+; CHECK: agr %r1, %r0
+; CHECK: srlg %r2, %r1, 8
+; CHECK: br %r14
+ %and = and i64 %a, 65535
+ %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+ ret i64 %popcnt
+}
+
+define i64 @f7(i64 %a) {
+; CHECK-LABEL: f7:
+; CHECK: llgcr %r0, %r2
+; CHECK: popcnt %r2, %r0
+; CHECK: br %r14
+ %and = and i64 %a, 255
+ %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+ ret i64 %popcnt
+}
+
Index: lib/Target/SystemZ/SystemZProcessors.td
===================================================================
--- lib/Target/SystemZ/SystemZProcessors.td.orig
+++ lib/Target/SystemZ/SystemZProcessors.td
@@ -39,6 +39,11 @@ def FeatureFPExtension : SystemZFeature<
"Assume that the floating-point extension facility is installed"
>;
+def FeaturePopulationCount : SystemZFeature<
+ "population-count", "PopulationCount",
+ "Assume that the population-count facility is installed"
+>;
+
def FeatureFastSerialization : SystemZFeature<
"fast-serialization", "FastSerialization",
"Assume that the fast-serialization facility is installed"
@@ -54,9 +59,9 @@ def : Processor<"generic", NoItineraries
def : Processor<"z10", NoItineraries, []>;
def : Processor<"z196", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
- FeatureFPExtension, FeatureFastSerialization,
- FeatureInterlockedAccess1]>;
+ FeatureFPExtension, FeaturePopulationCount,
+ FeatureFastSerialization, FeatureInterlockedAccess1]>;
def : Processor<"zEC12", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
- FeatureFPExtension, FeatureFastSerialization,
- FeatureInterlockedAccess1]>;
+ FeatureFPExtension, FeaturePopulationCount,
+ FeatureFastSerialization, FeatureInterlockedAccess1]>;
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp.orig
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -81,6 +81,8 @@ public:
virtual unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx,
const APInt &Imm, Type *Ty) const;
+ virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
/// @}
};
@@ -300,3 +302,11 @@ unsigned SystemZTTI::getIntImmCost(Intri
return SystemZTTI::getIntImmCost(Imm, Ty);
}
+SystemZTTI::PopcntSupportKind
+SystemZTTI::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
+ if (ST->hasPopulationCount() && TyWidth <= 64)
+ return PSK_FastHardware;
+ return PSK_Software;
+}
+
Index: lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.td.orig
+++ lib/Target/SystemZ/SystemZInstrInfo.td
@@ -1382,6 +1382,13 @@ let Defs = [CC] in {
def : Pat<(ctlz GR64:$src),
(EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
+// Population count. Counts bits set per byte.
+let Predicates = [FeaturePopulationCount], Defs = [CC] in {
+ def POPCNT : InstRRE<0xB9E1, (outs GR64:$R1), (ins GR64:$R2),
+ "popcnt\t$R1, $R2",
+ [(set GR64:$R1, (z_popcnt GR64:$R2))]>;
+}
+
// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
def : Pat<(i64 (anyext GR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp.orig
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -164,8 +164,13 @@ SystemZTargetLowering::SystemZTargetLowe
// available, or if the operand is constant.
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ // Use POPCNT on z196 and above.
+ if (Subtarget.hasPopulationCount())
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ else
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
// No special instructions for these.
- setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
@@ -2306,6 +2311,45 @@ SDValue SystemZTargetLowering::lowerOR(S
MVT::i64, HighOp, Low32);
}
+SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ int64_t OrigBitSize = VT.getSizeInBits();
+ SDLoc DL(Op);
+
+ // Get the known-zero mask for the operand.
+ Op = Op.getOperand(0);
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);
+ uint64_t Mask = ~KnownZero.getZExtValue();
+
+ // Skip known-zero high parts of the operand.
+ int64_t BitSize = OrigBitSize;
+ while ((Mask & ((((uint64_t)1 << (BitSize / 2)) - 1) << (BitSize / 2))) == 0)
+ BitSize = BitSize / 2;
+
+ // The POPCNT instruction counts the number of bits in each byte.
+ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
+ Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+
+ // Add up per-byte counts in a binary tree. All bits of Op at
+ // position larger than BitSize remain zero throughout.
+ for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT));
+ if (BitSize != OrigBitSize)
+ Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
+ DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT));
+ Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+ }
+
+ // Extract overall result from high byte.
+ if (BitSize > 8)
+ Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT));
+
+ return Op;
+}
+
// Op is an atomic load. Lower it into a normal volatile load.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
SelectionDAG &DAG) const {
@@ -2558,6 +2602,8 @@ SDValue SystemZTargetLowering::LowerOper
return lowerUDIVREM(Op, DAG);
case ISD::OR:
return lowerOR(Op, DAG);
+ case ISD::CTPOP:
+ return lowerCTPOP(Op, DAG);
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
Index: lib/Target/SystemZ/SystemZSubtarget.h
===================================================================
--- lib/Target/SystemZ/SystemZSubtarget.h.orig
+++ lib/Target/SystemZ/SystemZSubtarget.h
@@ -32,6 +32,7 @@ protected:
bool HasLoadStoreOnCond;
bool HasHighWord;
bool HasFPExtension;
+ bool HasPopulationCount;
bool HasFastSerialization;
bool HasInterlockedAccess1;
@@ -60,6 +61,9 @@ public:
// Return true if the target has the floating-point extension facility.
bool hasFPExtension() const { return HasFPExtension; }
+ // Return true if the target has the population-count facility.
+ bool hasPopulationCount() const { return HasPopulationCount; }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
Index: lib/Target/SystemZ/SystemZSubtarget.cpp
===================================================================
--- lib/Target/SystemZ/SystemZSubtarget.cpp.orig
+++ lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -26,7 +26,8 @@ SystemZSubtarget::SystemZSubtarget(const
const std::string &FS)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasFastSerialization(false), HasInterlockedAccess1(false),
+ HasPopulationCount(false), HasFastSerialization(false),
+ HasInterlockedAccess1(false),
TargetTriple(TT) {
std::string CPUName = CPU;
if (CPUName.empty())
Index: lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- lib/Target/SystemZ/SystemZOperators.td.orig
+++ lib/Target/SystemZ/SystemZOperators.td
@@ -121,6 +121,7 @@ def z_select_ccmask : SDNode<"System
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
def z_extract_access : SDNode<"SystemZISD::EXTRACT_ACCESS",
SDT_ZExtractAccess>;
+def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h.orig
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -87,6 +87,9 @@ enum {
// the number of the register.
EXTRACT_ACCESS,
+ // Count number of bits set in operand 0 per byte.
+ POPCNT,
+
// Wrappers around the ISD opcodes of the same name. The output and
// first input operands are GR128s. The trailing numbers are the
// widths of the second operand in bits.
@@ -291,6 +294,7 @@ private:
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,