File backport-llvm-r229654 of Package llvm
------------------------------------------------------------------------
r229654 | uweigand | 2015-02-18 10:13:27 +0100 (Wed, 18 Feb 2015) | 21 lines
[SystemZ] Support all TLS access models - CodeGen part
The current SystemZ back-end only supports the local-exec TLS access model.
This patch adds all required CodeGen support for the other TLS models, which
means in particular:
- Expand initial-exec TLS accesses by loading TLS offsets from the GOT
  using @indntpoff relocations.
- Expand general-dynamic and local-dynamic accesses by generating the
  appropriate calls to __tls_get_offset.  Note that this routine has
  a non-standard ABI and requires loading the GOT pointer into %r12,
  so the patch also adds support for the GLOBAL_OFFSET_TABLE ISD node.
- Add a new platform-specific optimization pass to remove redundant
  __tls_get_offset calls in the local-dynamic model (modeled after
  the corresponding X86 pass).
- Add test cases verifying all access models and optimizations.
------------------------------------------------------------------------
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h.orig
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -34,6 +34,11 @@ enum {
   CALL,
   SIBCALL,
 
+  // TLS calls.  Like regular calls, except operand 1 is the TLS symbol.
+  // (The call target is implicitly __tls_get_offset.)
+  TLS_GDCALL,
+  TLS_LDCALL,
+
   // Wraps a TargetGlobalAddress that should be loaded using PC-relative
   // accesses (LARL).  Operand 0 is the address.
   PCREL_WRAPPER,
@@ -266,6 +271,9 @@ private:
   SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
                              SelectionDAG &DAG) const;
+  SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                            SelectionDAG &DAG, unsigned Opcode,
+                            SDValue GOTOffset) const;
   SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                 SelectionDAG &DAG) const;
   SDValue lowerBlockAddress(BlockAddressSDNode *Node,
Index: lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetMachine.cpp.orig
+++ lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -61,6 +61,10 @@ void SystemZPassConfig::addIRPasses() {
 
 bool SystemZPassConfig::addInstSelector() {
   addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
+
+ if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZLDCleanupPass(getSystemZTargetMachine()));
+
   return false;
 }
 
Index: lib/Target/SystemZ/SystemZConstantPoolValue.h
===================================================================
--- lib/Target/SystemZ/SystemZConstantPoolValue.h.orig
+++ lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -19,13 +19,17 @@ class GlobalValue;
 
 namespace SystemZCP {
 enum SystemZCPModifier {
+  TLSGD,
+  TLSLDM,
+  DTPOFF,
   NTPOFF
 };
 } // end namespace SystemZCP
 
 /// A SystemZ-specific constant pool value.  At present, the only
-/// defined constant pool values are offsets of thread-local variables
-/// (written x@NTPOFF).
+/// defined constant pool values are module IDs or offsets of
+/// thread-local variables (written x@TLSGD, x@TLSLDM, x@DTPOFF,
+/// or x@NTPOFF).
 class SystemZConstantPoolValue : public MachineConstantPoolValue {
   const GlobalValue *GV;
   SystemZCP::SystemZCPModifier Modifier;
Index: lib/Target/SystemZ/SystemZMachineFunctionInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZMachineFunctionInfo.h.orig
+++ lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -23,11 +23,13 @@ class SystemZMachineFunctionInfo : publi
   unsigned VarArgsFrameIndex;
   unsigned RegSaveFrameIndex;
   bool ManipulatesSP;
+  unsigned NumLocalDynamics;
 
 public:
   explicit SystemZMachineFunctionInfo(MachineFunction &MF)
     : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
-      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false) {}
+      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false),
+      NumLocalDynamics(0) {}
 
   // Get and set the first call-saved GPR that should be saved and restored
   // by this function.  This is 0 if no GPRs need to be saved or restored.
@@ -61,6 +63,10 @@ public:
   // e.g. through STACKSAVE or STACKRESTORE.
   bool getManipulatesSP() const { return ManipulatesSP; }
   void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+
+  // Count number of local-dynamic TLS symbols used.
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 };
 
 } // end namespace llvm
Index: lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.td.orig
+++ lib/Target/SystemZ/SystemZInstrInfo.td
@@ -249,6 +249,15 @@ let isCall = 1, isTerminator = 1, isRetu
     def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
 }
 
+// TLS calls.  These will be lowered into a call to __tls_get_offset,
+// with an extra relocation specifying the TLS symbol.
+let isCall = 1, Defs = [R14D, CC] in {
+  def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
+  def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
+}
+
 // Define the general form of the call instructions for the asm parser.
 // These instructions don't hard-code %r14 as the return address register.
 // Allow an optional TLS marker symbol to generate TLS call relocations.
@@ -588,6 +597,12 @@ let neverHasSideEffects = 1, isAsCheapAs
                      [(set GR64:$R1, pcrel32:$I2)]>;
 }
 
+// Load the Global Offset Table address.  This will be lowered into a
+//     larl $R1, _GLOBAL_OFFSET_TABLE_
+// instruction.
+def GOT : Alias<6, (outs GR64:$R1), (ins),
+                [(set GR64:$R1, (global_offset_table))]>;
+
 //===----------------------------------------------------------------------===//
 // Absolute and Negation
 //===----------------------------------------------------------------------===//
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp.orig
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1831,6 +1831,52 @@ SDValue SystemZTargetLowering::lowerGlob
   return Result;
 }
 
+SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                                                 SelectionDAG &DAG,
+                                                 unsigned Opcode,
+                                                 SDValue GOTOffset) const {
+  SDLoc DL(Node);
+  EVT PtrVT = getPointerTy();
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Glue;
+
+  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
+  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
+  Glue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
+  Glue = Chain.getValue(1);
+
+  // The first call operand is the chain and the second is the TLS symbol.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
+                                           Node->getValueType(0),
+                                           0, 0));
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
+  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallingConv::C);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies.
+  Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(Opcode, DL, NodeTys, &Ops[0], Ops.size());
+  Glue = Chain.getValue(1);
+
+  // Copy the return value from %r2.
+  return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
+}
+
 SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
 						     SelectionDAG &DAG) const {
   SDLoc DL(Node);
@@ -1838,9 +1884,6 @@ SDValue SystemZTargetLowering::lowerGlob
   EVT PtrVT = getPointerTy();
   TLSModel::Model model = TM.getTLSModel(GV);
 
-  if (model != TLSModel::LocalExec)
-    llvm_unreachable("only local-exec TLS mode supported");
-
   // The high part of the thread pointer is in access register 0.
   SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
                              DAG.getConstant(0, MVT::i32));
@@ -1856,15 +1899,82 @@ SDValue SystemZTargetLowering::lowerGlob
 				    DAG.getConstant(32, PtrVT));
   SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
 
-  // Get the offset of GA from the thread pointer.
-  SystemZConstantPoolValue *CPV =
-    SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
-
-  // Force the offset into the constant pool and load it from there.
-  SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8);
-  SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-			       CPAddr, MachinePointerInfo::getConstantPool(),
-			       false, false, false, 0);
+  // Get the offset of GA from the thread pointer, based on the TLS model.
+  SDValue Offset;
+  switch (model) {
+    case TLSModel::GeneralDynamic: {
+      // Load the GOT offset of the tls_index (module ID / per-symbol offset).
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
+      break;
+    }
+
+    case TLSModel::LocalDynamic: {
+      // Load the GOT offset of the module ID.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the module base offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
+
+      // Note: The SystemZLDCleanupPass will remove redundant computations
+      // of the module base offset.  Count total number of local-dynamic
+      // accesses to trigger execution of that pass.
+      SystemZMachineFunctionInfo* MFI =
+        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+      MFI->incNumLocalDynamicTLSAccesses();
+
+      // Add the per-symbol offset.
+      CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
+
+      SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+      DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                              DTPOffset, MachinePointerInfo::getConstantPool(),
+                              false, false, false, 0);
+
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
+      break;
+    }
+
+    case TLSModel::InitialExec: {
+      // Load the offset from the GOT.
+      Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                          SystemZII::MO_INDNTPOFF);
+      Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+      break;
+    }
+
+    case TLSModel::LocalExec: {
+      // Force the offset into the constant pool and load it from there.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      break;
+   }
+
+    default:
+      llvm_unreachable("Unknown TLS model.");
+  }
 
   // Add the base and offset together.
   return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
Index: lib/Target/SystemZ/SystemZMCInstLower.cpp
===================================================================
--- lib/Target/SystemZ/SystemZMCInstLower.cpp.orig
+++ lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -22,6 +22,8 @@ static MCSymbolRefExpr::VariantKind getV
       return MCSymbolRefExpr::VK_None;
     case SystemZII::MO_GOT:
       return MCSymbolRefExpr::VK_GOT;
+    case SystemZII::MO_INDNTPOFF:
+      return MCSymbolRefExpr::VK_INDNTPOFF;
   }
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
Index: lib/Target/SystemZ/SystemZLDCleanup.cpp
===================================================================
--- /dev/null
+++ lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -0,0 +1,145 @@
+//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines multiple accesses to local-dynamic TLS variables so that
+// the TLS base address for the module is only fetched once per execution path
+// through the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZLDCleanup : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZLDCleanup(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(0), MF(0) {}
+
+  virtual const char *getPassName() const LLVM_OVERRIDE {
+    return "SystemZ Local Dynamic TLS Access Clean-up";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) LLVM_OVERRIDE;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const LLVM_OVERRIDE;
+
+private:
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg);
+  MachineInstr *ReplaceTLSCall(MachineInstr *I, unsigned TLSBaseAddrReg);
+  MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg);
+
+  const SystemZInstrInfo *TII;
+  MachineFunction *MF;
+};
+
+char SystemZLDCleanup::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
+  return new SystemZLDCleanup(TM);
+}
+
+void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  MF = &F;
+
+  SystemZMachineFunctionInfo* MFI = F.getInfo<SystemZMachineFunctionInfo>();
+  if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+    // No point folding accesses if there isn't at least two.
+    return false;
+  }
+
+  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+  return VisitNode(DT->getRootNode(), 0);
+}
+
+// Visit the dominator subtree rooted at Node in pre-order.
+// If TLSBaseAddrReg is non-null, then use that to replace any
+// TLS_LDCALL instructions. Otherwise, create the register
+// when the first such instruction is seen, and then use it
+// as we encounter more instructions.
+bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
+                                 unsigned TLSBaseAddrReg) {
+  MachineBasicBlock *BB = Node->getBlock();
+  bool Changed = false;
+
+  // Traverse the current block.
+  for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+       I != E; ++I) {
+    switch (I->getOpcode()) {
+      case SystemZ::TLS_LDCALL:
+        if (TLSBaseAddrReg)
+          I = ReplaceTLSCall(I, TLSBaseAddrReg);
+        else
+          I = SetRegister(I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Visit the children of this block in the dominator tree.
+  for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+       I != E; ++I)
+    Changed |= VisitNode(*I, TLSBaseAddrReg);
+
+  return Changed;
+}
+
+// Replace the TLS_LDCALL instruction I with a copy from TLSBaseAddrReg,
+// returning the new instruction.
+MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
+                                               unsigned TLSBaseAddrReg) {
+  // Insert a Copy from TLSBaseAddrReg to R2.
+  MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), SystemZ::R2D)
+                               .addReg(TLSBaseAddrReg);
+
+  // Erase the TLS_LDCALL instruction.
+  I->eraseFromParent();
+
+  return Copy;
+}
+
+// Create a virtal register in *TLSBaseAddrReg, and populate it by
+// inserting a copy instruction after I. Returns the new instruction.
+MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
+                                            unsigned *TLSBaseAddrReg) {
+  // Create a virtual register for the TLS base address.
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  *TLSBaseAddrReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+  // Insert a copy from R2 to TLSBaseAddrReg.
+  MachineInstr *Next = I->getNextNode();
+  MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+                               .addReg(SystemZ::R2D);
+
+  return Copy;
+}
+
Index: lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.h.orig
+++ lib/Target/SystemZ/SystemZInstrInfo.h
@@ -56,10 +56,13 @@ static inline unsigned getCompareZeroCCM
 // SystemZ MachineOperand target flags.
 enum {
   // Masks out the bits for the access model.
-  MO_SYMBOL_MODIFIER = (1 << 0),
+  MO_SYMBOL_MODIFIER = (3 << 0),
 
   // @GOT (aka @GOTENT)
-  MO_GOT = (1 << 0)
+  MO_GOT = (1 << 0),
+
+  // @INDNTPOFF
+  MO_INDNTPOFF = (2 << 0)
 };
 // Classifies a branch.
 enum BranchType {
Index: lib/Target/SystemZ/SystemZConstantPoolValue.cpp
===================================================================
--- lib/Target/SystemZ/SystemZConstantPoolValue.cpp.orig
+++ lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -28,6 +28,11 @@ SystemZConstantPoolValue::Create(const G
 
 unsigned SystemZConstantPoolValue::getRelocationInfo() const {
   switch (Modifier) {
+  case SystemZCP::TLSGD:
+  case SystemZCP::TLSLDM:
+  case SystemZCP::DTPOFF:
+    // May require a dynamic relocation.
+    return 2;
   case SystemZCP::NTPOFF:
     // May require a relocation, but the relocations are always resolved
     // by the static linker.
Index: lib/Target/SystemZ/SystemZAsmPrinter.cpp
===================================================================
--- lib/Target/SystemZ/SystemZAsmPrinter.cpp.orig
+++ lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -66,6 +66,20 @@ static MCInst lowerRIEfLow(const Machine
     .addImm(MI->getOperand(5).getImm());
 }
 
+static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
+  StringRef Name = "__tls_get_offset";
+  return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_PLT,
+                                 Context);
+}
+
+static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
+  StringRef Name = "_GLOBAL_OFFSET_TABLE_";
+  return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_None,
+                                 Context);
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
@@ -95,6 +109,26 @@ void SystemZAsmPrinter::EmitInstruction(
     LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
     break;
 
+  case SystemZ::TLS_GDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSGD));
+    break;
+
+  case SystemZ::TLS_LDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSLDM));
+    break;
+
+  case SystemZ::GOT:
+    LoweredMI = MCInstBuilder(SystemZ::LARL)
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(getGlobalOffsetTable(MF->getContext()));
+    break;
+
   case SystemZ::IILF64:
     LoweredMI = MCInstBuilder(SystemZ::IILF)
       .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
@@ -172,6 +206,9 @@ void SystemZAsmPrinter::EmitInstruction(
 static MCSymbolRefExpr::VariantKind
 getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
   switch (Modifier) {
+  case SystemZCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
+  case SystemZCP::TLSLDM: return MCSymbolRefExpr::VK_TLSLDM;
+  case SystemZCP::DTPOFF: return MCSymbolRefExpr::VK_DTPOFF;
   case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
   }
   llvm_unreachable("Invalid SystemCPModifier!");
Index: lib/Target/SystemZ/SystemZ.h
===================================================================
--- lib/Target/SystemZ/SystemZ.h.orig
+++ lib/Target/SystemZ/SystemZ.h
@@ -111,6 +111,7 @@ FunctionPass *createSystemZISelDag(Syste
 FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
 } // end namespace llvm
 
 #endif
Index: lib/Target/SystemZ/CMakeLists.txt
===================================================================
--- lib/Target/SystemZ/CMakeLists.txt.orig
+++ lib/Target/SystemZ/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_target(SystemZCodeGen
   SystemZISelDAGToDAG.cpp
   SystemZISelLowering.cpp
   SystemZInstrInfo.cpp
+  SystemZLDCleanup.cpp
   SystemZLongBranch.cpp
   SystemZMachineFunctionInfo.cpp
   SystemZMCInstLower.cpp
Index: lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- lib/Target/SystemZ/SystemZOperators.td.orig
+++ lib/Target/SystemZ/SystemZOperators.td
@@ -90,6 +90,7 @@ def callseq_start       : SDNode<"ISD::C
 def callseq_end         : SDNode<"ISD::CALLSEQ_END",   SDT_CallSeqEnd,
                                  [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
                                   SDNPOutGlue]>;
+def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>;
 
 // Nodes for SystemZISD::*.  See SystemZISelLowering.h for more details.
 def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
@@ -100,6 +101,12 @@ def z_call              : SDNode<"System
 def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
                                  [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                                   SDNPVariadic]>;
+def z_tls_gdcall        : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
+def z_tls_ldcall        : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
 def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
 def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
                                  SDT_ZWrapOffset, []>;
Index: test/CodeGen/SystemZ/tls-06.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-06.ll
@@ -0,0 +1,17 @@
+; Test general-dynamic TLS access optimizations.
+;
+; If we access two different TLS variables, we need two calls to
+; __tls_get_offset, but should load _GLOBAL_OFFSET_TABLE only once.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 2
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "_GLOBAL_OFFSET_TABLE_" | count 1
+
+@x = thread_local global i32 0
+@y = thread_local global i32 0
+
+define i32 @foo() {
+  %valx = load i32* @x
+  %valy = load i32* @y
+  %add = add nsw i32 %valx, %valy
+  ret i32 %add
+}
Index: test/CodeGen/SystemZ/tls-01.ll
===================================================================
--- test/CodeGen/SystemZ/tls-01.ll.orig
+++ test/CodeGen/SystemZ/tls-01.ll
@@ -1,7 +1,7 @@
-; Test initial-exec TLS accesses.
+; Test local-exec TLS accesses.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP
 
 @x = thread_local global i32 0
 
Index: test/CodeGen/SystemZ/tls-07.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-07.ll
@@ -0,0 +1,16 @@
+; Test local-dynamic TLS access optimizations.
+;
+; If we access two different local-dynamic TLS variables, we only
+; need a single call to __tls_get_offset.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
+
+@x = thread_local(localdynamic) global i32 0
+@y = thread_local(localdynamic) global i32 0
+
+define i32 @foo() {
+  %valx = load i32* @x
+  %valy = load i32* @y
+  %add = add nsw i32 %valx, %valy
+  ret i32 %add
+}
Index: test/CodeGen/SystemZ/tls-02.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-02.ll
@@ -0,0 +1,18 @@
+; Test initial-exec TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+
+@x = thread_local(initialexec) global i32 0
+
+; The offset must be loaded from the GOT.  This TLS access model does
+; not use literal pool constants.
+define i32 *@foo() {
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg %r2, [[HIGH]], 32
+; CHECK-MAIN: ear %r2, %a1
+; CHECK-MAIN: larl %r1, x@INDNTPOFF
+; CHECK-MAIN: ag %r2, 0(%r1)
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
Index: test/CodeGen/SystemZ/tls-03.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-03.ll
@@ -0,0 +1,23 @@
+; Test general-dynamic TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP
+
+@x = thread_local global i32 0
+
+; Call __tls_get_offset to retrieve the symbol's TLS offset.
+define i32 *@foo() {
+; CHECK-CP: .LCP{{.*}}:
+; CHECK-CP: .quad x@TLSGD
+;
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_
+; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}}
+; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_gdcall:x
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32
+; CHECK-MAIN: ear [[TP]], %a1
+; CHECK-MAIN: agr %r2, [[TP]]
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
Index: test/CodeGen/SystemZ/tls-04.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-04.ll
@@ -0,0 +1,28 @@
+; Test local-dynamic TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP
+
+@x = thread_local(localdynamic) global i32 0
+
+; Call __tls_get_offset to retrieve the module's TLS base offset.
+; Add the per-symbol offset and the thread pointer.
+define i32 *@foo() {
+; CHECK-CP: .LCP{{.*}}_0:
+; CHECK-CP: .quad x@TLSLDM
+; CHECK-CP: .LCP{{.*}}_1:
+; CHECK-CP: .quad x@DTPOFF
+;
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_
+; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}}_0
+; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_ldcall:x
+; CHECK-MAIN: larl %r1, .LCP{{.*}}_1
+; CHECK-MAIN: ag %r2, 0(%r1)
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32
+; CHECK-MAIN: ear [[TP]], %a1
+; CHECK-MAIN: agr %r2, [[TP]]
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
Index: test/CodeGen/SystemZ/tls-05.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/tls-05.ll
@@ -0,0 +1,15 @@
+; Test general-dynamic TLS access optimizations.
+;
+; If we access the same TLS variable twice, there should only be
+; a single call to __tls_get_offset.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
+
+@x = thread_local global i32 0
+
+define i32 @foo() {
+  %val = load i32* @x
+  %inc = add nsw i32 %val, 1
+  store i32 %inc, i32* @x
+  ret i32 %val
+}