File s390-long-loop-prediction-1 of Package gcc43

Index: gcc/config/s390/s390.c
===================================================================
--- gcc/config/s390/s390.c.orig	2009-11-20 13:52:36.000000000 +0100
+++ gcc/config/s390/s390.c	2009-11-20 13:52:37.000000000 +0100
@@ -356,6 +356,10 @@ struct machine_function GTY(())
 #define REGNO_PAIR_OK(REGNO, MODE)                               \
   (HARD_REGNO_NREGS ((REGNO), (MODE)) == 1 || !((REGNO) & 1))
 
+/* That's the read ahead of the dynamic branch prediction unit in
+   bytes on a z10 CPU.  */
+#define Z10_PREDICT_DISTANCE 384
+
 static enum machine_mode
 s390_libgcc_cmp_return_mode (void)
 {
@@ -9599,6 +9603,66 @@ s390_optimize_prologue (void)
     }
 }
 
+/* On z10 the dynamic branch prediction must see the backward jump in
+   a window of 384 bytes. If not, it falls back to the static
+   prediction.  This function rearranges the loop backward branch in a
+   way which makes the static prediction always correct.  The function
+   returns true if it added an instruction.  */
+static bool
+s390_z10_fix_long_loop_prediction (rtx insn)
+{
+  rtx set = single_set (insn);
+  rtx code_label, label_ref, new_label;
+  rtx uncond_jump;
+  rtx cur_insn;
+  rtx tmp;
+  int distance;
+
+  /* This will exclude branch on count and branch on index patterns
+     since these are correctly statically predicted.  */
+  if (!set
+      || SET_DEST (set) != pc_rtx
+      || GET_CODE (SET_SRC(set)) != IF_THEN_ELSE)
+    return false;
+
+  label_ref = (GET_CODE (XEXP (SET_SRC (set), 1)) == LABEL_REF ?
+	       XEXP (SET_SRC (set), 1) : XEXP (SET_SRC (set), 2));
+
+  gcc_assert (GET_CODE (label_ref) == LABEL_REF);
+
+  code_label = XEXP (label_ref, 0);
+
+  if (INSN_ADDRESSES (INSN_UID (code_label)) == -1
+      || INSN_ADDRESSES (INSN_UID (insn)) == -1
+      || (INSN_ADDRESSES (INSN_UID (insn))
+	  - INSN_ADDRESSES (INSN_UID (code_label)) < Z10_PREDICT_DISTANCE))
+    return false;
+
+  for (distance = 0, cur_insn = PREV_INSN (insn);
+       distance < Z10_PREDICT_DISTANCE - 6;
+       distance += get_attr_length (cur_insn), cur_insn = PREV_INSN (cur_insn))
+    if (!cur_insn || JUMP_P (cur_insn) || LABEL_P (cur_insn))
+      return false;
+
+  new_label = gen_label_rtx ();
+  uncond_jump = emit_jump_insn_after (
+		  gen_rtx_SET (VOIDmode, pc_rtx,
+			       gen_rtx_LABEL_REF (VOIDmode, code_label)),
+		  insn);
+  emit_label_after (new_label, uncond_jump);
+
+  tmp = XEXP (SET_SRC (set), 1);
+  XEXP (SET_SRC (set), 1) = XEXP (SET_SRC (set), 2);
+  XEXP (SET_SRC (set), 2) = tmp;
+  INSN_CODE (insn) = -1;
+
+  XEXP (label_ref, 0) = new_label;
+  JUMP_LABEL (insn) = new_label;
+  JUMP_LABEL (uncond_jump) = code_label;
+
+  return true;
+}
+
 /* Returns 1 if INSN reads the value of REG for purposes not related
    to addressing of memory, and 0 otherwise.  */
 static int
@@ -9681,97 +9745,87 @@ s390_swap_cmp (rtx cond, rtx *op0, rtx *
    if that register's value is delivered via a bypass, then the
    pipeline recycles, thereby causing significant performance decline.
    This function locates such situations and exchanges the two
-   operands of the compare.  */
-static void
-s390_z10_optimize_cmp (void)
+   operands of the compare.  The function returns true whenever it
+   added an insn.  */
+static bool
+s390_z10_optimize_cmp (rtx insn)
 {
-  rtx insn, prev_insn, next_insn;
-  int added_NOPs = 0;
+  rtx prev_insn, next_insn;
+  bool insn_added_p = false;
+  rtx cond, *op0, *op1;
 
-  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+  if (GET_CODE (PATTERN (insn)) == PARALLEL)
     {
-      rtx cond, *op0, *op1;
-
-      if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
-	continue;
-
-      if (GET_CODE (PATTERN (insn)) == PARALLEL)
-	{
-	  /* Handle compare and branch and branch on count
-	     instructions.  */
-	  rtx pattern = single_set (insn);
-
-	  if (!pattern
-	      || SET_DEST (pattern) != pc_rtx
-	      || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
-	    continue;
+      /* Handle compare and branch and branch on count
+	 instructions.  */
+      rtx pattern = single_set (insn);
+
+      if (!pattern
+	  || SET_DEST (pattern) != pc_rtx
+	  || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
+	return false;
 
-	  cond = XEXP (SET_SRC (pattern), 0);
-	  op0 = &XEXP (cond, 0);
-	  op1 = &XEXP (cond, 1);
-	}
-      else if (GET_CODE (PATTERN (insn)) == SET)
-	{
-	  rtx src, dest;
+      cond = XEXP (SET_SRC (pattern), 0);
+      op0 = &XEXP (cond, 0);
+      op1 = &XEXP (cond, 1);
+    }
+  else if (GET_CODE (PATTERN (insn)) == SET)
+    {
+      rtx src, dest;
 
-	  /* Handle normal compare instructions.  */
-	  src = SET_SRC (PATTERN (insn));
-	  dest = SET_DEST (PATTERN (insn));
+      /* Handle normal compare instructions.  */
+      src = SET_SRC (PATTERN (insn));
+      dest = SET_DEST (PATTERN (insn));
 
-	  if (!REG_P (dest)
-	      || !CC_REGNO_P (REGNO (dest))
-	      || GET_CODE (src) != COMPARE)
-	    continue;
+      if (!REG_P (dest)
+	  || !CC_REGNO_P (REGNO (dest))
+	  || GET_CODE (src) != COMPARE)
+	return false;
 
-	  /* s390_swap_cmp will try to find the conditional
-	     jump when passing NULL_RTX as condition.  */
-	  cond = NULL_RTX;
-	  op0 = &XEXP (src, 0);
-	  op1 = &XEXP (src, 1);
-	}
-      else
-	continue;
+      /* s390_swap_cmp will try to find the conditional
+	 jump when passing NULL_RTX as condition.  */
+      cond = NULL_RTX;
+      op0 = &XEXP (src, 0);
+      op1 = &XEXP (src, 1);
+    }
+  else
+    return false;
 
-      if (!REG_P (*op0) || !REG_P (*op1))
-	continue;
+  if (!REG_P (*op0) || !REG_P (*op1))
+    return false;
 
-      /* Swap the COMPARE arguments and its mask if there is a
-	 conflicting access in the previous insn.  */
-      prev_insn = PREV_INSN (insn);
+  /* Swap the COMPARE arguments and its mask if there is a
+     conflicting access in the previous insn.  */
+  prev_insn = PREV_INSN (insn);
+  if (prev_insn != NULL_RTX && INSN_P (prev_insn)
+      && reg_referenced_p (*op1, PATTERN (prev_insn)))
+    s390_swap_cmp (cond, op0, op1, insn);
+
+  /* Check if there is a conflict with the next insn. If there
+     was no conflict with the previous insn, then swap the
+     COMPARE arguments and its mask.  If we already swapped
+     the operands, or if swapping them would cause a conflict
+     with the previous insn, issue a NOP after the COMPARE in
+     order to separate the two instructions.  */
+  next_insn = NEXT_INSN (insn);
+  if (next_insn != NULL_RTX && INSN_P (next_insn)
+      && s390_non_addr_reg_read_p (*op1, next_insn))
+    {
       if (prev_insn != NULL_RTX && INSN_P (prev_insn)
-	  && reg_referenced_p (*op1, PATTERN (prev_insn)))
-	s390_swap_cmp (cond, op0, op1, insn);
-
-      /* Check if there is a conflict with the next insn. If there
-	 was no conflict with the previous insn, then swap the
-	 COMPARE arguments and its mask.  If we already swapped
-	 the operands, or if swapping them would cause a conflict
-	 with the previous insn, issue a NOP after the COMPARE in
-	 order to separate the two instuctions.  */
-      next_insn = NEXT_INSN (insn);
-      if (next_insn != NULL_RTX && INSN_P (next_insn)
-	  && s390_non_addr_reg_read_p (*op1, next_insn))
+	  && s390_non_addr_reg_read_p (*op0, prev_insn))
 	{
-	  if (prev_insn != NULL_RTX && INSN_P (prev_insn)
-	      && s390_non_addr_reg_read_p (*op0, prev_insn))
-	    {
-	      if (REGNO (*op1) == 0)
-		emit_insn_after (gen_nop1 (), insn);
-	      else
-		emit_insn_after (gen_nop (), insn);
-	      added_NOPs = 1;
-	    }
+	  if (REGNO (*op1) == 0)
+	    emit_insn_after (gen_nop1 (), insn);
 	  else
-	    s390_swap_cmp (cond, op0, op1, insn);
+	    emit_insn_after (gen_nop (), insn);
+	  insn_added_p = true;
 	}
+      else
+	s390_swap_cmp (cond, op0, op1, insn);
     }
-
-  /* Adjust branches if we added new instructions.  */
-  if (added_NOPs)
-    shorten_branches (get_insns ());
+  return insn_added_p;
 }
 
-
 /* Perform machine-dependent processing.  */
 
 static void
@@ -9885,10 +9939,33 @@ s390_reorg (void)
   /* Try to optimize prologue and epilogue further.  */
   s390_optimize_prologue ();
 
-  /* Eliminate z10-specific pipeline recycles related to some compare
-     instructions.  */
+  /* Walk over the insns and do some z10 specific changes.  */
   if (s390_tune == PROCESSOR_2097_Z10)
-    s390_z10_optimize_cmp ();
+    {
+      rtx insn;
+      bool insn_added_p = false;
+
+      /* The insn lengths and addresses have to be up to date for the
+	 following manipulations.  */
+      shorten_branches (get_insns ());
+
+      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+	{
+	  if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
+	    continue;
+
+	  if (JUMP_P (insn))
+	    insn_added_p |= s390_z10_fix_long_loop_prediction (insn);
+
+	  if (GET_CODE (PATTERN (insn)) == PARALLEL
+	      || GET_CODE (PATTERN (insn)) == SET)
+	    insn_added_p |= s390_z10_optimize_cmp (insn);
+	}
+
+      /* Adjust branches if we added new instructions.  */
+      if (insn_added_p)
+	shorten_branches (get_insns ());
+    }
 }
 
 
Index: gcc/config/s390/s390.md
===================================================================
--- gcc/config/s390/s390.md.orig	2009-11-20 13:52:34.000000000 +0100
+++ gcc/config/s390/s390.md	2009-11-20 13:52:37.000000000 +0100
@@ -1073,6 +1073,64 @@
                       (const_int 6) (const_int 12)))]) ; 8 byte for clr/jg
                                                        ; 10 byte for clgr/jg
 
+; And now the same two patterns as above but with a negated CC mask.
+
+; cij, cgij, crj, cgrj, cfi, cgfi, cr, cgr
+; The following instructions do a complementary access of their second
+; operand (z10 only): crj, cgrj, cr, cgr
+(define_insn "*icmp_and_br_signed_<mode>"
+  [(set (pc)
+	(if_then_else (match_operator 0 "s390_signed_integer_comparison"
+			[(match_operand:GPR 1 "register_operand"  "d,d")
+			 (match_operand:GPR 2 "nonmemory_operand" "d,C")])
+		      (pc)
+		      (label_ref (match_operand 3 "" ""))))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_Z10"
+{
+  if (get_attr_length (insn) == 6)
+    return which_alternative ?
+      "c<g>ij%D0\t%1,%c2,%l3" : "c<g>rj%D0\t%1,%2,%l3";
+  else
+    return which_alternative ?
+      "c<g>fi\t%1,%c2\;jg%D0\t%l3" : "c<g>r\t%1,%2\;jg%D0\t%l3";
+}
+  [(set_attr "op_type" "RIE")
+   (set_attr "type"    "branch")
+   (set_attr "z10prop" "z10_super_c,z10_super")
+   (set (attr "length")
+        (if_then_else (lt (abs (minus (pc) (match_dup 3))) (const_int 60000))
+                      (const_int 6) (const_int 12)))]) ; 8 byte for cr/jg
+                                                       ; 10 byte for cgr/jg
+
+; clij, clgij, clrj, clgrj, clfi, clgfi, clr, clgr
+; The following instructions do a complementary access of their second
+; operand (z10 only): clrj, clgrj, clr, clgr
+(define_insn "*icmp_and_br_unsigned_<mode>"
+  [(set (pc)
+	(if_then_else (match_operator 0 "s390_unsigned_integer_comparison"
+			[(match_operand:GPR 1 "register_operand"  "d,d")
+			 (match_operand:GPR 2 "nonmemory_operand" "d,I")])
+		      (pc)
+		      (label_ref (match_operand 3 "" ""))))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_Z10"
+{
+  if (get_attr_length (insn) == 6)
+    return which_alternative ?
+      "cl<g>ij%D0\t%1,%b2,%l3" : "cl<g>rj%D0\t%1,%2,%l3";
+  else
+    return which_alternative ?
+      "cl<g>fi\t%1,%b2\;jg%D0\t%l3" : "cl<g>r\t%1,%2\;jg%D0\t%l3";
+}
+  [(set_attr "op_type" "RIE")
+   (set_attr "type"    "branch")
+   (set_attr "z10prop" "z10_super_c,z10_super")
+   (set (attr "length")
+        (if_then_else (lt (abs (minus (pc) (match_dup 3))) (const_int 60000))
+                      (const_int 6) (const_int 12)))]) ; 8 byte for clr/jg
+                                                       ; 10 byte for clgr/jg
+
 ;;
 ;;- Move instructions.
 ;;