File ibm-cell-split of Package cross-spu-gcc-static
2008-08-27  Trevor Smigiel <Trevor_Smigiel@playstation.sony.com>
	
	Improve code generated for loads and stores on SPU.
	* doc/tm.texi (SPLIT_BEFORE_CSE2) : Document.
	* tree-pass.h (pass_split_before_cse2) : Declare.
	* final.c (rest_of_clean_state) : Initialize split0_completed.
	* recog.c (split0_completed) : Define.
	(gate_handle_split_before_cse2, rest_of_handle_split_before_cse2) :
	New functions.
	(pass_split_before_cse2) : New pass.
	* rtl.h (split0_completed) : Declare.
	* passes.c (init_optimization_passes) : Add pass_split_before_cse2
	before pass_cse2.
	* config/spu/spu-protos.h (spu_legitimate_address) : Add
	for_split argument.
	(aligned_mem_p, spu_valid_move) : Remove prototypes.
	(spu_split_load, spu_split_store) : Change return type to int.
	* config/spu/predicates.md (spu_mem_operand) : Remove.
	(spu_dest_operand) : Add.
	* config/spu/spu-builtins.md (spu_lqd, spu_lqx, spu_lqa,
	spu_lqr, spu_stqd, spu_stqx, spu_stqa, spu_stqr) : Remove AND
	operation.
	* config/spu/spu.c (regno_aligned_for_load) : Remove.
	(reg_aligned_for_addr, address_needs_split) : New functions.
	(spu_legitimate_address, spu_expand_mov, spu_split_load,
	spu_split_store) : Update.
	(spu_init_expanders) : Pregenerate a couple of pseudo-registers.
	* config/spu/spu.h (REG_ALIGN, SPLIT_BEFORE_CSE2) : Define.
	(GO_IF_LEGITIMATE_ADDRESS) : Update for spu_legitimate_address.
	* config/spu/spu.md ("_mov<mode>", "_movdi", "_movti") : Update
	predicates.
	("load", "store") : Change to define_split.
testsuite/
	* testsuite/gcc.target/spu/split0-1.c : Add test.
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/predicates.md gcc-4_3/gcc/config/spu/predicates.md
*** gcc-4_3-orig/gcc/config/spu/predicates.md	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/predicates.md	2008-09-10 20:09:59.000000000 +0200
***************
*** 39,52 ****
         (ior (not (match_code "subreg"))
              (match_test "valid_subreg (op)"))))
  
- (define_predicate "spu_mem_operand"
-   (and (match_operand 0 "memory_operand")
-        (match_test "reload_in_progress || reload_completed || aligned_mem_p (op)")))
- 
  (define_predicate "spu_mov_operand"
!   (ior (match_operand 0 "spu_mem_operand")
         (match_operand 0 "spu_nonmem_operand")))
  
  (define_predicate "call_operand"
    (and (match_code "mem")
         (match_test "(!TARGET_LARGE_MEM && satisfies_constraint_S (op))
--- 39,52 ----
         (ior (not (match_code "subreg"))
              (match_test "valid_subreg (op)"))))
  
  (define_predicate "spu_mov_operand"
!   (ior (match_operand 0 "memory_operand")
         (match_operand 0 "spu_nonmem_operand")))
  
+ (define_predicate "spu_dest_operand"
+   (ior (match_operand 0 "memory_operand")
+        (match_operand 0 "spu_reg_operand")))
+ 
  (define_predicate "call_operand"
    (and (match_code "mem")
         (match_test "(!TARGET_LARGE_MEM && satisfies_constraint_S (op))
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/spu-builtins.md gcc-4_3/gcc/config/spu/spu-builtins.md
*** gcc-4_3-orig/gcc/config/spu/spu-builtins.md	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/spu-builtins.md	2008-09-10 20:09:59.000000000 +0200
***************
*** 23,31 ****
  
  (define_expand "spu_lqd"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (and:SI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 				 (match_operand:SI 2 "spu_nonmem_operand" ""))
! 		        (const_int -16))))]
    ""
    {
      if (GET_CODE (operands[2]) == CONST_INT
--- 23,30 ----
  
  (define_expand "spu_lqd"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 			 (match_operand:SI 2 "spu_nonmem_operand" ""))))]
    ""
    {
      if (GET_CODE (operands[2]) == CONST_INT
***************
*** 42,57 ****
  
  (define_expand "spu_lqx"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (and:SI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
!                                  (match_operand:SI 2 "spu_reg_operand" ""))
!                         (const_int -16))))]
    ""
    "")
  
  (define_expand "spu_lqa"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (and:SI (match_operand:SI 1 "immediate_operand" "")
!                         (const_int -16))))]
    ""
    {
      if (GET_CODE (operands[1]) == CONST_INT
--- 41,54 ----
  
  (define_expand "spu_lqx"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 			 (match_operand:SI 2 "spu_reg_operand" ""))))]
    ""
    "")
  
  (define_expand "spu_lqa"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
!         (mem:TI (match_operand:SI 1 "immediate_operand" "")))]
    ""
    {
      if (GET_CODE (operands[1]) == CONST_INT
***************
*** 61,75 ****
  
  (define_expand "spu_lqr"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
! 	(mem:TI (and:SI (match_operand:SI 1 "address_operand" "")
! 			(const_int -16))))]
    ""
    "")
  
  (define_expand "spu_stqd"
!   [(set (mem:TI (and:SI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 				 (match_operand:SI 2 "spu_nonmem_operand" ""))
! 		        (const_int -16)))
          (match_operand:TI 0 "spu_reg_operand" "r,r"))]
    ""
    {
--- 58,70 ----
  
  (define_expand "spu_lqr"
    [(set (match_operand:TI 0 "spu_reg_operand" "")
! 	(mem:TI (match_operand:SI 1 "address_operand" "")))]
    ""
    "")
  
  (define_expand "spu_stqd"
!   [(set (mem:TI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 			 (match_operand:SI 2 "spu_nonmem_operand" "")))
          (match_operand:TI 0 "spu_reg_operand" "r,r"))]
    ""
    {
***************
*** 86,101 ****
    })
  
  (define_expand "spu_stqx"
!   [(set (mem:TI (and:SI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 				 (match_operand:SI 2 "spu_reg_operand" ""))
! 		        (const_int -16)))
          (match_operand:TI 0 "spu_reg_operand" "r"))]
    ""
    "")
  
  (define_expand "spu_stqa"
!   [(set (mem:TI (and:SI (match_operand:SI 1 "immediate_operand" "")
! 			(const_int -16)))
          (match_operand:TI 0 "spu_reg_operand" "r"))]
    ""
    {
--- 81,94 ----
    })
  
  (define_expand "spu_stqx"
!   [(set (mem:TI (plus:SI (match_operand:SI 1 "spu_reg_operand" "")
! 			 (match_operand:SI 2 "spu_reg_operand" "")))
          (match_operand:TI 0 "spu_reg_operand" "r"))]
    ""
    "")
  
  (define_expand "spu_stqa"
!   [(set (mem:TI (match_operand:SI 1 "immediate_operand" ""))
          (match_operand:TI 0 "spu_reg_operand" "r"))]
    ""
    {
***************
*** 105,112 ****
    })
  
  (define_expand "spu_stqr"
!     [(set (mem:TI (and:SI (match_operand:SI 1 "address_operand" "")
! 			  (const_int -16)))
  	  (match_operand:TI 0 "spu_reg_operand" ""))]
    ""
    "")
--- 98,104 ----
    })
  
  (define_expand "spu_stqr"
!     [(set (mem:TI (match_operand:SI 1 "address_operand" ""))
  	  (match_operand:TI 0 "spu_reg_operand" ""))]
    ""
    "")
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/spu.c gcc-4_3/gcc/config/spu/spu.c
*** gcc-4_3-orig/gcc/config/spu/spu.c	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/spu.c	2008-09-11 13:45:00.000000000 +0200
*************** static tree spu_build_builtin_va_list (v
*** 120,128 ****
  static void spu_va_start (tree, rtx);
  static tree spu_gimplify_va_arg_expr (tree valist, tree type, tree * pre_p,
  				      tree * post_p);
- static int regno_aligned_for_load (int regno);
  static int store_with_one_insn_p (rtx mem);
  static int mem_is_padded_component_ref (rtx x);
  static bool spu_assemble_integer (rtx x, unsigned int size, int aligned_p);
  static void spu_asm_globalize_label (FILE * file, const char *name);
  static unsigned char spu_rtx_costs (rtx x, int code, int outer_code,
--- 120,128 ----
  static void spu_va_start (tree, rtx);
  static tree spu_gimplify_va_arg_expr (tree valist, tree type, tree * pre_p,
  				      tree * post_p);
  static int store_with_one_insn_p (rtx mem);
  static int mem_is_padded_component_ref (rtx x);
+ static int reg_aligned_for_addr (rtx x, int aligned);
  static bool spu_assemble_integer (rtx x, unsigned int size, int aligned_p);
  static void spu_asm_globalize_label (FILE * file, const char *name);
  static unsigned char spu_rtx_costs (rtx x, int code, int outer_code,
*************** spu_legitimate_constant_p (rtx x)
*** 2857,2880 ****
  /* Valid address are:
     - symbol_ref, label_ref, const
     - reg
!    - reg + const, where either reg or const is 16 byte aligned
     - reg + reg, alignment doesn't matter
    The alignment matters in the reg+const case because lqd and stqd
!   ignore the 4 least significant bits of the const.  (TODO: It might be
!   preferable to allow any alignment and fix it up when splitting.) */
  int
! spu_legitimate_address (enum machine_mode mode ATTRIBUTE_UNUSED,
! 			rtx x, int reg_ok_strict)
  {
!   if (mode == TImode && GET_CODE (x) == AND
!       && GET_CODE (XEXP (x, 1)) == CONST_INT
!       && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) -16)
      x = XEXP (x, 0);
    switch (GET_CODE (x))
      {
-     case SYMBOL_REF:
      case LABEL_REF:
!       return !TARGET_LARGE_MEM;
  
      case CONST:
        if (!TARGET_LARGE_MEM && GET_CODE (XEXP (x, 0)) == PLUS)
--- 2857,2908 ----
  /* Valid address are:
     - symbol_ref, label_ref, const
     - reg
!    - reg + const, where const is 16 byte aligned
     - reg + reg, alignment doesn't matter
    The alignment matters in the reg+const case because lqd and stqd
!   ignore the 4 least significant bits of the const.  
! 
!   Addresses are handled in 4 phases. 
!   1) from the beginning of rtl expansion until the split0 pass.  Any
!      address is acceptable.  
!   2) The split0 pass. It is responsible for making every load and store
!      valid.  It calls legitimate_address with FOR_SPLIT set to 1.  This
!      is where non-16-byte aligned loads/stores are split into multiple
!      instructions to extract or insert just the part we care about.
!   3) From the split0 pass to the beginning of reload.  During this
!      phase the constant part of an address must be 16 byte aligned, and
!      we don't allow any loads/stores of less than 4 bytes.  We also
!      allow a mask of -16 to be part of the address as an optimization.
!   4) From reload until the end.  Reload can change the modes of loads
!      and stores to something smaller than 4-bytes which we need to allow
!      now, and it also adjusts the address to match.  So in this phase we
!      allow that special case.  Still allow addresses with a mask of -16.
! 
!   FOR_SPLIT is only set to 1 for phase 2, otherwise it is 0.  */
  int
! spu_legitimate_address (enum machine_mode mode, rtx x, int reg_ok_strict,
! 			int for_split)
  {
!   int aligned = (split0_completed || for_split)
!     && !reload_in_progress && !reload_completed;
!   int const_aligned = split0_completed || for_split;
!   if (GET_MODE_SIZE (mode) >= 16)
!     aligned = 0;
!   else if (aligned && GET_MODE_SIZE (mode) < 4)
!     return 0;
!   if (split0_completed
!       && (GET_CODE (x) == AND
! 	  && GET_CODE (XEXP (x, 1)) == CONST_INT
! 	  && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) - 16
! 	  && !CONSTANT_P (XEXP (x, 0))))
      x = XEXP (x, 0);
    switch (GET_CODE (x))
      {
      case LABEL_REF:
!       return !TARGET_LARGE_MEM && !aligned;
! 
!     case SYMBOL_REF:
!       return !TARGET_LARGE_MEM && (!aligned || ALIGNED_SYMBOL_REF_P (x));
  
      case CONST:
        if (!TARGET_LARGE_MEM && GET_CODE (XEXP (x, 0)) == PLUS)
*************** spu_legitimate_address (enum machine_mod
*** 2882,2903 ****
  	  rtx sym = XEXP (XEXP (x, 0), 0);
  	  rtx cst = XEXP (XEXP (x, 0), 1);
  
- 	  /* Accept any symbol_ref + constant, assuming it does not
- 	     wrap around the local store addressability limit.  */
  	  if (GET_CODE (sym) == SYMBOL_REF && GET_CODE (cst) == CONST_INT)
! 	    return 1;
  	}
        return 0;
  
      case CONST_INT:
        return INTVAL (x) >= 0 && INTVAL (x) <= 0x3ffff;
  
      case SUBREG:
        x = XEXP (x, 0);
!       gcc_assert (GET_CODE (x) == REG);
  
      case REG:
!       return INT_REG_OK_FOR_BASE_P (x, reg_ok_strict);
  
      case PLUS:
      case LO_SUM:
--- 2910,2939 ----
  	  rtx sym = XEXP (XEXP (x, 0), 0);
  	  rtx cst = XEXP (XEXP (x, 0), 1);
  
  	  if (GET_CODE (sym) == SYMBOL_REF && GET_CODE (cst) == CONST_INT)
! 	    {
! 	      /* Check for alignment if required.  */
! 	      if (!aligned)
! 		return 1;
! 	      if ((INTVAL (cst) & 15) == 0 && ALIGNED_SYMBOL_REF_P (sym))
! 		return 1;
! 	    }
  	}
        return 0;
  
      case CONST_INT:
+       /* We don't test alignment here.  For an absolute address we
+          assume the user knows what they are doing. */
        return INTVAL (x) >= 0 && INTVAL (x) <= 0x3ffff;
  
      case SUBREG:
        x = XEXP (x, 0);
!       if (GET_CODE (x) != REG)
! 	return 0;
  
      case REG:
!       return INT_REG_OK_FOR_BASE_P (x, reg_ok_strict)
! 	&& reg_aligned_for_addr (x, 0);
  
      case PLUS:
      case LO_SUM:
*************** spu_legitimate_address (enum machine_mod
*** 2908,2928 ****
  	  op0 = XEXP (op0, 0);
  	if (GET_CODE (op1) == SUBREG)
  	  op1 = XEXP (op1, 0);
- 	/* We can't just accept any aligned register because CSE can
- 	   change it to a register that is not marked aligned and then
- 	   recog will fail.   So we only accept frame registers because
- 	   they will only be changed to other frame registers. */
  	if (GET_CODE (op0) == REG
  	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
  	    && GET_CODE (op1) == CONST_INT
  	    && INTVAL (op1) >= -0x2000
  	    && INTVAL (op1) <= 0x1fff
! 	    && (regno_aligned_for_load (REGNO (op0)) || (INTVAL (op1) & 15) == 0))
  	  return 1;
  	if (GET_CODE (op0) == REG
  	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
  	    && GET_CODE (op1) == REG
! 	    && INT_REG_OK_FOR_INDEX_P (op1, reg_ok_strict))
  	  return 1;
        }
        break;
--- 2944,2972 ----
  	  op0 = XEXP (op0, 0);
  	if (GET_CODE (op1) == SUBREG)
  	  op1 = XEXP (op1, 0);
  	if (GET_CODE (op0) == REG
  	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
  	    && GET_CODE (op1) == CONST_INT
  	    && INTVAL (op1) >= -0x2000
  	    && INTVAL (op1) <= 0x1fff
! 	    && reg_aligned_for_addr (op0, 0)
! 	    && (!const_aligned
! 		|| (INTVAL (op1) & 15) == 0
! 		|| ((reload_in_progress || reload_completed)
! 		    && GET_MODE_SIZE (mode) < 4
! 		    && (INTVAL (op1) & 15) == 4 - GET_MODE_SIZE (mode))
! 		/* Some passes create a fake register for testing valid
! 		   addresses, be more lenient when we see those.  ivopts
! 		   and reload do it. */
! 		|| REGNO (op0) == LAST_VIRTUAL_REGISTER + 1
! 		|| REGNO (op0) == LAST_VIRTUAL_REGISTER + 2))
  	  return 1;
  	if (GET_CODE (op0) == REG
  	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
+ 	    && reg_aligned_for_addr (op0, 0)
  	    && GET_CODE (op1) == REG
! 	    && INT_REG_OK_FOR_INDEX_P (op1, reg_ok_strict)
! 	    && reg_aligned_for_addr (op1, 0))
  	  return 1;
        }
        break;
*************** spu_legitimize_address (rtx x, rtx oldx 
*** 2960,2966 ****
        else if (GET_CODE (op1) != REG)
  	op1 = force_reg (Pmode, op1);
        x = gen_rtx_PLUS (Pmode, op0, op1);
!       if (spu_legitimate_address (mode, x, 0))
  	return x;
      }
    return NULL_RTX;
--- 3004,3010 ----
        else if (GET_CODE (op1) != REG)
  	op1 = force_reg (Pmode, op1);
        x = gen_rtx_PLUS (Pmode, op0, op1);
!       if (spu_legitimate_address (mode, x, 0, 0))
  	return x;
      }
    return NULL_RTX;
*************** spu_conditional_register_usage (void)
*** 3385,3444 ****
      }
  }
  
! /* This is called to decide when we can simplify a load instruction.  We
!    must only return true for registers which we know will always be
!    aligned.  Taking into account that CSE might replace this reg with
!    another one that has not been marked aligned.  
!    So this is really only true for frame, stack and virtual registers,
!    which we know are always aligned and should not be adversely effected
!    by CSE.  */
  static int
! regno_aligned_for_load (int regno)
! {
!   return regno == FRAME_POINTER_REGNUM
!     || (frame_pointer_needed && regno == HARD_FRAME_POINTER_REGNUM)
!     || regno == ARG_POINTER_REGNUM
!     || regno == STACK_POINTER_REGNUM
!     || (regno >= FIRST_VIRTUAL_REGISTER 
! 	&& regno <= LAST_VIRTUAL_REGISTER);
! }
! 
! /* Return TRUE when mem is known to be 16-byte aligned. */
! int
! aligned_mem_p (rtx mem)
  {
!   if (MEM_ALIGN (mem) >= 128)
      return 1;
!   if (GET_MODE_SIZE (GET_MODE (mem)) >= 16)
!     return 1;
!   if (GET_CODE (XEXP (mem, 0)) == PLUS)
!     {
!       rtx p0 = XEXP (XEXP (mem, 0), 0);
!       rtx p1 = XEXP (XEXP (mem, 0), 1);
!       if (regno_aligned_for_load (REGNO (p0)))
! 	{
! 	  if (GET_CODE (p1) == REG && regno_aligned_for_load (REGNO (p1)))
! 	    return 1;
! 	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15) == 0)
! 	    return 1;
! 	}
!     }
!   else if (GET_CODE (XEXP (mem, 0)) == REG)
!     {
!       if (regno_aligned_for_load (REGNO (XEXP (mem, 0))))
! 	return 1;
!     }
!   else if (ALIGNED_SYMBOL_REF_P (XEXP (mem, 0)))
!     return 1;
!   else if (GET_CODE (XEXP (mem, 0)) == CONST)
!     {
!       rtx p0 = XEXP (XEXP (XEXP (mem, 0), 0), 0);
!       rtx p1 = XEXP (XEXP (XEXP (mem, 0), 0), 1);
!       if (GET_CODE (p0) == SYMBOL_REF
! 	  && GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15) == 0)
! 	return 1;
!     }
!   return 0;
  }
  
  /* Encode symbol attributes (local vs. global, tls model) of a SYMBOL_REF
--- 3429,3444 ----
      }
  }
  
! /* This is called any time we inspect the alignment of a register for
!    addresses.  */
  static int
! reg_aligned_for_addr (rtx x, int aligned)
  {
!   int regno =
!     REGNO (x) < FIRST_PSEUDO_REGISTER ? ORIGINAL_REGNO (x) : REGNO (x);
!   if (!aligned)
      return 1;
!   return REGNO_POINTER_ALIGN (regno) >= 128;
  }
  
  /* Encode symbol attributes (local vs. global, tls model) of a SYMBOL_REF
*************** spu_encode_section_info (tree decl, rtx 
*** 3467,3475 ****
  static int
  store_with_one_insn_p (rtx mem)
  {
    rtx addr = XEXP (mem, 0);
!   if (GET_MODE (mem) == BLKmode)
      return 0;
    /* Only static objects. */
    if (GET_CODE (addr) == SYMBOL_REF)
      {
--- 3467,3478 ----
  static int
  store_with_one_insn_p (rtx mem)
  {
+   enum machine_mode mode = GET_MODE (mem);
    rtx addr = XEXP (mem, 0);
!   if (mode == BLKmode)
      return 0;
+   if (GET_MODE_SIZE (mode) >= 16)
+     return 1;
    /* Only static objects. */
    if (GET_CODE (addr) == SYMBOL_REF)
      {
*************** store_with_one_insn_p (rtx mem)
*** 3493,3498 ****
--- 3496,3517 ----
    return 0;
  }
  
+ /* Return 1 when the address is not valid for a simple load and store as
+    required by the '_mov*' patterns.   We could make this less strict
+    for loads, but we prefer mem's to look the same so they are more
+    likely to be merged.  */
+ static int
+ address_needs_split (rtx mem)
+ {
+   if (GET_MODE_SIZE (GET_MODE (mem)) < 16
+       && (GET_MODE_SIZE (GET_MODE (mem)) < 4
+ 	  || !(store_with_one_insn_p (mem)
+ 	       || mem_is_padded_component_ref (mem))))
+     return 1;
+ 
+   return 0;
+ }
+ 
  int
  spu_expand_mov (rtx * ops, enum machine_mode mode)
  {
*************** spu_expand_mov (rtx * ops, enum machine_
*** 3540,3564 ****
      }
    else
      {
-       if (GET_CODE (ops[0]) == MEM)
- 	{
- 	  if (!spu_valid_move (ops))
- 	    {
- 	      emit_insn (gen_store (ops[0], ops[1], gen_reg_rtx (TImode),
- 				    gen_reg_rtx (TImode)));
- 	      return 1;
- 	    }
- 	}
-       else if (GET_CODE (ops[1]) == MEM)
- 	{
- 	  if (!spu_valid_move (ops))
- 	    {
- 	      emit_insn (gen_load
- 			 (ops[0], ops[1], gen_reg_rtx (TImode),
- 			  gen_reg_rtx (SImode)));
- 	      return 1;
- 	    }
- 	}
        /* Catch the SImode immediates greater than 0x7fffffff, and sign
           extend them. */
        if (GET_CODE (ops[1]) == CONST_INT)
--- 3559,3564 ----
*************** spu_expand_mov (rtx * ops, enum machine_
*** 3574,3580 ****
    return 0;
  }
  
! void
  spu_split_load (rtx * ops)
  {
    enum machine_mode mode = GET_MODE (ops[0]);
--- 3574,3580 ----
    return 0;
  }
  
! int
  spu_split_load (rtx * ops)
  {
    enum machine_mode mode = GET_MODE (ops[0]);
*************** spu_split_load (rtx * ops)
*** 3582,3587 ****
--- 3582,3598 ----
    int rot_amt;
  
    addr = XEXP (ops[1], 0);
+   gcc_assert (GET_CODE (addr) != AND);
+ 
+   if (!address_needs_split (ops[1]))
+     {
+       addr = XEXP (ops[1], 0);
+       if (spu_legitimate_address (mode, addr, 0, 1))
+ 	return 0;
+       ops[1] = change_address (ops[1], VOIDmode, force_reg (Pmode, addr));
+       emit_move_insn (ops[0], ops[1]);
+       return 1;
+     }
  
    rot = 0;
    rot_amt = 0;
*************** spu_split_load (rtx * ops)
*** 3599,3610 ****
         */
        p0 = XEXP (addr, 0);
        p1 = XEXP (addr, 1);
!       if (REG_P (p0) && !regno_aligned_for_load (REGNO (p0)))
  	{
! 	  if (REG_P (p1) && !regno_aligned_for_load (REGNO (p1)))
  	    {
! 	      emit_insn (gen_addsi3 (ops[3], p0, p1));
! 	      rot = ops[3];
  	    }
  	  else
  	    rot = p0;
--- 3610,3641 ----
         */
        p0 = XEXP (addr, 0);
        p1 = XEXP (addr, 1);
!       if (!reg_aligned_for_addr (p0, 1))
  	{
! 	  if (GET_CODE (p1) == REG && !reg_aligned_for_addr (p1, 1))
  	    {
! 	      rot = gen_reg_rtx (SImode);
! 	      emit_insn (gen_addsi3 (rot, p0, p1));
! 	    }
! 	  else if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
! 	    {
! 	      if (INTVAL (p1) > 0
! 		  && INTVAL (p1) * BITS_PER_UNIT < REG_ALIGN (p0))
! 		{
! 		  rot = gen_reg_rtx (SImode);
! 		  emit_insn (gen_addsi3 (rot, p0, p1));
! 		  addr = p0;
! 		}
! 	      else
! 		{
! 		  rtx x = gen_reg_rtx (SImode);
! 		  emit_move_insn (x, p1);
! 		  if (!spu_arith_operand (p1, SImode))
! 		    p1 = x;
! 		  rot = gen_reg_rtx (SImode);
! 		  emit_insn (gen_addsi3 (rot, p0, p1));
! 		  addr = gen_rtx_PLUS (Pmode, p0, x);
! 		}
  	    }
  	  else
  	    rot = p0;
*************** spu_split_load (rtx * ops)
*** 3614,3629 ****
  	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
  	    {
  	      rot_amt = INTVAL (p1) & 15;
! 	      p1 = GEN_INT (INTVAL (p1) & -16);
! 	      addr = gen_rtx_PLUS (SImode, p0, p1);
  	    }
! 	  else if (REG_P (p1) && !regno_aligned_for_load (REGNO (p1)))
  	    rot = p1;
  	}
      }
    else if (GET_CODE (addr) == REG)
      {
!       if (!regno_aligned_for_load (REGNO (addr)))
  	rot = addr;
      }
    else if (GET_CODE (addr) == CONST)
--- 3645,3665 ----
  	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
  	    {
  	      rot_amt = INTVAL (p1) & 15;
! 	      if (INTVAL (p1) & -16)
! 		{
! 		  p1 = GEN_INT (INTVAL (p1) & -16);
! 		  addr = gen_rtx_PLUS (SImode, p0, p1);
! 		}
! 	      else
! 		addr = p0;
  	    }
! 	  else if (GET_CODE (p1) == REG && !reg_aligned_for_addr (p1, 1))
  	    rot = p1;
  	}
      }
    else if (GET_CODE (addr) == REG)
      {
!       if (!reg_aligned_for_addr (addr, 1))
  	rot = addr;
      }
    else if (GET_CODE (addr) == CONST)
*************** spu_split_load (rtx * ops)
*** 3642,3648 ****
  	    addr = XEXP (XEXP (addr, 0), 0);
  	}
        else
! 	rot = addr;
      }
    else if (GET_CODE (addr) == CONST_INT)
      {
--- 3678,3687 ----
  	    addr = XEXP (XEXP (addr, 0), 0);
  	}
        else
! 	{
! 	  rot = gen_reg_rtx (Pmode);
! 	  emit_move_insn (rot, addr);
! 	}
      }
    else if (GET_CODE (addr) == CONST_INT)
      {
*************** spu_split_load (rtx * ops)
*** 3650,3656 ****
        addr = GEN_INT (rot_amt & -16);
      }
    else if (!ALIGNED_SYMBOL_REF_P (addr))
!     rot = addr;
  
    if (GET_MODE_SIZE (mode) < 4)
      rot_amt += GET_MODE_SIZE (mode) - 4;
--- 3689,3698 ----
        addr = GEN_INT (rot_amt & -16);
      }
    else if (!ALIGNED_SYMBOL_REF_P (addr))
!     {
!       rot = gen_reg_rtx (Pmode);
!       emit_move_insn (rot, addr);
!     }
  
    if (GET_MODE_SIZE (mode) < 4)
      rot_amt += GET_MODE_SIZE (mode) - 4;
*************** spu_split_load (rtx * ops)
*** 3659,3673 ****
  
    if (rot && rot_amt)
      {
!       emit_insn (gen_addsi3 (ops[3], rot, GEN_INT (rot_amt)));
!       rot = ops[3];
        rot_amt = 0;
      }
  
!   load = ops[2];
  
!   addr = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
!   mem = change_address (ops[1], TImode, addr);
  
    emit_insn (gen_movti (load, mem));
  
--- 3701,3715 ----
  
    if (rot && rot_amt)
      {
!       rtx x = gen_reg_rtx (SImode);
!       emit_insn (gen_addsi3 (x, rot, GEN_INT (rot_amt)));
!       rot = x;
        rot_amt = 0;
      }
  
!   load = gen_reg_rtx (TImode);
  
!   mem = change_address (ops[1], TImode, copy_rtx (addr));
  
    emit_insn (gen_movti (load, mem));
  
*************** spu_split_load (rtx * ops)
*** 3676,3698 ****
    else if (rot_amt)
      emit_insn (gen_rotlti3 (load, load, GEN_INT (rot_amt * 8)));
  
!   if (reload_completed)
!     emit_move_insn (ops[0], gen_rtx_REG (GET_MODE (ops[0]), REGNO (load)));
!   else
!     emit_insn (gen_spu_convert (ops[0], load));
  }
  
! void
  spu_split_store (rtx * ops)
  {
    enum machine_mode mode = GET_MODE (ops[0]);
!   rtx pat = ops[2];
!   rtx reg = ops[3];
    rtx addr, p0, p1, p1_lo, smem;
    int aform;
    int scalar;
  
    addr = XEXP (ops[0], 0);
  
    if (GET_CODE (addr) == PLUS)
      {
--- 3718,3748 ----
    else if (rot_amt)
      emit_insn (gen_rotlti3 (load, load, GEN_INT (rot_amt * 8)));
  
!   emit_insn (gen_spu_convert (ops[0], load));
!   return 1;
  }
  
! int
  spu_split_store (rtx * ops)
  {
    enum machine_mode mode = GET_MODE (ops[0]);
!   rtx reg;
    rtx addr, p0, p1, p1_lo, smem;
    int aform;
    int scalar;
  
+   if (!address_needs_split (ops[0]))
+     {
+       addr = XEXP (ops[0], 0);
+       if (spu_legitimate_address (mode, addr, 0, 1))
+ 	return 0;
+       ops[0] = change_address (ops[0], VOIDmode, force_reg (Pmode, addr));
+       emit_move_insn (ops[0], ops[1]);
+       return 1;
+     }
+ 
    addr = XEXP (ops[0], 0);
+   gcc_assert (GET_CODE (addr) != AND);
  
    if (GET_CODE (addr) == PLUS)
      {
*************** spu_split_store (rtx * ops)
*** 3704,3710 ****
           unaligned reg + aligned reg     => lqx, c?x, shuf, stqx
           unaligned reg + unaligned reg   => lqx, c?x, shuf, stqx
           unaligned reg + aligned const   => lqd, c?d, shuf, stqx
!          unaligned reg + unaligned const -> not allowed by legitimate address
         */
        aform = 0;
        p0 = XEXP (addr, 0);
--- 3754,3760 ----
           unaligned reg + aligned reg     => lqx, c?x, shuf, stqx
           unaligned reg + unaligned reg   => lqx, c?x, shuf, stqx
           unaligned reg + aligned const   => lqd, c?d, shuf, stqx
!          unaligned reg + unaligned const => lqx, c?d, shuf, stqx
         */
        aform = 0;
        p0 = XEXP (addr, 0);
*************** spu_split_store (rtx * ops)
*** 3712,3719 ****
        if (GET_CODE (p0) == REG && GET_CODE (p1) == CONST_INT)
  	{
  	  p1_lo = GEN_INT (INTVAL (p1) & 15);
! 	  p1 = GEN_INT (INTVAL (p1) & -16);
! 	  addr = gen_rtx_PLUS (SImode, p0, p1);
  	}
      }
    else if (GET_CODE (addr) == REG)
--- 3762,3781 ----
        if (GET_CODE (p0) == REG && GET_CODE (p1) == CONST_INT)
  	{
  	  p1_lo = GEN_INT (INTVAL (p1) & 15);
! 	  if (reg_aligned_for_addr (p0, 1))
! 	    {
! 	      p1 = GEN_INT (INTVAL (p1) & -16);
! 	      if (p1 == const0_rtx)
! 		addr = p0;
! 	      else
! 		addr = gen_rtx_PLUS (SImode, p0, p1);
! 	    }
! 	  else
! 	    {
! 	      rtx x = gen_reg_rtx (SImode);
! 	      emit_move_insn (x, p1);
! 	      addr = gen_rtx_PLUS (SImode, p0, x);
! 	    }
  	}
      }
    else if (GET_CODE (addr) == REG)
*************** spu_split_store (rtx * ops)
*** 3730,3760 ****
        p1_lo = addr;
        if (ALIGNED_SYMBOL_REF_P (addr))
  	p1_lo = const0_rtx;
!       else if (GET_CODE (addr) == CONST)
  	{
! 	  if (GET_CODE (XEXP (addr, 0)) == PLUS
! 	      && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
! 	      && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
! 	    {
! 	      HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
! 	      if ((v & -16) != 0)
! 		addr = gen_rtx_CONST (Pmode,
! 				      gen_rtx_PLUS (Pmode,
! 						    XEXP (XEXP (addr, 0), 0),
! 						    GEN_INT (v & -16)));
! 	      else
! 		addr = XEXP (XEXP (addr, 0), 0);
! 	      p1_lo = GEN_INT (v & 15);
! 	    }
  	}
        else if (GET_CODE (addr) == CONST_INT)
  	{
  	  p1_lo = GEN_INT (INTVAL (addr) & 15);
  	  addr = GEN_INT (INTVAL (addr) & -16);
  	}
      }
  
!   addr = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
  
    scalar = store_with_one_insn_p (ops[0]);
    if (!scalar)
--- 3792,3825 ----
        p1_lo = addr;
        if (ALIGNED_SYMBOL_REF_P (addr))
  	p1_lo = const0_rtx;
!       else if (GET_CODE (addr) == CONST
! 	       && GET_CODE (XEXP (addr, 0)) == PLUS
! 	       && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
! 	       && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
  	{
! 	  HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
! 	  if ((v & -16) != 0)
! 	    addr = gen_rtx_CONST (Pmode,
! 				  gen_rtx_PLUS (Pmode,
! 						XEXP (XEXP (addr, 0), 0),
! 						GEN_INT (v & -16)));
! 	  else
! 	    addr = XEXP (XEXP (addr, 0), 0);
! 	  p1_lo = GEN_INT (v & 15);
  	}
        else if (GET_CODE (addr) == CONST_INT)
  	{
  	  p1_lo = GEN_INT (INTVAL (addr) & 15);
  	  addr = GEN_INT (INTVAL (addr) & -16);
  	}
+       else
+ 	{
+ 	  p1_lo = gen_reg_rtx (SImode);
+ 	  emit_move_insn (p1_lo, addr);
+ 	}
      }
  
!   reg = gen_reg_rtx (TImode);
  
    scalar = store_with_one_insn_p (ops[0]);
    if (!scalar)
*************** spu_split_store (rtx * ops)
*** 3764,3774 ****
           possible, and copying the flags will prevent that in certain
           cases, e.g. consider the volatile flag. */
  
        rtx lmem = change_address (ops[0], TImode, copy_rtx (addr));
        set_mem_alias_set (lmem, 0);
        emit_insn (gen_movti (reg, lmem));
  
!       if (!p0 || regno_aligned_for_load (REGNO (p0)))
  	p0 = stack_pointer_rtx;
        if (!p1_lo)
  	p1_lo = const0_rtx;
--- 3829,3840 ----
           possible, and copying the flags will prevent that in certain
           cases, e.g. consider the volatile flag. */
  
+       rtx pat = gen_reg_rtx (TImode);
        rtx lmem = change_address (ops[0], TImode, copy_rtx (addr));
        set_mem_alias_set (lmem, 0);
        emit_insn (gen_movti (reg, lmem));
  
!       if (!p0 || reg_aligned_for_addr (p0, 1))
  	p0 = stack_pointer_rtx;
        if (!p1_lo)
  	p1_lo = const0_rtx;
*************** spu_split_store (rtx * ops)
*** 3776,3792 ****
        emit_insn (gen_cpat (pat, p0, p1_lo, GEN_INT (GET_MODE_SIZE (mode))));
        emit_insn (gen_shufb (reg, ops[1], reg, pat));
      }
-   else if (reload_completed)
-     {
-       if (GET_CODE (ops[1]) == REG)
- 	emit_move_insn (reg, gen_rtx_REG (GET_MODE (reg), REGNO (ops[1])));
-       else if (GET_CODE (ops[1]) == SUBREG)
- 	emit_move_insn (reg,
- 			gen_rtx_REG (GET_MODE (reg),
- 				     REGNO (SUBREG_REG (ops[1]))));
-       else
- 	abort ();
-     }
    else
      {
        if (GET_CODE (ops[1]) == REG)
--- 3842,3847 ----
*************** spu_split_store (rtx * ops)
*** 3798,3812 ****
      }
  
    if (GET_MODE_SIZE (mode) < 4 && scalar)
!     emit_insn (gen_shlqby_ti
! 	       (reg, reg, GEN_INT (4 - GET_MODE_SIZE (mode))));
  
!   smem = change_address (ops[0], TImode, addr);
    /* We can't use the previous alias set because the memory has changed
       size and can potentially overlap objects of other types.  */
    set_mem_alias_set (smem, 0);
  
    emit_insn (gen_movti (smem, reg));
  }
  
  /* Return TRUE if X is MEM which is a struct member reference
--- 3853,3868 ----
      }
  
    if (GET_MODE_SIZE (mode) < 4 && scalar)
!     emit_insn (gen_ashlti3
! 	       (reg, reg, GEN_INT (32 - GET_MODE_BITSIZE (mode))));
  
!   smem = change_address (ops[0], TImode, copy_rtx (addr));
    /* We can't use the previous alias set because the memory has changed
       size and can potentially overlap objects of other types.  */
    set_mem_alias_set (smem, 0);
  
    emit_insn (gen_movti (smem, reg));
+   return 1;
  }
  
  /* Return TRUE if X is MEM which is a struct member reference
*************** fix_range (const char *const_str)
*** 3905,3941 ****
      }
  }
  
- int
- spu_valid_move (rtx * ops)
- {
-   enum machine_mode mode = GET_MODE (ops[0]);
-   if (!register_operand (ops[0], mode) && !register_operand (ops[1], mode))
-     return 0;
- 
-   /* init_expr_once tries to recog against load and store insns to set
-      the direct_load[] and direct_store[] arrays.  We always want to
-      consider those loads and stores valid.  init_expr_once is called in
-      the context of a dummy function which does not have a decl. */
-   if (cfun->decl == 0)
-     return 1;
- 
-   /* Don't allows loads/stores which would require more than 1 insn.
-      During and after reload we assume loads and stores only take 1
-      insn. */
-   if (GET_MODE_SIZE (mode) < 16 && !reload_in_progress && !reload_completed)
-     {
-       if (GET_CODE (ops[0]) == MEM
- 	  && (GET_MODE_SIZE (mode) < 4
- 	      || !(store_with_one_insn_p (ops[0])
- 		   || mem_is_padded_component_ref (ops[0]))))
- 	return 0;
-       if (GET_CODE (ops[1]) == MEM
- 	  && (GET_MODE_SIZE (mode) < 4 || !aligned_mem_p (ops[1])))
- 	return 0;
-     }
-   return 1;
- }
- 
  /* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
     can be generated using the fsmbi instruction. */
  int
--- 3961,3966 ----
*************** spu_sms_res_mii (struct ddg *g)
*** 5577,5588 ****
  
  void
  spu_init_expanders (void)
! {   
!   /* HARD_FRAME_REGISTER is only 128 bit aligned when
!    * frame_pointer_needed is true.  We don't know that until we're
!    * expanding the prologue. */
    if (cfun)
!     REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
  }
  
  static enum machine_mode
--- 5602,5627 ----
  
  void
  spu_init_expanders (void)
! {
    if (cfun)
!     {
!       rtx r0, r1;
!       /* HARD_FRAME_POINTER_REGNUM is only 128 bit aligned when
!          frame_pointer_needed is true.  We don't know that until we're
!          expanding the prologue. */
!       REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
! 
!       /* A number of passes use LAST_VIRTUAL_REGISTER+1 and
!          LAST_VIRTUAL_REGISTER+2 to test the back-end.  We want to
!          handle those cases specially, so we reserve those two registers
!          here by generating them. */
!       r0 = gen_reg_rtx (SImode);
!       r1 = gen_reg_rtx (SImode);
!       mark_reg_pointer (r0, 128);
!       mark_reg_pointer (r1, 128);
!       gcc_assert (REGNO (r0) == LAST_VIRTUAL_REGISTER + 1
! 		  && REGNO (r1) == LAST_VIRTUAL_REGISTER + 2);
!     }
  }
  
  static enum machine_mode
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/spu.h gcc-4_3/gcc/config/spu/spu.h
*** gcc-4_3-orig/gcc/config/spu/spu.h	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/spu.h	2008-09-11 13:45:00.000000000 +0200
*************** enum reg_class { 
*** 254,259 ****
--- 254,264 ----
  #define INT_REG_OK_FOR_BASE_P(X,STRICT) \
  	((!(STRICT) || REGNO_OK_FOR_BASE_P (REGNO (X))))
  
+ #define REG_ALIGN(X) \
+ 	(REG_POINTER(X) \
+ 	 	? REGNO_POINTER_ALIGN (ORIGINAL_REGNO (X)) \
+ 		: 0)
+ 
  #define PREFERRED_RELOAD_CLASS(X,CLASS)  (CLASS)
  
  #define CLASS_MAX_NREGS(CLASS, MODE)	\
*************** targetm.resolve_overloaded_builtin = spu
*** 439,445 ****
  #endif
  
  #define GO_IF_LEGITIMATE_ADDRESS(MODE, X, ADDR)			\
!     { if (spu_legitimate_address (MODE, X, REG_OK_STRICT_FLAG))	\
  	goto ADDR;						\
      }
  
--- 444,450 ----
  #endif
  
  #define GO_IF_LEGITIMATE_ADDRESS(MODE, X, ADDR)			\
!     { if (spu_legitimate_address (MODE, X, REG_OK_STRICT_FLAG, 0))	\
  	goto ADDR;						\
      }
  
*************** targetm.resolve_overloaded_builtin = spu
*** 633,635 ****
--- 638,642 ----
  extern GTY(()) rtx spu_compare_op0;
  extern GTY(()) rtx spu_compare_op1;
  
+ #define SPLIT_BEFORE_CSE2 1
+ 
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/spu.md gcc-4_3/gcc/config/spu/spu.md
*** gcc-4_3-orig/gcc/config/spu/spu.md	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/spu.md	2008-09-11 13:45:00.000000000 +0200
***************
*** 273,280 ****
  (define_split 
    [(set (match_operand 0 "spu_reg_operand")
  	(match_operand 1 "immediate_operand"))]
! 
!   ""
    [(set (match_dup 0)
  	(high (match_dup 1)))
     (set (match_dup 0)
--- 273,279 ----
  (define_split 
    [(set (match_operand 0 "spu_reg_operand")
  	(match_operand 1 "immediate_operand"))]
!   "split0_completed"
    [(set (match_dup 0)
  	(high (match_dup 1)))
     (set (match_dup 0)
***************
*** 311,319 ****
  ;; move internal
  
  (define_insn "_mov<mode>"
!   [(set (match_operand:MOV 0 "spu_nonimm_operand" "=r,r,r,r,r,m")
  	(match_operand:MOV 1 "spu_mov_operand" "r,A,f,j,m,r"))]
!   "spu_valid_move (operands)"
    "@
     ori\t%0,%1,0
     il%s1\t%0,%S1
--- 310,319 ----
  ;; move internal
  
  (define_insn "_mov<mode>"
!   [(set (match_operand:MOV 0 "spu_dest_operand" "=r,r,r,r,r,m")
  	(match_operand:MOV 1 "spu_mov_operand" "r,A,f,j,m,r"))]
!   "register_operand(operands[0], <MODE>mode)
!    || register_operand(operands[1], <MODE>mode)"
    "@
     ori\t%0,%1,0
     il%s1\t%0,%S1
***************
*** 331,339 ****
    "iohl\t%0,%2@l")
  
  (define_insn "_movdi"
!   [(set (match_operand:DI 0 "spu_nonimm_operand" "=r,r,r,r,r,m")
  	(match_operand:DI 1 "spu_mov_operand" "r,a,f,k,m,r"))]
!   "spu_valid_move (operands)"
    "@
     ori\t%0,%1,0
     il%d1\t%0,%D1
--- 331,340 ----
    "iohl\t%0,%2@l")
  
  (define_insn "_movdi"
!   [(set (match_operand:DI 0 "spu_dest_operand" "=r,r,r,r,r,m")
  	(match_operand:DI 1 "spu_mov_operand" "r,a,f,k,m,r"))]
!   "register_operand(operands[0], DImode)
!    || register_operand(operands[1], DImode)"
    "@
     ori\t%0,%1,0
     il%d1\t%0,%D1
***************
*** 344,352 ****
    [(set_attr "type" "fx2,fx2,shuf,shuf,load,store")])
  
  (define_insn "_movti"
!   [(set (match_operand:TI 0 "spu_nonimm_operand" "=r,r,r,r,r,m")
  	(match_operand:TI 1 "spu_mov_operand" "r,U,f,l,m,r"))]
!   "spu_valid_move (operands)"
    "@
     ori\t%0,%1,0
     il%t1\t%0,%T1
--- 345,354 ----
    [(set_attr "type" "fx2,fx2,shuf,shuf,load,store")])
  
  (define_insn "_movti"
!   [(set (match_operand:TI 0 "spu_dest_operand" "=r,r,r,r,r,m")
  	(match_operand:TI 1 "spu_mov_operand" "r,U,f,l,m,r"))]
!   "register_operand(operands[0], TImode)
!    || register_operand(operands[1], TImode)"
    "@
     ori\t%0,%1,0
     il%t1\t%0,%T1
***************
*** 356,384 ****
     stq%p0\t%1,%0"
    [(set_attr "type" "fx2,fx2,shuf,shuf,load,store")])
  
! (define_insn_and_split "load"
!   [(set (match_operand 0 "spu_reg_operand" "=r")
! 	(match_operand 1 "memory_operand" "m"))
!    (clobber (match_operand:TI 2 "spu_reg_operand" "=&r"))
!    (clobber (match_operand:SI 3 "spu_reg_operand" "=&r"))]
!   "GET_MODE(operands[0]) == GET_MODE(operands[1])"
!   "#"
!   ""
    [(set (match_dup 0)
  	(match_dup 1))]
!   { spu_split_load(operands); DONE; })
  
! (define_insn_and_split "store"
!   [(set (match_operand 0 "memory_operand" "=m")
! 	(match_operand 1 "spu_reg_operand" "r"))
!    (clobber (match_operand:TI 2 "spu_reg_operand" "=&r"))
!    (clobber (match_operand:TI 3 "spu_reg_operand" "=&r"))]
!   "GET_MODE(operands[0]) == GET_MODE(operands[1])"
!   "#"
!   ""
    [(set (match_dup 0)
  	(match_dup 1))]
!   { spu_split_store(operands); DONE; })
  
  ;; Operand 3 is the number of bytes. 1:b 2:h 4:w 8:d
  
--- 358,382 ----
     stq%p0\t%1,%0"
    [(set_attr "type" "fx2,fx2,shuf,shuf,load,store")])
  
! (define_split
!   [(set (match_operand 0 "spu_reg_operand")
! 	(match_operand 1 "memory_operand"))]
!   "GET_MODE(operands[0]) == GET_MODE(operands[1]) && !split0_completed"
    [(set (match_dup 0)
  	(match_dup 1))]
!   { if (spu_split_load(operands))
!       DONE;
!   })
  
! (define_split
!   [(set (match_operand 0 "memory_operand")
! 	(match_operand 1 "spu_reg_operand"))]
!   "GET_MODE(operands[0]) == GET_MODE(operands[1]) && !split0_completed"
    [(set (match_dup 0)
  	(match_dup 1))]
!   { if (spu_split_store(operands))
!       DONE;
!   })
  
  ;; Operand 3 is the number of bytes. 1:b 2:h 4:w 8:d
  
diff -crNp -x .svn gcc-4_3-orig/gcc/config/spu/spu-protos.h gcc-4_3/gcc/config/spu/spu-protos.h
*** gcc-4_3-orig/gcc/config/spu/spu-protos.h	2008-09-11 13:44:34.000000000 +0200
--- gcc-4_3/gcc/config/spu/spu-protos.h	2008-09-10 20:09:59.000000000 +0200
*************** extern int arith_immediate_p (rtx op, en
*** 54,60 ****
  extern int spu_constant_address_p (rtx x);
  extern int spu_legitimate_constant_p (rtx x);
  extern int spu_legitimate_address (enum machine_mode mode, rtx x,
! 				   int reg_ok_strict);
  extern rtx spu_legitimize_address (rtx x, rtx oldx, enum machine_mode mode);
  extern int spu_initial_elimination_offset (int from, int to);
  extern rtx spu_function_value (const_tree type, const_tree func);
--- 54,60 ----
  extern int spu_constant_address_p (rtx x);
  extern int spu_legitimate_constant_p (rtx x);
  extern int spu_legitimate_address (enum machine_mode mode, rtx x,
! 				   int reg_ok_strict, int for_split);
  extern rtx spu_legitimize_address (rtx x, rtx oldx, enum machine_mode mode);
  extern int spu_initial_elimination_offset (int from, int to);
  extern rtx spu_function_value (const_tree type, const_tree func);
*************** extern void spu_setup_incoming_varargs (
*** 64,74 ****
  					tree type, int *pretend_size,
  					int no_rtl);
  extern void spu_conditional_register_usage (void);
- extern int aligned_mem_p (rtx mem);
  extern int spu_expand_mov (rtx * ops, enum machine_mode mode);
! extern void spu_split_load (rtx * ops);
! extern void spu_split_store (rtx * ops);
! extern int spu_valid_move (rtx * ops);
  extern int fsmbi_const_p (rtx x);
  extern int cpat_const_p (rtx x, enum machine_mode mode);
  extern rtx gen_cpat_const (rtx * ops);
--- 64,72 ----
  					tree type, int *pretend_size,
  					int no_rtl);
  extern void spu_conditional_register_usage (void);
  extern int spu_expand_mov (rtx * ops, enum machine_mode mode);
! extern int spu_split_load (rtx * ops);
! extern int spu_split_store (rtx * ops);
  extern int fsmbi_const_p (rtx x);
  extern int cpat_const_p (rtx x, enum machine_mode mode);
  extern rtx gen_cpat_const (rtx * ops);
diff -crNp -x .svn gcc-4_3-orig/gcc/doc/tm.texi gcc-4_3/gcc/doc/tm.texi
*** gcc-4_3-orig/gcc/doc/tm.texi	2008-09-11 13:44:35.000000000 +0200
--- gcc-4_3/gcc/doc/tm.texi	2008-09-11 13:45:01.000000000 +0200
*************** to the functions in @file{libgcc} that p
*** 10372,10374 ****
--- 10372,10386 ----
  call stack unwinding.  It is used in declarations in @file{unwind-generic.h}
  and the associated definitions of those functions.
  @end defmac
+ 
+ @defmac SPLIT_BEFORE_CSE2
+ This macro determines whether to use an additional split pass before the
+ second CSE pass.  @code{split0_completed} will be set after this pass is
+ completed.
+ 
+ For example, the Cell SPU target uses this for better optimization of
+ the multiple instructions required to do simple loads and stores.  The
+ optimizations before this pass work better on simple memory
+ instructions, and the optimizations right after this pass (e.g., CSE and
+ combine) are able to optimize the split instructions.
+ @end defmac
diff -crNp -x .svn gcc-4_3-orig/gcc/final.c gcc-4_3/gcc/final.c
*** gcc-4_3-orig/gcc/final.c	2008-09-11 13:44:35.000000000 +0200
--- gcc-4_3/gcc/final.c	2008-09-10 20:09:59.000000000 +0200
*************** rest_of_clean_state (void)
*** 4238,4243 ****
--- 4238,4246 ----
  #ifdef STACK_REGS
    regstack_completed = 0;
  #endif
+ #ifdef SPLIT_BEFORE_CSE2
+   split0_completed = 0;
+ #endif
  
    /* Clear out the insn_length contents now that they are no
       longer valid.  */
diff -crNp -x .svn gcc-4_3-orig/gcc/passes.c gcc-4_3/gcc/passes.c
*** gcc-4_3-orig/gcc/passes.c	2008-09-11 13:44:35.000000000 +0200
--- gcc-4_3/gcc/passes.c	2008-09-10 20:09:59.000000000 +0200
*************** init_optimization_passes (void)
*** 715,720 ****
--- 715,721 ----
  	}
        NEXT_PASS (pass_web);
        NEXT_PASS (pass_jump_bypass);
+       NEXT_PASS (pass_split_before_cse2);
        NEXT_PASS (pass_cse2);
        NEXT_PASS (pass_rtl_dse1);
        NEXT_PASS (pass_rtl_fwprop_addr);
diff -crNp -x .svn gcc-4_3-orig/gcc/recog.c gcc-4_3/gcc/recog.c
*** gcc-4_3-orig/gcc/recog.c	2008-09-11 13:44:35.000000000 +0200
--- gcc-4_3/gcc/recog.c	2008-09-10 20:09:59.000000000 +0200
*************** int reload_completed;
*** 94,99 ****
--- 94,104 ----
  /* Nonzero after thread_prologue_and_epilogue_insns has run.  */
  int epilogue_completed;
  
+ #ifdef SPLIT_BEFORE_CSE2
+ /* Nonzero after split0 pass has run.  */
+ int split0_completed;
+ #endif
+ 
  /* Initialize data used by the function `recog'.
     This must be called once in the compilation of a function
     before any insn recognition may be done in the function.  */
*************** struct tree_opt_pass pass_split_for_shor
*** 3497,3500 ****
--- 3502,3541 ----
    0                                     /* letter */
  };
  
+ static bool
+ gate_handle_split_before_cse2 (void)
+ {
+ #ifdef SPLIT_BEFORE_CSE2
+   return SPLIT_BEFORE_CSE2;
+ #else
+   return 0;
+ #endif
+ }
+ 
+ static unsigned int
+ rest_of_handle_split_before_cse2 (void)
+ {
+ #ifdef SPLIT_BEFORE_CSE2
+   split_all_insns_noflow ();
+   split0_completed = 1;
+ #endif
+   return 0;
+ }
+ 
+ struct tree_opt_pass pass_split_before_cse2 =
+ {
+   "split0",                             /* name */
+   gate_handle_split_before_cse2,        /* gate */
+   rest_of_handle_split_before_cse2,     /* execute */
+   NULL,                                 /* sub */
+   NULL,                                 /* next */
+   0,                                    /* static_pass_number */
+   0,                                    /* tv_id */
+   0,                                    /* properties_required */
+   0,                                    /* properties_provided */
+   0,                                    /* properties_destroyed */
+   0,                                    /* todo_flags_start */
+   TODO_dump_func,                       /* todo_flags_finish */
+   0                                     /* letter */
+ };
  
diff -crNp -x .svn gcc-4_3-orig/gcc/rtl.h gcc-4_3/gcc/rtl.h
*** gcc-4_3-orig/gcc/rtl.h	2008-09-11 13:44:36.000000000 +0200
--- gcc-4_3/gcc/rtl.h	2008-09-10 20:09:59.000000000 +0200
*************** extern int reload_completed;
*** 1997,2002 ****
--- 1997,2007 ----
  /* Nonzero after thread_prologue_and_epilogue_insns has run.  */
  extern int epilogue_completed;
  
+ #ifdef SPLIT_BEFORE_CSE2
+ /* Nonzero after the split0 pass has completed. */
+ extern int split0_completed;
+ #endif
+ 
  /* Set to 1 while reload_as_needed is operating.
     Required by some machines to handle any generated moves differently.  */
  
diff -crNp -x .svn gcc-4_3-orig/gcc/testsuite/gcc.target/spu/split0-1.c gcc-4_3/gcc/testsuite/gcc.target/spu/split0-1.c
*** gcc-4_3-orig/gcc/testsuite/gcc.target/spu/split0-1.c	1970-01-01 01:00:00.000000000 +0100
--- gcc-4_3/gcc/testsuite/gcc.target/spu/split0-1.c	2008-09-10 20:09:59.000000000 +0200
***************
*** 0 ****
--- 1,17 ----
+ /* Make sure there are only 2 loads. */
+ /* { dg-do compile { target spu-*-* } } */
+ /* { dg-options "-O2" } */
+ /* { dg-final { scan-assembler-times "lqd	\\$\[0-9\]+,0\\(\\$\[0-9\]+\\)" 1 } } */
+ /* { dg-final { scan-assembler-times "lqd	\\$\[0-9\]+,16\\(\\$\[0-9\]+\\)" 1 } } */
+ /* { dg-final { scan-assembler-times "lq\[dx\]" 2 } } */
+   
+ struct __attribute__ ((__aligned__(16))) S {
+   int a, b, c, d;
+   int e, f, g, h;
+ };
+   
+ int
+ f(struct S *s)
+ { 
+   return s->a + s->b + s->c + s->d + s->e + s->f + s->g + s->h;
+ } 
diff -crNp -x .svn gcc-4_3-orig/gcc/tree-pass.h gcc-4_3/gcc/tree-pass.h
*** gcc-4_3-orig/gcc/tree-pass.h	2008-09-11 13:44:36.000000000 +0200
--- gcc-4_3/gcc/tree-pass.h	2008-09-10 20:09:59.000000000 +0200
*************** extern struct tree_opt_pass pass_rtl_dol
*** 385,390 ****
--- 385,391 ----
  extern struct tree_opt_pass pass_rtl_loop_done;
  
  extern struct tree_opt_pass pass_web;
+ extern struct tree_opt_pass pass_split_before_cse2;
  extern struct tree_opt_pass pass_cse2;
  extern struct tree_opt_pass pass_df_initialize_opt;
  extern struct tree_opt_pass pass_df_initialize_no_opt;