File sse5-suse.patch01 of Package gcc41

2007-11-26  Michael Meissner  <michael.meissner@amd.com>

	Backport SSE5 changes to 4.2 from 4.3 mainline.
	* config/i386/i386.h (TARGET_ROUND): New macro for the round/ptest
	instructions which are shared between SSE4.1 and SSE5.
	(TARGET_FUSED_MADD): New macro for the -mfused-madd switch.
	(TARGET_CPU_CPP_BUILTINS): Add SSE5 support.

	* config/i386/i386.opt (-msse5): New switch for SSE5 support.
	(-mfused-madd): New switch to control whether the compiler
	generates the SSE5 fused multiply/add instructions.
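
	As a minimal sketch (illustration only, not part of the patch) of
	the kind of source -mfused-madd targets:

	  /* With -msse5 -mfused-madd and optimization enabled, the
	     multiply and add below may be combined into a single SSE5
	     fmadd instruction rather than a separate multiply and add.  */
	  void
	  saxpy (float *x, const float *y, float a, int n)
	  {
	    int i;
	    for (i = 0; i < n; i++)
	      x[i] = a * y[i] + x[i];
	  }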

	* config/i386/i386.c (m_AMD_MULTIPLE): Rename from
	m_ATHLON_K8_AMDFAM10, and change all uses.
	(enum pta_flags): Add PTA_SSE5.
	(ix86_handle_option): Turn off 3dnow if -msse5.
	(override_options): Add SSE5 support.
	(print_operand): %Y prints comparison codes for SSE5 com/pcom
	instructions.
	(ix86_expand_sse_movcc): Add SSE5 support.
	(ix86_expand_sse5_unpack): New function to use pperm to unpack a
	vector type to the next largest size.
	(ix86_expand_sse5_pack): New function to use pperm to pack a
	vector type to the next smallest size.
	(IX86_BUILTIN_FMADDSS): New for SSE5 intrinsic.
	(IX86_BUILTIN_FMADDSD): Ditto.
	(IX86_BUILTIN_FMADDPS): Ditto.
	(IX86_BUILTIN_FMADDPD): Ditto.
	(IX86_BUILTIN_FMSUBSS): Ditto.
	(IX86_BUILTIN_FMSUBSD): Ditto.
	(IX86_BUILTIN_FMSUBPS): Ditto.
	(IX86_BUILTIN_FMSUBPD): Ditto.
	(IX86_BUILTIN_FNMADDSS): Ditto.
	(IX86_BUILTIN_FNMADDSD): Ditto.
	(IX86_BUILTIN_FNMADDPS): Ditto.
	(IX86_BUILTIN_FNMADDPD): Ditto.
	(IX86_BUILTIN_FNMSUBSS): Ditto.
	(IX86_BUILTIN_FNMSUBSD): Ditto.
	(IX86_BUILTIN_FNMSUBPS): Ditto.
	(IX86_BUILTIN_FNMSUBPD): Ditto.
	(IX86_BUILTIN_PCMOV_V2DI): Ditto.
	(IX86_BUILTIN_PCMOV_V4SI): Ditto.
	(IX86_BUILTIN_PCMOV_V8HI): Ditto.
	(IX86_BUILTIN_PCMOV_V16QI): Ditto.
	(IX86_BUILTIN_PCMOV_V4SF): Ditto.
	(IX86_BUILTIN_PCMOV_V2DF): Ditto.
	(IX86_BUILTIN_PPERM): Ditto.
	(IX86_BUILTIN_PERMPS): Ditto.
	(IX86_BUILTIN_PERMPD): Ditto.
	(IX86_BUILTIN_PMACSSWW): Ditto.
	(IX86_BUILTIN_PMACSWW): Ditto.
	(IX86_BUILTIN_PMACSSWD): Ditto.
	(IX86_BUILTIN_PMACSWD): Ditto.
	(IX86_BUILTIN_PMACSSDD): Ditto.
	(IX86_BUILTIN_PMACSDD): Ditto.
	(IX86_BUILTIN_PMACSSDQL): Ditto.
	(IX86_BUILTIN_PMACSSDQH): Ditto.
	(IX86_BUILTIN_PMACSDQL): Ditto.
	(IX86_BUILTIN_PMACSDQH): Ditto.
	(IX86_BUILTIN_PMADCSSWD): Ditto.
	(IX86_BUILTIN_PMADCSWD): Ditto.
	(IX86_BUILTIN_PHADDBW): Ditto.
	(IX86_BUILTIN_PHADDBD): Ditto.
	(IX86_BUILTIN_PHADDBQ): Ditto.
	(IX86_BUILTIN_PHADDWD): Ditto.
	(IX86_BUILTIN_PHADDWQ): Ditto.
	(IX86_BUILTIN_PHADDDQ): Ditto.
	(IX86_BUILTIN_PHADDUBW): Ditto.
	(IX86_BUILTIN_PHADDUBD): Ditto.
	(IX86_BUILTIN_PHADDUBQ): Ditto.
	(IX86_BUILTIN_PHADDUWD): Ditto.
	(IX86_BUILTIN_PHADDUWQ): Ditto.
	(IX86_BUILTIN_PHADDUDQ): Ditto.
	(IX86_BUILTIN_PHSUBBW): Ditto.
	(IX86_BUILTIN_PHSUBWD): Ditto.
	(IX86_BUILTIN_PHSUBDQ): Ditto.
	(IX86_BUILTIN_PROTB): Ditto.
	(IX86_BUILTIN_PROTW): Ditto.
	(IX86_BUILTIN_PROTD): Ditto.
	(IX86_BUILTIN_PROTQ): Ditto.
	(IX86_BUILTIN_PROTB_IMM): Ditto.
	(IX86_BUILTIN_PROTW_IMM): Ditto.
	(IX86_BUILTIN_PROTD_IMM): Ditto.
	(IX86_BUILTIN_PROTQ_IMM): Ditto.
	(IX86_BUILTIN_PSHLB): Ditto.
	(IX86_BUILTIN_PSHLW): Ditto.
	(IX86_BUILTIN_PSHLD): Ditto.
	(IX86_BUILTIN_PSHLQ): Ditto.
	(IX86_BUILTIN_PSHAB): Ditto.
	(IX86_BUILTIN_PSHAW): Ditto.
	(IX86_BUILTIN_PSHAD): Ditto.
	(IX86_BUILTIN_PSHAQ): Ditto.
	(IX86_BUILTIN_FRCZSS): Ditto.
	(IX86_BUILTIN_FRCZSD): Ditto.
	(IX86_BUILTIN_FRCZPS): Ditto.
	(IX86_BUILTIN_FRCZPD): Ditto.
	(IX86_BUILTIN_CVTPH2PS): Ditto.
	(IX86_BUILTIN_CVTPS2PH): Ditto.
	(IX86_BUILTIN_COMEQSS): Ditto.
	(IX86_BUILTIN_COMNESS): Ditto.
	(IX86_BUILTIN_COMLTSS): Ditto.
	(IX86_BUILTIN_COMLESS): Ditto.
	(IX86_BUILTIN_COMGTSS): Ditto.
	(IX86_BUILTIN_COMGESS): Ditto.
	(IX86_BUILTIN_COMUEQSS): Ditto.
	(IX86_BUILTIN_COMUNESS): Ditto.
	(IX86_BUILTIN_COMULTSS): Ditto.
	(IX86_BUILTIN_COMULESS): Ditto.
	(IX86_BUILTIN_COMUGTSS): Ditto.
	(IX86_BUILTIN_COMUGESS): Ditto.
	(IX86_BUILTIN_COMORDSS): Ditto.
	(IX86_BUILTIN_COMUNORDSS): Ditto.
	(IX86_BUILTIN_COMFALSESS): Ditto.
	(IX86_BUILTIN_COMTRUESS): Ditto.
	(IX86_BUILTIN_COMEQSD): Ditto.
	(IX86_BUILTIN_COMNESD): Ditto.
	(IX86_BUILTIN_COMLTSD): Ditto.
	(IX86_BUILTIN_COMLESD): Ditto.
	(IX86_BUILTIN_COMGTSD): Ditto.
	(IX86_BUILTIN_COMGESD): Ditto.
	(IX86_BUILTIN_COMUEQSD): Ditto.
	(IX86_BUILTIN_COMUNESD): Ditto.
	(IX86_BUILTIN_COMULTSD): Ditto.
	(IX86_BUILTIN_COMULESD): Ditto.
	(IX86_BUILTIN_COMUGTSD): Ditto.
	(IX86_BUILTIN_COMUGESD): Ditto.
	(IX86_BUILTIN_COMORDSD): Ditto.
	(IX86_BUILTIN_COMUNORDSD): Ditto.
	(IX86_BUILTIN_COMFALSESD): Ditto.
	(IX86_BUILTIN_COMTRUESD): Ditto.
	(IX86_BUILTIN_COMEQPS): Ditto.
	(IX86_BUILTIN_COMNEPS): Ditto.
	(IX86_BUILTIN_COMLTPS): Ditto.
	(IX86_BUILTIN_COMLEPS): Ditto.
	(IX86_BUILTIN_COMGTPS): Ditto.
	(IX86_BUILTIN_COMGEPS): Ditto.
	(IX86_BUILTIN_COMUEQPS): Ditto.
	(IX86_BUILTIN_COMUNEPS): Ditto.
	(IX86_BUILTIN_COMULTPS): Ditto.
	(IX86_BUILTIN_COMULEPS): Ditto.
	(IX86_BUILTIN_COMUGTPS): Ditto.
	(IX86_BUILTIN_COMUGEPS): Ditto.
	(IX86_BUILTIN_COMORDPS): Ditto.
	(IX86_BUILTIN_COMUNORDPS): Ditto.
	(IX86_BUILTIN_COMFALSEPS): Ditto.
	(IX86_BUILTIN_COMTRUEPS): Ditto.
	(IX86_BUILTIN_COMEQPD): Ditto.
	(IX86_BUILTIN_COMNEPD): Ditto.
	(IX86_BUILTIN_COMLTPD): Ditto.
	(IX86_BUILTIN_COMLEPD): Ditto.
	(IX86_BUILTIN_COMGTPD): Ditto.
	(IX86_BUILTIN_COMGEPD): Ditto.
	(IX86_BUILTIN_COMUEQPD): Ditto.
	(IX86_BUILTIN_COMUNEPD): Ditto.
	(IX86_BUILTIN_COMULTPD): Ditto.
	(IX86_BUILTIN_COMULEPD): Ditto.
	(IX86_BUILTIN_COMUGTPD): Ditto.
	(IX86_BUILTIN_COMUGEPD): Ditto.
	(IX86_BUILTIN_COMORDPD): Ditto.
	(IX86_BUILTIN_COMUNORDPD): Ditto.
	(IX86_BUILTIN_COMFALSEPD): Ditto.
	(IX86_BUILTIN_COMTRUEPD): Ditto.
	(IX86_BUILTIN_PCOMEQUB): Ditto.
	(IX86_BUILTIN_PCOMNEUB): Ditto.
	(IX86_BUILTIN_PCOMLTUB): Ditto.
	(IX86_BUILTIN_PCOMLEUB): Ditto.
	(IX86_BUILTIN_PCOMGTUB): Ditto.
	(IX86_BUILTIN_PCOMGEUB): Ditto.
	(IX86_BUILTIN_PCOMFALSEUB): Ditto.
	(IX86_BUILTIN_PCOMTRUEUB): Ditto.
	(IX86_BUILTIN_PCOMEQUW): Ditto.
	(IX86_BUILTIN_PCOMNEUW): Ditto.
	(IX86_BUILTIN_PCOMLTUW): Ditto.
	(IX86_BUILTIN_PCOMLEUW): Ditto.
	(IX86_BUILTIN_PCOMGTUW): Ditto.
	(IX86_BUILTIN_PCOMGEUW): Ditto.
	(IX86_BUILTIN_PCOMFALSEUW): Ditto.
	(IX86_BUILTIN_PCOMTRUEUW): Ditto.
	(IX86_BUILTIN_PCOMEQUD): Ditto.
	(IX86_BUILTIN_PCOMNEUD): Ditto.
	(IX86_BUILTIN_PCOMLTUD): Ditto.
	(IX86_BUILTIN_PCOMLEUD): Ditto.
	(IX86_BUILTIN_PCOMGTUD): Ditto.
	(IX86_BUILTIN_PCOMGEUD): Ditto.
	(IX86_BUILTIN_PCOMFALSEUD): Ditto.
	(IX86_BUILTIN_PCOMTRUEUD): Ditto.
	(IX86_BUILTIN_PCOMEQUQ): Ditto.
	(IX86_BUILTIN_PCOMNEUQ): Ditto.
	(IX86_BUILTIN_PCOMLTUQ): Ditto.
	(IX86_BUILTIN_PCOMLEUQ): Ditto.
	(IX86_BUILTIN_PCOMGTUQ): Ditto.
	(IX86_BUILTIN_PCOMGEUQ): Ditto.
	(IX86_BUILTIN_PCOMFALSEUQ): Ditto.
	(IX86_BUILTIN_PCOMTRUEUQ): Ditto.
	(IX86_BUILTIN_PCOMEQB): Ditto.
	(IX86_BUILTIN_PCOMNEB): Ditto.
	(IX86_BUILTIN_PCOMLTB): Ditto.
	(IX86_BUILTIN_PCOMLEB): Ditto.
	(IX86_BUILTIN_PCOMGTB): Ditto.
	(IX86_BUILTIN_PCOMGEB): Ditto.
	(IX86_BUILTIN_PCOMFALSEB): Ditto.
	(IX86_BUILTIN_PCOMTRUEB): Ditto.
	(IX86_BUILTIN_PCOMEQW): Ditto.
	(IX86_BUILTIN_PCOMNEW): Ditto.
	(IX86_BUILTIN_PCOMLTW): Ditto.
	(IX86_BUILTIN_PCOMLEW): Ditto.
	(IX86_BUILTIN_PCOMGTW): Ditto.
	(IX86_BUILTIN_PCOMGEW): Ditto.
	(IX86_BUILTIN_PCOMFALSEW): Ditto.
	(IX86_BUILTIN_PCOMTRUEW): Ditto.
	(IX86_BUILTIN_PCOMEQD): Ditto.
	(IX86_BUILTIN_PCOMNED): Ditto.
	(IX86_BUILTIN_PCOMLTD): Ditto.
	(IX86_BUILTIN_PCOMLED): Ditto.
	(IX86_BUILTIN_PCOMGTD): Ditto.
	(IX86_BUILTIN_PCOMGED): Ditto.
	(IX86_BUILTIN_PCOMFALSED): Ditto.
	(IX86_BUILTIN_PCOMTRUED): Ditto.
	(IX86_BUILTIN_PCOMEQQ): Ditto.
	(IX86_BUILTIN_PCOMNEQ): Ditto.
	(IX86_BUILTIN_PCOMLTQ): Ditto.
	(IX86_BUILTIN_PCOMLEQ): Ditto.
	(IX86_BUILTIN_PCOMGTQ): Ditto.
	(IX86_BUILTIN_PCOMGEQ): Ditto.
	(IX86_BUILTIN_PCOMFALSEQ): Ditto.
	(IX86_BUILTIN_PCOMTRUEQ): Ditto.
	(bdesc_ptest): Change OPTION_MASK_ISA_SSE4_1 to
	OPTION_MASK_ISA_ROUND for instructions that are shared between
	SSE4.1 and SSE5.
	(bdesc_2arg): Ditto.
	(bdesc_sse_3arg): Ditto.
	(enum multi_arg_type): New enum for describing the various SSE5
	intrinsic argument types.
	(bdesc_multi_arg): New table for SSE5 intrinsics.
	(ix86_init_mmx_sse_builtins): Add SSE5 intrinsic support.
	(ix86_expand_multi_arg_builtin): New function for creating SSE5
	intrinsics.
	(ix86_expand_builtin): Add SSE5 intrinsic support.
	(ix86_sse5_valid_op_p): New function to validate SSE5 3- and
	4-operand instructions.
	(ix86_expand_sse5_multiple_memory): New function to split the
	second memory reference from SSE5 instructions.
	(type_has_variadic_args_p): Delete in favor of stdarg_p.
	(ix86_return_pops_args): Use stdarg_p to determine if the function
	has variable arguments.
	(ix86_setup_incoming_varargs): Ditto.
	(x86_this_parameter): Ditto.
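
	A rough illustration (not part of the patch) of the widening code
	ix86_expand_sse5_unpack handles, assuming the vectorizer runs
	(e.g. -O3 -msse5):

	  /* Widening the shorts to ints can go through the
	     vec_unpacks_lo_v8hi/vec_unpacks_hi_v8hi patterns, which SSE5
	     implements with pperm.  */
	  void
	  widen (int *d, const short *s, int n)
	  {
	    int i;
	    for (i = 0; i < n; i++)
	      d[i] = s[i];
	  }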

	* config/i386/i386-protos.h (ix86_expand_sse5_unpack): Add
	declaration.
	(ix86_expand_sse5_pack): Ditto.
	(ix86_sse5_valid_op_p): Ditto.
	(ix86_expand_sse5_multiple_memory): Ditto.

	* config/i386/i386.md (UNSPEC_SSE5_INTRINSIC): Add new UNSPEC
	constant for SSE5 support.
	(UNSPEC_SSE5_UNSIGNED_CMP): Ditto.
	(UNSPEC_SSE5_TRUEFALSE): Ditto.
	(UNSPEC_SSE5_PERMUTE): Ditto.
	(UNSPEC_SSE5_ASHIFT): Ditto.
	(UNSPEC_SSE5_LSHIFT): Ditto.
	(UNSPEC_FRCZ): Ditto.
	(UNSPEC_CVTPH2PS): Ditto.
	(UNSPEC_CVTPS2PH): Ditto.
	(PCOM_FALSE): Add new constant for true/false SSE5 comparisons.
	(PCOM_TRUE): Ditto.
	(COM_FALSE_S): Ditto.
	(COM_FALSE_P): Ditto.
	(COM_TRUE_S): Ditto.
	(COM_TRUE_P): Ditto.
	(type attribute): Add ssemuladd, sseiadd1, ssecvt1, sse4arg types.
	(unit attribute): Add support for ssemuladd, ssecvt1, sseiadd1,
	sse4arg types.
	(memory attribute): Ditto.
	(sse4_1_round<mode>2): Use TARGET_ROUND instead of TARGET_SSE4_1.
	Use SSE4_1_ROUND_* constants instead of hard coded numbers.
	(rint<mode>2): Use TARGET_ROUND instead of TARGET_SSE4_1.
	(floor<mode>2): Ditto.
	(ceil<mode>2): Ditto.
	(btrunc<mode>2): Ditto.
	(nearbyintdf2): Ditto.
	(nearbyintsf2): Ditto.
	(sse_setccsf): Disable if SSE5.
	(sse_setccdf): Ditto.
	(sse5_setcc<mode>): New support for SSE5 conditional move.
	(sse5_pcmov_<mode>): Ditto.
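
	A hedged sketch (not part of the patch) of a select that can map
	onto the new sse5_pcmov_<mode> patterns when vectorized:

	  /* An element-wise select like this is a candidate for a
	     comparison feeding pcmov instead of an and/andnot/or mask
	     sequence.  */
	  void
	  sel (float *r, const float *a, const float *b, int n)
	  {
	    int i;
	    for (i = 0; i < n; i++)
	      r[i] = (a[i] > b[i]) ? a[i] : b[i];
	  }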

	* config/i386/sse.md (SSEMODE1248): New mode iterator for SSE5.
	(SSEMODEF4): Ditto.
	(SSEMODEF2P): Ditto.
	(ssemodesuffixf4): New mode attribute for SSE5.
	(ssemodesuffixf2s): Ditto.
	(ssemodesuffixf2c): Ditto.
	(sserotatemax): Ditto.
	(ssescalarmode): Ditto.
	(sse_maskcmpv4sf3): Disable if SSE5.
	(sse_maskcmpv2df3): Ditto.
	(sse_vmmaskcmpv4sf3): Ditto.
	(sse5_fmadd<mode>4): Add SSE5 floating point multiply/add
	instructions.
	(sse5_vmfmadd<mode>4): Ditto.
	(sse5_fmsub<mode>4): Ditto.
	(sse5_vmfmsub<mode>4): Ditto.
	(sse5_fnmadd<mode>4): Ditto.
	(sse5_vmfnmadd<mode>4): Ditto.
	(sse5_fnmsub<mode>4): Ditto.
	(sse5_vmfnmsub<mode>4): Ditto.
	(sse5i_fmadd<mode>4): Ditto.
	(sse5i_fmsub<mode>4): Ditto.
	(sse5i_fnmadd<mode>4): Ditto.
	(sse5i_fnmsub<mode>4): Ditto.
	(sse5i_vmfmadd<mode>4): Ditto.
	(sse5i_vmfmsub<mode>4): Ditto.
	(sse5i_vmfnmadd<mode>4): Ditto.
	(sse5i_vmfnmsub<mode>4): Ditto.
	(mulv16qi3): Add SSE5 support.
	(mulv4si3): Ditto.
	(sse5_mulv4si3): New insn for 32-bit multiply support on SSE5.
	(sse2_mulv4si3): Disable if SSE5.
	(sse4_1_roundpd): Use TARGET_ROUND instead of TARGET_SSE4_1.
	(sse4_1_roundps): Ditto.
	(sse4_1_roundsd): Ditto.
	(sse4_1_roundss): Ditto.
	(sse_maskcmpv4sf3): Disable if SSE5 so the SSE5 instruction will
	be generated.
	(sse_maskcmpsf3): Ditto.
	(sse_vmmaskcmpv4sf3): Ditto.
	(sse2_maskcmpv2df3): Ditto.
	(sse2_maskcmpdf3): Ditto.
	(sse2_vmmaskcmpv2df3): Ditto.
	(sse2_eq<mode>3): Ditto.
	(sse2_gt<mode>3): Ditto.
	(sse5_pcmov_<mode>): Add SSE5 support.
	(vec_unpacku_hi_v16qi): Ditto.
	(vec_unpacks_hi_v16qi): Ditto.
	(vec_unpacku_lo_v16qi): Ditto.
	(vec_unpacks_lo_v16qi): Ditto.
	(vec_unpacku_hi_v8hi): Ditto.
	(vec_unpacks_hi_v8hi): Ditto.
	(vec_unpacku_lo_v8hi): Ditto.
	(vec_unpacks_lo_v8hi): Ditto.
	(vec_unpacku_hi_v4si): Ditto.
	(vec_unpacks_hi_v4si): Ditto.
	(vec_unpacku_lo_v4si): Ditto.
	(vec_unpacks_lo_v4si): Ditto.
	(sse5_pmacsww): New SSE5 intrinsic insn.
	(sse5_pmacssww): Ditto.
	(sse5_pmacsdd): Ditto.
	(sse5_pmacssdd): Ditto.
	(sse5_pmacssdql): Ditto.
	(sse5_pmacssdqh): Ditto.
	(sse5_pmacsdql): Ditto.
	(sse5_pmacsdqh): Ditto.
	(sse5_pmacsswd): Ditto.
	(sse5_pmacswd): Ditto.
	(sse5_pmadcsswd): Ditto.
	(sse5_pmadcswd): Ditto.
	(sse5_pcmov_<mode>): Conditional move support on SSE5.
	(sse5_phaddbw): New SSE5 intrinsic insn.
	(sse5_phaddbd): Ditto.
	(sse5_phaddbq): Ditto.
	(sse5_phaddwd): Ditto.
	(sse5_phaddwq): Ditto.
	(sse5_phadddq): Ditto.
	(sse5_phaddubw): Ditto.
	(sse5_phaddubd): Ditto.
	(sse5_phaddubq): Ditto.
	(sse5_phadduwd): Ditto.
	(sse5_phadduwq): Ditto.
	(sse5_phaddudq): Ditto.
	(sse5_phsubbw): Ditto.
	(sse5_phsubwd): Ditto.
	(sse5_phsubdq): Ditto.
	(sse5_pperm): Ditto.
	(sse5_pperm_sign_v16qi_v8hi): New insns for pack/unpack with SSE5.
	(sse5_pperm_zero_v16qi_v8hi): Ditto.
	(sse5_pperm_sign_v8hi_v4si): Ditto.
	(sse5_pperm_zero_v8hi_v4si): Ditto.
	(sse5_pperm_sign_v4si_v2di): Ditto.
	(sse5_pperm_zero_v4si_v2di): Ditto.
	(sse5_pperm_pack_v2di_v4si): Ditto.
	(sse5_pperm_pack_v4si_v8hi): Ditto.
	(sse5_pperm_pack_v8hi_v16qi): Ditto.
	(sse5_perm<mode>): New SSE5 intrinsic insn.
	(rotl<mode>3): Ditto.
	(sse5_rotl<mode>3): Ditto.
	(sse5_ashl<mode>3): Ditto.
	(sse5_lshl<mode>3): Ditto.
	(sse5_frcz<mode>2): Ditto.
	(sse5s_frcz<mode>2): Ditto.
	(sse5_cvtph2ps): Ditto.
	(sse5_cvtps2ph): Ditto.
	(sse5_vmmaskcmp<mode>3): Ditto.
	(sse5_com_tf<mode>3): Ditto.
	(sse5_maskcmp<mode>3): Ditto.
	(sse5_maskcmp_uns<mode>3): Ditto.
	(sse5_maskcmp_uns2<mode>3): Ditto.
	(sse5_pcom_tf<mode>3): Ditto.
	
	* config/i386/predicates.md (const_0_to_31_operand): New predicate
	to match 0..31.
	(sse5_comparison_float_operator): New predicate to match the
	comparison operators supported by the SSE5 com instruction.
	(ix86_comparison_int_operator): New predicate to match just the
	signed int comparisons.
	(ix86_comparison_uns_operator): New predicate to match just the
	unsigned int comparisons.
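
	A small usage example (illustration only) of an immediate that
	falls in the 0..31 range const_0_to_31_operand matches, here the
	doubleword rotate count:

	  #include <bmmintrin.h>

	  __m128i
	  rot_by_5 (__m128i v)
	  {
	    /* The rotate count must be a compile-time constant; for the
	       32-bit element form it is assumed to lie in 0..31.  */
	    return _mm_roti_epi32 (v, 5);
	  }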

	* doc/invoke.texi (-msse5): Add documentation.
	(-mfused-madd): Ditto.

	* doc/extend.texi (x86 intrinsics): Document new SSE5 intrinsics.

	* config.gcc (i[34567]86-*-*): Include bmmintrin.h and
	mmintrin-common.h.
	(x86_64-*-*): Ditto.

	* config/i386/bmmintrin.h: New file providing the x86 compiler
	intrinsics for SSE5.

	* config/i386/mmintrin-common.h: New file for the intrinsics
	shared between SSE4.1 and SSE5.
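
For reference, a short usage sketch of the new bmmintrin.h interface
(illustration only, not part of the patch; compile with -msse5):

	#include <bmmintrin.h>

	/* r = a*b + c on four packed floats (fmaddps).  */
	__m128
	fma4 (__m128 a, __m128 b, __m128 c)
	{
	  return _mm_macc_ps (a, b, c);
	}

	/* Bit-wise conditional move: each bit of c selects the
	   corresponding bit from a or b (pcmov).  */
	__m128i
	cmov (__m128i a, __m128i b, __m128i c)
	{
	  return _mm_cmov_si128 (a, b, c);
	}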

Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi.orig	2009-11-20 13:43:00.000000000 +0100
+++ gcc/doc/invoke.texi	2009-11-20 13:43:02.000000000 +0100
@@ -522,13 +522,14 @@ Objective-C and Objective-C++ Dialects}.
 -mno-fp-ret-in-387  -msoft-float  -msvr3-shlib @gol
 -mno-wide-multiply  -mrtd  -malign-double @gol
 -mpreferred-stack-boundary=@var{num} @gol
--mmmx  -msse  -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol
+-mmmx  -msse  -msse2 -msse3 -mssse3 -msse4a -msse5 -m3dnow -mpopcnt -mabm @gol
 -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm  -mssemisalign @gol
 -momit-leaf-frame-pointer  -mno-red-zone -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model} @gol
--m32  -m64 -mlarge-data-threshold=@var{num}}
+-m32  -m64 -mlarge-data-threshold=@var{num}
+-mfused-madd -mno-fused-madd}
 
 @emph{IA-64 Options}
 @gccoptlist{-mbig-endian  -mlittle-endian  -mgnu-as  -mgnu-ld  -mno-pic @gol
@@ -9075,7 +9076,7 @@ AMD K8 core based CPUs with x86-64 instr
 MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.)
 @item amdfam10, barcelona
 AMD Family 10 core based CPUs with x86-64 instruction set support.  (This supersets
-MMX, SSE, SSE2, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit instruction set
+MMX, SSE, SSE2, SSE4A, SSE5, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit instruction set
 extensions.)
 @item winchip-c6
 IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction
@@ -9364,6 +9365,8 @@ preferred alignment to @option{-mpreferr
 @itemx -mno-ssse3
 @item -msse4a
 @item -mno-sse4a
+@item -msse5
+@item -mno-sse5
 @item -m3dnow
 @itemx -mno-3dnow
 @opindex mmmx
@@ -9377,7 +9380,7 @@ preferred alignment to @option{-mpreferr
 @item -mabm
 @itemx -mno-abm
 These switches enable or disable the use of instructions in the MMX,
-SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets.  
+SSE, SSE2, SSE3, SSSE3, SSE4A, SSE5, ABM or 3DNow! extended instruction sets.  
 These extensions are also available as built-in functions: see 
 @ref{X86 Built-in Functions}, for details of the functions enabled
 and disabled by these switches.
@@ -9497,6 +9500,13 @@ building of shared libraries are not sup
 Generate code for the large model: This model makes no assumptions
 about addresses and sizes of sections.  Currently GCC does not implement
 this model.
+
+@item -mfused-madd
+@itemx -mno-fused-madd
+@opindex mfused-madd
+Enable automatic generation of fused floating-point multiply-add
+instructions if the ISA supports them.  The @option{-mfused-madd}
+option is on by default.
 @end table
 
 @node IA-64 Options
Index: gcc/config.gcc
===================================================================
--- gcc/config.gcc.orig	2009-11-20 13:43:00.000000000 +0100
+++ gcc/config.gcc	2009-11-20 13:43:02.000000000 +0100
@@ -264,12 +264,14 @@ xscale-*-*)
 i[34567]86-*-*)
 	cpu_type=i386
 	extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
-		       pmmintrin.h tmmintrin.h ammintrin.h"
+		       pmmintrin.h tmmintrin.h ammintrin.h bmmintrin.h
+		       mmintrin-common.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
 	extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
-		       pmmintrin.h tmmintrin.h ammintrin.h"
+		       pmmintrin.h tmmintrin.h ammintrin.h bmmintrin.h
+		       mmintrin-common.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
Index: gcc/config/i386/i386.h
===================================================================
--- gcc/config/i386/i386.h.orig	2009-11-20 13:42:55.000000000 +0100
+++ gcc/config/i386/i386.h	2009-11-20 13:43:02.000000000 +0100
@@ -141,6 +141,8 @@ extern const struct processor_costs *ix8
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
 #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
 
+#define TARGET_ROUND TARGET_SSE5
+
 #define TUNEMASK (1 << ix86_tune)
 extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
 extern const int x86_use_bit_test, x86_cmove, x86_fisttp, x86_deep_branch;
@@ -228,6 +230,7 @@ extern const int x86_use_vector_converts
 #define TARGET_USE_BT (x86_use_bt & TUNEMASK)
 #define TARGET_USE_INCDEC (x86_use_incdec & TUNEMASK)
 #define TARGET_PAD_RETURNS (x86_pad_returns & TUNEMASK)
+#define TARGET_FUSED_MADD x86_fused_muladd
 
 #define ASSEMBLER_DIALECT (ix86_asm_dialect)
 
@@ -397,6 +400,8 @@ extern const int x86_use_vector_converts
 	builtin_define ("__SSSE3__");				\
       if (TARGET_SSE4A)						\
  	builtin_define ("__SSE4A__");		                \
+      if (TARGET_SSE5)						\
+ 	builtin_define ("__SSE5__");		                \
       if (TARGET_SSE_MATH && TARGET_SSE)			\
 	builtin_define ("__SSE_MATH__");			\
       if (TARGET_SSE_MATH && TARGET_SSE2)			\
Index: gcc/config/i386/bmmintrin.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ gcc/config/i386/bmmintrin.h	2009-11-20 13:43:02.000000000 +0100
@@ -0,0 +1,1260 @@
+/* Copyright (C) 2007 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to
+   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* As a special exception, if you include this header file into source
+   files compiled by GCC, this header file does not by itself cause
+   the resulting executable to be covered by the GNU General Public
+   License.  This exception does not however invalidate any other
+   reasons why the executable file might be covered by the GNU General
+   Public License.  */
+
+#ifndef _BMMINTRIN_H_INCLUDED
+#define _BMMINTRIN_H_INCLUDED
+
+#ifndef __SSE5__
+# error "SSE5 instruction set not enabled"
+#else
+
+/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files.  */
+#include <ammintrin.h>
+#include <mmintrin-common.h>
+
+/* Floating point multiply/add type instructions */
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return  (__m128) __builtin_ia32_fmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fnmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fnmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fnmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fnmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fnmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fnmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_fnmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_fnmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+/* Integer multiply/add instructions.  */
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_pmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_pmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+/* Packed Integer Horizontal Add and Subtract */
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddbw ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddbd ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddbq ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddwd ((__v8hi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddwq ((__v8hi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phadddq ((__v4si)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddubw ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddubd ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddubq ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phadduwd ((__v8hi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phadduwq ((__v8hi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phaddudq ((__v4si)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phsubbw ((__v16qi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phsubwd ((__v8hi)__A);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_phsubdq ((__v4si)__A);
+}
+
+/* Vector conditional move and permute */
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pcmov (__A, __B, __C);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_pperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_perm_ps(__m128 __A, __m128 __B, __m128i __C)
+{
+  return  (__m128) __builtin_ia32_permps ((__m128)__A, (__m128)__B, (__v16qi)__C);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_perm_pd(__m128d __A, __m128d __B, __m128i __C)
+{
+  return  (__m128d) __builtin_ia32_permpd ((__m128d)__A, (__m128d)__B, (__v16qi)__C);
+}
+
+/* Packed Integer Rotates and Shifts */
+
+/* Rotates - Non-Immediate form */
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_rot_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_protb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_rot_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_protw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_rot_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_protd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_rot_epi64(__m128i __A,  __m128i __B)
+{
+  return (__m128i)  __builtin_ia32_protq ((__v2di)__A, (__v2di)__B);
+}
+
+
+/* Rotates - Immediate form */
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_roti_epi8(__m128i __A,  int __B)
+{
+  return  (__m128i) __builtin_ia32_protbi ((__v16qi)__A, __B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_roti_epi16(__m128i __A, int __B)
+{
+  return  (__m128i) __builtin_ia32_protwi ((__v8hi)__A, __B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_roti_epi32(__m128i __A, int __B)
+{
+  return  (__m128i) __builtin_ia32_protdi ((__v4si)__A, __B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_roti_epi64(__m128i __A, int __B)
+{
+  return  (__m128i) __builtin_ia32_protqi ((__v2di)__A, __B);
+}
+#else
+#define _mm_roti_epi8(A, B) ((__m128i) __builtin_ia32_protbi ((__v16qi)(A), (B)))
+#define _mm_roti_epi16(A, B) ((__m128i) __builtin_ia32_protwi ((__v8hi)(A), (B)))
+#define _mm_roti_epi32(A, B) ((__m128i) __builtin_ia32_protdi ((__v4si)(A), (B)))
+#define _mm_roti_epi64(A, B) ((__m128i) __builtin_ia32_protqi ((__v2di)(A), (B)))
+#endif
+
+/* pshl */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_shl_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshlb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_shl_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshlw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_shl_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshld ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_shl_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshlq ((__v2di)__A, (__v2di)__B);
+}
+
+/* psha */
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_sha_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshab ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_sha_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshaw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_sha_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshad ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_sha_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_pshaq ((__v2di)__A, (__v2di)__B);
+}
+
+/* Compare and Predicate Generation */
+
+/* com (floating point, packed single) */
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comeq_ps(__m128 __A, __m128 __B)
+{
+  return  (__m128) __builtin_ia32_comeqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comlt_ps(__m128 __A, __m128 __B)
+{
+  return  (__m128) __builtin_ia32_comltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comle_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comunord_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comunordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comneq_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comuneqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnlt_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comunltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnle_ps(__m128 __A, __m128 __B)
+{
+  return (__m128)  __builtin_ia32_comunleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comord_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comueq_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comueqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnge_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comungeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comngt_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comungtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comfalseps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comoneq_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comneqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comge_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comgeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comgt_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comgtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_ps(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comtrueps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* com (floating point, packed double) */
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comeq_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comeqpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comlt_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comltpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comle_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comlepd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comunord_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunordpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comneq_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comuneqpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnlt_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunltpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnle_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunlepd ((__v2df)__A, (__v2df)__B);
+}
+
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comord_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comordpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comueq_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comueqpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnge_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comungepd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comngt_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comungtpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comfalsepd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comoneq_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comneqpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comge_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comgepd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comgt_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comgtpd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_pd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comtruepd ((__v2df)__A, (__v2df)__B);
+}
+
+/* com (floating point, scalar single) */
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comeq_ss(__m128 __A, __m128 __B)
+{
+  return (__m128)  __builtin_ia32_comeqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comlt_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comle_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comless ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comunord_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comunordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comneq_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comuneqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnlt_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comunltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnle_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comunless ((__v4sf)__A, (__v4sf)__B);
+}
+
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comord_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comueq_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comueqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comnge_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comungess ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comngt_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comungtss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comfalsess ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comoneq_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comneqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comge_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comgess ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comgt_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comgtss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_comtruess ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* com (floating point, scalar double) */
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comeq_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comeqsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comlt_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comltsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comle_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comlesd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comunord_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunordsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comneq_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comuneqsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnlt_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunltsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnle_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comunlesd ((__v2df)__A, (__v2df)__B);
+}
+
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comord_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comordsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comueq_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comueqsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comnge_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comungesd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comngt_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comungtsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comfalsesd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comoneq_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comneqsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comge_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comgesd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comgt_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comgtsd ((__v2df)__A, (__v2df)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_comtruesd ((__v2df)__A, (__v2df)__B);
+}
+
+
+/* pcom (integer, unsigned bytes) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltub ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleub ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtub ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeub ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomnequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseub ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueub ((__v16qi)__A, (__v16qi)__B);
+}
+
+/* pcom (integer, unsigned words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomnequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+/* pcom (integer, unsigned double words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltud ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleud ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtud ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeud ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomequd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomnequd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseud ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueud ((__v4si)__A, (__v4si)__B);
+}
+
+/* pcom (integer, unsigned quad words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltuq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleuq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtuq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeuq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomequq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomnequq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseuq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueuq ((__v2di)__A, (__v2di)__B);
+}
+
+/* pcom (integer, signed bytes) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomeqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomneqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseb ((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueb ((__v16qi)__A, (__v16qi)__B);
+}
+
+/* pcom (integer, signed words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomlew ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgew ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomeqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomneqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalsew ((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtruew ((__v8hi)__A, (__v8hi)__B);
+}
+
+/* pcom (integer, signed double words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomled ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomged ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomeqd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomneqd ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalsed ((__v4si)__A, (__v4si)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrued ((__v4si)__A, (__v4si)__B);
+}
+
+/* pcom (integer, signed quad words) */
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomltq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomleq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgtq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomgeq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomeqq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomneqq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomfalseq ((__v2di)__A, (__v2di)__B);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __artificial__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pcomtrueq ((__v2di)__A, (__v2di)__B);
+}
+
+/* FRCZ */
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_frcz_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_frczps ((__v4sf)__A);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_frcz_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_frczpd ((__v2df)__A);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_frcz_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_frczss ((__v4sf)__A, (__v4sf)__B);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_frcz_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_frczsd ((__v2df)__A, (__v2df)__B);
+}
+
+#endif /* __SSE5__ */
+
+#endif /* _BMMINTRIN_H_INCLUDED */
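
For orientation, each pcom intrinsic above yields a per-element mask of
all-ones (condition true) or all-zeros (condition false), and the
_mm_frcz_* intrinsics extract the fractional part of each element.  A
minimal scalar sketch of those semantics (plain C, independent of any
SSE5 hardware; the helper names are illustrative only):

    #include <stdint.h>
    #include <math.h>

    /* Per-lane semantics of _mm_comlt_epi32: all-ones if a < b,
       otherwise all-zeros.  */
    static uint32_t comlt_lane (int32_t a, int32_t b)
    {
      return (a < b) ? 0xffffffffu : 0u;
    }

    /* Per-lane semantics of _mm_frcz_ps: keep only the fractional
       part of the value.  */
    static float frcz_lane (float x)
    {
      float ipart;
      return modff (x, &ipart);
    }
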
Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md.orig	2009-11-20 13:42:55.000000000 +0100
+++ gcc/config/i386/i386.md	2009-11-20 13:43:02.000000000 +0100
@@ -167,6 +167,21 @@
    (UNSPEC_CVTSI2SD_AMDFAM10    141)	
    (UNSPEC_MOVDSI2SF_AMDFAM10 	142)
    (UNSPEC_MOVDSI2DF_AMDFAM10 	143)	
+
+   ; For SSE4.1/SSE5 support
+   (UNSPEC_PTEST		145)
+   (UNSPEC_ROUND		146)
+
+   ;; For SSE5
+   (UNSPEC_SSE5_INTRINSIC	150)
+   (UNSPEC_SSE5_UNSIGNED_CMP	151)
+   (UNSPEC_SSE5_TRUEFALSE	152)
+   (UNSPEC_SSE5_PERMUTE		153)
+   (UNSPEC_SSE5_ASHIFT		154)
+   (UNSPEC_SSE5_LSHIFT		155)
+   (UNSPEC_FRCZ			156)
+   (UNSPEC_CVTPH2PS		157)
+   (UNSPEC_CVTPS2PH		158)
   ])
 
 (define_constants
@@ -186,6 +201,16 @@
    (UNSPECV_LOCK		13)
   ])
 
+;; Constants to represent pcomtrue/pcomfalse variants
+(define_constants
+  [(PCOM_FALSE			0)
+   (PCOM_TRUE			1)
+   (COM_FALSE_S			2)
+   (COM_FALSE_P			3)
+   (COM_TRUE_S			4)
+   (COM_TRUE_P			5)
+  ])
+
 ;; Registers by name.
 (define_constants
   [(BP_REG			 6)
@@ -219,8 +244,9 @@
    push,pop,call,callv,leave,
    str,cld,
    fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint,
-   sselog,sselog1,sseiadd,sseishft,sseimul,
-   sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins,
+   sselog,sselog1,sseiadd,sseiadd1,sseishft,sseimul,
+   sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,ssediv,sseins,
+   ssemuladd,sse4arg,
    mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
   (const_string "other"))
 
@@ -233,8 +259,9 @@
 (define_attr "unit" "integer,i387,sse,mmx,unknown"
   (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
 	   (const_string "i387")
-	 (eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul,
-			  sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins")
+	 (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseimul,
+			  sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,
+			  ssediv,sseins,ssemuladd,sse4arg")
 	   (const_string "sse")
 	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
 	   (const_string "mmx")
@@ -425,12 +452,12 @@
 		 "!alu1,negnot,ishift1,
 		   imov,imovx,icmp,test,
 		   fmov,fcmp,fsgn,
-		   sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1,
-		   mmx,mmxmov,mmxcmp,mmxcvt")
+		   sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,sselog1,
+		   sseiadd1,mmx,mmxmov,mmxcmp,mmxcvt")
 	      (match_operand 2 "memory_operand" ""))
 	   (const_string "load")
-	 (and (eq_attr "type" "icmov")
-	      (match_operand 3 "memory_operand" ""))
+ 	 (and (eq_attr "type" "icmov,ssemuladd,sse4arg")
+  	      (match_operand 3 "memory_operand" ""))
 	   (const_string "load")
 	]
 	(const_string "none")))
@@ -477,10 +504,14 @@
 
 ;; All SSE floating point modes
 (define_mode_macro SSEMODEF [SF DF])
+(define_mode_macro MODEF [SF DF])
  
 ;; All integer modes handled by SSE cvtts?2si* operators.
 (define_mode_macro SSEMODEI24 [SI DI])
 
+;; SSE asm suffix for floating point modes
+(define_mode_attr ssemodefsuffix [(SF "s") (DF "d")])
+
 
 ;; Scheduling descriptions
 
@@ -7500,6 +7531,9 @@
 		 (match_operand:SF 2 "nonimmediate_operand" "")))]
   "TARGET_80387 || TARGET_SSE_MATH"
   "")
+
+;; SSE5 scalar multiply/add instructions are defined in sse.md.
+
 
 ;; Divide instructions
 
@@ -13512,7 +13546,7 @@
 	(match_operator:SF 1 "sse_comparison_operator"
 	  [(match_operand:SF 2 "register_operand" "0")
 	   (match_operand:SF 3 "nonimmediate_operand" "xm")]))]
-  "TARGET_SSE"
+  "TARGET_SSE && !TARGET_SSE5"
   "cmp%D1ss\t{%3, %0|%0, %3}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "SF")])
@@ -13522,10 +13556,21 @@
 	(match_operator:DF 1 "sse_comparison_operator"
 	  [(match_operand:DF 2 "register_operand" "0")
 	   (match_operand:DF 3 "nonimmediate_operand" "Ym")]))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_SSE5"
   "cmp%D1sd\t{%3, %0|%0, %3}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "DF")])
+
+(define_insn "*sse5_setcc<mode>"
+  [(set (match_operand:MODEF 0 "register_operand" "=x")
+	(match_operator:MODEF 1 "sse5_comparison_float_operator"
+	  [(match_operand:MODEF 2 "register_operand" "x")
+	   (match_operand:MODEF 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE5"
+  "com%Y1s<ssemodefsuffix>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "<MODE>")])
+
 
 ;; Basic conditional jump instructions.
 ;; We ignore the overflow flag for signed branch instructions.
@@ -17234,6 +17279,15 @@
     operands[i] = gen_reg_rtx (XFmode);
 })
 
+(define_insn "sse4_1_round<mode>2"
+  [(set (match_operand:MODEF 0 "register_operand" "=x")
+	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x")
+		       (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		      UNSPEC_ROUND))]
+  "TARGET_ROUND"
+  "rounds<ssemodefsuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "<MODE>")])
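
The 4-bit immediate matched by const_0_to_15_operand uses the SSE4.1
ROUND encoding: bits 1:0 pick the rounding mode, bit 2 selects the
current MXCSR rounding mode instead, and bit 3 suppresses the inexact
(precision) exception.  A reference sketch in C (macro names chosen to
mirror the usual smmintrin.h spellings, shown here for illustration
only):

    #define ROUND_TO_NEAREST_INT 0x0  /* round to nearest (even) */
    #define ROUND_TO_NEG_INF     0x1  /* round toward -inf       */
    #define ROUND_TO_POS_INF     0x2  /* round toward +inf       */
    #define ROUND_TO_ZERO        0x3  /* truncate                */
    #define ROUND_CUR_DIRECTION  0x4  /* use the MXCSR mode      */
    #define ROUND_NO_EXC         0x8  /* suppress inexact        */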
 
 (define_insn "frndintxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")
@@ -19206,6 +19260,20 @@
   [(set_attr "type" "fcmov")
    (set_attr "mode" "XF")])
 
+;; All moves in SSE5 pcmov instructions are 128 bits and hence we restrict
+;; the scalar versions to have only XMM registers as operands.
+
+;; SSE5 conditional move
+(define_insn "*sse5_pcmov_<mode>"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(if_then_else:MODEF
+	  (match_operand:MODEF 1 "register_operand" "x,0")
+	  (match_operand:MODEF 2 "register_operand" "0,x")
+	  (match_operand:MODEF 3 "register_operand" "x,x")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "pcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}"
+  [(set_attr "type" "sse4arg")])
+
 ;; These versions of the min/max patterns are intentionally ignorant of
 ;; their behavior wrt -0.0 and NaN (via the commutative operand mark).
 ;; Since both the tree-level MAX_EXPR and the rtl-level SMAX operator
Index: gcc/config/i386/predicates.md
===================================================================
--- gcc/config/i386/predicates.md.orig	2009-11-20 13:41:10.000000000 +0100
+++ gcc/config/i386/predicates.md	2009-11-20 13:43:02.000000000 +0100
@@ -1,5 +1,5 @@
 ;; Predicate definitions for IA-32 and x86-64.
-;; Copyright (C) 2004, 2005 Free Software Foundation, Inc.
+;; Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
 ;;
 ;; This file is part of GCC.
 ;;
@@ -569,6 +569,11 @@
   (and (match_code "const_int")
        (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 15")))
 
+;; Match 0 to 31.
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
 ;; Match 0 to 63.
 (define_predicate "const_0_to_63_operand"
   (and (match_code "const_int")
@@ -809,6 +814,18 @@
 (define_special_predicate "sse_comparison_operator"
   (match_code "eq,lt,le,unordered,ne,unge,ungt,ordered"))
 
+;; Return 1 if OP is a comparison operator that can be issued by the SSE5
+;; predicate generation instructions.
+(define_predicate "sse5_comparison_float_operator"
+  (and (match_test "TARGET_SSE5")
+       (match_code "ne,eq,ge,gt,le,lt,unordered,ordered,uneq,unge,ungt,unle,unlt,ltgt")))
+
+(define_predicate "ix86_comparison_int_operator"
+  (match_code "ne,eq,ge,gt,le,lt"))
+
+(define_predicate "ix86_comparison_uns_operator"
+  (match_code "ne,eq,geu,gtu,leu,ltu"))
+
 ;; Return 1 if OP is a valid comparison operator in valid mode.
 (define_predicate "ix86_comparison_operator"
   (match_operand 0 "comparison_operator")
Index: gcc/config/i386/i386-modes.def
===================================================================
--- gcc/config/i386/i386-modes.def.orig	2005-10-28 16:17:15.000000000 +0200
+++ gcc/config/i386/i386-modes.def	2009-11-20 13:43:02.000000000 +0100
@@ -68,6 +68,7 @@ VECTOR_MODES (INT, 8);        /*       V
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI */
 VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
 VECTOR_MODES (FLOAT, 16);     /*       V8HF V4SF V2DF */
+VECTOR_MODE (INT, QI, 2);     /*                 V2QI */
 VECTOR_MODE (INT, DI, 4);     /*                 V4DI */
 VECTOR_MODE (INT, SI, 8);     /*                 V8SI */
 VECTOR_MODE (INT, HI, 16);    /*                V16HI */
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md.orig	2009-11-20 13:42:55.000000000 +0100
+++ gcc/config/i386/sse.md	2009-11-20 13:43:02.000000000 +0100
@@ -33,10 +33,24 @@
 (define_mode_macro SSEMODE14 [V16QI V4SI])
 (define_mode_macro SSEMODE124 [V16QI V8HI V4SI])
 (define_mode_macro SSEMODE248 [V8HI V4SI V2DI])
+(define_mode_macro SSEMODE1248 [V16QI V8HI V4SI V2DI])
+(define_mode_macro SSEMODEF4 [SF DF V4SF V2DF])
+(define_mode_macro SSEMODEF2P [V4SF V2DF])
 
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")])
 
+;; Mapping of the sse5 suffix
+(define_mode_attr ssemodesuffixf4 [(SF "ss") (DF "sd") (V4SF "ps") (V2DF "pd")])
+(define_mode_attr ssemodesuffixf2s [(SF "ss") (DF "sd") (V4SF "ss") (V2DF "sd")])
+(define_mode_attr ssemodesuffixf2c [(V4SF "s") (V2DF "d")])
+
+;; Mapping of the max integer size for sse5 rotate immediate constraint
+(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
+
+;; Mapping of vector modes back to the scalar modes
+(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")])
+
 ;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics.
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -768,7 +782,7 @@
 	(match_operator:V4SF 3 "sse_comparison_operator"
 		[(match_operand:V4SF 1 "register_operand" "0")
 		 (match_operand:V4SF 2 "nonimmediate_operand" "xm")]))]
-  "TARGET_SSE"
+  "TARGET_SSE && !TARGET_SSE5"
   "cmp%D3ps\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "V4SF")])
@@ -781,7 +795,7 @@
 		 (match_operand:V4SF 2 "register_operand" "x")])
 	 (match_dup 1)
 	 (const_int 1)))]
-  "TARGET_SSE"
+  "TARGET_SSE && !TARGET_SSE5"
   "cmp%D3ss\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "SF")])
@@ -1409,6 +1423,563 @@
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
+;; SSE5 floating point multiply/accumulate instructions.  This includes the
+;; scalar versions of the instructions as well as the vector versions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; In order to match (*a * *b) + *c, particularly when vectorizing, allow
+;; combine to generate a multiply/add with two memory references.  We then
+;; split this insn into loading the destination register from one of the
+;; memory operands.  If we don't manage to split the insn, reload will
+;; generate the appropriate moves.  The reason this is needed is that combine
+;; has already folded one of the memory references into both the multiply and
+;; add insns, and it can't generate a new pseudo.  I.e.:
+;;	(set (reg1) (mem (addr1)))
+;;	(set (reg2) (mult (reg1) (mem (addr2))))
+;;	(set (reg3) (plus (reg2) (mem (addr3))))
+
+(define_insn "sse5_fmadd<mode>4"
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x,x")
+	(plus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "%0,0,x,xm")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,xm,x"))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,0,0")))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)"
+  "fmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Split fmadd with two memory operands into a load and the fmadd.
+(define_split
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+	(plus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, true, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, <MODE>mode);
+  emit_insn (gen_sse5_fmadd<mode>4 (operands[0], operands[1],
+				    operands[2], operands[3]));
+  DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fmadd
+(define_insn "sse5_vmfmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(vec_merge:SSEMODEF2P
+	 (plus:SSEMODEF2P
+	  (mult:SSEMODEF2P
+	   (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0")
+	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Floating point multiply and subtract
+;; Allow two memory operands, the same as fmadd
+(define_insn "sse5_fmsub<mode>4"
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x,x")
+	(minus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "%0,0,x,xm")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,xm,x"))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,0,0")))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)"
+  "fmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Split fmsub with two memory operands into a load and the fmsub.
+(define_split
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+	(minus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, true, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, <MODE>mode);
+  emit_insn (gen_sse5_fmsub<mode>4 (operands[0], operands[1],
+				    operands[2], operands[3]));
+  DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fmsub
+(define_insn "sse5_vmfmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(vec_merge:SSEMODEF2P
+	 (minus:SSEMODEF2P
+	  (mult:SSEMODEF2P
+	   (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0")
+	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Floating point negative multiply and add
+;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b)
+;; Note operands are out of order to simplify the call to ix86_sse5_valid_op_p
+;; Allow two memory operands to help in optimizing.
+(define_insn "sse5_fnmadd<mode>4"
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x,x")
+	(minus:SSEMODEF4
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,0,0")
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "%0,0,x,xm")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,xm,x"))))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)"
+  "fnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Split fnmadd with two memory operands into a load and the fnmadd.
+(define_split
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+	(minus:SSEMODEF4
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")
+	 (mult:SSEMODEF4
+	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, true, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, <MODE>mode);
+  emit_insn (gen_sse5_fnmadd<mode>4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+  DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fnmadd
+(define_insn "sse5_vmfnmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(vec_merge:SSEMODEF2P
+	 (minus:SSEMODEF2P
+	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
+	  (mult:SSEMODEF2P
+	   (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0")
+	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Floating point negative multiply and subtract
+;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c
+;; Allow 2 memory operands to help with optimization
+(define_insn "sse5_fnmsub<mode>4"
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x")
+	(minus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (neg:SSEMODEF4
+	   (match_operand:SSEMODEF4 1 "nonimmediate_operand" "0,0"))
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm"))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x")))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)"
+  "fnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Split fnmsub with two memory operands into a load and the fnmsub.
+(define_split
+  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+	(minus:SSEMODEF4
+	 (mult:SSEMODEF4
+	  (neg:SSEMODEF4
+	   (match_operand:SSEMODEF4 1 "nonimmediate_operand" ""))
+	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, true, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, <MODE>mode);
+  emit_insn (gen_sse5_fnmsub<mode>4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+  DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fnmsub
+(define_insn "sse5_vmfnmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(vec_merge:SSEMODEF2P
+	 (minus:SSEMODEF2P
+	  (mult:SSEMODEF2P
+	   (neg:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0"))
+	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE5 && TARGET_FUSED_MADD
+   && ix86_sse5_valid_op_p (operands, insn, 4, true, 2)"
+  "fnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; The same instructions, but using an UNSPEC so that the intrinsics can be
+;; used even if the user specified -mno-fused-madd.
+;; Parallel instructions.  During instruction generation, just default
+;; to registers, and let combine later build the appropriate instruction.
+(define_expand "sse5i_fmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(plus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "register_operand" "")
+	    (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	   (match_operand:SSEMODEF2P 3 "register_operand" ""))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_fmadd<mode>4 (operands[0], operands[1],
+					operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_fmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x,x,x")
+	(unspec:SSEMODEF2P
+	 [(plus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0,0,x,xm")
+	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm,xm,x"))
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x,0,0"))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "sse5i_fmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "register_operand" "")
+	    (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	   (match_operand:SSEMODEF2P 3 "register_operand" ""))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_fmsub<mode>4 (operands[0], operands[1],
+					operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_fmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x,x,x")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0,0,x,xm")
+	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm,xm,x"))
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x,0,0"))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b)
+;; Note operands are out of order to simplify the call to ix86_sse5_valid_op_p
+(define_expand "sse5i_fnmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (match_operand:SSEMODEF2P 3 "register_operand" "")
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "register_operand" "")
+	    (match_operand:SSEMODEF2P 2 "register_operand" "")))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_fnmadd<mode>4 (operands[0], operands[1],
+					 operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_fnmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x,x,x")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x,0,0")
+	   (mult:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0,0,x,xm")
+	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm,xm,x")))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c
+(define_expand "sse5i_fnmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (neg:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "register_operand" ""))
+	    (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	   (match_operand:SSEMODEF2P 3 "register_operand" ""))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_fnmsub<mode>4 (operands[0], operands[1],
+					 operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_fnmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x,x,x")
+	(unspec:SSEMODEF2P
+	 [(minus:SSEMODEF2P
+	   (mult:SSEMODEF2P
+	    (neg:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0,0,x,xm"))
+	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm,xm,x"))
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x,0,0"))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; Scalar instructions
+(define_expand "sse5i_vmfmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (plus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "register_operand" "")
+	     (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	    (match_operand:SSEMODEF2P 3 "register_operand" ""))
+	   (match_dup 1)
+	   (const_int 0))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_vmfmadd<mode>4 (operands[0], operands[1],
+					  operands[2], operands[3]));
+      DONE;
+    }
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are accepted.
+(define_insn "*sse5i_vmfmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (plus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "register_operand" "0,0")
+	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	   (match_dup 0)
+	   (const_int 0))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "sse5i_vmfmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "register_operand" "")
+	     (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	    (match_operand:SSEMODEF2P 3 "register_operand" ""))
+	   (match_dup 0)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_vmfmsub<mode>4 (operands[0], operands[1],
+					  operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_vmfmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0")
+	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	   (match_dup 1)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<ssescalarmode>")])
+
+;; Note operands are out of order to simplify the call to ix86_sse5_valid_op_p
+(define_expand "sse5i_vmfnmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 3 "register_operand" "")
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "register_operand" "")
+	     (match_operand:SSEMODEF2P 2 "register_operand" "")))
+	   (match_dup 1)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_vmfnmadd<mode>4 (operands[0], operands[1],
+					   operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_vmfnmadd<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
+	    (mult:SSEMODEF2P
+	     (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0")
+	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
+	   (match_dup 1)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "sse5i_vmfnmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (neg:SSEMODEF2P
+	      (match_operand:SSEMODEF2P 1 "register_operand" ""))
+	     (match_operand:SSEMODEF2P 2 "register_operand" ""))
+	    (match_operand:SSEMODEF2P 3 "register_operand" ""))
+	   (match_dup 1)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5"
+{
+  /* If we have -mfused-madd, emit the normal insn rather than the UNSPEC.  */
+  if (TARGET_FUSED_MADD)
+    {
+      emit_insn (gen_sse5_vmfnmsub<mode>4 (operands[0], operands[1],
+					   operands[2], operands[3]));
+      DONE;
+    }
+})
+
+(define_insn "*sse5i_vmfnmsub<mode>4"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+	(unspec:SSEMODEF2P
+	 [(vec_merge:SSEMODEF2P
+	   (minus:SSEMODEF2P
+	    (mult:SSEMODEF2P
+	     (neg:SSEMODEF2P
+	      (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0"))
+	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+	   (match_dup 1)
+	   (const_int 1))]
+	 UNSPEC_SSE5_INTRINSIC))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "fnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<ssescalarmode>")])
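
The plus/minus patterns at the top of this block cover the four
canonical fused shapes.  As a reference, the source-level forms the
combiner is expected to match under -msse5 -mfused-madd, written in
plain C (illustrative only; actual code generation depends on the
combine pass):

    void madd  (float *r, float a, float b, float c) { *r = a * b + c; }  /* fmadd  */
    void msub  (float *r, float a, float b, float c) { *r = a * b - c; }  /* fmsub  */
    void nmadd (float *r, float a, float b, float c) { *r = c - a * b; }  /* fnmadd */
    void nmsub (float *r, float a, float b, float c) { *r = -a * b - c; } /* fnmsub */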
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
 ;; Parallel double-precision floating point arithmetic
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1739,7 +2310,7 @@
 	(match_operator:V2DF 3 "sse_comparison_operator"
 		[(match_operand:V2DF 1 "register_operand" "0")
 		 (match_operand:V2DF 2 "nonimmediate_operand" "xm")]))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_SSE5"
   "cmp%D3pd\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "V2DF")])
@@ -1752,7 +2323,7 @@
 		 (match_operand:V2DF 2 "nonimmediate_operand" "xm")])
 	  (match_dup 1)
 	  (const_int 1)))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_SSE5"
   "cmp%D3sd\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "DF")])
@@ -2670,6 +3241,31 @@
   [(set_attr "type" "sseiadd")
    (set_attr "mode" "TI")])
 
+;; We don't have a straight 32-bit parallel multiply on SSE5, so fake it with a
+;; multiply/add.  In general, we expect the define_split to occur before
+;; register allocation, so we have to handle the corner case where the target
+;; is used as the base or index register in operands 1/2.
+(define_insn_and_split "sse5_mulv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "=&x")
+	(mult:V4SI (match_operand:V4SI 1 "register_operand" "%x")
+		   (match_operand:V4SI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE5"
+  "#"
+  "&& (reload_completed
+       || (!reg_mentioned_p (operands[0], operands[1])
+	   && !reg_mentioned_p (operands[0], operands[2])))"
+  [(set (match_dup 0)
+	(match_dup 3))
+   (set (match_dup 0)
+	(plus:V4SI (mult:V4SI (match_dup 1)
+			      (match_dup 2))
+		   (match_dup 0)))]
+{
+  operands[3] = CONST0_RTX (V4SImode);
+}
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
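
In scalar terms, the split above computes a plain 32-bit multiply by
seeding the destination with zero and issuing one multiply/add, i.e.
per lane (reference sketch in C):

    #include <stdint.h>

    /* sse5_mulv4si3 becomes "dst = 0; dst = a * b + dst", a pmacsdd
       with a zeroed accumulator.  */
    static int32_t mul_via_mac (int32_t a, int32_t b)
    {
      int32_t acc = 0;
      acc = a * b + acc;
      return acc;
    }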
+
 (define_expand "mulv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "")
 	(mult:V4SI (match_operand:V4SI 1 "register_operand" "")
@@ -2679,6 +3275,13 @@
   rtx t1, t2, t3, t4, t5, t6, thirtytwo;
   rtx op0, op1, op2;
 
+  if (TARGET_SSE5)
+    {
+      ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);
+      emit_insn (gen_sse5_mulv4si3 (operands[0], operands[1], operands[2]));
+      DONE;
+    }
+
   op0 = operands[0];
   op1 = operands[1];
   op2 = operands[2];
@@ -3006,7 +3609,8 @@
 	(eq:SSEMODE124
 	  (match_operand:SSEMODE124 1 "nonimmediate_operand" "%0")
 	  (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))]
-  "TARGET_SSE2 && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "TARGET_SSE2 && !TARGET_SSE5
+   && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
   "pcmpeq<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "TI")])
@@ -3016,7 +3620,7 @@
 	(gt:SSEMODE124
 	  (match_operand:SSEMODE124 1 "register_operand" "0")
 	  (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_SSE5"
   "pcmpgt<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "TI")])
@@ -4610,6 +5214,1248 @@
   [(set_attr "type" "sseins")
    (set_attr "mode" "TI")])
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Common SSE4.1/SSE5 instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
+;; But it is not really a compare instruction.
+(define_insn "sse4_1_ptest"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:V2DI 0 "register_operand" "x")
+		    (match_operand:V2DI 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_PTEST))]
+  "TARGET_ROUND"
+  "ptest\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_roundpd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "xm")
+		      (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		     UNSPEC_ROUND))]
+  "TARGET_ROUND"
+  "roundpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")
+		      (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		     UNSPEC_ROUND))]
+  "TARGET_ROUND"
+  "roundps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_roundsd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(vec_merge:V2DF
+	  (unspec:V2DF [(match_operand:V2DF 2 "register_operand" "x")
+			(match_operand:SI 3 "const_0_to_15_operand" "n")]
+		       UNSPEC_ROUND)
+	  (match_operand:V2DF 1 "register_operand" "0")
+	  (const_int 1)))]
+  "TARGET_ROUND"
+  "roundsd\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundss"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_merge:V4SF
+	  (unspec:V4SF [(match_operand:V4SF 2 "register_operand" "x")
+			(match_operand:SI 3 "const_0_to_15_operand" "n")]
+		       UNSPEC_ROUND)
+	  (match_operand:V4SF 1 "register_operand" "0")
+	  (const_int 1)))]
+  "TARGET_ROUND"
+  "roundss\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V4SF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; SSE5 instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; SSE5 parallel integer multiply/add instructions.
+;; Note the instruction does not allow the value being added to be a memory
+;; operand.  However, by pretending via the nonimmediate_operand predicate
+;; that it does, and splitting it later, the following can be recognized:
+;;	a[i] = b[i] * c[i] + d[i];
+(define_insn "sse5_pmacsww"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x,x")
+        (plus:V8HI
+	 (mult:V8HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "%x,x,m")
+	  (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x"))
+	 (match_operand:V8HI 3 "nonimmediate_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 2)"
+  "@
+   pmacsww\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsww\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsww\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; Split pmacsww with two memory operands into a load and the pmacsww.
+(define_split
+  [(set (match_operand:V8HI 0 "register_operand" "")
+	(plus:V8HI
+	 (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "")
+		    (match_operand:V8HI 2 "nonimmediate_operand" ""))
+	 (match_operand:V8HI 3 "nonimmediate_operand" "")))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, false, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, false, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, V8HImode);
+  emit_insn (gen_sse5_pmacsww (operands[0], operands[1], operands[2],
+			       operands[3]));
+  DONE;
+})
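
The statement the comment above refers to, written out as the loop the
vectorizer is expected to turn into pmacsww under -msse5 (an
illustrative sketch, not a guarantee of vectorization):

    #include <stdint.h>

    /* Per-element 16-bit multiply/accumulate; the vectorized body is
       intended to match the sse5_pmacsww pattern.  */
    void mac16 (int16_t *a, const int16_t *b, const int16_t *c,
                const int16_t *d, int n)
    {
      for (int i = 0; i < n; i++)
        a[i] = b[i] * c[i] + d[i];
    }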
+
+(define_insn "sse5_pmacssww"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x,x")
+        (ss_plus:V8HI
+	 (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x,x,m")
+		    (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x"))
+	 (match_operand:V8HI 3 "nonimmediate_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacssww\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssww\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssww\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; Note the instruction does not allow the value being added to be a memory
+;; operand.  However, by pretending via the nonimmediate_operand predicate
+;; that it does, and splitting it later, the following can be recognized:
+;;	a[i] = b[i] * c[i] + d[i];
+(define_insn "sse5_pmacsdd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+        (plus:V4SI
+	 (mult:V4SI
+	  (match_operand:V4SI 1 "nonimmediate_operand" "%x,x,m")
+	  (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x"))
+	 (match_operand:V4SI 3 "nonimmediate_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 2)"
+  "@
+   pmacsdd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; Split pmacsdd with two memory operands into a load and the pmacsdd.
+(define_split
+  [(set (match_operand:V4SI 0 "register_operand" "")
+	(plus:V4SI
+	 (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "")
+		    (match_operand:V4SI 2 "nonimmediate_operand" ""))
+	 (match_operand:V4SI 3 "nonimmediate_operand" "")))]
+  "TARGET_SSE5
+   && !ix86_sse5_valid_op_p (operands, insn, 4, false, 1)
+   && ix86_sse5_valid_op_p (operands, insn, 4, false, 2)
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[0], operands[2])
+   && !reg_mentioned_p (operands[0], operands[3])"
+  [(const_int 0)]
+{
+  ix86_expand_sse5_multiple_memory (operands, 4, V4SImode);
+  emit_insn (gen_sse5_pmacsdd (operands[0], operands[1], operands[2],
+			       operands[3]));
+  DONE;
+})
+
+(define_insn "sse5_pmacssdd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+        (ss_plus:V4SI
+	 (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x,x,m")
+		    (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x"))
+	 (match_operand:V4SI 3 "nonimmediate_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacssdd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmacssdql"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
+	(ss_plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 1)
+		       (const_int 3)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 1)
+		       (const_int 3)]))))
+	 (match_operand:V2DI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacssdql\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdql\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdql\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmacssdqh"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
+	(ss_plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 0)
+		       (const_int 2)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 0)
+		       (const_int 2)]))))
+	 (match_operand:V2DI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacssdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacssdqh\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmacsdql"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
+	(plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 1)
+		       (const_int 3)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 1)
+		       (const_int 3)]))))
+	 (match_operand:V2DI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacsdql\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdql\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdql\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmacsdqh"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
+	(plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 0)
+		       (const_int 2)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 0)
+		       (const_int 2)]))))
+	 (match_operand:V2DI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacsdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsdqh\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; SSE5 parallel integer multiply/add instructions for the intrinsics
+(define_insn "sse5_pmacsswd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+	(ss_plus:V4SI
+	 (mult:V4SI
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 1)
+		       (const_int 3)
+		       (const_int 5)
+		       (const_int 7)])))
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 1)
+		       (const_int 3)
+		       (const_int 5)
+		       (const_int 7)]))))
+	 (match_operand:V4SI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacsswd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmacswd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+	(plus:V4SI
+	 (mult:V4SI
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "x,x,m")
+	    (parallel [(const_int 1)
+		       (const_int 3)
+		       (const_int 5)
+		       (const_int 7)])))
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x")
+	    (parallel [(const_int 1)
+		       (const_int 3)
+		       (const_int 5)
+		       (const_int 7)]))))
+	 (match_operand:V4SI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmacswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmacswd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmadcsswd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+	(ss_plus:V4SI
+	 (plus:V4SI
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 1 "nonimmediate_operand" "x,x,m")
+	     (parallel [(const_int 0)
+			(const_int 2)
+			(const_int 4)
+			(const_int 6)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x")
+	     (parallel [(const_int 0)
+			(const_int 2)
+			(const_int 4)
+			(const_int 6)]))))
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 1)
+	     (parallel [(const_int 1)
+			(const_int 3)
+			(const_int 5)
+			(const_int 7)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 2)
+	     (parallel [(const_int 1)
+			(const_int 3)
+			(const_int 5)
+			(const_int 7)])))))
+	 (match_operand:V4SI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmadcsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmadcsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmadcsswd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pmadcswd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
+	(plus:V4SI
+	 (plus:V4SI
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 1 "nonimmediate_operand" "x,x,m")
+	     (parallel [(const_int 0)
+			(const_int 2)
+			(const_int 4)
+			(const_int 6)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 2 "nonimmediate_operand" "x,m,x")
+	     (parallel [(const_int 0)
+			(const_int 2)
+			(const_int 4)
+			(const_int 6)]))))
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 1)
+	     (parallel [(const_int 1)
+			(const_int 3)
+			(const_int 5)
+			(const_int 7)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 2)
+	     (parallel [(const_int 1)
+			(const_int 3)
+			(const_int 5)
+			(const_int 7)])))))
+	 (match_operand:V4SI 3 "register_operand" "0,0,0")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, 1)"
+  "@
+   pmadcswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmadcswd\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pmadcswd\t{%3, %1, %2, %0|%0, %2, %1, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pcmov_<mode>"
+  [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x")
+	(if_then_else:SSEMODE
+	  (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x")
+	  (match_operand:SSEMODE 1 "nonimmediate_operand" "x,xm,0,0")
+	  (match_operand:SSEMODE 2 "nonimmediate_operand" "xm,x,x,xm")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "@
+   pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")])
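
pcmov is a bit-wise select: where a selector bit (operand 3) is set,
the result bit comes from operand 1, otherwise from operand 2.  A
scalar reference sketch of that reading of the if_then_else pattern
(plain C, illustrative):

    #include <stdint.h>

    /* Reference semantics of pcmov on one 64-bit chunk.  */
    static uint64_t pcmov_bits (uint64_t sel, uint64_t then_v, uint64_t else_v)
    {
      return (then_v & sel) | (else_v & ~sel);
    }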
+
+;; SSE5 horizontal add/subtract instructions
+(define_insn "sse5_phaddbw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(plus:V8HI
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)
+		      (const_int 8)
+		      (const_int 10)
+		      (const_int 12)
+		      (const_int 14)])))
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)
+		      (const_int 9)
+		      (const_int 11)
+		      (const_int 13)
+		      (const_int 15)])))))]
+  "TARGET_SSE5"
+  "phaddbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddbd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (plus:V4SI
+	  (sign_extend:V4SI
+	   (vec_select:V4QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)
+		       (const_int 4)
+		       (const_int 8)
+		       (const_int 12)])))
+	  (sign_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 1)
+		       (const_int 5)
+		       (const_int 9)
+		       (const_int 13)]))))
+	 (plus:V4SI
+	  (sign_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 2)
+		       (const_int 6)
+		       (const_int 10)
+		       (const_int 14)])))
+	  (sign_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 3)
+		       (const_int 7)
+		       (const_int 11)
+		       (const_int 15)]))))))]
+  "TARGET_SSE5"
+  "phaddbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddbq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (plus:V2DI
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	     (parallel [(const_int 0)
+			(const_int 4)])))
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 1)
+			(const_int 5)]))))
+	  (plus:V2DI
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 2)
+			(const_int 6)])))
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 3)
+			(const_int 7)])))))
+	 (plus:V2DI
+	  (plus:V2DI
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 8)
+			(const_int 12)])))
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 9)
+			(const_int 13)]))))
+	  (plus:V2DI
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 10)
+			(const_int 14)])))
+	   (sign_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 11)
+			(const_int 15)])))))))]
+  "TARGET_SSE5"
+  "phaddbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddwd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)])))
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)])))))]
+  "TARGET_SSE5"
+  "phaddwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddwq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)
+		       (const_int 4)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 1)
+		       (const_int 5)]))))
+	 (plus:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 2)
+		       (const_int 6)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 3)
+		       (const_int 7)]))))))]
+  "TARGET_SSE5"
+  "phaddwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phadddq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)])))
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)])))))]
+  "TARGET_SSE5"
+  "phadddq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddubw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(plus:V8HI
+	 (zero_extend:V8HI
+	  (vec_select:V8QI
+	   (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)
+		      (const_int 8)
+		      (const_int 10)
+		      (const_int 12)
+		      (const_int 14)])))
+	 (zero_extend:V8HI
+	  (vec_select:V8QI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)
+		      (const_int 9)
+		      (const_int 11)
+		      (const_int 13)
+		      (const_int 15)])))))]
+  "TARGET_SSE5"
+  "phaddubw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddubd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (plus:V4SI
+	  (zero_extend:V4SI
+	   (vec_select:V4QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)
+		       (const_int 4)
+		       (const_int 8)
+		       (const_int 12)])))
+	  (zero_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 1)
+		       (const_int 5)
+		       (const_int 9)
+		       (const_int 13)]))))
+	 (plus:V4SI
+	  (zero_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 2)
+		       (const_int 6)
+		       (const_int 10)
+		       (const_int 14)])))
+	  (zero_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 3)
+		       (const_int 7)
+		       (const_int 11)
+		       (const_int 15)]))))))]
+  "TARGET_SSE5"
+  "phaddubd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddubq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (plus:V2DI
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	     (parallel [(const_int 0)
+			(const_int 4)])))
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 1)
+			(const_int 5)]))))
+	  (plus:V2DI
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 2)
+			(const_int 6)])))
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 3)
+			(const_int 7)])))))
+	 (plus:V2DI
+	  (plus:V2DI
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 8)
+			(const_int 12)])))
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 9)
+			(const_int 13)]))))
+	  (plus:V2DI
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 10)
+			(const_int 14)])))
+	   (zero_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 11)
+			(const_int 15)])))))))]
+  "TARGET_SSE5"
+  "phaddubq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phadduwd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (zero_extend:V4SI
+	  (vec_select:V4HI
+	   (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)])))
+	 (zero_extend:V4SI
+	  (vec_select:V4HI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)])))))]
+  "TARGET_SSE5"
+  "phadduwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phadduwq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (zero_extend:V2DI
+	   (vec_select:V2HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)
+		       (const_int 4)])))
+	  (zero_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 1)
+		       (const_int 5)]))))
+	 (plus:V2DI
+	  (zero_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 2)
+		       (const_int 6)])))
+	  (zero_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 3)
+		       (const_int 7)]))))))]
+  "TARGET_SSE5"
+  "phadduwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phaddudq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (zero_extend:V2DI
+	  (vec_select:V2SI
+	   (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)])))
+	 (zero_extend:V2DI
+	  (vec_select:V2SI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)])))))]
+  "TARGET_SSE5"
+  "phaddudq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phsubbw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(minus:V8HI
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)
+		      (const_int 8)
+		      (const_int 10)
+		      (const_int 12)
+		      (const_int 14)])))
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)
+		      (const_int 9)
+		      (const_int 11)
+		      (const_int 13)
+		      (const_int 15)])))))]
+  "TARGET_SSE5"
+  "phsubbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phsubwd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(minus:V4SI
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)
+		      (const_int 4)
+		      (const_int 6)])))
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)
+		      (const_int 5)
+		      (const_int 7)])))))]
+  "TARGET_SSE5"
+  "phsubwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "sse5_phsubdq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(minus:V2DI
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0)
+		      (const_int 2)])))
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_dup 1)
+	   (parallel [(const_int 1)
+		      (const_int 3)])))))]
+  "TARGET_SSE5"
+  "phsubdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+;; SSE5 permute instructions
+(define_insn "sse5_pperm"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x,x,x")
+	(unspec:V16QI [(match_operand:V16QI 1 "nonimmediate_operand" "0,0,xm,xm")
+		       (match_operand:V16QI 2 "nonimmediate_operand" "x,xm,0,x")
+		       (match_operand:V16QI 3 "nonimmediate_operand" "xm,x,x,0")]
+		     UNSPEC_SSE5_PERMUTE))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "pperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+;; The following are for the various unpack insns which don't need the first
+;; source operand, so we can just use the output operand for the first operand.
+;; This allows either of the other two operands to be a memory operand.  We
+;; can't just use the first operand as an argument to the normal pperm because
+;; an output-only argument would suddenly become an input operand.
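+;; Operand 2 is the parallel of const_ints describing the element selection,
+;; and the (use ...) of operand 3 keeps the pperm selector vector live as an
+;; explicit input to the instruction.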
+(define_insn "sse5_pperm_zero_v16qi_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(zero_extend:V8HI
+	 (vec_select:V8QI
+	  (match_operand:V16QI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V16QImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_sign_v16qi_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(sign_extend:V8HI
+	 (vec_select:V8QI
+	  (match_operand:V16QI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V16QImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_zero_v8hi_v4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(zero_extend:V4SI
+	 (vec_select:V4HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V8HImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_sign_v8hi_v4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(sign_extend:V4SI
+	 (vec_select:V4HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V8HImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_zero_v4si_v2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(zero_extend:V2DI
+	 (vec_select:V2SI
+	  (match_operand:V4SI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V4SImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_sign_v4si_v2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(sign_extend:V2DI
+	 (vec_select:V2SI
+	  (match_operand:V4SI 1 "nonimmediate_operand" "xm,x")
+	  (match_operand 2 "" ""))))	;; parallel with const_int's
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "x,xm"))]
+  "TARGET_SSE5
+   && (register_operand (operands[1], V4SImode)
+       || register_operand (operands[2], V16QImode))"
+  "pperm\t{%3, %1, %0, %0|%0, %0, %1, %3}"
+  [(set_attr "type" "sseadd")
+   (set_attr "mode" "TI")])
+
+;; SSE5 pack instructions that combine two vectors into a smaller vector
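+;; Each pattern below is the vec_concat of the truncation of both sources;
+;; operand 3, kept live via (use ...), is the pperm selector that picks the
+;; low-order bytes of every element.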
+(define_insn "sse5_pperm_pack_v2di_v4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,x,x")
+	(vec_concat:V4SI
+	 (truncate:V2SI
+	  (match_operand:V2DI 1 "nonimmediate_operand" "0,0,xm,xm"))
+	 (truncate:V2SI
+	  (match_operand:V2DI 2 "nonimmediate_operand" "x,xm,0,x"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x,x,0"))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "pperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_pack_v4si_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x,x,x")
+	(vec_concat:V8HI
+	 (truncate:V4HI
+	  (match_operand:V4SI 1 "nonimmediate_operand" "0,0,xm,xm"))
+	 (truncate:V4HI
+	  (match_operand:V4SI 2 "nonimmediate_operand" "x,xm,0,x"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x,x,0"))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "pperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_pperm_pack_v8hi_v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x,x,x")
+	(vec_concat:V16QI
+	 (truncate:V8QI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "0,0,xm,xm"))
+	 (truncate:V8QI
+	  (match_operand:V8HI 2 "nonimmediate_operand" "x,xm,0,x"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x,x,0"))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "pperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+;; Floating point permutation (permps, permpd)
+(define_insn "sse5_perm<mode>"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x,x,x")
+	(unspec:SSEMODEF2P
+	 [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,0,xm,xm")
+	  (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm,0,x")
+	  (match_operand:V16QI 3 "nonimmediate_operand" "xm,x,x,0")]
+	 UNSPEC_SSE5_PERMUTE))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
+  "perm<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "<MODE>")])
+
+;; SSE5 packed rotate instructions
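+;; prot<size> with an immediate rotates every element by the same amount; the
+;; register form rotates each element of operand 1 by the count held in the
+;; corresponding element of operand 2.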
+(define_insn "rotl<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+	(rotate:SSEMODE1248
+	 (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm")
+	 (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
+  "TARGET_SSE5"
+  "prot<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_rotl<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
+	(rotate:SSEMODE1248
+	 (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+	 (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
+  "prot<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "mode" "TI")])
+
+;; SSE5 packed shift instructions.  Note that negative values for the shift
+;; amount convert this into a right shift instead of a left shift.  For now,
+;; model this with an UNSPEC instead of using ashift/lshift, since the rest of
+;; the x86 port does not have the concept of negating the shift amount.  Also,
+;; there is no LSHIFT rtx code for a logical left shift (ASHIFT is the only
+;; left-shift code in RTL).
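+;;
+;; Each element of operand 2 supplies the (signed) shift count for the
+;; corresponding element of operand 1: psha<size> shifts arithmetically and
+;; pshl<size> logically, with a negative count shifting right by its
+;; magnitude.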
+(define_insn "sse5_ashl<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
+	(unspec:SSEMODE1248
+	 [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+	  (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
+	 UNSPEC_SSE5_ASHIFT))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
+  "psha<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_lshl<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
+	(unspec:SSEMODE1248
+	 [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+	  (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
+	 UNSPEC_SSE5_LSHIFT))]
+  "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
+  "pshl<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "mode" "TI")])
+
+;; SSE5 FRCZ support
+;; parallel insns
+(define_insn "sse5_frcz<mode>2"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x")
+	(unspec:SSEMODEF2P
+	 [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm")]
+	 UNSPEC_FRCZ))]
+  "TARGET_SSE5"
+  "frcz<ssesuffixf4>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt1")
+   (set_attr "mode" "<MODE>")])
+
+;; scalar insns
+(define_insn "sse5_vmfrcz<mode>2"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x")
+	(vec_merge:SSEMODEF2P
+	  (unspec:SSEMODEF2P
+	   [(match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")]
+	   UNSPEC_FRCZ)
+	  (match_operand:SSEMODEF2P 1 "register_operand" "0")
+	  (const_int 1)))]
+  "TARGET_ROUND"
+  "frcz<ssesuffixf2s>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecvt1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse5_cvtph2ps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(unspec:V4SF [(match_operand:V4HI 1 "nonimmediate_operand" "xm")]
+		     UNSPEC_CVTPH2PS))]
+  "TARGET_SSE5"
+  "cvtph2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse5_cvtps2ph"
+  [(set (match_operand:V4HI 0 "nonimmediate_operand" "=xm")
+	(unspec:V4HI [(match_operand:V4SF 1 "register_operand" "x")]
+		     UNSPEC_CVTPS2PH))]
+  "TARGET_SSE5"
+  "cvtps2ph\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V4SF")])
+
+;; Scalar versions of the com instructions that use vector types and are
+;; called from the intrinsics.  Unlike the other s{s,d} instructions, the
+;; com instructions fill in 0's in the upper bits instead of leaving them
+;; unmodified, so we use const_vector of 0 instead of match_dup.
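+;; (E.g. comeqss leaves the compare mask in element 0 and zeroes elements
+;; 1-3; the vec_merge with operands[4] = CONST0_RTX below models this.)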
+(define_expand "sse5_vmmaskcmp<mode>3"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
+	(vec_merge:SSEMODEF2P
+	 (match_operator:SSEMODEF2P 1 "sse5_comparison_float_operator"
+	  [(match_operand:SSEMODEF2P 2 "register_operand" "")
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "")])
+	 (match_dup 4)
+	 (const_int 1)))]
+  "TARGET_SSE5"
+{
+  operands[4] = CONST0_RTX (<MODE>mode);
+})
+
+(define_insn "*sse5_vmmaskcmp<mode>3"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x")
+	(vec_merge:SSEMODEF2P
+	 (match_operator:SSEMODEF2P 1 "sse5_comparison_float_operator"
+	  [(match_operand:SSEMODEF2P 2 "register_operand" "x")
+	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm")])
+	  (match_operand:SSEMODEF2P 4 "")
+	  (const_int 1)))]
+  "TARGET_SSE5"
+  "com%Y1<ssemodesuffixf2s>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "<ssescalarmode>")])
+
+;; We don't have a comparison operator that always returns true/false, so
+;; handle comfalse and comtrue specially.
+(define_insn "sse5_com_tf<mode>3"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x")
+	(unspec:SSEMODEF2P
+	 [(match_operand:SSEMODEF2P 1 "register_operand" "x")
+	  (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")
+	  (match_operand:SI 3 "const_int_operand" "n")]
+	 UNSPEC_SSE5_TRUEFALSE))]
+  "TARGET_SSE5"
+{
+  const char *ret = NULL;
+
+  switch (INTVAL (operands[3]))
+    {
+    case COM_FALSE_S:
+      ret = \"comfalses<ssemodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}\";
+      break;
+
+    case COM_FALSE_P:
+      ret = \"comfalsep<ssemodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}\";
+      break;
+
+    case COM_TRUE_S:
+      ret = \"comfalses<ssemodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}\";
+      break;
+
+    case COM_TRUE_P:
+      ret = \"comfalsep<ssemodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}\";
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return ret;
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse5_maskcmp<mode>3"
+  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x")
+	(match_operator:SSEMODEF2P 1 "sse5_comparison_float_operator"
+	 [(match_operand:SSEMODEF2P 2 "register_operand" "x")
+	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE5"
+  "com%Y1<ssemodesuffixf4>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse5_maskcmp<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+	(match_operator:SSEMODE1248 1 "ix86_comparison_int_operator"
+	 [(match_operand:SSEMODE1248 2 "register_operand" "x")
+	  (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE5"
+  "pcom%Y1<ssevecsize>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse5_maskcmp_uns<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+	(match_operator:SSEMODE1248 1 "ix86_comparison_uns_operator"
+	 [(match_operand:SSEMODE1248 2 "register_operand" "x")
+	  (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE5"
+  "pcom%Y1u<ssevecsize>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "TI")])
+
+;; Version of pcom*u* that is called from the intrinsics that allows pcomequ*
+;; and pcomneu* not to be converted to the signed ones in case somebody needs
+;; the exact instruction generated for the intrinsic.
+(define_insn "sse5_maskcmp_uns2<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+	(unspec:SSEMODE1248
+	 [(match_operator:SSEMODE1248 1 "ix86_comparison_uns_operator"
+	  [(match_operand:SSEMODE1248 2 "register_operand" "x")
+	   (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")])]
+	 UNSPEC_SSE5_UNSIGNED_CMP))]
+  "TARGET_SSE5"
+  "pcom%Y1u<ssevecsize>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "TI")])
+
+;; Pcomtrue and pcomfalse support.  These are useless instructions, but are
+;; added here for completeness.
+(define_insn "sse5_pcom_tf<mode>3"
+  [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+	(unspec:SSEMODE1248 [(match_operand:SSEMODE1248 1 "register_operand" "x")
+			     (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm")
+			     (match_operand:SI 3 "const_int_operand" "n")]
+			    UNSPEC_SSE5_TRUEFALSE))]
+  "TARGET_SSE5"
+{
+  return ((INTVAL (operands[3]) != 0)
+	  ? "pcomtrue<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
+	  : "pcomfalse<ssevecsize>\t{%2, %1, %0|%0, %1, %2}");
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "TI")])
+
 (define_insn "sse3_monitor64"
   [(unspec_volatile [(match_operand:DI 0 "register_operand" "a")
 		     (match_operand:DI 1 "register_operand" "c")
Index: gcc/config/i386/i386.opt
===================================================================
--- gcc/config/i386/i386.opt.orig	2009-11-20 13:42:55.000000000 +0100
+++ gcc/config/i386/i386.opt	2009-11-20 13:43:02.000000000 +0100
@@ -1,6 +1,6 @@
 ; Options for the IA-32 and AMD64 ports of the compiler.
 
-; Copyright (C) 2005 Free Software Foundation, Inc.
+; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
 ;
 ; This file is part of GCC.
 ;
@@ -205,6 +205,10 @@ msse4a
 Target Report Mask(SSE4A)
 Support new AMDFAM10 SSE4A built-in functions and code generation
 
+msse5
+Target Report Mask(SSE5)
+Support SSE5 built-in functions and code generation
+
 mssemisalign
 Target Report Mask(SSE_MISALIGN)
 Support misaligned memory in vector operations.
@@ -243,3 +247,9 @@ Schedule code for given CPU
 
 ;; Support Athlon 3Dnow builtins
 Mask(3DNOW_A)
+
+mfused-madd
+Target Report Var(x86_fused_muladd) Init(1)
+Enable automatic generation of fused floating point multiply-add instructions
+if the ISA supports such instructions.  The -mfused-madd option is on by
+default.
Index: gcc/config/i386/mmintrin-common.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ gcc/config/i386/mmintrin-common.h	2009-11-20 13:43:02.000000000 +0100
@@ -0,0 +1,156 @@
+/* Copyright (C) 2007 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to
+   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* As a special exception, if you include this header file into source
+   files compiled by GCC, this header file does not by itself cause
+   the resulting executable to be covered by the GNU General Public
+   License.  This exception does not however invalidate any other
+   reasons why the executable file might be covered by the GNU General
+   Public License.  */
+
+/* Common definition of the ROUND and PTEST intrinsics that are shared
+   between SSE4.1 and SSE5.  */
+
+#ifndef _MMINTRIN_COMMON_H_INCLUDED
+#define _MMINTRIN_COMMON_H_INCLUDED
+
+#if !defined(__SSE5__) && !defined(__SSE4_1__)
+# error "SSE5 or SSE4.1 instruction set not enabled"
+#else
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT	0x00
+#define _MM_FROUND_TO_NEG_INF		0x01
+#define _MM_FROUND_TO_POS_INF		0x02
+#define _MM_FROUND_TO_ZERO		0x03
+#define _MM_FROUND_CUR_DIRECTION	0x04
+
+#define _MM_FROUND_RAISE_EXC		0x00
+#define _MM_FROUND_NO_EXC		0x08
+
+#define _MM_FROUND_NINT		\
+  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR	\
+  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL		\
+  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC	\
+  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT		\
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT	\
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+/* Test Instruction */
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) == 0.  */
+static __inline int __attribute__((__always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & ~__M) == 0.  */
+static __inline int __attribute__((__always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) != 0 && (__V & ~__M) != 0.  */
+static __inline int __attribute__((__always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Macros for packed integer 128-bit comparison intrinsics.  */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
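+
+/* For example, _mm_test_all_zeros (M, V) is nonzero exactly when
+   (V & M) == 0, i.e. no bit of V survives the mask M.  */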
+
+/* Packed/scalar double precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __artificial__))
+_mm_round_sd(__m128d __D, __m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+					   (__v2df)__V,
+					   __M);
+}
+#else
+#define _mm_round_pd(V, M) \
+  ((__m128d) __builtin_ia32_roundpd ((__v2df)(V), (M)))
+
+#define _mm_round_sd(D, V, M) \
+  ((__m128d) __builtin_ia32_roundsd ((__v2df)(D), (__v2df)(V), (M)))
+#endif
+
+/* Packed/scalar single precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __artificial__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+					  (__v4sf)__V,
+					  __M);
+}
+#else
+#define _mm_round_ps(V, M) \
+  ((__m128) __builtin_ia32_roundps ((__v4sf)(V), (M)))
+
+#define _mm_round_ss(D, V, M) \
+  ((__m128) __builtin_ia32_roundss ((__v4sf)(D), (__v4sf)(V), (M)))
+#endif
+
+/* Macros for ceil/floor intrinsics.  */
+#define _mm_ceil_pd(V)	   _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V)	   _mm_round_pd ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V)	   _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V)	   _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
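+
+/* Usage sketch (assumes the caller also includes emmintrin.h for
+   _mm_set1_pd):
+
+     __m128d v = _mm_set1_pd (2.5);
+     __m128d f = _mm_floor_pd (v);	   each lane becomes 2.0
+
+   _mm_floor_pd expands to _mm_round_pd (v, _MM_FROUND_FLOOR), i.e. a single
+   roundpd with the round-toward-negative-infinity immediate.  */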
+
+#endif /* __SSE5__/__SSE4_1__ */
+
+#endif /* _MMINTRIN_COMMON_H_INCLUDED */
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc/config/i386/i386-protos.h.orig	2009-11-20 13:41:03.000000000 +0100
+++ gcc/config/i386/i386-protos.h	2009-11-20 13:43:02.000000000 +0100
@@ -48,6 +48,10 @@ extern bool x86_extended_QIreg_mentioned
 extern bool x86_extended_reg_mentioned_p (rtx);
 extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 
+extern bool ix86_sse5_valid_op_p (rtx [], rtx, int, bool, int);
+extern void ix86_expand_sse5_multiple_memory (rtx [], int,
+					      enum machine_mode mode);
+
 extern int ix86_expand_movmem (rtx, rtx, rtx, rtx);
 extern int ix86_expand_clrmem (rtx, rtx, rtx);
 extern int ix86_expand_strlen (rtx, rtx, rtx, rtx);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c.orig	2009-11-20 13:43:00.000000000 +0100
+++ gcc/config/i386/i386.c	2009-11-20 13:43:02.000000000 +0100
@@ -765,8 +765,9 @@ const struct processor_costs *ix86_cost
 #define m_ATHLON  (1<<PROCESSOR_ATHLON)
 #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
 #define m_K8  (1<<PROCESSOR_K8)
-#define m_ATHLON_K8  (m_K8 | m_ATHLON)
 #define m_AMDFAM10  (1<<PROCESSOR_AMDFAM10)
+#define m_ATHLON_K8  (m_K8 | m_ATHLON)
+#define m_AMD_MULTIPLE  (m_K8 | m_ATHLON | m_AMDFAM10)
 #define m_NOCONA  (1<<PROCESSOR_NOCONA)
 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
@@ -778,25 +779,25 @@ const struct processor_costs *ix86_cost
 /* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
    Generic64 seems like good code size tradeoff.  We can't enable it for 32bit
    generic because it is not working well with PPro base chips.  */
-const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8 | m_GENERIC64 | m_AMDFAM10;
-const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
+const int x86_use_leave = m_386 | m_K6 | m_AMD_MULTIPLE | m_GENERIC64;
+const int x86_push_memory = m_386 | m_K6 | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
 const int x86_zero_extend_with_and = m_486 | m_PENT;
 /*Enable to zero extend integer registers to avoid partial dependencies*/
-const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10 /* m_386 | m_K6 */;
+const int x86_movx = m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC /* m_386 | m_K6 */;
 const int x86_double_with_add = ~m_386;
 const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6 | m_GENERIC | m_AMDFAM10;
-const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_AMDFAM10; 
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_AMD_MULTIPLE | m_K6 | m_GENERIC;
+const int x86_cmove = m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA; 
 /*fisttp is an SSE3 instruction*/
 const int x86_fisttp = m_NOCONA | m_AMDFAM10;
-const int x86_3dnow_a = m_ATHLON_K8 | m_AMDFAM10;
-const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
+const int x86_3dnow_a = m_AMD_MULTIPLE;
+const int x86_deep_branch = m_PPRO | m_K6 | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
 /* Branch hints were put in P4 based on simulation result. But
    after P4 was made, no performance benefit was observed with
    branch hints. It also increases the code size. As the result,
    icc never generates branch hints.  */
 const int x86_branch_hints = 0;
-const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC32; /*m_GENERIC | m_ATHLON_K8 ? */
+const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC32; /*m_GENERIC | m_AMD_MULTIPLE ? */
 /* We probably ought to watch for partial register stalls on Generic32
    compilation setting as well.  However in current implementation the
    partial register stalls are not eliminated very well - they can
@@ -808,13 +809,13 @@ const int x86_use_sahf = m_PPRO | m_K6 |
 const int x86_partial_reg_stall = m_PPRO;
 const int x86_partial_flag_reg_stall = m_GENERIC;
 const int x86_use_himode_fiop = m_386 | m_486 | m_K6;
-const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT | m_GENERIC | m_AMDFAM10);
+const int x86_use_simode_fiop = ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_GENERIC);
 const int x86_use_mov0 = m_K6;
 const int x86_use_cltd = ~(m_PENT | m_K6 | m_GENERIC);
 const int x86_read_modify_write = ~m_PENT;
 const int x86_read_modify = ~(m_PENT | m_PPRO);
 const int x86_split_long_moves = m_PPRO;
-const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8 | m_GENERIC | m_AMDFAM10; /* m_PENT4 ? */
+const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_AMD_MULTIPLE | m_GENERIC; /* m_PENT4 ? */
 const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
 const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
 const int x86_qimode_math = ~(0);
@@ -825,23 +826,23 @@ const int x86_promote_qi_regs = 0;
 const int x86_himode_math = ~(m_PPRO);
 const int x86_promote_hi_regs = m_PPRO;
 /*Enable if add/sub rsp is preferred over 1 or 2 push/pop*/
-const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
-const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
-const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
-const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
+const int x86_sub_esp_4 = m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_sub_esp_8 = m_AMD_MULTIPLE | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_add_esp_4 = m_AMD_MULTIPLE | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_add_esp_8 = m_AMD_MULTIPLE | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC;
 /*Enable if integer moves are preferred for DFmode copies*/
-const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC | m_AMDFAM10);
-const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
-const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
+const int x86_integer_DFmode_moves = ~(m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC);
+const int x86_partial_reg_dependency = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_memory_mismatch_stall = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
 /*If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required for outgoing arguments
 will be computed and placed into the variable `current_function_outgoing_args_size'. No space will be
 pushed onto the stack for each call; instead, the function prologue should increase the stack frame
 size by this amount. Setting both @code{PUSH_ARGS} and @code{ACCUMULATE_OUTGOING_ARGS} is not proper.*/
-const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC | m_AMDFAM10;
-const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_GENERIC;
-const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_GENERIC;
+const int x86_accumulate_outgoing_args = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC;
+const int x86_prologue_using_move = m_AMD_MULTIPLE | m_PPRO | m_GENERIC;
+const int x86_epilogue_using_move = m_AMD_MULTIPLE | m_PPRO | m_GENERIC;
 const int x86_shift1 = ~m_486;
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
+const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
 /* In Generic model we have an confict here in between PPro/Pentium4 based chips
    that thread 128bit SSE registers as single units versus K8 based chips that
    divide SSE registers to two 64bit halves.
@@ -916,27 +917,27 @@ const int x86_sse_partial_reg_dependency
    undefined.  */
 const int x86_sse_split_regs = m_ATHLON_K8;
 const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
-const int x86_sse_typeless_stores = m_ATHLON_K8 | m_AMDFAM10;
+const int x86_sse_typeless_stores = m_AMD_MULTIPLE;
 const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
-const int x86_use_ffreep = m_ATHLON_K8 | m_AMDFAM10;
+const int x86_use_ffreep = m_AMD_MULTIPLE;
 const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6;
 const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_GENERIC);
 
 /* ??? Allowing interunit moves makes it all too easy for the compiler to put
    integer data in xmm registers.  Which results in pretty abysmal code.  */
-const int x86_inter_unit_moves = 0 /* ~(m_ATHLON_K8) */;
+const int x86_inter_unit_moves = 0 /* ~(m_AMD_MULTIPLE) */;
 
 const int x86_ext_80387_constants = m_K6 | m_ATHLON | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC32;
 /* Some CPU cores are not able to predict more than 4 branch instructions in
    the 16 byte window.  */
-const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC | m_AMDFAM10;
-const int x86_schedule = m_PPRO | m_ATHLON_K8 | m_K6 | m_PENT | m_GENERIC | m_AMDFAM10;
-const int x86_use_bt = m_ATHLON_K8 | m_AMDFAM10;
+const int x86_four_jump_limit = m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_schedule = m_PPRO | m_AMD_MULTIPLE | m_K6 | m_PENT | m_GENERIC;
+const int x86_use_bt = m_AMD_MULTIPLE;
 /* Compare and exchange was added for 80486.  */
 const int x86_cmpxchg = ~m_386;
 /* Exchange and add was added for 80486.  */
 const int x86_xadd = ~m_386;
-const int x86_pad_returns = m_ATHLON_K8 | m_GENERIC | m_AMDFAM10;
+const int x86_pad_returns = m_AMD_MULTIPLE | m_GENERIC;
 
 /* Use Vector Converts instead of Scalar Converts. Added for AMDFAM10 */
 const int x86_use_vector_converts = m_AMDFAM10;
@@ -1504,24 +1505,34 @@ ix86_handle_option (size_t code, const c
     case OPT_msse:
       if (!value)
 	{
-	  target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A | MASK_SSE_MISALIGN);
-	  target_flags_explicit |= MASK_SSE2 | MASK_SSE3 | MASK_SSE4A | MASK_SSE_MISALIGN;
+	  target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A
+			    | MASK_SSE_MISALIGN | MASK_SSE5);
+	  target_flags_explicit |= (MASK_SSE2 | MASK_SSE3 | MASK_SSE4A
+				    | MASK_SSE_MISALIGN | MASK_SSE5);
 	}
       return true;
 
     case OPT_msse2:
       if (!value)
 	{
-	  target_flags &= ~(MASK_SSE3 | MASK_SSE4A);
-	  target_flags_explicit |= MASK_SSE3 | MASK_SSE4A;
+	  target_flags &= ~(MASK_SSE3 | MASK_SSE4A | MASK_SSE5);
+	  target_flags_explicit |= MASK_SSE3 | MASK_SSE4A | MASK_SSE5;
 	}
       return true;
 
     case OPT_msse3:
       if (!value)
 	{
-	  target_flags &= ~MASK_SSE4A;
-	  target_flags_explicit |= MASK_SSE4A;
+	  target_flags &= ~(MASK_SSE4A | MASK_SSE5);
+	  target_flags_explicit |= MASK_SSE4A | MASK_SSE5;
+	}
+      return true;
+
+    case OPT_msse4a:
+      if (!value)
+	{
+	  target_flags &= ~MASK_SSE5;
+	  target_flags_explicit |= MASK_SSE5;
 	}
       return true;
 
@@ -1594,7 +1605,8 @@ override_options (void)
 	  PTA_POPCNT= 512,
 	  PTA_ABM = 1024,
 	  PTA_SSE4A = 2048,
-          PTA_SSE_MISALIGN = 4096
+          PTA_SSE_MISALIGN = 4096,
+	  PTA_SSE5 = 8192
 	} flags;
     }
   const processor_alias_table[] =
@@ -1808,6 +1820,9 @@ override_options (void)
 	if (processor_alias_table[i].flags & PTA_SSE4A
 	    && !(target_flags_explicit & MASK_SSE4A))
 	  target_flags |= MASK_SSE4A;
+	if (processor_alias_table[i].flags & PTA_SSE5
+	    && !(target_flags_explicit & MASK_SSE5))
+	  target_flags |= MASK_SSE5;
 	if (processor_alias_table[i].flags & PTA_SSE_MISALIGN
 	    && !(target_flags_explicit & MASK_SSE_MISALIGN))
 	  target_flags |= MASK_SSE_MISALIGN;
@@ -2002,6 +2017,10 @@ override_options (void)
   if (TARGET_SSSE3)
     target_flags |= MASK_SSE3;
 
+  /* Turn on SSE4A builtins for -msse5.  */
+  if (TARGET_SSE5)
+    target_flags |= MASK_SSE4A;
+
   /* Turn on SSE3 builtins for -msse4a.  */
   if (TARGET_SSE4A)
     target_flags |= MASK_SSE3;
@@ -2339,6 +2358,171 @@ optimization_options (int level, int siz
 #endif
 }
 
+/* Validate whether a SSE5 instruction is valid or not.
+   OPERANDS is the array of operands.
+   NUM is the number of operands.
+   USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
+   NUM_MEMORY is the maximum number of memory operands to accept.  */
+bool
+ix86_sse5_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
+		      bool uses_oc0, int num_memory)
+{
+  int mem_mask;
+  int mem_count;
+  int i;
+
+  /* Count the number of memory arguments.  */
+  mem_mask = 0;
+  mem_count = 0;
+  for (i = 0; i < num; i++)
+    {
+      enum machine_mode mode = GET_MODE (operands[i]);
+      if (register_operand (operands[i], mode))
+	;
+
+      else if (memory_operand (operands[i], mode))
+	{
+	  mem_mask |= (1 << i);
+	  mem_count++;
+	}
+
+      else
+	return false;
+    }
+
+  /* If there were no memory operations, allow the insn.  */
+  if (mem_mask == 0)
+    return true;
+
+  /* Do not allow the destination register to be a memory operand.  */
+  else if (mem_mask & (1 << 0))
+    return false;
+
+  /* If there are too many memory operations, disallow the instruction.  While
+     the hardware only allows 1 memory reference, before register allocation
+     we sometimes allow two memory operands for some insns so that code like
+     the following can be optimized:
+
+	float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; }
+
+     or similar cases that are vectorized to use the fmaddss
+     instruction.  */
+  else if (mem_count > num_memory)
+    return false;
+
+  /* Don't allow more than one memory operation if not optimizing.  */
+  else if (mem_count > 1 && !optimize)
+    return false;
+
+  else if (num == 4 && mem_count == 1)
+    {
+      /* formats (destination is the first argument), example fmaddss:
+	 xmm1, xmm1, xmm2, xmm3/mem
+	 xmm1, xmm1, xmm2/mem, xmm3
+	 xmm1, xmm2, xmm3/mem, xmm1
+	 xmm1, xmm2/mem, xmm3, xmm1 */
+      if (uses_oc0)
+	return ((mem_mask == (1 << 1))
+		|| (mem_mask == (1 << 2))
+		|| (mem_mask == (1 << 3)));
+
+      /* format, example pmacsdd:
+	 xmm1, xmm2, xmm3/mem, xmm1 */
+      else
+	return (mem_mask == (1 << 2));
+    }
+
+  else if (num == 4 && num_memory == 2)
+    {
+      /* If there are two memory operations, we can load one of the memory ops
+	 into the destination register.  This is for optimizing the
+	 multiply/add ops, where the combiner has folded a memory operand into
+	 both the multiply and the add insns.  We have to be careful that the
+	 destination doesn't overlap with the inputs.  */
+      rtx op0 = operands[0];
+
+      if (reg_mentioned_p (op0, operands[1])
+	  || reg_mentioned_p (op0, operands[2])
+	  || reg_mentioned_p (op0, operands[3]))
+	return false;
+
+      /* formats (destination is the first argument), example fmaddss:
+	 xmm1, xmm1, xmm2, xmm3/mem
+	 xmm1, xmm1, xmm2/mem, xmm3
+	 xmm1, xmm2, xmm3/mem, xmm1
+	 xmm1, xmm2/mem, xmm3, xmm1
+
+         For the oc0 case, we will load either operands[1] or operands[3] into
+         operands[0], so any combination of 2 memory operands is ok.  */
+      if (uses_oc0)
+	return true;
+
+      /* format, example pmacsdd:
+	 xmm1, xmm2, xmm3/mem, xmm1
+
+         For the integer multiply/add instructions, be more restrictive and
+         require operands[2] and operands[3] to be the memory operands.  */
+      else
+	return (mem_mask == ((1 << 2) | (1 << 3)));
+    }
+
+  else if (num == 3 && num_memory == 1)
+    {
+      /* formats, example protb:
+	 xmm1, xmm2, xmm3/mem
+	 xmm1, xmm2/mem, xmm3 */
+      if (uses_oc0)
+	return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2)));
+
+      /* format, example comeq:
+	 xmm1, xmm2, xmm3/mem */
+      else
+	return (mem_mask == (1 << 2));
+    }
+
+  else
+    gcc_unreachable ();
+
+  return false;
+}
+
+
+/* Fix up an SSE5 instruction that has 2 memory input references into a form
+   the hardware will allow, by using the destination register to load one of
+   the memory operands.  Presently this is used by the multiply/add routines
+   to allow 2 memory references.
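+
+   For example, given fmaddps operands (reg0, mem1, reg2, mem3), we emit
+   reg0 = mem1 and rewrite operands[1] to reg0, leaving mem3 as the single
+   memory reference the hardware can encode.  */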
+
+void
+ix86_expand_sse5_multiple_memory (rtx operands[],
+				  int num,
+				  enum machine_mode mode)
+{
+  rtx op0 = operands[0];
+  if (num != 4
+      || memory_operand (op0, mode)
+      || reg_mentioned_p (op0, operands[1])
+      || reg_mentioned_p (op0, operands[2])
+      || reg_mentioned_p (op0, operands[3]))
+    gcc_unreachable ();
+
+  /* For 2 memory operands, pick either operands[1] or operands[3] to move into
+     the destination register.  */
+  if (memory_operand (operands[1], mode))
+    {
+      emit_move_insn (op0, operands[1]);
+      operands[1] = op0;
+    }
+  else if (memory_operand (operands[3], mode))
+    {
+      emit_move_insn (op0, operands[3]);
+      operands[3] = op0;
+    }
+  else
+    gcc_unreachable ();
+
+  return;
+}
+
+
 /* Table of valid machine attributes.  */
 const struct attribute_spec ix86_attribute_table[] =
 {
@@ -7531,6 +7715,7 @@ get_some_local_dynamic_name_1 (rtx *px,
    X -- don't print any sort of PIC '@' suffix for a symbol.
    & -- print some in-use local-dynamic symbol name.
    H -- print a memory address offset by 8; used for sse high-parts
+   Y -- print condition for SSE5 com* instruction.
  */
 
 void
@@ -7802,6 +7987,61 @@ print_operand (FILE *file, rtx x, int co
 	      }
 	    return;
 	  }
+
+	case 'Y':
+	  switch (GET_CODE (x))
+	    {
+	    case NE:
+	      fputs ("neq", file);
+	      break;
+	    case EQ:
+	      fputs ("eq", file);
+	      break;
+	    case GE:
+	    case GEU:
+	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
+	      break;
+	    case GT:
+	    case GTU:
+	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
+	      break;
+	    case LE:
+	    case LEU:
+	      fputs ("le", file);
+	      break;
+	    case LT:
+	    case LTU:
+	      fputs ("lt", file);
+	      break;
+	    case UNORDERED:
+	      fputs ("unord", file);
+	      break;
+	    case ORDERED:
+	      fputs ("ord", file);
+	      break;
+	    case UNEQ:
+	      fputs ("ueq", file);
+	      break;
+	    case UNGE:
+	      fputs ("nlt", file);
+	      break;
+	    case UNGT:
+	      fputs ("nle", file);
+	      break;
+	    case UNLE:
+	      fputs ("ule", file);
+	      break;
+	    case UNLT:
+	      fputs ("ult", file);
+	      break;
+	    case LTGT:
+	      fputs ("une", file);
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+	  return;
+
 	default:
 	    output_operand_lossage ("invalid operand code '%c'", code);
 	}
@@ -11352,6 +11592,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
+  else if (TARGET_SSE5)
+    {
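+      /* SSE5 provides the pcmov instruction, which selects each bit of the
+	 result from OP_TRUE or OP_FALSE according to the corresponding bit
+	 of CMP, so the IF_THEN_ELSE below maps onto a single insn.  */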
+      rtx pcmov = gen_rtx_SET (mode, dest,
+			       gen_rtx_IF_THEN_ELSE (mode, cmp,
+						     op_true,
+						     op_false));
+      emit_insn (pcmov);
+    }
   else
     {
       op_true = force_reg (mode, op_true);
@@ -14465,6 +14713,226 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_SET_V8HI,
   IX86_BUILTIN_VEC_SET_V4HI,
 
+  /* SSE4.1/SSE5 Common instructions */
+  IX86_BUILTIN_ROUNDPD,
+  IX86_BUILTIN_ROUNDPS,
+  IX86_BUILTIN_ROUNDSD,
+  IX86_BUILTIN_ROUNDSS,
+
+  IX86_BUILTIN_PTESTZ,
+  IX86_BUILTIN_PTESTC,
+  IX86_BUILTIN_PTESTNZC,
+
+  /* SSE5 instructions */
+  IX86_BUILTIN_FMADDSS,
+  IX86_BUILTIN_FMADDSD,
+  IX86_BUILTIN_FMADDPS,
+  IX86_BUILTIN_FMADDPD,
+  IX86_BUILTIN_FMSUBSS,
+  IX86_BUILTIN_FMSUBSD,
+  IX86_BUILTIN_FMSUBPS,
+  IX86_BUILTIN_FMSUBPD,
+  IX86_BUILTIN_FNMADDSS,
+  IX86_BUILTIN_FNMADDSD,
+  IX86_BUILTIN_FNMADDPS,
+  IX86_BUILTIN_FNMADDPD,
+  IX86_BUILTIN_FNMSUBSS,
+  IX86_BUILTIN_FNMSUBSD,
+  IX86_BUILTIN_FNMSUBPS,
+  IX86_BUILTIN_FNMSUBPD,
+  IX86_BUILTIN_PCMOV_V2DI,
+  IX86_BUILTIN_PCMOV_V4SI,
+  IX86_BUILTIN_PCMOV_V8HI,
+  IX86_BUILTIN_PCMOV_V16QI,
+  IX86_BUILTIN_PCMOV_V4SF,
+  IX86_BUILTIN_PCMOV_V2DF,
+  IX86_BUILTIN_PPERM,
+  IX86_BUILTIN_PERMPS,
+  IX86_BUILTIN_PERMPD,
+  IX86_BUILTIN_PMACSSWW,
+  IX86_BUILTIN_PMACSWW,
+  IX86_BUILTIN_PMACSSWD,
+  IX86_BUILTIN_PMACSWD,
+  IX86_BUILTIN_PMACSSDD,
+  IX86_BUILTIN_PMACSDD,
+  IX86_BUILTIN_PMACSSDQL,
+  IX86_BUILTIN_PMACSSDQH,
+  IX86_BUILTIN_PMACSDQL,
+  IX86_BUILTIN_PMACSDQH,
+  IX86_BUILTIN_PMADCSSWD,
+  IX86_BUILTIN_PMADCSWD,
+  IX86_BUILTIN_PHADDBW,
+  IX86_BUILTIN_PHADDBD,
+  IX86_BUILTIN_PHADDBQ,
+  IX86_BUILTIN_PHADDWD,
+  IX86_BUILTIN_PHADDWQ,
+  IX86_BUILTIN_PHADDDQ,
+  IX86_BUILTIN_PHADDUBW,
+  IX86_BUILTIN_PHADDUBD,
+  IX86_BUILTIN_PHADDUBQ,
+  IX86_BUILTIN_PHADDUWD,
+  IX86_BUILTIN_PHADDUWQ,
+  IX86_BUILTIN_PHADDUDQ,
+  IX86_BUILTIN_PHSUBBW,
+  IX86_BUILTIN_PHSUBWD,
+  IX86_BUILTIN_PHSUBDQ,
+  IX86_BUILTIN_PROTB,
+  IX86_BUILTIN_PROTW,
+  IX86_BUILTIN_PROTD,
+  IX86_BUILTIN_PROTQ,
+  IX86_BUILTIN_PROTB_IMM,
+  IX86_BUILTIN_PROTW_IMM,
+  IX86_BUILTIN_PROTD_IMM,
+  IX86_BUILTIN_PROTQ_IMM,
+  IX86_BUILTIN_PSHLB,
+  IX86_BUILTIN_PSHLW,
+  IX86_BUILTIN_PSHLD,
+  IX86_BUILTIN_PSHLQ,
+  IX86_BUILTIN_PSHAB,
+  IX86_BUILTIN_PSHAW,
+  IX86_BUILTIN_PSHAD,
+  IX86_BUILTIN_PSHAQ,
+  IX86_BUILTIN_FRCZSS,
+  IX86_BUILTIN_FRCZSD,
+  IX86_BUILTIN_FRCZPS,
+  IX86_BUILTIN_FRCZPD,
+  IX86_BUILTIN_CVTPH2PS,
+  IX86_BUILTIN_CVTPS2PH,
+
+  IX86_BUILTIN_COMEQSS,
+  IX86_BUILTIN_COMNESS,
+  IX86_BUILTIN_COMLTSS,
+  IX86_BUILTIN_COMLESS,
+  IX86_BUILTIN_COMGTSS,
+  IX86_BUILTIN_COMGESS,
+  IX86_BUILTIN_COMUEQSS,
+  IX86_BUILTIN_COMUNESS,
+  IX86_BUILTIN_COMULTSS,
+  IX86_BUILTIN_COMULESS,
+  IX86_BUILTIN_COMUGTSS,
+  IX86_BUILTIN_COMUGESS,
+  IX86_BUILTIN_COMORDSS,
+  IX86_BUILTIN_COMUNORDSS,
+  IX86_BUILTIN_COMFALSESS,
+  IX86_BUILTIN_COMTRUESS,
+
+  IX86_BUILTIN_COMEQSD,
+  IX86_BUILTIN_COMNESD,
+  IX86_BUILTIN_COMLTSD,
+  IX86_BUILTIN_COMLESD,
+  IX86_BUILTIN_COMGTSD,
+  IX86_BUILTIN_COMGESD,
+  IX86_BUILTIN_COMUEQSD,
+  IX86_BUILTIN_COMUNESD,
+  IX86_BUILTIN_COMULTSD,
+  IX86_BUILTIN_COMULESD,
+  IX86_BUILTIN_COMUGTSD,
+  IX86_BUILTIN_COMUGESD,
+  IX86_BUILTIN_COMORDSD,
+  IX86_BUILTIN_COMUNORDSD,
+  IX86_BUILTIN_COMFALSESD,
+  IX86_BUILTIN_COMTRUESD,
+
+  IX86_BUILTIN_COMEQPS,
+  IX86_BUILTIN_COMNEPS,
+  IX86_BUILTIN_COMLTPS,
+  IX86_BUILTIN_COMLEPS,
+  IX86_BUILTIN_COMGTPS,
+  IX86_BUILTIN_COMGEPS,
+  IX86_BUILTIN_COMUEQPS,
+  IX86_BUILTIN_COMUNEPS,
+  IX86_BUILTIN_COMULTPS,
+  IX86_BUILTIN_COMULEPS,
+  IX86_BUILTIN_COMUGTPS,
+  IX86_BUILTIN_COMUGEPS,
+  IX86_BUILTIN_COMORDPS,
+  IX86_BUILTIN_COMUNORDPS,
+  IX86_BUILTIN_COMFALSEPS,
+  IX86_BUILTIN_COMTRUEPS,
+
+  IX86_BUILTIN_COMEQPD,
+  IX86_BUILTIN_COMNEPD,
+  IX86_BUILTIN_COMLTPD,
+  IX86_BUILTIN_COMLEPD,
+  IX86_BUILTIN_COMGTPD,
+  IX86_BUILTIN_COMGEPD,
+  IX86_BUILTIN_COMUEQPD,
+  IX86_BUILTIN_COMUNEPD,
+  IX86_BUILTIN_COMULTPD,
+  IX86_BUILTIN_COMULEPD,
+  IX86_BUILTIN_COMUGTPD,
+  IX86_BUILTIN_COMUGEPD,
+  IX86_BUILTIN_COMORDPD,
+  IX86_BUILTIN_COMUNORDPD,
+  IX86_BUILTIN_COMFALSEPD,
+  IX86_BUILTIN_COMTRUEPD,
+
+  IX86_BUILTIN_PCOMEQUB,
+  IX86_BUILTIN_PCOMNEUB,
+  IX86_BUILTIN_PCOMLTUB,
+  IX86_BUILTIN_PCOMLEUB,
+  IX86_BUILTIN_PCOMGTUB,
+  IX86_BUILTIN_PCOMGEUB,
+  IX86_BUILTIN_PCOMFALSEUB,
+  IX86_BUILTIN_PCOMTRUEUB,
+  IX86_BUILTIN_PCOMEQUW,
+  IX86_BUILTIN_PCOMNEUW,
+  IX86_BUILTIN_PCOMLTUW,
+  IX86_BUILTIN_PCOMLEUW,
+  IX86_BUILTIN_PCOMGTUW,
+  IX86_BUILTIN_PCOMGEUW,
+  IX86_BUILTIN_PCOMFALSEUW,
+  IX86_BUILTIN_PCOMTRUEUW,
+  IX86_BUILTIN_PCOMEQUD,
+  IX86_BUILTIN_PCOMNEUD,
+  IX86_BUILTIN_PCOMLTUD,
+  IX86_BUILTIN_PCOMLEUD,
+  IX86_BUILTIN_PCOMGTUD,
+  IX86_BUILTIN_PCOMGEUD,
+  IX86_BUILTIN_PCOMFALSEUD,
+  IX86_BUILTIN_PCOMTRUEUD,
+  IX86_BUILTIN_PCOMEQUQ,
+  IX86_BUILTIN_PCOMNEUQ,
+  IX86_BUILTIN_PCOMLTUQ,
+  IX86_BUILTIN_PCOMLEUQ,
+  IX86_BUILTIN_PCOMGTUQ,
+  IX86_BUILTIN_PCOMGEUQ,
+  IX86_BUILTIN_PCOMFALSEUQ,
+  IX86_BUILTIN_PCOMTRUEUQ,
+
+  IX86_BUILTIN_PCOMEQB,
+  IX86_BUILTIN_PCOMNEB,
+  IX86_BUILTIN_PCOMLTB,
+  IX86_BUILTIN_PCOMLEB,
+  IX86_BUILTIN_PCOMGTB,
+  IX86_BUILTIN_PCOMGEB,
+  IX86_BUILTIN_PCOMFALSEB,
+  IX86_BUILTIN_PCOMTRUEB,
+  IX86_BUILTIN_PCOMEQW,
+  IX86_BUILTIN_PCOMNEW,
+  IX86_BUILTIN_PCOMLTW,
+  IX86_BUILTIN_PCOMLEW,
+  IX86_BUILTIN_PCOMGTW,
+  IX86_BUILTIN_PCOMGEW,
+  IX86_BUILTIN_PCOMFALSEW,
+  IX86_BUILTIN_PCOMTRUEW,
+  IX86_BUILTIN_PCOMEQD,
+  IX86_BUILTIN_PCOMNED,
+  IX86_BUILTIN_PCOMLTD,
+  IX86_BUILTIN_PCOMLED,
+  IX86_BUILTIN_PCOMGTD,
+  IX86_BUILTIN_PCOMGED,
+  IX86_BUILTIN_PCOMFALSED,
+  IX86_BUILTIN_PCOMTRUED,
+  IX86_BUILTIN_PCOMEQQ,
+  IX86_BUILTIN_PCOMNEQ,
+  IX86_BUILTIN_PCOMLTQ,
+  IX86_BUILTIN_PCOMLEQ,
+  IX86_BUILTIN_PCOMGTQ,
+  IX86_BUILTIN_PCOMGEQ,
+  IX86_BUILTIN_PCOMFALSEQ,
+  IX86_BUILTIN_PCOMTRUEQ,
+
   IX86_BUILTIN_MAX
 };
 
@@ -14520,6 +14988,22 @@ static const struct builtin_description
   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
 };
 
+static const struct builtin_description bdesc_ptest[] =
+{
+  /* SSE4.1/SSE5 */
+  { MASK_SSE5, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, 0 },
+  { MASK_SSE5, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, 0 },
+  { MASK_SSE5, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, 0 },
+};
+
+/* SSE builtins with 3 arguments, where the last argument must be an
+   immediate or xmm0.  */
+static const struct builtin_description bdesc_sse_3arg[] =
+{
+  /* SSE4.1/SSE5 */
+  { MASK_SSE5, CODE_FOR_sse4_1_roundsd, 0, IX86_BUILTIN_ROUNDSD, UNKNOWN, 0 },
+  { MASK_SSE5, CODE_FOR_sse4_1_roundss, 0, IX86_BUILTIN_ROUNDSS, UNKNOWN, 0 },
+};
+
 static const struct builtin_description bdesc_2arg[] =
 {
   /* SSE */
@@ -14875,6 +15359,295 @@ static const struct builtin_description
   { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 },
 };
 
+/* SSE5 */
+enum multi_arg_type {
+  MULTI_ARG_UNKNOWN,
+  MULTI_ARG_3_SF,
+  MULTI_ARG_3_DF,
+  MULTI_ARG_3_DI,
+  MULTI_ARG_3_SI,
+  MULTI_ARG_3_SI_DI,
+  MULTI_ARG_3_HI,
+  MULTI_ARG_3_HI_SI,
+  MULTI_ARG_3_QI,
+  MULTI_ARG_3_PERMPS,
+  MULTI_ARG_3_PERMPD,
+  MULTI_ARG_2_SF,
+  MULTI_ARG_2_DF,
+  MULTI_ARG_2_DI,
+  MULTI_ARG_2_SI,
+  MULTI_ARG_2_HI,
+  MULTI_ARG_2_QI,
+  MULTI_ARG_2_DI_IMM,
+  MULTI_ARG_2_SI_IMM,
+  MULTI_ARG_2_HI_IMM,
+  MULTI_ARG_2_QI_IMM,
+  MULTI_ARG_2_SF_CMP,
+  MULTI_ARG_2_DF_CMP,
+  MULTI_ARG_2_DI_CMP,
+  MULTI_ARG_2_SI_CMP,
+  MULTI_ARG_2_HI_CMP,
+  MULTI_ARG_2_QI_CMP,
+  MULTI_ARG_2_DI_TF,
+  MULTI_ARG_2_SI_TF,
+  MULTI_ARG_2_HI_TF,
+  MULTI_ARG_2_QI_TF,
+  MULTI_ARG_2_SF_TF,
+  MULTI_ARG_2_DF_TF,
+  MULTI_ARG_1_SF,
+  MULTI_ARG_1_DF,
+  MULTI_ARG_1_DI,
+  MULTI_ARG_1_SI,
+  MULTI_ARG_1_HI,
+  MULTI_ARG_1_QI,
+  MULTI_ARG_1_SI_DI,
+  MULTI_ARG_1_HI_DI,
+  MULTI_ARG_1_HI_SI,
+  MULTI_ARG_1_QI_DI,
+  MULTI_ARG_1_QI_SI,
+  MULTI_ARG_1_QI_HI,
+  MULTI_ARG_1_PH2PS,
+  MULTI_ARG_1_PS2PH
+};
+
+static const struct builtin_description bdesc_multi_arg[] =
+{
+  { MASK_SSE5, CODE_FOR_sse5i_vmfmaddv4sf4,     "__builtin_ia32_fmaddss",    IX86_BUILTIN_FMADDSS,    0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfmaddv2df4,     "__builtin_ia32_fmaddsd",    IX86_BUILTIN_FMADDSD,    0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_fmaddv4sf4,       "__builtin_ia32_fmaddps",    IX86_BUILTIN_FMADDPS,    0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_fmaddv2df4,       "__builtin_ia32_fmaddpd",    IX86_BUILTIN_FMADDPD,    0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfmsubv4sf4,     "__builtin_ia32_fmsubss",    IX86_BUILTIN_FMSUBSS,    0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfmsubv2df4,     "__builtin_ia32_fmsubsd",    IX86_BUILTIN_FMSUBSD,    0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_fmsubv4sf4,       "__builtin_ia32_fmsubps",    IX86_BUILTIN_FMSUBPS,    0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_fmsubv2df4,       "__builtin_ia32_fmsubpd",    IX86_BUILTIN_FMSUBPD,    0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfnmaddv4sf4,    "__builtin_ia32_fnmaddss",   IX86_BUILTIN_FNMADDSS,   0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfnmaddv2df4,    "__builtin_ia32_fnmaddsd",   IX86_BUILTIN_FNMADDSD,   0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_fnmaddv4sf4,      "__builtin_ia32_fnmaddps",   IX86_BUILTIN_FNMADDPS,   0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_fnmaddv2df4,      "__builtin_ia32_fnmaddpd",   IX86_BUILTIN_FNMADDPD,   0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfnmsubv4sf4,    "__builtin_ia32_fnmsubss",   IX86_BUILTIN_FNMSUBSS,   0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_vmfnmsubv2df4,    "__builtin_ia32_fnmsubsd",   IX86_BUILTIN_FNMSUBSD,   0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5i_fnmsubv4sf4,      "__builtin_ia32_fnmsubps",   IX86_BUILTIN_FNMSUBPS,   0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5i_fnmsubv2df4,      "__builtin_ia32_fnmsubpd",   IX86_BUILTIN_FNMSUBPD,   0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v2di,        "__builtin_ia32_pcmov",      IX86_BUILTIN_PCMOV_V2DI, 0,            (int)MULTI_ARG_3_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v2di,        "__builtin_ia32_pcmov_v2di", IX86_BUILTIN_PCMOV_V2DI, 0,            (int)MULTI_ARG_3_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v4si,        "__builtin_ia32_pcmov_v4si", IX86_BUILTIN_PCMOV_V4SI, 0,            (int)MULTI_ARG_3_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v8hi,        "__builtin_ia32_pcmov_v8hi", IX86_BUILTIN_PCMOV_V8HI, 0,            (int)MULTI_ARG_3_HI },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v16qi,       "__builtin_ia32_pcmov_v16qi",IX86_BUILTIN_PCMOV_V16QI,0,            (int)MULTI_ARG_3_QI },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v2df,        "__builtin_ia32_pcmov_v2df", IX86_BUILTIN_PCMOV_V2DF, 0,            (int)MULTI_ARG_3_DF },
+  { MASK_SSE5, CODE_FOR_sse5_pcmov_v4sf,        "__builtin_ia32_pcmov_v4sf", IX86_BUILTIN_PCMOV_V4SF, 0,            (int)MULTI_ARG_3_SF },
+  { MASK_SSE5, CODE_FOR_sse5_pperm,             "__builtin_ia32_pperm",      IX86_BUILTIN_PPERM,      0,            (int)MULTI_ARG_3_QI },
+  { MASK_SSE5, CODE_FOR_sse5_permv4sf,          "__builtin_ia32_permps",     IX86_BUILTIN_PERMPS,     0,            (int)MULTI_ARG_3_PERMPS },
+  { MASK_SSE5, CODE_FOR_sse5_permv2df,          "__builtin_ia32_permpd",     IX86_BUILTIN_PERMPD,     0,            (int)MULTI_ARG_3_PERMPD },
+  { MASK_SSE5, CODE_FOR_sse5_pmacssww,          "__builtin_ia32_pmacssww",   IX86_BUILTIN_PMACSSWW,   0,            (int)MULTI_ARG_3_HI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacsww,           "__builtin_ia32_pmacsww",    IX86_BUILTIN_PMACSWW,    0,            (int)MULTI_ARG_3_HI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacsswd,          "__builtin_ia32_pmacsswd",   IX86_BUILTIN_PMACSSWD,   0,            (int)MULTI_ARG_3_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacswd,           "__builtin_ia32_pmacswd",    IX86_BUILTIN_PMACSWD,    0,            (int)MULTI_ARG_3_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacssdd,          "__builtin_ia32_pmacssdd",   IX86_BUILTIN_PMACSSDD,   0,            (int)MULTI_ARG_3_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacsdd,           "__builtin_ia32_pmacsdd",    IX86_BUILTIN_PMACSDD,    0,            (int)MULTI_ARG_3_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacssdql,         "__builtin_ia32_pmacssdql",  IX86_BUILTIN_PMACSSDQL,  0,            (int)MULTI_ARG_3_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacssdqh,         "__builtin_ia32_pmacssdqh",  IX86_BUILTIN_PMACSSDQH,  0,            (int)MULTI_ARG_3_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacsdql,          "__builtin_ia32_pmacsdql",   IX86_BUILTIN_PMACSDQL,   0,            (int)MULTI_ARG_3_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pmacsdqh,          "__builtin_ia32_pmacsdqh",   IX86_BUILTIN_PMACSDQH,   0,            (int)MULTI_ARG_3_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_pmadcsswd,         "__builtin_ia32_pmadcsswd",  IX86_BUILTIN_PMADCSSWD,  0,            (int)MULTI_ARG_3_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_pmadcswd,          "__builtin_ia32_pmadcswd",   IX86_BUILTIN_PMADCSWD,   0,            (int)MULTI_ARG_3_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_rotlv2di3,         "__builtin_ia32_protq",      IX86_BUILTIN_PROTQ,      0,            (int)MULTI_ARG_2_DI },
+  { MASK_SSE5, CODE_FOR_sse5_rotlv4si3,         "__builtin_ia32_protd",      IX86_BUILTIN_PROTD,      0,            (int)MULTI_ARG_2_SI },
+  { MASK_SSE5, CODE_FOR_sse5_rotlv8hi3,         "__builtin_ia32_protw",      IX86_BUILTIN_PROTW,      0,            (int)MULTI_ARG_2_HI },
+  { MASK_SSE5, CODE_FOR_sse5_rotlv16qi3,        "__builtin_ia32_protb",      IX86_BUILTIN_PROTB,      0,            (int)MULTI_ARG_2_QI },
+  { MASK_SSE5, CODE_FOR_rotlv2di3,              "__builtin_ia32_protqi",     IX86_BUILTIN_PROTQ_IMM,  0,            (int)MULTI_ARG_2_DI_IMM },
+  { MASK_SSE5, CODE_FOR_rotlv4si3,              "__builtin_ia32_protdi",     IX86_BUILTIN_PROTD_IMM,  0,            (int)MULTI_ARG_2_SI_IMM },
+  { MASK_SSE5, CODE_FOR_rotlv8hi3,              "__builtin_ia32_protwi",     IX86_BUILTIN_PROTW_IMM,  0,            (int)MULTI_ARG_2_HI_IMM },
+  { MASK_SSE5, CODE_FOR_rotlv16qi3,             "__builtin_ia32_protbi",     IX86_BUILTIN_PROTB_IMM,  0,            (int)MULTI_ARG_2_QI_IMM },
+  { MASK_SSE5, CODE_FOR_sse5_ashlv2di3,         "__builtin_ia32_pshaq",      IX86_BUILTIN_PSHAQ,      0,            (int)MULTI_ARG_2_DI },
+  { MASK_SSE5, CODE_FOR_sse5_ashlv4si3,         "__builtin_ia32_pshad",      IX86_BUILTIN_PSHAD,      0,            (int)MULTI_ARG_2_SI },
+  { MASK_SSE5, CODE_FOR_sse5_ashlv8hi3,         "__builtin_ia32_pshaw",      IX86_BUILTIN_PSHAW,      0,            (int)MULTI_ARG_2_HI },
+  { MASK_SSE5, CODE_FOR_sse5_ashlv16qi3,        "__builtin_ia32_pshab",      IX86_BUILTIN_PSHAB,      0,            (int)MULTI_ARG_2_QI },
+  { MASK_SSE5, CODE_FOR_sse5_lshlv2di3,         "__builtin_ia32_pshlq",      IX86_BUILTIN_PSHLQ,      0,            (int)MULTI_ARG_2_DI },
+  { MASK_SSE5, CODE_FOR_sse5_lshlv4si3,         "__builtin_ia32_pshld",      IX86_BUILTIN_PSHLD,      0,            (int)MULTI_ARG_2_SI },
+  { MASK_SSE5, CODE_FOR_sse5_lshlv8hi3,         "__builtin_ia32_pshlw",      IX86_BUILTIN_PSHLW,      0,            (int)MULTI_ARG_2_HI },
+  { MASK_SSE5, CODE_FOR_sse5_lshlv16qi3,        "__builtin_ia32_pshlb",      IX86_BUILTIN_PSHLB,      0,            (int)MULTI_ARG_2_QI },
+  { MASK_SSE5, CODE_FOR_sse5_vmfrczv4sf2,       "__builtin_ia32_frczss",     IX86_BUILTIN_FRCZSS,     0,            (int)MULTI_ARG_2_SF },
+  { MASK_SSE5, CODE_FOR_sse5_vmfrczv2df2,       "__builtin_ia32_frczsd",     IX86_BUILTIN_FRCZSD,     0,            (int)MULTI_ARG_2_DF },
+  { MASK_SSE5, CODE_FOR_sse5_frczv4sf2,         "__builtin_ia32_frczps",     IX86_BUILTIN_FRCZPS,     0,            (int)MULTI_ARG_1_SF },
+  { MASK_SSE5, CODE_FOR_sse5_frczv2df2,         "__builtin_ia32_frczpd",     IX86_BUILTIN_FRCZPD,     0,            (int)MULTI_ARG_1_DF },
+  { MASK_SSE5, CODE_FOR_sse5_cvtph2ps,          "__builtin_ia32_cvtph2ps",   IX86_BUILTIN_CVTPH2PS,   0,            (int)MULTI_ARG_1_PH2PS },
+  { MASK_SSE5, CODE_FOR_sse5_cvtps2ph,          "__builtin_ia32_cvtps2ph",   IX86_BUILTIN_CVTPS2PH,   0,            (int)MULTI_ARG_1_PS2PH },
+  { MASK_SSE5, CODE_FOR_sse5_phaddbw,           "__builtin_ia32_phaddbw",    IX86_BUILTIN_PHADDBW,    0,            (int)MULTI_ARG_1_QI_HI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddbd,           "__builtin_ia32_phaddbd",    IX86_BUILTIN_PHADDBD,    0,            (int)MULTI_ARG_1_QI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddbq,           "__builtin_ia32_phaddbq",    IX86_BUILTIN_PHADDBQ,    0,            (int)MULTI_ARG_1_QI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddwd,           "__builtin_ia32_phaddwd",    IX86_BUILTIN_PHADDWD,    0,            (int)MULTI_ARG_1_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddwq,           "__builtin_ia32_phaddwq",    IX86_BUILTIN_PHADDWQ,    0,            (int)MULTI_ARG_1_HI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phadddq,           "__builtin_ia32_phadddq",    IX86_BUILTIN_PHADDDQ,    0,            (int)MULTI_ARG_1_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddubw,          "__builtin_ia32_phaddubw",   IX86_BUILTIN_PHADDUBW,   0,            (int)MULTI_ARG_1_QI_HI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddubd,          "__builtin_ia32_phaddubd",   IX86_BUILTIN_PHADDUBD,   0,            (int)MULTI_ARG_1_QI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddubq,          "__builtin_ia32_phaddubq",   IX86_BUILTIN_PHADDUBQ,   0,            (int)MULTI_ARG_1_QI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phadduwd,          "__builtin_ia32_phadduwd",   IX86_BUILTIN_PHADDUWD,   0,            (int)MULTI_ARG_1_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_phadduwq,          "__builtin_ia32_phadduwq",   IX86_BUILTIN_PHADDUWQ,   0,            (int)MULTI_ARG_1_HI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phaddudq,          "__builtin_ia32_phaddudq",   IX86_BUILTIN_PHADDUDQ,   0,            (int)MULTI_ARG_1_SI_DI },
+  { MASK_SSE5, CODE_FOR_sse5_phsubbw,           "__builtin_ia32_phsubbw",    IX86_BUILTIN_PHSUBBW,    0,            (int)MULTI_ARG_1_QI_HI },
+  { MASK_SSE5, CODE_FOR_sse5_phsubwd,           "__builtin_ia32_phsubwd",    IX86_BUILTIN_PHSUBWD,    0,            (int)MULTI_ARG_1_HI_SI },
+  { MASK_SSE5, CODE_FOR_sse5_phsubdq,           "__builtin_ia32_phsubdq",    IX86_BUILTIN_PHSUBDQ,    0,            (int)MULTI_ARG_1_SI_DI },
+
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comeqss",    IX86_BUILTIN_COMEQSS,    EQ,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comness",    IX86_BUILTIN_COMNESS,    NE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comneqss",   IX86_BUILTIN_COMNESS,    NE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comltss",    IX86_BUILTIN_COMLTSS,    LT,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comless",    IX86_BUILTIN_COMLESS,    LE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comgtss",    IX86_BUILTIN_COMGTSS,    GT,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comgess",    IX86_BUILTIN_COMGESS,    GE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comueqss",   IX86_BUILTIN_COMUEQSS,   UNEQ,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comuness",   IX86_BUILTIN_COMUNESS,   LTGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comuneqss",  IX86_BUILTIN_COMUNESS,   LTGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comunltss",  IX86_BUILTIN_COMULTSS,   UNLT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comunless",  IX86_BUILTIN_COMULESS,   UNLE,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comungtss",  IX86_BUILTIN_COMUGTSS,   UNGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comungess",  IX86_BUILTIN_COMUGESS,   UNGE,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comordss",   IX86_BUILTIN_COMORDSS,   ORDERED,      (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv4sf3,    "__builtin_ia32_comunordss", IX86_BUILTIN_COMUNORDSS, UNORDERED,    (int)MULTI_ARG_2_SF_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comeqsd",    IX86_BUILTIN_COMEQSD,    EQ,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comnesd",    IX86_BUILTIN_COMNESD,    NE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comneqsd",   IX86_BUILTIN_COMNESD,    NE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comltsd",    IX86_BUILTIN_COMLTSD,    LT,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comlesd",    IX86_BUILTIN_COMLESD,    LE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comgtsd",    IX86_BUILTIN_COMGTSD,    GT,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comgesd",    IX86_BUILTIN_COMGESD,    GE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comueqsd",   IX86_BUILTIN_COMUEQSD,   UNEQ,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comunesd",   IX86_BUILTIN_COMUNESD,   LTGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comuneqsd",  IX86_BUILTIN_COMUNESD,   LTGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comunltsd",  IX86_BUILTIN_COMULTSD,   UNLT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comunlesd",  IX86_BUILTIN_COMULESD,   UNLE,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comungtsd",  IX86_BUILTIN_COMUGTSD,   UNGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comungesd",  IX86_BUILTIN_COMUGESD,   UNGE,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comordsd",   IX86_BUILTIN_COMORDSD,   ORDERED,      (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_vmmaskcmpv2df3,    "__builtin_ia32_comunordsd", IX86_BUILTIN_COMUNORDSD, UNORDERED,    (int)MULTI_ARG_2_DF_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comeqps",    IX86_BUILTIN_COMEQPS,    EQ,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comneps",    IX86_BUILTIN_COMNEPS,    NE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comneqps",   IX86_BUILTIN_COMNEPS,    NE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comltps",    IX86_BUILTIN_COMLTPS,    LT,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comleps",    IX86_BUILTIN_COMLEPS,    LE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comgtps",    IX86_BUILTIN_COMGTPS,    GT,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comgeps",    IX86_BUILTIN_COMGEPS,    GE,           (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comueqps",   IX86_BUILTIN_COMUEQPS,   UNEQ,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comuneps",   IX86_BUILTIN_COMUNEPS,   LTGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comuneqps",  IX86_BUILTIN_COMUNEPS,   LTGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comunltps",  IX86_BUILTIN_COMULTPS,   UNLT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comunleps",  IX86_BUILTIN_COMULEPS,   UNLE,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comungtps",  IX86_BUILTIN_COMUGTPS,   UNGT,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comungeps",  IX86_BUILTIN_COMUGEPS,   UNGE,         (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comordps",   IX86_BUILTIN_COMORDPS,   ORDERED,      (int)MULTI_ARG_2_SF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4sf3,      "__builtin_ia32_comunordps", IX86_BUILTIN_COMUNORDPS, UNORDERED,    (int)MULTI_ARG_2_SF_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comeqpd",    IX86_BUILTIN_COMEQPD,    EQ,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comnepd",    IX86_BUILTIN_COMNEPD,    NE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comneqpd",   IX86_BUILTIN_COMNEPD,    NE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comltpd",    IX86_BUILTIN_COMLTPD,    LT,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comlepd",    IX86_BUILTIN_COMLEPD,    LE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comgtpd",    IX86_BUILTIN_COMGTPD,    GT,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comgepd",    IX86_BUILTIN_COMGEPD,    GE,           (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comueqpd",   IX86_BUILTIN_COMUEQPD,   UNEQ,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comunepd",   IX86_BUILTIN_COMUNEPD,   LTGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comuneqpd",  IX86_BUILTIN_COMUNEPD,   LTGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comunltpd",  IX86_BUILTIN_COMULTPD,   UNLT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comunlepd",  IX86_BUILTIN_COMULEPD,   UNLE,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comungtpd",  IX86_BUILTIN_COMUGTPD,   UNGT,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comungepd",  IX86_BUILTIN_COMUGEPD,   UNGE,         (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comordpd",   IX86_BUILTIN_COMORDPD,   ORDERED,      (int)MULTI_ARG_2_DF_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2df3,      "__builtin_ia32_comunordpd", IX86_BUILTIN_COMUNORDPD, UNORDERED,    (int)MULTI_ARG_2_DF_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomeqb",    IX86_BUILTIN_PCOMEQB,    EQ,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomneb",    IX86_BUILTIN_PCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomneqb",   IX86_BUILTIN_PCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomltb",    IX86_BUILTIN_PCOMLTB,    LT,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomleb",    IX86_BUILTIN_PCOMLEB,    LE,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomgtb",    IX86_BUILTIN_PCOMGTB,    GT,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv16qi3,     "__builtin_ia32_pcomgeb",    IX86_BUILTIN_PCOMGEB,    GE,           (int)MULTI_ARG_2_QI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomeqw",    IX86_BUILTIN_PCOMEQW,    EQ,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomnew",    IX86_BUILTIN_PCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomneqw",   IX86_BUILTIN_PCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomltw",    IX86_BUILTIN_PCOMLTW,    LT,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomlew",    IX86_BUILTIN_PCOMLEW,    LE,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomgtw",    IX86_BUILTIN_PCOMGTW,    GT,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv8hi3,      "__builtin_ia32_pcomgew",    IX86_BUILTIN_PCOMGEW,    GE,           (int)MULTI_ARG_2_HI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomeqd",    IX86_BUILTIN_PCOMEQD,    EQ,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomned",    IX86_BUILTIN_PCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomneqd",   IX86_BUILTIN_PCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomltd",    IX86_BUILTIN_PCOMLTD,    LT,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomled",    IX86_BUILTIN_PCOMLED,    LE,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomgtd",    IX86_BUILTIN_PCOMGTD,    GT,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv4si3,      "__builtin_ia32_pcomged",    IX86_BUILTIN_PCOMGED,    GE,           (int)MULTI_ARG_2_SI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomeqq",    IX86_BUILTIN_PCOMEQQ,    EQ,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomneq",    IX86_BUILTIN_PCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomneqq",   IX86_BUILTIN_PCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomltq",    IX86_BUILTIN_PCOMLTQ,    LT,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomleq",    IX86_BUILTIN_PCOMLEQ,    LE,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomgtq",    IX86_BUILTIN_PCOMGTQ,    GT,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmpv2di3,      "__builtin_ia32_pcomgeq",    IX86_BUILTIN_PCOMGEQ,    GE,           (int)MULTI_ARG_2_DI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v16qi3,"__builtin_ia32_pcomequb",   IX86_BUILTIN_PCOMEQUB,   EQ,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v16qi3,"__builtin_ia32_pcomneub",   IX86_BUILTIN_PCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v16qi3,"__builtin_ia32_pcomnequb",  IX86_BUILTIN_PCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv16qi3, "__builtin_ia32_pcomltub",   IX86_BUILTIN_PCOMLTUB,   LTU,          (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv16qi3, "__builtin_ia32_pcomleub",   IX86_BUILTIN_PCOMLEUB,   LEU,          (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv16qi3, "__builtin_ia32_pcomgtub",   IX86_BUILTIN_PCOMGTUB,   GTU,          (int)MULTI_ARG_2_QI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv16qi3, "__builtin_ia32_pcomgeub",   IX86_BUILTIN_PCOMGEUB,   GEU,          (int)MULTI_ARG_2_QI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v8hi3, "__builtin_ia32_pcomequw",   IX86_BUILTIN_PCOMEQUW,   EQ,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v8hi3, "__builtin_ia32_pcomneuw",   IX86_BUILTIN_PCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v8hi3, "__builtin_ia32_pcomnequw",  IX86_BUILTIN_PCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv8hi3,  "__builtin_ia32_pcomltuw",   IX86_BUILTIN_PCOMLTUW,   LTU,          (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv8hi3,  "__builtin_ia32_pcomleuw",   IX86_BUILTIN_PCOMLEUW,   LEU,          (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv8hi3,  "__builtin_ia32_pcomgtuw",   IX86_BUILTIN_PCOMGTUW,   GTU,          (int)MULTI_ARG_2_HI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv8hi3,  "__builtin_ia32_pcomgeuw",   IX86_BUILTIN_PCOMGEUW,   GEU,          (int)MULTI_ARG_2_HI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v4si3, "__builtin_ia32_pcomequd",   IX86_BUILTIN_PCOMEQUD,   EQ,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v4si3, "__builtin_ia32_pcomneud",   IX86_BUILTIN_PCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v4si3, "__builtin_ia32_pcomnequd",  IX86_BUILTIN_PCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv4si3,  "__builtin_ia32_pcomltud",   IX86_BUILTIN_PCOMLTUD,   LTU,          (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv4si3,  "__builtin_ia32_pcomleud",   IX86_BUILTIN_PCOMLEUD,   LEU,          (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv4si3,  "__builtin_ia32_pcomgtud",   IX86_BUILTIN_PCOMGTUD,   GTU,          (int)MULTI_ARG_2_SI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv4si3,  "__builtin_ia32_pcomgeud",   IX86_BUILTIN_PCOMGEUD,   GEU,          (int)MULTI_ARG_2_SI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v2di3, "__builtin_ia32_pcomequq",   IX86_BUILTIN_PCOMEQUQ,   EQ,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v2di3, "__builtin_ia32_pcomneuq",   IX86_BUILTIN_PCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_uns2v2di3, "__builtin_ia32_pcomnequq",  IX86_BUILTIN_PCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv2di3,  "__builtin_ia32_pcomltuq",   IX86_BUILTIN_PCOMLTUQ,   LTU,          (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv2di3,  "__builtin_ia32_pcomleuq",   IX86_BUILTIN_PCOMLEUQ,   LEU,          (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv2di3,  "__builtin_ia32_pcomgtuq",   IX86_BUILTIN_PCOMGTUQ,   GTU,          (int)MULTI_ARG_2_DI_CMP },
+  { MASK_SSE5, CODE_FOR_sse5_maskcmp_unsv2di3,  "__builtin_ia32_pcomgeuq",   IX86_BUILTIN_PCOMGEUQ,   GEU,          (int)MULTI_ARG_2_DI_CMP },
+
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv4sf3,       "__builtin_ia32_comfalsess", IX86_BUILTIN_COMFALSESS, COM_FALSE_S,  (int)MULTI_ARG_2_SF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv4sf3,       "__builtin_ia32_comtruess",  IX86_BUILTIN_COMTRUESS,  COM_TRUE_S,   (int)MULTI_ARG_2_SF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv4sf3,       "__builtin_ia32_comfalseps", IX86_BUILTIN_COMFALSEPS, COM_FALSE_P,  (int)MULTI_ARG_2_SF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv4sf3,       "__builtin_ia32_comtrueps",  IX86_BUILTIN_COMTRUEPS,  COM_TRUE_P,   (int)MULTI_ARG_2_SF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv2df3,       "__builtin_ia32_comfalsesd", IX86_BUILTIN_COMFALSESD, COM_FALSE_S,  (int)MULTI_ARG_2_DF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv2df3,       "__builtin_ia32_comtruesd",  IX86_BUILTIN_COMTRUESD,  COM_TRUE_S,   (int)MULTI_ARG_2_DF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv2df3,       "__builtin_ia32_comfalsepd", IX86_BUILTIN_COMFALSEPD, COM_FALSE_P,  (int)MULTI_ARG_2_DF_TF },
+  { MASK_SSE5, CODE_FOR_sse5_com_tfv2df3,       "__builtin_ia32_comtruepd",  IX86_BUILTIN_COMTRUEPD,  COM_TRUE_P,   (int)MULTI_ARG_2_DF_TF },
+
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv16qi3,     "__builtin_ia32_pcomfalseb", IX86_BUILTIN_PCOMFALSEB, PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv8hi3,      "__builtin_ia32_pcomfalsew", IX86_BUILTIN_PCOMFALSEW, PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv4si3,      "__builtin_ia32_pcomfalsed", IX86_BUILTIN_PCOMFALSED, PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv2di3,      "__builtin_ia32_pcomfalseq", IX86_BUILTIN_PCOMFALSEQ, PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv16qi3,     "__builtin_ia32_pcomfalseub",IX86_BUILTIN_PCOMFALSEUB,PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv8hi3,      "__builtin_ia32_pcomfalseuw",IX86_BUILTIN_PCOMFALSEUW,PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv4si3,      "__builtin_ia32_pcomfalseud",IX86_BUILTIN_PCOMFALSEUD,PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv2di3,      "__builtin_ia32_pcomfalseuq",IX86_BUILTIN_PCOMFALSEUQ,PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
+
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv16qi3,     "__builtin_ia32_pcomtrueb",  IX86_BUILTIN_PCOMTRUEB,  PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv8hi3,      "__builtin_ia32_pcomtruew",  IX86_BUILTIN_PCOMTRUEW,  PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv4si3,      "__builtin_ia32_pcomtrued",  IX86_BUILTIN_PCOMTRUED,  PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv2di3,      "__builtin_ia32_pcomtrueq",  IX86_BUILTIN_PCOMTRUEQ,  PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv16qi3,     "__builtin_ia32_pcomtrueub", IX86_BUILTIN_PCOMTRUEUB, PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv8hi3,      "__builtin_ia32_pcomtrueuw", IX86_BUILTIN_PCOMTRUEUW, PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv4si3,      "__builtin_ia32_pcomtrueud", IX86_BUILTIN_PCOMTRUEUD, PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
+  { MASK_SSE5, CODE_FOR_sse5_pcom_tfv2di3,      "__builtin_ia32_pcomtrueuq", IX86_BUILTIN_PCOMTRUEUQ, PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
+};
+
 static void
 ix86_init_builtins (void)
 {
@@ -15203,6 +15976,137 @@ ix86_init_mmx_sse_builtins (void)
   tree v2di_ftype_v2di_v16qi
     = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
                                 NULL_TREE);
+  tree v2df_ftype_v2df_v2df_v2df
+    = build_function_type_list (V2DF_type_node,
+				V2DF_type_node, V2DF_type_node,
+				V2DF_type_node, NULL_TREE);
+  tree v4sf_ftype_v4sf_v4sf_v4sf
+    = build_function_type_list (V4SF_type_node,
+				V4SF_type_node, V4SF_type_node,
+				V4SF_type_node, NULL_TREE);
+  tree v8hi_ftype_v16qi
+    = build_function_type_list (V8HI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v16qi
+    = build_function_type_list (V4SI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v16qi
+    = build_function_type_list (V2DI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v8hi
+    = build_function_type_list (V4SI_type_node, V8HI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v8hi
+    = build_function_type_list (V2DI_type_node, V8HI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v4si
+    = build_function_type_list (V2DI_type_node, V4SI_type_node,
+				NULL_TREE);
+  tree v16qi_ftype_v16qi_v16qi_int
+    = build_function_type_list (V16QI_type_node, V16QI_type_node,
+				V16QI_type_node, integer_type_node,
+				NULL_TREE);
+  tree v8hi_ftype_v8hi_v8hi_int
+    = build_function_type_list (V8HI_type_node, V8HI_type_node,
+				V8HI_type_node, integer_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v4si_v4si_int
+    = build_function_type_list (V4SI_type_node, V4SI_type_node,
+				V4SI_type_node, integer_type_node,
+				NULL_TREE);
+  tree int_ftype_v2di_v2di
+    = build_function_type_list (integer_type_node,
+				V2DI_type_node, V2DI_type_node,
+				NULL_TREE);
+  tree v16qi_ftype_v16qi_v16qi_v16qi
+    = build_function_type_list (V16QI_type_node, V16QI_type_node,
+				V16QI_type_node, V16QI_type_node,
+				NULL_TREE);
+  /* SSE5 instructions */
+  tree v2di_ftype_v2di_v2di_v2di
+    = build_function_type_list (V2DI_type_node,
+				V2DI_type_node,
+				V2DI_type_node,
+				V2DI_type_node,
+				NULL_TREE);
+
+  tree v4si_ftype_v4si_v4si_v4si
+    = build_function_type_list (V4SI_type_node,
+				V4SI_type_node,
+				V4SI_type_node,
+				V4SI_type_node,
+				NULL_TREE);
+
+  tree v4si_ftype_v4si_v4si_v2di
+    = build_function_type_list (V4SI_type_node,
+				V4SI_type_node,
+				V4SI_type_node,
+				V2DI_type_node,
+				NULL_TREE);
+
+  tree v8hi_ftype_v8hi_v8hi_v8hi
+    = build_function_type_list (V8HI_type_node,
+				V8HI_type_node,
+				V8HI_type_node,
+				V8HI_type_node,
+				NULL_TREE);
+
+  tree v8hi_ftype_v8hi_v8hi_v4si
+    = build_function_type_list (V8HI_type_node,
+				V8HI_type_node,
+				V8HI_type_node,
+				V4SI_type_node,
+				NULL_TREE);
+
+  tree v2df_ftype_v2df_v2df_v16qi
+    = build_function_type_list (V2DF_type_node,
+				V2DF_type_node,
+				V2DF_type_node,
+				V16QI_type_node,
+				NULL_TREE);
+
+  tree v4sf_ftype_v4sf_v4sf_v16qi
+    = build_function_type_list (V4SF_type_node,
+				V4SF_type_node,
+				V4SF_type_node,
+				V16QI_type_node,
+				NULL_TREE);
+
+  tree v2di_ftype_v2di_si
+    = build_function_type_list (V2DI_type_node,
+				V2DI_type_node,
+				integer_type_node,
+				NULL_TREE);
+
+  tree v4si_ftype_v4si_si
+    = build_function_type_list (V4SI_type_node,
+				V4SI_type_node,
+				integer_type_node,
+				NULL_TREE);
+
+  tree v8hi_ftype_v8hi_si
+    = build_function_type_list (V8HI_type_node,
+				V8HI_type_node,
+				integer_type_node,
+				NULL_TREE);
+
+  tree v16qi_ftype_v16qi_si
+    = build_function_type_list (V16QI_type_node,
+				V16QI_type_node,
+				integer_type_node,
+				NULL_TREE);
+  tree v4sf_ftype_v4hi
+    = build_function_type_list (V4SF_type_node,
+				V4HI_type_node,
+				NULL_TREE);
+
+  tree v4hi_ftype_v4sf
+    = build_function_type_list (V4HI_type_node,
+				V4SF_type_node,
+				NULL_TREE);
+
+  tree v2di_ftype_v2di
+    = build_function_type_list (V2DI_type_node, V2DI_type_node, NULL_TREE);
 
   tree float80_type;
   tree float128_type;
@@ -15229,6 +16133,48 @@ ix86_init_mmx_sse_builtins (void)
       (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
     }
 
+  /* Add all SSE builtins that are more or less simple operations on
+     three operands.  */
+  for (i = 0, d = bdesc_sse_3arg;
+       i < ARRAY_SIZE (bdesc_sse_3arg);
+       i++, d++)
+    {
+      /* Use one of the operands; the target can have a different mode for
+	 mask-generating compares.  */
+      enum machine_mode mode;
+      tree type;
+
+      if (d->name == 0)
+	continue;
+      mode = insn_data[d->icode].operand[1].mode;
+
+      switch (mode)
+	{
+	case V16QImode:
+	  type = v16qi_ftype_v16qi_v16qi_int;
+	  break;
+	case V8HImode:
+	  type = v8hi_ftype_v8hi_v8hi_int;
+	  break;
+	case V4SImode:
+	  type = v4si_ftype_v4si_v4si_int;
+	  break;
+	case V2DImode:
+	  type = v2di_ftype_v2di_v2di_int;
+	  break;
+	case V2DFmode:
+	  type = v2df_ftype_v2df_v2df_int;
+	  break;
+	case V4SFmode:
+	  type = v4sf_ftype_v4sf_v4sf_int;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      def_builtin (d->mask, d->name, type, d->code);
+    }
+
   /* Add all builtins that are more or less simple operations on two
      operands.  */
   for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
@@ -15354,6 +16300,10 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (MASK_MMX, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
   def_builtin (MASK_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
 
+  /* ptest insns.  */
+  for (i = 0, d = bdesc_ptest; i < ARRAY_SIZE (bdesc_ptest); i++, d++)
+    def_builtin (d->mask, d->name, int_ftype_v2di_v2di, d->code);
+
   /* comi/ucomi insns.  */
   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
     if (d->mask == MASK_SSE2)
@@ -15563,6 +16513,12 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi", v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
   def_builtin (MASK_SSE4A, "__builtin_ia32_insertq", v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
 
+  /* SSE4.1 and SSE5 */
+  def_builtin (MASK_SSE5, "__builtin_ia32_roundpd", v2df_ftype_v2df_int, IX86_BUILTIN_ROUNDPD);
+  def_builtin (MASK_SSE5, "__builtin_ia32_roundps", v4sf_ftype_v4sf_int, IX86_BUILTIN_ROUNDPS);
+  def_builtin (MASK_SSE5, "__builtin_ia32_roundsd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_ROUNDSD);
+  def_builtin (MASK_SSE5, "__builtin_ia32_roundss", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_ROUNDSS);
+
   /* Access to the vec_init patterns.  */
   ftype = build_function_type_list (V2SI_type_node, integer_type_node,
 				    integer_type_node, NULL_TREE);
@@ -15633,6 +16589,72 @@ ix86_init_mmx_sse_builtins (void)
 				    integer_type_node, NULL_TREE);
   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi",
 	       ftype, IX86_BUILTIN_VEC_SET_V4HI);
+
+
+  /* Add SSE5 multi-argument instructions.  */
+  for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+    {
+      tree mtype = NULL_TREE;
+
+      if (d->name == 0)
+	continue;
+
+      switch ((enum multi_arg_type)d->flag)
+	{
+	case MULTI_ARG_3_SF:     mtype = v4sf_ftype_v4sf_v4sf_v4sf; 	break;
+	case MULTI_ARG_3_DF:     mtype = v2df_ftype_v2df_v2df_v2df; 	break;
+	case MULTI_ARG_3_DI:     mtype = v2di_ftype_v2di_v2di_v2di; 	break;
+	case MULTI_ARG_3_SI:     mtype = v4si_ftype_v4si_v4si_v4si; 	break;
+	case MULTI_ARG_3_SI_DI:  mtype = v4si_ftype_v4si_v4si_v2di; 	break;
+	case MULTI_ARG_3_HI:     mtype = v8hi_ftype_v8hi_v8hi_v8hi; 	break;
+	case MULTI_ARG_3_HI_SI:  mtype = v8hi_ftype_v8hi_v8hi_v4si; 	break;
+	case MULTI_ARG_3_QI:     mtype = v16qi_ftype_v16qi_v16qi_v16qi; break;
+	case MULTI_ARG_3_PERMPS: mtype = v4sf_ftype_v4sf_v4sf_v16qi; 	break;
+	case MULTI_ARG_3_PERMPD: mtype = v2df_ftype_v2df_v2df_v16qi; 	break;
+	case MULTI_ARG_2_SF:     mtype = v4sf_ftype_v4sf_v4sf;      	break;
+	case MULTI_ARG_2_DF:     mtype = v2df_ftype_v2df_v2df;      	break;
+	case MULTI_ARG_2_DI:     mtype = v2di_ftype_v2di_v2di;      	break;
+	case MULTI_ARG_2_SI:     mtype = v4si_ftype_v4si_v4si;      	break;
+	case MULTI_ARG_2_HI:     mtype = v8hi_ftype_v8hi_v8hi;      	break;
+	case MULTI_ARG_2_QI:     mtype = v16qi_ftype_v16qi_v16qi;      	break;
+	case MULTI_ARG_2_DI_IMM: mtype = v2di_ftype_v2di_si;        	break;
+	case MULTI_ARG_2_SI_IMM: mtype = v4si_ftype_v4si_si;        	break;
+	case MULTI_ARG_2_HI_IMM: mtype = v8hi_ftype_v8hi_si;        	break;
+	case MULTI_ARG_2_QI_IMM: mtype = v16qi_ftype_v16qi_si;        	break;
+	case MULTI_ARG_2_SF_CMP: mtype = v4sf_ftype_v4sf_v4sf;      	break;
+	case MULTI_ARG_2_DF_CMP: mtype = v2df_ftype_v2df_v2df;      	break;
+	case MULTI_ARG_2_DI_CMP: mtype = v2di_ftype_v2di_v2di;      	break;
+	case MULTI_ARG_2_SI_CMP: mtype = v4si_ftype_v4si_v4si;      	break;
+	case MULTI_ARG_2_HI_CMP: mtype = v8hi_ftype_v8hi_v8hi;      	break;
+	case MULTI_ARG_2_QI_CMP: mtype = v16qi_ftype_v16qi_v16qi;      	break;
+	case MULTI_ARG_2_SF_TF:  mtype = v4sf_ftype_v4sf_v4sf;      	break;
+	case MULTI_ARG_2_DF_TF:  mtype = v2df_ftype_v2df_v2df;      	break;
+	case MULTI_ARG_2_DI_TF:  mtype = v2di_ftype_v2di_v2di;      	break;
+	case MULTI_ARG_2_SI_TF:  mtype = v4si_ftype_v4si_v4si;      	break;
+	case MULTI_ARG_2_HI_TF:  mtype = v8hi_ftype_v8hi_v8hi;      	break;
+	case MULTI_ARG_2_QI_TF:  mtype = v16qi_ftype_v16qi_v16qi;      	break;
+	case MULTI_ARG_1_SF:     mtype = v4sf_ftype_v4sf;           	break;
+	case MULTI_ARG_1_DF:     mtype = v2df_ftype_v2df;           	break;
+	case MULTI_ARG_1_DI:     mtype = v2di_ftype_v2di;           	break;
+	case MULTI_ARG_1_SI:     mtype = v4si_ftype_v4si;           	break;
+	case MULTI_ARG_1_HI:     mtype = v8hi_ftype_v8hi;           	break;
+	case MULTI_ARG_1_QI:     mtype = v16qi_ftype_v16qi;           	break;
+	case MULTI_ARG_1_SI_DI:  mtype = v2di_ftype_v4si;           	break;
+	case MULTI_ARG_1_HI_DI:  mtype = v2di_ftype_v8hi;           	break;
+	case MULTI_ARG_1_HI_SI:  mtype = v4si_ftype_v8hi;           	break;
+	case MULTI_ARG_1_QI_DI:  mtype = v2di_ftype_v16qi;           	break;
+	case MULTI_ARG_1_QI_SI:  mtype = v4si_ftype_v16qi;           	break;
+	case MULTI_ARG_1_QI_HI:  mtype = v8hi_ftype_v16qi;           	break;
+	case MULTI_ARG_1_PH2PS:  mtype = v4sf_ftype_v4hi;		break;
+	case MULTI_ARG_1_PS2PH:  mtype = v4hi_ftype_v4sf;		break;
+	case MULTI_ARG_UNKNOWN:
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (mtype)
+	def_builtin (d->mask, d->name, mtype, d->code);
+    }
 }
 
 /* Errors in the source file can cause expand_expr to return const0_rtx
@@ -15714,6 +16736,189 @@ ix86_expand_binop_builtin (enum insn_cod
   return target;
 }
 
+/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
+
+static rtx
+ix86_expand_multi_arg_builtin (enum insn_code icode, tree arglist, rtx target,
+			       enum multi_arg_type m_type,
+			       enum rtx_code sub_code)
+{
+  rtx pat;
+  int i;
+  int nargs;
+  bool comparison_p = false;
+  bool tf_p = false;
+  bool last_arg_constant = false;
+  int num_memory = 0;
+  struct {
+    rtx op;
+    enum machine_mode mode;
+  } args[4];
+
+  enum machine_mode tmode = insn_data[icode].operand[0].mode;
+
+  switch (m_type)
+    {
+    case MULTI_ARG_3_SF:
+    case MULTI_ARG_3_DF:
+    case MULTI_ARG_3_DI:
+    case MULTI_ARG_3_SI:
+    case MULTI_ARG_3_SI_DI:
+    case MULTI_ARG_3_HI:
+    case MULTI_ARG_3_HI_SI:
+    case MULTI_ARG_3_QI:
+    case MULTI_ARG_3_PERMPS:
+    case MULTI_ARG_3_PERMPD:
+      nargs = 3;
+      break;
+
+    case MULTI_ARG_2_SF:
+    case MULTI_ARG_2_DF:
+    case MULTI_ARG_2_DI:
+    case MULTI_ARG_2_SI:
+    case MULTI_ARG_2_HI:
+    case MULTI_ARG_2_QI:
+      nargs = 2;
+      break;
+
+    case MULTI_ARG_2_DI_IMM:
+    case MULTI_ARG_2_SI_IMM:
+    case MULTI_ARG_2_HI_IMM:
+    case MULTI_ARG_2_QI_IMM:
+      nargs = 2;
+      last_arg_constant = true;
+      break;
+
+    case MULTI_ARG_1_SF:
+    case MULTI_ARG_1_DF:
+    case MULTI_ARG_1_DI:
+    case MULTI_ARG_1_SI:
+    case MULTI_ARG_1_HI:
+    case MULTI_ARG_1_QI:
+    case MULTI_ARG_1_SI_DI:
+    case MULTI_ARG_1_HI_DI:
+    case MULTI_ARG_1_HI_SI:
+    case MULTI_ARG_1_QI_DI:
+    case MULTI_ARG_1_QI_SI:
+    case MULTI_ARG_1_QI_HI:
+    case MULTI_ARG_1_PH2PS:
+    case MULTI_ARG_1_PS2PH:
+      nargs = 1;
+      break;
+
+    case MULTI_ARG_2_SF_CMP:
+    case MULTI_ARG_2_DF_CMP:
+    case MULTI_ARG_2_DI_CMP:
+    case MULTI_ARG_2_SI_CMP:
+    case MULTI_ARG_2_HI_CMP:
+    case MULTI_ARG_2_QI_CMP:
+      nargs = 2;
+      comparison_p = true;
+      break;
+
+    case MULTI_ARG_2_SF_TF:
+    case MULTI_ARG_2_DF_TF:
+    case MULTI_ARG_2_DI_TF:
+    case MULTI_ARG_2_SI_TF:
+    case MULTI_ARG_2_HI_TF:
+    case MULTI_ARG_2_QI_TF:
+      nargs = 2;
+      tf_p = true;
+      break;
+
+    case MULTI_ARG_UNKNOWN:
+    default:
+      gcc_unreachable ();
+    }
+
+  if (optimize || !target
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  gcc_assert (nargs <= 4);
+
+  for (i = 0; i < nargs; i++)
+    {
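+      /* For comparisons, operand 1 of the insn is the comparison rtx
+	 itself, so the input operands start one slot later.  */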
+      int adjust = (comparison_p) ? 1 : 0;
+      enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
+      tree arg;
+      rtx op;
+
+      gcc_assert (arglist != NULL);
+      arg = TREE_VALUE (arglist);
+
+      gcc_assert (arg != NULL);
+      op = expand_expr (arg, NULL_RTX, VOIDmode, 0);
+
+      if (last_arg_constant && i == nargs-1)
+	{
+	  if (GET_CODE (op) != CONST_INT)
+	    {
+	      error ("last argument must be an immediate");
+	      return gen_reg_rtx (tmode);
+	    }
+	}
+      else
+	{
+	  if (VECTOR_MODE_P (mode))
+	    op = safe_vector_operand (op, mode);
+
+	  /* If we aren't optimizing, only allow one memory operand to be
+	     generated.  */
+	  if (memory_operand (op, mode))
+	    num_memory++;
+
+	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
+
+	  if (optimize
+	      || ! (*insn_data[icode].operand[i+adjust+1].predicate) (op, mode)
+	      || num_memory > 1)
+	    op = force_reg (mode, op);
+	}
+
+      args[i].op = op;
+      args[i].mode = mode;
+      arglist = TREE_CHAIN (arglist);
+    }
+
+  switch (nargs)
+    {
+    case 1:
+      pat = GEN_FCN (icode) (target, args[0].op);
+      break;
+
+    case 2:
+      if (tf_p)
+	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			       GEN_INT ((int)sub_code));
+      else if (! comparison_p)
+	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+      else
+	{
+	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
+				       args[0].op,
+				       args[1].op);
+
+	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
+	}
+      break;
+
+    case 3:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+  return target;
+}
+
 /* Subroutine of ix86_expand_builtin to take care of stores.  */
 
 static rtx
@@ -16627,6 +17832,12 @@ ix86_expand_builtin (tree exp, rtx targe
     if (d->code == fcode)
       return ix86_expand_sse_comi (d, arglist, target);
 
+  for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_multi_arg_builtin (d->icode, arglist, target,
+					    (enum multi_arg_type)d->flag,
+					    d->comparison);
+
   gcc_unreachable ();
 }
 
Index: gcc/doc/extend.texi
===================================================================
--- gcc/doc/extend.texi.orig	2009-11-20 13:42:41.000000000 +0100
+++ gcc/doc/extend.texi	2009-11-20 13:43:02.000000000 +0100
@@ -6917,6 +6917,222 @@ __m128i          _mm_insert_si64 (__m128
 __m128i          _mm_inserti_si64 (__m128i, __m128i, int, int);
 @end smallexample
 
+The following built-in functions are available when @option{-msse5} is used.
+All of them generate the machine instruction that is part of the name
+with XMM registers.
+
+@smallexample
+v2df __builtin_ia32_comeqpd (v2df, v2df)
+v4sf __builtin_ia32_comeqps (v4sf, v4sf)
+v2df __builtin_ia32_comeqsd (v2df, v2df)
+v4sf __builtin_ia32_comeqss (v4sf, v4sf)
+v2df __builtin_ia32_comfalsepd (v2df, v2df)
+v4sf __builtin_ia32_comfalseps (v4sf, v4sf)
+v2df __builtin_ia32_comfalsesd (v2df, v2df)
+v4sf __builtin_ia32_comfalsess (v4sf, v4sf)
+v2df __builtin_ia32_comgepd (v2df, v2df)
+v4sf __builtin_ia32_comgeps (v4sf, v4sf)
+v2df __builtin_ia32_comgesd (v2df, v2df)
+v4sf __builtin_ia32_comgess (v4sf, v4sf)
+v2df __builtin_ia32_comgtpd (v2df, v2df)
+v4sf __builtin_ia32_comgtps (v4sf, v4sf)
+v2df __builtin_ia32_comgtsd (v2df, v2df)
+v4sf __builtin_ia32_comgtss (v4sf, v4sf)
+v2df __builtin_ia32_comlepd (v2df, v2df)
+v4sf __builtin_ia32_comleps (v4sf, v4sf)
+v2df __builtin_ia32_comlesd (v2df, v2df)
+v4sf __builtin_ia32_comless (v4sf, v4sf)
+v2df __builtin_ia32_comltpd (v2df, v2df)
+v4sf __builtin_ia32_comltps (v4sf, v4sf)
+v2df __builtin_ia32_comltsd (v2df, v2df)
+v4sf __builtin_ia32_comltss (v4sf, v4sf)
+v2df __builtin_ia32_comnepd (v2df, v2df)
+v4sf __builtin_ia32_comneps (v4sf, v4sf)
+v2df __builtin_ia32_comnesd (v2df, v2df)
+v4sf __builtin_ia32_comness (v4sf, v4sf)
+v2df __builtin_ia32_comordpd (v2df, v2df)
+v4sf __builtin_ia32_comordps (v4sf, v4sf)
+v2df __builtin_ia32_comordsd (v2df, v2df)
+v4sf __builtin_ia32_comordss (v4sf, v4sf)
+v2df __builtin_ia32_comtruepd (v2df, v2df)
+v4sf __builtin_ia32_comtrueps (v4sf, v4sf)
+v2df __builtin_ia32_comtruesd (v2df, v2df)
+v4sf __builtin_ia32_comtruess (v4sf, v4sf)
+v2df __builtin_ia32_comueqpd (v2df, v2df)
+v4sf __builtin_ia32_comueqps (v4sf, v4sf)
+v2df __builtin_ia32_comueqsd (v2df, v2df)
+v4sf __builtin_ia32_comueqss (v4sf, v4sf)
+v2df __builtin_ia32_comugepd (v2df, v2df)
+v4sf __builtin_ia32_comugeps (v4sf, v4sf)
+v2df __builtin_ia32_comugesd (v2df, v2df)
+v4sf __builtin_ia32_comugess (v4sf, v4sf)
+v2df __builtin_ia32_comugtpd (v2df, v2df)
+v4sf __builtin_ia32_comugtps (v4sf, v4sf)
+v2df __builtin_ia32_comugtsd (v2df, v2df)
+v4sf __builtin_ia32_comugtss (v4sf, v4sf)
+v2df __builtin_ia32_comulepd (v2df, v2df)
+v4sf __builtin_ia32_comuleps (v4sf, v4sf)
+v2df __builtin_ia32_comulesd (v2df, v2df)
+v4sf __builtin_ia32_comuless (v4sf, v4sf)
+v2df __builtin_ia32_comultpd (v2df, v2df)
+v4sf __builtin_ia32_comultps (v4sf, v4sf)
+v2df __builtin_ia32_comultsd (v2df, v2df)
+v4sf __builtin_ia32_comultss (v4sf, v4sf)
+v2df __builtin_ia32_comunepd (v2df, v2df)
+v4sf __builtin_ia32_comuneps (v4sf, v4sf)
+v2df __builtin_ia32_comunesd (v2df, v2df)
+v4sf __builtin_ia32_comuness (v4sf, v4sf)
+v2df __builtin_ia32_comunordpd (v2df, v2df)
+v4sf __builtin_ia32_comunordps (v4sf, v4sf)
+v2df __builtin_ia32_comunordsd (v2df, v2df)
+v4sf __builtin_ia32_comunordss (v4sf, v4sf)
+v2df __builtin_ia32_fmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_frczpd (v2df)
+v4sf __builtin_ia32_frczps (v4sf)
+v2df __builtin_ia32_frczsd (v2df, v2df)
+v4sf __builtin_ia32_frczss (v4sf, v4sf)
+v2di __builtin_ia32_pcmov (v2di, v2di, v2di)
+v2di __builtin_ia32_pcmov_v2di (v2di, v2di, v2di)
+v4si __builtin_ia32_pcmov_v4si (v4si, v4si, v4si)
+v8hi __builtin_ia32_pcmov_v8hi (v8hi, v8hi, v8hi)
+v16qi __builtin_ia32_pcmov_v16qi (v16qi, v16qi, v16qi)
+v2df __builtin_ia32_pcmov_v2df (v2df, v2df, v2df)
+v4sf __builtin_ia32_pcmov_v4sf (v4sf, v4sf, v4sf)
+v16qi __builtin_ia32_pcomeqb (v16qi, v16qi)
+v8hi __builtin_ia32_pcomeqw (v8hi, v8hi)
+v4si __builtin_ia32_pcomeqd (v4si, v4si)
+v2di __builtin_ia32_pcomeqq (v2di, v2di)
+v16qi __builtin_ia32_pcomequb (v16qi, v16qi)
+v4si __builtin_ia32_pcomequd (v4si, v4si)
+v2di __builtin_ia32_pcomequq (v2di, v2di)
+v8hi __builtin_ia32_pcomequw (v8hi, v8hi)
+v16qi __builtin_ia32_pcomfalseb (v16qi, v16qi)
+v4si __builtin_ia32_pcomfalsed (v4si, v4si)
+v2di __builtin_ia32_pcomfalseq (v2di, v2di)
+v16qi __builtin_ia32_pcomfalseub (v16qi, v16qi)
+v4si __builtin_ia32_pcomfalseud (v4si, v4si)
+v2di __builtin_ia32_pcomfalseuq (v2di, v2di)
+v8hi __builtin_ia32_pcomfalseuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomfalsew (v8hi, v8hi)
+v16qi __builtin_ia32_pcomgeb (v16qi, v16qi)
+v4si __builtin_ia32_pcomged (v4si, v4si)
+v2di __builtin_ia32_pcomgeq (v2di, v2di)
+v16qi __builtin_ia32_pcomgeub (v16qi, v16qi)
+v4si __builtin_ia32_pcomgeud (v4si, v4si)
+v2di __builtin_ia32_pcomgeuq (v2di, v2di)
+v8hi __builtin_ia32_pcomgeuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomgew (v8hi, v8hi)
+v16qi __builtin_ia32_pcomgtb (v16qi, v16qi)
+v4si __builtin_ia32_pcomgtd (v4si, v4si)
+v2di __builtin_ia32_pcomgtq (v2di, v2di)
+v16qi __builtin_ia32_pcomgtub (v16qi, v16qi)
+v4si __builtin_ia32_pcomgtud (v4si, v4si)
+v2di __builtin_ia32_pcomgtuq (v2di, v2di)
+v8hi __builtin_ia32_pcomgtuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomgtw (v8hi, v8hi)
+v16qi __builtin_ia32_pcomleb (v16qi, v16qi)
+v4si __builtin_ia32_pcomled (v4si, v4si)
+v2di __builtin_ia32_pcomleq (v2di, v2di)
+v16qi __builtin_ia32_pcomleub (v16qi, v16qi)
+v4si __builtin_ia32_pcomleud (v4si, v4si)
+v2di __builtin_ia32_pcomleuq (v2di, v2di)
+v8hi __builtin_ia32_pcomleuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomlew (v8hi, v8hi)
+v16qi __builtin_ia32_pcomltb (v16qi, v16qi)
+v4si __builtin_ia32_pcomltd (v4si, v4si)
+v2di __builtin_ia32_pcomltq (v2di, v2di)
+v16qi __builtin_ia32_pcomltub (v16qi, v16qi)
+v4si __builtin_ia32_pcomltud (v4si, v4si)
+v2di __builtin_ia32_pcomltuq (v2di, v2di)
+v8hi __builtin_ia32_pcomltuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomltw (v8hi, v8hi)
+v16qi __builtin_ia32_pcomneb (v16qi, v16qi)
+v4si __builtin_ia32_pcomned (v4si, v4si)
+v2di __builtin_ia32_pcomneq (v2di, v2di)
+v16qi __builtin_ia32_pcomneub (v16qi, v16qi)
+v4si __builtin_ia32_pcomneud (v4si, v4si)
+v2di __builtin_ia32_pcomneuq (v2di, v2di)
+v8hi __builtin_ia32_pcomneuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomnew (v8hi, v8hi)
+v16qi __builtin_ia32_pcomtrueb (v16qi, v16qi)
+v4si __builtin_ia32_pcomtrued (v4si, v4si)
+v2di __builtin_ia32_pcomtrueq (v2di, v2di)
+v16qi __builtin_ia32_pcomtrueub (v16qi, v16qi)
+v4si __builtin_ia32_pcomtrueud (v4si, v4si)
+v2di __builtin_ia32_pcomtrueuq (v2di, v2di)
+v8hi __builtin_ia32_pcomtrueuw (v8hi, v8hi)
+v8hi __builtin_ia32_pcomtruew (v8hi, v8hi)
+v2df __builtin_ia32_permpd (v2df, v2df, v16qi)
+v4sf __builtin_ia32_permps (v4sf, v4sf, v16qi)
+v4si __builtin_ia32_phaddbd (v16qi)
+v2di __builtin_ia32_phaddbq (v16qi)
+v8hi __builtin_ia32_phaddbw (v16qi)
+v2di __builtin_ia32_phadddq (v4si)
+v4si __builtin_ia32_phaddubd (v16qi)
+v2di __builtin_ia32_phaddubq (v16qi)
+v8hi __builtin_ia32_phaddubw (v16qi)
+v2di __builtin_ia32_phaddudq (v4si)
+v4si __builtin_ia32_phadduwd (v8hi)
+v2di __builtin_ia32_phadduwq (v8hi)
+v4si __builtin_ia32_phaddwd (v8hi)
+v2di __builtin_ia32_phaddwq (v8hi)
+v8hi __builtin_ia32_phsubbw (v16qi)
+v2di __builtin_ia32_phsubdq (v4si)
+v4si __builtin_ia32_phsubwd (v8hi)
+v4si __builtin_ia32_pmacsdd (v4si, v4si, v4si)
+v2di __builtin_ia32_pmacsdqh (v4si, v4si, v2di)
+v2di __builtin_ia32_pmacsdql (v4si, v4si, v2di)
+v4si __builtin_ia32_pmacssdd (v4si, v4si, v4si)
+v2di __builtin_ia32_pmacssdqh (v4si, v4si, v2di)
+v2di __builtin_ia32_pmacssdql (v4si, v4si, v2di)
+v4si __builtin_ia32_pmacsswd (v8hi, v8hi, v4si)
+v8hi __builtin_ia32_pmacssww (v8hi, v8hi, v8hi)
+v4si __builtin_ia32_pmacswd (v8hi, v8hi, v4si)
+v8hi __builtin_ia32_pmacsww (v8hi, v8hi, v8hi)
+v4si __builtin_ia32_pmadcsswd (v8hi, v8hi, v4si)
+v4si __builtin_ia32_pmadcswd (v8hi, v8hi, v4si)
+v16qi __builtin_ia32_pperm (v16qi, v16qi, v16qi)
+v16qi __builtin_ia32_protb (v16qi, v16qi)
+v4si __builtin_ia32_protd (v4si, v4si)
+v2di __builtin_ia32_protq (v2di, v2di)
+v8hi __builtin_ia32_protw (v8hi, v8hi)
+v16qi __builtin_ia32_pshab (v16qi, v16qi)
+v4si __builtin_ia32_pshad (v4si, v4si)
+v2di __builtin_ia32_pshaq (v2di, v2di)
+v8hi __builtin_ia32_pshaw (v8hi, v8hi)
+v16qi __builtin_ia32_pshlb (v16qi, v16qi)
+v4si __builtin_ia32_pshld (v4si, v4si)
+v2di __builtin_ia32_pshlq (v2di, v2di)
+v8hi __builtin_ia32_pshlw (v8hi, v8hi)
+@end smallexample
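+
+For example, assuming @option{-msse5}, a fused multiply-add over packed
+floats can be written as follows.  This is a minimal sketch; the
+@code{v4sf} typedef uses the GCC vector extension and is not itself
+part of the builtin interface.
+
+@smallexample
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+v4sf
+madd (v4sf a, v4sf b, v4sf c)
+@{
+  /* Compute (a * b) + c in one fmaddps instruction.  */
+  return __builtin_ia32_fmaddps (a, b, c);
+@}
+@end smallexample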
+
+The following built-in functions are available when @option{-msse5}
+is used.  The second argument must be an integer constant; each function
+generates the machine instruction that is part of the name with the
+@samp{_imm} suffix removed.
+
+@smallexample
+v16qi __builtin_ia32_protb_imm (v16qi, int)
+v4si __builtin_ia32_protd_imm (v4si, int)
+v2di __builtin_ia32_protq_imm (v2di, int)
+v8hi __builtin_ia32_protw_imm (v8hi, int)
+@end smallexample
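+
+For example, @code{__builtin_ia32_protd_imm (x, 8)} rotates each
+doubleword element of @code{x} left by 8 bits with a single protd
+instruction.  A minimal sketch (again, the @code{v4si} typedef uses the
+GCC vector extension):
+
+@smallexample
+typedef int v4si __attribute__ ((vector_size (16)));
+
+v4si
+rotl8 (v4si x)
+@{
+  /* The rotate count must be a compile-time constant.  */
+  return __builtin_ia32_protd_imm (x, 8);
+@}
+@end smallexample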
 
 The following built-in functions are available when @option{-m3dnow} is used.
 All of them generate the machine instruction that is part of the name.