This repository has been archived by the owner on May 31, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 31
/
immintrin.h
2001 lines (1750 loc) · 88.8 KB
/
immintrin.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/***
* immintrin.h - Meta Header file for Intel(R) Architecture intrinsic functions.
*
* Copyright (C) 1985-2015 Intel Corporation. All rights reserved.
*
* The information and source code contained herein is the exclusive
* property of Intel Corporation and may not be disclosed, examined
* or reproduced in whole or in part without explicit written authorization
* from the company.
*
*
*******************************************************************************/
#pragma once
#if !defined(_M_IX86) && !defined(_M_X64)
#error This header is specific to X86 and X64 targets
#endif
#ifndef _INCLUDED_IMM
#define _INCLUDED_IMM
#ifndef __midl
#if defined (_M_CEE_PURE)
#error ERROR: Intel Architecture intrinsic functions not supported in the pure mode!
#else /* defined (_M_CEE_PURE) */
#include <wmmintrin.h>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/*
* Intel(R) AVX compiler intrinsic functions.
*/
/*
 * 256-bit vector type holding eight single-precision floats.
 * Declared as a union with 32-byte alignment. The intrin_type attribute is
 * MSVC-specific; presumably it marks the type as one the compiler treats as
 * a built-in vector type (confirm against the compiler documentation).
 */
typedef union __declspec(intrin_type) __declspec(align(32)) __m256 {
float m256_f32[8]; /* the eight float elements */
} __m256;
/*
 * 256-bit vector type holding four double-precision floats.
 * Declared with 32-byte alignment. Note that, unlike __m256 and __m256i,
 * this type is a struct rather than a union; with a single member the
 * layout is the same either way.
 */
typedef struct __declspec(intrin_type) __declspec(align(32)) __m256d {
double m256d_f64[4]; /* the four double elements */
} __m256d;
/*
 * 256-bit integer vector type, 32-byte aligned.
 * Every member is an alternative view of the same 32 bytes: for each array,
 * element width times element count equals 256 bits. Signed and unsigned
 * views are provided for 8-, 16-, 32- and 64-bit elements.
 */
typedef union __declspec(intrin_type) __declspec(align(32)) __m256i {
__int8 m256i_i8[32];            /* 32 signed 8-bit elements */
__int16 m256i_i16[16];          /* 16 signed 16-bit elements */
__int32 m256i_i32[8];           /* 8 signed 32-bit elements */
__int64 m256i_i64[4];           /* 4 signed 64-bit elements */
unsigned __int8 m256i_u8[32];   /* 32 unsigned 8-bit elements */
unsigned __int16 m256i_u16[16]; /* 16 unsigned 16-bit elements */
unsigned __int32 m256i_u32[8];  /* 8 unsigned 32-bit elements */
unsigned __int64 m256i_u64[4];  /* 4 unsigned 64-bit elements */
} __m256i;
/*
* Compare predicates for scalar and packed compare intrinsic functions
*/
/*
 * Predicate encodings 0x00-0x1F for the compare intrinsics' immediate
 * operand.  Each entry in 0x10-0x1F names the same relation as the entry
 * 0x10 below it, with the opposite signaling behaviour.
 */
/* 0x00 - 0x07 */
#define _CMP_EQ_OQ     0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS     0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS     0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q   0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ    0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US    0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US    0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q     0x07 /* Ordered (non-signaling) */
/* 0x08 - 0x0F */
#define _CMP_EQ_UQ     0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US    0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US    0x0A /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ  0x0B /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ    0x0C /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS     0x0D /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS     0x0E /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ   0x0F /* True (unordered, non-signaling) */
/* 0x10 - 0x17 */
#define _CMP_EQ_OS     0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ     0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ     0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S   0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US    0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ    0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ    0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S     0x17 /* Ordered (signaling) */
/* 0x18 - 0x1F */
#define _CMP_EQ_US     0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ    0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ    0x1A /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS  0x1B /* False (ordered, signaling) */
#define _CMP_NEQ_OS    0x1C /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ     0x1D /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ     0x1E /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US   0x1F /* True (unordered, signaling) */
/*
* Add Packed Double Precision Floating-Point Values
* **** VADDPD ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the four packed double-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
extern __m256d __cdecl _mm256_add_pd(__m256d, __m256d);
/*
* Add Packed Single Precision Floating-Point Values
* **** VADDPS ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the eight packed single-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed single-precision floating-point results in the
* destination
*/
extern __m256 __cdecl _mm256_add_ps(__m256, __m256);
/*
* Add/Subtract Double Precision Floating-Point Values
* **** VADDSUBPD ymm1, ymm2, ymm3/m256
* Adds odd-numbered double-precision floating-point values of the first
* source operand with the corresponding double-precision floating-point
* values from the second source operand; stores the result in the odd-numbered
* values of the destination. Subtracts the even-numbered double-precision
* floating-point values from the second source operand from the corresponding
* double-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
extern __m256d __cdecl _mm256_addsub_pd(__m256d, __m256d);
/*
* Add/Subtract Packed Single Precision Floating-Point Values
* **** VADDSUBPS ymm1, ymm2, ymm3/m256
* Adds odd-numbered single-precision floating-point values of the first source
* operand with the corresponding single-precision floating-point values from
* the second source operand; stores the result in the odd-numbered values of
* the destination. Subtracts the even-numbered single-precision floating-point
* values from the second source operand from the corresponding
* single-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
extern __m256 __cdecl _mm256_addsub_ps(__m256, __m256);
/*
* Bitwise Logical AND of Packed Double Precision Floating-Point Values
* **** VANDPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256d __cdecl _mm256_and_pd(__m256d, __m256d);
/*
* Bitwise Logical AND of Packed Single Precision Floating-Point Values
* **** VANDPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256 __cdecl _mm256_and_ps(__m256, __m256);
/*
* Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values
* **** VANDNPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the four packed double-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
extern __m256d __cdecl _mm256_andnot_pd(__m256d, __m256d);
/*
* Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values
* **** VANDNPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the eight packed single-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
extern __m256 __cdecl _mm256_andnot_ps(__m256, __m256);
/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8
* Double-Precision Floating-Point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [3:0] determine whether the
* corresponding Double-Precision Floating Point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* corresponding to a word, is "1", then the Double-Precision Floating-Point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
extern __m256d __cdecl _mm256_blend_pd(__m256d, __m256d, const int);
/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8
* Single precision floating point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [7:0] determine whether the
* corresponding single precision floating-point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* corresponding to a word, is "1", then the single-precision floating-point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
extern __m256 __cdecl _mm256_blend_ps(__m256, __m256, const int);
/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each quadword data element of double-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256d __cdecl _mm256_blendv_pd(__m256d, __m256d, __m256d);
/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each dword data element of single-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256 __cdecl _mm256_blendv_ps(__m256, __m256, __m256);
/*
* Divide Packed Double-Precision Floating-Point Values
* **** VDIVPD ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the four packed double-precision floating-point
* values in the first source operand by the four packed double-precision
* floating-point values in the second source operand
*/
extern __m256d __cdecl _mm256_div_pd(__m256d, __m256d);
/*
* Divide Packed Single-Precision Floating-Point Values
* **** VDIVPS ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the eight packed single-precision
* floating-point values in the first source operand by the eight packed
* single-precision floating-point values in the second source operand
*/
extern __m256 __cdecl _mm256_div_ps(__m256, __m256);
/*
* Dot Product of Packed Single-Precision Floating-Point Values
* **** VDPPS ymm1, ymm2, ymm3/m256, imm8
* Multiplies the packed single precision floating point values in the
* first source operand with the packed single-precision floats in the
* second source. Each of the four resulting single-precision values is
* conditionally summed depending on a mask extracted from the high 4 bits
* of the immediate operand. This sum is broadcast to each of 4 positions
* in the destination if the corresponding bit of the mask selected from
* the low 4 bits of the immediate operand is "1". If the corresponding
* low bit 0-3 of the mask is zero, the destination is set to zero.
* The process is replicated for the high elements of the destination.
*/
extern __m256 __cdecl _mm256_dp_ps(__m256, __m256, const int);
/*
* Add Horizontal Double Precision Floating-Point Values
* **** VHADDPD ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent double-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
extern __m256d __cdecl _mm256_hadd_pd(__m256d, __m256d);
/*
* Add Horizontal Single Precision Floating-Point Values
* **** VHADDPS ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent single-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
extern __m256 __cdecl _mm256_hadd_ps(__m256, __m256);
/*
* Subtract Horizontal Double Precision Floating-Point Values
* **** VHSUBPD ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent double-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination
*/
extern __m256d __cdecl _mm256_hsub_pd(__m256d, __m256d);
/*
* Subtract Horizontal Single Precision Floating-Point Values
* **** VHSUBPS ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent single-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination.
*/
extern __m256 __cdecl _mm256_hsub_ps(__m256, __m256);
/*
* Maximum of Packed Double Precision Floating-Point Values
* **** VMAXPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
extern __m256d __cdecl _mm256_max_pd(__m256d, __m256d);
/*
* Maximum of Packed Single Precision Floating-Point Values
* **** VMAXPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
extern __m256 __cdecl _mm256_max_ps(__m256, __m256);
/*
* Minimum of Packed Double Precision Floating-Point Values
* **** VMINPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
extern __m256d __cdecl _mm256_min_pd(__m256d, __m256d);
/*
* Minimum of Packed Single Precision Floating-Point Values
* **** VMINPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
extern __m256 __cdecl _mm256_min_ps(__m256, __m256);
/*
* Multiply Packed Double Precision Floating-Point Values
* **** VMULPD ymm1, ymm2, ymm3/m256
* Performs a SIMD multiply of the four packed double-precision floating-point
* values from the first Source operand to the Second Source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
extern __m256d __cdecl _mm256_mul_pd(__m256d, __m256d);
/*
* Multiply Packed Single Precision Floating-Point Values
* **** VMULPS ymm1, ymm2, ymm3/m256
* Performs an SIMD multiply of the eight packed single-precision
* floating-point values from the first source operand to the second source
* operand, and stores the packed single-precision floating-point results in
* the destination
*/
extern __m256 __cdecl _mm256_mul_ps(__m256, __m256);
/*
* Bitwise Logical OR of Packed Double Precision Floating-Point Values
* **** VORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256d __cdecl _mm256_or_pd(__m256d, __m256d);
/*
* Bitwise Logical OR of Packed Single Precision Floating-Point Values
* **** VORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256 __cdecl _mm256_or_ps(__m256, __m256);
/*
* Shuffle Packed Double Precision Floating-Point Values
* **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8
* Moves either of the two packed double-precision floating-point values from
* each double quadword in the first source operand into the low quadword
* of each double quadword of the destination; moves either of the two packed
* double-precision floating-point values from the second source operand into
* the high quadword of each double quadword of the destination operand.
* The selector operand determines which values are moved to the destination
*/
extern __m256d __cdecl _mm256_shuffle_pd(__m256d, __m256d, const int);
/*
* Shuffle Packed Single Precision Floating-Point Values
* **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8
* Moves two of the four packed single-precision floating-point values
* from each double qword of the first source operand into the low
* quadword of each double qword of the destination; moves two of the four
* packed single-precision floating-point values from each double qword of
* the second source operand into the high quadword of each double qword
* of the destination. The selector operand determines which values are moved
* to the destination.
*/
extern __m256 __cdecl _mm256_shuffle_ps(__m256, __m256, const int);
/*
* Subtract Packed Double Precision Floating-Point Values
* **** VSUBPD ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the four packed double-precision floating-point
* values of the second Source operand from the first Source operand, and
* stores the packed double-precision floating-point results in the destination
*/
extern __m256d __cdecl _mm256_sub_pd(__m256d, __m256d);
/*
* Subtract Packed Single Precision Floating-Point Values
* **** VSUBPS ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the eight packed single-precision
* floating-point values in the second Source operand from the First Source
* operand, and stores the packed single-precision floating-point results in
* the destination
*/
extern __m256 __cdecl _mm256_sub_ps(__m256, __m256);
/*
* Bitwise Logical XOR of Packed Double Precision Floating-Point Values
* **** VXORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256d __cdecl _mm256_xor_pd(__m256d, __m256d);
/*
* Bitwise Logical XOR of Packed Single Precision Floating-Point Values
* **** VXORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
extern __m256 __cdecl _mm256_xor_ps(__m256, __m256);
/*
* Compare Packed Double-Precision Floating-Point Values
* **** VCMPPD xmm1, xmm2, xmm3/m128, imm8
* **** VCMPPD ymm1, ymm2, ymm3/m256, imm8
* Performs an SIMD compare of the four packed double-precision floating-point
* values in the second source operand (third operand) and the first source
* operand (second operand) and returns the results of the comparison to the
* destination operand (first operand). The comparison predicate operand
* (immediate) specifies the type of comparison performed on each of the pairs
* of packed values.
* For 128-bit intrinsic function with compare predicate values in range 0-7
* compiler may generate SSE2 instructions if it is warranted for performance
* reasons.
*/
extern __m128d __cdecl _mm_cmp_pd(__m128d, __m128d, const int);
extern __m256d __cdecl _mm256_cmp_pd(__m256d, __m256d, const int);
/*
* Compare Packed Single-Precision Floating-Point Values
* **** VCMPPS xmm1, xmm2, xmm3/m128, imm8
* **** VCMPPS ymm1, ymm2, ymm3/m256, imm8
* Performs a SIMD compare of the packed single-precision floating-point values
* in the second source operand (third operand) and the first source operand
* (second operand) and returns the results of the comparison to the
* destination operand (first operand). The comparison predicate operand
* (immediate) specifies the type of comparison performed on each of the pairs
* of packed values.
* For 128-bit intrinsic function with compare predicate values in range 0-7
* compiler may generate SSE2 instructions if it is warranted for performance
* reasons.
*/
extern __m128 __cdecl _mm_cmp_ps(__m128, __m128, const int);
extern __m256 __cdecl _mm256_cmp_ps(__m256, __m256, const int);
/*
* Compare Scalar Double-Precision Floating-Point Values
* **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
* Compares the low double-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies the
* type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128d __cdecl _mm_cmp_sd(__m128d, __m128d, const int);
/*
* Compare Scalar Single-Precision Floating-Point Values
* **** VCMPSS xmm1, xmm2, xmm3/m32, imm8
* Compares the low single-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies
* the type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128 __cdecl _mm_cmp_ss(__m128, __m128, const int);
/*
* Convert Packed Doubleword Integers to
* Packed Double-Precision Floating-Point Values
* **** VCVTDQ2PD ymm1, xmm2/m128
* Converts four packed signed doubleword integers in the source operand to
* four packed double-precision floating-point values in the destination
*/
extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i);
/*
* Convert Packed Doubleword Integers to
* Packed Single-Precision Floating-Point Values
* **** VCVTDQ2PS ymm1, ymm2/m256
* Converts eight packed signed doubleword integers in the source operand to
* eight packed single-precision floating-point values in the destination
*/
extern __m256 __cdecl _mm256_cvtepi32_ps(__m256i);
/*
* Convert Packed Double-Precision Floating-point values to
* Packed Single-Precision Floating-Point Values
* **** VCVTPD2PS xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed single-precision floating-point values in the
* destination
*/
extern __m128 __cdecl _mm256_cvtpd_ps(__m256d);
/*
* Convert Packed Single Precision Floating-Point Values to
* Packed Signed Doubleword Integer Values
* **** VCVTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination
*/
extern __m256i __cdecl _mm256_cvtps_epi32(__m256);
/*
* Convert Packed Single Precision Floating-point values to
* Packed Double Precision Floating-Point Values
* **** VCVTPS2PD ymm1, xmm2/m128
* Converts four packed single-precision floating-point values in the source
* operand to four packed double-precision floating-point values in the
* destination
*/
extern __m256d __cdecl _mm256_cvtps_pd(__m128);
/*
* Convert with Truncation Packed Double-Precision Floating-Point values to
* Packed Doubleword Integers
* **** VCVTTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d);
/*
* Convert Packed Double-Precision Floating-point values to
* Packed Doubleword Integers
* **** VCVTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination
*/
extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d);
/*
* Convert with Truncation Packed Single Precision Floating-Point Values to
* Packed Signed Doubleword Integer Values
* **** VCVTTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
extern __m256i __cdecl _mm256_cvttps_epi32(__m256);
/*
* Extract packed floating-point values
* **** VEXTRACTF128 xmm1/m128, ymm2, imm8
* Extracts 128-bits of packed floating-point values from the source operand
* at a 128-bit offset from imm8[0] into the destination
*/
extern __m128 __cdecl _mm256_extractf128_ps(__m256, const int);
extern __m128d __cdecl _mm256_extractf128_pd(__m256d, const int);
extern __m128i __cdecl _mm256_extractf128_si256(__m256i, const int);
/*
* Zero All YMM registers
* **** VZEROALL
* Zeros contents of all YMM registers
*/
extern void __cdecl _mm256_zeroall(void);
/*
* Zero Upper bits of YMM registers
* **** VZEROUPPER
* Zeros the upper 128 bits of all YMM registers. The lower 128-bits of the
* registers (the corresponding XMM registers) are unmodified
*/
extern void __cdecl _mm256_zeroupper(void);
/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2, ymm3/m256
* **** VPERMILPS xmm1, xmm2, xmm3/m128
* Permute Single-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of corresponding elements of the
* shuffle control and store results in the destination
*/
extern __m256 __cdecl _mm256_permutevar_ps(__m256, __m256i);
extern __m128 __cdecl _mm_permutevar_ps(__m128, __m128i);
/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2/m256, imm8
* **** VPERMILPS xmm1, xmm2/m128, imm8
* Permute Single-Precision Floating-Point values in the first source operand
* using four 2-bit control fields in the 8-bit immediate and store results
* in the destination
*/
extern __m256 __cdecl _mm256_permute_ps(__m256, int);
extern __m128 __cdecl _mm_permute_ps(__m128, int);
/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2, ymm3/m256
* **** VPERMILPD xmm1, xmm2, xmm3/m128
* Permute Double-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of the second source operand
* and store results in the destination
*/
extern __m256d __cdecl _mm256_permutevar_pd(__m256d, __m256i);
extern __m128d __cdecl _mm_permutevar_pd(__m128d, __m128i);
/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2/m256, imm8
* **** VPERMILPD xmm1, xmm2/m128, imm8
* Permute Double-Precision Floating-Point values in the first source operand
* using two, 1-bit control fields in the low 2 bits of the 8-bit immediate
* and store results in the destination
*/
extern __m256d __cdecl _mm256_permute_pd(__m256d, int);
extern __m128d __cdecl _mm_permute_pd(__m128d, int);
/*
* Permute Floating-Point Values
* **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
* Permute 128 bit floating-point-containing fields from the first source
* operand and second source operand using bits in the 8-bit immediate and
* store results in the destination
*/
extern __m256 __cdecl _mm256_permute2f128_ps(__m256, __m256, int);
extern __m256d __cdecl _mm256_permute2f128_pd(__m256d, __m256d, int);
extern __m256i __cdecl _mm256_permute2f128_si256(__m256i, __m256i, int);
/*
* Load with Broadcast
* **** VBROADCASTSS ymm1, m32
* **** VBROADCASTSS xmm1, m32
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
extern __m256 __cdecl _mm256_broadcast_ss(float const *);
extern __m128 __cdecl _mm_broadcast_ss(float const *);
/*
* Load with Broadcast
* **** VBROADCASTSD ymm1, m64
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
extern __m256d __cdecl _mm256_broadcast_sd(double const *);
/*
* Load with Broadcast
* **** VBROADCASTF128 ymm1, m128
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
extern __m256 __cdecl _mm256_broadcast_ps(__m128 const *);
extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *);
/*
* Insert packed floating-point values
* **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8
* Performs an insertion of 128-bits of packed floating-point values from the
* second source operand into the destination at a 128-bit offset from
* imm8[0]. The remaining portions of the destination are written by the
* corresponding fields of the first source operand
*/
extern __m256 __cdecl _mm256_insertf128_ps(__m256, __m128, int);
extern __m256d __cdecl _mm256_insertf128_pd(__m256d, __m128d, int);
extern __m256i __cdecl _mm256_insertf128_si256(__m256i, __m128i, int);
/*
* Move Aligned Packed Double-Precision Floating-Point Values
* **** VMOVAPD ymm1, m256
* **** VMOVAPD m256, ymm1
* Moves 4 double-precision floating-point values from the source operand to
* the destination
*/
extern __m256d __cdecl _mm256_load_pd(double const *);
extern void __cdecl _mm256_store_pd(double *, __m256d);
/*
* Move Aligned Packed Single-Precision Floating-Point Values
* **** VMOVAPS ymm1, m256
* **** VMOVAPS m256, ymm1
* Moves 8 single-precision floating-point values from the source operand to
* the destination
*/
extern __m256 __cdecl _mm256_load_ps(float const *);
extern void __cdecl _mm256_store_ps(float *, __m256);
/*
* Move Unaligned Packed Double-Precision Floating-Point Values
* **** VMOVUPD ymm1, m256
* **** VMOVUPD m256, ymm1
* Moves 256 bits of packed double-precision floating-point values from the
* source operand to the destination
*/
extern __m256d __cdecl _mm256_loadu_pd(double const *);
extern void __cdecl _mm256_storeu_pd(double *, __m256d);
/*
* Move Unaligned Packed Single-Precision Floating-Point Values
* **** VMOVUPS ymm1, m256
* **** VMOVUPS m256, ymm1
* Moves 256 bits of packed single-precision floating-point values from the
* source operand to the destination
*/
extern __m256 __cdecl _mm256_loadu_ps(float const *);
extern void __cdecl _mm256_storeu_ps(float *, __m256);
/*
* Move Aligned Packed Integer Values
* **** VMOVDQA ymm1, m256
* **** VMOVDQA m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
extern __m256i __cdecl _mm256_load_si256(__m256i const *);
extern void __cdecl _mm256_store_si256(__m256i *, __m256i);
/*
* Move Unaligned Packed Integer Values
* **** VMOVDQU ymm1, m256
* **** VMOVDQU m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
extern __m256i __cdecl _mm256_loadu_si256(__m256i const *);
extern void __cdecl _mm256_storeu_si256(__m256i *, __m256i);
/*
* Load Two Unaligned Packed 128-bit Values
* Loads two potentially unaligned 128-bit values
* and combines them into one 256-bit value.
*
* The data types here (float const*, double const* and __m128i const*)
* were chosen for consistency with the underlying _mm_loadu_{ps,pd,si128}
* intrinsics.
*/
/*
 * Single-precision variant: performs one _mm_loadu_ps on each address and
 * passes the results to _mm256_set_m128, the hiaddr load as first argument
 * and the loaddr load as second.
 */
#define _mm256_loadu2_m128(hiaddr, loaddr) \
    _mm256_set_m128(_mm_loadu_ps(/* float const* */ (hiaddr)), \
                    _mm_loadu_ps(/* float const* */ (loaddr)))
/*
 * Double-precision variant: performs one _mm_loadu_pd on each address and
 * passes the results to _mm256_set_m128d, the hiaddr load as first argument
 * and the loaddr load as second.
 */
#define _mm256_loadu2_m128d(hiaddr, loaddr) \
    _mm256_set_m128d(_mm_loadu_pd(/* double const* */ (hiaddr)), \
                     _mm_loadu_pd(/* double const* */ (loaddr)))
/*
 * Integer variant: performs one _mm_loadu_si128 on each address and passes
 * the results to _mm256_set_m128i, the hiaddr load as first argument and
 * the loaddr load as second.
 */
#define _mm256_loadu2_m128i(hiaddr, loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(/* __m128i const* */ (hiaddr)), \
                     _mm_loadu_si128(/* __m128i const* */ (loaddr)))
/*
* Store 256-bit Value To Two Unaligned 128-bit Locations
* Stores the high and low 128-bit halves of a 256-bit value
* to two different potentially unaligned addresses.
*/
/*
 * Single-precision variant.
 *
 * Writes _mm256_castps256_ps128(a) to loaddr and
 * _mm256_extractf128_ps(a, 1) to hiaddr, each with _mm_storeu_ps; the low
 * store is issued first.
 *
 * `a` is evaluated exactly once into a temporary. The temporary carries a
 * reserved, macro-specific name: with a short name such as `_a`, a caller
 * passing its own variable `_a` would expand to `__m256 _a = (_a);` and
 * store an uninitialized value, and a `_a` used inside hiaddr/loaddr would
 * be shadowed.
 */
#define _mm256_storeu2_m128(/* float* */ hiaddr, /* float* */ loaddr, \
                            /* __m256 */ a) \
    do { \
        __m256 __mm256_storeu2_m128_tmp = (a); /* evaluate a only once */ \
        _mm_storeu_ps((loaddr), \
                      _mm256_castps256_ps128(__mm256_storeu2_m128_tmp)); \
        _mm_storeu_ps((hiaddr), \
                      _mm256_extractf128_ps(__mm256_storeu2_m128_tmp, 0x1)); \
    } while (0)
/*
 * Double-precision variant.
 *
 * Writes _mm256_castpd256_pd128(a) to loaddr and
 * _mm256_extractf128_pd(a, 1) to hiaddr, each with _mm_storeu_pd; the low
 * store is issued first.
 *
 * `a` is evaluated exactly once into a temporary. The temporary carries a
 * reserved, macro-specific name: with a short name such as `_a`, a caller
 * passing its own variable `_a` would expand to `__m256d _a = (_a);` and
 * store an uninitialized value, and a `_a` used inside hiaddr/loaddr would
 * be shadowed.
 */
#define _mm256_storeu2_m128d(/* double* */ hiaddr, /* double* */ loaddr, \
                             /* __m256d */ a) \
    do { \
        __m256d __mm256_storeu2_m128d_tmp = (a); /* evaluate a only once */ \
        _mm_storeu_pd((loaddr), \
                      _mm256_castpd256_pd128(__mm256_storeu2_m128d_tmp)); \
        _mm_storeu_pd((hiaddr), \
                      _mm256_extractf128_pd(__mm256_storeu2_m128d_tmp, 0x1)); \
    } while (0)
/*
 * Integer variant.
 *
 * Writes _mm256_castsi256_si128(a) to loaddr and
 * _mm256_extractf128_si256(a, 1) to hiaddr, each with _mm_storeu_si128; the
 * low store is issued first.
 *
 * `a` is evaluated exactly once into a temporary. The temporary carries a
 * reserved, macro-specific name: with a short name such as `_a`, a caller
 * passing its own variable `_a` would expand to `__m256i _a = (_a);` and
 * store an uninitialized value, and a `_a` used inside hiaddr/loaddr would
 * be shadowed.
 */
#define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, \
                             /* __m256i */ a) \
    do { \
        __m256i __mm256_storeu2_m128i_tmp = (a); /* evaluate a only once */ \
        _mm_storeu_si128((loaddr), \
                         _mm256_castsi256_si128(__mm256_storeu2_m128i_tmp)); \
        _mm_storeu_si128((hiaddr), \
                         _mm256_extractf128_si256(__mm256_storeu2_m128i_tmp, 0x1)); \
    } while (0)
/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPD xmm1, xmm2, m128
 * **** VMASKMOVPD ymm1, ymm2, m256
 * **** VMASKMOVPD m128, xmm1, xmm2
 * **** VMASKMOVPD m256, ymm1, ymm2
 *
 * Load forms:
 * Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in first source operand (second operand).
 * Stores are atomic.
 *
 * Intrinsic argument order (differs from the instruction operand order):
 *   maskload:  (double pointer, integer-vector mask), returns the loaded
 *              vector
 *   maskstore: (double pointer, integer-vector mask, vector to store)
 * The _mm256_ variants use 256-bit types (__m256d/__m256i), the _mm_
 * variants 128-bit types (__m128d/__m128i).
 *
 * NOTE(review): presumably an element is transferred only when the most
 * significant bit of the matching mask element is set -- confirm against
 * the instruction reference.
 */
extern __m256d __cdecl _mm256_maskload_pd(double const *, __m256i);
extern void __cdecl _mm256_maskstore_pd(double *, __m256i, __m256d);
extern __m128d __cdecl _mm_maskload_pd(double const *, __m128i);
extern void __cdecl _mm_maskstore_pd(double *, __m128i, __m128d);
/*
 * Conditional SIMD Packed Loads and Stores
 * **** VMASKMOVPS xmm1, xmm2, m128
 * **** VMASKMOVPS ymm1, ymm2, m256
 * **** VMASKMOVPS m128, xmm1, xmm2
 * **** VMASKMOVPS m256, ymm1, ymm2
 *
 * Load forms:
 * Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location (third operand) into the destination XMM or YMM register
 * (first operand) using a mask in the first source operand (second operand).
 *
 * Store forms:
 * Stores packed values from the XMM or YMM register in the second source
 * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
 * memory location using a mask in first source operand (second operand).
 * Stores are atomic.
 *
 * Intrinsic argument order (differs from the instruction operand order):
 *   maskload:  (float pointer, integer-vector mask), returns the loaded
 *              vector
 *   maskstore: (float pointer, integer-vector mask, vector to store)
 * The _mm256_ variants use 256-bit types (__m256/__m256i), the _mm_
 * variants 128-bit types (__m128/__m128i).
 *
 * NOTE(review): presumably an element is transferred only when the most
 * significant bit of the matching mask element is set -- confirm against
 * the instruction reference.
 */
extern __m256 __cdecl _mm256_maskload_ps(float const *, __m256i);
extern void __cdecl _mm256_maskstore_ps(float *, __m256i, __m256);
extern __m128 __cdecl _mm_maskload_ps(float const *, __m128i);
extern void __cdecl _mm_maskstore_ps(float *, __m128i, __m128);
/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSHDUP ymm1, ymm2/m256
 * Duplicates odd-indexed single-precision floating-point values from the
 * source operand.
 *
 * The source is the __m256 argument; the result is returned as a __m256.
 */
extern __m256 __cdecl _mm256_movehdup_ps(__m256);
/*
 * Replicate Single-Precision Floating-Point Values
 * **** VMOVSLDUP ymm1, ymm2/m256
 * Duplicates even-indexed single-precision floating-point values from the
 * source operand.
 *
 * The source is the __m256 argument; the result is returned as a __m256.
 */
extern __m256 __cdecl _mm256_moveldup_ps(__m256);
/*
 * Replicate Double-Precision Floating-Point Values
 * **** VMOVDDUP ymm1, ymm2/m256
 * Duplicates even-indexed double-precision floating-point values from the
 * source operand.
 *
 * The source is the __m256d argument; the result is returned as a __m256d.
 */
extern __m256d __cdecl _mm256_movedup_pd(__m256d);
/*
 * Move Unaligned Integer
 * **** VLDDQU ymm1, m256
 * The instruction is functionally similar to VMOVDQU YMM, m256 for loading
 * from memory. That is: 32 bytes of data starting at the address specified
 * by the source memory operand are fetched from memory and placed in the
 * destination.
 *
 * The __m256i pointer argument is the source memory operand; the fetched
 * 32 bytes are returned as a __m256i.
 */
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *);
/*
 * Store Packed Integers Using Non-Temporal Hint
 * **** VMOVNTDQ m256, ymm1
 * Moves the packed integers in the source operand to the destination using a
 * non-temporal hint to prevent caching of the data during the write to
 * memory.
 *
 * The __m256i pointer is the destination (m256); the __m256i value is the
 * source (ymm1).
 * NOTE(review): the destination presumably has to be 32-byte aligned --
 * confirm against the instruction reference.
 */
extern void __cdecl _mm256_stream_si256(__m256i *, __m256i);
/*
 * Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPD m256, ymm1
 * Moves the packed double-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory.
 *
 * The double pointer is the destination (m256); the __m256d value is the
 * source (ymm1).
 * NOTE(review): the destination presumably has to be 32-byte aligned --
 * confirm against the instruction reference.
 */
extern void __cdecl _mm256_stream_pd(double *, __m256d);
/*
 * Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPS m256, ymm1
 * Moves the packed single-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory.
 *
 * The float pointer is the destination (m256); the __m256 value is the
 * source (ymm1).
 * NOTE(review): the destination presumably has to be 32-byte aligned --
 * confirm against the instruction reference.
 */
extern void __cdecl _mm256_stream_ps(float *, __m256);
/*
 * Compute Approximate Reciprocals of Packed Single-Precision Floating-Point
 * Values
 * **** VRCPPS ymm1, ymm2/m256
 * Performs a SIMD computation of the approximate reciprocals of the eight
 * packed single-precision floating-point values in the source operand and
 * stores the packed single-precision floating-point results in the
 * destination.
 *
 * The source is the __m256 argument; the results are returned as a __m256.
 * The results are approximations, not exactly rounded reciprocals.
 */
extern __m256 __cdecl _mm256_rcp_ps(__m256);
/*
 * Compute Approximate Reciprocals of Square Roots of
 * Packed Single-Precision Floating-Point Values
 * **** VRSQRTPS ymm1, ymm2/m256
 * Performs a SIMD computation of the approximate reciprocals of the square
 * roots of the eight packed single-precision floating-point values in the
 * source operand and stores the packed single-precision floating-point
 * results in the destination.
 *
 * The source is the __m256 argument; the results are returned as a __m256.
 * The results are approximations, not exactly rounded values.
 */
extern __m256 __cdecl _mm256_rsqrt_ps(__m256);
/*
 * Square Root of Double-Precision Floating-Point Values
 * **** VSQRTPD ymm1, ymm2/m256
 * Performs a SIMD computation of the square roots of the four packed
 * double-precision floating-point values in the source operand and stores
 * the packed double-precision floating-point results in the destination.
 *
 * The source is the __m256d argument; the results are returned as a
 * __m256d.
 */
extern __m256d __cdecl _mm256_sqrt_pd(__m256d);
/*
 * Square Root of Single-Precision Floating-Point Values
 * **** VSQRTPS ymm1, ymm2/m256
 * Performs a SIMD computation of the square roots of the eight packed
 * single-precision floating-point values in the source operand and stores
 * the packed single-precision floating-point results in the destination.
 *
 * The source is the __m256 argument; the results are returned as a __m256.
 */
extern __m256 __cdecl _mm256_sqrt_ps(__m256);
/*
 * Round Packed Double-Precision Floating-Point Values
 * **** VROUNDPD ymm1,ymm2/m256,imm8
 * Rounds the four double-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a double-precision floating-point
 * value. The Precision Floating Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 *
 * The __m256d argument is the source; the int argument is the rounding-mode
 * immediate (imm8).
 * NOTE(review): since it maps to an immediate, the int argument presumably
 * must be a compile-time constant -- confirm.
 */
extern __m256d __cdecl _mm256_round_pd(__m256d, int);
/* Shorthand for _mm256_round_pd with the _MM_FROUND_CEIL rounding mode. */
#define _mm256_ceil_pd(v) (_mm256_round_pd((v), _MM_FROUND_CEIL))
/* Shorthand for _mm256_round_pd with the _MM_FROUND_FLOOR rounding mode. */
#define _mm256_floor_pd(v) (_mm256_round_pd((v), _MM_FROUND_FLOOR))
/*
 * Round Packed Single-Precision Floating-Point Values
 * **** VROUNDPS ymm1,ymm2/m256,imm8
 * Rounds the eight single-precision floating-point values in the source
 * operand by the rounding mode specified in the immediate operand and places
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a single-precision floating-point
 * value. The Precision Floating Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 *
 * The __m256 argument is the source; the int argument is the rounding-mode
 * immediate (imm8).
 * NOTE(review): since it maps to an immediate, the int argument presumably
 * must be a compile-time constant -- confirm.
 */
extern __m256 __cdecl _mm256_round_ps(__m256, int);
/* Shorthand for _mm256_round_ps with the _MM_FROUND_CEIL rounding mode. */
#define _mm256_ceil_ps(v) (_mm256_round_ps((v), _MM_FROUND_CEIL))
/* Shorthand for _mm256_round_ps with the _MM_FROUND_FLOOR rounding mode. */
#define _mm256_floor_ps(v) (_mm256_round_ps((v), _MM_FROUND_FLOOR))
/*
 * Unpack and Interleave High Packed Double-Precision Floating-Point Values
 * **** VUNPCKHPD ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the high double-precision floating-point
 * values from the first source operand and the second source operand.
 *
 * The two __m256d arguments are the first and second source operands, in
 * that order; the interleaved result is returned as a __m256d.
 */
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d, __m256d);
/*
 * Unpack and Interleave High Packed Single-Precision Floating-Point Values
 * **** VUNPCKHPS ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the high single-precision floating-point
 * values from the first source operand and the second source operand.
 *
 * The two __m256 arguments are the first and second source operands, in
 * that order; the interleaved result is returned as a __m256.
 */
extern __m256 __cdecl _mm256_unpackhi_ps(__m256, __m256);
/*
* Unpack and Interleave Low Packed Double-Precision Floating-Point Values
* **** VUNPCKLPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the low double-precision floating-point