From 0800521cb3171422613653ef95246bfff592c767 Mon Sep 17 00:00:00 2001
From: Alex Shepeliev
Date: Fri, 18 Oct 2024 11:51:31 +0300
Subject: [PATCH] Upgrade Kotlin to v2.0.21 and other deps (#145)

* Upgrade Kotlin to 2.0.21
* Fix _Float16 error for iosX64 target
---
 build.gradle.kts                            |    3 +
 buildSrc/build.gradle.kts                   |    8 -
 buildSrc/settings.gradle.kts                |   13 -
 .../src/main/kotlin/KotlinAndroidTarget.kt  |    7 -
 .../kotlin/KotlinMultiplatformExtension.kt  |   15 -
 .../kotlin/webrtc.multiplatform.gradle.kts  |   37 -
 gradle/libs.versions.toml                   |   27 +-
 sample/composeApp/build.gradle.kts          |   25 +-
 .../iosApp/iosApp.xcodeproj/project.pbxproj |   36 +-
 vfsoverlay/base.h                           |  151 +
 vfsoverlay/common.h                         | 4525 ++++++++++
 vfsoverlay/conversion.h                     | 2032 +++++
 vfsoverlay/extern.h                         |   49 +
 vfsoverlay/geometry.h                       | 1100 +++
 vfsoverlay/logic.h                          | 1315 +++
 vfsoverlay/math.h                           | 5996 +++++++++++++
 vfsoverlay/matrix.h                         | 1990 +++++
 vfsoverlay/matrix_types.h                   |  525 ++
 vfsoverlay/overlay.yaml                     |   28 +
 vfsoverlay/packed.h                         | 1031 +++
 vfsoverlay/quaternion.h                     | 1194 +++
 vfsoverlay/simd.h                           |   30 +
 vfsoverlay/types.h                          |  128 +
 vfsoverlay/vector.h                         |   52 +
 vfsoverlay/vector_make.h                    | 7874 +++++++++++++++++
 vfsoverlay/vector_types.h                   | 1281 +++
 webrtc-kmp/build.gradle.kts                 |   34 +-
 27 files changed, 29376 insertions(+), 130 deletions(-)
 delete mode 100644 buildSrc/build.gradle.kts
 delete mode 100644 buildSrc/settings.gradle.kts
 delete mode 100644 buildSrc/src/main/kotlin/KotlinAndroidTarget.kt
 delete mode 100644 buildSrc/src/main/kotlin/KotlinMultiplatformExtension.kt
 delete mode 100644 buildSrc/src/main/kotlin/webrtc.multiplatform.gradle.kts
 create mode 100644 vfsoverlay/base.h
 create mode 100644 vfsoverlay/common.h
 create mode 100644 vfsoverlay/conversion.h
 create mode 100644 vfsoverlay/extern.h
 create mode 100644 vfsoverlay/geometry.h
 create mode 100644 vfsoverlay/logic.h
 create mode 100644 vfsoverlay/math.h
 create mode 100644 vfsoverlay/matrix.h
 create mode 100644 vfsoverlay/matrix_types.h
 create mode 100644 vfsoverlay/overlay.yaml
 create mode 100644 vfsoverlay/packed.h
 create mode 100644 vfsoverlay/quaternion.h
 create mode 100644 vfsoverlay/simd.h
 create mode 100644 vfsoverlay/types.h
 create mode 100644 vfsoverlay/vector.h
 create mode 100644 vfsoverlay/vector_make.h
 create mode 100644 vfsoverlay/vector_types.h

diff --git a/build.gradle.kts b/build.gradle.kts
index 2a90d3cf..3cb8db84 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -3,6 +3,9 @@ import java.util.Properties
 plugins {
     alias(libs.plugins.ktlint)
     alias(libs.plugins.nexus)
+    alias(libs.plugins.kotlinMultiplatform) apply false
+    alias(libs.plugins.androidApplication) apply false
+    alias(libs.plugins.androidLibrary) apply false
     alias(libs.plugins.jetbrains.compose) apply false
 }
 
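Note on the build-logic migration: the catalog-declared plugin aliases added above (applied with `apply false` at the root) together with the Kotlin 2.x compilerOptions DSL replace the buildSrc convention plugin and helper functions deleted just below. A minimal sketch of the equivalent per-module configuration, under those assumptions (the module and the exact target set are illustrative, not taken from this patch):

    import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi
    import org.jetbrains.kotlin.gradle.dsl.JvmTarget

    plugins {
        alias(libs.plugins.kotlinMultiplatform)
        alias(libs.plugins.androidLibrary)
    }

    kotlin {
        @OptIn(ExperimentalKotlinGradlePluginApi::class)
        androidTarget {
            // replaces the deleted KotlinAndroidTarget.configureJvmTarget("1.8")
            compilerOptions {
                jvmTarget.set(JvmTarget.JVM_1_8)
            }
        }

        // replaces the deleted KotlinMultiplatformExtension.configureKotlinCompilerArgs(),
        // which used the kotlinOptions DSL that Kotlin 2.x deprecates
        targets.configureEach {
            compilations.configureEach {
                compileTaskProvider.configure {
                    compilerOptions.freeCompilerArgs.addAll(
                        "-opt-in=kotlin.RequiresOptIn",
                        "-Xexpect-actual-classes",
                    )
                }
            }
        }
    }

Keeping these flags in each module's build script is what allows deleting the whole buildSrc build (including its settings.gradle.kts) in the hunks that follow.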
diff --git a/buildSrc/build.gradle.kts b/buildSrc/build.gradle.kts
deleted file mode 100644
index 618f6814..00000000
--- a/buildSrc/build.gradle.kts
+++ /dev/null
@@ -1,8 +0,0 @@
-plugins {
-    `kotlin-dsl`
-}
-
-dependencies {
-    implementation(libs.kotlin.plugin)
-    implementation(libs.agp.plugin)
-}
diff --git a/buildSrc/settings.gradle.kts b/buildSrc/settings.gradle.kts
deleted file mode 100644
index 75df2751..00000000
--- a/buildSrc/settings.gradle.kts
+++ /dev/null
@@ -1,13 +0,0 @@
-dependencyResolutionManagement {
-    versionCatalogs {
-        create("libs") {
-            from(files("../gradle/libs.versions.toml"))
-        }
-    }
-
-    repositories {
-        gradlePluginPortal()
-        mavenCentral()
-        google()
-    }
-}
diff --git a/buildSrc/src/main/kotlin/KotlinAndroidTarget.kt b/buildSrc/src/main/kotlin/KotlinAndroidTarget.kt
deleted file mode 100644
index 4acc43b1..00000000
--- a/buildSrc/src/main/kotlin/KotlinAndroidTarget.kt
+++ /dev/null
@@ -1,7 +0,0 @@
-import org.jetbrains.kotlin.gradle.plugin.mpp.KotlinAndroidTarget
-
-fun KotlinAndroidTarget.configureJvmTarget(jvmVersion: String = "1.8") {
-    compilations.all {
-        kotlinOptions.jvmTarget = jvmVersion
-    }
-}
diff --git a/buildSrc/src/main/kotlin/KotlinMultiplatformExtension.kt b/buildSrc/src/main/kotlin/KotlinMultiplatformExtension.kt
deleted file mode 100644
index 00627579..00000000
--- a/buildSrc/src/main/kotlin/KotlinMultiplatformExtension.kt
+++ /dev/null
@@ -1,15 +0,0 @@
-import org.jetbrains.kotlin.gradle.dsl.KotlinMultiplatformExtension
-
-fun KotlinMultiplatformExtension.configureKotlinCompilerArgs(vararg args: String) {
-    targets.all {
-        compilations.all {
-            kotlinOptions {
-                freeCompilerArgs += setOf(
-                    "-opt-in=kotlin.RequiresOptIn",
-                    "-Xexpect-actual-classes",
-                    *args
-                )
-            }
-        }
-    }
-}
diff --git a/buildSrc/src/main/kotlin/webrtc.multiplatform.gradle.kts b/buildSrc/src/main/kotlin/webrtc.multiplatform.gradle.kts
deleted file mode 100644
index 747520de..00000000
--- a/buildSrc/src/main/kotlin/webrtc.multiplatform.gradle.kts
+++ /dev/null
@@ -1,37 +0,0 @@
-plugins {
-    id("com.android.library")
-    kotlin("multiplatform")
-}
-
-kotlin {
-    configureKotlinCompilerArgs()
-
-    androidTarget {
-        configureJvmTarget()
-    }
-}
-
-android {
-    compileSdk = androidCompileSdkVersion
-
-    sourceSets["main"].manifest.srcFile("src/androidMain/AndroidManifest.xml")
-    sourceSets["main"].res.srcDir("src/androidMain/res")
-
-    defaultConfig {
-        minSdk = androidMinSdkVersion
-    }
-
-    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_1_8
-        targetCompatibility = JavaVersion.VERSION_1_8
-    }
-}
-
-private val Project.versionCatalog: VersionCatalog
-    get() = extensions.getByType<VersionCatalogsExtension>().named("libs")
-
-private val Project.androidCompileSdkVersion: Int
-    get() = "${versionCatalog.findVersion("compileSdk").get()}".toInt()
-
-private val Project.androidMinSdkVersion: Int
-    get() = "${versionCatalog.findVersion("minSdk").get()}".toInt()
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index b41632a8..24bd1a7f 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -1,15 +1,15 @@
 [versions]
-kotlin = "2.0.10"
-kotlin-coroutines = "1.8.0"
+kotlin = "2.0.21"
+kotlin-coroutines = "1.9.0"
 androidx-activity-compose = "1.9.0"
-androidx-appcompat = "1.6.1"
-androidx-core = "1.13.0"
-androidx-material = "1.11.0"
-androidx-lifecycle = "2.7.0"
-androidx-startup = "1.1.1"
-androidx-test-core = "1.5.0"
-androidx-test-runner = "1.5.2"
-androidx-test-rules = "1.5.0"
+androidx-appcompat = "1.7.0"
+androidx-core = "1.13.1"
+androidx-material = "1.12.0"
+androidx-lifecycle = "2.8.6"
+androidx-startup = "1.2.0"
+androidx-test-core = "1.6.1"
+androidx-test-runner = "1.6.2"
+androidx-test-rules = "1.6.1"
 accompanist-permision = "0.34.0"
 kermit = "2.0.3"
 kotlin-wrappers = "1.0.0-pre.732"
@@ -49,12 +49,11 @@ kotlin-wrappers-react = { module = "org.jetbrains.kotlin-wrappers:kotlin-react" }
 kotlin-wrappers-reactDom = { module = "org.jetbrains.kotlin-wrappers:kotlin-react-dom" }
 kotlin-wrappers-mui = { module = "org.jetbrains.kotlin-wrappers:kotlin-mui-material" }
 
-# Plugin dependencies
-kotlin-plugin = { module = "org.jetbrains.kotlin:kotlin-gradle-plugin", version.ref = "kotlin" }
-agp-plugin = { module = "com.android.tools.build:gradle", version.ref = "agp" }
-
 [plugins]
 ktlint = { id = "org.jlleitschuh.gradle.ktlint", version.ref = "ktlint" }
 nexus = { id =
"io.github.gradle-nexus.publish-plugin", version.ref = "nexus" } jetbrains-compose = { id = "org.jetbrains.compose", version.ref = "compose-plugin" } compose-compiler = { id = "org.jetbrains.kotlin.plugin.compose", version.ref = "kotlin" } +kotlinMultiplatform = { id = "org.jetbrains.kotlin.multiplatform", version.ref = "kotlin" } +androidApplication = { id = "com.android.application", version.ref = "agp" } +androidLibrary = { id = "com.android.library", version.ref = "agp" } diff --git a/sample/composeApp/build.gradle.kts b/sample/composeApp/build.gradle.kts index 724adf97..837f7579 100644 --- a/sample/composeApp/build.gradle.kts +++ b/sample/composeApp/build.gradle.kts @@ -1,21 +1,18 @@ -import org.jetbrains.kotlin.gradle.targets.js.dsl.ExperimentalWasmDsl +import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi +import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl +import org.jetbrains.kotlin.gradle.dsl.JvmTarget import org.jetbrains.kotlin.gradle.targets.js.webpack.KotlinWebpackConfig -import org.jetbrains.kotlin.gradle.plugin.mpp.KotlinNativeTarget import org.jetbrains.kotlin.gradle.plugin.mpp.NativeBuildType - plugins { - kotlin("multiplatform") - id("com.android.application") - kotlin("native.cocoapods") - + alias(libs.plugins.kotlinMultiplatform) + alias(libs.plugins.androidApplication) alias(libs.plugins.jetbrains.compose) alias(libs.plugins.compose.compiler) + kotlin("native.cocoapods") } kotlin { - configureKotlinCompilerArgs() - cocoapods { version = "1.0" summary = "Compose app" @@ -26,6 +23,7 @@ kotlin { version = libs.versions.webrtc.ios.sdk.get() moduleName = "WebRTC" packageName = "WebRTC" + linkOnly = true } podfile = project.file("../iosApp/Podfile") @@ -40,8 +38,11 @@ kotlin { xcodeConfigurationToNativeBuildType["CUSTOM_RELEASE"] = NativeBuildType.RELEASE } + @OptIn(ExperimentalKotlinGradlePluginApi::class) androidTarget { - configureJvmTarget() + compilerOptions { + jvmTarget = JvmTarget.JVM_1_8 + } } iosX64() @@ -134,7 +135,3 @@ android { debugImplementation(compose.uiTooling) } } - -compose.experimental { - web.application {} -} diff --git a/sample/iosApp/iosApp.xcodeproj/project.pbxproj b/sample/iosApp/iosApp.xcodeproj/project.pbxproj index ad7acba3..b8d2c6b6 100644 --- a/sample/iosApp/iosApp.xcodeproj/project.pbxproj +++ b/sample/iosApp/iosApp.xcodeproj/project.pbxproj @@ -125,8 +125,8 @@ 7555FF77242A565900829871 /* Sources */, B92378962B6B1156000C7307 /* Frameworks */, 7555FF79242A565900829871 /* Resources */, - 931F3CF07987B7B6B6CCC6ED /* [CP] Copy Pods Resources */, - F3D8C08FB2F2F9B33C43EA14 /* [CP] Embed Pods Frameworks */, + 9D30E8BF2257B72354B2C936 /* [CP] Embed Pods Frameworks */, + 02E3B32A3BA45C3A2057CD5D /* [CP] Copy Pods Resources */, ); buildRules = ( ); @@ -184,46 +184,46 @@ /* End PBXResourcesBuildPhase section */ /* Begin PBXShellScriptBuildPhase section */ - 5BCD90D29F45499A62B89A70 /* [CP] Check Pods Manifest.lock */ = { + 02E3B32A3BA45C3A2057CD5D /* [CP] Copy Pods Resources */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( + "${PODS_ROOT}/Target Support Files/Pods-iosApp/Pods-iosApp-resources-${CONFIGURATION}-input-files.xcfilelist", ); - inputPaths = ( - "${PODS_PODFILE_DIR_PATH}/Podfile.lock", - "${PODS_ROOT}/Manifest.lock", - ); - name = "[CP] Check Pods Manifest.lock"; + name = "[CP] Copy Pods Resources"; outputFileListPaths = ( - ); - outputPaths = ( - "$(DERIVED_FILE_DIR)/Pods-iosApp-checkManifestLockResult.txt", + "${PODS_ROOT}/Target Support 
Files/Pods-iosApp/Pods-iosApp-resources-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-iosApp/Pods-iosApp-resources.sh\"\n"; showEnvVarsInLog = 0; }; - 931F3CF07987B7B6B6CCC6ED /* [CP] Copy Pods Resources */ = { + 5BCD90D29F45499A62B89A70 /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-iosApp/Pods-iosApp-resources-${CONFIGURATION}-input-files.xcfilelist", ); - name = "[CP] Copy Pods Resources"; + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-iosApp/Pods-iosApp-resources-${CONFIGURATION}-output-files.xcfilelist", + ); + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-iosApp-checkManifestLockResult.txt", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-iosApp/Pods-iosApp-resources.sh\"\n"; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - F3D8C08FB2F2F9B33C43EA14 /* [CP] Embed Pods Frameworks */ = { + 9D30E8BF2257B72354B2C936 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( diff --git a/vfsoverlay/base.h b/vfsoverlay/base.h new file mode 100644 index 00000000..41b0bd27 --- /dev/null +++ b/vfsoverlay/base.h @@ -0,0 +1,151 @@ +/*! @header + * This header defines macros used in the implementation of + * types and functions. Even though they are exposed in a public header, + * the macros defined in this header are implementation details, and you + * should not use or rely on them. They may be changed or removed entirely + * in a future release. + * + * @copyright 2016-2017 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_BASE +#define SIMD_BASE + +/* Define __has_attribute and __has_include if they aren't available */ +# ifndef __has_attribute +# define __has_attribute(__x) 0 +# endif +# ifndef __has_include +# define __has_include(__x) 0 +# endif +# ifndef __has_feature +# define __has_feature(__x) 0 +# endif + +# if __has_attribute(__ext_vector_type__) && __has_attribute(__overloadable__) +# define SIMD_COMPILER_HAS_REQUIRED_FEATURES 1 +# else +/* Your compiler is missing one or more features that are hard requirements + * for any support. None of the types or functions defined by + * the simd headers will be available. 
*/
+# define SIMD_COMPILER_HAS_REQUIRED_FEATURES 0
+# endif
+
+# if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+# if __has_include(<TargetConditionals.h>) && __has_include(<Availability.h>)
+# include <TargetConditionals.h>
+# include <Availability.h>
+/* A number of new features are added in newer releases; most of these are
+ * inline in the header, which makes them available even when targeting older
+ * OS versions. Those that make external calls, however, are only available
+ * when targeting the release in which they became available. Because of the
+ * way in which simd functions are overloaded, the usual weak-linking tricks
+ * do not work; these functions are simply unavailable when targeting older
+ * versions of the library. */
+# if TARGET_OS_RTKIT
+# define SIMD_LIBRARY_VERSION 5
+# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_13_0 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_16_0 || \
+ __WATCH_OS_VERSION_MIN_REQUIRED >= __WATCHOS_9_0 || \
+ __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_16_0 || \
+ __BRIDGE_OS_VERSION_MIN_REQUIRED >= 70000 || \
+ __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_22_0
+# define SIMD_LIBRARY_VERSION 5
+# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_12_0 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_15_0 || \
+ __WATCH_OS_VERSION_MIN_REQUIRED >= __WATCHOS_8_0 || \
+ __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_15_0 || \
+ __BRIDGE_OS_VERSION_MIN_REQUIRED >= 60000 || \
+ __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_21_0
+# define SIMD_LIBRARY_VERSION 4
+# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_13 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_11_0 || \
+ __WATCH_OS_VERSION_MIN_REQUIRED >= __WATCHOS_4_0 || \
+ __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_11_0 || \
+ __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_19_0
+# define SIMD_LIBRARY_VERSION 3
+# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_12 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_10_0 || \
+ __WATCH_OS_VERSION_MIN_REQUIRED >= __WATCHOS_3_0 || \
+ __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_10_0
+# define SIMD_LIBRARY_VERSION 2
+# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_10 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0
+# define SIMD_LIBRARY_VERSION 1
+# else
+# define SIMD_LIBRARY_VERSION 0
+# endif
+# else /* !__has_include(<TargetConditionals.h>) && __has_include(<Availability.h>) */
+# define SIMD_LIBRARY_VERSION 5
+# define __API_AVAILABLE(...) /* Nothing */
+# endif
+
+/* The simd types interoperate with the native simd intrinsic types for each
+ * architecture; the headers that define those types and operations are
+ * automatically included with simd.h */
+# if defined __ARM_NEON__
+# include <arm_neon.h>
+# elif defined __i386__ || defined __x86_64__
+# include <immintrin.h>
+# endif
+
+/* Define a number of function attributes used by the simd functions.
*/ +# if __has_attribute(__always_inline__) +# define SIMD_INLINE __attribute__((__always_inline__)) +# else +# define SIMD_INLINE inline +# endif + +# if __has_attribute(__const__) +# define SIMD_CONST __attribute__((__const__)) +# else +# define SIMD_CONST /* nothing */ +# endif + +# if __has_attribute(__nodebug__) +# define SIMD_NODEBUG __attribute__((__nodebug__)) +# else +# define SIMD_NODEBUG /* nothing */ +# endif + +# if __has_attribute(__deprecated__) +# define SIMD_DEPRECATED(message) __attribute__((__deprecated__(message))) +# else +# define SIMD_DEPRECATED(message) /* nothing */ +# endif + +#define SIMD_OVERLOAD __attribute__((__overloadable__)) +#define SIMD_CPPFUNC SIMD_INLINE SIMD_CONST SIMD_NODEBUG +#define SIMD_CFUNC SIMD_CPPFUNC SIMD_OVERLOAD +#define SIMD_NOINLINE SIMD_CONST SIMD_NODEBUG SIMD_OVERLOAD +#define SIMD_NONCONST SIMD_INLINE SIMD_NODEBUG SIMD_OVERLOAD +#define __SIMD_INLINE__ SIMD_CPPFUNC +#define __SIMD_ATTRIBUTES__ SIMD_CFUNC +#define __SIMD_OVERLOAD__ SIMD_OVERLOAD + +# if __has_feature(cxx_constexpr) +# define SIMD_CONSTEXPR constexpr +# else +# define SIMD_CONSTEXPR /* nothing */ +# endif + +# if __has_feature(cxx_noexcept) +# define SIMD_NOEXCEPT noexcept +# else +# define SIMD_NOEXCEPT /* nothing */ +# endif + +#if defined __cplusplus +/*! @abstract A boolean scalar. */ +typedef bool simd_bool; +#else +/*! @abstract A boolean scalar. */ +typedef _Bool simd_bool; +#endif +/*! @abstract A boolean scalar. + * @discussion This type is deprecated; In C or Objective-C sources, use + * `_Bool` instead. In C++ sources, use `bool`. */ +typedef simd_bool __SIMD_BOOLEAN_TYPE__; + +# endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* defined SIMD_BASE */ diff --git a/vfsoverlay/common.h b/vfsoverlay/common.h new file mode 100644 index 00000000..cb2f6503 --- /dev/null +++ b/vfsoverlay/common.h @@ -0,0 +1,4525 @@ +/*! @header + * The interfaces declared in this header provide "common" elementwise + * operations that are neither math nor logic functions. These are available + * only for floating-point vectors and scalars, except for min, max, abs, + * clamp, and the reduce operations, which also support integer vectors. + * + * simd_abs(x) Absolute value of x. Also available as fabs + * for floating-point vectors. If x is the + * smallest signed integer, x is returned. + * + * simd_max(x,y) Returns the maximum of x and y. Also available + * as fmax for floating-point vectors. + * + * simd_min(x,y) Returns the minimum of x and y. Also available + * as fmin for floating-point vectors. + * + * simd_clamp(x,min,max) x clamped to the range [min, max]. + * + * simd_sign(x) -1 if x is less than zero, 0 if x is zero or + * NaN, and +1 if x is greater than zero. + * + * simd_mix(x,y,t) If t is not in the range [0,1], the result is + * simd_lerp(x,y,t) undefined. Otherwise the result is x+(y-x)*t, + * which linearly interpolates between x and y. + * + * simd_recip(x) An approximation to 1/x. If x is very near the + * limits of representable values, or is infinity + * or NaN, the result is undefined. There are + * two variants of this function: + * + * simd_precise_recip(x) + * + * and + * + * simd_fast_recip(x). + * + * The "precise" variant is accurate to a few ULPs, + * whereas the "fast" variant may have as little + * as 11 bits of accuracy in float and about 22 + * bits in double. 
+ * + * The function simd_recip(x) resolves to + * simd_precise_recip(x) ordinarily, but to + * simd_fast_recip(x) when used in a translation + * unit compiled with -ffast-math (when + * -ffast-math is in effect, you may still use the + * precise version of this function by calling it + * explicitly by name). + * + * simd_rsqrt(x) An approximation to 1/sqrt(x). If x is + * infinity or NaN, the result is undefined. + * There are two variants of this function: + * + * simd_precise_rsqrt(x) + * + * and + * + * simd_fast_rsqrt(x). + * + * The "precise" variant is accurate to a few ULPs, + * whereas the "fast" variant may have as little + * as 11 bits of accuracy in float and about 22 + * bits in double. + * + * The function simd_rsqrt(x) resolves to + * simd_precise_rsqrt(x) ordinarily, but to + * simd_fast_rsqrt(x) when used in a translation + * unit compiled with -ffast-math (when + * -ffast-math is in effect, you may still use the + * precise version of this function by calling it + * explicitly by name). + * + * simd_fract(x) The "fractional part" of x, which lies strictly + * in the range [0, 0x1.fffffep-1]. + * + * simd_step(edge,x) 0 if x < edge, and 1 otherwise. + * + * simd_smoothstep(edge0,edge1,x) 0 if x <= edge0, 1 if x >= edge1, and + * a Hermite interpolation between 0 and 1 if + * edge0 < x < edge1. + * + * simd_reduce_add(x) Sum of the elements of x. + * + * simd_reduce_min(x) Minimum of the elements of x. + * + * simd_reduce_max(x) Maximum of the elements of x. + * + * simd_equal(x,y) True if and only if every lane of x is equal + * to the corresponding lane of y. + * + * The following common functions are available in the simd:: namespace: + * + * C++ Function Equivalent C Function + * -------------------------------------------------------------------- + * simd::abs(x) simd_abs(x) + * simd::max(x,y) simd_max(x,y) + * simd::min(x,y) simd_min(x,y) + * simd::clamp(x,min,max) simd_clamp(x,min,max) + * simd::sign(x) simd_sign(x) + * simd::mix(x,y,t) simd_mix(x,y,t) + * simd::lerp(x,y,t) simd_lerp(x,y,t) + * simd::recip(x) simd_recip(x) + * simd::rsqrt(x) simd_rsqrt(x) + * simd::fract(x) simd_fract(x) + * simd::step(edge,x) simd_step(edge,x) + * simd::smoothstep(e0,e1,x) simd_smoothstep(e0,e1,x) + * simd::reduce_add(x) simd_reduce_add(x) + * simd::reduce_max(x) simd_reduce_max(x) + * simd::reduce_min(x) simd_reduce_min(x) + * simd::equal(x,y) simd_equal(x,y) + * + * simd::precise::recip(x) simd_precise_recip(x) + * simd::precise::rsqrt(x) simd_precise_rsqrt(x) + * + * simd::fast::recip(x) simd_fast_recip(x) + * simd::fast::rsqrt(x) simd_fast_rsqrt(x) + * + * @copyright 2014-2017 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_COMMON_HEADER +#define SIMD_COMMON_HEADER + +#include +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x); +/*! @abstract The elementwise absolute value of x. 
*/ +static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short4 simd_abs(simd_short4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x); +/*! @abstract The elementwise absolute value of x. + * @discussion Deprecated. Use simd_abs(x) instead. */ +#define vector_abs simd_abs + +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char4 simd_max(simd_char4 x, simd_char4 y); +/*! 
@abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y); +/*! @abstract The elementwise maximum of x and y. 
*/ +static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC float simd_max(float x, float y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC double simd_max(double x, double y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y); +/*! @abstract The elementwise maximum of x and y. + * @discussion Deprecated. Use simd_max(x,y) instead. */ +#define vector_max simd_max + +/*! @abstract The elementwise minimum of x and y. 
*/ +static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y); +/*! 
@abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC float simd_min(float x, float y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC double simd_min(double x, double y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y); +/*! @abstract The elementwise minimum of x and y. 
*/ +static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y); +/*! @abstract The elementwise minimum of x and y. + * @discussion Deprecated. Use simd_min(x,y) instead. */ +#define vector_min simd_min + + +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 x, simd_char16 min, simd_char16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max); +/*! 
@abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short4 simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. 
*/ +static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int4 simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC float simd_clamp(float x, float min, float max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. 
*/ +static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max); +/*! @abstract x clamped to the range [min, max]. 
+ * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC double simd_clamp(double x, double min, double max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Deprecated. Use simd_clamp(x,min,max) instead. */ +#define vector_clamp simd_clamp + +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC float simd_sign(float x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC double simd_sign(double x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. + * @discussion Deprecated. Use simd_sign(x) instead. */ +#define vector_sign simd_sign + +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC float simd_mix(float x, float y, float t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t); +/*! 
@abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC double simd_mix(double x, double y, double t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 + * @discussion Deprecated. Use simd_mix(x, y, t) instead. */ +#define vector_mix simd_mix +#define simd_lerp simd_mix + +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC float simd_precise_recip(float x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x); +/*! 
@abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC double simd_precise_recip(double x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x); +/*! @abstract A good approximation to 1/x. + * @discussion Deprecated. Use simd_precise_recip(x) instead. */ +#define vector_precise_recip simd_precise_recip + +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC float simd_fast_recip(float x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. 
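+ *
+ * Illustrative example (editor's addition, not part of the original
+ * header): the precise/fast pair trades accuracy for speed.
+ *
+ *   float approx  = simd_fast_recip(3.0f);    // 1/3 to at least 11 bits
+ *   float careful = simd_precise_recip(3.0f); // 1/3 to within a few ULPs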
*/ +static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC double simd_fast_recip(double x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x); +/*! @abstract A fast approximation to 1/x. + * @discussion Deprecated. Use simd_fast_recip(x) instead. */ +#define vector_fast_recip simd_fast_recip + +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC float simd_recip(float x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x); +/*! @abstract An approximation to 1/x. 
+ * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC double simd_recip(double x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x); +/*! @abstract An approximation to 1/x. + * @discussion Deprecated. Use simd_recip(x) instead. */ +#define vector_recip simd_recip + +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC float simd_precise_rsqrt(float x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x); +/*! @abstract A good approximation to 1/sqrt(x). 
+ * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC double simd_precise_rsqrt(double x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion Deprecated. Use simd_precise_rsqrt(x) instead. */ +#define vector_precise_rsqrt simd_precise_rsqrt + +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC float simd_fast_rsqrt(float x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC double simd_fast_rsqrt(double x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. 
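+ *
+ * Illustrative example (editor's addition): a typical use of the fast
+ * variant is approximate normalization, where 11-bit accuracy often
+ * suffices. Assuming the simd_make_* constructors from
+ * <simd/vector_make.h>:
+ *
+ *   simd_float4 v = simd_make_float4(1.0f, 2.0f, 2.0f, 0.0f);
+ *   simd_float4 n = v * simd_fast_rsqrt(simd_reduce_add(v * v));
+ *   // |v| = 3, so n is approximately {1/3, 2/3, 2/3, 0}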
*/ +static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion Deprecated. Use simd_fast_rsqrt(x) instead. */ +#define vector_fast_rsqrt simd_fast_rsqrt + +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC float simd_rsqrt(float x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC double simd_rsqrt(double x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is + * specified, and to simd_precise_rsqrt(x) otherwise. */ +static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion Deprecated. Use simd_rsqrt(x) instead. */ +#define vector_rsqrt simd_rsqrt + +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC float simd_fract(float x); +/*!
@abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC double simd_fract(double x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion Deprecated. Use simd_fract(x) instead. */ +#define vector_fract simd_fract + +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC float simd_step(float edge, float x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. 
*/ +static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC double simd_step(double edge, double x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Deprecated. Use simd_step(edge, x) instead. */ +#define vector_step simd_step + +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. 
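+ *
+ * Illustrative example (editor's addition): a smooth crossfade built
+ * from simd_smoothstep and simd_mix.
+ *
+ *   float t = simd_smoothstep(0.0f, 1.0f, 0.5f); // 0.5, with zero
+ *                                                // slope at both edges
+ *   float c = simd_mix(10.0f, 20.0f, t);         // 15.0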
*/ +static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion Deprecated. Use simd_smoothstep(edge0, edge1, x) instead. */ +#define vector_smoothstep simd_smoothstep + +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x); +/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing.
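+ *
+ * Illustrative example (editor's addition): sixteen chars can easily
+ * overflow a char accumulator, so widen first (simd_short is the
+ * lanewise conversion from <simd/conversion.h>, simd_make_char16 the
+ * constructor from <simd/vector_make.h>):
+ *
+ *   simd_char16 bytes = simd_make_char16(100, 100, 100, 100, 100, 100,
+ *                                        100, 100, 100, 100, 100, 100,
+ *                                        100, 100, 100, 100);
+ *   short sum = simd_reduce_add(simd_short(bytes)); // 1600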
*/ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x); +/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing.
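+ *
+ * Illustrative example (editor's addition): floating-point reductions
+ * compose naturally with lanewise arithmetic, e.g. a dot product:
+ *
+ *   simd_double2 a = simd_make_double2(1.0, 2.0);
+ *   simd_double2 b = simd_make_double2(3.0, 4.0);
+ *   double dot = simd_reduce_add(a * b); // 1*3 + 2*4 = 11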
*/ +static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x); +/*! @abstract Sum of elements in x. + * @discussion This computation may overflow; especially for 8-bit types you + * may need to convert to a wider type before reducing. */ +static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x); +/*! @abstract Sum of elements in x. + * @discussion Deprecated. Use simd_reduce_add(x) instead. */ +#define vector_reduce_add simd_reduce_add + +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x); +/*! @abstract Minimum of elements in x.
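+ *
+ * Illustrative example (editor's addition, using the simd_make_*
+ * constructors from <simd/vector_make.h>):
+ *
+ *   simd_ushort8 v = simd_make_ushort8(9, 4, 7, 1, 8, 3, 6, 2);
+ *   unsigned short lo = simd_reduce_min(v); // 1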
*/ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x); +/*! @abstract Minimum of elements in x. + * @discussion Deprecated. Use simd_reduce_min(x) instead. */ +#define vector_reduce_min simd_reduce_min + +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x); +/*!
@abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x); +/*! @abstract Maximum of elements in x. 
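+ *
+ * Illustrative example (editor's addition, using the simd_make_*
+ * constructors from <simd/vector_make.h>):
+ *
+ *   simd_uint3 v = simd_make_uint3(5, 42, 17);
+ *   unsigned int hi = simd_reduce_max(v); // 42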
*/ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x); +/*! @abstract Maximum of elements in x. + * @discussion Deprecated. Use simd_reduce_max(x) instead. */ +#define vector_reduce_max simd_reduce_max + +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char2 x, simd_char2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char3 x, simd_char3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char4 x, simd_char4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char8 x, simd_char8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char16 x, simd_char16 y) { + return simd_all(x == y); +} +/*!
@abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char32 x, simd_char32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_char64 x, simd_char64 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar2 x, simd_uchar2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar3 x, simd_uchar3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar4 x, simd_uchar4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar8 x, simd_uchar8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar16 x, simd_uchar16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar32 x, simd_uchar32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar64 x, simd_uchar64 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short2 x, simd_short2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short3 x, simd_short3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short4 x, simd_short4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short8 x, simd_short8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short16 x, simd_short16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short32 x, simd_short32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort2 x, simd_ushort2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort3 x, simd_ushort3 y) { + return simd_all(x == y); +} +/*! 
@abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort4 x, simd_ushort4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort8 x, simd_ushort8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort16 x, simd_ushort16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort32 x, simd_ushort32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int2 x, simd_int2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int3 x, simd_int3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int4 x, simd_int4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int8 x, simd_int8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int16 x, simd_int16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint2 x, simd_uint2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint3 x, simd_uint3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint4 x, simd_uint4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint8 x, simd_uint8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint16 x, simd_uint16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float2 x, simd_float2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float3 x, simd_float3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float4 x, simd_float4 y) { + return simd_all(x == y); +} +/*! 
@abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float8 x, simd_float8 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float16 x, simd_float16 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long2 x, simd_long2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long3 x, simd_long3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long4 x, simd_long4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long8 x, simd_long8 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong2 x, simd_ulong2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong3 x, simd_ulong3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong4 x, simd_ulong4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong8 x, simd_ulong8 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double2 x, simd_double2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double3 x, simd_double3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double4 x, simd_double4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double8 x, simd_double8 y) {
+  return simd_all(x == y);
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace simd {
+  /*! @abstract The lanewise absolute value of x. */
+  template <typename typeN> static SIMD_CPPFUNC typeN abs(const typeN x) { return ::simd_abs(x); }
+  /*! @abstract The lanewise maximum of x and y. */
+  template <typename typeN> static SIMD_CPPFUNC typeN max(const typeN x, const typeN y) { return ::simd_max(x,y); }
+  /*! @abstract The lanewise minimum of x and y. */
+  template <typename typeN> static SIMD_CPPFUNC typeN min(const typeN x, const typeN y) { return ::simd_min(x,y); }
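+  /* Editor's note: a brief usage sketch (illustrative only, not part of the
+   * original header). These wrappers simply forward to the C entry points,
+   * so one spelling covers every vector width and element type:
+   *
+   *   simd_float4 v = simd_make_float4(-3.0f, 0.5f, 2.0f, 9.0f);
+   *   simd_float4 a = simd::abs(v);     // lanewise |v|
+   *   simd_float4 m = simd::min(v, a);  // lanewise minimum
+   */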
+  /*! @abstract x clamped to the interval [min, max]. */
+  template <typename typeN> static SIMD_CPPFUNC typeN clamp(const typeN x, const typeN min, const typeN max) { return ::simd_clamp(x,min,max); }
+  /*! @abstract -1 if x < 0, +1 if x > 0, and 0 otherwise. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN sign(const fptypeN x) { return ::simd_sign(x); }
+  /*! @abstract Linearly interpolates between x and y, taking the value x when t=0 and y when t=1. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN mix(const fptypeN x, const fptypeN y, const fptypeN t) { return ::simd_mix(x,y,t); }
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN lerp(const fptypeN x, const fptypeN y, const fptypeN t) { return ::simd_mix(x,y,t); }
+  /*! @abstract An approximation to 1/x. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_recip(x); }
+  /*! @abstract An approximation to 1/sqrt(x). */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_rsqrt(x); }
+  /*! @abstract The "fractional part" of x, in the range [0,1). */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN fract(const fptypeN x) { return ::simd_fract(x); }
+  /*! @abstract 0 if x < edge, 1 otherwise. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN step(const fptypeN edge, const fptypeN x) { return ::simd_step(edge,x); }
+  /*! @abstract Smoothly interpolates from 0 at edge0 to 1 at edge1. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN smoothstep(const fptypeN edge0, const fptypeN edge1, const fptypeN x) { return ::simd_smoothstep(edge0,edge1,x); }
+  /*! @abstract True if and only if each lane of x is equal to the
+   * corresponding lane of y.
+   *
+   * @discussion This isn't operator== because that's already defined by
+   * the compiler to return a lane mask. */
+  template <typename fptypeN> static SIMD_CPPFUNC simd_bool equal(const fptypeN x, const fptypeN y) { return ::simd_equal(x, y); }
+#if __cpp_decltype_auto
+  /* If you are targeting an earlier version of the C++ standard that lacks
+   decltype_auto support, you may use the C-style simd_reduce_* functions
+   instead. */
+  /*! @abstract The sum of the elements in x. May overflow. */
+  template <typename typeN> static SIMD_CPPFUNC auto reduce_add(typeN x) { return ::simd_reduce_add(x); }
+  /*! @abstract The least element in x. */
+  template <typename typeN> static SIMD_CPPFUNC auto reduce_min(typeN x) { return ::simd_reduce_min(x); }
+  /*! @abstract The greatest element in x. */
+  template <typename typeN> static SIMD_CPPFUNC auto reduce_max(typeN x) { return ::simd_reduce_max(x); }
+#endif
+  namespace precise {
+    /*! @abstract An approximation to 1/x. */
+    template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_precise_recip(x); }
+    /*! @abstract An approximation to 1/sqrt(x). */
+    template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_precise_rsqrt(x); }
+  }
+  namespace fast {
+    /*! @abstract An approximation to 1/x. */
+    template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_fast_recip(x); }
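+    /* Editor's note: an illustrative contrast of the two namespaces, not
+     * part of the original header. simd::fast uses the hardware estimate
+     * instructions where available; simd::precise refines that estimate
+     * with a Newton-Raphson step (see the implementations further below).
+     * Under -ffast-math the unsuffixed simd::recip dispatches to the fast
+     * variant, otherwise to the precise one:
+     *
+     *   simd_float4 x = simd_make_float4(2.0f, 4.0f, 8.0f, 16.0f);
+     *   simd_float4 approx  = simd::fast::recip(x);     // low-precision estimate
+     *   simd_float4 refined = simd::precise::recip(x);  // ~full float precision
+     */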
+    /*! @abstract An approximation to 1/sqrt(x). */
+    template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_fast_rsqrt(x); }
+  }
+}
+
+extern "C" {
+#endif /* __cplusplus */
+
+#pragma mark - Implementation
+
+static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x) {
+  return simd_make_char2(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x) {
+  return simd_make_char3(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x) {
+  return simd_make_char4(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x) {
+#if defined __arm__ || defined __arm64__
+  return vabs_s8(x);
+#else
+  return simd_make_char8(simd_abs(simd_make_char16_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x) {
+#if defined __arm__ || defined __arm64__
+  return vabsq_s8(x);
+#elif defined __SSE4_1__
+  return (simd_char16) _mm_abs_epi8((__m128i)x);
+#else
+  simd_char16 mask = x >> 7;
+  return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x) {
+#if defined __AVX2__
+  return _mm256_abs_epi8(x);
+#else
+  return simd_make_char32(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x) {
+#if defined __AVX512BW__
+  return _mm512_abs_epi8(x);
+#else
+  return simd_make_char64(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x) {
+  return simd_make_short2(simd_abs(simd_make_short4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x) {
+  return simd_make_short3(simd_abs(simd_make_short4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_short4 simd_abs(simd_short4 x) {
+#if defined __arm__ || defined __arm64__
+  return vabs_s16(x);
+#else
+  return simd_make_short4(simd_abs(simd_make_short8_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x) {
+#if defined __arm__ || defined __arm64__
+  return vabsq_s16(x);
+#elif defined __SSE4_1__
+  return (simd_short8) _mm_abs_epi16((__m128i)x);
+#else
+  simd_short8 mask = x >> 15;
+  return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x) {
+#if defined __AVX2__
+  return _mm256_abs_epi16(x);
+#else
+  return simd_make_short16(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x) {
+#if defined __AVX512BW__
+  return _mm512_abs_epi16(x);
+#else
+  return simd_make_short32(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x) {
+#if defined __arm__ || defined __arm64__
+  return vabs_s32(x);
+#else
+  return simd_make_int2(simd_abs(simd_make_int4_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x) {
+  return simd_make_int3(simd_abs(simd_make_int4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x) {
+#if defined __arm__ || defined __arm64__
+  return vabsq_s32(x);
+#elif defined __SSE4_1__
+  return (simd_int4) _mm_abs_epi32((__m128i)x);
+#else
+  simd_int4 mask = x >> 31;
+  return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x) {
+#if defined __AVX2__
+  return _mm256_abs_epi32(x);
+#else
+  return simd_make_int8(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x) {
+#if defined __AVX512F__
+  return _mm512_abs_epi32(x);
+#else
+  return 
simd_make_int16(simd_abs(x.lo), simd_abs(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x) { +#if defined __arm64__ + return vabsq_s64(x); +#elif defined __AVX512VL__ + return (simd_long2) _mm_abs_epi64((__m128i)x); +#else + simd_long2 mask = x >> 63; return (x ^ mask) - mask; +#endif +} + +static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x) { + return simd_make_long3(simd_abs(simd_make_long4_undef(x))); +} + +static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x) { +#if defined __AVX512VL__ + return _mm256_abs_epi64(x); +#else + return simd_make_long4(simd_abs(x.lo), simd_abs(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x) { +#if defined __AVX512F__ + return _mm512_abs_epi64(x); +#else + return simd_make_long8(simd_abs(x.lo), simd_abs(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x) { + return __tg_fabs(x); +} + +static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y) { + return simd_make_char2(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y) { + return simd_make_char3(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y) { + return simd_make_char4(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s8(x, y); +#else + return simd_make_char8(simd_min(simd_make_char16_undef(x), simd_make_char16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y) { +#if defined __arm__ || defined __arm64__ + return vminq_s8(x, y); +#elif defined __SSE4_1__ + return (simd_char16) _mm_min_epi8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y) { +#if defined __AVX2__ + return _mm256_min_epi8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y) { +#if defined __AVX512BW__ + return _mm512_min_epi8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y) { + return simd_make_uchar2(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y) { + return simd_make_uchar3(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y) { + 
return simd_make_uchar4(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u8(x, y); +#else + return simd_make_uchar8(simd_min(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u8(x, y); +#elif defined __SSE4_1__ + return (simd_uchar16) _mm_min_epu8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y) { +#if defined __AVX2__ + return _mm256_min_epu8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y) { +#if defined __AVX512BW__ + return _mm512_min_epu8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y) { + return simd_make_short2(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y) { + return simd_make_short3(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s16(x, y); +#else + return simd_make_short4(simd_min(simd_make_short8_undef(x), simd_make_short8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y) { +#if defined __arm__ || defined __arm64__ + return vminq_s16(x, y); +#elif defined __SSE4_1__ + return (simd_short8) _mm_min_epi16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y) { +#if defined __AVX2__ + return _mm256_min_epi16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y) { +#if defined __AVX512BW__ + return _mm512_min_epi16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y) { + return simd_make_ushort2(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y) { + return simd_make_ushort3(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u16(x, y); +#else + return simd_make_ushort4(simd_min(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u16(x, y); +#elif defined __SSE4_1__ + return (simd_ushort8) _mm_min_epu16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y) { +#if defined __AVX2__ + return _mm256_min_epu16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y) { +#if 
defined __AVX512BW__ + return _mm512_min_epu16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s32(x, y); +#else + return simd_make_int2(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y) { + return simd_make_int3(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y))); +} + +static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y) { +#if defined __arm__ || defined __arm64__ + return vminq_s32(x, y); +#elif defined __SSE4_1__ + return (simd_int4) _mm_min_epi32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y) { +#if defined __AVX2__ + return _mm256_min_epi32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y) { +#if defined __AVX512F__ + return _mm512_min_epi32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u32(x, y); +#else + return simd_make_uint2(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y) { + return simd_make_uint3(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +} + +static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u32(x, y); +#elif defined __SSE4_1__ + return (simd_uint4) _mm_min_epu32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y) { +#if defined __AVX2__ + return _mm256_min_epu32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y) { +#if defined __AVX512F__ + return _mm512_min_epu32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC float simd_min(float x, float y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y) { +#if defined __AVX512VL__ + return _mm_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y) { + return simd_make_long3(simd_min(simd_make_long4_undef(x), simd_make_long4_undef(y))); +} + +static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y) { +#if defined __AVX512VL__ + return _mm256_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y) { +#if 
defined __AVX512F__ + return _mm512_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y) { +#if defined __AVX512VL__ + return _mm_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y) { + return simd_make_ulong3(simd_min(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y))); +} + +static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y) { +#if defined __AVX512VL__ + return _mm256_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y) { +#if defined __AVX512F__ + return _mm512_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC double simd_min(double x, double y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y) { + return simd_make_char2(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y) { + return simd_make_char3(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char4 simd_max(simd_char4 x, simd_char4 y) { + return simd_make_char4(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s8(x, y); +#else + return simd_make_char8(simd_max(simd_make_char16_undef(x), simd_make_char16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s8(x, y); +#elif defined __SSE4_1__ + return (simd_char16) _mm_max_epi8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y) { +#if defined __AVX2__ + return _mm256_max_epi8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y) { +#if defined __AVX512BW__ + return _mm512_max_epi8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y) { + return simd_make_uchar2(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y) { + return simd_make_uchar3(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y) { + return simd_make_uchar4(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u8(x, y); +#else + return 
simd_make_uchar8(simd_max(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u8(x, y); +#elif defined __SSE4_1__ + return (simd_uchar16) _mm_max_epu8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y) { +#if defined __AVX2__ + return _mm256_max_epu8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y) { +#if defined __AVX512BW__ + return _mm512_max_epu8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y) { + return simd_make_short2(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y) { + return simd_make_short3(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s16(x, y); +#else + return simd_make_short4(simd_max(simd_make_short8_undef(x), simd_make_short8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s16(x, y); +#elif defined __SSE4_1__ + return (simd_short8) _mm_max_epi16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y) { +#if defined __AVX2__ + return _mm256_max_epi16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y) { +#if defined __AVX512BW__ + return _mm512_max_epi16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y) { + return simd_make_ushort2(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y) { + return simd_make_ushort3(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u16(x, y); +#else + return simd_make_ushort4(simd_max(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u16(x, y); +#elif defined __SSE4_1__ + return (simd_ushort8) _mm_max_epu16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y) { +#if defined __AVX2__ + return _mm256_max_epu16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y) { +#if defined __AVX512BW__ + return _mm512_max_epu16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s32(x, y); +#else + 
return simd_make_int2(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y) { + return simd_make_int3(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y))); +} + +static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s32(x, y); +#elif defined __SSE4_1__ + return (simd_int4) _mm_max_epi32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y) { +#if defined __AVX2__ + return _mm256_max_epi32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y) { +#if defined __AVX512F__ + return _mm512_max_epi32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u32(x, y); +#else + return simd_make_uint2(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y) { + return simd_make_uint3(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +} + +static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u32(x, y); +#elif defined __SSE4_1__ + return (simd_uint4) _mm_max_epu32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y) { +#if defined __AVX2__ + return _mm256_max_epu32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y) { +#if defined __AVX512F__ + return _mm512_max_epu32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC float simd_max(float x, float y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y) { +#if defined __AVX512VL__ + return _mm_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y) { + return simd_make_long3(simd_max(simd_make_long4_undef(x), simd_make_long4_undef(y))); +} + +static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y) { +#if defined __AVX512VL__ + return _mm256_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y) { +#if defined __AVX512F__ + return _mm512_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y) { +#if defined __AVX512VL__ + return _mm_max_epu64(x, y); +#else + return 
simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y) { + return simd_make_ulong3(simd_max(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y))); +} + +static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y) { +#if defined __AVX512VL__ + return _mm256_max_epu64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y) { +#if defined __AVX512F__ + return _mm512_max_epu64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC double simd_max(double x, double y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 x, simd_char16 min, simd_char16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short4 
simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int4 simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC float simd_clamp(float x, float min, float max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max) { 
+ return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC double simd_clamp(double x, double min, double max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max) { + return simd_min(simd_max(x, min), max); +} + + +static inline SIMD_CFUNC float simd_sign(float x) { + return (x == 0 | x != x) ? 0 : copysign(1,x); +} + +static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC double simd_sign(double x) { + return (x == 0 | x != x) ? 
0 : copysign(1,x); +} + +static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC float simd_mix(float x, float y, float t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC double simd_mix(double x, double y, double t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC float simd_recip(float x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC double simd_recip(double x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static 
inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC float simd_fast_recip(float x) { +#if defined __AVX512VL__ + simd_float4 x4 = simd_make_float4(x); + return ((simd_float4)_mm_rcp14_ss(x4, x4)).x; +#elif defined __SSE__ + return ((simd_float4)_mm_rcp_ss(simd_make_float4(x))).x; +#elif defined __ARM_NEON__ + return simd_fast_recip(simd_make_float2_undef(x)).x; +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_fast_recip(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = vrecpe_f32(x); + return r * vrecps_f32(x, r); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x) { + return simd_make_float3(simd_fast_recip(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x) { +#if defined __AVX512VL__ + return _mm_rcp14_ps(x); +#elif defined __SSE__ + return _mm_rcp_ps(x); +#elif defined __ARM_NEON__ + simd_float4 r = vrecpeq_f32(x); + return r * vrecpsq_f32(x, r); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x) { +#if defined __AVX512VL__ + return _mm256_rcp14_ps(x); +#elif defined __AVX__ + return _mm256_rcp_ps(x); +#else + return simd_make_float8(simd_fast_recip(x.lo), simd_fast_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x) { +#if defined __AVX512F__ + return _mm512_rcp14_ps(x); +#else + return simd_make_float16(simd_fast_recip(x.lo), simd_fast_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_fast_recip(double x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC float simd_precise_recip(float x) { +#if defined __SSE__ + float r = simd_fast_recip(x); + return r*(2 - (x == 0 ? 
-INFINITY : x)*r); +#elif defined __ARM_NEON__ + return simd_precise_recip(simd_make_float2_undef(x)).x; +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_precise_recip(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = simd_fast_recip(x); + return r*vrecps_f32(x, r); +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x) { + return simd_make_float3(simd_precise_recip(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x) { +#if defined __SSE__ + simd_float4 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#elif defined __ARM_NEON__ + simd_float4 r = simd_fast_recip(x); + return r*vrecpsq_f32(x, r); +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x) { +#if defined __AVX__ + simd_float8 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#else + return simd_make_float8(simd_precise_recip(x.lo), simd_precise_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x) { +#if defined __AVX512F__ + simd_float16 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#else + return simd_make_float16(simd_precise_recip(x.lo), simd_precise_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_precise_recip(double x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x) { + return 1/x; +} + +static inline SIMD_CFUNC float simd_rsqrt(float x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC double simd_rsqrt(double x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); 
+#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC float simd_fast_rsqrt(float x) { +#if defined __AVX512VL__ + simd_float4 x4 = simd_make_float4(x); + return ((simd_float4)_mm_rsqrt14_ss(x4, x4)).x; +#elif defined __SSE__ + return ((simd_float4)_mm_rsqrt_ss(simd_make_float4(x))).x; +#elif defined __ARM_NEON__ + return simd_fast_rsqrt(simd_make_float2_undef(x)).x; +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_fast_rsqrt(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = vrsqrte_f32(x); + return r * vrsqrts_f32(x, r*r); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x) { + return simd_make_float3(simd_fast_rsqrt(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x) { +#if defined __AVX512VL__ + return _mm_rsqrt14_ps(x); +#elif defined __SSE__ + return _mm_rsqrt_ps(x); +#elif defined __ARM_NEON__ + simd_float4 r = vrsqrteq_f32(x); + return r * vrsqrtsq_f32(x, r*r); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x) { +#if defined __AVX512VL__ + return _mm256_rsqrt14_ps(x); +#elif defined __AVX__ + return _mm256_rsqrt_ps(x); +#else + return simd_make_float8(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x) { +#if defined __AVX512F__ + return _mm512_rsqrt14_ps(x); +#else + return simd_make_float16(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_fast_rsqrt(double x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC float simd_precise_rsqrt(float x) { +#if defined __SSE__ + float r = simd_fast_rsqrt(x); + return r*(1.5f - 0.5f*(r == INFINITY ? 
-INFINITY : x)*r*r); +#elif defined __ARM_NEON__ + return simd_precise_rsqrt(simd_make_float2_undef(x)).x; +#else + return 1/sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_precise_rsqrt(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = simd_fast_rsqrt(x); + return r*vrsqrts_f32(x, r*r); +#else + return 1/__tg_sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x) { + return simd_make_float3(simd_precise_rsqrt(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x) { +#if defined __SSE__ + simd_float4 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#elif defined __ARM_NEON__ + simd_float4 r = simd_fast_rsqrt(x); + return r*vrsqrtsq_f32(x, r*r); +#else + return 1/__tg_sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x) { +#if defined __AVX__ + simd_float8 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#else + return simd_make_float8(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x) { +#if defined __AVX512F__ + simd_float16 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#else + return simd_make_float16(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_precise_rsqrt(double x) { + return 1/sqrt(x); +} + +static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC float simd_fract(float x) { + return fmin(x - floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC double simd_fract(double x) { + return fmin(x - floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC float simd_step(float edge, float x) { + return !(x < 
edge); +} + +static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x) { + return simd_bitselect((simd_float2)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x) { + return simd_bitselect((simd_float3)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x) { + return simd_bitselect((simd_float4)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x) { + return simd_bitselect((simd_float8)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x) { + return simd_bitselect((simd_float16)1, 0, x < edge); +} + +static inline SIMD_CFUNC double simd_step(double edge, double x) { + return !(x < edge); +} + +static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x) { + return simd_bitselect((simd_double2)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x) { + return simd_bitselect((simd_double3)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x) { + return simd_bitselect((simd_double4)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x) { + return simd_bitselect((simd_double8)1, 0, x < edge); +} + +static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x) { + float t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x) { + simd_float2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x) { + simd_float3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x) { + simd_float4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x) { + simd_float8 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x) { + simd_float16 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x) { + double t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x) { + simd_double2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x) { + simd_double3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x) { + simd_double4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x) { + simd_double8 t = simd_clamp((x - 
edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x) { + return x.x + 
x.y + x.z; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x) { + char t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x) { +#if defined __arm64__ + return vminvq_s8(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x) { + unsigned char t = x.z < x.x ? x.z : x.x; + return x.y < t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x) { +#if defined __arm64__ + return vminvq_u8(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x) { + short t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x) { +#if defined __arm64__ + return vminvq_s16(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x) { + unsigned short t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x) { +#if defined __arm64__ + return vminvq_u16(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x) { + int t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x) { +#if defined __arm64__ + return vminvq_s32(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x) { + unsigned int t = x.z < x.x ? x.z : x.x; + return x.y < t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x) { +#if defined __arm64__ + return vminvq_u32(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x) { + simd_long1 t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x) { + simd_ulong1 t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x) { + return fmin(x.x, x.y); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x) { + return fmin(fmin(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x) { +#if defined __arm64__ + return vminvq_f32(x); +#else + return simd_reduce_min(simd_min(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x) { +#if defined __arm64__ + return vminvq_f64(x); +#else + return fmin(x.x, x.y); +#endif +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x) { + return fmin(fmin(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x) { + char t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x) { +#if defined __arm64__ + return vmaxvq_s8(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x) { + return x.y > x.x ? 
x.y : x.x; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x) { + unsigned char t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x) { +#if defined __arm64__ + return vmaxvq_u8(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x) { + short t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x) { +#if defined __arm64__ + return vmaxvq_s16(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x) { + unsigned short t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x) { +#if defined __arm64__ + return vmaxvq_u16(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x) { + int t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x) { +#if defined __arm64__ + return vmaxvq_s32(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x) { + unsigned int t = x.z > x.x ? x.z : x.x; + return x.y > t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x) { +#if defined __arm64__ + return vmaxvq_u32(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x) { + simd_long1 t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x) { + simd_ulong1 t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x) { + return fmax(x.x, x.y); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x) { + return fmax(fmax(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x) { +#if defined __arm64__ + return vmaxvq_f32(x); +#else + return simd_reduce_max(simd_max(x.lo, x.hi)); +#endif +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x) { +#if defined __arm64__ + return vmaxvq_f64(x); +#else + return fmax(x.x, x.y); +#endif +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x) { + return fmax(fmax(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +#ifdef __cplusplus +} +#endif +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_COMMON_HEADER */ diff --git a/vfsoverlay/conversion.h b/vfsoverlay/conversion.h new file mode 100644 index 00000000..235a56b8 --- /dev/null +++ b/vfsoverlay/conversion.h @@ -0,0 +1,2032 @@ +/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved. + * + * The interfaces declared in this header provide conversions between vector + * types. The following functions are available: + * + * simd_char(x) simd_uchar(x) + * simd_short(x) simd_ushort(x) + * simd_int(x) simd_uint(x) + * simd_long(x) simd_ulong(x) + * simd_float(x) + * simd_double(x) + * + * Each of these functions converts x to a vector whose elements have the + * type named by the function, with the same number of elements as x. Unlike + * a vector cast, these functions convert the elements to the new element + * type. 
These conversions behave exactly as C scalar conversions, except
+ * that conversions from integer vector types to signed integer vector types
+ * are guaranteed to wrap modulo 2^N (where N is the number of bits in an
+ * element of the result type).
+ *
+ * For integer vector types, saturating conversions are also available:
+ *
+ *   simd_char_sat(x)   simd_uchar_sat(x)
+ *   simd_short_sat(x)  simd_ushort_sat(x)
+ *   simd_int_sat(x)    simd_uint_sat(x)
+ *   simd_long_sat(x)   simd_ulong_sat(x)
+ *
+ * These conversions clamp x to the representable range of the result type
+ * before converting.
+ *
+ * In C++ the conversion functions are templated in the simd:: namespace.
+ *
+ *   C++ Function                        Equivalent C Function
+ *   -------------------------------------------------------------------
+ *   simd::convert<ScalarType>(x)        simd_ScalarType(x)
+ *   simd::convert_sat<ScalarType>(x)    simd_ScalarType_sat(x)
+ */
+
+#ifndef __SIMD_CONVERSION_HEADER__
+#define __SIMD_CONVERSION_HEADER__
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_types.h>
+#include <simd/common.h>
+#include <simd/logic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_int16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x);
+static simd_char8 SIMD_CFUNC
simd_char(simd_long8 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_short16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_float2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ushort2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x); +static 
simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 __x); +#define vector_char simd_char +#define vector_char_sat simd_char_sat + +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_long8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x); +static simd_uchar16 SIMD_CFUNC 
simd_uchar_sat(simd_char16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x); +#define vector_uchar simd_uchar +#define vector_uchar_sat simd_uchar_sat + +static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x); +static 
simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_char4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x); +static 
simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_long4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ulong8 __x); +#define vector_short simd_short +#define vector_short_sat simd_short_sat + +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x); +static simd_ushort2 SIMD_CFUNC 
simd_ushort(simd_ushort2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_short32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_float3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x); +static simd_ushort3 
SIMD_CFUNC simd_ushort_sat(simd_long3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x); +#define vector_ushort simd_ushort +#define vector_ushort_sat simd_ushort_sat + +static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_uchar8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x); +static 
simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_double3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_double2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uchar3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uchar8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x); +static simd_int16 SIMD_CFUNC 
simd_int_sat(simd_uint16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x); +static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x); +#define vector_int simd_int +#define vector_int_sat simd_int_sat + +static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_float16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_long3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_ulong4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x); 
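/* Orientation for the conversion families repeated below for every element
 * type (an illustrative sketch, not part of the vendored Apple header):
 * the plain simd_TYPE(x) conversions truncate like C casts, with integer
 * narrowing keeping the low bits of each lane; the _sat variants clamp to
 * the destination range before narrowing; and, per the _rte naming, the
 * _rte variants round floating-point lanes to nearest-even first.
 *
 *   #include <simd/simd.h>
 *   simd_int2   x    = { 70000, -70000 };
 *   simd_short2 wrap = simd_short(x);     // { 4464, -4464 }: low 16 bits
 *   simd_short2 sat  = simd_short_sat(x); // { 32767, -32768 }: clamped
 */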
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x); +#define vector_uint simd_uint +#define vector_uint_sat simd_uint_sat + +static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x); +static simd_float4 SIMD_CFUNC 
simd_float(simd_short4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_float8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x); +#define vector_float simd_float + +static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_short8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_int8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x); +static 
simd_long3 SIMD_CFUNC simd_long(simd_float3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_float4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_ushort4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x); +static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x); +static simd_long8 SIMD_CFUNC 
simd_long_rte(simd_double8 __x); +#define vector_long simd_long +#define vector_long_sat simd_long_sat + +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_long2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x); +static simd_ulong2 SIMD_CFUNC 
simd_ulong_sat(simd_long2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x); +#define vector_ulong simd_ulong +#define vector_ulong_sat simd_ulong_sat + +static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_int8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x); 
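/* A common round-trip these conversions support (an illustrative sketch,
 * not from the header itself): widen integer lanes to floating point, do
 * the arithmetic, then narrow back with saturation so out-of-range results
 * clamp to the lane bounds instead of wrapping.
 *
 *   #include <simd/simd.h>
 *   simd_uchar4 px  = { 0, 128, 200, 255 };
 *   simd_float4 f   = simd_float(px) * 1.5f; // { 0, 192, 300, 382.5 }
 *   simd_uchar4 out = simd_uchar_sat(f);     // { 0, 192, 255, 255 }
 */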
+static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x); +#define vector_double simd_double + +static simd_char2 SIMD_CFUNC vector2(char __x, char __y) { return ( simd_char2){__x, __y}; } +static simd_uchar2 SIMD_CFUNC vector2(unsigned char __x, unsigned char __y) { return ( simd_uchar2){__x, __y}; } +static simd_short2 SIMD_CFUNC vector2(short __x, short __y) { return ( simd_short2){__x, __y}; } +static simd_ushort2 SIMD_CFUNC vector2(unsigned short __x, unsigned short __y) { return (simd_ushort2){__x, __y}; } +static simd_int2 SIMD_CFUNC vector2(int __x, int __y) { return ( simd_int2){__x, __y}; } +static simd_uint2 SIMD_CFUNC vector2(unsigned int __x, unsigned int __y) { return ( simd_uint2){__x, __y}; } +static simd_float2 SIMD_CFUNC vector2(float __x, float __y) { return ( simd_float2){__x, __y}; } +static simd_long2 SIMD_CFUNC vector2(simd_long1 __x, simd_long1 __y) { return ( simd_long2){__x, __y}; } +static simd_ulong2 SIMD_CFUNC vector2(simd_ulong1 __x, simd_ulong1 __y) { return ( simd_ulong2){__x, __y}; } +static simd_double2 SIMD_CFUNC vector2(double __x, double __y) { return (simd_double2){__x, __y}; } + +static simd_char3 SIMD_CFUNC vector3(char __x, char __y, char __z) { return ( simd_char3){__x, __y, __z}; } +static simd_uchar3 SIMD_CFUNC vector3(unsigned char __x, unsigned char __y, unsigned char __z) { return ( simd_uchar3){__x, __y, __z}; } +static simd_short3 SIMD_CFUNC vector3(short __x, short __y, short __z) { return ( simd_short3){__x, __y, __z}; } +static simd_ushort3 SIMD_CFUNC vector3(unsigned short __x, unsigned short __y, unsigned short __z) { return (simd_ushort3){__x, __y, __z}; } +static simd_int3 SIMD_CFUNC vector3(int __x, int __y, int __z) { return ( simd_int3){__x, __y, __z}; } +static simd_uint3 SIMD_CFUNC vector3(unsigned int __x, unsigned int __y, unsigned int __z) { return ( simd_uint3){__x, __y, __z}; } +static simd_float3 SIMD_CFUNC vector3(float __x, float __y, float __z) { return ( simd_float3){__x, __y, __z}; } +static simd_long3 SIMD_CFUNC vector3(simd_long1 __x, simd_long1 __y, simd_long1 __z) { return ( simd_long3){__x, __y, __z}; } +static simd_ulong3 SIMD_CFUNC vector3(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z) { return ( simd_ulong3){__x, __y, __z}; } +static simd_double3 SIMD_CFUNC vector3(double __x, double __y, double __z) { return (simd_double3){__x, __y, __z}; } + +static simd_char3 SIMD_CFUNC vector3(simd_char2 __xy, char __z) { simd_char3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_uchar3 SIMD_CFUNC vector3(simd_uchar2 __xy, unsigned char __z) { simd_uchar3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_short3 SIMD_CFUNC vector3(simd_short2 __xy, short __z) { simd_short3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_ushort3 SIMD_CFUNC vector3(simd_ushort2 __xy, unsigned short __z) { simd_ushort3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_int3 SIMD_CFUNC vector3(simd_int2 __xy, int __z) { simd_int3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_uint3 SIMD_CFUNC vector3(simd_uint2 __xy, unsigned int __z) { simd_uint3 __r; 
__r.xy = __xy; __r.z = __z; return __r; } +static simd_float3 SIMD_CFUNC vector3(simd_float2 __xy, float __z) { simd_float3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_long3 SIMD_CFUNC vector3(simd_long2 __xy, simd_long1 __z) { simd_long3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_ulong3 SIMD_CFUNC vector3(simd_ulong2 __xy, simd_ulong1 __z) { simd_ulong3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_double3 SIMD_CFUNC vector3(simd_double2 __xy, double __z) { simd_double3 __r; __r.xy = __xy; __r.z = __z; return __r; } + +static simd_char4 SIMD_CFUNC vector4(char __x, char __y, char __z, char __w) { return ( simd_char4){__x, __y, __z, __w}; } +static simd_uchar4 SIMD_CFUNC vector4(unsigned char __x, unsigned char __y, unsigned char __z, unsigned char __w) { return ( simd_uchar4){__x, __y, __z, __w}; } +static simd_short4 SIMD_CFUNC vector4(short __x, short __y, short __z, short __w) { return ( simd_short4){__x, __y, __z, __w}; } +static simd_ushort4 SIMD_CFUNC vector4(unsigned short __x, unsigned short __y, unsigned short __z, unsigned short __w) { return (simd_ushort4){__x, __y, __z, __w}; } +static simd_int4 SIMD_CFUNC vector4(int __x, int __y, int __z, int __w) { return ( simd_int4){__x, __y, __z, __w}; } +static simd_uint4 SIMD_CFUNC vector4(unsigned int __x, unsigned int __y, unsigned int __z, unsigned int __w) { return ( simd_uint4){__x, __y, __z, __w}; } +static simd_float4 SIMD_CFUNC vector4(float __x, float __y, float __z, float __w) { return ( simd_float4){__x, __y, __z, __w}; } +static simd_long4 SIMD_CFUNC vector4(simd_long1 __x, simd_long1 __y, simd_long1 __z, simd_long1 __w) { return ( simd_long4){__x, __y, __z, __w}; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z, simd_ulong1 __w) { return ( simd_ulong4){__x, __y, __z, __w}; } +static simd_double4 SIMD_CFUNC vector4(double __x, double __y, double __z, double __w) { return (simd_double4){__x, __y, __z, __w}; } + +static simd_char4 SIMD_CFUNC vector4(simd_char2 __xy, simd_char2 __zw) { simd_char4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_uchar4 SIMD_CFUNC vector4(simd_uchar2 __xy, simd_uchar2 __zw) { simd_uchar4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_short4 SIMD_CFUNC vector4(simd_short2 __xy, simd_short2 __zw) { simd_short4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_ushort4 SIMD_CFUNC vector4(simd_ushort2 __xy, simd_ushort2 __zw) { simd_ushort4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_int4 SIMD_CFUNC vector4(simd_int2 __xy, simd_int2 __zw) { simd_int4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_uint4 SIMD_CFUNC vector4(simd_uint2 __xy, simd_uint2 __zw) { simd_uint4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_float4 SIMD_CFUNC vector4(simd_float2 __xy, simd_float2 __zw) { simd_float4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_long4 SIMD_CFUNC vector4(simd_long2 __xy, simd_long2 __zw) { simd_long4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong2 __xy, simd_ulong2 __zw) { simd_ulong4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_double4 SIMD_CFUNC vector4(simd_double2 __xy, simd_double2 __zw) { simd_double4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } + +static simd_char4 SIMD_CFUNC vector4(simd_char3 __xyz, char __w) { simd_char4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_uchar4 SIMD_CFUNC vector4(simd_uchar3 __xyz, 
unsigned char __w) { simd_uchar4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_short4 SIMD_CFUNC vector4(simd_short3 __xyz, short __w) { simd_short4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_ushort4 SIMD_CFUNC vector4(simd_ushort3 __xyz, unsigned short __w) { simd_ushort4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_int4 SIMD_CFUNC vector4(simd_int3 __xyz, int __w) { simd_int4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_uint4 SIMD_CFUNC vector4(simd_uint3 __xyz, unsigned int __w) { simd_uint4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_float4 SIMD_CFUNC vector4(simd_float3 __xyz, float __w) { simd_float4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_long4 SIMD_CFUNC vector4(simd_long3 __xyz, simd_long1 __w) { simd_long4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong3 __xyz, simd_ulong1 __w) { simd_ulong4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_double4 SIMD_CFUNC vector4(simd_double3 __xyz, double __w) { simd_double4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } + +static simd_char8 SIMD_CFUNC vector8(simd_char4 __lo, simd_char4 __hi) { simd_char8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar8 SIMD_CFUNC vector8(simd_uchar4 __lo, simd_uchar4 __hi) { simd_uchar8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short8 SIMD_CFUNC vector8(simd_short4 __lo, simd_short4 __hi) { simd_short8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort8 SIMD_CFUNC vector8(simd_ushort4 __lo, simd_ushort4 __hi) { simd_ushort8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_int8 SIMD_CFUNC vector8(simd_int4 __lo, simd_int4 __hi) { simd_int8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uint8 SIMD_CFUNC vector8(simd_uint4 __lo, simd_uint4 __hi) { simd_uint8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_float8 SIMD_CFUNC vector8(simd_float4 __lo, simd_float4 __hi) { simd_float8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_long8 SIMD_CFUNC vector8(simd_long4 __lo, simd_long4 __hi) { simd_long8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ulong8 SIMD_CFUNC vector8(simd_ulong4 __lo, simd_ulong4 __hi) { simd_ulong8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_double8 SIMD_CFUNC vector8(simd_double4 __lo, simd_double4 __hi) { simd_double8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +static simd_char16 SIMD_CFUNC vector16(simd_char8 __lo, simd_char8 __hi) { simd_char16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar16 SIMD_CFUNC vector16(simd_uchar8 __lo, simd_uchar8 __hi) { simd_uchar16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short16 SIMD_CFUNC vector16(simd_short8 __lo, simd_short8 __hi) { simd_short16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort16 SIMD_CFUNC vector16(simd_ushort8 __lo, simd_ushort8 __hi) { simd_ushort16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_int16 SIMD_CFUNC vector16(simd_int8 __lo, simd_int8 __hi) { simd_int16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uint16 SIMD_CFUNC vector16(simd_uint8 __lo, simd_uint8 __hi) { simd_uint16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_float16 SIMD_CFUNC vector16(simd_float8 __lo, simd_float8 __hi) { simd_float16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +static simd_char32 SIMD_CFUNC 
vector32(simd_char16 __lo, simd_char16 __hi) { simd_char32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar32 SIMD_CFUNC vector32(simd_uchar16 __lo, simd_uchar16 __hi) { simd_uchar32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short32 SIMD_CFUNC vector32(simd_short16 __lo, simd_short16 __hi) { simd_short32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort32 SIMD_CFUNC vector32(simd_ushort16 __lo, simd_ushort16 __hi) { simd_ushort32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +#pragma mark - Implementation + +static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x) { return __x; } +static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x) { return __x; } +static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x) { return __x; } +static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x) { return __x; } +static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x) { return __x; } +static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x) { return __x; } +static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x) { return (simd_char2)__x; } +static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x) { return (simd_char3)__x; } +static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x) { return (simd_char4)__x; } +static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x) { return (simd_char8)__x; } +static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x) { return (simd_char16)__x; } +static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x) { return (simd_char32)__x; } +static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x) { return __builtin_convertvector(__x & 0xff, simd_char2); } +static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x) { return __builtin_convertvector(__x & 0xff, simd_char3); } +static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x) { return __builtin_convertvector(__x & 0xff, simd_char4); } +static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x) { return __builtin_convertvector(__x & 0xff, simd_char8); } +static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x) { return __builtin_convertvector(__x & 0xff, simd_char16); } +static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x) { return __builtin_convertvector(__x & 0xff, simd_char32); } +static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x) { return simd_char(simd_short(__x)); } +static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_int16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x) { return 
simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_long8 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x) { return simd_char(simd_short(__x)); } + +static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x) { return __x; } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x) { return __x; } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x) { return __x; } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x) { return __x; } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x) { return __x; } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x) { return __x; } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_short16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 
SIMD_CFUNC simd_char_sat(simd_float2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ushort2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 
__x) { return simd_char(simd_min(__x,0x7f)); } + + +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x) { return (simd_uchar2)__x; } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x) { return (simd_uchar3)__x; } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x) { return (simd_uchar4)__x; } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x) { return (simd_uchar8)__x; } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x) { return (simd_uchar16)__x; } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x) { return (simd_uchar32)__x; } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x) { return __x; } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x) { return __x; } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x) { return __x; } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x) { return __x; } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x) { return __x; } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x) { return __x; } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x) { return 
simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_long8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x) { return simd_uchar(simd_char(__x)); } + +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_char16 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x) { return 
simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x) { return __x; } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x) { return __x; } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x) { return __x; } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x) { return __x; } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x) { return __x; } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x) { return __x; } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x) { return simd_uchar(simd_min(__x,0xff)); } + + +static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x) { return __builtin_convertvector(__x, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x) { return __builtin_convertvector(__x, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x) { return __builtin_convertvector(__x, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x) { return __builtin_convertvector(__x, simd_short8); 
} +static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x) { return __builtin_convertvector(__x, simd_short16); } +static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x) { return __builtin_convertvector(__x, simd_short32); } +static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_short8); } +static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_short16); } +static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x) { return __builtin_convertvector(__x, simd_short32); } +static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x) { return __x; } +static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x) { return __x; } +static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x) { return __x; } +static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x) { return __x; } +static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x) { return __x; } +static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x) { return __x; } +static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x) { return (simd_short2)__x; } +static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x) { return (simd_short3)__x; } +static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x) { return (simd_short4)__x; } +static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x) { return (simd_short8)__x; } +static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x) { return (simd_short16)__x; } +static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x) { return (simd_short32)__x; } +static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x) { return __builtin_convertvector(__x & 0xffff, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x) { return __builtin_convertvector(__x & 0xffff, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x) { return __builtin_convertvector(__x & 0xffff, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x) { return __builtin_convertvector(__x & 0xffff, simd_short8); } +static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x) { return __builtin_convertvector(__x & 0xffff, simd_short16); } +static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x) { return simd_short(simd_int(__x)); } +static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x) { return simd_short(simd_int(__x)); } +static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x) { return 
simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x) { return simd_short(simd_int(__x)); } + +static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x) { return simd_short(__x); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x) { return simd_short(__x); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_char4 __x) { return simd_short(__x); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x) { return simd_short(__x); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x) { return simd_short(__x); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x) { return simd_short(__x); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x) { return __x; } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x) { return __x; } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x) { return __x; } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x) { return __x; } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x) { return __x; } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x) { return __x; } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 
SIMD_CFUNC simd_short_sat(simd_long4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x) { return simd_short(__x); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x) { return simd_short(__x); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x) { return simd_short(__x); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x) { return simd_short(__x); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x) { return simd_short(__x); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x) { return simd_short(__x); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ulong8 __x) { return simd_short(simd_min(__x,0x7fff)); } + + +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x) { return 
simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x) { return (simd_ushort2)__x; } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x) { return (simd_ushort3)__x; } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x) { return (simd_ushort4)__x; } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x) { return (simd_ushort8)__x; } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x) { return (simd_ushort16)__x; } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x) { return (simd_ushort32)__x; } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ushort2 __x) { return __x; } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x) { return __x; } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x) { return __x; } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x) { return __x; } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x) { return __x; } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x) { return __x; } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 
__x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x) { return simd_ushort(simd_short(__x)); } + +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_short32 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_float3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_long3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x) { return simd_ushort(simd_clamp(__x, 0, 
0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x) { return simd_ushort(__x); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x) { return simd_ushort(__x); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x) { return simd_ushort(__x); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x) { return simd_ushort(__x); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x) { return simd_ushort(__x); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x) { return simd_ushort(__x); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x) { return __x; } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x) { return __x; } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x) { return __x; } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x) { return __x; } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x) { return __x; } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x) { return __x; } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x) { return simd_ushort(simd_min(__x, 0xffff)); } + + +static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 
SIMD_CFUNC simd_int(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x) { return __x; } +static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x) { return __x; } +static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x) { return __x; } +static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x) { return __x; } +static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x) { return __x; } +static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x) { return (simd_int2)__x; } +static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x) { return (simd_int3)__x; } +static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x) { return (simd_int4)__x; } +static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x) { return (simd_int8)__x; } +static simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x) { return (simd_int16)__x; } +static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int8); } +static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x) { return simd_int(simd_long(__x)); } +static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x) { return simd_int(simd_long(__x)); } +static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x) { return simd_int(simd_long(__x)); } +static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x) { return simd_int(simd_long(__x)); } +static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC 
simd_int(simd_double3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x) { return __builtin_convertvector(__x, simd_int8); } + +static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x) { return __x; } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x) { return __x; } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x) { return __x; } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x) { return __x; } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x) { return __x; } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_double2 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uchar3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC 
simd_int_sat(simd_uchar8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x) { return simd_int(__x); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x) { return simd_int(__x); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x) { return simd_int(__x); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uint16 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+
+static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x) {
+#if defined __arm64__
+  return vcvtn_s32_f32(__x);
+#else
+  return simd_make_int2(simd_int_rte(simd_make_float4_undef(__x)));
+#endif
+}
+
+static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x) {
+  return simd_make_int3(simd_int_rte(simd_make_float4_undef(__x)));
+}
+
+static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x) {
+#if defined __SSE2__
+  return _mm_cvtps_epi32(__x);
+#elif defined __arm64__
+  return vcvtnq_s32_f32(__x);
+#else
+  simd_float4 magic = __tg_copysign(0x1.0p23, __x);
+  simd_int4 x_is_small = __tg_fabs(__x) < 0x1.0p23;
+  return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffff), simd_int4);
+#endif
+}
+
+static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x) {
+#if defined __AVX__
+  return _mm256_cvtps_epi32(__x);
+#else
+  return simd_make_int8(simd_int_rte(__x.lo), simd_int_rte(__x.hi));
+#endif
+}
+
+static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x) {
+#if defined __AVX512F__
+  return _mm512_cvt_roundps_epi32(__x, _MM_FROUND_RINT);
+#else
+  return simd_make_int16(simd_int_rte(__x.lo), simd_int_rte(__x.hi));
+#endif
+}
+
+static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x) { return simd_uint(simd_int(__x)); }
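The generic fallback in simd_int_rte above is the classic magic-number rounding trick. A minimal scalar sketch of the same idea, for reference only (int_rte_scalar is an illustrative name, not part of this header):

#include <math.h>
#include <stdint.h>

/* Round float -> int32 to nearest, ties to even, without touching the FP
 * rounding mode: because float has a 24-bit significand, (x + 2^23) - 2^23
 * makes the hardware round x to an integer in the default
 * round-to-nearest-even mode. Values with |x| >= 2^23 are already integers
 * and must pass through unchanged, which is what the lane-wise bitselect on
 * x_is_small does in the vector code above. */
static int32_t int_rte_scalar(float x) {
    if (fabsf(x) < 0x1.0p23f) {
        float magic = copysignf(0x1.0p23f, x);
        x = (x + magic) - magic;
    }
    return (int32_t)x; /* x is integral now, so truncation is exact */
}

For example, int_rte_scalar(2.5f) == 2 and int_rte_scalar(3.5f) == 4, matching simd_int_rte lane for lane.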
+static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x) { return (simd_uint2)__x; } +static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x) { return (simd_uint3)__x; } +static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x) { return (simd_uint4)__x; } +static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x) { return (simd_uint8)__x; } +static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x) { return (simd_uint16)__x; } +static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x) { return __x; } +static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x) { return __x; } +static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x) { return __x; } +static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x) { return __x; } +static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x) { return __x; } +static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x) { simd_int2 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float2)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint2)0,0x80000000,__big); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x) { simd_int3 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float3)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint3)0,0x80000000,__big); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x) { simd_int4 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float4)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint4)0,0x80000000,__big); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x) { simd_int8 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float8)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint8)0,0x80000000,__big); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_float16 __x) { simd_int16 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float16)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint16)0,0x80000000,__big); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_long3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC 
simd_uint(simd_ulong4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x) { simd_long2 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double2)0,0x1.0p31,__big))) + simd_bitselect((simd_uint2)0,0x80000000,simd_int(__big)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x) { simd_long3 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double3)0,0x1.0p31,__big))) + simd_bitselect((simd_uint3)0,0x80000000,simd_int(__big)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x) { simd_long4 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double4)0,0x1.0p31,__big))) + simd_bitselect((simd_uint4)0,0x80000000,simd_int(__big)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x) { simd_long8 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double8)0,0x1.0p31,__big))) + simd_bitselect((simd_uint8)0,0x80000000,simd_int(__big)); } + +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x) { return 
simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x) { return simd_uint(__x); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x) { return simd_uint(__x); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x) { return simd_uint(__x); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x) { return simd_uint(__x); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x) { return simd_uint(__x); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x) { return simd_uint(__x); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x) { return simd_uint(__x); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x) { return simd_uint(__x); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x) { return simd_uint(__x); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x) { return simd_uint(__x); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x) { return __x; } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x) { return __x; } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x) { return __x; } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x) { return __x; } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x) { return __x; } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } + + +static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x) { return (simd_float8)(simd_int(__x) + 
0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_short4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x) { return __builtin_convertvector(__x,simd_float16); } +static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x) { return __builtin_convertvector(__x,simd_float16); } +static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x) { return __x; } +static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x) { return __x; } +static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x) { return __x; } +static simd_float8 SIMD_CFUNC simd_float(simd_float8 __x) { return __x; } +static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x) { return __x; } +static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x) { return 
__builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x) { return __builtin_convertvector(__x,simd_float8); } + + +static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_short8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_int8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_float3 __x) { return __builtin_convertvector(__x,simd_long3); } +static 
simd_long4 SIMD_CFUNC simd_long(simd_float4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x) { return __x; } +static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x) { return __x; } +static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x) { return __x; } +static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x) { return __x; } +static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x) { return (simd_long2)__x; } +static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x) { return (simd_long3)__x; } +static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x) { return (simd_long4)__x; } +static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x) { return (simd_long8)__x; } +static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x) { return __builtin_convertvector(__x,simd_long8); } + +static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x) { return __x; } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x) { return __x; } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x) { return __x; } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x) { return __x; } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x) { 
return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ushort4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+
+static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x) {
+#if defined __AVX512F__
+  return _mm_cvtpd_epi64(__x);
+#elif defined __arm64__
+  return vcvtnq_s64_f64(__x);
+#else
+  simd_double2 magic = __tg_copysign(0x1.0p52, __x);
+  simd_long2 x_is_small = __tg_fabs(__x) < 0x1.0p52;
+  return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffffffffffff), simd_long2);
+#endif
+}
+
+static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x) {
+  return simd_make_long3(simd_long_rte(simd_make_double4_undef(__x)));
+}
+
+static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x) {
+#if defined __AVX512F__
+  return _mm256_cvtpd_epi64(__x);
+#else
+  return simd_make_long4(simd_long_rte(__x.lo), simd_long_rte(__x.hi));
+#endif
+}
+
+static simd_long8 SIMD_CFUNC simd_long_rte(simd_double8 __x) {
+#if defined __AVX512F__
+  return _mm512_cvt_roundpd_epi64(__x, _MM_FROUND_RINT);
+#else
+  return simd_make_long8(simd_long_rte(__x.lo), simd_long_rte(__x.hi));
+#endif
+}
+
+
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x) { return simd_ulong(simd_long(__x)); }
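For reference, this is what the saturating conversions buy over the plain ones, as a small usage sketch (sat_demo is an illustrative name; assumes the <simd/simd.h> umbrella header that ships these overloads):

#include <simd/simd.h>

/* simd_long truncates, and like any C float-to-integer conversion it is
 * undefined for out-of-range input; simd_long_sat instead pins each
 * out-of-range lane to the destination type's minimum or maximum. */
static void sat_demo(void) {
    simd_double2 v = simd_make_double2(1.0e20, -1.0e20);
    simd_long2 pinned = simd_long_sat(v); /* { INT64_MAX, INT64_MIN } */
    (void)pinned;
}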
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x) { simd_int2 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float2)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x) { simd_int3 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float3)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x) { simd_int4 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float4)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x) { simd_int8 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float8)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_long2 __x) { return (simd_ulong2)__x; } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x) { return (simd_ulong3)__x; } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x) { return (simd_ulong4)__x; } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x) { return (simd_ulong8)__x; } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x) { return __x; } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x) { return __x; } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x) { return __x; } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x) { return __x; } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x) { simd_long2 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - 
simd_bitselect((simd_double2)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,__big); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x) { simd_long3 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double3)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,__big); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x) { simd_long4 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double4)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,__big); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x) { simd_long8 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double8)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,__big); } + +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_long2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x) { return 
simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x) { return __x; } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x) { return __x; } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x) { return __x; } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x) { return __x; } + + +static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x) { return 
__builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_int8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x) { return __builtin_convertvector(__x, simd_double8); }
+
+
+#ifdef __cplusplus
+} // extern "C"
+
+namespace simd {
+
+#if __has_feature(cxx_constexpr)
+/*! @abstract Convert a vector to another vector of the ScalarType and the same number of elements. */
+template <typename ScalarType, typename typeN>
+static constexpr Vector_t<ScalarType, traits<typeN>::count> convert(typeN vector)
+{
+  if constexpr (traits<typeN>::count == 1)
+    return static_cast<Vector_t<ScalarType, traits<typeN>::count>>(vector);
+  else if constexpr (std::is_same<ScalarType, char>::value)
+    return simd_char(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned char>::value)
+    return simd_uchar(vector);
+  else if constexpr (std::is_same<ScalarType, short>::value)
+    return simd_short(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned short>::value)
+    return simd_ushort(vector);
+  else if constexpr (std::is_same<ScalarType, int>::value)
+    return simd_int(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned int>::value)
+    return simd_uint(vector);
+  else if constexpr (std::is_same<ScalarType, long>::value)
+    return simd_long(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned long>::value)
+    return simd_ulong(vector);
+  else if constexpr (std::is_same<ScalarType, float>::value)
+    return simd_float(vector);
+  else if constexpr (std::is_same<ScalarType, double>::value)
+    return simd_double(vector);
+}
+
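+/* Illustrative sketch (values hypothetical; assumes C++17 and
+ * <simd/simd.h>): simd::convert maps a vector to the same-length vector
+ * of the requested scalar type, dispatching to the C functions above.
+ *
+ *   simd_float4 f = {1.5f, -2.5f, 3.5f, 4.5f};
+ *   simd_int4 i = simd::convert<int>(f);        // {1, -2, 3, 4}, truncated
+ *   simd_double4 d = simd::convert<double>(i);
+ */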
+/*! @abstract Convert a vector to another vector of the ScalarType and the same number of elements with saturation.
+ * @discussion When the input value is too large to be represented in the return type, the input value
+ * will be saturated to the maximum value of the return type. */
+template <typename ScalarType, typename typeN>
+static constexpr Vector_t<ScalarType, traits<typeN>::count> convert_sat(typeN vector)
+{
+  static_assert(traits<typeN>::count != 1);
+  if constexpr (std::is_same<ScalarType, char>::value)
+    return simd_char_sat(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned char>::value)
+    return simd_uchar_sat(vector);
+  else if constexpr (std::is_same<ScalarType, short>::value)
+    return simd_short_sat(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned short>::value)
+    return simd_ushort_sat(vector);
+  else if constexpr (std::is_same<ScalarType, int>::value)
+    return simd_int_sat(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned int>::value)
+    return simd_uint_sat(vector);
+  else if constexpr (std::is_same<ScalarType, long>::value)
+    return simd_long_sat(vector);
+  else if constexpr (std::is_same<ScalarType, unsigned long>::value)
+    return simd_ulong_sat(vector);
+  else
+    return convert<ScalarType>(vector);
+}
+#endif /* __has_feature(cxx_constexpr) */
+
+} /* namespace simd */
+#endif // __cplusplus
+#endif // SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#endif // __SIMD_CONVERSION_HEADER__
+
diff --git a/vfsoverlay/extern.h b/vfsoverlay/extern.h
new file mode 100644
index 00000000..b4b6b8f5
--- /dev/null
+++ b/vfsoverlay/extern.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2014 Apple, Inc. All rights reserved. */
+
+#ifndef __SIMD_EXTERN_HEADER__
+#define __SIMD_EXTERN_HEADER__
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma mark - geometry
+#if SIMD_LIBRARY_VERSION >= 2
+extern float _simd_orient_vf2(simd_float2, simd_float2);
+extern float _simd_orient_pf2(simd_float2, simd_float2, simd_float2);
+extern float _simd_incircle_pf2(simd_float2, simd_float2, simd_float2, simd_float2);
+
+extern float _simd_orient_vf3(simd_float3, simd_float3, simd_float3);
+extern float _simd_orient_pf3(simd_float3, simd_float3, simd_float3, simd_float3);
+extern float _simd_insphere_pf3(simd_float3, simd_float3, simd_float3, simd_float3, simd_float3);
+
+extern double _simd_orient_vd2(simd_double2, simd_double2);
+extern double _simd_orient_pd2(simd_double2, simd_double2, simd_double2);
+extern double _simd_incircle_pd2(simd_double2, simd_double2, simd_double2, simd_double2);
+
+/* The double3 variants of these functions take their arguments in a buffer
+ * to work around the fact that double3 calling conventions are different
+ * depending on whether or not the executable has been compiled with AVX
+ * enabled. */
+extern double _simd_orient_vd3(const double *);
+extern double _simd_orient_pd3(const double *);
+extern double _simd_insphere_pd3(const double *);
+#endif /* SIMD_LIBRARY_VERSION */
+
+#pragma mark - matrix
+extern simd_float2x2 __invert_f2(simd_float2x2);
+extern simd_double2x2 __invert_d2(simd_double2x2);
+extern simd_float3x3 __invert_f3(simd_float3x3);
+extern simd_double3x3 __invert_d3(simd_double3x3);
+extern simd_float4x4 __invert_f4(simd_float4x4);
+extern simd_double4x4 __invert_d4(simd_double4x4);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* __SIMD_EXTERN_HEADER__ */
diff --git a/vfsoverlay/geometry.h b/vfsoverlay/geometry.h
new file mode 100644
index 00000000..83b5a380
--- /dev/null
+++ b/vfsoverlay/geometry.h
@@ -0,0 +1,1100 @@
+/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved.
+ * + * The interfaces declared in this header provide operations for mathematical + * vectors; these functions and macros operate on vectors of floating-point + * data only. + * + * Function Result + * ------------------------------------------------------------------ + * simd_dot(x,y) The dot product of x and y. + * + * simd_project(x,y) x projected onto y. There are two variants + * of this function, simd_precise_project + * and simd_fast_project. simd_project + * is equivalent to simd_precise_project + * unless you are compiling with -ffast-math + * specified, in which case it is equivalent + * to simd_fast_project. + * + * simd_length(x) The length (two-norm) of x. Undefined if + * x is poorly scaled such that an + * intermediate computation overflows or + * underflows. There are two variants + * of this function, simd_precise_length + * and simd_fast_length. simd_length + * is equivalent to simd_precise_length + * unless you are compiling with -ffast-math + * specified, in which case it is equivalent + * to simd_fast_length. + * + * simd_length_squared(x) The square of the length of x. If you + * simply need to compare relative magnitudes, + * use this instead of simd_length; it is + * faster than simd_fast_length and as + * accurate as simd_precise_length. + * + * simd_norm_one(x) The one-norm (sum of absolute values) of x. + * + * simd_norm_inf(x) The inf-norm (max absolute value) of x. + * + * simd_distance(x,y) The distance between x and y. Undefined if + * x and y are poorly scaled such that an + * intermediate computation overflows + * or underflows. There are two variants + * of this function, simd_precise_distance + * and simd_fast_distance. simd_distance + * is equivalent to simd_precise_distance + * unless you are compiling with -ffast-math + * specified, in which case it is equivalent + * to simd_fast_distance. + * + * simd_distance_squared(x,y) The square of the distance between x and y. + * + * simd_normalize(x) A vector pointing in the direction of x + * with length 1.0. Undefined if x is + * the zero vector, or if x is poorly scaled + * such that an intermediate computation + * overflows or underflows. There are two + * variants of this function, + * simd_precise_normalize and + * simd_fast_normalize. simd_normalize + * is equivalent to simd_precise_normalize + * unless you are compiling with -ffast-math + * specified, in which case it is equivalent + * to simd_fast_normalize. + * + * simd_cross(x,y) If x and y are vectors of dimension 3, + * the cross-product of x and y. + * + * If x and y are vectors of dimension 2, + * the cross-product of x and y interpreted as + * vectors in the z == 0 plane of a three- + * dimensional space. + * + * If x and y are vectors with a length that + * is neither 2 nor 3, this operation is not + * available. + * + * simd_reflect(x,n) Reflects x through the plane perpendicular + * to the normal vector n. Only available + * for vectors of length 2, 3, or 4. + * + * simd_refract(x,n,eta) Calculates the refraction direction given + * unit incident vector x, unit normal vector + * n, and index of refraction eta. If the + * angle between the incident vector and the + * surface normal is too great for the + * specified index of refraction, zero is + * returned. + * Available for vectors of length 2, 3, or 4. + * + * simd_orient(x,y,...) Return a positive value if the origin and + * their ordered arguments determine a positively + * oriented parallelepiped, zero if it is degenerate, + * and a negative value if it is negatively oriented. 
+ *
+ * In C++ the following geometric functions are available in the simd::
+ * namespace:
+ *
+ *      C++ Function                    Equivalent C Function
+ *      -----------------------------------------------------------
+ *      simd::dot(x,y)                  simd_dot(x,y)
+ *      simd::project(x,y)              simd_project(x,y)
+ *      simd::length_squared(x)         simd_length_squared(x)
+ *      simd::length(x)                 simd_length(x)
+ *      simd::distance_squared(x,y)     simd_distance_squared(x,y)
+ *      simd::norm_one(x)               simd_norm_one(x)
+ *      simd::norm_inf(x)               simd_norm_inf(x)
+ *      simd::distance(x,y)             simd_distance(x,y)
+ *      simd::normalize(x)              simd_normalize(x)
+ *      simd::cross(x,y)                simd_cross(x,y)
+ *      simd::reflect(x,n)              simd_reflect(x,n)
+ *      simd::refract(x,n,eta)          simd_refract(x,n,eta)
+ *      simd::orient(x,y,...)           simd_orient(x,y,...)
+ *
+ *      simd::precise::project(x,y)     simd_precise_project(x,y)
+ *      simd::precise::length(x)        simd_precise_length(x)
+ *      simd::precise::distance(x,y)    simd_precise_distance(x,y)
+ *      simd::precise::normalize(x)     simd_precise_normalize(x)
+ *
+ *      simd::fast::project(x,y)        simd_fast_project(x,y)
+ *      simd::fast::length(x)           simd_fast_length(x)
+ *      simd::fast::distance(x,y)       simd_fast_distance(x,y)
+ *      simd::fast::normalize(x)        simd_fast_normalize(x)
+ */
+
+#ifndef __SIMD_GEOMETRY_HEADER__
+#define __SIMD_GEOMETRY_HEADER__
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/types.h>
+#include <simd/common.h>
+#include <simd/extern.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static float SIMD_CFUNC simd_dot(simd_float2 __x, simd_float2 __y);
+static float SIMD_CFUNC simd_dot(simd_float3 __x, simd_float3 __y);
+static float SIMD_CFUNC simd_dot(simd_float4 __x, simd_float4 __y);
+static float SIMD_CFUNC simd_dot(simd_float8 __x, simd_float8 __y);
+static float SIMD_CFUNC simd_dot(simd_float16 __x, simd_float16 __y);
+static double SIMD_CFUNC simd_dot(simd_double2 __x, simd_double2 __y);
+static double SIMD_CFUNC simd_dot(simd_double3 __x, simd_double3 __y);
+static double SIMD_CFUNC simd_dot(simd_double4 __x, simd_double4 __y);
+static double SIMD_CFUNC simd_dot(simd_double8 __x, simd_double8 __y);
+#define vector_dot simd_dot
+
+static simd_float2 SIMD_CFUNC simd_precise_project(simd_float2 __x, simd_float2 __y);
+static simd_float3 SIMD_CFUNC simd_precise_project(simd_float3 __x, simd_float3 __y);
+static simd_float4 SIMD_CFUNC simd_precise_project(simd_float4 __x, simd_float4 __y);
+static simd_float8 SIMD_CFUNC simd_precise_project(simd_float8 __x, simd_float8 __y);
+static simd_float16 SIMD_CFUNC simd_precise_project(simd_float16 __x, simd_float16 __y);
+static simd_double2 SIMD_CFUNC simd_precise_project(simd_double2 __x, simd_double2 __y);
+static simd_double3 SIMD_CFUNC simd_precise_project(simd_double3 __x, simd_double3 __y);
+static simd_double4 SIMD_CFUNC simd_precise_project(simd_double4 __x, simd_double4 __y);
+static simd_double8 SIMD_CFUNC simd_precise_project(simd_double8 __x, simd_double8 __y);
+#define vector_precise_project simd_precise_project
+
+static simd_float2 SIMD_CFUNC simd_fast_project(simd_float2 __x, simd_float2 __y);
+static simd_float3 SIMD_CFUNC simd_fast_project(simd_float3 __x, simd_float3 __y);
+static simd_float4 SIMD_CFUNC simd_fast_project(simd_float4 __x, simd_float4 __y);
+static simd_float8 SIMD_CFUNC simd_fast_project(simd_float8 __x, simd_float8 __y);
+static simd_float16 SIMD_CFUNC simd_fast_project(simd_float16 __x, simd_float16 __y);
+static simd_double2 SIMD_CFUNC simd_fast_project(simd_double2 __x, simd_double2 __y);
+static simd_double3 SIMD_CFUNC simd_fast_project(simd_double3 __x, simd_double3 __y);
+static simd_double4 SIMD_CFUNC
simd_fast_project(simd_double4 __x, simd_double4 __y); +static simd_double8 SIMD_CFUNC simd_fast_project(simd_double8 __x, simd_double8 __y); +#define vector_fast_project simd_fast_project + +static simd_float2 SIMD_CFUNC simd_project(simd_float2 __x, simd_float2 __y); +static simd_float3 SIMD_CFUNC simd_project(simd_float3 __x, simd_float3 __y); +static simd_float4 SIMD_CFUNC simd_project(simd_float4 __x, simd_float4 __y); +static simd_float8 SIMD_CFUNC simd_project(simd_float8 __x, simd_float8 __y); +static simd_float16 SIMD_CFUNC simd_project(simd_float16 __x, simd_float16 __y); +static simd_double2 SIMD_CFUNC simd_project(simd_double2 __x, simd_double2 __y); +static simd_double3 SIMD_CFUNC simd_project(simd_double3 __x, simd_double3 __y); +static simd_double4 SIMD_CFUNC simd_project(simd_double4 __x, simd_double4 __y); +static simd_double8 SIMD_CFUNC simd_project(simd_double8 __x, simd_double8 __y); +#define vector_project simd_project + +static float SIMD_CFUNC simd_precise_length(simd_float2 __x); +static float SIMD_CFUNC simd_precise_length(simd_float3 __x); +static float SIMD_CFUNC simd_precise_length(simd_float4 __x); +static float SIMD_CFUNC simd_precise_length(simd_float8 __x); +static float SIMD_CFUNC simd_precise_length(simd_float16 __x); +static double SIMD_CFUNC simd_precise_length(simd_double2 __x); +static double SIMD_CFUNC simd_precise_length(simd_double3 __x); +static double SIMD_CFUNC simd_precise_length(simd_double4 __x); +static double SIMD_CFUNC simd_precise_length(simd_double8 __x); +#define vector_precise_length simd_precise_length + +static float SIMD_CFUNC simd_fast_length(simd_float2 __x); +static float SIMD_CFUNC simd_fast_length(simd_float3 __x); +static float SIMD_CFUNC simd_fast_length(simd_float4 __x); +static float SIMD_CFUNC simd_fast_length(simd_float8 __x); +static float SIMD_CFUNC simd_fast_length(simd_float16 __x); +static double SIMD_CFUNC simd_fast_length(simd_double2 __x); +static double SIMD_CFUNC simd_fast_length(simd_double3 __x); +static double SIMD_CFUNC simd_fast_length(simd_double4 __x); +static double SIMD_CFUNC simd_fast_length(simd_double8 __x); +#define vector_fast_length simd_fast_length + +static float SIMD_CFUNC simd_length(simd_float2 __x); +static float SIMD_CFUNC simd_length(simd_float3 __x); +static float SIMD_CFUNC simd_length(simd_float4 __x); +static float SIMD_CFUNC simd_length(simd_float8 __x); +static float SIMD_CFUNC simd_length(simd_float16 __x); +static double SIMD_CFUNC simd_length(simd_double2 __x); +static double SIMD_CFUNC simd_length(simd_double3 __x); +static double SIMD_CFUNC simd_length(simd_double4 __x); +static double SIMD_CFUNC simd_length(simd_double8 __x); +#define vector_length simd_length + +static float SIMD_CFUNC simd_length_squared(simd_float2 __x); +static float SIMD_CFUNC simd_length_squared(simd_float3 __x); +static float SIMD_CFUNC simd_length_squared(simd_float4 __x); +static float SIMD_CFUNC simd_length_squared(simd_float8 __x); +static float SIMD_CFUNC simd_length_squared(simd_float16 __x); +static double SIMD_CFUNC simd_length_squared(simd_double2 __x); +static double SIMD_CFUNC simd_length_squared(simd_double3 __x); +static double SIMD_CFUNC simd_length_squared(simd_double4 __x); +static double SIMD_CFUNC simd_length_squared(simd_double8 __x); +#define vector_length_squared simd_length_squared + +static float SIMD_CFUNC simd_norm_one(simd_float2 __x); +static float SIMD_CFUNC simd_norm_one(simd_float3 __x); +static float SIMD_CFUNC simd_norm_one(simd_float4 __x); +static float SIMD_CFUNC 
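+/* Illustrative sketch (values hypothetical): when only relative magnitudes
+ * matter, compare squared lengths and skip the square root, as the notes
+ * above recommend.
+ *
+ *   simd_float3 a = {1, 2, 2};    // length 3, length_squared 9
+ *   simd_float3 b = {0, 3, 4};    // length 5, length_squared 25
+ *   if (simd_length_squared(a) < simd_length_squared(b)) { ... }
+ */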
simd_norm_one(simd_float8 __x); +static float SIMD_CFUNC simd_norm_one(simd_float16 __x); +static double SIMD_CFUNC simd_norm_one(simd_double2 __x); +static double SIMD_CFUNC simd_norm_one(simd_double3 __x); +static double SIMD_CFUNC simd_norm_one(simd_double4 __x); +static double SIMD_CFUNC simd_norm_one(simd_double8 __x); +#define vector_norm_one simd_norm_one + +static float SIMD_CFUNC simd_norm_inf(simd_float2 __x); +static float SIMD_CFUNC simd_norm_inf(simd_float3 __x); +static float SIMD_CFUNC simd_norm_inf(simd_float4 __x); +static float SIMD_CFUNC simd_norm_inf(simd_float8 __x); +static float SIMD_CFUNC simd_norm_inf(simd_float16 __x); +static double SIMD_CFUNC simd_norm_inf(simd_double2 __x); +static double SIMD_CFUNC simd_norm_inf(simd_double3 __x); +static double SIMD_CFUNC simd_norm_inf(simd_double4 __x); +static double SIMD_CFUNC simd_norm_inf(simd_double8 __x); +#define vector_norm_inf simd_norm_inf + +static float SIMD_CFUNC simd_precise_distance(simd_float2 __x, simd_float2 __y); +static float SIMD_CFUNC simd_precise_distance(simd_float3 __x, simd_float3 __y); +static float SIMD_CFUNC simd_precise_distance(simd_float4 __x, simd_float4 __y); +static float SIMD_CFUNC simd_precise_distance(simd_float8 __x, simd_float8 __y); +static float SIMD_CFUNC simd_precise_distance(simd_float16 __x, simd_float16 __y); +static double SIMD_CFUNC simd_precise_distance(simd_double2 __x, simd_double2 __y); +static double SIMD_CFUNC simd_precise_distance(simd_double3 __x, simd_double3 __y); +static double SIMD_CFUNC simd_precise_distance(simd_double4 __x, simd_double4 __y); +static double SIMD_CFUNC simd_precise_distance(simd_double8 __x, simd_double8 __y); +#define vector_precise_distance simd_precise_distance + +static float SIMD_CFUNC simd_fast_distance(simd_float2 __x, simd_float2 __y); +static float SIMD_CFUNC simd_fast_distance(simd_float3 __x, simd_float3 __y); +static float SIMD_CFUNC simd_fast_distance(simd_float4 __x, simd_float4 __y); +static float SIMD_CFUNC simd_fast_distance(simd_float8 __x, simd_float8 __y); +static float SIMD_CFUNC simd_fast_distance(simd_float16 __x, simd_float16 __y); +static double SIMD_CFUNC simd_fast_distance(simd_double2 __x, simd_double2 __y); +static double SIMD_CFUNC simd_fast_distance(simd_double3 __x, simd_double3 __y); +static double SIMD_CFUNC simd_fast_distance(simd_double4 __x, simd_double4 __y); +static double SIMD_CFUNC simd_fast_distance(simd_double8 __x, simd_double8 __y); +#define vector_fast_distance simd_fast_distance + +static float SIMD_CFUNC simd_distance(simd_float2 __x, simd_float2 __y); +static float SIMD_CFUNC simd_distance(simd_float3 __x, simd_float3 __y); +static float SIMD_CFUNC simd_distance(simd_float4 __x, simd_float4 __y); +static float SIMD_CFUNC simd_distance(simd_float8 __x, simd_float8 __y); +static float SIMD_CFUNC simd_distance(simd_float16 __x, simd_float16 __y); +static double SIMD_CFUNC simd_distance(simd_double2 __x, simd_double2 __y); +static double SIMD_CFUNC simd_distance(simd_double3 __x, simd_double3 __y); +static double SIMD_CFUNC simd_distance(simd_double4 __x, simd_double4 __y); +static double SIMD_CFUNC simd_distance(simd_double8 __x, simd_double8 __y); +#define vector_distance simd_distance + +static float SIMD_CFUNC simd_distance_squared(simd_float2 __x, simd_float2 __y); +static float SIMD_CFUNC simd_distance_squared(simd_float3 __x, simd_float3 __y); +static float SIMD_CFUNC simd_distance_squared(simd_float4 __x, simd_float4 __y); +static float SIMD_CFUNC simd_distance_squared(simd_float8 __x, 
simd_float8 __y); +static float SIMD_CFUNC simd_distance_squared(simd_float16 __x, simd_float16 __y); +static double SIMD_CFUNC simd_distance_squared(simd_double2 __x, simd_double2 __y); +static double SIMD_CFUNC simd_distance_squared(simd_double3 __x, simd_double3 __y); +static double SIMD_CFUNC simd_distance_squared(simd_double4 __x, simd_double4 __y); +static double SIMD_CFUNC simd_distance_squared(simd_double8 __x, simd_double8 __y); +#define vector_distance_squared simd_distance_squared + +static simd_float2 SIMD_CFUNC simd_precise_normalize(simd_float2 __x); +static simd_float3 SIMD_CFUNC simd_precise_normalize(simd_float3 __x); +static simd_float4 SIMD_CFUNC simd_precise_normalize(simd_float4 __x); +static simd_float8 SIMD_CFUNC simd_precise_normalize(simd_float8 __x); +static simd_float16 SIMD_CFUNC simd_precise_normalize(simd_float16 __x); +static simd_double2 SIMD_CFUNC simd_precise_normalize(simd_double2 __x); +static simd_double3 SIMD_CFUNC simd_precise_normalize(simd_double3 __x); +static simd_double4 SIMD_CFUNC simd_precise_normalize(simd_double4 __x); +static simd_double8 SIMD_CFUNC simd_precise_normalize(simd_double8 __x); +#define vector_precise_normalize simd_precise_normalize + +static simd_float2 SIMD_CFUNC simd_fast_normalize(simd_float2 __x); +static simd_float3 SIMD_CFUNC simd_fast_normalize(simd_float3 __x); +static simd_float4 SIMD_CFUNC simd_fast_normalize(simd_float4 __x); +static simd_float8 SIMD_CFUNC simd_fast_normalize(simd_float8 __x); +static simd_float16 SIMD_CFUNC simd_fast_normalize(simd_float16 __x); +static simd_double2 SIMD_CFUNC simd_fast_normalize(simd_double2 __x); +static simd_double3 SIMD_CFUNC simd_fast_normalize(simd_double3 __x); +static simd_double4 SIMD_CFUNC simd_fast_normalize(simd_double4 __x); +static simd_double8 SIMD_CFUNC simd_fast_normalize(simd_double8 __x); +#define vector_fast_normalize simd_fast_normalize + +static simd_float2 SIMD_CFUNC simd_normalize(simd_float2 __x); +static simd_float3 SIMD_CFUNC simd_normalize(simd_float3 __x); +static simd_float4 SIMD_CFUNC simd_normalize(simd_float4 __x); +static simd_float8 SIMD_CFUNC simd_normalize(simd_float8 __x); +static simd_float16 SIMD_CFUNC simd_normalize(simd_float16 __x); +static simd_double2 SIMD_CFUNC simd_normalize(simd_double2 __x); +static simd_double3 SIMD_CFUNC simd_normalize(simd_double3 __x); +static simd_double4 SIMD_CFUNC simd_normalize(simd_double4 __x); +static simd_double8 SIMD_CFUNC simd_normalize(simd_double8 __x); +#define vector_normalize simd_normalize + +static simd_float3 SIMD_CFUNC simd_cross(simd_float2 __x, simd_float2 __y); +static simd_float3 SIMD_CFUNC simd_cross(simd_float3 __x, simd_float3 __y); +static simd_double3 SIMD_CFUNC simd_cross(simd_double2 __x, simd_double2 __y); +static simd_double3 SIMD_CFUNC simd_cross(simd_double3 __x, simd_double3 __y); +#define vector_cross simd_cross + +static simd_float2 SIMD_CFUNC simd_reflect(simd_float2 __x, simd_float2 __n); +static simd_float3 SIMD_CFUNC simd_reflect(simd_float3 __x, simd_float3 __n); +static simd_float4 SIMD_CFUNC simd_reflect(simd_float4 __x, simd_float4 __n); +static simd_double2 SIMD_CFUNC simd_reflect(simd_double2 __x, simd_double2 __n); +static simd_double3 SIMD_CFUNC simd_reflect(simd_double3 __x, simd_double3 __n); +static simd_double4 SIMD_CFUNC simd_reflect(simd_double4 __x, simd_double4 __n); +#define vector_reflect simd_reflect + +static simd_float2 SIMD_CFUNC simd_refract(simd_float2 __x, simd_float2 __n, float __eta); +static simd_float3 SIMD_CFUNC simd_refract(simd_float3 __x, 
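+/* Illustrative sketch (values hypothetical): for 2d arguments, simd_cross
+ * treats the inputs as lying in the z == 0 plane, so only the z component
+ * of the result can be non-zero.
+ *
+ *   simd_float2 x = {1, 0};
+ *   simd_float2 y = {0, 1};
+ *   simd_float3 z = simd_cross(x, y);    // {0, 0, 1}
+ */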
simd_float3 __n, float __eta);
+static simd_float4 SIMD_CFUNC simd_refract(simd_float4 __x, simd_float4 __n, float __eta);
+static simd_double2 SIMD_CFUNC simd_refract(simd_double2 __x, simd_double2 __n, double __eta);
+static simd_double3 SIMD_CFUNC simd_refract(simd_double3 __x, simd_double3 __n, double __eta);
+static simd_double4 SIMD_CFUNC simd_refract(simd_double4 __x, simd_double4 __n, double __eta);
+#define vector_refract simd_refract
+
+#if SIMD_LIBRARY_VERSION >= 2
+/* These functions require that you are building for OS X 10.12 or later,
+ * iOS 10.0 or later, watchOS 3.0 or later, and tvOS 10.0 or later. On
+ * earlier OS versions, the library functions that implement these
+ * operations are not available. */
+
+/*! @functiongroup vector orientation
+ *
+ * @discussion These functions return a positive value if the origin and
+ * their ordered arguments determine a positively oriented parallelepiped,
+ * zero if it is degenerate, and a negative value if it is negatively
+ * oriented. This is equivalent to saying that the matrix with rows equal
+ * to the vectors has a positive, zero, or negative determinant,
+ * respectively.
+ *
+ * Naive evaluation of the determinant is prone to producing incorrect
+ * results if the vectors are nearly degenerate (e.g. floating-point
+ * rounding might cause the determinant to be zero or negative when
+ * the points are very nearly coplanar but positively oriented). If
+ * the vectors are very large or small, computing the determinant is
+ * also prone to premature overflow, which may cause the result to be
+ * NaN even though the vectors contain normal floating-point numbers.
+ *
+ * These routines take care to avoid those issues and always return a
+ * result with correct sign, even when the problem is very ill-
+ * conditioned. */
+
+/*! @abstract Test the orientation of two 2d vectors.
+ *
+ * @param __x The first vector.
+ * @param __y The second vector.
+ *
+ * @result Positive if (x, y) are positively oriented, zero if they are
+ * colinear, and negative if they are negatively oriented.
+ *
+ * @discussion For two-dimensional vectors, "positively oriented" is
+ * equivalent to the ordering (0, x, y) proceeding counter-clockwise
+ * when viewed down the z axis, or to the cross product of x and y
+ * extended to three-dimensions having positive z-component. */
+static float SIMD_CFUNC simd_orient(simd_float2 __x, simd_float2 __y);
+
+/*! @abstract Test the orientation of two 2d vectors.
+ *
+ * @param __x The first vector.
+ * @param __y The second vector.
+ *
+ * @result Positive if (x, y) are positively oriented, zero if they are
+ * colinear, and negative if they are negatively oriented.
+ *
+ * @discussion For two-dimensional vectors, "positively oriented" is
+ * equivalent to the ordering (0, x, y) proceeding counter-clockwise
+ * when viewed down the z axis, or to the cross product of x and y
+ * extended to three-dimensions having positive z-component. */
+static double SIMD_CFUNC simd_orient(simd_double2 __x, simd_double2 __y);
+
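+/* Illustrative sketch (values hypothetical): the standard basis is
+ * positively oriented, so
+ *
+ *   simd_float2 x = {1, 0};
+ *   simd_float2 y = {0, 1};
+ *   float s = simd_orient(x, y);    // s > 0: counter-clockwise
+ *
+ * and swapping the arguments flips the sign: simd_orient(y, x) < 0. */
+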
+/*! @abstract Test the orientation of three 3d vectors.
+ *
+ * @param __x The first vector.
+ * @param __y The second vector.
+ * @param __z The third vector.
+ *
+ * @result Positive if (x, y, z) are positively oriented, zero if they
+ * are coplanar, and negative if they are negatively oriented.
+ *
+ * @discussion For three-dimensional vectors, "positively oriented" is
+ * equivalent to the ordering (x, y, z) following the "right hand rule",
+ * or to the dot product of z with the cross product of x and y being
+ * positive. */
+static float SIMD_CFUNC simd_orient(simd_float3 __x, simd_float3 __y, simd_float3 __z);
+
+/*! @abstract Test the orientation of three 3d vectors.
+ *
+ * @param __x The first vector.
+ * @param __y The second vector.
+ * @param __z The third vector.
+ *
+ * @result Positive if (x, y, z) are positively oriented, zero if they
+ * are coplanar, and negative if they are negatively oriented.
+ *
+ * @discussion For three-dimensional vectors, "positively oriented" is
+ * equivalent to the ordering (x, y, z) following the "right hand rule",
+ * or to the dot product of z with the cross product of x and y being
+ * positive. */
+static double SIMD_CFUNC simd_orient(simd_double3 __x, simd_double3 __y, simd_double3 __z);
+
+/*! @functiongroup point (affine) orientation
+ *
+ * @discussion These functions return a positive value if their ordered
+ * arguments determine a positively oriented parallelepiped, zero if it
+ * is degenerate, and a negative value if it is negatively oriented.
+ *
+ * simd_orient(a, b, c) is formally equivalent to simd_orient(b-a, c-a),
+ * but it is not affected by rounding error from subtraction of points,
+ * as that implementation would be. Care is taken so that the sign of
+ * the result is always correct, even if the problem is ill-conditioned. */
+
+/*! @abstract Test the orientation of a triangle in 2d.
+ *
+ * @param __a The first point of the triangle.
+ * @param __b The second point of the triangle.
+ * @param __c The third point of the triangle.
+ *
+ * @result Positive if the triangle is positively oriented, zero if it
+ * is degenerate (three points in a line), and negative if it is negatively
+ * oriented.
+ *
+ * @discussion "Positively oriented" is equivalent to the ordering
+ * (a, b, c) proceeding counter-clockwise when viewed down the z axis,
+ * or to the cross product of a-c and b-c extended to three-dimensions
+ * having positive z-component. */
+static float SIMD_CFUNC simd_orient(simd_float2 __a, simd_float2 __b, simd_float2 __c);
+
+/*! @abstract Test the orientation of a triangle in 2d.
+ *
+ * @param __a The first point of the triangle.
+ * @param __b The second point of the triangle.
+ * @param __c The third point of the triangle.
+ *
+ * @result Positive if the triangle is positively oriented, zero if it
+ * is degenerate (three points in a line), and negative if it is negatively
+ * oriented.
+ *
+ * @discussion "Positively oriented" is equivalent to the ordering
+ * (a, b, c) proceeding counter-clockwise when viewed down the z axis,
+ * or to the cross product of a-c and b-c extended to three-dimensions
+ * having positive z-component. */
+static double SIMD_CFUNC simd_orient(simd_double2 __a, simd_double2 __b, simd_double2 __c);
+
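+/* Illustrative sketch (values hypothetical): the triangle a, b, c below
+ * proceeds counter-clockwise, and because these routines are exact the
+ * sign stays correct even when the points sit far from the origin, where
+ * naive (b-a, c-a) subtraction loses precision.
+ *
+ *   simd_double2 a = {1e8, 1e8};
+ *   simd_double2 b = {1e8 + 1, 1e8};
+ *   simd_double2 c = {1e8, 1e8 + 1};
+ *   double s = simd_orient(a, b, c);    // s > 0
+ */
+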
+ * + * @discussion "Positively oriented" is equivalent to the vectors + * (a-d, b-d, c-d) following the "right hand rule", or to the dot product + * of c-d with the the cross product of a-d and b-d being positive. */ +static float SIMD_CFUNC simd_orient(simd_float3 __a, simd_float3 __b, simd_float3 __c, simd_float3 __d); + +/*! @abstract Test the orientation of a tetrahedron in 3d. + * + * @param __a The first point of the tetrahedron. + * @param __b The second point of the tetrahedron. + * @param __c The third point of the tetrahedron. + * @param __d The fourth point of the tetrahedron. + * + * @result Positive if the tetrahedron is positively oriented, zero if it + * is degenerate (four points in a plane), and negative if it is negatively + * oriented. + * + * @discussion "Positively oriented" is equivalent to the vectors + * (a-d, b-d, c-d) following the "right hand rule", or to the dot product + * of c-d with the the cross product of a-d and b-d being positive. */ +static double SIMD_CFUNC simd_orient(simd_double3 __a, simd_double3 __b, simd_double3 __c, simd_double3 __d); + +/*! @functiongroup incircle (points) tests + * + * @discussion These functions determine whether the point x is inside, on, + * or outside the circle or sphere passing through a group of points. If + * x is inside the circle, the result is positive; if x is on the circle, + * the result is zero; if x is outside the circle the result is negative. + * + * These functions are always exact, even if the problem is ill- + * conditioned (meaning that the points are nearly co-linear or + * co-planar). + * + * If the points are negatively-oriented, the the notions of "inside" and + * "outside" are flipped. If the points are degenerate, then the result + * is undefined. */ + +/*! @abstract Test if x lies inside, on, or outside the circle passing + * through a, b, and c. + * + * @param __x The point being tested. + * @param __a The first point determining the circle. + * @param __b The second point determining the circle. + * @param __c The third point determining the circle. + * + * @result Assuming that (a,b,c) are positively-oriented, positive if x is + * inside the circle, zero if x is on the circle, and negative if x is + * outside the circle. The sign of the result is flipped if (a,b,c) are + * negatively-oriented. */ +static float SIMD_CFUNC simd_incircle(simd_float2 __x, simd_float2 __a, simd_float2 __b, simd_float2 __c); + +/*! @abstract Test if x lies inside, on, or outside the circle passing + * through a, b, and c. + * + * @param __x The point being tested. + * @param __a The first point determining the circle. + * @param __b The second point determining the circle. + * @param __c The third point determining the circle. + * + * @result Assuming that (a,b,c) are positively-oriented, positive if x is + * inside the circle, zero if x is on the circle, and negative if x is + * outside the circle. The sign of the result is flipped if (a,b,c) are + * negatively-oriented. */ +static double SIMD_CFUNC simd_incircle(simd_double2 __x, simd_double2 __a, simd_double2 __b, simd_double2 __c); + +/*! @abstract Test if x lies inside, on, or outside the sphere passing + * through a, b, c, and d. + * + * @param __x The point being tested. + * @param __a The first point determining the sphere. + * @param __b The second point determining the sphere. + * @param __c The third point determining the sphere. + * @param __d The fourth point determining the sphere. 
+ * + * @result Assuming that the points are positively-oriented, positive if x + * is inside the sphere, zero if x is on the sphere, and negative if x is + * outside the sphere. The sign of the result is flipped if the points are + * negatively-oriented. */ +static float SIMD_CFUNC simd_insphere(simd_float3 __x, simd_float3 __a, simd_float3 __b, simd_float3 __c, simd_float3 __d); + +/*! @abstract Test if x lies inside, on, or outside the sphere passing + * through a, b, c, and d. + * + * @param __x The point being tested. + * @param __a The first point determining the sphere. + * @param __b The second point determining the sphere. + * @param __c The third point determining the sphere. + * @param __d The fourth point determining the sphere. + * + * @result Assuming that the points are positively-oriented, positive if x + * is inside the sphere, zero if x is on the sphere, and negative if x is + * outside the sphere. The sign of the result is flipped if the points are + * negatively-oriented. */ +static double SIMD_CFUNC simd_insphere(simd_double3 __x, simd_double3 __a, simd_double3 __b, simd_double3 __c, simd_double3 __d); +#endif /* SIMD_LIBRARY_VERSION */ + +#ifdef __cplusplus +} /* extern "C" */ + +namespace simd { + static SIMD_CPPFUNC float dot(const float2 x, const float2 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC float dot(const float3 x, const float3 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC float dot(const float4 x, const float4 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC float dot(const float8 x, const float8 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC float dot(const float16 x, const float16 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC double dot(const double2 x, const double2 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC double dot(const double3 x, const double3 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC double dot(const double4 x, const double4 y) { return ::simd_dot(x, y); } + static SIMD_CPPFUNC double dot(const double8 x, const double8 y) { return ::simd_dot(x, y); } + + static SIMD_CPPFUNC float2 project(const float2 x, const float2 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC float3 project(const float3 x, const float3 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC float4 project(const float4 x, const float4 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC float8 project(const float8 x, const float8 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC float16 project(const float16 x, const float16 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC double2 project(const double2 x, const double2 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC double3 project(const double3 x, const double3 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC double4 project(const double4 x, const double4 y) { return ::simd_project(x, y); } + static SIMD_CPPFUNC double8 project(const double8 x, const double8 y) { return ::simd_project(x, y); } + + static SIMD_CPPFUNC float length_squared(const float2 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC float length_squared(const float3 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC float length_squared(const float4 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC float length_squared(const float8 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC float length_squared(const float16 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC double 
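+/* Illustrative sketch (values hypothetical): a, b, c below lie on the unit
+ * circle, ordered counter-clockwise (positively oriented), so
+ *
+ *   simd_float2 a = {1, 0}, b = {0, 1}, c = {-1, 0};
+ *   simd_float2 x0 = {0, 0}, x1 = {2, 0};
+ *   simd_incircle(x0, a, b, c);    // > 0: the origin is inside
+ *   simd_incircle(x1, a, b, c);    // < 0: (2,0) is outside
+ */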
length_squared(const double2 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC double length_squared(const double3 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC double length_squared(const double4 x) { return ::simd_length_squared(x); } + static SIMD_CPPFUNC double length_squared(const double8 x) { return ::simd_length_squared(x); } + + static SIMD_CPPFUNC float norm_one(const float2 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC float norm_one(const float3 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC float norm_one(const float4 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC float norm_one(const float8 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC float norm_one(const float16 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC double norm_one(const double2 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC double norm_one(const double3 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC double norm_one(const double4 x) { return ::simd_norm_one(x); } + static SIMD_CPPFUNC double norm_one(const double8 x) { return ::simd_norm_one(x); } + + static SIMD_CPPFUNC float norm_inf(const float2 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC float norm_inf(const float3 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC float norm_inf(const float4 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC float norm_inf(const float8 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC float norm_inf(const float16 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC double norm_inf(const double2 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC double norm_inf(const double3 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC double norm_inf(const double4 x) { return ::simd_norm_inf(x); } + static SIMD_CPPFUNC double norm_inf(const double8 x) { return ::simd_norm_inf(x); } + + static SIMD_CPPFUNC float length(const float2 x) { return ::simd_length(x); } + static SIMD_CPPFUNC float length(const float3 x) { return ::simd_length(x); } + static SIMD_CPPFUNC float length(const float4 x) { return ::simd_length(x); } + static SIMD_CPPFUNC float length(const float8 x) { return ::simd_length(x); } + static SIMD_CPPFUNC float length(const float16 x) { return ::simd_length(x); } + static SIMD_CPPFUNC double length(const double2 x) { return ::simd_length(x); } + static SIMD_CPPFUNC double length(const double3 x) { return ::simd_length(x); } + static SIMD_CPPFUNC double length(const double4 x) { return ::simd_length(x); } + static SIMD_CPPFUNC double length(const double8 x) { return ::simd_length(x); } + + static SIMD_CPPFUNC float distance_squared(const float2 x, const float2 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC float distance_squared(const float3 x, const float3 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC float distance_squared(const float4 x, const float4 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC float distance_squared(const float8 x, const float8 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC float distance_squared(const float16 x, const float16 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC double distance_squared(const double2 x, const double2 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC double distance_squared(const double3 x, const double3 y) { return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC double distance_squared(const double4 x, const double4 y) { 
return ::simd_distance_squared(x, y); } + static SIMD_CPPFUNC double distance_squared(const double8 x, const double8 y) { return ::simd_distance_squared(x, y); } + + static SIMD_CPPFUNC float distance(const float2 x, const float2 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC float distance(const float3 x, const float3 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC float distance(const float4 x, const float4 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC float distance(const float8 x, const float8 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC float distance(const float16 x, const float16 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC double distance(const double2 x, const double2 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC double distance(const double3 x, const double3 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC double distance(const double4 x, const double4 y) { return ::simd_distance(x, y); } + static SIMD_CPPFUNC double distance(const double8 x, const double8 y) { return ::simd_distance(x, y); } + + static SIMD_CPPFUNC float2 normalize(const float2 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC float3 normalize(const float3 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC float4 normalize(const float4 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC float8 normalize(const float8 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC float16 normalize(const float16 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC double2 normalize(const double2 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC double3 normalize(const double3 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC double4 normalize(const double4 x) { return ::simd_normalize(x); } + static SIMD_CPPFUNC double8 normalize(const double8 x) { return ::simd_normalize(x); } + + static SIMD_CPPFUNC float3 cross(const float2 x, const float2 y) { return ::simd_cross(x,y); } + static SIMD_CPPFUNC float3 cross(const float3 x, const float3 y) { return ::simd_cross(x,y); } + static SIMD_CPPFUNC double3 cross(const double2 x, const double2 y) { return ::simd_cross(x,y); } + static SIMD_CPPFUNC double3 cross(const double3 x, const double3 y) { return ::simd_cross(x,y); } + + static SIMD_CPPFUNC float2 reflect(const float2 x, const float2 n) { return ::simd_reflect(x,n); } + static SIMD_CPPFUNC float3 reflect(const float3 x, const float3 n) { return ::simd_reflect(x,n); } + static SIMD_CPPFUNC float4 reflect(const float4 x, const float4 n) { return ::simd_reflect(x,n); } + static SIMD_CPPFUNC double2 reflect(const double2 x, const double2 n) { return ::simd_reflect(x,n); } + static SIMD_CPPFUNC double3 reflect(const double3 x, const double3 n) { return ::simd_reflect(x,n); } + static SIMD_CPPFUNC double4 reflect(const double4 x, const double4 n) { return ::simd_reflect(x,n); } + + static SIMD_CPPFUNC float2 refract(const float2 x, const float2 n, const float eta) { return ::simd_refract(x,n,eta); } + static SIMD_CPPFUNC float3 refract(const float3 x, const float3 n, const float eta) { return ::simd_refract(x,n,eta); } + static SIMD_CPPFUNC float4 refract(const float4 x, const float4 n, const float eta) { return ::simd_refract(x,n,eta); } + static SIMD_CPPFUNC double2 refract(const double2 x, const double2 n, const float eta) { return ::simd_refract(x,n,eta); } + static SIMD_CPPFUNC double3 refract(const double3 x, const double3 n, const float eta) { return ::simd_refract(x,n,eta); } + static SIMD_CPPFUNC 
double4 refract(const double4 x, const double4 n, const float eta) { return ::simd_refract(x,n,eta); } + +#if SIMD_LIBRARY_VERSION >= 2 + static SIMD_CPPFUNC float orient(const float2 x, const float2 y) { return ::simd_orient(x,y); } + static SIMD_CPPFUNC float orient(const float2 a, const float2 b, const float2 c) { return ::simd_orient(a,b,c); } + static SIMD_CPPFUNC float orient(const float3 x, const float3 y, const float3 z) { return ::simd_orient(x,y,z); } + static SIMD_CPPFUNC float orient(const float3 a, const float3 b, const float3 c, const float3 d) { return ::simd_orient(a,b,c,d); } + static SIMD_CPPFUNC double orient(const double2 x, const double2 y) { return ::simd_orient(x,y); } + static SIMD_CPPFUNC double orient(const double2 a, const double2 b, const double2 c) { return ::simd_orient(a,b,c); } + static SIMD_CPPFUNC double orient(const double3 x, const double3 y, const double3 z) { return ::simd_orient(x,y,z); } + static SIMD_CPPFUNC double orient(const double3 a, const double3 b, const double3 c, const double3 d) { return ::simd_orient(a,b,c,d); } +#endif + + /* precise and fast sub-namespaces */ + namespace precise { + static SIMD_CPPFUNC float2 project(const float2 x, const float2 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC float3 project(const float3 x, const float3 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC float4 project(const float4 x, const float4 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC float8 project(const float8 x, const float8 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC float16 project(const float16 x, const float16 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC double2 project(const double2 x, const double2 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC double3 project(const double3 x, const double3 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC double4 project(const double4 x, const double4 y) { return ::simd_precise_project(x, y); } + static SIMD_CPPFUNC double8 project(const double8 x, const double8 y) { return ::simd_precise_project(x, y); } + + static SIMD_CPPFUNC float length(const float2 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC float length(const float3 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC float length(const float4 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC float length(const float8 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC float length(const float16 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC double length(const double2 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC double length(const double3 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC double length(const double4 x) { return ::simd_precise_length(x); } + static SIMD_CPPFUNC double length(const double8 x) { return ::simd_precise_length(x); } + + static SIMD_CPPFUNC float distance(const float2 x, const float2 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC float distance(const float3 x, const float3 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC float distance(const float4 x, const float4 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC float distance(const float8 x, const float8 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC float distance(const float16 x, const float16 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC double 
distance(const double2 x, const double2 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC double distance(const double3 x, const double3 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC double distance(const double4 x, const double4 y) { return ::simd_precise_distance(x, y); } + static SIMD_CPPFUNC double distance(const double8 x, const double8 y) { return ::simd_precise_distance(x, y); } + + static SIMD_CPPFUNC float2 normalize(const float2 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC float3 normalize(const float3 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC float4 normalize(const float4 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC float8 normalize(const float8 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC float16 normalize(const float16 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC double2 normalize(const double2 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC double3 normalize(const double3 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC double4 normalize(const double4 x) { return ::simd_precise_normalize(x); } + static SIMD_CPPFUNC double8 normalize(const double8 x) { return ::simd_precise_normalize(x); } + } + + namespace fast { + static SIMD_CPPFUNC float2 project(const float2 x, const float2 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC float3 project(const float3 x, const float3 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC float4 project(const float4 x, const float4 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC float8 project(const float8 x, const float8 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC float16 project(const float16 x, const float16 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC double2 project(const double2 x, const double2 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC double3 project(const double3 x, const double3 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC double4 project(const double4 x, const double4 y) { return ::simd_fast_project(x, y); } + static SIMD_CPPFUNC double8 project(const double8 x, const double8 y) { return ::simd_fast_project(x, y); } + + static SIMD_CPPFUNC float length(const float2 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC float length(const float3 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC float length(const float4 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC float length(const float8 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC float length(const float16 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC double length(const double2 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC double length(const double3 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC double length(const double4 x) { return ::simd_fast_length(x); } + static SIMD_CPPFUNC double length(const double8 x) { return ::simd_fast_length(x); } + + static SIMD_CPPFUNC float distance(const float2 x, const float2 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC float distance(const float3 x, const float3 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC float distance(const float4 x, const float4 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC float distance(const float8 x, const float8 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC float distance(const 
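+/* Illustrative sketch (values hypothetical): in C++ these sub-namespaces
+ * pin the variant down regardless of -ffast-math, mirroring the C-level
+ * simd_precise_* and simd_fast_* names.
+ *
+ *   simd::float2 v = {3.0f, 4.0f};
+ *   float lp = simd::precise::length(v);    // always the accurate path
+ *   float lf = simd::fast::length(v);       // always the fast path
+ *   float l  = simd::length(v);             // follows the compile flags
+ */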
float16 x, const float16 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC double distance(const double2 x, const double2 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC double distance(const double3 x, const double3 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC double distance(const double4 x, const double4 y) { return ::simd_fast_distance(x, y); } + static SIMD_CPPFUNC double distance(const double8 x, const double8 y) { return ::simd_fast_distance(x, y); } + + static SIMD_CPPFUNC float2 normalize(const float2 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC float3 normalize(const float3 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC float4 normalize(const float4 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC float8 normalize(const float8 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC float16 normalize(const float16 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC double2 normalize(const double2 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC double3 normalize(const double3 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC double4 normalize(const double4 x) { return ::simd_fast_normalize(x); } + static SIMD_CPPFUNC double8 normalize(const double8 x) { return ::simd_fast_normalize(x); } + } +} + +extern "C" { +#endif /* __cplusplus */ + +#pragma mark - Implementation + +static float SIMD_CFUNC simd_dot(simd_float2 __x, simd_float2 __y) { return simd_reduce_add(__x*__y); } +static float SIMD_CFUNC simd_dot(simd_float3 __x, simd_float3 __y) { return simd_reduce_add(__x*__y); } +static float SIMD_CFUNC simd_dot(simd_float4 __x, simd_float4 __y) { return simd_reduce_add(__x*__y); } +static float SIMD_CFUNC simd_dot(simd_float8 __x, simd_float8 __y) { return simd_reduce_add(__x*__y); } +static float SIMD_CFUNC simd_dot(simd_float16 __x, simd_float16 __y) { return simd_reduce_add(__x*__y); } +static double SIMD_CFUNC simd_dot(simd_double2 __x, simd_double2 __y) { return simd_reduce_add(__x*__y); } +static double SIMD_CFUNC simd_dot(simd_double3 __x, simd_double3 __y) { return simd_reduce_add(__x*__y); } +static double SIMD_CFUNC simd_dot(simd_double4 __x, simd_double4 __y) { return simd_reduce_add(__x*__y); } +static double SIMD_CFUNC simd_dot(simd_double8 __x, simd_double8 __y) { return simd_reduce_add(__x*__y); } + +static simd_float2 SIMD_CFUNC simd_precise_project(simd_float2 __x, simd_float2 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_float3 SIMD_CFUNC simd_precise_project(simd_float3 __x, simd_float3 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_float4 SIMD_CFUNC simd_precise_project(simd_float4 __x, simd_float4 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_float8 SIMD_CFUNC simd_precise_project(simd_float8 __x, simd_float8 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_float16 SIMD_CFUNC simd_precise_project(simd_float16 __x, simd_float16 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_double2 SIMD_CFUNC simd_precise_project(simd_double2 __x, simd_double2 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_double3 SIMD_CFUNC simd_precise_project(simd_double3 __x, simd_double3 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_double4 SIMD_CFUNC simd_precise_project(simd_double4 __x, simd_double4 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } +static simd_double8 SIMD_CFUNC 
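+/* Worked example (values hypothetical) of the reduction used above:
+ * simd_dot multiplies elementwise, then sums the lanes with
+ * simd_reduce_add.
+ *
+ *   simd_float3 x = {1, 2, 3};
+ *   simd_float3 y = {4, -5, 6};
+ *   float d = simd_dot(x, y);    // 1*4 + 2*(-5) + 3*6 = 12
+ */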
simd_precise_project(simd_double8 __x, simd_double8 __y) { return simd_dot(__x,__y)/simd_dot(__y,__y)*__y; } + +static simd_float2 SIMD_CFUNC simd_fast_project(simd_float2 __x, simd_float2 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_float3 SIMD_CFUNC simd_fast_project(simd_float3 __x, simd_float3 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_float4 SIMD_CFUNC simd_fast_project(simd_float4 __x, simd_float4 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_float8 SIMD_CFUNC simd_fast_project(simd_float8 __x, simd_float8 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_float16 SIMD_CFUNC simd_fast_project(simd_float16 __x, simd_float16 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_double2 SIMD_CFUNC simd_fast_project(simd_double2 __x, simd_double2 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_double3 SIMD_CFUNC simd_fast_project(simd_double3 __x, simd_double3 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_double4 SIMD_CFUNC simd_fast_project(simd_double4 __x, simd_double4 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } +static simd_double8 SIMD_CFUNC simd_fast_project(simd_double8 __x, simd_double8 __y) { return __y*simd_dot(__x,__y)*simd_fast_recip(simd_dot(__y,__y)); } + +#if defined __FAST_MATH__ +static simd_float2 SIMD_CFUNC simd_project(simd_float2 __x, simd_float2 __y) { return simd_fast_project(__x,__y); } +static simd_float3 SIMD_CFUNC simd_project(simd_float3 __x, simd_float3 __y) { return simd_fast_project(__x,__y); } +static simd_float4 SIMD_CFUNC simd_project(simd_float4 __x, simd_float4 __y) { return simd_fast_project(__x,__y); } +static simd_float8 SIMD_CFUNC simd_project(simd_float8 __x, simd_float8 __y) { return simd_fast_project(__x,__y); } +static simd_float16 SIMD_CFUNC simd_project(simd_float16 __x, simd_float16 __y) { return simd_fast_project(__x,__y); } +static simd_double2 SIMD_CFUNC simd_project(simd_double2 __x, simd_double2 __y) { return simd_fast_project(__x,__y); } +static simd_double3 SIMD_CFUNC simd_project(simd_double3 __x, simd_double3 __y) { return simd_fast_project(__x,__y); } +static simd_double4 SIMD_CFUNC simd_project(simd_double4 __x, simd_double4 __y) { return simd_fast_project(__x,__y); } +static simd_double8 SIMD_CFUNC simd_project(simd_double8 __x, simd_double8 __y) { return simd_fast_project(__x,__y); } +#else +static simd_float2 SIMD_CFUNC simd_project(simd_float2 __x, simd_float2 __y) { return simd_precise_project(__x,__y); } +static simd_float3 SIMD_CFUNC simd_project(simd_float3 __x, simd_float3 __y) { return simd_precise_project(__x,__y); } +static simd_float4 SIMD_CFUNC simd_project(simd_float4 __x, simd_float4 __y) { return simd_precise_project(__x,__y); } +static simd_float8 SIMD_CFUNC simd_project(simd_float8 __x, simd_float8 __y) { return simd_precise_project(__x,__y); } +static simd_float16 SIMD_CFUNC simd_project(simd_float16 __x, simd_float16 __y) { return simd_precise_project(__x,__y); } +static simd_double2 SIMD_CFUNC simd_project(simd_double2 __x, simd_double2 __y) { return simd_precise_project(__x,__y); } +static simd_double3 SIMD_CFUNC simd_project(simd_double3 __x, simd_double3 __y) { return simd_precise_project(__x,__y); } +static simd_double4 SIMD_CFUNC simd_project(simd_double4 __x, simd_double4 __y) { return 
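+/* Worked example (values hypothetical) of the projection formula
+ * dot(x,y)/dot(y,y) * y implemented above:
+ *
+ *   simd_float2 x = {3, 4};
+ *   simd_float2 y = {1, 1};
+ *   simd_float2 p = simd_project(x, y);    // (7/2)*{1,1} = {3.5, 3.5}
+ */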
simd_precise_project(__x,__y); } +static simd_double8 SIMD_CFUNC simd_project(simd_double8 __x, simd_double8 __y) { return simd_precise_project(__x,__y); } +#endif + +static float SIMD_CFUNC simd_precise_length(simd_float2 __x) { return sqrtf(simd_length_squared(__x)); } +static float SIMD_CFUNC simd_precise_length(simd_float3 __x) { return sqrtf(simd_length_squared(__x)); } +static float SIMD_CFUNC simd_precise_length(simd_float4 __x) { return sqrtf(simd_length_squared(__x)); } +static float SIMD_CFUNC simd_precise_length(simd_float8 __x) { return sqrtf(simd_length_squared(__x)); } +static float SIMD_CFUNC simd_precise_length(simd_float16 __x) { return sqrtf(simd_length_squared(__x)); } +static double SIMD_CFUNC simd_precise_length(simd_double2 __x) { return sqrt(simd_length_squared(__x)); } +static double SIMD_CFUNC simd_precise_length(simd_double3 __x) { return sqrt(simd_length_squared(__x)); } +static double SIMD_CFUNC simd_precise_length(simd_double4 __x) { return sqrt(simd_length_squared(__x)); } +static double SIMD_CFUNC simd_precise_length(simd_double8 __x) { return sqrt(simd_length_squared(__x)); } + +static float SIMD_CFUNC simd_fast_length(simd_float2 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_fast_length(simd_float3 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_fast_length(simd_float4 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_fast_length(simd_float8 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_fast_length(simd_float16 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_fast_length(simd_double2 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_fast_length(simd_double3 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_fast_length(simd_double4 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_fast_length(simd_double8 __x) { return simd_precise_length(__x); } + +#if defined __FAST_MATH__ +static float SIMD_CFUNC simd_length(simd_float2 __x) { return simd_fast_length(__x); } +static float SIMD_CFUNC simd_length(simd_float3 __x) { return simd_fast_length(__x); } +static float SIMD_CFUNC simd_length(simd_float4 __x) { return simd_fast_length(__x); } +static float SIMD_CFUNC simd_length(simd_float8 __x) { return simd_fast_length(__x); } +static float SIMD_CFUNC simd_length(simd_float16 __x) { return simd_fast_length(__x); } +static double SIMD_CFUNC simd_length(simd_double2 __x) { return simd_fast_length(__x); } +static double SIMD_CFUNC simd_length(simd_double3 __x) { return simd_fast_length(__x); } +static double SIMD_CFUNC simd_length(simd_double4 __x) { return simd_fast_length(__x); } +static double SIMD_CFUNC simd_length(simd_double8 __x) { return simd_fast_length(__x); } +#else +static float SIMD_CFUNC simd_length(simd_float2 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_length(simd_float3 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_length(simd_float4 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_length(simd_float8 __x) { return simd_precise_length(__x); } +static float SIMD_CFUNC simd_length(simd_float16 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_length(simd_double2 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_length(simd_double3 __x) { return simd_precise_length(__x); } +static double SIMD_CFUNC simd_length(simd_double4 __x) { return 
simd_precise_length(__x); } +static double SIMD_CFUNC simd_length(simd_double8 __x) { return simd_precise_length(__x); } +#endif + +static float SIMD_CFUNC simd_length_squared(simd_float2 __x) { return simd_dot(__x,__x); } +static float SIMD_CFUNC simd_length_squared(simd_float3 __x) { return simd_dot(__x,__x); } +static float SIMD_CFUNC simd_length_squared(simd_float4 __x) { return simd_dot(__x,__x); } +static float SIMD_CFUNC simd_length_squared(simd_float8 __x) { return simd_dot(__x,__x); } +static float SIMD_CFUNC simd_length_squared(simd_float16 __x) { return simd_dot(__x,__x); } +static double SIMD_CFUNC simd_length_squared(simd_double2 __x) { return simd_dot(__x,__x); } +static double SIMD_CFUNC simd_length_squared(simd_double3 __x) { return simd_dot(__x,__x); } +static double SIMD_CFUNC simd_length_squared(simd_double4 __x) { return simd_dot(__x,__x); } +static double SIMD_CFUNC simd_length_squared(simd_double8 __x) { return simd_dot(__x,__x); } + +static float SIMD_CFUNC simd_norm_one(simd_float2 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_one(simd_float3 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_one(simd_float4 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_one(simd_float8 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_one(simd_float16 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_one(simd_double2 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_one(simd_double3 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_one(simd_double4 __x) { return simd_reduce_add(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_one(simd_double8 __x) { return simd_reduce_add(__tg_fabs(__x)); } + +static float SIMD_CFUNC simd_norm_inf(simd_float2 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_inf(simd_float3 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_inf(simd_float4 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_inf(simd_float8 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static float SIMD_CFUNC simd_norm_inf(simd_float16 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_inf(simd_double2 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_inf(simd_double3 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_inf(simd_double4 __x) { return simd_reduce_max(__tg_fabs(__x)); } +static double SIMD_CFUNC simd_norm_inf(simd_double8 __x) { return simd_reduce_max(__tg_fabs(__x)); } + +static float SIMD_CFUNC simd_precise_distance(simd_float2 __x, simd_float2 __y) { return simd_precise_length(__x - __y); } +static float SIMD_CFUNC simd_precise_distance(simd_float3 __x, simd_float3 __y) { return simd_precise_length(__x - __y); } +static float SIMD_CFUNC simd_precise_distance(simd_float4 __x, simd_float4 __y) { return simd_precise_length(__x - __y); } +static float SIMD_CFUNC simd_precise_distance(simd_float8 __x, simd_float8 __y) { return simd_precise_length(__x - __y); } +static float SIMD_CFUNC simd_precise_distance(simd_float16 __x, simd_float16 __y) { return simd_precise_length(__x - __y); } +static double SIMD_CFUNC simd_precise_distance(simd_double2 __x, simd_double2 __y) { return simd_precise_length(__x - __y); } +static 
double SIMD_CFUNC simd_precise_distance(simd_double3 __x, simd_double3 __y) { return simd_precise_length(__x - __y); } +static double SIMD_CFUNC simd_precise_distance(simd_double4 __x, simd_double4 __y) { return simd_precise_length(__x - __y); } +static double SIMD_CFUNC simd_precise_distance(simd_double8 __x, simd_double8 __y) { return simd_precise_length(__x - __y); } + +static float SIMD_CFUNC simd_fast_distance(simd_float2 __x, simd_float2 __y) { return simd_fast_length(__x - __y); } +static float SIMD_CFUNC simd_fast_distance(simd_float3 __x, simd_float3 __y) { return simd_fast_length(__x - __y); } +static float SIMD_CFUNC simd_fast_distance(simd_float4 __x, simd_float4 __y) { return simd_fast_length(__x - __y); } +static float SIMD_CFUNC simd_fast_distance(simd_float8 __x, simd_float8 __y) { return simd_fast_length(__x - __y); } +static float SIMD_CFUNC simd_fast_distance(simd_float16 __x, simd_float16 __y) { return simd_fast_length(__x - __y); } +static double SIMD_CFUNC simd_fast_distance(simd_double2 __x, simd_double2 __y) { return simd_fast_length(__x - __y); } +static double SIMD_CFUNC simd_fast_distance(simd_double3 __x, simd_double3 __y) { return simd_fast_length(__x - __y); } +static double SIMD_CFUNC simd_fast_distance(simd_double4 __x, simd_double4 __y) { return simd_fast_length(__x - __y); } +static double SIMD_CFUNC simd_fast_distance(simd_double8 __x, simd_double8 __y) { return simd_fast_length(__x - __y); } + +#if defined __FAST_MATH__ +static float SIMD_CFUNC simd_distance(simd_float2 __x, simd_float2 __y) { return simd_fast_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float3 __x, simd_float3 __y) { return simd_fast_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float4 __x, simd_float4 __y) { return simd_fast_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float8 __x, simd_float8 __y) { return simd_fast_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float16 __x, simd_float16 __y) { return simd_fast_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double2 __x, simd_double2 __y) { return simd_fast_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double3 __x, simd_double3 __y) { return simd_fast_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double4 __x, simd_double4 __y) { return simd_fast_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double8 __x, simd_double8 __y) { return simd_fast_distance(__x,__y); } +#else +static float SIMD_CFUNC simd_distance(simd_float2 __x, simd_float2 __y) { return simd_precise_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float3 __x, simd_float3 __y) { return simd_precise_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float4 __x, simd_float4 __y) { return simd_precise_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float8 __x, simd_float8 __y) { return simd_precise_distance(__x,__y); } +static float SIMD_CFUNC simd_distance(simd_float16 __x, simd_float16 __y) { return simd_precise_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double2 __x, simd_double2 __y) { return simd_precise_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double3 __x, simd_double3 __y) { return simd_precise_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double4 __x, simd_double4 __y) { return simd_precise_distance(__x,__y); } +static double SIMD_CFUNC simd_distance(simd_double8 __x, simd_double8 __y) { return 
simd_precise_distance(__x,__y); } +#endif + +static float SIMD_CFUNC simd_distance_squared(simd_float2 __x, simd_float2 __y) { return simd_length_squared(__x - __y); } +static float SIMD_CFUNC simd_distance_squared(simd_float3 __x, simd_float3 __y) { return simd_length_squared(__x - __y); } +static float SIMD_CFUNC simd_distance_squared(simd_float4 __x, simd_float4 __y) { return simd_length_squared(__x - __y); } +static float SIMD_CFUNC simd_distance_squared(simd_float8 __x, simd_float8 __y) { return simd_length_squared(__x - __y); } +static float SIMD_CFUNC simd_distance_squared(simd_float16 __x, simd_float16 __y) { return simd_length_squared(__x - __y); } +static double SIMD_CFUNC simd_distance_squared(simd_double2 __x, simd_double2 __y) { return simd_length_squared(__x - __y); } +static double SIMD_CFUNC simd_distance_squared(simd_double3 __x, simd_double3 __y) { return simd_length_squared(__x - __y); } +static double SIMD_CFUNC simd_distance_squared(simd_double4 __x, simd_double4 __y) { return simd_length_squared(__x - __y); } +static double SIMD_CFUNC simd_distance_squared(simd_double8 __x, simd_double8 __y) { return simd_length_squared(__x - __y); } + +static simd_float2 SIMD_CFUNC simd_precise_normalize(simd_float2 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_float3 SIMD_CFUNC simd_precise_normalize(simd_float3 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_float4 SIMD_CFUNC simd_precise_normalize(simd_float4 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_float8 SIMD_CFUNC simd_precise_normalize(simd_float8 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_float16 SIMD_CFUNC simd_precise_normalize(simd_float16 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_double2 SIMD_CFUNC simd_precise_normalize(simd_double2 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_double3 SIMD_CFUNC simd_precise_normalize(simd_double3 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_double4 SIMD_CFUNC simd_precise_normalize(simd_double4 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } +static simd_double8 SIMD_CFUNC simd_precise_normalize(simd_double8 __x) { return __x * simd_precise_rsqrt(simd_length_squared(__x)); } + +static simd_float2 SIMD_CFUNC simd_fast_normalize(simd_float2 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_float3 SIMD_CFUNC simd_fast_normalize(simd_float3 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_float4 SIMD_CFUNC simd_fast_normalize(simd_float4 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_float8 SIMD_CFUNC simd_fast_normalize(simd_float8 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_float16 SIMD_CFUNC simd_fast_normalize(simd_float16 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_double2 SIMD_CFUNC simd_fast_normalize(simd_double2 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_double3 SIMD_CFUNC simd_fast_normalize(simd_double3 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_double4 SIMD_CFUNC simd_fast_normalize(simd_double4 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); } +static simd_double8 SIMD_CFUNC simd_fast_normalize(simd_double8 __x) { return __x * simd_fast_rsqrt(simd_length_squared(__x)); 
} + +#if defined __FAST_MATH__ +static simd_float2 SIMD_CFUNC simd_normalize(simd_float2 __x) { return simd_fast_normalize(__x); } +static simd_float3 SIMD_CFUNC simd_normalize(simd_float3 __x) { return simd_fast_normalize(__x); } +static simd_float4 SIMD_CFUNC simd_normalize(simd_float4 __x) { return simd_fast_normalize(__x); } +static simd_float8 SIMD_CFUNC simd_normalize(simd_float8 __x) { return simd_fast_normalize(__x); } +static simd_float16 SIMD_CFUNC simd_normalize(simd_float16 __x) { return simd_fast_normalize(__x); } +static simd_double2 SIMD_CFUNC simd_normalize(simd_double2 __x) { return simd_fast_normalize(__x); } +static simd_double3 SIMD_CFUNC simd_normalize(simd_double3 __x) { return simd_fast_normalize(__x); } +static simd_double4 SIMD_CFUNC simd_normalize(simd_double4 __x) { return simd_fast_normalize(__x); } +static simd_double8 SIMD_CFUNC simd_normalize(simd_double8 __x) { return simd_fast_normalize(__x); } +#else +static simd_float2 SIMD_CFUNC simd_normalize(simd_float2 __x) { return simd_precise_normalize(__x); } +static simd_float3 SIMD_CFUNC simd_normalize(simd_float3 __x) { return simd_precise_normalize(__x); } +static simd_float4 SIMD_CFUNC simd_normalize(simd_float4 __x) { return simd_precise_normalize(__x); } +static simd_float8 SIMD_CFUNC simd_normalize(simd_float8 __x) { return simd_precise_normalize(__x); } +static simd_float16 SIMD_CFUNC simd_normalize(simd_float16 __x) { return simd_precise_normalize(__x); } +static simd_double2 SIMD_CFUNC simd_normalize(simd_double2 __x) { return simd_precise_normalize(__x); } +static simd_double3 SIMD_CFUNC simd_normalize(simd_double3 __x) { return simd_precise_normalize(__x); } +static simd_double4 SIMD_CFUNC simd_normalize(simd_double4 __x) { return simd_precise_normalize(__x); } +static simd_double8 SIMD_CFUNC simd_normalize(simd_double8 __x) { return simd_precise_normalize(__x); } +#endif + +static simd_float3 SIMD_CFUNC simd_cross(simd_float2 __x, simd_float2 __y) { return (simd_float3){ 0, 0, __x.x*__y.y - __x.y*__y.x }; } +static simd_float3 SIMD_CFUNC simd_cross(simd_float3 __x, simd_float3 __y) { return (__x.zxy*__y - __x*__y.zxy).zxy; } +static simd_double3 SIMD_CFUNC simd_cross(simd_double2 __x, simd_double2 __y) { return (simd_double3){ 0, 0, __x.x*__y.y - __x.y*__y.x }; } +static simd_double3 SIMD_CFUNC simd_cross(simd_double3 __x, simd_double3 __y) { return (__x.zxy*__y - __x*__y.zxy).zxy; } + +static simd_float2 SIMD_CFUNC simd_reflect(simd_float2 __x, simd_float2 __n) { return __x - 2*simd_dot(__x,__n)*__n; } +static simd_float3 SIMD_CFUNC simd_reflect(simd_float3 __x, simd_float3 __n) { return __x - 2*simd_dot(__x,__n)*__n; } +static simd_float4 SIMD_CFUNC simd_reflect(simd_float4 __x, simd_float4 __n) { return __x - 2*simd_dot(__x,__n)*__n; } +static simd_double2 SIMD_CFUNC simd_reflect(simd_double2 __x, simd_double2 __n) { return __x - 2*simd_dot(__x,__n)*__n; } +static simd_double3 SIMD_CFUNC simd_reflect(simd_double3 __x, simd_double3 __n) { return __x - 2*simd_dot(__x,__n)*__n; } +static simd_double4 SIMD_CFUNC simd_reflect(simd_double4 __x, simd_double4 __n) { return __x - 2*simd_dot(__x,__n)*__n; } + +static simd_float2 SIMD_CFUNC simd_refract(simd_float2 __x, simd_float2 __n, float __eta) { + const float __k = 1.0f - __eta*__eta*(1.0f - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0f) ? 
__eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_float2)0.0f; +} +static simd_float3 SIMD_CFUNC simd_refract(simd_float3 __x, simd_float3 __n, float __eta) { + const float __k = 1.0f - __eta*__eta*(1.0f - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0f) ? __eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_float3)0.0f; +} +static simd_float4 SIMD_CFUNC simd_refract(simd_float4 __x, simd_float4 __n, float __eta) { + const float __k = 1.0f - __eta*__eta*(1.0f - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0f) ? __eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_float4)0.0f; +} +static simd_double2 SIMD_CFUNC simd_refract(simd_double2 __x, simd_double2 __n, double __eta) { + const double __k = 1.0 - __eta*__eta*(1.0 - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0) ? __eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_double2)0.0; +} +static simd_double3 SIMD_CFUNC simd_refract(simd_double3 __x, simd_double3 __n, double __eta) { + const double __k = 1.0 - __eta*__eta*(1.0 - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0) ? __eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_double3)0.0; +} +static simd_double4 SIMD_CFUNC simd_refract(simd_double4 __x, simd_double4 __n, double __eta) { + const double __k = 1.0 - __eta*__eta*(1.0 - simd_dot(__x,__n)*simd_dot(__x,__n)); + return (__k >= 0.0) ? __eta*__x - (__eta*simd_dot(__x,__n) + sqrt(__k))*__n : (simd_double4)0.0; +} + +#if SIMD_LIBRARY_VERSION >= 2 +static float SIMD_CFUNC simd_orient(simd_float2 __x, simd_float2 __y) { + return _simd_orient_vf2(__x, __y); +} +static double SIMD_CFUNC simd_orient(simd_double2 __x, simd_double2 __y) { + return _simd_orient_vd2(__x, __y); +} +static float SIMD_CFUNC simd_orient(simd_float3 __x, simd_float3 __y, simd_float3 __z) { + return _simd_orient_vf3(__x, __y, __z); +} +static double SIMD_CFUNC simd_orient(simd_double3 __x, simd_double3 __y, simd_double3 __z) { + simd_double3 __args[3] = { __x, __y, __z }; + return _simd_orient_vd3((const double *)__args); +} + +static float SIMD_CFUNC simd_orient(simd_float2 __a, simd_float2 __b, simd_float2 __c) { + return _simd_orient_pf2(__a, __b, __c); +} +static double SIMD_CFUNC simd_orient(simd_double2 __a, simd_double2 __b, simd_double2 __c) { + return _simd_orient_pd2(__a, __b, __c); +} +static float SIMD_CFUNC simd_orient(simd_float3 __a, simd_float3 __b, simd_float3 __c, simd_float3 __d) { + return _simd_orient_pf3(__a, __b, __c, __d); +} +static double SIMD_CFUNC simd_orient(simd_double3 __a, simd_double3 __b, simd_double3 __c, simd_double3 __d) { + simd_double3 __args[4] = { __a, __b, __c, __d }; + return _simd_orient_pd3((const double *)__args); +} + +static float SIMD_CFUNC simd_incircle(simd_float2 __x, simd_float2 __a, simd_float2 __b, simd_float2 __c) { + return _simd_incircle_pf2(__x, __a, __b, __c); +} +static double SIMD_CFUNC simd_incircle(simd_double2 __x, simd_double2 __a, simd_double2 __b, simd_double2 __c) { + return _simd_incircle_pd2(__x, __a, __b, __c); +} +static float SIMD_CFUNC simd_insphere(simd_float3 __x, simd_float3 __a, simd_float3 __b, simd_float3 __c, simd_float3 __d) { + return _simd_insphere_pf3(__x, __a, __b, __c, __d); +} +static double SIMD_CFUNC simd_insphere(simd_double3 __x, simd_double3 __a, simd_double3 __b, simd_double3 __c, simd_double3 __d) { + simd_double3 __args[5] = { __x, __a, __b, __c, __d }; + return _simd_insphere_pd3((const double *)__args); +} +#endif /* SIMD_LIBRARY_VERSION */ + +#ifdef __cplusplus +} +#endif 
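+
+/* Illustrative usage sketch -- not part of the original Apple header. The
+ * unprefixed names above (simd_length, simd_distance, simd_normalize,
+ * simd_project, ...) dispatch to the fast variants when the translation
+ * unit is built with __FAST_MATH__ and to the precise variants otherwise.
+ * Assuming a toolchain where <simd/simd.h> resolves (here, through this
+ * vfsoverlay), a minimal caller looks like:
+ *
+ *   #include <simd/simd.h>
+ *   #include <stdio.h>
+ *
+ *   int main(void) {
+ *     simd_float3 v = { 3.0f, 4.0f, 0.0f };
+ *     simd_float3 n = { 0.0f, 1.0f, 0.0f };  // unit normal
+ *     printf("%f\n", simd_length(v));        // 5.000000
+ *     simd_float3 p = simd_project(v, n);    // dot(v,n)/dot(n,n)*n = (0,4,0)
+ *     simd_float3 r = simd_reflect(v, n);    // v - 2*dot(v,n)*n = (3,-4,0)
+ *     printf("(%f, %f, %f)\n", r.x, r.y, r.z);
+ *     (void)p;
+ *     return 0;
+ *   }
+ */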
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* __SIMD_COMMON_HEADER__ */
diff --git a/vfsoverlay/logic.h b/vfsoverlay/logic.h
new file mode 100644
index 00000000..0d447a91
--- /dev/null
+++ b/vfsoverlay/logic.h
@@ -0,0 +1,1315 @@
+/*! @header
+ * The interfaces declared in this header provide logical and bitwise
+ * operations on vectors. Some of these functions operate elementwise,
+ * and some produce a scalar result that depends on all lanes of the input.
+ *
+ * For functions returning a boolean value, the return type in C and
+ * Objective-C is _Bool; for C++ it is bool.
+ *
+ * Function                     Result
+ * ------------------------------------------------------------------
+ * simd_all(comparison)         True if and only if the comparison is true
+ *                              in every vector lane. e.g.:
+ *
+ *                              if (simd_all(x == 0.0f)) {
+ *                                // executed if every lane of x
+ *                                // contains zero.
+ *                              }
+ *
+ *                              The precise function of simd_all is to
+ *                              return the high-order bit of the result
+ *                              of a horizontal bitwise AND of all vector
+ *                              lanes.
+ *
+ * simd_any(comparison)         True if and only if the comparison is true
+ *                              in at least one vector lane. e.g.:
+ *
+ *                              if (simd_any(x < 0.0f)) {
+ *                                // executed if any lane of x
+ *                                // contains a negative value.
+ *                              }
+ *
+ *                              The precise function of simd_any is to
+ *                              return the high-order bit of the result
+ *                              of a horizontal bitwise OR of all vector
+ *                              lanes.
+ *
+ * simd_select(x,y,mask)        For each lane in the result, selects the
+ *                              corresponding element of x if the high-
+ *                              order bit of the corresponding element of
+ *                              mask is 0, and the corresponding element
+ *                              of y otherwise.
+ *
+ * simd_bitselect(x,y,mask)     For each bit in the result, selects the
+ *                              corresponding bit of x if the corresponding
+ *                              bit of mask is clear, and the corresponding
+ *                              bit of y otherwise.
+ *
+ * In C++, these functions are available under the simd:: namespace:
+ *
+ * C++ Function                 Equivalent C Function
+ * --------------------------------------------------------------------
+ * simd::all(comparison)        simd_all(comparison)
+ * simd::any(comparison)        simd_any(comparison)
+ * simd::select(x,y,mask)       simd_select(x,y,mask)
+ * simd::bitselect(x,y,mask)    simd_bitselect(x,y,mask)
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_LOGIC_HEADER
+#define SIMD_LOGIC_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_types.h>
+#include <simd/vector_make.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x);
+/*!
@abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. 
*/ +static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. + * @discussion Deprecated. Use simd_any instead. */ +#define vector_any simd_any + +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x); +/*! 
@abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x); +/*! 
@abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. + * @discussion Deprecated. Use simd_all instead. */ +#define vector_all simd_all + +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask); +/*! 
@abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. + * @discussion Deprecated. Use simd_select instead. */ +#define vector_select simd_select + +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char16 simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively.
+ * @discussion Deprecated. Use simd_bitselect instead. */
+#define vector_bitselect simd_bitselect
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace simd {
+  /*! @abstract True if and only if the high-order bit of every lane is set. */
+  template <typename inttypeN> static SIMD_CPPFUNC simd_bool all(const inttypeN predicate) { return ::simd_all(predicate); }
+  /*! @abstract True if and only if the high-order bit of any lane is set. */
+  template <typename inttypeN> static SIMD_CPPFUNC simd_bool any(const inttypeN predicate) { return ::simd_any(predicate); }
+  /*! @abstract Each lane of the result is selected from the corresponding lane
+   * of x or y according to whether the high-order bit of the corresponding
+   * lane of mask is 0 or 1, respectively. */
+  template <typename fptypeN, typename inttypeN> static SIMD_CPPFUNC fptypeN select(const fptypeN x, const fptypeN y, const inttypeN predicate) { return ::simd_select(x,y,predicate); }
+  /*! @abstract For each bit in the result, selects the corresponding bit of x
+   * or y according to whether the corresponding bit of mask is 0 or 1,
+   * respectively. */
+  template <typename typeN, typename inttypeN> static SIMD_CPPFUNC typeN bitselect(const typeN x, const typeN y, const inttypeN mask) { return ::simd_bitselect(x,y,mask); }
+}
+
+extern "C" {
+#endif /* __cplusplus */
+
+#pragma mark - Implementations
+
+static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3);
+#elif defined __arm64__
+  return simd_any(x.xyxy);
+#else
+  union { uint16_t i; simd_char2 v; } u = { .v = x };
+  return (u.i & 0x8080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7);
+#elif defined __arm64__
+  return simd_any(x.xyzz);
+#else
+  union { uint32_t i; simd_char3 v; } u = { .v = x };
+  return (u.i & 0x808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf);
+#elif defined __arm64__
+  return simd_any(x.xyzwxyzw);
+#else
+  union { uint32_t i; simd_char4 v; } u = { .v = x };
+  return (u.i & 0x80808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff);
+#elif defined __arm64__
+  return vmaxv_u8(x) & 0x80;
+#else
+  union { uint64_t i; simd_char8 v; } u = { .v = x };
+  return (u.i & 0x8080808080808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x) {
+#if defined __SSE2__
+  return _mm_movemask_epi8((__m128i)x);
+#elif defined __arm64__
+  return vmaxvq_u8(x) & 0x80;
+#else
+  return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x) {
+#if defined __AVX2__
+  return _mm256_movemask_epi8(x);
+#else
+  return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x) {
+  return simd_any(x.lo | x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x) {
+  return simd_any((simd_char2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x) {
+  return simd_any((simd_char3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x) {
+  return simd_any((simd_char4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x) {
+  return simd_any((simd_char8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x) {
+  return simd_any((simd_char16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x) {
+  return simd_any((simd_char32)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x) {
+  return simd_any((simd_char64)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa);
+#elif defined __arm64__
+  return simd_any(x.xyxy);
+#else
+  union { uint32_t i; simd_short2 v; } u = { .v = x };
+  return (u.i & 0x80008000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x) {
+#if defined __SSE2__
+  return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a);
+#elif defined __arm64__
+  return simd_any(x.xyzz);
+#else
+  union { uint64_t i; simd_short3 v; } u = { .v = x };
+  return (u.i & 0x800080008000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x) {
+#if defined __SSE2__
+  return
(_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa); +#elif defined __arm64__ + return vmaxv_u16(x) & 0x8000; +#else + union { uint64_t i; simd_short4 v; } u = { .v = x }; + return (u.i & 0x8000800080008000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)x) & 0xaaaa); +#elif defined __arm64__ + return vmaxvq_u16(x) & 0x8000; +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x) { +#if defined __AVX2__ + return (_mm256_movemask_epi8(x) & 0xaaaaaaaa); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x) { + return simd_any((simd_short2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x) { + return simd_any((simd_short3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x) { + return simd_any((simd_short4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x) { + return simd_any((simd_short8)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x) { + return simd_any((simd_short16)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x) { + return simd_any((simd_short32)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3); +#elif defined __arm64__ + return vmaxv_u32(x) & 0x80000000; +#else + union { uint64_t i; simd_int2 v; } u = { .v = x }; + return (u.i & 0x8000000080000000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7); +#elif defined __arm64__ + return simd_any(x.xyzz); +#else + return (x.x | x.y | x.z) & 0x80000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x) { +#if defined __SSE2__ + return _mm_movemask_ps((__m128)x); +#elif defined __arm64__ + return vmaxvq_u32(x) & 0x80000000; +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x) { +#if defined __AVX__ + return _mm256_movemask_ps(x); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x) { + return simd_any((simd_int2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x) { + return simd_any((simd_int3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x) { + return simd_any((simd_int4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x) { + return simd_any((simd_int8)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x) { + return simd_any((simd_int16)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x) { +#if defined __SSE2__ + return _mm_movemask_pd((__m128d)x); +#elif defined __arm64__ + return (x.x | x.y) & 0x8000000000000000U; +#else + return (x.x | x.y) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x) { +#if defined __AVX__ + return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7); +#else + return (x.x | x.y | x.z) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x) { +#if defined __AVX__ + return _mm256_movemask_pd(x); +#else + return simd_any(x.lo | x.hi); +#endif +} 
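/*
 * [Illustrative aside, not part of the vendored header or of this patch.]
 * A minimal sketch of how the predicate and selection operations above fit
 * together. Lane-wise comparisons on simd vectors yield integer vectors
 * whose lanes are all-ones (-1) where the comparison holds, which is
 * exactly the mask form simd_select and simd_bitselect expect. The helper
 * name `saturate_to_one` is hypothetical.
 */
#include <simd/simd.h>

static simd_float4 saturate_to_one(simd_float4 v) {
  const simd_float4 one = simd_make_float4(1.0f, 1.0f, 1.0f, 1.0f);
  simd_int4 over = v > one;        /* -1 in each lane where v exceeds 1 */
  if (!simd_any(over)) return v;   /* no lane out of range: nothing to do */
  if (simd_all(over)) return one;  /* every lane out of range: clamp all */
  /* Blend per lane: take `one` wherever the mask's high-order bit is 1.
   * simd_bitselect(v, one, over) computes the same result here, mixing
   * raw bits instead of whole lanes. */
  return simd_select(v, one, over);
}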
+static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x) { + return simd_any((simd_long2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x) { + return simd_any((simd_long3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x) { + return simd_any((simd_long4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x) { + return simd_any((simd_long8)x); +} + +static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3) == 0x3; +#elif defined __arm64__ + return simd_all(x.xyxy); +#else + union { uint16_t i; simd_char2 v; } u = { .v = x }; + return (u.i & 0x8080) == 0x8080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7) == 0x7; +#elif defined __arm64__ + return simd_all(x.xyzz); +#else + union { uint32_t i; simd_char3 v; } u = { .v = x }; + return (u.i & 0x808080) == 0x808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf) == 0xf; +#elif defined __arm64__ + return simd_all(x.xyzwxyzw); +#else + union { uint32_t i; simd_char4 v; } u = { .v = x }; + return (u.i & 0x80808080) == 0x80808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff) == 0xff; +#elif defined __arm64__ + return vminv_u8(x) & 0x80; +#else + union { uint64_t i; simd_char8 v; } u = { .v = x }; + return (u.i & 0x8080808080808080) == 0x8080808080808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x) { +#if defined __SSE2__ + return _mm_movemask_epi8((__m128i)x) == 0xffff; +#elif defined __arm64__ + return vminvq_u8(x) & 0x80; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x) { +#if defined __AVX2__ + return _mm256_movemask_epi8(x) == 0xffffffff; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x) { + return simd_all((simd_char2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x) { + return simd_all((simd_char3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x) { + return simd_all((simd_char4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x) { + return simd_all((simd_char8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x) { + return simd_all((simd_char16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x) { + return simd_all((simd_char32)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x) { + return simd_all((simd_char64)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa) == 0xa; +#elif defined __arm64__ + return simd_all(x.xyxy); +#else + union { uint32_t i; simd_short2 v; } u = { .v = x }; + return (u.i & 0x80008000) == 0x80008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a) == 0x2a; +#elif defined 
__arm64__ + return simd_all(x.xyzz); +#else + union { uint64_t i; simd_short3 v; } u = { .v = x }; + return (u.i & 0x800080008000) == 0x800080008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa) == 0xaa; +#elif defined __arm64__ + return vminv_u16(x) & 0x8000; +#else + union { uint64_t i; simd_short4 v; } u = { .v = x }; + return (u.i & 0x8000800080008000) == 0x8000800080008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)x) & 0xaaaa) == 0xaaaa; +#elif defined __arm64__ + return vminvq_u16(x) & 0x8000; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x) { +#if defined __AVX2__ + return (_mm256_movemask_epi8(x) & 0xaaaaaaaa) == 0xaaaaaaaa; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x) { + return simd_all((simd_short2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x) { + return simd_all((simd_short3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x) { + return simd_all((simd_short4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x) { + return simd_all((simd_short8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x) { + return simd_all((simd_short16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x) { + return simd_all((simd_short32)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3) == 0x3; +#elif defined __arm64__ + return vminv_u32(x) & 0x80000000; +#else + union { uint64_t i; simd_int2 v; } u = { .v = x }; + return (u.i & 0x8000000080000000) == 0x8000000080000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7) == 0x7; +#elif defined __arm64__ + return simd_all(x.xyzz); +#else + return (x.x & x.y & x.z) & 0x80000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x) { +#if defined __SSE2__ + return _mm_movemask_ps((__m128)x) == 0xf; +#elif defined __arm64__ + return vminvq_u32(x) & 0x80000000; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x) { +#if defined __AVX__ + return _mm256_movemask_ps(x) == 0xff; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x) { + return simd_all((simd_int2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x) { + return simd_all((simd_int3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x) { + return simd_all((simd_int4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x) { + return simd_all((simd_int8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x) { + return simd_all((simd_int16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long2 x) { +#if defined __SSE2__ + return _mm_movemask_pd((__m128d)x) == 0x3; +#elif defined __arm64__ + return (x.x & x.y) & 0x8000000000000000U; +#else + return (x.x & x.y) & 0x8000000000000000U; +#endif +} +static inline 
SIMD_CFUNC simd_bool simd_all(simd_long3 x) { +#if defined __AVX__ + return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7) == 0x7; +#else + return (x.x & x.y & x.z) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x) { +#if defined __AVX__ + return _mm256_movemask_pd(x) == 0xf; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x) { + return simd_all((simd_long2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x) { + return simd_all((simd_long3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x) { + return simd_all((simd_long4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x) { + return simd_all((simd_long8)x); +} + +static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask) { + return simd_make_float2(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask))); +} +static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask) { + return simd_make_float3(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask))); +} +static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask) { +#if defined __SSE4_1__ + return _mm_blendv_ps(x, y, (__m128)mask); +#else + return simd_bitselect(x, y, mask >> 31); +#endif +} +static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask) { +#if defined __AVX__ + return _mm256_blendv_ps(x, y, mask); +#else + return simd_bitselect(x, y, mask >> 31); +#endif +} +static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask) { + return simd_bitselect(x, y, mask >> 31); +} +static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask) { +#if defined __SSE4_1__ + return _mm_blendv_pd(x, y, (__m128d)mask); +#else + return simd_bitselect(x, y, mask >> 63); +#endif +} +static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask) { + return simd_make_double3(simd_select(simd_make_double4_undef(x), simd_make_double4_undef(y), simd_make_long4_undef(mask))); +} +static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask) { +#if defined __AVX__ + return _mm256_blendv_pd(x, y, mask); +#else + return simd_bitselect(x, y, mask >> 63); +#endif +} +static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask) { + return simd_bitselect(x, y, mask >> 63); +} + +static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char16 simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask) { + return (x & 
~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask) { + return (simd_uchar2)simd_bitselect((simd_char2)x, (simd_char2)y, mask); +} +static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask) { + return (simd_uchar3)simd_bitselect((simd_char3)x, (simd_char3)y, mask); +} +static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask) { + return (simd_uchar4)simd_bitselect((simd_char4)x, (simd_char4)y, mask); +} +static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask) { + return (simd_uchar8)simd_bitselect((simd_char8)x, (simd_char8)y, mask); +} +static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask) { + return (simd_uchar16)simd_bitselect((simd_char16)x, (simd_char16)y, mask); +} +static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask) { + return (simd_uchar32)simd_bitselect((simd_char32)x, (simd_char32)y, mask); +} +static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask) { + return (simd_uchar64)simd_bitselect((simd_char64)x, (simd_char64)y, mask); +} +static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask) { + return (simd_ushort2)simd_bitselect((simd_short2)x, (simd_short2)y, mask); +} +static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask) { + return (simd_ushort3)simd_bitselect((simd_short3)x, (simd_short3)y, mask); +} +static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask) { + return (simd_ushort4)simd_bitselect((simd_short4)x, (simd_short4)y, mask); +} +static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask) { + return (simd_ushort8)simd_bitselect((simd_short8)x, (simd_short8)y, mask); +} +static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask) { + return (simd_ushort16)simd_bitselect((simd_short16)x, (simd_short16)y, mask); +} +static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 mask) { + return (simd_ushort32)simd_bitselect((simd_short32)x, (simd_short32)y, mask); +} +static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask) { + return (x & ~mask) | (y & mask); 
+} +static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask) { + return (simd_uint2)simd_bitselect((simd_int2)x, (simd_int2)y, mask); +} +static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask) { + return (simd_uint3)simd_bitselect((simd_int3)x, (simd_int3)y, mask); +} +static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask) { + return (simd_uint4)simd_bitselect((simd_int4)x, (simd_int4)y, mask); +} +static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask) { + return (simd_uint8)simd_bitselect((simd_int8)x, (simd_int8)y, mask); +} +static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask) { + return (simd_uint16)simd_bitselect((simd_int16)x, (simd_int16)y, mask); +} +static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask) { + return (simd_float2)simd_bitselect((simd_int2)x, (simd_int2)y, mask); +} +static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask) { + return (simd_float3)simd_bitselect((simd_int3)x, (simd_int3)y, mask); +} +static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask) { + return (simd_float4)simd_bitselect((simd_int4)x, (simd_int4)y, mask); +} +static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask) { + return (simd_float8)simd_bitselect((simd_int8)x, (simd_int8)y, mask); +} +static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask) { + return (simd_float16)simd_bitselect((simd_int16)x, (simd_int16)y, mask); +} +static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask) { + return (simd_ulong2)simd_bitselect((simd_long2)x, (simd_long2)y, mask); +} +static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask) { + return (simd_ulong3)simd_bitselect((simd_long3)x, (simd_long3)y, mask); +} +static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask) { + return (simd_ulong4)simd_bitselect((simd_long4)x, (simd_long4)y, mask); +} +static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask) { + return (simd_ulong8)simd_bitselect((simd_long8)x, (simd_long8)y, mask); 
+} +static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask) { + return (simd_double2)simd_bitselect((simd_long2)x, (simd_long2)y, mask); +} +static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask) { + return (simd_double3)simd_bitselect((simd_long3)x, (simd_long3)y, mask); +} +static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask) { + return (simd_double4)simd_bitselect((simd_long4)x, (simd_long4)y, mask); +} +static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask) { + return (simd_double8)simd_bitselect((simd_long8)x, (simd_long8)y, mask); +} + +#ifdef __cplusplus +} +#endif +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* __SIMD_LOGIC_HEADER__ */ diff --git a/vfsoverlay/math.h b/vfsoverlay/math.h new file mode 100644 index 00000000..85d51984 --- /dev/null +++ b/vfsoverlay/math.h @@ -0,0 +1,5996 @@ +/*! @header + * The interfaces declared in this header provide elementwise math operations + * on vectors; each lane of the result vector depends only on the data in the + * corresponding lane of the argument(s) to the function. + * + * You should not use the C functions declared in this header directly (these + * are functions with names like `__tg_cos(x)`). These are merely + * implementation details of overloading; instead of calling + * `__tg_cos(x)`, call `cos(x)`. If you are writing C++, use `simd::cos(x)`. + * + * Note that while these vector functions are relatively recent additions, + * scalar fallback is provided for all of them, so they are available even + * when targeting older OS versions. + * + * The following functions are available: + * + * C name C++ name Notes + * ---------------------------------------------------------------------- + * acos(x) simd::acos(x) + * asin(x) simd::asin(x) + * atan(x) simd::atan(x) + * atan2(y,x) simd::atan2(y,x) The argument order matches the scalar + * atan2 function, which gives the angle + * of a line with slope y/x. + * cos(x) simd::cos(x) + * sin(x) simd::sin(x) + * tan(x) simd::tan(x) + * sincos(x) simd::sincos(x) Computes sin(x) and cos(x) more efficiently + * + * cospi(x) simd::cospi(x) Returns cos(pi*x), sin(pi*x), tan(pi*x) + * sinpi(x) simd::sinpi(x) more efficiently and accurately than + * tanpi(x) simd::tanpi(x) would otherwise be possible + * sincospi(x) simd::sincospi(x) Computes sin(pi*x) and cos(pi*x) more efficiently + * + * acosh(x) simd::acosh(x) + * asinh(x) simd::asinh(x) + * atanh(x) simd::atanh(x) + * + * cosh(x) simd::cosh(x) + * sinh(x) simd::sinh(x) + * tanh(x) simd::tanh(x) + * + * exp(x) simd::exp(x) + * exp2(x) simd::exp2(x) + * exp10(x) simd::exp10(x) More efficient than pow(10,x). + * expm1(x) simd::expm1(x) exp(x)-1, accurate even for tiny x. + * + * log(x) simd::log(x) + * log2(x) simd::log2(x) + * log10(x) simd::log10(x) + * log1p(x) simd::log1p(x) log(1+x), accurate even for tiny x.
+ * + * fabs(x) simd::fabs(x) + * cbrt(x) simd::cbrt(x) + * sqrt(x) simd::sqrt(x) + * pow(x,y) simd::pow(x,y) + * copysign(x,y) simd::copysign(x,y) + * hypot(x,y) simd::hypot(x,y) sqrt(x*x + y*y), computed without + * overflow. + * erf(x) simd::erf(x) + * erfc(x) simd::erfc(x) + * tgamma(x) simd::tgamma(x) + * lgamma(x) simd::lgamma(x) + * + * fmod(x,y) simd::fmod(x,y) + * remainder(x,y) simd::remainder(x,y) + * + * ceil(x) simd::ceil(x) + * floor(x) simd::floor(x) + * rint(x) simd::rint(x) + * round(x) simd::round(x) + * trunc(x) simd::trunc(x) + * + * fdim(x,y) simd::fdim(x,y) + * fmax(x,y) simd::fmax(x,y) When one argument to fmin or fmax is + * fmin(x,y) simd::fmin(x,y) constant, use it as the *second* (y) + * argument to get better codegen on some + * architectures. E.g., write fmin(x,2) + * instead of fmin(2,x). + * fma(x,y,z) simd::fma(x,y,z) Fast on arm64 and when targeting AVX2 + * and later; may be quite expensive on + * older hardware. + * simd_muladd(x,y,z) simd::muladd(x,y,z) + * @copyright 2014-2017 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_MATH_HEADER +#define SIMD_MATH_HEADER + +#include <simd/base.h> +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x); + +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x);
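/*
 * [Illustrative aside, not part of the vendored header or of this patch.]
 * A minimal sketch of the calling convention the table above describes:
 * call the plain C names (or the simd:: names in C++) and let overloading
 * pick the vector variant; never call the __tg_ functions directly. The
 * helper name `shaped` is hypothetical.
 */
#include <simd/simd.h>

static simd_float4 shaped(simd_float4 x) {
  simd_float4 c = cos(x);  /* overloading resolves to the simd_float4 variant */
  c = fmin(c, 0.5f);       /* constant operand second, per the fmin/fmax note */
  /* fma maps to a single instruction on arm64 and AVX2; simd_muladd lets the
   * compiler fall back to separate multiply and add where fused would be slow. */
  return simd_muladd(c, c, x);  /* c*c + x */
}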
+/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x); + +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x); +/*!
@abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x); + +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x); +/*! 
@abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x); + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x); +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x); +/*! 
@abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x); +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x); +#endif + +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x); + +/*! 
@abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x); +/*! 
@abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. 
*/ +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x); + +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x); +/*! 
@abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x); + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x); +#endif + +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x); +/*! 
@abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. 
*/ +static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x); +/*! 
@abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x); + +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. 
*/ +static inline SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x); + +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x); + +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x); +/*! 
@abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x); + +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x); + +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_lgamma(simd_float2 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_lgamma(simd_float3 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_lgamma(simd_float4 x); +/*! 
@abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_lgamma(simd_float8 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_lgamma(simd_float16 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_lgamma(simd_double2 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_lgamma(simd_double3 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_lgamma(simd_double4 x); +/*! @abstract Do not call this function; instead use `lgamma` in C and + * Objective-C, and `simd::lgamma` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_lgamma(simd_double8 x); + +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x); + +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x); +/*! 
@abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x); + +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x); + +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_round(simd_float3 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x); +/*! 
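@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] The rounding functions differ only on halfway cases: under the
+ * default rounding mode, `rint` rounds ties to even while `round` rounds
+ * ties away from zero, and `trunc` always rounds toward zero. For a lane
+ * holding 2.5: floor -> 2, ceil -> 3, trunc -> 2, rint -> 2, round -> 3. */
+static inline simd_float4 example_tie_breaking(simd_float4 x) {
+  simd_float4 to_even = rint(x);    /* honors the current rounding mode */
+  simd_float4 away    = round(x);   /* halfway cases away from zero     */
+  return away - to_even;            /* nonzero only at exact .5 ties    */
+}
+
+/*!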
@abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x); + +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x); + +#if SIMD_LIBRARY_VERSION >= 5 +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_float2 x, simd_float2 *sinp, simd_float2 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_float3 x, simd_float3 *sinp, simd_float3 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_float8 x, simd_float8 *sinp, simd_float8 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_float16 x, simd_float16 *sinp, simd_float16 *cosp); +/*! 
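@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] `sincos` returns both results through out-pointers in a single
+ * call, which is cheaper than separate `sin` and `cos` evaluations; it is
+ * only declared when SIMD_LIBRARY_VERSION >= 5, per the guard above. */
+static inline simd_float4 example_sincos(simd_float4 angle) {
+  simd_float4 s, c;
+  sincos(angle, &s, &c);    /* fills sine and cosine elementwise  */
+  return s*s + c*c;         /* identity: approximately 1 per lane */
+}
+
+/*!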
@abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_double3 x, simd_double3 *sinp, simd_double3 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_double4 x, simd_double4 *sinp, simd_double4 *cosp); +/*! @abstract Do not call this function; instead use `sincos` in C and + * Objective-C, and `simd::sincos` in C++. */ +static inline SIMD_NONCONST void __tg_sincos(simd_double8 x, simd_double8 *sinp, simd_double8 *cosp); + +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_float2 x, simd_float2 *sinp, simd_float2 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_float3 x, simd_float3 *sinp, simd_float3 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_float8 x, simd_float8 *sinp, simd_float8 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_float16 x, simd_float16 *sinp, simd_float16 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_double3 x, simd_double3 *sinp, simd_double3 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_double4 x, simd_double4 *sinp, simd_double4 *cosp); +/*! @abstract Do not call this function; instead use `sincospi` in C and + * Objective-C, and `simd::sincospi` in C++. */ +static inline SIMD_NONCONST void __tg_sincospi(simd_double8 x, simd_double8 *sinp, simd_double8 *cosp); + +#endif +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_int2 __tg_isfinite(simd_float2 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_int3 __tg_isfinite(simd_float3 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_int4 __tg_isfinite(simd_float4 x); +/*! 
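@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] The classification functions return integer vectors of the same
+ * lane count: a lane is all ones (-1) where the predicate holds and 0 where
+ * it does not, so the result can feed a bitwise select. The sketch assumes
+ * `simd_bitselect` from the companion <simd/logic.h> header. */
+static inline simd_float4 example_keep_finite(simd_float4 x) {
+  simd_int4 finite = isfinite(x);          /* -1 in finite lanes, else 0 */
+  simd_float4 zero = 0;                    /* scalar splats across lanes */
+  return simd_bitselect(zero, x, finite);  /* finite lanes keep x        */
+}
+
+/*!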
@abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_int8 __tg_isfinite(simd_float8 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_int16 __tg_isfinite(simd_float16 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_long2 __tg_isfinite(simd_double2 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_long3 __tg_isfinite(simd_double3 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_long4 __tg_isfinite(simd_double4 x); +/*! @abstract Do not call this function; instead use `isfinite` in C and + * Objective-C, and `simd::isfinite` in C++. */ +static inline SIMD_CFUNC simd_long8 __tg_isfinite(simd_double8 x); + +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_int2 __tg_isinf(simd_float2 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_int3 __tg_isinf(simd_float3 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_int4 __tg_isinf(simd_float4 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_int8 __tg_isinf(simd_float8 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_int16 __tg_isinf(simd_float16 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_long2 __tg_isinf(simd_double2 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_long3 __tg_isinf(simd_double3 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_long4 __tg_isinf(simd_double4 x); +/*! @abstract Do not call this function; instead use `isinf` in C and + * Objective-C, and `simd::isinf` in C++. */ +static inline SIMD_CFUNC simd_long8 __tg_isinf(simd_double8 x); + +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_int2 __tg_isnan(simd_float2 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_int3 __tg_isnan(simd_float3 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_int4 __tg_isnan(simd_float4 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_int8 __tg_isnan(simd_float8 x); +/*! 
@abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_int16 __tg_isnan(simd_float16 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_long2 __tg_isnan(simd_double2 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_long3 __tg_isnan(simd_double3 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_long4 __tg_isnan(simd_double4 x); +/*! @abstract Do not call this function; instead use `isnan` in C and + * Objective-C, and `simd::isnan` in C++. */ +static inline SIMD_CFUNC simd_long8 __tg_isnan(simd_double8 x); + +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_int2 __tg_isnormal(simd_float2 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_int3 __tg_isnormal(simd_float3 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_int4 __tg_isnormal(simd_float4 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_int8 __tg_isnormal(simd_float8 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_int16 __tg_isnormal(simd_float16 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_long2 __tg_isnormal(simd_double2 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_long3 __tg_isnormal(simd_double3 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_long4 __tg_isnormal(simd_double4 x); +/*! @abstract Do not call this function; instead use `isnormal` in C and + * Objective-C, and `simd::isnormal` in C++. */ +static inline SIMD_CFUNC simd_long8 __tg_isnormal(simd_double8 x); + + +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. 
*/ +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x); + +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. 
*/ +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y); +/*! 
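@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] The two-argument functions are elementwise as well, so four
+ * (x, y) pairs convert to polar form in two calls, with no per-lane loop;
+ * the function name below is the editor's. */
+static inline void example_to_polar(simd_float4 x, simd_float4 y,
+                                    simd_float4 *r, simd_float4 *theta) {
+  *r     = hypot(x, y);   /* sqrt(x*x + y*y) without spurious overflow */
+  *theta = atan2(y, x);   /* quadrant-correct angle in each lane       */
+}
+
+/*!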
@abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. 
*/ +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y); +/*! 
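@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] Like their C library scalar counterparts, `fmin` and `fmax`
+ * return the other operand when exactly one operand is NaN, which makes
+ * them convenient for clamping possibly-NaN data (NaN lanes land on the
+ * lower bound here). */
+static inline simd_float4 example_clamp(simd_float4 x,
+                                        simd_float4 lo, simd_float4 hi) {
+  return fmin(fmax(x, lo), hi);   /* elementwise clamp of each lane */
+}
+
+/*!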
@abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y); + + +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fma(simd_float2 x, simd_float2 y, simd_float2 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fma(simd_float3 x, simd_float3 y, simd_float3 z); +/*! 
@abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z); + +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC float simd_muladd(float x, float y, float z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC double simd_muladd(double x, double y, double z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z); +/*! 
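@discussion [Editorial usage sketch, added in review; not part of Apple's
+ * header.] `fma` guarantees a single rounding (a true fused multiply-add,
+ * emulated in software where no FMA hardware exists), while `simd_muladd`
+ * promises only the fastest available evaluation of z + x*y (`z` is the
+ * accumulator the docs call "accum") and may round twice. Prefer
+ * `simd_muladd` unless the algorithm depends on single rounding. */
+static inline simd_float4 example_axpy(simd_float4 a, simd_float4 x,
+                                       simd_float4 y) {
+  return simd_muladd(a, x, y);   /* a*x + y by the fastest available path */
+}
+
+/*!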
@abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z); +/*! @abstract Computes accum + x*y by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z); + +#ifdef __cplusplus +} /* extern "C" */ + +#include <cmath> +/*! @abstract Do not call this function directly; use simd::acos instead. */ +static SIMD_CPPFUNC float __tg_acos(float x) { return ::acosf(x); } +/*! @abstract Do not call this function directly; use simd::acos instead. */ +static SIMD_CPPFUNC double __tg_acos(double x) { return ::acos(x); } +/*! @abstract Do not call this function directly; use simd::asin instead. */ +static SIMD_CPPFUNC float __tg_asin(float x) { return ::asinf(x); } +/*! @abstract Do not call this function directly; use simd::asin instead. */ +static SIMD_CPPFUNC double __tg_asin(double x) { return ::asin(x); } +/*! @abstract Do not call this function directly; use simd::atan instead. */ +static SIMD_CPPFUNC float __tg_atan(float x) { return ::atanf(x); } +/*! @abstract Do not call this function directly; use simd::atan instead. */ +static SIMD_CPPFUNC double __tg_atan(double x) { return ::atan(x); } +/*! @abstract Do not call this function directly; use simd::cos instead. */ +static SIMD_CPPFUNC float __tg_cos(float x) { return ::cosf(x); } +/*! @abstract Do not call this function directly; use simd::cos instead. */ +static SIMD_CPPFUNC double __tg_cos(double x) { return ::cos(x); } +/*! @abstract Do not call this function directly; use simd::sin instead. */ +static SIMD_CPPFUNC float __tg_sin(float x) { return ::sinf(x); } +/*! @abstract Do not call this function directly; use simd::sin instead. */ +static SIMD_CPPFUNC double __tg_sin(double x) { return ::sin(x); } +/*! @abstract Do not call this function directly; use simd::tan instead. */ +static SIMD_CPPFUNC float __tg_tan(float x) { return ::tanf(x); } +/*! @abstract Do not call this function directly; use simd::tan instead. */ +static SIMD_CPPFUNC double __tg_tan(double x) { return ::tan(x); } +/*! @abstract Do not call this function directly; use simd::cospi instead. */ +static SIMD_CPPFUNC float __tg_cospi(float x) { return ::__cospif(x); } +/*! @abstract Do not call this function directly; use simd::cospi instead. */ +static SIMD_CPPFUNC double __tg_cospi(double x) { return ::__cospi(x); } +/*! @abstract Do not call this function directly; use simd::sinpi instead. */ +static SIMD_CPPFUNC float __tg_sinpi(float x) { return ::__sinpif(x); } +/*! @abstract Do not call this function directly; use simd::sinpi instead. */ +static SIMD_CPPFUNC double __tg_sinpi(double x) { return ::__sinpi(x); } +/*! @abstract Do not call this function directly; use simd::tanpi instead. */ +static SIMD_CPPFUNC float __tg_tanpi(float x) { return ::__tanpif(x); } +/*! @abstract Do not call this function directly; use simd::tanpi instead. */ +static SIMD_CPPFUNC double __tg_tanpi(double x) { return ::__tanpi(x); } +/*! @abstract Do not call this function directly; use simd::acosh instead.
*/ +static SIMD_CPPFUNC float __tg_acosh(float x) { return ::acoshf(x); } +/*! @abstract Do not call this function directly; use simd::acosh instead. */ +static SIMD_CPPFUNC double __tg_acosh(double x) { return ::acosh(x); } +/*! @abstract Do not call this function directly; use simd::asinh instead. */ +static SIMD_CPPFUNC float __tg_asinh(float x) { return ::asinhf(x); } +/*! @abstract Do not call this function directly; use simd::asinh instead. */ +static SIMD_CPPFUNC double __tg_asinh(double x) { return ::asinh(x); } +/*! @abstract Do not call this function directly; use simd::atanh instead. */ +static SIMD_CPPFUNC float __tg_atanh(float x) { return ::atanhf(x); } +/*! @abstract Do not call this function directly; use simd::atanh instead. */ +static SIMD_CPPFUNC double __tg_atanh(double x) { return ::atanh(x); } +/*! @abstract Do not call this function directly; use simd::cosh instead. */ +static SIMD_CPPFUNC float __tg_cosh(float x) { return ::coshf(x); } +/*! @abstract Do not call this function directly; use simd::cosh instead. */ +static SIMD_CPPFUNC double __tg_cosh(double x) { return ::cosh(x); } +/*! @abstract Do not call this function directly; use simd::sinh instead. */ +static SIMD_CPPFUNC float __tg_sinh(float x) { return ::sinhf(x); } +/*! @abstract Do not call this function directly; use simd::sinh instead. */ +static SIMD_CPPFUNC double __tg_sinh(double x) { return ::sinh(x); } +/*! @abstract Do not call this function directly; use simd::tanh instead. */ +static SIMD_CPPFUNC float __tg_tanh(float x) { return ::tanhf(x); } +/*! @abstract Do not call this function directly; use simd::tanh instead. */ +static SIMD_CPPFUNC double __tg_tanh(double x) { return ::tanh(x); } +/*! @abstract Do not call this function directly; use simd::exp instead. */ +static SIMD_CPPFUNC float __tg_exp(float x) { return ::expf(x); } +/*! @abstract Do not call this function directly; use simd::exp instead. */ +static SIMD_CPPFUNC double __tg_exp(double x) { return ::exp(x); } +/*! @abstract Do not call this function directly; use simd::exp2 instead. */ +static SIMD_CPPFUNC float __tg_exp2(float x) { return ::exp2f(x); } +/*! @abstract Do not call this function directly; use simd::exp2 instead. */ +static SIMD_CPPFUNC double __tg_exp2(double x) { return ::exp2(x); } +/*! @abstract Do not call this function directly; use simd::exp10 instead. */ +static SIMD_CPPFUNC float __tg_exp10(float x) { return ::__exp10f(x); } +/*! @abstract Do not call this function directly; use simd::exp10 instead. */ +static SIMD_CPPFUNC double __tg_exp10(double x) { return ::__exp10(x); } +/*! @abstract Do not call this function directly; use simd::expm1 instead. */ +static SIMD_CPPFUNC float __tg_expm1(float x) { return ::expm1f(x); } +/*! @abstract Do not call this function directly; use simd::expm1 instead. */ +static SIMD_CPPFUNC double __tg_expm1(double x) { return ::expm1(x); } +/*! @abstract Do not call this function directly; use simd::log instead. */ +static SIMD_CPPFUNC float __tg_log(float x) { return ::logf(x); } +/*! @abstract Do not call this function directly; use simd::log instead. */ +static SIMD_CPPFUNC double __tg_log(double x) { return ::log(x); } +/*! @abstract Do not call this function directly; use simd::log2 instead. */ +static SIMD_CPPFUNC float __tg_log2(float x) { return ::log2f(x); } +/*! @abstract Do not call this function directly; use simd::log2 instead. */ +static SIMD_CPPFUNC double __tg_log2(double x) { return ::log2(x); } +/*! 
@abstract Do not call this function directly; use simd::log10 instead. */ +static SIMD_CPPFUNC float __tg_log10(float x) { return ::log10f(x); } +/*! @abstract Do not call this function directly; use simd::log10 instead. */ +static SIMD_CPPFUNC double __tg_log10(double x) { return ::log10(x); } +/*! @abstract Do not call this function directly; use simd::log1p instead. */ +static SIMD_CPPFUNC float __tg_log1p(float x) { return ::log1pf(x); } +/*! @abstract Do not call this function directly; use simd::log1p instead. */ +static SIMD_CPPFUNC double __tg_log1p(double x) { return ::log1p(x); } +/*! @abstract Do not call this function directly; use simd::fabs instead. */ +static SIMD_CPPFUNC float __tg_fabs(float x) { return ::fabsf(x); } +/*! @abstract Do not call this function directly; use simd::fabs instead. */ +static SIMD_CPPFUNC double __tg_fabs(double x) { return ::fabs(x); } +/*! @abstract Do not call this function directly; use simd::cbrt instead. */ +static SIMD_CPPFUNC float __tg_cbrt(float x) { return ::cbrtf(x); } +/*! @abstract Do not call this function directly; use simd::cbrt instead. */ +static SIMD_CPPFUNC double __tg_cbrt(double x) { return ::cbrt(x); } +/*! @abstract Do not call this function directly; use simd::sqrt instead. */ +static SIMD_CPPFUNC float __tg_sqrt(float x) { return ::sqrtf(x); } +/*! @abstract Do not call this function directly; use simd::sqrt instead. */ +static SIMD_CPPFUNC double __tg_sqrt(double x) { return ::sqrt(x); } +/*! @abstract Do not call this function directly; use simd::erf instead. */ +static SIMD_CPPFUNC float __tg_erf(float x) { return ::erff(x); } +/*! @abstract Do not call this function directly; use simd::erf instead. */ +static SIMD_CPPFUNC double __tg_erf(double x) { return ::erf(x); } +/*! @abstract Do not call this function directly; use simd::erfc instead. */ +static SIMD_CPPFUNC float __tg_erfc(float x) { return ::erfcf(x); } +/*! @abstract Do not call this function directly; use simd::erfc instead. */ +static SIMD_CPPFUNC double __tg_erfc(double x) { return ::erfc(x); } +/*! @abstract Do not call this function directly; use simd::tgamma instead. */ +static SIMD_CPPFUNC float __tg_tgamma(float x) { return ::tgammaf(x); } +/*! @abstract Do not call this function directly; use simd::tgamma instead. */ +static SIMD_CPPFUNC double __tg_tgamma(double x) { return ::tgamma(x); } +/*! @abstract Do not call this function directly; use simd::lgamma instead. */ +static SIMD_CPPFUNC float __tg_lgamma(float x) { return ::lgammaf(x); } +/*! @abstract Do not call this function directly; use simd::lgamma instead. */ +static SIMD_CPPFUNC double __tg_lgamma(double x) { return ::lgamma(x); } +/*! @abstract Do not call this function directly; use simd::ceil instead. */ +static SIMD_CPPFUNC float __tg_ceil(float x) { return ::ceilf(x); } +/*! @abstract Do not call this function directly; use simd::ceil instead. */ +static SIMD_CPPFUNC double __tg_ceil(double x) { return ::ceil(x); } +/*! @abstract Do not call this function directly; use simd::floor instead. */ +static SIMD_CPPFUNC float __tg_floor(float x) { return ::floorf(x); } +/*! @abstract Do not call this function directly; use simd::floor instead. */ +static SIMD_CPPFUNC double __tg_floor(double x) { return ::floor(x); } +/*! @abstract Do not call this function directly; use simd::rint instead. */ +static SIMD_CPPFUNC float __tg_rint(float x) { return ::rintf(x); } +/*! @abstract Do not call this function directly; use simd::rint instead. 
*/ +static SIMD_CPPFUNC double __tg_rint(double x) { return ::rint(x); } +/*! @abstract Do not call this function directly; use simd::round instead. */ +static SIMD_CPPFUNC float __tg_round(float x) { return ::roundf(x); } +/*! @abstract Do not call this function directly; use simd::round instead. */ +static SIMD_CPPFUNC double __tg_round(double x) { return ::round(x); } +/*! @abstract Do not call this function directly; use simd::trunc instead. */ +static SIMD_CPPFUNC float __tg_trunc(float x) { return ::truncf(x); } +/*! @abstract Do not call this function directly; use simd::trunc instead. */ +static SIMD_CPPFUNC double __tg_trunc(double x) { return ::trunc(x); } +#if SIMD_LIBRARY_VERSION >= 5 +/*! @abstract Do not call this function directly; use simd::sincos instead. */ +static SIMD_INLINE SIMD_NODEBUG void __tg_sincos(float x, float *sinp, float *cosp) { ::__sincosf(x, sinp, cosp); } +/*! @abstract Do not call this function directly; use simd::sincos instead. */ +static SIMD_INLINE SIMD_NODEBUG void __tg_sincos(double x, double *sinp, double *cosp) { ::__sincos(x, sinp, cosp); } +/*! @abstract Do not call this function directly; use simd::sincospi + * instead. */ +static SIMD_INLINE SIMD_NODEBUG void __tg_sincospi(float x, float *sinp, float *cosp) { ::__sincospif(x, sinp, cosp); } +/*! @abstract Do not call this function directly; use simd::sincospi + * instead. */ +static SIMD_INLINE SIMD_NODEBUG void __tg_sincospi(double x, double *sinp, double *cosp) { ::__sincospi(x, sinp, cosp); } +#endif +/*! @abstract Do not call this function directly; use simd::isfinite + * instead. */ +static SIMD_CPPFUNC float __tg_isfinite(float x) { return ::isfinite(x); } +/*! @abstract Do not call this function directly; use simd::isfinite + * instead. */ +static SIMD_CPPFUNC double __tg_isfinite(double x) { return ::isfinite(x); } +/*! @abstract Do not call this function directly; use simd::isinf instead. */ +static SIMD_CPPFUNC float __tg_isinf(float x) { return ::isinf(x); } +/*! @abstract Do not call this function directly; use simd::isinf instead. */ +static SIMD_CPPFUNC double __tg_isinf(double x) { return ::isinf(x); } +/*! @abstract Do not call this function directly; use simd::isnan instead. */ +static SIMD_CPPFUNC float __tg_isnan(float x) { return ::isnan(x); } +/*! @abstract Do not call this function directly; use simd::isnan instead. */ +static SIMD_CPPFUNC double __tg_isnan(double x) { return ::isnan(x); } +/*! @abstract Do not call this function directly; use simd::isnormal + * instead. */ +static SIMD_CPPFUNC float __tg_isnormal(float x) { return ::isnormal(x); } +/*! @abstract Do not call this function directly; use simd::isnormal + * instead. */ +static SIMD_CPPFUNC double __tg_isnormal(double x) { return ::isnormal(x); } +/*! @abstract Do not call this function directly; use simd::atan2 instead. */ +static SIMD_CPPFUNC float __tg_atan2(float x, float y) { return ::atan2f(x, y); } +/*! @abstract Do not call this function directly; use simd::atan2 instead. */ +static SIMD_CPPFUNC double __tg_atan2(double x, double y) { return ::atan2(x, y); } +/*! @abstract Do not call this function directly; use simd::hypot instead. */ +static SIMD_CPPFUNC float __tg_hypot(float x, float y) { return ::hypotf(x, y); } +/*! @abstract Do not call this function directly; use simd::hypot instead. */ +static SIMD_CPPFUNC double __tg_hypot(double x, double y) { return ::hypot(x, y); } +/*! @abstract Do not call this function directly; use simd::pow instead. 
*/
+static SIMD_CPPFUNC float __tg_pow(float x, float y) { return ::powf(x, y); }
+/*! @abstract Do not call this function directly; use simd::pow instead. */
+static SIMD_CPPFUNC double __tg_pow(double x, double y) { return ::pow(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmod instead. */
+static SIMD_CPPFUNC float __tg_fmod(float x, float y) { return ::fmodf(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmod instead. */
+static SIMD_CPPFUNC double __tg_fmod(double x, double y) { return ::fmod(x, y); }
+/*! @abstract Do not call this function directly; use simd::remainder
+ * instead. */
+static SIMD_CPPFUNC float __tg_remainder(float x, float y) { return ::remainderf(x, y); }
+/*! @abstract Do not call this function directly; use simd::remainder
+ * instead. */
+static SIMD_CPPFUNC double __tg_remainder(double x, double y) { return ::remainder(x, y); }
+/*! @abstract Do not call this function directly; use simd::copysign
+ * instead. */
+static SIMD_CPPFUNC float __tg_copysign(float x, float y) { return ::copysignf(x, y); }
+/*! @abstract Do not call this function directly; use simd::copysign
+ * instead. */
+static SIMD_CPPFUNC double __tg_copysign(double x, double y) { return ::copysign(x, y); }
+/*! @abstract Do not call this function directly; use simd::nextafter
+ * instead. */
+static SIMD_CPPFUNC float __tg_nextafter(float x, float y) { return ::nextafterf(x, y); }
+/*! @abstract Do not call this function directly; use simd::nextafter
+ * instead. */
+static SIMD_CPPFUNC double __tg_nextafter(double x, double y) { return ::nextafter(x, y); }
+/*! @abstract Do not call this function directly; use simd::fdim instead. */
+static SIMD_CPPFUNC float __tg_fdim(float x, float y) { return ::fdimf(x, y); }
+/*! @abstract Do not call this function directly; use simd::fdim instead. */
+static SIMD_CPPFUNC double __tg_fdim(double x, double y) { return ::fdim(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmax instead. */
+static SIMD_CPPFUNC float __tg_fmax(float x, float y) { return ::fmaxf(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmax instead. */
+static SIMD_CPPFUNC double __tg_fmax(double x, double y) { return ::fmax(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmin instead. */
+static SIMD_CPPFUNC float __tg_fmin(float x, float y) { return ::fminf(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmin instead. */
+static SIMD_CPPFUNC double __tg_fmin(double x, double y) { return ::fmin(x, y); }
+/*! @abstract Do not call this function directly; use simd::fma instead. */
+static SIMD_CPPFUNC float __tg_fma(float x, float y, float z) { return ::fmaf(x, y, z); }
+/*! @abstract Do not call this function directly; use simd::fma instead. */
+static SIMD_CPPFUNC double __tg_fma(double x, double y, double z) { return ::fma(x, y, z); }
+
+namespace simd {
+/*! @abstract Generalizes the function acos to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN acos(fptypeN x) { return ::__tg_acos(x); }
+
+/*! @abstract Generalizes the function asin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN asin(fptypeN x) { return ::__tg_asin(x); }
+
+/*! @abstract Generalizes the function atan to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atan(fptypeN x) { return ::__tg_atan(x); }
+
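These templates give one spelling, simd::NAME, for every scalar and vector type; the C branch later in this header reaches the same __tg_ kernels through the <tgmath.h> piggyback, so plain C code can call the standard names on vectors. A sketch of the C-side call, assuming that macro machinery applies (illustrative only; the function name is invented):

    #include <simd/simd.h>

    void angles(void) {
        simd_float4 x = simd_make_float4(-1.0f, 0.0f, 0.5f, 1.0f);
        simd_float4 a = acos(x); /* per-lane arc cosine: {pi, pi/2, pi/3, 0} */
        (void)a;
    }

+/*!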
@abstract Generalizes the function cos to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cos(fptypeN x) { return ::__tg_cos(x); }
+
+/*! @abstract Generalizes the function sin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sin(fptypeN x) { return ::__tg_sin(x); }
+
+/*! @abstract Generalizes the function tan to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tan(fptypeN x) { return ::__tg_tan(x); }
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the function cospi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cospi(fptypeN x) { return ::__tg_cospi(x); }
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the function sinpi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sinpi(fptypeN x) { return ::__tg_sinpi(x); }
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the function tanpi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tanpi(fptypeN x) { return ::__tg_tanpi(x); }
+#endif
+
+/*! @abstract Generalizes the function acosh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN acosh(fptypeN x) { return ::__tg_acosh(x); }
+
+/*! @abstract Generalizes the function asinh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN asinh(fptypeN x) { return ::__tg_asinh(x); }
+
+/*! @abstract Generalizes the function atanh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atanh(fptypeN x) { return ::__tg_atanh(x); }
+
+/*! @abstract Generalizes the function cosh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cosh(fptypeN x) { return ::__tg_cosh(x); }
+
+/*! @abstract Generalizes the function sinh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sinh(fptypeN x) { return ::__tg_sinh(x); }
+
+/*! @abstract Generalizes the function tanh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tanh(fptypeN x) { return ::__tg_tanh(x); }
+
+/*! @abstract Generalizes the function exp to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp(fptypeN x) { return ::__tg_exp(x); }
+
+/*! @abstract Generalizes the function exp2 to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp2(fptypeN x) { return ::__tg_exp2(x); }
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the function exp10 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp10(fptypeN x) { return ::__tg_exp10(x); }
+#endif
+
+/*! @abstract Generalizes the function expm1 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN expm1(fptypeN x) { return ::__tg_expm1(x); }
+
+/*! @abstract Generalizes the function log to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log(fptypeN x) { return ::__tg_log(x); }
+
+/*! @abstract Generalizes the function log2 to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log2(fptypeN x) { return ::__tg_log2(x); }
+
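The expm1 and log1p wrappers exist for the usual numerical reason, which the header does not spell out: for |x| far below 1, forming 1 + x first rounds away most of x, so log(1 + x) loses precision that log1p(x) keeps. A self-contained scalar sketch:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double x = 1e-12;
        printf("%.17g\n", log(1.0 + x)); /* contaminated by rounding in 1 + x */
        printf("%.17g\n", log1p(x));     /* accurate: essentially x - x*x/2 */
        return 0;
    }

+/*! @abstract Generalizes the function log10 to operate on vectors
+ * of floats and doubles.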
*/
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log10(fptypeN x) { return ::__tg_log10(x); }
+
+/*! @abstract Generalizes the function log1p to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log1p(fptypeN x) { return ::__tg_log1p(x); }
+
+/*! @abstract Generalizes the function fabs to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fabs(fptypeN x) { return ::__tg_fabs(x); }
+
+/*! @abstract Generalizes the function cbrt to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cbrt(fptypeN x) { return ::__tg_cbrt(x); }
+
+/*! @abstract Generalizes the function sqrt to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sqrt(fptypeN x) { return ::__tg_sqrt(x); }
+
+/*! @abstract Generalizes the function erf to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN erf(fptypeN x) { return ::__tg_erf(x); }
+
+/*! @abstract Generalizes the function erfc to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN erfc(fptypeN x) { return ::__tg_erfc(x); }
+
+/*! @abstract Generalizes the function tgamma to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tgamma(fptypeN x) { return ::__tg_tgamma(x); }
+
+/*! @abstract Generalizes the function lgamma to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN lgamma(fptypeN x) { return ::__tg_lgamma(x); }
+
+/*! @abstract Generalizes the function ceil to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN ceil(fptypeN x) { return ::__tg_ceil(x); }
+
+/*! @abstract Generalizes the function floor to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN floor(fptypeN x) { return ::__tg_floor(x); }
+
+/*! @abstract Generalizes the function rint to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN rint(fptypeN x) { return ::__tg_rint(x); }
+
+/*! @abstract Generalizes the function round to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN round(fptypeN x) { return ::__tg_round(x); }
+
+/*! @abstract Generalizes the function trunc to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN trunc(fptypeN x) { return ::__tg_trunc(x); }
+
+#if SIMD_LIBRARY_VERSION >= 5
+/*! @abstract Computes sincos more efficiently than separate computations. */
+ template <typename fptypeN>
+ static SIMD_INLINE SIMD_NODEBUG void sincos(fptypeN x, fptypeN *sinp, fptypeN *cosp) { ::__tg_sincos(x, sinp, cosp); }
+
+/*! @abstract Computes sincospi more efficiently than separate computations. */
+ template <typename fptypeN>
+ static SIMD_INLINE SIMD_NODEBUG void sincospi(fptypeN x, fptypeN *sinp, fptypeN *cosp) { ::__tg_sincospi(x, sinp, cosp); }
+
+#endif
+/*! @abstract Generalizes the function isfinite to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC
+ typename std::enable_if<std::is_floating_point<typename traits<fptypeN>::scalar_t>::value, typename traits<fptypeN>::mask_t>::type
+ isfinite(fptypeN x) { return ::__tg_isfinite(x); }
+
+/*! @abstract Generalizes the function isinf to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC
+ typename std::enable_if<std::is_floating_point<typename traits<fptypeN>::scalar_t>::value, typename traits<fptypeN>::mask_t>::type
+ isinf(fptypeN x) { return ::__tg_isinf(x); }
+
+/*!
@abstract Generalizes the function isnan to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC
+ typename std::enable_if<std::is_floating_point<typename traits<fptypeN>::scalar_t>::value, typename traits<fptypeN>::mask_t>::type
+ isnan(fptypeN x) { return ::__tg_isnan(x); }
+
+/*! @abstract Generalizes the function isnormal to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC
+ typename std::enable_if<std::is_floating_point<typename traits<fptypeN>::scalar_t>::value, typename traits<fptypeN>::mask_t>::type
+ isnormal(fptypeN x) { return ::__tg_isnormal(x); }
+
+/*! @abstract Generalizes the function atan2 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atan2(fptypeN y, fptypeN x) { return ::__tg_atan2(y, x); }
+
+/*! @abstract Generalizes the function hypot to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN hypot(fptypeN x, fptypeN y) { return ::__tg_hypot(x, y); }
+
+/*! @abstract Generalizes the function pow to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN pow(fptypeN x, fptypeN y) { return ::__tg_pow(x, y); }
+
+/*! @abstract Generalizes the function fmod to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmod(fptypeN x, fptypeN y) { return ::__tg_fmod(x, y); }
+
+/*! @abstract Generalizes the function remainder to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN remainder(fptypeN x, fptypeN y) { return ::__tg_remainder(x, y); }
+
+/*! @abstract Generalizes the function copysign to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN copysign(fptypeN x, fptypeN y) { return ::__tg_copysign(x, y); }
+
+/*! @abstract Generalizes the function nextafter to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN nextafter(fptypeN x, fptypeN y) { return ::__tg_nextafter(x, y); }
+
+/*! @abstract Generalizes the function fdim to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fdim(fptypeN x, fptypeN y) { return ::__tg_fdim(x, y); }
+
+/*! @abstract Generalizes the function fmax to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmax(fptypeN x, fptypeN y) { return ::__tg_fmax(x, y); }
+
+/*! @abstract Generalizes the function fmin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmin(fptypeN x, fptypeN y) { return ::__tg_fmin(x, y); }
+
+/*! @abstract Generalizes the function fma to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fma(fptypeN x, fptypeN y, fptypeN z) { return ::__tg_fma(x, y, z); }
+
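Unlike their scalar counterparts, the four classification functions above return a lane mask (an integer vector with -1 in lanes where the predicate holds), and the C macros later in this header return the same masks; either form composes directly with simd_bitselect. A sketch that scrubs NaN lanes before a reduction, written against the C macro path and assuming <simd/simd.h> (the helper name is invented for illustration):

    #include <simd/simd.h>

    static simd_float4 zero_nan_lanes(simd_float4 v) {
        simd_int4 bad = isnan(v);                         /* -1 where v is NaN, 0 elsewhere */
        return simd_bitselect(v, (simd_float4)0.0f, bad); /* take 0 bits where mask is set */
    }

+/*! @abstract Computes x*y + z by the most efficient means available; either
+ * a fused multiply add or separate multiply and add.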
*/
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN muladd(fptypeN x, fptypeN y, fptypeN z) { return ::simd_muladd(x, y, z); }
+};
+
+extern "C" {
+#else
+#include <tgmath.h>
+/* In C and Objective-C, we need some infrastructure to piggyback on tgmath.h */
+static SIMD_OVERLOAD simd_float2 __tg_promote(simd_float2);
+static SIMD_OVERLOAD simd_float3 __tg_promote(simd_float3);
+static SIMD_OVERLOAD simd_float4 __tg_promote(simd_float4);
+static SIMD_OVERLOAD simd_float8 __tg_promote(simd_float8);
+static SIMD_OVERLOAD simd_float16 __tg_promote(simd_float16);
+static SIMD_OVERLOAD simd_double2 __tg_promote(simd_double2);
+static SIMD_OVERLOAD simd_double3 __tg_promote(simd_double3);
+static SIMD_OVERLOAD simd_double4 __tg_promote(simd_double4);
+static SIMD_OVERLOAD simd_double8 __tg_promote(simd_double8);
+
+/* Apple extensions to <math.h>, added in macOS 10.9 and iOS 7.0 */
+#if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_9 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_7_0 || \
+ __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_19_0
+static inline SIMD_CFUNC float __tg_cospi(float x) { return __cospif(x); }
+static inline SIMD_CFUNC double __tg_cospi(double x) { return __cospi(x); }
+#undef cospi
+/*! @abstract `cospi(x)` computes `cos(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `cos`. Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define cospi(__x) __tg_cospi(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC float __tg_sinpi(float x) { return __sinpif(x); }
+static inline SIMD_CFUNC double __tg_sinpi(double x) { return __sinpi(x); }
+#undef sinpi
+/*! @abstract `sinpi(x)` computes `sin(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `sin`. Defined for `float` and `double` as well as vectors
+ * of floats and doubles as provided by `<simd/simd.h>`. */
+#define sinpi(__x) __tg_sinpi(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC float __tg_tanpi(float x) { return __tanpif(x); }
+static inline SIMD_CFUNC double __tg_tanpi(double x) { return __tanpi(x); }
+#undef tanpi
+/*! @abstract `tanpi(x)` computes `tan(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `tan`. Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define tanpi(__x) __tg_tanpi(__tg_promote1((__x))(__x))
+
+#if SIMD_LIBRARY_VERSION >= 5
+static inline SIMD_NONCONST void __tg_sincos(float x, float *sinp, float *cosp) { __sincosf(x, sinp, cosp); }
+static inline SIMD_NONCONST void __tg_sincos(double x, double *sinp, double *cosp) { __sincos(x, sinp, cosp); }
+#undef sincos
+/*! @abstract `sincos(x)` computes `sin(x)` and `cos(x)` more efficiently.
+ *
+ * @discussion Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define sincos(__x, __sinp, __cosp) __tg_sincos(__tg_promote1((__x))(__x), __sinp, __cosp)
+
+static inline SIMD_NONCONST void __tg_sincospi(float x, float *sinp, float *cosp) { __sincospif(x, sinp, cosp); }
+static inline SIMD_NONCONST void __tg_sincospi(double x, double *sinp, double *cosp) { __sincospi(x, sinp, cosp); }
+#undef sincospi
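The pi-scaled functions above skip the intermediate product pi * x, so arguments that are exact in the pi-scaled domain stay exact: cospi(0.5) is exactly zero, while cos(M_PI * 0.5) can only return the cosine of the nearest double to pi/2. A scalar sketch (illustrative only):

    #include <simd/simd.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        printf("%.17g\n", cospi(0.5));      /* exactly 0 */
        printf("%.17g\n", cos(M_PI * 0.5)); /* about 6.1e-17, not 0 */
        return 0;
    }

+/*! @abstract `sincospi(x)` computes `sin(pi * x)` and `cos(pi * x)` more efficiently.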
+ *
+ * @discussion Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define sincospi(__x, __sinp, __cosp) __tg_sincospi(__tg_promote1((__x))(__x), __sinp, __cosp)
+#endif // SIMD_LIBRARY_VERSION >= 5
+
+static inline SIMD_CFUNC float __tg_exp10(float x) { return __exp10f(x); }
+static inline SIMD_CFUNC double __tg_exp10(double x) { return __exp10(x); }
+#undef exp10
+/*! @abstract `exp10(x)` computes `10**x` more efficiently and accurately
+ * than `pow(10, x)`.
+ *
+ * @discussion Defined for `float` and `double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define exp10(__x) __tg_exp10(__tg_promote1((__x))(__x))
+#endif
+
+#if (defined(__GNUC__) && 0 == __FINITE_MATH_ONLY__)
+static inline SIMD_CFUNC int __tg_isfinite(float x) { return __inline_isfinitef(x); }
+static inline SIMD_CFUNC int __tg_isfinite(double x) { return __inline_isfinited(x); }
+static inline SIMD_CFUNC int __tg_isfinite(long double x) { return __inline_isfinitel(x); }
+#undef isfinite
+/*! @abstract `__tg_isfinite(x)` determines if x is a finite value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isfinite(__x) __tg_isfinite(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isinf(float x) { return __inline_isinff(x); }
+static inline SIMD_CFUNC int __tg_isinf(double x) { return __inline_isinfd(x); }
+static inline SIMD_CFUNC int __tg_isinf(long double x) { return __inline_isinfl(x); }
+#undef isinf
+/*! @abstract `__tg_isinf(x)` determines if x is positive or negative infinity.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isinf(__x) __tg_isinf(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isnan(float x) { return __inline_isnanf(x); }
+static inline SIMD_CFUNC int __tg_isnan(double x) { return __inline_isnand(x); }
+static inline SIMD_CFUNC int __tg_isnan(long double x) { return __inline_isnanl(x); }
+#undef isnan
+/*! @abstract `__tg_isnan(x)` determines if x is a not-a-number (NaN) value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isnan(__x) __tg_isnan(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isnormal(float x) { return __inline_isnormalf(x); }
+static inline SIMD_CFUNC int __tg_isnormal(double x) { return __inline_isnormald(x); }
+static inline SIMD_CFUNC int __tg_isnormal(long double x) { return __inline_isnormall(x); }
+#undef isnormal
+/*! @abstract `__tg_isnormal(x)` determines if x is a normal value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isnormal(__x) __tg_isnormal(__tg_promote1((__x))(__x))
+
+#else /* defined(__GNUC__) && 0 == __FINITE_MATH_ONLY__ */
+
+static inline SIMD_CFUNC int __tg_isfinite(float x) { return __isfinitef(x); }
+static inline SIMD_CFUNC int __tg_isfinite(double x) { return __isfinited(x); }
+static inline SIMD_CFUNC int __tg_isfinite(long double x) { return __isfinitel(x); }
+#undef isfinite
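exp10 follows the same pattern as the pi-scaled functions: a dedicated kernel beats pow(10, x) in both speed and accuracy, and the macro above makes the call type-generic in C. A sketch, assuming a SIMD_LIBRARY_VERSION that ships the vector kernels (the function name is invented for illustration):

    #include <simd/simd.h>

    void decades(void) {
        simd_float4 e = simd_make_float4(0.0f, 1.0f, 2.0f, 3.0f);
        simd_float4 p = exp10(e); /* {1, 10, 100, 1000}, per lane */
        (void)p;
    }

+/*! @abstract `__tg_isfinite(x)` determines if x is a finite value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`.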
*/
+#define isfinite(__x) __tg_isfinite(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isinf(float x) { return __isinff(x); }
+static inline SIMD_CFUNC int __tg_isinf(double x) { return __isinfd(x); }
+static inline SIMD_CFUNC int __tg_isinf(long double x) { return __isinfl(x); }
+#undef isinf
+/*! @abstract `__tg_isinf(x)` determines if x is positive or negative infinity.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isinf(__x) __tg_isinf(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isnan(float x) { return __isnanf(x); }
+static inline SIMD_CFUNC int __tg_isnan(double x) { return __isnand(x); }
+static inline SIMD_CFUNC int __tg_isnan(long double x) { return __isnanl(x); }
+#undef isnan
+/*! @abstract `__tg_isnan(x)` determines if x is a not-a-number (NaN) value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isnan(__x) __tg_isnan(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC int __tg_isnormal(float x) { return __isnormalf(x); }
+static inline SIMD_CFUNC int __tg_isnormal(double x) { return __isnormald(x); }
+static inline SIMD_CFUNC int __tg_isnormal(long double x) { return __isnormall(x); }
+#undef isnormal
+/*! @abstract `__tg_isnormal(x)` determines if x is a normal value.
+ *
+ * @discussion Defined for `float`, `double` and `long double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define isnormal(__x) __tg_isnormal(__tg_promote1((__x))(__x))
+#endif /* defined(__GNUC__) && 0 == __FINITE_MATH_ONLY__ */
+#endif /* !__cplusplus */
+
+#pragma mark - fabs implementation
+static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffff); }
+static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffff); }
+static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffff); }
+static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffff); }
+
+#pragma mark - isfinite implementation
+static inline SIMD_CFUNC simd_int2 __tg_isfinite(simd_float2 x) { return x == x && __tg_fabs(x) != (simd_float2)INFINITY; }
+static inline SIMD_CFUNC simd_int3 __tg_isfinite(simd_float3 x) { return x == x && __tg_fabs(x) != (simd_float3)INFINITY; }
+static inline SIMD_CFUNC simd_int4 __tg_isfinite(simd_float4 x) { return x == x && __tg_fabs(x) != (simd_float4)INFINITY; }
+static inline SIMD_CFUNC simd_int8 __tg_isfinite(simd_float8 x) { return x == x && __tg_fabs(x) != (simd_float8)INFINITY; }
+static inline SIMD_CFUNC simd_int16 __tg_isfinite(simd_float16 x) { return x == x && __tg_fabs(x) != (simd_float16)INFINITY; }
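The fabs lanes above are pure bit manipulation: simd_bitselect(0.0, x, 0x7fffffff) takes every bit of x except the sign bit, which comes from the 0.0 operand, and the copysign implementation further down uses the complementary mask. The same trick on one scalar lane, as a self-contained sketch (the helper name is invented for illustration):

    #include <stdint.h>
    #include <string.h>

    static float fabs_by_bits(float x) {
        uint32_t u;
        memcpy(&u, &x, sizeof u); /* reinterpret the float's bits */
        u &= 0x7fffffffu;         /* clear the sign bit, keep exponent and mantissa */
        memcpy(&x, &u, sizeof x);
        return x;
    }

+static inline SIMD_CFUNC simd_long2 __tg_isfinite(simd_double2 x) { return x == x &&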
__tg_fabs(x) != (simd_double2)INFINITY; } +static inline SIMD_CFUNC simd_long3 __tg_isfinite(simd_double3 x) { return x == x && __tg_fabs(x) != (simd_double3)INFINITY; } +static inline SIMD_CFUNC simd_long4 __tg_isfinite(simd_double4 x) { return x == x && __tg_fabs(x) != (simd_double4)INFINITY; } +static inline SIMD_CFUNC simd_long8 __tg_isfinite(simd_double8 x) { return x == x && __tg_fabs(x) != (simd_double8)INFINITY; } + +#pragma mark - isinf implementation +static inline SIMD_CFUNC simd_int2 __tg_isinf(simd_float2 x) { return __tg_fabs(x) == (simd_float2)INFINITY; } +static inline SIMD_CFUNC simd_int3 __tg_isinf(simd_float3 x) { return __tg_fabs(x) == (simd_float3)INFINITY; } +static inline SIMD_CFUNC simd_int4 __tg_isinf(simd_float4 x) { return __tg_fabs(x) == (simd_float4)INFINITY; } +static inline SIMD_CFUNC simd_int8 __tg_isinf(simd_float8 x) { return __tg_fabs(x) == (simd_float8)INFINITY; } +static inline SIMD_CFUNC simd_int16 __tg_isinf(simd_float16 x) { return __tg_fabs(x) == (simd_float16)INFINITY; } +static inline SIMD_CFUNC simd_long2 __tg_isinf(simd_double2 x) { return __tg_fabs(x) == (simd_double2)INFINITY; } +static inline SIMD_CFUNC simd_long3 __tg_isinf(simd_double3 x) { return __tg_fabs(x) == (simd_double3)INFINITY; } +static inline SIMD_CFUNC simd_long4 __tg_isinf(simd_double4 x) { return __tg_fabs(x) == (simd_double4)INFINITY; } +static inline SIMD_CFUNC simd_long8 __tg_isinf(simd_double8 x) { return __tg_fabs(x) == (simd_double8)INFINITY; } + +#pragma mark - isnan implementation +static inline SIMD_CFUNC simd_int2 __tg_isnan(simd_float2 x) { return x != x; } +static inline SIMD_CFUNC simd_int3 __tg_isnan(simd_float3 x) { return x != x; } +static inline SIMD_CFUNC simd_int4 __tg_isnan(simd_float4 x) { return x != x; } +static inline SIMD_CFUNC simd_int8 __tg_isnan(simd_float8 x) { return x != x; } +static inline SIMD_CFUNC simd_int16 __tg_isnan(simd_float16 x) { return x != x; } +static inline SIMD_CFUNC simd_long2 __tg_isnan(simd_double2 x) { return x != x; } +static inline SIMD_CFUNC simd_long3 __tg_isnan(simd_double3 x) { return x != x; } +static inline SIMD_CFUNC simd_long4 __tg_isnan(simd_double4 x) { return x != x; } +static inline SIMD_CFUNC simd_long8 __tg_isnan(simd_double8 x) { return x != x; } + +#pragma mark - isnormal implementation +static inline SIMD_CFUNC simd_int2 __tg_isnormal(simd_float2 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_float2)__FLT_MIN__; } +static inline SIMD_CFUNC simd_int3 __tg_isnormal(simd_float3 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_float3)__FLT_MIN__; } +static inline SIMD_CFUNC simd_int4 __tg_isnormal(simd_float4 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_float4)__FLT_MIN__; } +static inline SIMD_CFUNC simd_int8 __tg_isnormal(simd_float8 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_float8)__FLT_MIN__; } +static inline SIMD_CFUNC simd_int16 __tg_isnormal(simd_float16 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_float16)__FLT_MIN__; } +static inline SIMD_CFUNC simd_long2 __tg_isnormal(simd_double2 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_double2)__DBL_MIN__; } +static inline SIMD_CFUNC simd_long3 __tg_isnormal(simd_double3 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_double3)__DBL_MIN__; } +static inline SIMD_CFUNC simd_long4 __tg_isnormal(simd_double4 x) { return __tg_isfinite(x) && __tg_fabs(x) >= (simd_double4)__DBL_MIN__; } +static inline SIMD_CFUNC simd_long8 __tg_isnormal(simd_double8 x) { return __tg_isfinite(x) && __tg_fabs(x) >= 
(simd_double8)__DBL_MIN__; } + +#pragma mark - fmin, fmax implementation +static SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y) { +#if defined __SSE2__ + return simd_make_float2(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y))); +#elif defined __arm64__ + return vminnm_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmin_f32(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y))); +} + +static SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm_range_ps(x, y, 4); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_min_ps(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_min_ps(x, y), x, y != y); +#elif defined __arm64__ + return vminnmq_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vminq_f32(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm256_range_ps(x, y, 4); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_min_ps(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_min_ps(x, y), x, y != y); +#else + return simd_make_float8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y) { +#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__ + return _mm512_range_ps(x, y, 4); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_min_ps(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_min_ps(x, y), x, y != y); +#else + return simd_make_float16(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm_range_pd(x, y, 4); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_min_pd(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_min_pd(x, y), x, y != y); +#elif defined __arm64__ + return vminnmq_f64(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmin(simd_make_double4_undef(x), simd_make_double4_undef(y))); +} + +static SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm256_range_pd(x, y, 4); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_min_pd(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_min_pd(x, y), x, y != y); +#else + return simd_make_double4(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y) { +#if defined __x86_64__ && defined __AVX512DQ__ + return _mm512_range_pd(x, y, 4); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_min_pd(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_min_pd(x, y), x, y != y); +#else + return simd_make_double8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); 
+#endif +} + +static SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y) { +#if defined __SSE2__ + return simd_make_float2(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y))); +#elif defined __arm64__ + return vmaxnm_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmax_f32(x, y); +#else + return simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y))); +} + +static SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm_range_ps(x, y, 5); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_max_ps(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_max_ps(x, y), x, y != y); +#elif defined __arm64__ + return vmaxnmq_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmaxq_f32(x, y); +#else + return simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm256_range_ps(x, y, 5); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_max_ps(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_max_ps(x, y), x, y != y); +#else + return simd_make_float8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y) { +#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__ + return _mm512_range_ps(x, y, 5); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_max_ps(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_max_ps(x, y), x, y != y); +#else + return simd_make_float16(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm_range_pd(x, y, 5); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_max_pd(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_max_pd(x, y), x, y != y); +#elif defined __arm64__ + return vmaxnmq_f64(x, y); +#else + return simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmax(simd_make_double4_undef(x), simd_make_double4_undef(y))); +} + +static SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm256_range_pd(x, y, 5); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_max_pd(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_max_pd(x, y), x, y != y); +#else + return simd_make_double4(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y) { +#if defined __x86_64__ && defined __AVX512DQ__ + return _mm512_range_pd(x, y, 5); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_max_pd(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_max_pd(x, y), x, y != y); +#else + return simd_make_double8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +#pragma mark - copysign implementation +static 
inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y) { return simd_bitselect(y, x, 0x7fffffffffffffff); } +static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y) { return simd_bitselect(y, x, 0x7fffffffffffffff); } +static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y) { return simd_bitselect(y, x, 0x7fffffffffffffff); } +static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y) { return simd_bitselect(y, x, 0x7fffffffffffffff); } + +#pragma mark - sqrt implementation +static SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x) { +#if defined __SSE2__ + return simd_make_float2(__tg_sqrt(simd_make_float4_undef(x))); +#elif defined __arm64__ + return vsqrt_f32(x); +#else + return simd_make_float2(sqrt(x.x), sqrt(x.y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x) { + return simd_make_float3(__tg_sqrt(simd_make_float4_undef(x))); +} + +static SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x) { +#if defined __SSE2__ + return _mm_sqrt_ps(x); +#elif defined __arm64__ + return vsqrtq_f32(x); +#else + return simd_make_float4(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x) { +#if defined __AVX__ + return _mm256_sqrt_ps(x); +#else + return simd_make_float8(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_sqrt_ps(x); +#else + return simd_make_float16(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x) { +#if defined __SSE2__ + return _mm_sqrt_pd(x); +#elif defined __arm64__ + return vsqrtq_f64(x); +#else + return simd_make_double2(sqrt(x.x), sqrt(x.y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x) { + return simd_make_double3(__tg_sqrt(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x) { +#if defined __AVX__ + return _mm256_sqrt_pd(x); +#else + return simd_make_double4(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_sqrt_pd(x); +#else + return simd_make_double8(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +#pragma mark - ceil, floor, rint, trunc implementation +static SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x) { +#if defined __arm64__ + return vrndp_f32(x); +#else + return simd_make_float2(__tg_ceil(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x) { + return simd_make_float3(__tg_ceil(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_ceil_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x) 
{ +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndpq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_ceil_f4(x); +#else + simd_float4 truncated = __tg_trunc(x); + simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated < x); + return __tg_copysign(truncated + adjust, x); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_ceil_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndpq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_ceil_d2(x); +#else + simd_double2 truncated = __tg_trunc(x); + simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated < x); + return __tg_copysign(truncated + adjust, x); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x) { + return simd_make_double3(__tg_ceil(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x) { +#if defined __arm64__ + return vrndm_f32(x); +#else + return simd_make_float2(__tg_floor(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x) { + return simd_make_float3(__tg_floor(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_floor_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndmq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_floor_f4(x); +#else + simd_float4 truncated = __tg_trunc(x); + simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated > x); + return truncated - adjust; +#endif +} + +static SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern 
simd_double2 _simd_floor_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndmq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_floor_d2(x); +#else + simd_double2 truncated = __tg_trunc(x); + simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated > x); + return truncated - adjust; +#endif +} + +static SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x) { + return simd_make_double3(__tg_floor(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x) { +#if defined __arm64__ + return vrndx_f32(x); +#else + return simd_make_float2(__tg_rint(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x) { + return simd_make_float3(__tg_rint(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_rint_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_RINT); +#elif defined __arm64__ + return vrndxq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_rint_f4(x); +#else + simd_float4 magic = __tg_copysign(0x1.0p23, x); + simd_int4 x_is_small = __tg_fabs(x) < 0x1.0p23; + return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffff); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_RINT); +#else + return simd_make_float8(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_RINT); +#else + return simd_make_float16(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_rint_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_RINT); +#elif defined __arm64__ + return vrndxq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_rint_d2(x); +#else + simd_double2 magic = __tg_copysign(0x1.0p52, x); + simd_long2 x_is_small = __tg_fabs(x) < 0x1.0p52; + return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffffffffffff); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x) { + return simd_make_double3(__tg_rint(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_RINT); +#else + return simd_make_double4(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_RINT); +#else + return 
simd_make_double8(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x) { +#if defined __arm64__ + return vrnd_f32(x); +#else + return simd_make_float2(__tg_trunc(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x) { + return simd_make_float3(__tg_trunc(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_trunc_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_trunc_f4(x); +#else + simd_float4 binade = simd_bitselect(0, x, 0x7f800000); + simd_int4 mask = (simd_int4)__tg_fmin(-2*binade + 1, -0); + simd_float4 result = simd_bitselect(0, x, mask); + return simd_bitselect(x, result, binade < 0x1.0p23); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_trunc_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_trunc_d2(x); +#else + simd_double2 binade = simd_bitselect(0, x, 0x7ff0000000000000); + simd_long2 mask = (simd_long2)__tg_fmin(-2*binade + 1, -0); + simd_double2 result = simd_bitselect(0, x, mask); + return simd_bitselect(x, result, binade < 0x1.0p52); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x) { + return simd_make_double3(__tg_trunc(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +#pragma mark - sine, cosine implementation +static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x) { + return simd_make_float2(__tg_sin(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x) { + return simd_make_float3(__tg_sin(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return _simd_sin_f4(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_float4 __sin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return __sin_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return simd_make_float4(sin(x.x), sin(x.y), 
sin(x.z), sin(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sin_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) { + return _simd_sin_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) { + return simd_make_float8(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sin_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) { + return _simd_sin_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) { + return simd_make_float16(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return _simd_sin_d2(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_double2 __sin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return __sin_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return simd_make_double2(sin(x.x), sin(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x) { + return simd_make_double3(__tg_sin(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_sin_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) { + return _simd_sin_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) { + return simd_make_double4(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sin_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) { + return _simd_sin_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) { + return simd_make_double8(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x) { + return simd_make_float2(__tg_cos(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x) { + return simd_make_float3(__tg_cos(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return _simd_cos_f4(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_float4 __cos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return __cos_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return simd_make_float4(cos(x.x), cos(x.y), cos(x.z), cos(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cos_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) { + return _simd_cos_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) { + return simd_make_float8(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cos_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) { + return _simd_cos_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) { + return simd_make_float16(__tg_cos(x.lo), 
__tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return _simd_cos_d2(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_double2 __cos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return __cos_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return simd_make_double2(cos(x.x), cos(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x) { + return simd_make_double3(__tg_cos(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cos_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) { + return _simd_cos_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) { + return simd_make_double4(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cos_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) { + return _simd_cos_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) { + return simd_make_double8(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + + +#pragma mark - acos implementation +static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x) { + return simd_make_float2(__tg_acos(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x) { + return simd_make_float3(__tg_acos(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_acos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) { + return _simd_acos_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) { + return simd_make_float4(acos(x.x), acos(x.y), acos(x.z), acos(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_acos_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) { + return _simd_acos_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) { + return simd_make_float8(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_acos_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) { + return _simd_acos_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) { + return simd_make_float16(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_acos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) { + return _simd_acos_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) { + return simd_make_double2(acos(x.x), acos(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x) { + return simd_make_double3(__tg_acos(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_acos_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) { + return _simd_acos_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) { + return simd_make_double4(__tg_acos(x.lo), __tg_acos(x.hi)); +} 
+#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_acos_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) { + return _simd_acos_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) { + return simd_make_double8(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#pragma mark - asin implementation +static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x) { + return simd_make_float2(__tg_asin(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x) { + return simd_make_float3(__tg_asin(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_asin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) { + return _simd_asin_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) { + return simd_make_float4(asin(x.x), asin(x.y), asin(x.z), asin(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_asin_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) { + return _simd_asin_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) { + return simd_make_float8(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_asin_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) { + return _simd_asin_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) { + return simd_make_float16(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_asin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) { + return _simd_asin_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) { + return simd_make_double2(asin(x.x), asin(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x) { + return simd_make_double3(__tg_asin(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_asin_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x) { + return _simd_asin_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x) { + return simd_make_double4(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_asin_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) { + return _simd_asin_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) { + return simd_make_double8(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#pragma mark - atan implementation +static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x) { + return simd_make_float2(__tg_atan(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x) { + return simd_make_float3(__tg_atan(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atan_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) { + return _simd_atan_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) { + return simd_make_float4(atan(x.x), atan(x.y), atan(x.z), 
atan(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atan_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) { + return _simd_atan_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) { + return simd_make_float8(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atan_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) { + return _simd_atan_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) { + return simd_make_float16(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atan_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) { + return _simd_atan_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) { + return simd_make_double2(atan(x.x), atan(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x) { + return simd_make_double3(__tg_atan(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atan_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) { + return _simd_atan_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) { + return simd_make_double4(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atan_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) { + return _simd_atan_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) { + return simd_make_double8(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#pragma mark - tan implementation +static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x) { + return simd_make_float2(__tg_tan(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x) { + return simd_make_float3(__tg_tan(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tan_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x) { + return _simd_tan_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x) { + return simd_make_float4(tan(x.x), tan(x.y), tan(x.z), tan(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tan_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) { + return _simd_tan_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) { + return simd_make_float8(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tan_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) { + return _simd_tan_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) { + return simd_make_float16(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tan_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) { + return _simd_tan_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) { + 
return simd_make_double2(tan(x.x), tan(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x) { + return simd_make_double3(__tg_tan(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tan_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) { + return _simd_tan_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) { + return simd_make_double4(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tan_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) { + return _simd_tan_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) { + return simd_make_double8(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#pragma mark - cospi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x) { + return simd_make_float2(__tg_cospi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x) { + return simd_make_float3(__tg_cospi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cospi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) { + return _simd_cospi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) { + return simd_make_float4(__cospi(x.x), __cospi(x.y), __cospi(x.z), __cospi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cospi_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) { + return _simd_cospi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) { + return simd_make_float8(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cospi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) { + return _simd_cospi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) { + return simd_make_float16(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cospi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) { + return _simd_cospi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) { + return simd_make_double2(__cospi(x.x), __cospi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x) { + return simd_make_double3(__tg_cospi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cospi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) { + return _simd_cospi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) { + return simd_make_double4(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cospi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) { + return _simd_cospi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) { + return simd_make_double8(__tg_cospi(x.lo), 
__tg_cospi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - sinpi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x) { + return simd_make_float2(__tg_sinpi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x) { + return simd_make_float3(__tg_sinpi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sinpi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) { + return _simd_sinpi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) { + return simd_make_float4(__sinpi(x.x), __sinpi(x.y), __sinpi(x.z), __sinpi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sinpi_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) { + return _simd_sinpi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) { + return simd_make_float8(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sinpi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) { + return _simd_sinpi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) { + return simd_make_float16(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sinpi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) { + return _simd_sinpi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) { + return simd_make_double2(__sinpi(x.x), __sinpi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x) { + return simd_make_double3(__tg_sinpi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_sinpi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) { + return _simd_sinpi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) { + return simd_make_double4(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sinpi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) { + return _simd_sinpi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) { + return simd_make_double8(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - tanpi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x) { + return simd_make_float2(__tg_tanpi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x) { + return simd_make_float3(__tg_tanpi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tanpi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) { + return _simd_tanpi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) { + return simd_make_float4(__tanpi(x.x), __tanpi(x.y), __tanpi(x.z), __tanpi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tanpi_f8(simd_float8 x); 
+static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) { + return _simd_tanpi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) { + return simd_make_float8(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tanpi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) { + return _simd_tanpi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) { + return simd_make_float16(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tanpi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) { + return _simd_tanpi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) { + return simd_make_double2(__tanpi(x.x), __tanpi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x) { + return simd_make_double3(__tg_tanpi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tanpi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) { + return _simd_tanpi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) { + return simd_make_double4(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tanpi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) { + return _simd_tanpi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) { + return simd_make_double8(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - acosh implementation +static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x) { + return simd_make_float2(__tg_acosh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x) { + return simd_make_float3(__tg_acosh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_acosh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) { + return _simd_acosh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) { + return simd_make_float4(acosh(x.x), acosh(x.y), acosh(x.z), acosh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_acosh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) { + return _simd_acosh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) { + return simd_make_float8(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_acosh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) { + return _simd_acosh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) { + return simd_make_float16(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_acosh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) { + return _simd_acosh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) { + return 
simd_make_double2(acosh(x.x), acosh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x) { + return simd_make_double3(__tg_acosh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_acosh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) { + return _simd_acosh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) { + return simd_make_double4(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_acosh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) { + return _simd_acosh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) { + return simd_make_double8(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#pragma mark - asinh implementation +static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x) { + return simd_make_float2(__tg_asinh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x) { + return simd_make_float3(__tg_asinh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_asinh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) { + return _simd_asinh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) { + return simd_make_float4(asinh(x.x), asinh(x.y), asinh(x.z), asinh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_asinh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) { + return _simd_asinh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) { + return simd_make_float8(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_asinh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) { + return _simd_asinh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) { + return simd_make_float16(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_asinh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) { + return _simd_asinh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) { + return simd_make_double2(asinh(x.x), asinh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x) { + return simd_make_double3(__tg_asinh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_asinh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) { + return _simd_asinh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) { + return simd_make_double4(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_asinh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) { + return _simd_asinh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) { + return simd_make_double8(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + 
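+/*  Dispatch pattern shared by the math functions in this header: the two-
+ *  and three-element overloads widen their argument to a four-element
+ *  vector and recurse; the four-element and wider overloads call a
+ *  vectorized libsimd kernel (_simd_<name>_<f|d><lanes>) when
+ *  SIMD_LIBRARY_VERSION >= 3, with the 8- and 16-lane kernels additionally
+ *  gated on __x86_64__ plus __AVX2__ or __AVX512F__.  In all other
+ *  configurations the wide overloads split into lo/hi halves and the
+ *  narrowest overload falls back to per-lane libm calls; e.g.
+ *  __tg_sinh(simd_float8 x) becomes _simd_sinh_f8(x) when the kernel is
+ *  available and simd_make_float8(__tg_sinh(x.lo), __tg_sinh(x.hi))
+ *  otherwise.  Some functions (sin, cos) add a SIMD_LIBRARY_VERSION == 1
+ *  branch that calls the older __<name>_f4/_d2 entry points, and the
+ *  cospi/sinpi/tanpi/exp10 families are only defined at all for
+ *  SIMD_LIBRARY_VERSION >= 1.  */
+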
+#pragma mark - atanh implementation +static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x) { + return simd_make_float2(__tg_atanh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x) { + return simd_make_float3(__tg_atanh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atanh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) { + return _simd_atanh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) { + return simd_make_float4(atanh(x.x), atanh(x.y), atanh(x.z), atanh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atanh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) { + return _simd_atanh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) { + return simd_make_float8(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atanh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) { + return _simd_atanh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) { + return simd_make_float16(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atanh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) { + return _simd_atanh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) { + return simd_make_double2(atanh(x.x), atanh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x) { + return simd_make_double3(__tg_atanh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atanh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) { + return _simd_atanh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) { + return simd_make_double4(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atanh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) { + return _simd_atanh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) { + return simd_make_double8(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#pragma mark - cosh implementation +static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x) { + return simd_make_float2(__tg_cosh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x) { + return simd_make_float3(__tg_cosh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cosh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) { + return _simd_cosh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) { + return simd_make_float4(cosh(x.x), cosh(x.y), cosh(x.z), cosh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cosh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) { + return _simd_cosh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) { + return 
simd_make_float8(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cosh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) { + return _simd_cosh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) { + return simd_make_float16(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cosh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) { + return _simd_cosh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) { + return simd_make_double2(cosh(x.x), cosh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x) { + return simd_make_double3(__tg_cosh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cosh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) { + return _simd_cosh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) { + return simd_make_double4(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cosh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) { + return _simd_cosh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) { + return simd_make_double8(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#pragma mark - sinh implementation +static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x) { + return simd_make_float2(__tg_sinh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x) { + return simd_make_float3(__tg_sinh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sinh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) { + return _simd_sinh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) { + return simd_make_float4(sinh(x.x), sinh(x.y), sinh(x.z), sinh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sinh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) { + return _simd_sinh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) { + return simd_make_float8(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sinh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) { + return _simd_sinh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) { + return simd_make_float16(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sinh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) { + return _simd_sinh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) { + return simd_make_double2(sinh(x.x), sinh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x) { + return simd_make_double3(__tg_sinh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 
_simd_sinh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) { + return _simd_sinh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) { + return simd_make_double4(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sinh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) { + return _simd_sinh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) { + return simd_make_double8(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#pragma mark - tanh implementation +static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x) { + return simd_make_float2(__tg_tanh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x) { + return simd_make_float3(__tg_tanh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tanh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) { + return _simd_tanh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) { + return simd_make_float4(tanh(x.x), tanh(x.y), tanh(x.z), tanh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tanh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) { + return _simd_tanh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) { + return simd_make_float8(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tanh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) { + return _simd_tanh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) { + return simd_make_float16(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tanh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) { + return _simd_tanh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) { + return simd_make_double2(tanh(x.x), tanh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x) { + return simd_make_double3(__tg_tanh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tanh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) { + return _simd_tanh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) { + return simd_make_double4(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tanh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) { + return _simd_tanh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) { + return simd_make_double8(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#pragma mark - exp implementation +static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x) { + return simd_make_float2(__tg_exp(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x) { + return simd_make_float3(__tg_exp(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 
_simd_exp_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) { + return _simd_exp_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) { + return simd_make_float4(exp(x.x), exp(x.y), exp(x.z), exp(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) { + return _simd_exp_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) { + return simd_make_float8(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) { + return _simd_exp_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) { + return simd_make_float16(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) { + return _simd_exp_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) { + return simd_make_double2(exp(x.x), exp(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x) { + return simd_make_double3(__tg_exp(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) { + return _simd_exp_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) { + return simd_make_double4(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_exp_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) { + return _simd_exp_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) { + return simd_make_double8(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#pragma mark - exp2 implementation +static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x) { + return simd_make_float2(__tg_exp2(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x) { + return simd_make_float3(__tg_exp2(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_exp2_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) { + return _simd_exp2_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) { + return simd_make_float4(exp2(x.x), exp2(x.y), exp2(x.z), exp2(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp2_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) { + return _simd_exp2_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) { + return simd_make_float8(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp2_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) { + return _simd_exp2_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) { + return simd_make_float16(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if 
SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp2_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) { + return _simd_exp2_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) { + return simd_make_double2(exp2(x.x), exp2(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x) { + return simd_make_double3(__tg_exp2(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp2_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) { + return _simd_exp2_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) { + return simd_make_double4(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_exp2_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) { + return _simd_exp2_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) { + return simd_make_double8(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#pragma mark - exp10 implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x) { + return simd_make_float2(__tg_exp10(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x) { + return simd_make_float3(__tg_exp10(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_exp10_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) { + return _simd_exp10_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) { + return simd_make_float4(__exp10(x.x), __exp10(x.y), __exp10(x.z), __exp10(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp10_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) { + return _simd_exp10_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) { + return simd_make_float8(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp10_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x) { + return _simd_exp10_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x) { + return simd_make_float16(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp10_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) { + return _simd_exp10_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) { + return simd_make_double2(__exp10(x.x), __exp10(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x) { + return simd_make_double3(__tg_exp10(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp10_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) { + return _simd_exp10_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) { + return simd_make_double4(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern 
simd_double8 _simd_exp10_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) { + return _simd_exp10_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) { + return simd_make_double8(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - expm1 implementation +static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x) { + return simd_make_float2(__tg_expm1(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x) { + return simd_make_float3(__tg_expm1(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_expm1_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) { + return _simd_expm1_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) { + return simd_make_float4(expm1(x.x), expm1(x.y), expm1(x.z), expm1(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_expm1_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) { + return _simd_expm1_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) { + return simd_make_float8(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_expm1_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) { + return _simd_expm1_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) { + return simd_make_float16(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_expm1_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) { + return _simd_expm1_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) { + return simd_make_double2(expm1(x.x), expm1(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x) { + return simd_make_double3(__tg_expm1(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_expm1_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) { + return _simd_expm1_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) { + return simd_make_double4(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_expm1_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) { + return _simd_expm1_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) { + return simd_make_double8(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#pragma mark - log implementation +static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x) { + return simd_make_float2(__tg_log(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x) { + return simd_make_float3(__tg_log(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) { + return _simd_log_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) { + return simd_make_float4(log(x.x), log(x.y), log(x.z), log(x.w)); +} +#endif + +#if 
SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) { + return _simd_log_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) { + return simd_make_float8(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) { + return _simd_log_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) { + return simd_make_float16(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) { + return _simd_log_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) { + return simd_make_double2(log(x.x), log(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x) { + return simd_make_double3(__tg_log(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) { + return _simd_log_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) { + return simd_make_double4(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) { + return _simd_log_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) { + return simd_make_double8(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#pragma mark - log2 implementation +static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x) { + return simd_make_float2(__tg_log2(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x) { + return simd_make_float3(__tg_log2(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log2_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x) { + return _simd_log2_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x) { + return simd_make_float4(log2(x.x), log2(x.y), log2(x.z), log2(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log2_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) { + return _simd_log2_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) { + return simd_make_float8(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log2_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) { + return _simd_log2_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) { + return simd_make_float16(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log2_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) { + return _simd_log2_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) { + return 
simd_make_double2(log2(x.x), log2(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x) { + return simd_make_double3(__tg_log2(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log2_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) { + return _simd_log2_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) { + return simd_make_double4(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log2_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) { + return _simd_log2_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) { + return simd_make_double8(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#pragma mark - log10 implementation +static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x) { + return simd_make_float2(__tg_log10(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x) { + return simd_make_float3(__tg_log10(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log10_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) { + return _simd_log10_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) { + return simd_make_float4(log10(x.x), log10(x.y), log10(x.z), log10(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log10_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) { + return _simd_log10_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) { + return simd_make_float8(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log10_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) { + return _simd_log10_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) { + return simd_make_float16(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log10_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) { + return _simd_log10_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) { + return simd_make_double2(log10(x.x), log10(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x) { + return simd_make_double3(__tg_log10(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log10_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) { + return _simd_log10_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) { + return simd_make_double4(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log10_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) { + return _simd_log10_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) { + return simd_make_double8(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#pragma mark - 
log1p implementation +static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x) { + return simd_make_float2(__tg_log1p(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x) { + return simd_make_float3(__tg_log1p(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log1p_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) { + return _simd_log1p_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) { + return simd_make_float4(log1p(x.x), log1p(x.y), log1p(x.z), log1p(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log1p_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) { + return _simd_log1p_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) { + return simd_make_float8(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log1p_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) { + return _simd_log1p_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) { + return simd_make_float16(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log1p_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) { + return _simd_log1p_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) { + return simd_make_double2(log1p(x.x), log1p(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x) { + return simd_make_double3(__tg_log1p(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log1p_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) { + return _simd_log1p_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) { + return simd_make_double4(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log1p_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) { + return _simd_log1p_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) { + return simd_make_double8(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#pragma mark - cbrt implementation +static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x) { + return simd_make_float2(__tg_cbrt(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x) { + return simd_make_float3(__tg_cbrt(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cbrt_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) { + return _simd_cbrt_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) { + return simd_make_float4(cbrt(x.x), cbrt(x.y), cbrt(x.z), cbrt(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cbrt_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) { + return _simd_cbrt_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) { + return simd_make_float8(__tg_cbrt(x.lo), 
__tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cbrt_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) { + return _simd_cbrt_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) { + return simd_make_float16(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cbrt_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) { + return _simd_cbrt_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) { + return simd_make_double2(cbrt(x.x), cbrt(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x) { + return simd_make_double3(__tg_cbrt(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cbrt_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) { + return _simd_cbrt_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) { + return simd_make_double4(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cbrt_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) { + return _simd_cbrt_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) { + return simd_make_double8(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#pragma mark - erf implementation +static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x) { + return simd_make_float2(__tg_erf(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x) { + return simd_make_float3(__tg_erf(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_erf_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) { + return _simd_erf_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) { + return simd_make_float4(erf(x.x), erf(x.y), erf(x.z), erf(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_erf_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) { + return _simd_erf_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) { + return simd_make_float8(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_erf_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) { + return _simd_erf_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) { + return simd_make_float16(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_erf_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) { + return _simd_erf_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) { + return simd_make_double2(erf(x.x), erf(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x) { + return simd_make_double3(__tg_erf(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_erf_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 
__tg_erf(simd_double4 x) { + return _simd_erf_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x) { + return simd_make_double4(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_erf_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) { + return _simd_erf_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) { + return simd_make_double8(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#pragma mark - erfc implementation +static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x) { + return simd_make_float2(__tg_erfc(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x) { + return simd_make_float3(__tg_erfc(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_erfc_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) { + return _simd_erfc_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) { + return simd_make_float4(erfc(x.x), erfc(x.y), erfc(x.z), erfc(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_erfc_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) { + return _simd_erfc_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) { + return simd_make_float8(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_erfc_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) { + return _simd_erfc_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) { + return simd_make_float16(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_erfc_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) { + return _simd_erfc_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) { + return simd_make_double2(erfc(x.x), erfc(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x) { + return simd_make_double3(__tg_erfc(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_erfc_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) { + return _simd_erfc_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) { + return simd_make_double4(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_erfc_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) { + return _simd_erfc_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) { + return simd_make_double8(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#pragma mark - tgamma implementation +static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x) { + return simd_make_float2(__tg_tgamma(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x) { + return simd_make_float3(__tg_tgamma(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tgamma_f4(simd_float4 x); +static inline SIMD_CFUNC 
simd_float4 __tg_tgamma(simd_float4 x) { + return _simd_tgamma_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x) { + return simd_make_float4(tgamma(x.x), tgamma(x.y), tgamma(x.z), tgamma(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tgamma_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) { + return _simd_tgamma_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) { + return simd_make_float8(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tgamma_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) { + return _simd_tgamma_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) { + return simd_make_float16(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tgamma_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) { + return _simd_tgamma_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) { + return simd_make_double2(tgamma(x.x), tgamma(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x) { + return simd_make_double3(__tg_tgamma(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tgamma_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) { + return _simd_tgamma_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) { + return simd_make_double4(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tgamma_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) { + return _simd_tgamma_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) { + return simd_make_double8(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#pragma mark - round implementation +static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x) { + return simd_make_float2(__tg_round(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_round(simd_float3 x) { + return simd_make_float3(__tg_round(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_round_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) { +#if defined __arm64__ + return vrndaq_f32(x); +#else + return _simd_round_f4(x); +#endif +} +#else +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) { + return simd_make_float4(round(x.x), round(x.y), round(x.z), round(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_round_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) { + return _simd_round_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) { + return simd_make_float8(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_round_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) { + return _simd_round_f16(x); +} +#else +static 
inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) { + return simd_make_float16(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_round_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) { +#if defined __arm64__ + return vrndaq_f64(x); +#else + return _simd_round_d2(x); +#endif +} +#else +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) { + return simd_make_double2(round(x.x), round(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x) { + return simd_make_double3(__tg_round(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_round_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) { + return _simd_round_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) { + return simd_make_double4(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_round_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) { + return _simd_round_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) { + return simd_make_double8(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#pragma mark - atan2 implementation +static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x) { + return simd_make_float2(__tg_atan2(simd_make_float4(y), simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x) { + return simd_make_float3(__tg_atan2(simd_make_float4(y), simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atan2_f4(simd_float4 y, simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) { + return _simd_atan2_f4(y, x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) { + return simd_make_float4(atan2(y.x, x.x), atan2(y.y, x.y), atan2(y.z, x.z), atan2(y.w, x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atan2_f8(simd_float8 y, simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) { + return _simd_atan2_f8(y, x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) { + return simd_make_float8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atan2_f16(simd_float16 y, simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) { + return _simd_atan2_f16(y, x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) { + return simd_make_float16(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atan2_d2(simd_double2 y, simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) { + return _simd_atan2_d2(y, x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) { + return simd_make_double2(atan2(y.x, x.x), atan2(y.y, x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x) { + return 
simd_make_double3(__tg_atan2(simd_make_double4(y), simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atan2_d4(simd_double4 y, simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) { + return _simd_atan2_d4(y, x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) { + return simd_make_double4(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atan2_d8(simd_double8 y, simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) { + return _simd_atan2_d8(y, x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) { + return simd_make_double8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#pragma mark - hypot implementation +static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_hypot(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_hypot(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_hypot_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) { + return _simd_hypot_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) { + return simd_make_float4(hypot(x.x, y.x), hypot(x.y, y.y), hypot(x.z, y.z), hypot(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_hypot_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) { + return _simd_hypot_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_hypot_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) { + return _simd_hypot_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_hypot_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) { + return _simd_hypot_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) { + return simd_make_double2(hypot(x.x, y.x), hypot(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_hypot(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_hypot_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) { + return _simd_hypot_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) { + return 
simd_make_double4(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_hypot_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) { + return _simd_hypot_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#pragma mark - pow implementation +static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_pow(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_pow(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_pow_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) { + return _simd_pow_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) { + return simd_make_float4(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z), pow(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_pow_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) { + return _simd_pow_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_pow_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) { + return _simd_pow_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_pow_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) { + return _simd_pow_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) { + return simd_make_double2(pow(x.x, y.x), pow(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_pow(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_pow_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) { + return _simd_pow_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_pow_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) { + return _simd_pow_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#pragma mark - fmod 
implementation +static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_fmod(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmod(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_fmod_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) { + return _simd_fmod_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) { + return simd_make_float4(fmod(x.x, y.x), fmod(x.y, y.y), fmod(x.z, y.z), fmod(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_fmod_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) { + return _simd_fmod_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_fmod_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) { + return _simd_fmod_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_fmod_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) { + return _simd_fmod_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) { + return simd_make_double2(fmod(x.x, y.x), fmod(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmod(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_fmod_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) { + return _simd_fmod_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_fmod_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) { + return _simd_fmod_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#pragma mark - remainder implementation +static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_remainder(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_remainder(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_remainder_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC 
simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) { + return _simd_remainder_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) { + return simd_make_float4(remainder(x.x, y.x), remainder(x.y, y.y), remainder(x.z, y.z), remainder(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_remainder_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) { + return _simd_remainder_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_remainder_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) { + return _simd_remainder_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_remainder_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) { + return _simd_remainder_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) { + return simd_make_double2(remainder(x.x, y.x), remainder(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_remainder(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_remainder_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) { + return _simd_remainder_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_remainder_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) { + return _simd_remainder_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#pragma mark - nextafter implementation +static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_nextafter(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_nextafter(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_nextafter_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) { + return _simd_nextafter_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) { + return simd_make_float4(nextafter(x.x, y.x), nextafter(x.y, y.y), nextafter(x.z, y.z), 
nextafter(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_nextafter_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) { + return _simd_nextafter_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_nextafter_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) { + return _simd_nextafter_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_nextafter_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) { + return _simd_nextafter_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) { + return simd_make_double2(nextafter(x.x, y.x), nextafter(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_nextafter(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_nextafter_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) { + return _simd_nextafter_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_nextafter_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) { + return _simd_nextafter_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 5 +#pragma mark - sincos implementation +static inline SIMD_NONCONST void __tg_sincos(simd_float2 x, simd_float2 *sinp, simd_float2 *cosp) { + simd_float4 sin_val; + simd_float4 cos_val; + __tg_sincos(simd_make_float4(x), &sin_val, &cos_val); + *sinp = simd_make_float2(sin_val); + *cosp = simd_make_float2(cos_val); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_float3 x, simd_float3 *sinp, simd_float3 *cosp) { + simd_float4 sin_val; + simd_float4 cos_val; + __tg_sincos(simd_make_float4(x), &sin_val, &cos_val); + *sinp = simd_make_float3(sin_val); + *cosp = simd_make_float3(cos_val); +} + +extern void _simd_sincos_f4(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp); +static inline SIMD_NONCONST void __tg_sincos(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp) { + return _simd_sincos_f4(x, sinp, cosp); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_float8 x, simd_float8 *sinp, simd_float8 *cosp) { + __tg_sincos(x.lo, (simd_float4 *)sinp+0, (simd_float4 *)cosp+0); + __tg_sincos(x.hi, (simd_float4 *)sinp+1, 
(simd_float4 *)cosp+1); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_float16 x, simd_float16 *sinp, simd_float16 *cosp) { + __tg_sincos(x.lo, (simd_float8 *)sinp+0, (simd_float8 *)cosp+0); + __tg_sincos(x.hi, (simd_float8 *)sinp+1, (simd_float8 *)cosp+1); +} + +extern void _simd_sincos_d2(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp); +static inline SIMD_NONCONST void __tg_sincos(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp) { + return _simd_sincos_d2(x, sinp, cosp); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_double3 x, simd_double3 *sinp, simd_double3 *cosp) { + simd_double4 sin_val; + simd_double4 cos_val; + __tg_sincos(simd_make_double4(x), &sin_val, &cos_val); + *sinp = simd_make_double3(sin_val); + *cosp = simd_make_double3(cos_val); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_double4 x, simd_double4 *sinp, simd_double4 *cosp) { + __tg_sincos(x.lo, (simd_double2 *)sinp+0, (simd_double2 *)cosp+0); + __tg_sincos(x.hi, (simd_double2 *)sinp+1, (simd_double2 *)cosp+1); +} + +static inline SIMD_NONCONST void __tg_sincos(simd_double8 x, simd_double8 *sinp, simd_double8 *cosp) { + __tg_sincos(x.lo, (simd_double4 *)sinp+0, (simd_double4 *)cosp+0); + __tg_sincos(x.hi, (simd_double4 *)sinp+1, (simd_double4 *)cosp+1); +} + +#pragma mark - sincospi implementation +static inline SIMD_NONCONST void __tg_sincospi(simd_float2 x, simd_float2 *sinp, simd_float2 *cosp) { + simd_float4 sin_val; + simd_float4 cos_val; + __tg_sincospi(simd_make_float4(x), &sin_val, &cos_val); + *sinp = simd_make_float2(sin_val); + *cosp = simd_make_float2(cos_val); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_float3 x, simd_float3 *sinp, simd_float3 *cosp) { + simd_float4 sin_val; + simd_float4 cos_val; + __tg_sincospi(simd_make_float4(x), &sin_val, &cos_val); + *sinp = simd_make_float3(sin_val); + *cosp = simd_make_float3(cos_val); +} + +extern void _simd_sincospi_f4(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp); +static inline SIMD_NONCONST void __tg_sincospi(simd_float4 x, simd_float4 *sinp, simd_float4 *cosp) { + return _simd_sincospi_f4(x, sinp, cosp); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_float8 x, simd_float8 *sinp, simd_float8 *cosp) { + __tg_sincospi(x.lo, (simd_float4 *)sinp+0, (simd_float4 *)cosp+0); + __tg_sincospi(x.hi, (simd_float4 *)sinp+1, (simd_float4 *)cosp+1); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_float16 x, simd_float16 *sinp, simd_float16 *cosp) { + __tg_sincospi(x.lo, (simd_float8 *)sinp+0, (simd_float8 *)cosp+0); + __tg_sincospi(x.hi, (simd_float8 *)sinp+1, (simd_float8 *)cosp+1); +} + +extern void _simd_sincospi_d2(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp); +static inline SIMD_NONCONST void __tg_sincospi(simd_double2 x, simd_double2 *sinp, simd_double2 *cosp) { + return _simd_sincospi_d2(x, sinp, cosp); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_double3 x, simd_double3 *sinp, simd_double3 *cosp) { + simd_double4 sin_val; + simd_double4 cos_val; + __tg_sincospi(simd_make_double4(x), &sin_val, &cos_val); + *sinp = simd_make_double3(sin_val); + *cosp = simd_make_double3(cos_val); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_double4 x, simd_double4 *sinp, simd_double4 *cosp) { + __tg_sincospi(x.lo, (simd_double2 *)sinp+0, (simd_double2 *)cosp+0); + __tg_sincospi(x.hi, (simd_double2 *)sinp+1, (simd_double2 *)cosp+1); +} + +static inline SIMD_NONCONST void __tg_sincospi(simd_double8 x, simd_double8 *sinp, simd_double8 *cosp) { + __tg_sincospi(x.lo, 
(simd_double4 *)sinp+0, (simd_double4 *)cosp+0);
+  __tg_sincospi(x.hi, (simd_double4 *)sinp+1, (simd_double4 *)cosp+1);
+}
+
+#endif // SIMD_LIBRARY_VERSION >= 5
+#pragma mark - lgamma implementation
+static inline SIMD_CFUNC simd_float2 __tg_lgamma(simd_float2 x) {
+  return simd_make_float2(__tg_lgamma(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_lgamma(simd_float3 x) {
+  return simd_make_float3(__tg_lgamma(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 4
+extern simd_float4 _simd_lgamma_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_lgamma(simd_float4 x) {
+  return _simd_lgamma_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_lgamma(simd_float4 x) {
+  return simd_make_float4(lgamma(x.x), lgamma(x.y), lgamma(x.z), lgamma(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 4 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_lgamma_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_lgamma(simd_float8 x) {
+  return _simd_lgamma_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_lgamma(simd_float8 x) {
+  return simd_make_float8(__tg_lgamma(x.lo), __tg_lgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 4 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_lgamma_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_lgamma(simd_float16 x) {
+  return _simd_lgamma_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_lgamma(simd_float16 x) {
+  return simd_make_float16(__tg_lgamma(x.lo), __tg_lgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 4
+extern simd_double2 _simd_lgamma_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_lgamma(simd_double2 x) {
+  return _simd_lgamma_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_lgamma(simd_double2 x) {
+  return simd_make_double2(lgamma(x.x), lgamma(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_lgamma(simd_double3 x) {
+  return simd_make_double3(__tg_lgamma(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 4 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_lgamma_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_lgamma(simd_double4 x) {
+  return _simd_lgamma_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_lgamma(simd_double4 x) {
+  return simd_make_double4(__tg_lgamma(x.lo), __tg_lgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 4 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_lgamma_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_lgamma(simd_double8 x) {
+  return _simd_lgamma_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_lgamma(simd_double8 x) {
+  return simd_make_double8(__tg_lgamma(x.lo), __tg_lgamma(x.hi));
+}
+#endif
+
+static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y) { return simd_bitselect(x-y, 0, x<y); }
+
+#pragma mark - fma implementation
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_fma_f4(simd_float4 x, simd_float4 y, simd_float4 z);
+#endif
+static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z) {
+#if defined __arm64__ || defined __ARM_VFPV4__
+  return vfmaq_f32(z, x, y);
+#elif (defined __i386__ || defined __x86_64__) && defined __FMA__
+  return _mm_fmadd_ps(x, y, z);
+#elif SIMD_LIBRARY_VERSION >= 3
+  return _simd_fma_f4(x, y, z);
+#else
+  return simd_make_float4(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y), fma(x.z, y.z, z.z), fma(x.w, y.w, z.w));
+#endif
+}
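+
+/* Editorial sketch, not part of the original header: the cascade above is
+ * the dispatch pattern used throughout this file. Each width either lowers
+ * to a hardware intrinsic (vfmaq_f32 on arm64, _mm_fmadd_ps when x86 FMA is
+ * available), calls a tuned library kernel (_simd_fma_f4) when
+ * SIMD_LIBRARY_VERSION >= 3, or falls back to the scalar libm function
+ * applied per lane. Illustrative values only:
+ *
+ *     simd_float4 x = {1, 2, 3, 4};
+ *     simd_float4 y = {2, 2, 2, 2};
+ *     simd_float4 z = {1, 1, 1, 1};
+ *     simd_float4 r = __tg_fma(x, y, z);  // {3, 5, 7, 9}, fused: one rounding per lane
+ */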
+
+static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z) {
+#if (defined __i386__ || defined __x86_64__) && defined __FMA__
+  return _mm256_fmadd_ps(x, y, z);
+#else
+  return simd_make_float8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z) {
+#if defined __x86_64__ && defined __AVX512F__
+  return _mm512_fmadd_ps(x, y, z);
+#else
+  return simd_make_float16(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_fma_d2(simd_double2 x, simd_double2 y, simd_double2 z);
+#endif
+static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z) {
+#if defined __arm64__
+  return vfmaq_f64(z, x, y);
+#elif (defined __i386__ || defined __x86_64__) && defined __FMA__
+  return _mm_fmadd_pd(x, y, z);
+#elif SIMD_LIBRARY_VERSION >= 3
+  return _simd_fma_d2(x, y, z);
+#else
+  return simd_make_double2(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z) {
+  return simd_make_double3(__tg_fma(simd_make_double4(x), simd_make_double4(y), simd_make_double4(z)));
+}
+
+static inline SIMD_CFUNC simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z) {
+#if (defined __i386__ || defined __x86_64__) && defined __FMA__
+  return _mm256_fmadd_pd(x, y, z);
+#else
+  return simd_make_double4(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z) {
+#if defined __x86_64__ && defined __AVX512F__
+  return _mm512_fmadd_pd(x, y, z);
+#else
+  return simd_make_double8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC float simd_muladd(float x, float y, float z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC double simd_muladd(double x, double y, double z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z) {
+#pragma STDC FP_CONTRACT ON
+  return x*y + z;
+}
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_MATH_HEADER */
diff --git a/vfsoverlay/matrix.h b/vfsoverlay/matrix.h
new file mode 100644
index 00000000..bfc07b96
--- /dev/null
+++ b/vfsoverlay/matrix.h
@@ -0,0 +1,1990 @@
+/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved.
+ *
+ *  Function                            Result
+ *  ------------------------------------------------------------------
+ *
+ *  simd_diagonal_matrix(x)             A square matrix with the vector x
+ *                                      as its diagonal.
+ *
+ *  simd_matrix(c0, c1, ... )           A matrix with the specified vectors
+ *                                      as columns.
+ *
+ *  simd_matrix_from_rows(r0, r1, ... ) A matrix with the specified vectors
+ *                                      as rows.
+ *
+ *  simd_mul(a,x)                       Scalar product a*x.
+ *
+ *  simd_linear_combination(a,x,b,y)    a*x + b*y.
+ *
+ *  simd_add(x,y)                       Macro wrapping linear_combination
+ *                                      to compute x + y.
+ *
+ *  simd_sub(x,y)                       Macro wrapping linear_combination
+ *                                      to compute x - y.
+ *
+ *  simd_transpose(x)                   Transpose of the matrix x.
+ *
+ *  simd_trace(x)                       Trace of the matrix x.
+ *
+ *  simd_determinant(x)                 Determinant of the matrix x.
+ *
+ *  simd_inverse(x)                     Inverse of x if x is non-singular. If
+ *                                      x is singular, the result is undefined.
+ *
+ *  simd_mul(x,y)                       If x is a matrix, returns the matrix
+ *                                      product x*y, where y is either a matrix
+ *                                      or a column vector. If x is a vector,
+ *                                      returns the product x*y where x is
+ *                                      interpreted as a row vector.
+ *
+ *  simd_equal(x,y)                     Returns true if and only if every
+ *                                      element of x is exactly equal to the
+ *                                      corresponding element of y.
+ *
+ *  simd_almost_equal_elements(x,y,tol)
+ *                                      Returns true if and only if for each
+ *                                      entry xij in x, the corresponding
+ *                                      element yij in y satisfies
+ *                                      |xij - yij| <= tol.
+ *
+ *  simd_almost_equal_elements_relative(x,y,tol)
+ *                                      Returns true if and only if for each
+ *                                      entry xij in x, the corresponding
+ *                                      element yij in y satisfies
+ *                                      |xij - yij| <= tol*|xij|.
+ *
+ *  The header also defines a few useful global matrix objects:
+ *  matrix_identity_floatNxM and matrix_identity_doubleNxM, which may be
+ *  used to get an identity matrix of the specified size.
+ *
+ *  In C++, we are able to use namespacing to make the functions more
+ *  concise; we also overload some common arithmetic operators to work with
+ *  the matrix types:
+ *
+ *  C++ Function                            Equivalent C Function
+ *  --------------------------------------------------------------------
+ *  simd::inverse                           simd_inverse
+ *  simd::transpose                         simd_transpose
+ *  operator+                               simd_add
+ *  operator-                               simd_sub
+ *  operator+=                              N/A
+ *  operator-=                              N/A
+ *  operator*                               simd_mul or simd_mul
+ *  operator*=                              simd_mul or simd_mul
+ *  operator==                              simd_equal
+ *  operator!=                              !simd_equal
+ *  simd::almost_equal_elements             simd_almost_equal_elements
+ *  simd::almost_equal_elements_relative    simd_almost_equal_elements_relative
+ *
+ *  <simd/simd.h> provides constructors for C++ matrix types.
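+ *
+ *  As an illustrative sketch (an editorial addition, not original header
+ *  text; simd_make_float2 is assumed to come from <simd/vector_make.h>),
+ *  the C interface composes as follows:
+ *
+ *      simd_float2x2 A = simd_matrix(simd_make_float2(1, 0),   // column 0
+ *                                    simd_make_float2(1, 1));  // column 1
+ *      simd_float2 v = simd_make_float2(2, 3);
+ *      simd_float2 w = simd_mul(A, v);      // A*v == {5, 3}
+ *      simd_float2x2 B = simd_inverse(A);   // well-defined: det(A) == 1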
+ */
+
+#ifndef SIMD_MATRIX_HEADER
+#define SIMD_MATRIX_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/matrix_types.h>
+#include <simd/geometry.h>
+#include <simd/vector.h>
+#include <simd/logic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const simd_float2x2 matrix_identity_float2x2 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+extern const simd_float3x3 matrix_identity_float3x3 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+extern const simd_float4x4 matrix_identity_float4x4 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+extern const simd_double2x2 matrix_identity_double2x2 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+extern const simd_double3x3 matrix_identity_double3x3 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+extern const simd_double4x4 matrix_identity_double4x4 __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0));
+
+static simd_float2x2 SIMD_CFUNC simd_diagonal_matrix(simd_float2 __x);
+static simd_float3x3 SIMD_CFUNC simd_diagonal_matrix(simd_float3 __x);
+static simd_float4x4 SIMD_CFUNC simd_diagonal_matrix(simd_float4 __x);
+static simd_double2x2 SIMD_CFUNC simd_diagonal_matrix(simd_double2 __x);
+static simd_double3x3 SIMD_CFUNC simd_diagonal_matrix(simd_double3 __x);
+static simd_double4x4 SIMD_CFUNC simd_diagonal_matrix(simd_double4 __x);
+#define matrix_from_diagonal simd_diagonal_matrix
+
+static simd_float2x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1);
+static simd_float3x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1, simd_float2 col2);
+static simd_float4x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1, simd_float2 col2, simd_float2 col3);
+static simd_float2x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1);
+static simd_float3x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1, simd_float3 col2);
+static simd_float4x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1, simd_float3 col2, simd_float3 col3);
+static simd_float2x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1);
+static simd_float3x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1, simd_float4 col2);
+static simd_float4x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1, simd_float4 col2, simd_float4 col3);
+static simd_double2x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1);
+static simd_double3x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1, simd_double2 col2);
+static simd_double4x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1, simd_double2 col2, simd_double2 col3);
+static simd_double2x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1);
+static simd_double3x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1, simd_double3 col2);
+static simd_double4x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1, simd_double3 col2, simd_double3 col3);
+static simd_double2x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1);
+static simd_double3x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1, simd_double4 col2);
+static simd_double4x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1, simd_double4 col2, simd_double4 col3);
+#define matrix_from_columns simd_matrix
+
+static simd_float2x2 SIMD_CFUNC simd_matrix_from_rows(simd_float2 row0, simd_float2 row1);
+static simd_float2x3 SIMD_CFUNC simd_matrix_from_rows(simd_float2 row0, simd_float2 row1, simd_float2 row2);
+static simd_float2x4 SIMD_CFUNC
simd_matrix_from_rows(simd_float2 row0, simd_float2 row1, simd_float2 row2, simd_float2 row3); +static simd_float3x2 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1); +static simd_float3x3 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1, simd_float3 row2); +static simd_float3x4 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1, simd_float3 row2, simd_float3 row3); +static simd_float4x2 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1); +static simd_float4x3 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1, simd_float4 row2); +static simd_float4x4 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1, simd_float4 row2, simd_float4 row3); +static simd_double2x2 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1); +static simd_double2x3 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1, simd_double2 row2); +static simd_double2x4 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1, simd_double2 row2, simd_double2 row3); +static simd_double3x2 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1); +static simd_double3x3 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1, simd_double3 row2); +static simd_double3x4 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1, simd_double3 row2, simd_double3 row3); +static simd_double4x2 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1); +static simd_double4x3 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1, simd_double4 row2); +static simd_double4x4 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1, simd_double4 row2, simd_double4 row3); +#define matrix_from_rows simd_matrix_from_rows + +static simd_float3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatf q); +static simd_float4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatf q); +static simd_double3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatd q); +static simd_double4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatd q); + +static simd_float2x2 SIMD_CFUNC simd_mul(float __a, simd_float2x2 __x); +static simd_float3x2 SIMD_CFUNC simd_mul(float __a, simd_float3x2 __x); +static simd_float4x2 SIMD_CFUNC simd_mul(float __a, simd_float4x2 __x); +static simd_float2x3 SIMD_CFUNC simd_mul(float __a, simd_float2x3 __x); +static simd_float3x3 SIMD_CFUNC simd_mul(float __a, simd_float3x3 __x); +static simd_float4x3 SIMD_CFUNC simd_mul(float __a, simd_float4x3 __x); +static simd_float2x4 SIMD_CFUNC simd_mul(float __a, simd_float2x4 __x); +static simd_float3x4 SIMD_CFUNC simd_mul(float __a, simd_float3x4 __x); +static simd_float4x4 SIMD_CFUNC simd_mul(float __a, simd_float4x4 __x); +static simd_double2x2 SIMD_CFUNC simd_mul(double __a, simd_double2x2 __x); +static simd_double3x2 SIMD_CFUNC simd_mul(double __a, simd_double3x2 __x); +static simd_double4x2 SIMD_CFUNC simd_mul(double __a, simd_double4x2 __x); +static simd_double2x3 SIMD_CFUNC simd_mul(double __a, simd_double2x3 __x); +static simd_double3x3 SIMD_CFUNC simd_mul(double __a, simd_double3x3 __x); +static simd_double4x3 SIMD_CFUNC simd_mul(double __a, simd_double4x3 __x); +static simd_double2x4 SIMD_CFUNC simd_mul(double __a, simd_double2x4 __x); +static simd_double3x4 SIMD_CFUNC simd_mul(double __a, simd_double3x4 __x); +static simd_double4x4 SIMD_CFUNC simd_mul(double __a, simd_double4x4 __x); + +static simd_float2x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x2 __x, float __b, simd_float2x2 
__y); +static simd_float3x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x2 __x, float __b, simd_float3x2 __y); +static simd_float4x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x2 __x, float __b, simd_float4x2 __y); +static simd_float2x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x3 __x, float __b, simd_float2x3 __y); +static simd_float3x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x3 __x, float __b, simd_float3x3 __y); +static simd_float4x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x3 __x, float __b, simd_float4x3 __y); +static simd_float2x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x4 __x, float __b, simd_float2x4 __y); +static simd_float3x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x4 __x, float __b, simd_float3x4 __y); +static simd_float4x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x4 __x, float __b, simd_float4x4 __y); +static simd_double2x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x2 __x, double __b, simd_double2x2 __y); +static simd_double3x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x2 __x, double __b, simd_double3x2 __y); +static simd_double4x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x2 __x, double __b, simd_double4x2 __y); +static simd_double2x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x3 __x, double __b, simd_double2x3 __y); +static simd_double3x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x3 __x, double __b, simd_double3x3 __y); +static simd_double4x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x3 __x, double __b, simd_double4x3 __y); +static simd_double2x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x4 __x, double __b, simd_double2x4 __y); +static simd_double3x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x4 __x, double __b, simd_double3x4 __y); +static simd_double4x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x4 __x, double __b, simd_double4x4 __y); +#define matrix_linear_combination simd_linear_combination + +static simd_float2x2 SIMD_CFUNC simd_add(simd_float2x2 __x, simd_float2x2 __y); +static simd_float3x2 SIMD_CFUNC simd_add(simd_float3x2 __x, simd_float3x2 __y); +static simd_float4x2 SIMD_CFUNC simd_add(simd_float4x2 __x, simd_float4x2 __y); +static simd_float2x3 SIMD_CFUNC simd_add(simd_float2x3 __x, simd_float2x3 __y); +static simd_float3x3 SIMD_CFUNC simd_add(simd_float3x3 __x, simd_float3x3 __y); +static simd_float4x3 SIMD_CFUNC simd_add(simd_float4x3 __x, simd_float4x3 __y); +static simd_float2x4 SIMD_CFUNC simd_add(simd_float2x4 __x, simd_float2x4 __y); +static simd_float3x4 SIMD_CFUNC simd_add(simd_float3x4 __x, simd_float3x4 __y); +static simd_float4x4 SIMD_CFUNC simd_add(simd_float4x4 __x, simd_float4x4 __y); +static simd_double2x2 SIMD_CFUNC simd_add(simd_double2x2 __x, simd_double2x2 __y); +static simd_double3x2 SIMD_CFUNC simd_add(simd_double3x2 __x, simd_double3x2 __y); +static simd_double4x2 SIMD_CFUNC simd_add(simd_double4x2 __x, simd_double4x2 __y); +static simd_double2x3 SIMD_CFUNC simd_add(simd_double2x3 __x, simd_double2x3 __y); +static simd_double3x3 SIMD_CFUNC simd_add(simd_double3x3 __x, simd_double3x3 __y); +static simd_double4x3 SIMD_CFUNC simd_add(simd_double4x3 __x, simd_double4x3 __y); +static simd_double2x4 SIMD_CFUNC simd_add(simd_double2x4 __x, simd_double2x4 __y); +static simd_double3x4 SIMD_CFUNC simd_add(simd_double3x4 __x, simd_double3x4 __y); +static simd_double4x4 SIMD_CFUNC 
simd_add(simd_double4x4 __x, simd_double4x4 __y); +#define matrix_add simd_add + +static simd_float2x2 SIMD_CFUNC simd_sub(simd_float2x2 __x, simd_float2x2 __y); +static simd_float3x2 SIMD_CFUNC simd_sub(simd_float3x2 __x, simd_float3x2 __y); +static simd_float4x2 SIMD_CFUNC simd_sub(simd_float4x2 __x, simd_float4x2 __y); +static simd_float2x3 SIMD_CFUNC simd_sub(simd_float2x3 __x, simd_float2x3 __y); +static simd_float3x3 SIMD_CFUNC simd_sub(simd_float3x3 __x, simd_float3x3 __y); +static simd_float4x3 SIMD_CFUNC simd_sub(simd_float4x3 __x, simd_float4x3 __y); +static simd_float2x4 SIMD_CFUNC simd_sub(simd_float2x4 __x, simd_float2x4 __y); +static simd_float3x4 SIMD_CFUNC simd_sub(simd_float3x4 __x, simd_float3x4 __y); +static simd_float4x4 SIMD_CFUNC simd_sub(simd_float4x4 __x, simd_float4x4 __y); +static simd_double2x2 SIMD_CFUNC simd_sub(simd_double2x2 __x, simd_double2x2 __y); +static simd_double3x2 SIMD_CFUNC simd_sub(simd_double3x2 __x, simd_double3x2 __y); +static simd_double4x2 SIMD_CFUNC simd_sub(simd_double4x2 __x, simd_double4x2 __y); +static simd_double2x3 SIMD_CFUNC simd_sub(simd_double2x3 __x, simd_double2x3 __y); +static simd_double3x3 SIMD_CFUNC simd_sub(simd_double3x3 __x, simd_double3x3 __y); +static simd_double4x3 SIMD_CFUNC simd_sub(simd_double4x3 __x, simd_double4x3 __y); +static simd_double2x4 SIMD_CFUNC simd_sub(simd_double2x4 __x, simd_double2x4 __y); +static simd_double3x4 SIMD_CFUNC simd_sub(simd_double3x4 __x, simd_double3x4 __y); +static simd_double4x4 SIMD_CFUNC simd_sub(simd_double4x4 __x, simd_double4x4 __y); +#define matrix_sub simd_sub + +static simd_float2x2 SIMD_CFUNC simd_transpose(simd_float2x2 __x); +static simd_float2x3 SIMD_CFUNC simd_transpose(simd_float3x2 __x); +static simd_float2x4 SIMD_CFUNC simd_transpose(simd_float4x2 __x); +static simd_float3x2 SIMD_CFUNC simd_transpose(simd_float2x3 __x); +static simd_float3x3 SIMD_CFUNC simd_transpose(simd_float3x3 __x); +static simd_float3x4 SIMD_CFUNC simd_transpose(simd_float4x3 __x); +static simd_float4x2 SIMD_CFUNC simd_transpose(simd_float2x4 __x); +static simd_float4x3 SIMD_CFUNC simd_transpose(simd_float3x4 __x); +static simd_float4x4 SIMD_CFUNC simd_transpose(simd_float4x4 __x); +static simd_double2x2 SIMD_CFUNC simd_transpose(simd_double2x2 __x); +static simd_double2x3 SIMD_CFUNC simd_transpose(simd_double3x2 __x); +static simd_double2x4 SIMD_CFUNC simd_transpose(simd_double4x2 __x); +static simd_double3x2 SIMD_CFUNC simd_transpose(simd_double2x3 __x); +static simd_double3x3 SIMD_CFUNC simd_transpose(simd_double3x3 __x); +static simd_double3x4 SIMD_CFUNC simd_transpose(simd_double4x3 __x); +static simd_double4x2 SIMD_CFUNC simd_transpose(simd_double2x4 __x); +static simd_double4x3 SIMD_CFUNC simd_transpose(simd_double3x4 __x); +static simd_double4x4 SIMD_CFUNC simd_transpose(simd_double4x4 __x); +#define matrix_transpose simd_transpose + +static float SIMD_CFUNC simd_trace(simd_float2x2 __x); +static float SIMD_CFUNC simd_trace(simd_float3x3 __x); +static float SIMD_CFUNC simd_trace(simd_float4x4 __x); +static double SIMD_CFUNC simd_trace(simd_double2x2 __x); +static double SIMD_CFUNC simd_trace(simd_double3x3 __x); +static double SIMD_CFUNC simd_trace(simd_double4x4 __x); +#define matrix_trace simd_trace + +static float SIMD_CFUNC simd_determinant(simd_float2x2 __x); +static float SIMD_CFUNC simd_determinant(simd_float3x3 __x); +static float SIMD_CFUNC simd_determinant(simd_float4x4 __x); +static double SIMD_CFUNC simd_determinant(simd_double2x2 __x); +static double SIMD_CFUNC 
simd_determinant(simd_double3x3 __x); +static double SIMD_CFUNC simd_determinant(simd_double4x4 __x); +#define matrix_determinant simd_determinant + +static simd_float2x2 SIMD_CFUNC simd_inverse(simd_float2x2 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +static simd_float3x3 SIMD_CFUNC simd_inverse(simd_float3x3 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +static simd_float4x4 SIMD_CFUNC simd_inverse(simd_float4x4 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +static simd_double2x2 SIMD_CFUNC simd_inverse(simd_double2x2 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +static simd_double3x3 SIMD_CFUNC simd_inverse(simd_double3x3 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +static simd_double4x4 SIMD_CFUNC simd_inverse(simd_double4x4 __x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)); +#define matrix_invert simd_inverse + +static simd_float2 SIMD_CFUNC simd_mul(simd_float2x2 __x, simd_float2 __y); +static simd_float2 SIMD_CFUNC simd_mul(simd_float3x2 __x, simd_float3 __y); +static simd_float2 SIMD_CFUNC simd_mul(simd_float4x2 __x, simd_float4 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float2x3 __x, simd_float2 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float3x3 __x, simd_float3 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float4x3 __x, simd_float4 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float2x4 __x, simd_float2 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float3x4 __x, simd_float3 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float4x4 __x, simd_float4 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double2 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double3 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double4 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double2 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double3 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double4 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double2 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double3 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double4 __y); +static simd_float2 SIMD_CFUNC simd_mul(simd_float2 __x, simd_float2x2 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float2 __x, simd_float3x2 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float2 __x, simd_float4x2 __y); +static simd_float2 SIMD_CFUNC simd_mul(simd_float3 __x, simd_float2x3 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float3 __x, simd_float3x3 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float3 __x, simd_float4x3 __y); +static simd_float2 SIMD_CFUNC simd_mul(simd_float4 __x, simd_float2x4 __y); +static simd_float3 SIMD_CFUNC simd_mul(simd_float4 __x, simd_float3x4 __y); +static simd_float4 SIMD_CFUNC simd_mul(simd_float4 __x, simd_float4x4 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double2x2 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double3x2 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double4x2 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double3 __x, simd_double2x3 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double3 __x, simd_double3x3 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double3 __x, 
simd_double4x3 __y); +static simd_double2 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double2x4 __y); +static simd_double3 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double3x4 __y); +static simd_double4 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double4x4 __y); +static simd_float2x2 SIMD_CFUNC simd_mul(simd_float2x2 __x, simd_float2x2 __y); +static simd_float3x2 SIMD_CFUNC simd_mul(simd_float2x2 __x, simd_float3x2 __y); +static simd_float4x2 SIMD_CFUNC simd_mul(simd_float2x2 __x, simd_float4x2 __y); +static simd_float2x3 SIMD_CFUNC simd_mul(simd_float2x3 __x, simd_float2x2 __y); +static simd_float3x3 SIMD_CFUNC simd_mul(simd_float2x3 __x, simd_float3x2 __y); +static simd_float4x3 SIMD_CFUNC simd_mul(simd_float2x3 __x, simd_float4x2 __y); +static simd_float2x4 SIMD_CFUNC simd_mul(simd_float2x4 __x, simd_float2x2 __y); +static simd_float3x4 SIMD_CFUNC simd_mul(simd_float2x4 __x, simd_float3x2 __y); +static simd_float4x4 SIMD_CFUNC simd_mul(simd_float2x4 __x, simd_float4x2 __y); +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double2x2 __y); +static simd_double3x2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double3x2 __y); +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double4x2 __y); +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double2x2 __y); +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double3x2 __y); +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double4x2 __y); +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double2x2 __y); +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double3x2 __y); +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double4x2 __y); +static simd_float2x2 SIMD_CFUNC simd_mul(simd_float3x2 __x, simd_float2x3 __y); +static simd_float3x2 SIMD_CFUNC simd_mul(simd_float3x2 __x, simd_float3x3 __y); +static simd_float4x2 SIMD_CFUNC simd_mul(simd_float3x2 __x, simd_float4x3 __y); +static simd_float2x3 SIMD_CFUNC simd_mul(simd_float3x3 __x, simd_float2x3 __y); +static simd_float3x3 SIMD_CFUNC simd_mul(simd_float3x3 __x, simd_float3x3 __y); +static simd_float4x3 SIMD_CFUNC simd_mul(simd_float3x3 __x, simd_float4x3 __y); +static simd_float2x4 SIMD_CFUNC simd_mul(simd_float3x4 __x, simd_float2x3 __y); +static simd_float3x4 SIMD_CFUNC simd_mul(simd_float3x4 __x, simd_float3x3 __y); +static simd_float4x4 SIMD_CFUNC simd_mul(simd_float3x4 __x, simd_float4x3 __y); +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double2x3 __y); +static simd_double3x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double3x3 __y); +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double4x3 __y); +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double2x3 __y); +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double3x3 __y); +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double4x3 __y); +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double2x3 __y); +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double3x3 __y); +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double4x3 __y); +static simd_float2x2 SIMD_CFUNC simd_mul(simd_float4x2 __x, simd_float2x4 __y); +static simd_float3x2 SIMD_CFUNC simd_mul(simd_float4x2 __x, simd_float3x4 __y); +static simd_float4x2 SIMD_CFUNC simd_mul(simd_float4x2 __x, simd_float4x4 __y); +static simd_float2x3 SIMD_CFUNC simd_mul(simd_float4x3 __x, simd_float2x4 __y); 
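/* Editor's note -- an illustrative sketch, not part of the vendored
 * <simd/matrix.h> header. The simd_mul overloads in this group follow the
 * library's column-major convention: simd_mul(m, v) treats v as a column
 * vector, simd_mul(v, m) treats it as a row vector, and the matrix-matrix
 * overloads require the inner dimensions to agree. A minimal usage example,
 * assuming clang with <simd/simd.h> on the include path (simd_matrix is the
 * column-wise constructor implemented later in this header):
 *
 *     #include <simd/simd.h>
 *
 *     // 90-degree counter-clockwise rotation, given by its two columns.
 *     simd_float2x2 rot90 = simd_matrix((simd_float2){ 0, 1},
 *                                       (simd_float2){-1, 0});
 *     simd_float2 v = {1, 0};
 *     simd_float2 col = simd_mul(rot90, v);         // column vector: {0, 1}
 *     simd_float2 row = simd_mul(v, rot90);         // row vector:    {0, -1}
 *     simd_float2x2 r180 = simd_mul(rot90, rot90);  // matrix product
 */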
+static simd_float3x3 SIMD_CFUNC simd_mul(simd_float4x3 __x, simd_float3x4 __y); +static simd_float4x3 SIMD_CFUNC simd_mul(simd_float4x3 __x, simd_float4x4 __y); +static simd_float2x4 SIMD_CFUNC simd_mul(simd_float4x4 __x, simd_float2x4 __y); +static simd_float3x4 SIMD_CFUNC simd_mul(simd_float4x4 __x, simd_float3x4 __y); +static simd_float4x4 SIMD_CFUNC simd_mul(simd_float4x4 __x, simd_float4x4 __y); +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double2x4 __y); +static simd_double3x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double3x4 __y); +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double4x4 __y); +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double2x4 __y); +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double3x4 __y); +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double4x4 __y); +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double2x4 __y); +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double3x4 __y); +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double4x4 __y); + +static simd_bool SIMD_CFUNC simd_equal(simd_float2x2 __x, simd_float2x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float2x3 __x, simd_float2x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float2x4 __x, simd_float2x4 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float3x2 __x, simd_float3x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float3x3 __x, simd_float3x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float3x4 __x, simd_float3x4 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float4x2 __x, simd_float4x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float4x3 __x, simd_float4x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_float4x4 __x, simd_float4x4 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double2x2 __x, simd_double2x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double2x3 __x, simd_double2x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double2x4 __x, simd_double2x4 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double3x2 __x, simd_double3x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double3x3 __x, simd_double3x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double3x4 __x, simd_double3x4 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double4x2 __x, simd_double4x2 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double4x3 __x, simd_double4x3 __y); +static simd_bool SIMD_CFUNC simd_equal(simd_double4x4 __x, simd_double4x4 __y); +#define matrix_equal simd_equal + +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x2 __x, simd_float2x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x3 __x, simd_float2x3 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x4 __x, simd_float2x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x2 __x, simd_float3x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x3 __x, simd_float3x3 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x4 __x, simd_float3x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float4x2 __x, simd_float4x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float4x3 __x, simd_float4x3 __y, float __tol); +static simd_bool SIMD_CFUNC 
simd_almost_equal_elements(simd_float4x4 __x, simd_float4x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x2 __x, simd_double2x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x3 __x, simd_double2x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x4 __x, simd_double2x4 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x2 __x, simd_double3x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x3 __x, simd_double3x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x4 __x, simd_double3x4 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x2 __x, simd_double4x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x3 __x, simd_double4x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x4 __x, simd_double4x4 __y, double __tol); +#define matrix_almost_equal_elements simd_almost_equal_elements + +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x2 __x, simd_float2x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x3 __x, simd_float2x3 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x4 __x, simd_float2x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float3x2 __x, simd_float3x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float3x3 __x, simd_float3x3 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float3x4 __x, simd_float3x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x2 __x, simd_float4x2 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x3 __x, simd_float4x3 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x4 __x, simd_float4x4 __y, float __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x2 __x, simd_double2x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x3 __x, simd_double2x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x4 __x, simd_double2x4 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double3x2 __x, simd_double3x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double3x3 __x, simd_double3x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double3x4 __x, simd_double3x4 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x2 __x, simd_double4x2 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x3 __x, simd_double4x3 __y, double __tol); +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x4 __x, simd_double4x4 __y, double __tol); +#define matrix_almost_equal_elements_relative simd_almost_equal_elements_relative + +#ifdef __cplusplus +} /* extern "C" */ + +namespace simd { + static SIMD_CPPFUNC float2x2 operator+(const float2x2 x, const float2x2 y) { return float2x2(::simd_linear_combination(1, x, 1, 
y)); } + static SIMD_CPPFUNC float2x3 operator+(const float2x3 x, const float2x3 y) { return float2x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float2x4 operator+(const float2x4 x, const float2x4 y) { return float2x4(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float3x2 operator+(const float3x2 x, const float3x2 y) { return float3x2(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float3x3 operator+(const float3x3 x, const float3x3 y) { return float3x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float3x4 operator+(const float3x4 x, const float3x4 y) { return float3x4(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float4x2 operator+(const float4x2 x, const float4x2 y) { return float4x2(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float4x3 operator+(const float4x3 x, const float4x3 y) { return float4x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC float4x4 operator+(const float4x4 x, const float4x4 y) { return float4x4(::simd_linear_combination(1, x, 1, y)); } + + static SIMD_CPPFUNC float2x2 operator-(const float2x2 x, const float2x2 y) { return float2x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float2x3 operator-(const float2x3 x, const float2x3 y) { return float2x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float2x4 operator-(const float2x4 x, const float2x4 y) { return float2x4(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float3x2 operator-(const float3x2 x, const float3x2 y) { return float3x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float3x3 operator-(const float3x3 x, const float3x3 y) { return float3x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float3x4 operator-(const float3x4 x, const float3x4 y) { return float3x4(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float4x2 operator-(const float4x2 x, const float4x2 y) { return float4x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float4x3 operator-(const float4x3 x, const float4x3 y) { return float4x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC float4x4 operator-(const float4x4 x, const float4x4 y) { return float4x4(::simd_linear_combination(1, x, -1, y)); } + + static SIMD_INLINE SIMD_NODEBUG float2x2& operator+=(float2x2& x, const float2x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float2x3& operator+=(float2x3& x, const float2x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float2x4& operator+=(float2x4& x, const float2x4 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x2& operator+=(float3x2& x, const float3x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x3& operator+=(float3x3& x, const float3x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x4& operator+=(float3x4& x, const float3x4 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x2& operator+=(float4x2& x, const float4x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x3& operator+=(float4x3& x, const float4x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x4& operator+=(float4x4& x, const float4x4 y) { x = x + y; return x; } + + static SIMD_INLINE SIMD_NODEBUG float2x2& operator-=(float2x2& x, const float2x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float2x3& operator-=(float2x3& x, const float2x3 y) { x = x 
- y; return x; } + static SIMD_INLINE SIMD_NODEBUG float2x4& operator-=(float2x4& x, const float2x4 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x2& operator-=(float3x2& x, const float3x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x3& operator-=(float3x3& x, const float3x3 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float3x4& operator-=(float3x4& x, const float3x4 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x2& operator-=(float4x2& x, const float4x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x3& operator-=(float4x3& x, const float4x3 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG float4x4& operator-=(float4x4& x, const float4x4 y) { x = x - y; return x; } + + static SIMD_CPPFUNC float2x2 transpose(const float2x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float2x3 transpose(const float3x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float2x4 transpose(const float4x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float3x2 transpose(const float2x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float3x3 transpose(const float3x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float3x4 transpose(const float4x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float4x2 transpose(const float2x4 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float4x3 transpose(const float3x4 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC float4x4 transpose(const float4x4 x) { return ::simd_transpose(x); } + + static SIMD_CPPFUNC float trace(const float2x2 x) { return ::simd_trace(x); } + static SIMD_CPPFUNC float trace(const float3x3 x) { return ::simd_trace(x); } + static SIMD_CPPFUNC float trace(const float4x4 x) { return ::simd_trace(x); } + + static SIMD_CPPFUNC float determinant(const float2x2 x) { return ::simd_determinant(x); } + static SIMD_CPPFUNC float determinant(const float3x3 x) { return ::simd_determinant(x); } + static SIMD_CPPFUNC float determinant(const float4x4 x) { return ::simd_determinant(x); } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgcc-compat" + static SIMD_CPPFUNC float2x2 inverse(const float2x2 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } + static SIMD_CPPFUNC float3x3 inverse(const float3x3 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } + static SIMD_CPPFUNC float4x4 inverse(const float4x4 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } +#pragma clang diagnostic pop + + static SIMD_CPPFUNC float2x2 operator*(const float a, const float2x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float2x3 operator*(const float a, const float2x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float2x4 operator*(const float a, const float2x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x2 operator*(const float a, const float3x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x3 operator*(const float a, const float3x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x4 operator*(const float a, const float3x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float4x2 operator*(const float a, const float4x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float4x3 operator*(const float a, const float4x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC 
float4x4 operator*(const float a, const float4x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float2x2 operator*(const float2x2 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float2x3 operator*(const float2x3 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float2x4 operator*(const float2x4 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x2 operator*(const float3x2 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x3 operator*(const float3x3 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float3x4 operator*(const float3x4 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float4x2 operator*(const float4x2 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float4x3 operator*(const float4x3 x, const float a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC float4x4 operator*(const float4x4 x, const float a) { return ::simd_mul(a, x); } + static SIMD_INLINE SIMD_NODEBUG float2x2& operator*=(float2x2& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float2x3& operator*=(float2x3& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float2x4& operator*=(float2x4& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x2& operator*=(float3x2& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x3& operator*=(float3x3& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x4& operator*=(float3x4& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x2& operator*=(float4x2& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x3& operator*=(float4x3& x, const float a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x4& operator*=(float4x4& x, const float a) { x = ::simd_mul(a, x); return x; } + + static SIMD_CPPFUNC float2 operator*(const float2 x, const float2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const float2 x, const float3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float2 x, const float4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2 operator*(const float3 x, const float2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const float3 x, const float3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float3 x, const float4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2 operator*(const float4 x, const float2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const float4 x, const float3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float4 x, const float4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2 operator*(const float2x2 x, const float2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2 operator*(const float3x2 x, const float3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2 operator*(const float4x2 x, const float4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const float2x3 x, const float2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const float3x3 x, const float3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3 operator*(const 
float4x3 x, const float4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float2x4 x, const float2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float3x4 x, const float3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4 operator*(const float4x4 x, const float4 y) { return ::simd_mul(x, y); } + static SIMD_INLINE SIMD_NODEBUG float2& operator*=(float2& x, const float2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float3& operator*=(float3& x, const float3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float4& operator*=(float4& x, const float4x4 y) { x = ::simd_mul(x, y); return x; } + + static SIMD_CPPFUNC float2x2 operator*(const float2x2 x, const float2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x2 operator*(const float2x2 x, const float3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x2 operator*(const float2x2 x, const float4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x3 operator*(const float2x3 x, const float2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x3 operator*(const float2x3 x, const float3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x3 operator*(const float2x3 x, const float4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x4 operator*(const float2x4 x, const float2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x4 operator*(const float2x4 x, const float3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x4 operator*(const float2x4 x, const float4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x2 operator*(const float3x2 x, const float2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x2 operator*(const float3x2 x, const float3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x2 operator*(const float3x2 x, const float4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x3 operator*(const float3x3 x, const float2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x3 operator*(const float3x3 x, const float3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x3 operator*(const float3x3 x, const float4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x4 operator*(const float3x4 x, const float2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x4 operator*(const float3x4 x, const float3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x4 operator*(const float3x4 x, const float4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x2 operator*(const float4x2 x, const float2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x2 operator*(const float4x2 x, const float3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x2 operator*(const float4x2 x, const float4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x3 operator*(const float4x3 x, const float2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x3 operator*(const float4x3 x, const float3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x3 operator*(const float4x3 x, const float4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float2x4 operator*(const float4x4 x, const float2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float3x4 operator*(const float4x4 x, const float3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC float4x4 operator*(const float4x4 x, const float4x4 y) { 
return ::simd_mul(x, y); } + static SIMD_INLINE SIMD_NODEBUG float2x2& operator*=(float2x2& x, const float2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float2x3& operator*=(float2x3& x, const float2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float2x4& operator*=(float2x4& x, const float2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x2& operator*=(float3x2& x, const float3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x3& operator*=(float3x3& x, const float3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float3x4& operator*=(float3x4& x, const float3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x2& operator*=(float4x2& x, const float4x4 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x3& operator*=(float4x3& x, const float4x4 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG float4x4& operator*=(float4x4& x, const float4x4 y) { x = ::simd_mul(x, y); return x; } + + static SIMD_CPPFUNC bool operator==(const float2x2& x, const float2x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float2x3& x, const float2x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float2x4& x, const float2x4& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float3x2& x, const float3x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float3x3& x, const float3x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float3x4& x, const float3x4& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float4x2& x, const float4x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float4x3& x, const float4x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const float4x4& x, const float4x4& y) { return ::simd_equal(x, y); } + + static SIMD_CPPFUNC bool operator!=(const float2x2& x, const float2x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float2x3& x, const float2x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float2x4& x, const float2x4& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float3x2& x, const float3x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float3x3& x, const float3x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float3x4& x, const float3x4& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float4x2& x, const float4x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float4x3& x, const float4x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const float4x4& x, const float4x4& y) { return !(x == y); } + + static SIMD_CPPFUNC bool almost_equal_elements(const float2x2 x, const float2x2 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float2x3 x, const float2x3 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float2x4 x, const float2x4 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float3x2 x, const float3x2 y, const float tol) { return 
::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float3x3 x, const float3x3 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float3x4 x, const float3x4 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float4x2 x, const float4x2 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float4x3 x, const float4x3 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const float4x4 x, const float4x4 y, const float tol) { return ::simd_almost_equal_elements(x, y, tol); } + + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float2x2 x, const float2x2 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float2x3 x, const float2x3 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float2x4 x, const float2x4 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float3x2 x, const float3x2 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float3x3 x, const float3x3 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float3x4 x, const float3x4 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float4x2 x, const float4x2 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float4x3 x, const float4x3 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const float4x4 x, const float4x4 y, const float tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + + static SIMD_CPPFUNC double2x2 operator+(const double2x2 x, const double2x2 y) { return double2x2(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double2x3 operator+(const double2x3 x, const double2x3 y) { return double2x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double2x4 operator+(const double2x4 x, const double2x4 y) { return double2x4(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double3x2 operator+(const double3x2 x, const double3x2 y) { return double3x2(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double3x3 operator+(const double3x3 x, const double3x3 y) { return double3x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double3x4 operator+(const double3x4 x, const double3x4 y) { return double3x4(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double4x2 operator+(const double4x2 x, const double4x2 y) { return double4x2(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double4x3 operator+(const double4x3 x, const double4x3 y) { return double4x3(::simd_linear_combination(1, x, 1, y)); } + static SIMD_CPPFUNC double4x4 operator+(const double4x4 x, 
const double4x4 y) { return double4x4(::simd_linear_combination(1, x, 1, y)); } + + static SIMD_CPPFUNC double2x2 operator-(const double2x2 x, const double2x2 y) { return double2x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double2x3 operator-(const double2x3 x, const double2x3 y) { return double2x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double2x4 operator-(const double2x4 x, const double2x4 y) { return double2x4(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double3x2 operator-(const double3x2 x, const double3x2 y) { return double3x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double3x3 operator-(const double3x3 x, const double3x3 y) { return double3x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double3x4 operator-(const double3x4 x, const double3x4 y) { return double3x4(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double4x2 operator-(const double4x2 x, const double4x2 y) { return double4x2(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double4x3 operator-(const double4x3 x, const double4x3 y) { return double4x3(::simd_linear_combination(1, x, -1, y)); } + static SIMD_CPPFUNC double4x4 operator-(const double4x4 x, const double4x4 y) { return double4x4(::simd_linear_combination(1, x, -1, y)); } + + static SIMD_INLINE SIMD_NODEBUG double2x2& operator+=(double2x2& x, const double2x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double2x3& operator+=(double2x3& x, const double2x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double2x4& operator+=(double2x4& x, const double2x4 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x2& operator+=(double3x2& x, const double3x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x3& operator+=(double3x3& x, const double3x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x4& operator+=(double3x4& x, const double3x4 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x2& operator+=(double4x2& x, const double4x2 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x3& operator+=(double4x3& x, const double4x3 y) { x = x + y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x4& operator+=(double4x4& x, const double4x4 y) { x = x + y; return x; } + + static SIMD_INLINE SIMD_NODEBUG double2x2& operator-=(double2x2& x, const double2x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double2x3& operator-=(double2x3& x, const double2x3 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double2x4& operator-=(double2x4& x, const double2x4 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x2& operator-=(double3x2& x, const double3x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x3& operator-=(double3x3& x, const double3x3 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double3x4& operator-=(double3x4& x, const double3x4 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x2& operator-=(double4x2& x, const double4x2 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x3& operator-=(double4x3& x, const double4x3 y) { x = x - y; return x; } + static SIMD_INLINE SIMD_NODEBUG double4x4& operator-=(double4x4& x, const double4x4 y) { x = x - y; return x; } + + static SIMD_CPPFUNC double2x2 transpose(const double2x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double2x3 
transpose(const double3x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double2x4 transpose(const double4x2 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double3x2 transpose(const double2x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double3x3 transpose(const double3x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double3x4 transpose(const double4x3 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double4x2 transpose(const double2x4 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double4x3 transpose(const double3x4 x) { return ::simd_transpose(x); } + static SIMD_CPPFUNC double4x4 transpose(const double4x4 x) { return ::simd_transpose(x); } + + static SIMD_CPPFUNC double trace(const double2x2 x) { return ::simd_trace(x); } + static SIMD_CPPFUNC double trace(const double3x3 x) { return ::simd_trace(x); } + static SIMD_CPPFUNC double trace(const double4x4 x) { return ::simd_trace(x); } + + static SIMD_CPPFUNC double determinant(const double2x2 x) { return ::simd_determinant(x); } + static SIMD_CPPFUNC double determinant(const double3x3 x) { return ::simd_determinant(x); } + static SIMD_CPPFUNC double determinant(const double4x4 x) { return ::simd_determinant(x); } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgcc-compat" + static SIMD_CPPFUNC double2x2 inverse(const double2x2 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } + static SIMD_CPPFUNC double3x3 inverse(const double3x3 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } + static SIMD_CPPFUNC double4x4 inverse(const double4x4 x) __API_AVAILABLE(macos(10.10), ios(8.0), watchos(2.0), tvos(9.0)) { return ::simd_inverse(x); } +#pragma clang diagnostic pop + + static SIMD_CPPFUNC double2x2 operator*(const double a, const double2x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double2x3 operator*(const double a, const double2x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double2x4 operator*(const double a, const double2x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x2 operator*(const double a, const double3x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x3 operator*(const double a, const double3x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x4 operator*(const double a, const double3x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x2 operator*(const double a, const double4x2 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x3 operator*(const double a, const double4x3 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x4 operator*(const double a, const double4x4 x) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double2x2 operator*(const double2x2 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double2x3 operator*(const double2x3 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double2x4 operator*(const double2x4 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x2 operator*(const double3x2 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x3 operator*(const double3x3 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double3x4 operator*(const double3x4 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x2 operator*(const double4x2 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x3 
operator*(const double4x3 x, const double a) { return ::simd_mul(a, x); } + static SIMD_CPPFUNC double4x4 operator*(const double4x4 x, const double a) { return ::simd_mul(a, x); } + static SIMD_INLINE SIMD_NODEBUG double2x2& operator*=(double2x2& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double2x3& operator*=(double2x3& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double2x4& operator*=(double2x4& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double3x2& operator*=(double3x2& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double3x3& operator*=(double3x3& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double3x4& operator*=(double3x4& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x2& operator*=(double4x2& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x3& operator*=(double4x3& x, const double a) { x = ::simd_mul(a, x); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x4& operator*=(double4x4& x, const double a) { x = ::simd_mul(a, x); return x; } + + static SIMD_CPPFUNC double2 operator*(const double2 x, const double2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double2 x, const double3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double2 x, const double4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2 operator*(const double3 x, const double2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double3 x, const double3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double3 x, const double4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2 operator*(const double4 x, const double2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double4 x, const double3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double4 x, const double4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2 operator*(const double2x2 x, const double2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2 operator*(const double3x2 x, const double3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2 operator*(const double4x2 x, const double4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double2x3 x, const double2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double3x3 x, const double3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3 operator*(const double4x3 x, const double4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double2x4 x, const double2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double3x4 x, const double3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4 operator*(const double4x4 x, const double4 y) { return ::simd_mul(x, y); } + static SIMD_INLINE SIMD_NODEBUG double2& operator*=(double2& x, const double2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double3& operator*=(double3& x, const double3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double4& operator*=(double4& x, const double4x4 y) { x = ::simd_mul(x, y); return x; } + + static 
SIMD_CPPFUNC double2x2 operator*(const double2x2 x, const double2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x2 operator*(const double2x2 x, const double3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x2 operator*(const double2x2 x, const double4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x3 operator*(const double2x3 x, const double2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x3 operator*(const double2x3 x, const double3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x3 operator*(const double2x3 x, const double4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x4 operator*(const double2x4 x, const double2x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x4 operator*(const double2x4 x, const double3x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x4 operator*(const double2x4 x, const double4x2 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x2 operator*(const double3x2 x, const double2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x2 operator*(const double3x2 x, const double3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x2 operator*(const double3x2 x, const double4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x3 operator*(const double3x3 x, const double2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x3 operator*(const double3x3 x, const double3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x3 operator*(const double3x3 x, const double4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x4 operator*(const double3x4 x, const double2x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x4 operator*(const double3x4 x, const double3x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x4 operator*(const double3x4 x, const double4x3 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x2 operator*(const double4x2 x, const double2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x2 operator*(const double4x2 x, const double3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x2 operator*(const double4x2 x, const double4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x3 operator*(const double4x3 x, const double2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x3 operator*(const double4x3 x, const double3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x3 operator*(const double4x3 x, const double4x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double2x4 operator*(const double4x4 x, const double2x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double3x4 operator*(const double4x4 x, const double3x4 y) { return ::simd_mul(x, y); } + static SIMD_CPPFUNC double4x4 operator*(const double4x4 x, const double4x4 y) { return ::simd_mul(x, y); } + static SIMD_INLINE SIMD_NODEBUG double2x2& operator*=(double2x2& x, const double2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double2x3& operator*=(double2x3& x, const double2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double2x4& operator*=(double2x4& x, const double2x2 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double3x2& operator*=(double3x2& x, const double3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double3x3& operator*=(double3x3& x, const double3x3 y) { x = ::simd_mul(x, y); return 
x; } + static SIMD_INLINE SIMD_NODEBUG double3x4& operator*=(double3x4& x, const double3x3 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x2& operator*=(double4x2& x, const double4x4 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x3& operator*=(double4x3& x, const double4x4 y) { x = ::simd_mul(x, y); return x; } + static SIMD_INLINE SIMD_NODEBUG double4x4& operator*=(double4x4& x, const double4x4 y) { x = ::simd_mul(x, y); return x; } + + static SIMD_CPPFUNC bool operator==(const double2x2& x, const double2x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double2x3& x, const double2x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double2x4& x, const double2x4& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double3x2& x, const double3x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double3x3& x, const double3x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double3x4& x, const double3x4& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double4x2& x, const double4x2& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double4x3& x, const double4x3& y) { return ::simd_equal(x, y); } + static SIMD_CPPFUNC bool operator==(const double4x4& x, const double4x4& y) { return ::simd_equal(x, y); } + + static SIMD_CPPFUNC bool operator!=(const double2x2& x, const double2x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double2x3& x, const double2x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double2x4& x, const double2x4& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double3x2& x, const double3x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double3x3& x, const double3x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double3x4& x, const double3x4& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double4x2& x, const double4x2& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double4x3& x, const double4x3& y) { return !(x == y); } + static SIMD_CPPFUNC bool operator!=(const double4x4& x, const double4x4& y) { return !(x == y); } + + static SIMD_CPPFUNC bool almost_equal_elements(const double2x2 x, const double2x2 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double2x3 x, const double2x3 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double2x4 x, const double2x4 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double3x2 x, const double3x2 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double3x3 x, const double3x3 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double3x4 x, const double3x4 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double4x2 x, const double4x2 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const 
double4x3 x, const double4x3 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements(const double4x4 x, const double4x4 y, const double tol) { return ::simd_almost_equal_elements(x, y, tol); } + + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double2x2 x, const double2x2 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double2x3 x, const double2x3 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double2x4 x, const double2x4 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double3x2 x, const double3x2 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double3x3 x, const double3x3 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double3x4 x, const double3x4 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double4x2 x, const double4x2 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double4x3 x, const double4x3 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } + static SIMD_CPPFUNC bool almost_equal_elements_relative(const double4x4 x, const double4x4 y, const double tol) { return ::simd_almost_equal_elements_relative(x, y, tol); } +} + +extern "C" { +#endif /* __cplusplus */ + +#pragma mark - Implementation + +static simd_float2x2 SIMD_CFUNC simd_diagonal_matrix(simd_float2 __x) { simd_float2x2 __r = { .columns[0] = {__x.x,0}, .columns[1] = {0,__x.y} }; return __r; } +static simd_double2x2 SIMD_CFUNC simd_diagonal_matrix(simd_double2 __x) { simd_double2x2 __r = { .columns[0] = {__x.x,0}, .columns[1] = {0,__x.y} }; return __r; } +static simd_float3x3 SIMD_CFUNC simd_diagonal_matrix(simd_float3 __x) { simd_float3x3 __r = { .columns[0] = {__x.x,0,0}, .columns[1] = {0,__x.y,0}, .columns[2] = {0,0,__x.z} }; return __r; } +static simd_double3x3 SIMD_CFUNC simd_diagonal_matrix(simd_double3 __x) { simd_double3x3 __r = { .columns[0] = {__x.x,0,0}, .columns[1] = {0,__x.y,0}, .columns[2] = {0,0,__x.z} }; return __r; } +static simd_float4x4 SIMD_CFUNC simd_diagonal_matrix(simd_float4 __x) { simd_float4x4 __r = { .columns[0] = {__x.x,0,0,0}, .columns[1] = {0,__x.y,0,0}, .columns[2] = {0,0,__x.z,0}, .columns[3] = {0,0,0,__x.w} }; return __r; } +static simd_double4x4 SIMD_CFUNC simd_diagonal_matrix(simd_double4 __x) { simd_double4x4 __r = { .columns[0] = {__x.x,0,0,0}, .columns[1] = {0,__x.y,0,0}, .columns[2] = {0,0,__x.z,0}, .columns[3] = {0,0,0,__x.w} }; return __r; } + +static simd_float2x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1) { simd_float2x2 __r = { .columns[0] = col0, .columns[1] = col1 }; return __r; } +static simd_float2x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1) { simd_float2x3 __r = { .columns[0] = col0, .columns[1] = col1 }; return __r; } +static simd_float2x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1) { simd_float2x4 __r = { .columns[0] = col0, .columns[1] = col1 
}; return __r; } +static simd_double2x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1) { simd_double2x2 __r = { .columns[0] = col0, .columns[1] = col1 }; return __r; } +static simd_double2x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1) { simd_double2x3 __r = { .columns[0] = col0, .columns[1] = col1 }; return __r; } +static simd_double2x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1) { simd_double2x4 __r = { .columns[0] = col0, .columns[1] = col1 }; return __r; } +static simd_float3x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1, simd_float2 col2) { simd_float3x2 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_float3x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1, simd_float3 col2) { simd_float3x3 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_float3x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1, simd_float4 col2) { simd_float3x4 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_double3x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1, simd_double2 col2) { simd_double3x2 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_double3x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1, simd_double3 col2) { simd_double3x3 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_double3x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1, simd_double4 col2) { simd_double3x4 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2 }; return __r; } +static simd_float4x2 SIMD_CFUNC simd_matrix(simd_float2 col0, simd_float2 col1, simd_float2 col2, simd_float2 col3) { simd_float4x2 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } +static simd_float4x3 SIMD_CFUNC simd_matrix(simd_float3 col0, simd_float3 col1, simd_float3 col2, simd_float3 col3) { simd_float4x3 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } +static simd_float4x4 SIMD_CFUNC simd_matrix(simd_float4 col0, simd_float4 col1, simd_float4 col2, simd_float4 col3) { simd_float4x4 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } +static simd_double4x2 SIMD_CFUNC simd_matrix(simd_double2 col0, simd_double2 col1, simd_double2 col2, simd_double2 col3) { simd_double4x2 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } +static simd_double4x3 SIMD_CFUNC simd_matrix(simd_double3 col0, simd_double3 col1, simd_double3 col2, simd_double3 col3) { simd_double4x3 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } +static simd_double4x4 SIMD_CFUNC simd_matrix(simd_double4 col0, simd_double4 col1, simd_double4 col2, simd_double4 col3) { simd_double4x4 __r = { .columns[0] = col0, .columns[1] = col1, .columns[2] = col2, .columns[3] = col3 }; return __r; } + +static simd_float2x2 SIMD_CFUNC simd_matrix_from_rows(simd_float2 row0, simd_float2 row1) { return simd_transpose(simd_matrix(row0, row1)); } +static simd_float3x2 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1) { return simd_transpose(simd_matrix(row0, row1)); } +static simd_float4x2 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1) { return 
simd_transpose(simd_matrix(row0, row1)); } +static simd_double2x2 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1) { return simd_transpose(simd_matrix(row0, row1)); } +static simd_double3x2 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1) { return simd_transpose(simd_matrix(row0, row1)); } +static simd_double4x2 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1) { return simd_transpose(simd_matrix(row0, row1)); } +static simd_float2x3 SIMD_CFUNC simd_matrix_from_rows(simd_float2 row0, simd_float2 row1, simd_float2 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_float3x3 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1, simd_float3 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_float4x3 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1, simd_float4 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_double2x3 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1, simd_double2 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_double3x3 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1, simd_double3 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_double4x3 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1, simd_double4 row2) { return simd_transpose(simd_matrix(row0, row1, row2)); } +static simd_float2x4 SIMD_CFUNC simd_matrix_from_rows(simd_float2 row0, simd_float2 row1, simd_float2 row2, simd_float2 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } +static simd_float3x4 SIMD_CFUNC simd_matrix_from_rows(simd_float3 row0, simd_float3 row1, simd_float3 row2, simd_float3 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } +static simd_float4x4 SIMD_CFUNC simd_matrix_from_rows(simd_float4 row0, simd_float4 row1, simd_float4 row2, simd_float4 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } +static simd_double2x4 SIMD_CFUNC simd_matrix_from_rows(simd_double2 row0, simd_double2 row1, simd_double2 row2, simd_double2 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } +static simd_double3x4 SIMD_CFUNC simd_matrix_from_rows(simd_double3 row0, simd_double3 row1, simd_double3 row2, simd_double3 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } +static simd_double4x4 SIMD_CFUNC simd_matrix_from_rows(simd_double4 row0, simd_double4 row1, simd_double4 row2, simd_double4 row3) { return simd_transpose(simd_matrix(row0, row1, row2, row3)); } + +static simd_float3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatf q) { + simd_float4x4 r = simd_matrix4x4(q); + return (simd_float3x3){ r.columns[0].xyz, r.columns[1].xyz, r.columns[2].xyz }; +} + +static simd_float4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatf q) { + simd_float4 v = q.vector; + simd_float4x4 r = { + .columns[0] = { v.x*v.x - v.y*v.y - v.z*v.z + v.w*v.w, + 2*(v.x*v.y + v.z*v.w), + 2*(v.x*v.z - v.y*v.w), 0 }, + .columns[1] = { 2*(v.x*v.y - v.z*v.w), + v.y*v.y - v.z*v.z + v.w*v.w - v.x*v.x, + 2*(v.y*v.z + v.x*v.w), 0 }, + .columns[2] = { 2*(v.z*v.x + v.y*v.w), + 2*(v.y*v.z - v.x*v.w), + v.z*v.z + v.w*v.w - v.x*v.x - v.y*v.y, 0 }, + .columns[3] = { 0, 0, 0, 1 } + }; + return r; +} + +static simd_double3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatd q) { + simd_double4x4 r = simd_matrix4x4(q); + return (simd_double3x3){ r.columns[0].xyz, r.columns[1].xyz, r.columns[2].xyz }; +} + 
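(Editor's note, not part of the vendored header: the constructors above come in column and row flavors — simd_matrix_from_rows is simply simd_matrix followed by simd_transpose — and the quaternion conversions expand q into the standard rotation matrix. A minimal usage sketch follows, assuming Apple's <simd/simd.h> umbrella header and clang, whose __attribute__((overloadable)) the C overloads rely on; simd_quaternion, simd_act, and simd_distance are the <simd/quaternion.h> and <simd/geometry.h> entry points, everything else is defined in this file.)

/* Illustrative usage only -- not shipped with the header above. */
#include <assert.h>
#include <math.h>
#include <simd/simd.h>

int main(void) {
    /* Column vs. row construction: from_rows == transpose(from_cols). */
    simd_float3 a = {1, 2, 3}, b = {4, 5, 6}, c = {7, 8, 9};
    simd_float3x3 cols = simd_matrix(a, b, c);           /* a, b, c are columns */
    simd_float3x3 rows = simd_matrix_from_rows(a, b, c); /* a, b, c are rows    */
    assert(simd_equal(simd_transpose(cols), rows));

    /* Rotating a vector by a quaternion agrees with the 3x3 matrix
     * produced by simd_matrix3x3(q) above. */
    simd_quatf q = simd_quaternion((float)M_PI_2, simd_make_float3(0, 0, 1));
    simd_float3 v = {1, 0, 0};
    simd_float3 by_quat = simd_act(q, v);                /* ~ (0, 1, 0) */
    simd_float3 by_mat  = simd_mul(simd_matrix3x3(q), v);
    assert(simd_distance(by_quat, by_mat) < 1e-5f);
    return 0;
}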
+static simd_double4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatd q) { + simd_double4 v = q.vector; + simd_double4x4 r = { + .columns[0] = { v.x*v.x - v.y*v.y - v.z*v.z + v.w*v.w, + 2*(v.x*v.y + v.z*v.w), + 2*(v.x*v.z - v.y*v.w), 0 }, + .columns[1] = { 2*(v.x*v.y - v.z*v.w), + v.y*v.y - v.z*v.z + v.w*v.w - v.x*v.x, + 2*(v.y*v.z + v.x*v.w), 0 }, + .columns[2] = { 2*(v.z*v.x + v.y*v.w), + 2*(v.y*v.z - v.x*v.w), + v.z*v.z + v.w*v.w - v.x*v.x - v.y*v.y, 0 }, + .columns[3] = { 0, 0, 0, 1 } + }; + return r; +} + +static simd_float2x2 SIMD_CFUNC matrix_scale(float __a, simd_float2x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_float3x2 SIMD_CFUNC matrix_scale(float __a, simd_float3x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x2 SIMD_CFUNC matrix_scale(float __a, simd_float4x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_float2x3 SIMD_CFUNC matrix_scale(float __a, simd_float2x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_float3x3 SIMD_CFUNC matrix_scale(float __a, simd_float3x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x3 SIMD_CFUNC matrix_scale(float __a, simd_float4x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_float2x4 SIMD_CFUNC matrix_scale(float __a, simd_float2x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_float3x4 SIMD_CFUNC matrix_scale(float __a, simd_float3x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x4 SIMD_CFUNC matrix_scale(float __a, simd_float4x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x2 SIMD_CFUNC matrix_scale(double __a, simd_double2x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x2 SIMD_CFUNC matrix_scale(double __a, simd_double3x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x2 SIMD_CFUNC matrix_scale(double __a, simd_double4x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x3 SIMD_CFUNC matrix_scale(double __a, simd_double2x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x3 SIMD_CFUNC matrix_scale(double __a, simd_double3x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x3 SIMD_CFUNC matrix_scale(double __a, simd_double4x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x4 SIMD_CFUNC matrix_scale(double __a, simd_double2x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x4 SIMD_CFUNC matrix_scale(double __a, simd_double3x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x4 SIMD_CFUNC matrix_scale(double __a, simd_double4x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } + +static simd_float2x2 SIMD_CFUNC simd_mul(float __a, simd_float2x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= 
__a; return __x; } +static simd_float3x2 SIMD_CFUNC simd_mul(float __a, simd_float3x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x2 SIMD_CFUNC simd_mul(float __a, simd_float4x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_float2x3 SIMD_CFUNC simd_mul(float __a, simd_float2x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_float3x3 SIMD_CFUNC simd_mul(float __a, simd_float3x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x3 SIMD_CFUNC simd_mul(float __a, simd_float4x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_float2x4 SIMD_CFUNC simd_mul(float __a, simd_float2x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_float3x4 SIMD_CFUNC simd_mul(float __a, simd_float3x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_float4x4 SIMD_CFUNC simd_mul(float __a, simd_float4x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x2 SIMD_CFUNC simd_mul(double __a, simd_double2x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x2 SIMD_CFUNC simd_mul(double __a, simd_double3x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x2 SIMD_CFUNC simd_mul(double __a, simd_double4x2 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x3 SIMD_CFUNC simd_mul(double __a, simd_double2x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x3 SIMD_CFUNC simd_mul(double __a, simd_double3x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x3 SIMD_CFUNC simd_mul(double __a, simd_double4x3 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } +static simd_double2x4 SIMD_CFUNC simd_mul(double __a, simd_double2x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; return __x; } +static simd_double3x4 SIMD_CFUNC simd_mul(double __a, simd_double3x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; return __x; } +static simd_double4x4 SIMD_CFUNC simd_mul(double __a, simd_double4x4 __x) { __x.columns[0] *= __a; __x.columns[1] *= __a; __x.columns[2] *= __a; __x.columns[3] *= __a; return __x; } + +static simd_float2x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x2 __x, float __b, simd_float2x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_float3x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x2 __x, float __b, simd_float3x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + return __x; +} +static simd_float4x2 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x2 __x, float __b, simd_float4x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + 
__x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} +static simd_float2x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x3 __x, float __b, simd_float2x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_float3x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x3 __x, float __b, simd_float3x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + return __x; +} +static simd_float4x3 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x3 __x, float __b, simd_float4x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} +static simd_float2x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float2x4 __x, float __b, simd_float2x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_float3x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float3x4 __x, float __b, simd_float3x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + return __x; +} +static simd_float4x4 SIMD_CFUNC simd_linear_combination(float __a, simd_float4x4 __x, float __b, simd_float4x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} +static simd_double2x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x2 __x, double __b, simd_double2x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_double3x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x2 __x, double __b, simd_double3x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + return __x; +} +static simd_double4x2 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x2 __x, double __b, simd_double4x2 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} +static simd_double2x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x3 __x, double __b, simd_double2x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_double3x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x3 __x, double __b, simd_double3x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + 
__b*__y.columns[2]; + return __x; +} +static simd_double4x3 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x3 __x, double __b, simd_double4x3 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} +static simd_double2x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double2x4 __x, double __b, simd_double2x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + return __x; +} +static simd_double3x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double3x4 __x, double __b, simd_double3x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + return __x; +} +static simd_double4x4 SIMD_CFUNC simd_linear_combination(double __a, simd_double4x4 __x, double __b, simd_double4x4 __y) { + __x.columns[0] = __a*__x.columns[0] + __b*__y.columns[0]; + __x.columns[1] = __a*__x.columns[1] + __b*__y.columns[1]; + __x.columns[2] = __a*__x.columns[2] + __b*__y.columns[2]; + __x.columns[3] = __a*__x.columns[3] + __b*__y.columns[3]; + return __x; +} + +static simd_float2x2 SIMD_CFUNC simd_add(simd_float2x2 __x, simd_float2x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float3x2 SIMD_CFUNC simd_add(simd_float3x2 __x, simd_float3x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float4x2 SIMD_CFUNC simd_add(simd_float4x2 __x, simd_float4x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float2x3 SIMD_CFUNC simd_add(simd_float2x3 __x, simd_float2x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float3x3 SIMD_CFUNC simd_add(simd_float3x3 __x, simd_float3x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float4x3 SIMD_CFUNC simd_add(simd_float4x3 __x, simd_float4x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float2x4 SIMD_CFUNC simd_add(simd_float2x4 __x, simd_float2x4 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float3x4 SIMD_CFUNC simd_add(simd_float3x4 __x, simd_float3x4 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_float4x4 SIMD_CFUNC simd_add(simd_float4x4 __x, simd_float4x4 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double2x2 SIMD_CFUNC simd_add(simd_double2x2 __x, simd_double2x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double3x2 SIMD_CFUNC simd_add(simd_double3x2 __x, simd_double3x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double4x2 SIMD_CFUNC simd_add(simd_double4x2 __x, simd_double4x2 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double2x3 SIMD_CFUNC simd_add(simd_double2x3 __x, simd_double2x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double3x3 SIMD_CFUNC simd_add(simd_double3x3 __x, simd_double3x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double4x3 SIMD_CFUNC simd_add(simd_double4x3 __x, simd_double4x3 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double2x4 SIMD_CFUNC simd_add(simd_double2x4 __x, simd_double2x4 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double3x4 SIMD_CFUNC 
simd_add(simd_double3x4 __x, simd_double3x4 __y) { return simd_linear_combination(1, __x, 1, __y); } +static simd_double4x4 SIMD_CFUNC simd_add(simd_double4x4 __x, simd_double4x4 __y) { return simd_linear_combination(1, __x, 1, __y); } + +static simd_float2x2 SIMD_CFUNC simd_sub(simd_float2x2 __x, simd_float2x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float3x2 SIMD_CFUNC simd_sub(simd_float3x2 __x, simd_float3x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float4x2 SIMD_CFUNC simd_sub(simd_float4x2 __x, simd_float4x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float2x3 SIMD_CFUNC simd_sub(simd_float2x3 __x, simd_float2x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float3x3 SIMD_CFUNC simd_sub(simd_float3x3 __x, simd_float3x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float4x3 SIMD_CFUNC simd_sub(simd_float4x3 __x, simd_float4x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float2x4 SIMD_CFUNC simd_sub(simd_float2x4 __x, simd_float2x4 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float3x4 SIMD_CFUNC simd_sub(simd_float3x4 __x, simd_float3x4 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_float4x4 SIMD_CFUNC simd_sub(simd_float4x4 __x, simd_float4x4 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double2x2 SIMD_CFUNC simd_sub(simd_double2x2 __x, simd_double2x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double3x2 SIMD_CFUNC simd_sub(simd_double3x2 __x, simd_double3x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double4x2 SIMD_CFUNC simd_sub(simd_double4x2 __x, simd_double4x2 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double2x3 SIMD_CFUNC simd_sub(simd_double2x3 __x, simd_double2x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double3x3 SIMD_CFUNC simd_sub(simd_double3x3 __x, simd_double3x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double4x3 SIMD_CFUNC simd_sub(simd_double4x3 __x, simd_double4x3 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double2x4 SIMD_CFUNC simd_sub(simd_double2x4 __x, simd_double2x4 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double3x4 SIMD_CFUNC simd_sub(simd_double3x4 __x, simd_double3x4 __y) { return simd_linear_combination(1, __x, -1, __y); } +static simd_double4x4 SIMD_CFUNC simd_sub(simd_double4x4 __x, simd_double4x4 __y) { return simd_linear_combination(1, __x, -1, __y); } + +static simd_float2x2 SIMD_CFUNC simd_transpose(simd_float2x2 __x) { + simd_float4 __x0, __x1; + __x0.xy = __x.columns[0]; + __x1.xy = __x.columns[1]; +#if defined __SSE__ + simd_float4 __r01 = _mm_unpacklo_ps(__x0, __x1); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __r01 = vzip1q_f32(__x0, __x1); +#else + simd_float4 __r01 = { __x0[0], __x1[0], __x0[1], __x1[1] }; +#endif + return simd_matrix(__r01.lo, __r01.hi); +} + +static simd_float3x2 SIMD_CFUNC simd_transpose(simd_float2x3 __x) { + simd_float4 __x0, __x1; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; +#if defined __SSE__ + simd_float4 __r01 = _mm_unpacklo_ps(__x0, __x1); + simd_float4 __r2x = _mm_unpackhi_ps(__x0, __x1); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __r01 = vzip1q_f32(__x0, __x1); + simd_float4 __r2x = vzip2q_f32(__x0, __x1); +#else + simd_float4 __r01 = { 
__x0[0], __x1[0], __x0[1], __x1[1] }; + simd_float4 __r2x = { __x0[2], __x1[2] }; +#endif + return simd_matrix(__r01.lo, __r01.hi, __r2x.lo); +} + +static simd_float4x2 SIMD_CFUNC simd_transpose(simd_float2x4 __x) { +#if defined __SSE__ + simd_float4 __r01 = _mm_unpacklo_ps(__x.columns[0], __x.columns[1]); + simd_float4 __r23 = _mm_unpackhi_ps(__x.columns[0], __x.columns[1]); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __r01 = vzip1q_f32(__x.columns[0], __x.columns[1]); + simd_float4 __r23 = vzip2q_f32(__x.columns[0], __x.columns[1]); +#else + simd_float4 __r01 = { __x.columns[0][0], __x.columns[1][0], __x.columns[0][1], __x.columns[1][1] }; + simd_float4 __r23 = { __x.columns[0][2], __x.columns[1][2], __x.columns[0][3], __x.columns[1][3] }; +#endif + return simd_matrix(__r01.lo, __r01.hi, __r23.lo, __r23.hi); +} + +static simd_float2x3 SIMD_CFUNC simd_transpose(simd_float3x2 __x) { + simd_float4 __x0, __x1, __x2; + __x0.xy = __x.columns[0]; + __x1.xy = __x.columns[1]; + __x2.xy = __x.columns[2]; +#if defined __SSE__ + simd_float4 __t = _mm_unpacklo_ps(__x0, __x1); + simd_float4 __r0 = _mm_shuffle_ps(__t,__x2,0xc4); + simd_float4 __r1 = _mm_shuffle_ps(__t,__x2,0xde); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 padding = { 0 }; + simd_float4 __t0 = vzip1q_f32(__x0,__x2); + simd_float4 __t1 = vzip1q_f32(__x1,padding); + simd_float4 __r0 = vzip1q_f32(__t0,__t1); + simd_float4 __r1 = vzip2q_f32(__t0,__t1); +#else + simd_float4 __r0 = { __x0[0], __x1[0], __x2[0] }; + simd_float4 __r1 = { __x0[1], __x1[1], __x2[1] }; +#endif + return simd_matrix(__r0.xyz, __r1.xyz); +} + +static simd_float3x3 SIMD_CFUNC simd_transpose(simd_float3x3 __x) { + simd_float4 __x0, __x1, __x2; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; + __x2.xyz = __x.columns[2]; +#if defined __SSE__ + simd_float4 __t0 = _mm_unpacklo_ps(__x0, __x1); + simd_float4 __t1 = _mm_unpackhi_ps(__x0, __x1); + simd_float4 __r0 = __t0; __r0.hi = __x2.lo; + simd_float4 __r1 = _mm_shuffle_ps(__t0, __x2, 0xde); + simd_float4 __r2 = __x2; __r2.lo = __t1.lo; +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 padding = { 0 }; + simd_float4 __t0 = vzip1q_f32(__x0,__x2); + simd_float4 __t1 = vzip2q_f32(__x0,__x2); + simd_float4 __t2 = vzip1q_f32(__x1,padding); + simd_float4 __t3 = vzip2q_f32(__x1,padding); + simd_float4 __r0 = vzip1q_f32(__t0,__t2); + simd_float4 __r1 = vzip2q_f32(__t0,__t2); + simd_float4 __r2 = vzip1q_f32(__t1,__t3); +#else + simd_float4 __r0 = {__x0[0], __x1[0], __x2[0]}; + simd_float4 __r1 = {__x0[1], __x1[1], __x2[1]}; + simd_float4 __r2 = {__x0[2], __x1[2], __x2[2]}; +#endif + return simd_matrix(__r0.xyz, __r1.xyz, __r2.xyz); +} + +static simd_float4x3 SIMD_CFUNC simd_transpose(simd_float3x4 __x) { +#if defined __SSE__ + simd_float4 __t0 = _mm_unpacklo_ps(__x.columns[0],__x.columns[1]); /* 00 10 01 11 */ + simd_float4 __t1 = _mm_unpackhi_ps(__x.columns[0],__x.columns[1]); /* 02 12 03 13 */ + simd_float4 __r0 = __t0; __r0.hi = __x.columns[2].lo; + simd_float4 __r1 = _mm_shuffle_ps(__t0, __x.columns[2], 0xde); + simd_float4 __r2 = __x.columns[2]; __r2.lo = __t1.lo; + simd_float4 __r3 = _mm_shuffle_ps(__t1, __x.columns[2], 0xfe); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 padding = { 0 }; + simd_float4 __t0 = vzip1q_f32(__x.columns[0],__x.columns[2]); + simd_float4 __t1 = vzip2q_f32(__x.columns[0],__x.columns[2]); + simd_float4 __t2 = vzip1q_f32(__x.columns[1],padding); + simd_float4 __t3 = vzip2q_f32(__x.columns[1],padding); + simd_float4 __r0 = 
vzip1q_f32(__t0,__t2); + simd_float4 __r1 = vzip2q_f32(__t0,__t2); + simd_float4 __r2 = vzip1q_f32(__t1,__t3); + simd_float4 __r3 = vzip2q_f32(__t1,__t3); +#else + simd_float4 __r0 = {__x.columns[0][0], __x.columns[1][0], __x.columns[2][0]}; + simd_float4 __r1 = {__x.columns[0][1], __x.columns[1][1], __x.columns[2][1]}; + simd_float4 __r2 = {__x.columns[0][2], __x.columns[1][2], __x.columns[2][2]}; + simd_float4 __r3 = {__x.columns[0][3], __x.columns[1][3], __x.columns[2][3]}; +#endif + return simd_matrix(__r0.xyz, __r1.xyz, __r2.xyz, __r3.xyz); +} + +static simd_float2x4 SIMD_CFUNC simd_transpose(simd_float4x2 __x) { + simd_float4 __x0, __x1, __x2, __x3; + __x0.xy = __x.columns[0]; + __x1.xy = __x.columns[1]; + __x2.xy = __x.columns[2]; + __x3.xy = __x.columns[3]; +#if defined __SSE__ + simd_float4 __t0 = _mm_unpacklo_ps(__x0,__x2); + simd_float4 __t1 = _mm_unpacklo_ps(__x1,__x3); + simd_float4 __r0 = _mm_unpacklo_ps(__t0,__t1); + simd_float4 __r1 = _mm_unpackhi_ps(__t0,__t1); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __t0 = vzip1q_f32(__x0,__x2); + simd_float4 __t1 = vzip1q_f32(__x1,__x3); + simd_float4 __r0 = vzip1q_f32(__t0,__t1); + simd_float4 __r1 = vzip2q_f32(__t0,__t1); +#else + simd_float4 __r0 = {__x.columns[0][0], __x.columns[1][0], __x.columns[2][0], __x.columns[3][0]}; + simd_float4 __r1 = {__x.columns[0][1], __x.columns[1][1], __x.columns[2][1], __x.columns[3][1]}; +#endif + return simd_matrix(__r0,__r1); +} + +static simd_float3x4 SIMD_CFUNC simd_transpose(simd_float4x3 __x) { + simd_float4 __x0, __x1, __x2, __x3; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; + __x2.xyz = __x.columns[2]; + __x3.xyz = __x.columns[3]; +#if defined __SSE__ + simd_float4 __t0 = _mm_unpacklo_ps(__x0,__x2); + simd_float4 __t1 = _mm_unpackhi_ps(__x0,__x2); + simd_float4 __t2 = _mm_unpacklo_ps(__x1,__x3); + simd_float4 __t3 = _mm_unpackhi_ps(__x1,__x3); + simd_float4 __r0 = _mm_unpacklo_ps(__t0,__t2); + simd_float4 __r1 = _mm_unpackhi_ps(__t0,__t2); + simd_float4 __r2 = _mm_unpacklo_ps(__t1,__t3); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __t0 = vzip1q_f32(__x0,__x2); + simd_float4 __t1 = vzip2q_f32(__x0,__x2); + simd_float4 __t2 = vzip1q_f32(__x1,__x3); + simd_float4 __t3 = vzip2q_f32(__x1,__x3); + simd_float4 __r0 = vzip1q_f32(__t0,__t2); + simd_float4 __r1 = vzip2q_f32(__t0,__t2); + simd_float4 __r2 = vzip1q_f32(__t1,__t3); +#else + simd_float4 __r0 = {__x.columns[0][0], __x.columns[1][0], __x.columns[2][0], __x.columns[3][0]}; + simd_float4 __r1 = {__x.columns[0][1], __x.columns[1][1], __x.columns[2][1], __x.columns[3][1]}; + simd_float4 __r2 = {__x.columns[0][2], __x.columns[1][2], __x.columns[2][2], __x.columns[3][2]}; +#endif + return simd_matrix(__r0,__r1,__r2); +} + +static simd_float4x4 SIMD_CFUNC simd_transpose(simd_float4x4 __x) { +#if defined __SSE__ + simd_float4 __t0 = _mm_unpacklo_ps(__x.columns[0],__x.columns[2]); + simd_float4 __t1 = _mm_unpackhi_ps(__x.columns[0],__x.columns[2]); + simd_float4 __t2 = _mm_unpacklo_ps(__x.columns[1],__x.columns[3]); + simd_float4 __t3 = _mm_unpackhi_ps(__x.columns[1],__x.columns[3]); + simd_float4 __r0 = _mm_unpacklo_ps(__t0,__t2); + simd_float4 __r1 = _mm_unpackhi_ps(__t0,__t2); + simd_float4 __r2 = _mm_unpacklo_ps(__t1,__t3); + simd_float4 __r3 = _mm_unpackhi_ps(__t1,__t3); +#elif defined __ARM_NEON__ && defined __arm64__ + simd_float4 __t0 = vzip1q_f32(__x.columns[0],__x.columns[2]); + simd_float4 __t1 = vzip2q_f32(__x.columns[0],__x.columns[2]); + simd_float4 __t2 = 
vzip1q_f32(__x.columns[1],__x.columns[3]); + simd_float4 __t3 = vzip2q_f32(__x.columns[1],__x.columns[3]); + simd_float4 __r0 = vzip1q_f32(__t0,__t2); + simd_float4 __r1 = vzip2q_f32(__t0,__t2); + simd_float4 __r2 = vzip1q_f32(__t1,__t3); + simd_float4 __r3 = vzip2q_f32(__t1,__t3); +#else + simd_float4 __r0 = {__x.columns[0][0], __x.columns[1][0], __x.columns[2][0], __x.columns[3][0]}; + simd_float4 __r1 = {__x.columns[0][1], __x.columns[1][1], __x.columns[2][1], __x.columns[3][1]}; + simd_float4 __r2 = {__x.columns[0][2], __x.columns[1][2], __x.columns[2][2], __x.columns[3][2]}; + simd_float4 __r3 = {__x.columns[0][3], __x.columns[1][3], __x.columns[2][3], __x.columns[3][3]}; +#endif + return simd_matrix(__r0,__r1,__r2,__r3); +} + +static simd_double2x2 SIMD_CFUNC simd_transpose(simd_double2x2 __x) { + simd_double2 __x0, __x1; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 __r0 = vzip1q_f64(__x0, __x1); + simd_double2 __r1 = vzip2q_f64(__x0, __x1); +#else + simd_double2 __r0 = { __x0[0], __x1[0] }; + simd_double2 __r1 = { __x0[1], __x1[1] }; +#endif + return simd_matrix(__r0, __r1); +} + +static simd_double3x2 SIMD_CFUNC simd_transpose(simd_double2x3 __x) { + simd_double4 __x0, __x1; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 __r0 = vzip1q_f64(__x0.lo,__x1.lo); + simd_double2 __r1 = vzip2q_f64(__x0.lo,__x1.lo); + simd_double2 __r2 = vzip1q_f64(__x0.hi,__x1.hi); +#else + simd_double2 __r0 = {__x0[0], __x1[0]}; + simd_double2 __r1 = {__x0[1], __x1[1]}; + simd_double2 __r2 = {__x0[2], __x1[2]}; +#endif + return simd_matrix(__r0,__r1,__r2); +} + +static simd_double4x2 SIMD_CFUNC simd_transpose(simd_double2x4 __x) { + simd_double4 __x0, __x1; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 __r0 = vzip1q_f64(__x0.lo,__x1.lo); + simd_double2 __r1 = vzip2q_f64(__x0.lo,__x1.lo); + simd_double2 __r2 = vzip1q_f64(__x0.hi,__x1.hi); + simd_double2 __r3 = vzip2q_f64(__x0.hi,__x1.hi); +#else + simd_double2 __r0 = {__x0[0], __x1[0]}; + simd_double2 __r1 = {__x0[1], __x1[1]}; + simd_double2 __r2 = {__x0[2], __x1[2]}; + simd_double2 __r3 = {__x0[3], __x1[3]}; +#endif + return simd_matrix(__r0,__r1,__r2,__r3); +} + +static simd_double2x3 SIMD_CFUNC simd_transpose(simd_double3x2 __x) { + simd_double2 __x0, __x1, __x2; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; + __x2 = __x.columns[2]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 padding = { 0 }; + simd_double4 __r0,__r1; + __r0.lo = vzip1q_f64(__x0,__x1); + __r1.lo = vzip2q_f64(__x0,__x1); + __r0.hi = vzip1q_f64(__x2,padding); + __r1.hi = vzip2q_f64(__x2,padding); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0]}; + simd_double4 __r1 = {__x0[1], __x1[1], __x2[1]}; +#endif + return simd_matrix(__r0.xyz,__r1.xyz); +} + +static simd_double3x3 SIMD_CFUNC simd_transpose(simd_double3x3 __x) { + simd_double4 __x0, __x1, __x2; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; + __x2.xyz = __x.columns[2]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 padding = { 0 }; + simd_double4 __r0,__r1,__r2; + __r0.lo = vzip1q_f64(__x0.lo,__x1.lo); + __r1.lo = vzip2q_f64(__x0.lo,__x1.lo); + __r2.lo = vzip1q_f64(__x0.hi,__x1.hi); + __r0.hi = vzip1q_f64(__x2.lo,padding); + __r1.hi = vzip2q_f64(__x2.lo,padding); + __r2.hi = vzip1q_f64(__x2.hi,padding); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0]}; + simd_double4 
__r1 = {__x0[1], __x1[1], __x2[1]}; + simd_double4 __r2 = {__x0[2], __x1[2], __x2[2]}; +#endif + return simd_matrix(__r0.xyz,__r1.xyz,__r2.xyz); +} + +static simd_double4x3 SIMD_CFUNC simd_transpose(simd_double3x4 __x) { + simd_double4 __x0, __x1, __x2; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; + __x2 = __x.columns[2]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double2 padding = { 0 }; + simd_double4 __r0,__r1,__r2,__r3; + __r0.lo = vzip1q_f64(__x0.lo,__x1.lo); + __r1.lo = vzip2q_f64(__x0.lo,__x1.lo); + __r2.lo = vzip1q_f64(__x0.hi,__x1.hi); + __r3.lo = vzip2q_f64(__x0.hi,__x1.hi); + __r0.hi = vzip1q_f64(__x2.lo,padding); + __r1.hi = vzip2q_f64(__x2.lo,padding); + __r2.hi = vzip1q_f64(__x2.hi,padding); + __r3.hi = vzip2q_f64(__x2.hi,padding); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0]}; + simd_double4 __r1 = {__x0[1], __x1[1], __x2[1]}; + simd_double4 __r2 = {__x0[2], __x1[2], __x2[2]}; + simd_double4 __r3 = {__x0[3], __x1[3], __x2[3]}; +#endif + return simd_matrix(__r0.xyz,__r1.xyz,__r2.xyz,__r3.xyz); +} + +static simd_double2x4 SIMD_CFUNC simd_transpose(simd_double4x2 __x) { + simd_double2 __x0, __x1, __x2, __x3; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; + __x2 = __x.columns[2]; + __x3 = __x.columns[3]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double4 __r0,__r1; + __r0.lo = vzip1q_f64(__x0,__x1); + __r1.lo = vzip2q_f64(__x0,__x1); + __r0.hi = vzip1q_f64(__x2,__x3); + __r1.hi = vzip2q_f64(__x2,__x3); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0], __x3[0]}; + simd_double4 __r1 = {__x0[1], __x1[1], __x2[1], __x3[1]}; +#endif + return simd_matrix(__r0,__r1); +} + +static simd_double3x4 SIMD_CFUNC simd_transpose(simd_double4x3 __x) { + simd_double4 __x0, __x1, __x2, __x3; + __x0.xyz = __x.columns[0]; + __x1.xyz = __x.columns[1]; + __x2.xyz = __x.columns[2]; + __x3.xyz = __x.columns[3]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double4 __r0,__r1,__r2; + __r0.lo = vzip1q_f64(__x0.lo,__x1.lo); + __r1.lo = vzip2q_f64(__x0.lo,__x1.lo); + __r2.lo = vzip1q_f64(__x0.hi,__x1.hi); + __r0.hi = vzip1q_f64(__x2.lo,__x3.lo); + __r1.hi = vzip2q_f64(__x2.lo,__x3.lo); + __r2.hi = vzip1q_f64(__x2.hi,__x3.hi); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0], __x3[0]}; + simd_double4 __r1 = {__x0[1], __x1[1], __x2[1], __x3[1]}; + simd_double4 __r2 = {__x0[2], __x1[2], __x2[2], __x3[2]}; +#endif + return simd_matrix(__r0,__r1,__r2); +} + +static simd_double4x4 SIMD_CFUNC simd_transpose(simd_double4x4 __x) { + simd_double4 __x0, __x1, __x2, __x3; + __x0 = __x.columns[0]; + __x1 = __x.columns[1]; + __x2 = __x.columns[2]; + __x3 = __x.columns[3]; +#if defined __ARM_NEON__ && defined __arm64__ + simd_double4 __r0,__r1,__r2,__r3; + __r0.lo = vzip1q_f64(__x0.lo,__x1.lo); + __r1.lo = vzip2q_f64(__x0.lo,__x1.lo); + __r2.lo = vzip1q_f64(__x0.hi,__x1.hi); + __r3.lo = vzip2q_f64(__x0.hi,__x1.hi); + __r0.hi = vzip1q_f64(__x2.lo,__x3.lo); + __r1.hi = vzip2q_f64(__x2.lo,__x3.lo); + __r2.hi = vzip1q_f64(__x2.hi,__x3.hi); + __r3.hi = vzip2q_f64(__x2.hi,__x3.hi); +#else + simd_double4 __r0 = {__x0[0], __x1[0], __x2[0], __x3[0]}; + simd_double4 __r1 = {__x0[1], __x1[1], __x2[1], __x3[1]}; + simd_double4 __r2 = {__x0[2], __x1[2], __x2[2], __x3[2]}; + simd_double4 __r3 = {__x0[3], __x1[3], __x2[3], __x3[3]}; +#endif + return simd_matrix(__r0,__r1,__r2,__r3); +} + +static simd_float3 SIMD_CFUNC __rotate1( simd_float3 __x) { return __builtin_shufflevector(__x,__x,1,2,0); } +static simd_float3 SIMD_CFUNC __rotate2( simd_float3 __x) { return 
__builtin_shufflevector(__x,__x,2,0,1); } +static simd_float4 SIMD_CFUNC __rotate1( simd_float4 __x) { return __builtin_shufflevector(__x,__x,1,2,3,0); } +static simd_float4 SIMD_CFUNC __rotate2( simd_float4 __x) { return __builtin_shufflevector(__x,__x,2,3,0,1); } +static simd_float4 SIMD_CFUNC __rotate3( simd_float4 __x) { return __builtin_shufflevector(__x,__x,3,0,1,2); } +static simd_double3 SIMD_CFUNC __rotate1(simd_double3 __x) { return __builtin_shufflevector(__x,__x,1,2,0); } +static simd_double3 SIMD_CFUNC __rotate2(simd_double3 __x) { return __builtin_shufflevector(__x,__x,2,0,1); } +static simd_double4 SIMD_CFUNC __rotate1(simd_double4 __x) { return __builtin_shufflevector(__x,__x,1,2,3,0); } +static simd_double4 SIMD_CFUNC __rotate2(simd_double4 __x) { return __builtin_shufflevector(__x,__x,2,3,0,1); } +static simd_double4 SIMD_CFUNC __rotate3(simd_double4 __x) { return __builtin_shufflevector(__x,__x,3,0,1,2); } + +static float SIMD_CFUNC simd_trace( simd_float2x2 __x) { return __x.columns[0][0] + __x.columns[1][1]; } +static double SIMD_CFUNC simd_trace(simd_double2x2 __x) { return __x.columns[0][0] + __x.columns[1][1]; } +static float SIMD_CFUNC simd_trace( simd_float3x3 __x) { return __x.columns[0][0] + __x.columns[1][1] + __x.columns[2][2]; } +static double SIMD_CFUNC simd_trace(simd_double3x3 __x) { return __x.columns[0][0] + __x.columns[1][1] + __x.columns[2][2]; } +static float SIMD_CFUNC simd_trace( simd_float4x4 __x) { return __x.columns[0][0] + __x.columns[1][1] + __x.columns[2][2] + __x.columns[3][3]; } +static double SIMD_CFUNC simd_trace(simd_double4x4 __x) { return __x.columns[0][0] + __x.columns[1][1] + __x.columns[2][2] + __x.columns[3][3]; } + +static float SIMD_CFUNC simd_determinant( simd_float2x2 __x) { return __x.columns[0][0]*__x.columns[1][1] - __x.columns[0][1]*__x.columns[1][0]; } +static double SIMD_CFUNC simd_determinant(simd_double2x2 __x) { return __x.columns[0][0]*__x.columns[1][1] - __x.columns[0][1]*__x.columns[1][0]; } +static float SIMD_CFUNC simd_determinant( simd_float3x3 __x) { return simd_reduce_add(__x.columns[0]*(__rotate1(__x.columns[1])*__rotate2(__x.columns[2]) - __rotate2(__x.columns[1])*__rotate1(__x.columns[2]))); } +static double SIMD_CFUNC simd_determinant(simd_double3x3 __x) { return simd_reduce_add(__x.columns[0]*(__rotate1(__x.columns[1])*__rotate2(__x.columns[2]) - __rotate2(__x.columns[1])*__rotate1(__x.columns[2]))); } +static float SIMD_CFUNC simd_determinant( simd_float4x4 __x) { + simd_float4 codet = __x.columns[0]*(__rotate1(__x.columns[1])*(__rotate2(__x.columns[2])*__rotate3(__x.columns[3])-__rotate3(__x.columns[2])*__rotate2(__x.columns[3])) + + __rotate2(__x.columns[1])*(__rotate3(__x.columns[2])*__rotate1(__x.columns[3])-__rotate1(__x.columns[2])*__rotate3(__x.columns[3])) + + __rotate3(__x.columns[1])*(__rotate1(__x.columns[2])*__rotate2(__x.columns[3])-__rotate2(__x.columns[2])*__rotate1(__x.columns[3]))); + return simd_reduce_add(codet.even - codet.odd); +} +static double SIMD_CFUNC simd_determinant(simd_double4x4 __x) { + simd_double4 codet = __x.columns[0]*(__rotate1(__x.columns[1])*(__rotate2(__x.columns[2])*__rotate3(__x.columns[3])-__rotate3(__x.columns[2])*__rotate2(__x.columns[3])) + + __rotate2(__x.columns[1])*(__rotate3(__x.columns[2])*__rotate1(__x.columns[3])-__rotate1(__x.columns[2])*__rotate3(__x.columns[3])) + + __rotate3(__x.columns[1])*(__rotate1(__x.columns[2])*__rotate2(__x.columns[3])-__rotate2(__x.columns[2])*__rotate1(__x.columns[3]))); + return simd_reduce_add(codet.even - codet.odd); +} + 
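(Editor's note, not part of the vendored header: simd_determinant avoids scalar cofactor expansion — the __rotateN helpers cyclically permute a column's lanes with __builtin_shufflevector, so the 3x3 case reduces to one lane-wise triple product folded by simd_reduce_add, and the 4x4 case to a vector of cofactor products. A quick sanity check, again a sketch assuming <simd/simd.h>; simd_diagonal_matrix, simd_trace, simd_determinant, simd_sub, and matrix_scale are all defined in this file.)

/* Illustrative usage only -- not shipped with the header above. */
#include <assert.h>
#include <math.h>
#include <simd/simd.h>

int main(void) {
    /* For a diagonal matrix the trace is the sum of the diagonal and the
     * determinant is its product. */
    simd_float3x3 m = simd_diagonal_matrix(simd_make_float3(2, 3, 4));
    assert(fabsf(simd_trace(m) - 9.0f) < 1e-6f);
    assert(fabsf(simd_determinant(m) - 24.0f) < 1e-6f);

    /* simd_sub(x, y) is simd_linear_combination(1, x, -1, y), so x - x
     * is exactly the zero matrix. */
    assert(simd_equal(simd_sub(m, m), matrix_scale(0.0f, m)));
    return 0;
}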
+static simd_float2x2 SIMD_CFUNC simd_inverse( simd_float2x2 __x) { return __invert_f2(__x); } +static simd_float3x3 SIMD_CFUNC simd_inverse( simd_float3x3 __x) { return __invert_f3(__x); } +static simd_float4x4 SIMD_CFUNC simd_inverse( simd_float4x4 __x) { return __invert_f4(__x); } +static simd_double2x2 SIMD_CFUNC simd_inverse(simd_double2x2 __x) { return __invert_d2(__x); } +static simd_double3x3 SIMD_CFUNC simd_inverse(simd_double3x3 __x) { return __invert_d3(__x); } +static simd_double4x4 SIMD_CFUNC simd_inverse(simd_double4x4 __x) { return __invert_d4(__x); } + +static simd_float2 SIMD_CFUNC simd_mul( simd_float2x2 __x, simd_float2 __y) { simd_float2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_float3 SIMD_CFUNC simd_mul( simd_float2x3 __x, simd_float2 __y) { simd_float3 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_float4 SIMD_CFUNC simd_mul( simd_float2x4 __x, simd_float2 __y) { simd_float4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_float2 SIMD_CFUNC simd_mul( simd_float3x2 __x, simd_float3 __y) { simd_float2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_float3 SIMD_CFUNC simd_mul( simd_float3x3 __x, simd_float3 __y) { simd_float3 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_float4 SIMD_CFUNC simd_mul( simd_float3x4 __x, simd_float3 __y) { simd_float4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_float2 SIMD_CFUNC simd_mul( simd_float4x2 __x, simd_float4 __y) { simd_float2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } +static simd_float3 SIMD_CFUNC simd_mul( simd_float4x3 __x, simd_float4 __y) { simd_float3 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } +static simd_float4 SIMD_CFUNC simd_mul( simd_float4x4 __x, simd_float4 __y) { simd_float4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } +static simd_double2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double2 __y) { simd_double2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_double3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double2 __y) { simd_double3 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_double4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double2 __y) { simd_double4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); return __r; } +static simd_double2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double3 __y) { simd_double2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_double3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double3 __y) { simd_double3 __r = 
__x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_double4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double3 __y) { simd_double4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); return __r; } +static simd_double2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double4 __y) { simd_double2 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } +static simd_double3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double4 __y) { simd_double3 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } +static simd_double4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double4 __y) { simd_double4 __r = __x.columns[0]*__y[0]; __r = simd_muladd( __x.columns[1], __y[1],__r); __r = simd_muladd( __x.columns[2], __y[2],__r); __r = simd_muladd( __x.columns[3], __y[3],__r); return __r; } + +static simd_float2 SIMD_CFUNC simd_mul( simd_float2 __x, simd_float2x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float3 SIMD_CFUNC simd_mul( simd_float2 __x, simd_float3x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float4 SIMD_CFUNC simd_mul( simd_float2 __x, simd_float4x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float2 SIMD_CFUNC simd_mul( simd_float3 __x, simd_float2x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float3 SIMD_CFUNC simd_mul( simd_float3 __x, simd_float3x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float4 SIMD_CFUNC simd_mul( simd_float3 __x, simd_float4x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float2 SIMD_CFUNC simd_mul( simd_float4 __x, simd_float2x4 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float3 SIMD_CFUNC simd_mul( simd_float4 __x, simd_float3x4 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_float4 SIMD_CFUNC simd_mul( simd_float4 __x, simd_float4x4 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double2 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double2x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double3 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double3x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double4 SIMD_CFUNC simd_mul(simd_double2 __x, simd_double4x2 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double2 SIMD_CFUNC simd_mul(simd_double3 __x, simd_double2x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double3 SIMD_CFUNC simd_mul(simd_double3 __x, simd_double3x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double4 SIMD_CFUNC simd_mul(simd_double3 __x, simd_double4x3 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double2 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double2x4 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double3 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double3x4 __y) { return simd_mul(simd_transpose(__y), __x); } +static simd_double4 SIMD_CFUNC simd_mul(simd_double4 __x, simd_double4x4 __y) { return simd_mul(simd_transpose(__y), __x); } + +static simd_float2x2 SIMD_CFUNC simd_mul( simd_float2x2 __x, simd_float2x2 __y) { 
simd_float2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double2x2 __y) { simd_double2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x3 SIMD_CFUNC simd_mul( simd_float2x3 __x, simd_float2x2 __y) { simd_float2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double2x2 __y) { simd_double2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x4 SIMD_CFUNC simd_mul( simd_float2x4 __x, simd_float2x2 __y) { simd_float2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double2x2 __y) { simd_double2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x2 SIMD_CFUNC simd_mul( simd_float3x2 __x, simd_float2x3 __y) { simd_float2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double2x3 __y) { simd_double2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x3 SIMD_CFUNC simd_mul( simd_float3x3 __x, simd_float2x3 __y) { simd_float2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double2x3 __y) { simd_double2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x4 SIMD_CFUNC simd_mul( simd_float3x4 __x, simd_float2x3 __y) { simd_float2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double2x3 __y) { simd_double2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x2 SIMD_CFUNC simd_mul( simd_float4x2 __x, simd_float2x4 __y) { simd_float2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double2x4 __y) { simd_double2x2 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x3 SIMD_CFUNC simd_mul( simd_float4x3 __x, simd_float2x4 __y) { simd_float2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double2x4 __y) { simd_double2x3 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float2x4 SIMD_CFUNC simd_mul( simd_float4x4 __x, simd_float2x4 __y) { simd_float2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double2x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double2x4 __y) { simd_double2x4 __r; for (int i=0; i<2; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } + +static simd_float3x2 SIMD_CFUNC simd_mul( simd_float2x2 __x, simd_float3x2 __y) { simd_float3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x2 SIMD_CFUNC 
simd_mul(simd_double2x2 __x, simd_double3x2 __y) { simd_double3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x3 SIMD_CFUNC simd_mul( simd_float2x3 __x, simd_float3x2 __y) { simd_float3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double3x2 __y) { simd_double3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x4 SIMD_CFUNC simd_mul( simd_float2x4 __x, simd_float3x2 __y) { simd_float3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double3x2 __y) { simd_double3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x2 SIMD_CFUNC simd_mul( simd_float3x2 __x, simd_float3x3 __y) { simd_float3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double3x3 __y) { simd_double3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x3 SIMD_CFUNC simd_mul( simd_float3x3 __x, simd_float3x3 __y) { simd_float3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double3x3 __y) { simd_double3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x4 SIMD_CFUNC simd_mul( simd_float3x4 __x, simd_float3x3 __y) { simd_float3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double3x3 __y) { simd_double3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x2 SIMD_CFUNC simd_mul( simd_float4x2 __x, simd_float3x4 __y) { simd_float3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double3x4 __y) { simd_double3x2 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x3 SIMD_CFUNC simd_mul( simd_float4x3 __x, simd_float3x4 __y) { simd_float3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double3x4 __y) { simd_double3x3 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float3x4 SIMD_CFUNC simd_mul( simd_float4x4 __x, simd_float3x4 __y) { simd_float3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double3x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double3x4 __y) { simd_double3x4 __r; for (int i=0; i<3; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } + +static simd_float4x2 SIMD_CFUNC simd_mul( simd_float2x2 __x, simd_float4x2 __y) { simd_float4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double2x2 __x, simd_double4x2 __y) { simd_double4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, 
__y.columns[i]); return __r; } +static simd_float4x3 SIMD_CFUNC simd_mul( simd_float2x3 __x, simd_float4x2 __y) { simd_float4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double2x3 __x, simd_double4x2 __y) { simd_double4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x4 SIMD_CFUNC simd_mul( simd_float2x4 __x, simd_float4x2 __y) { simd_float4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double2x4 __x, simd_double4x2 __y) { simd_double4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x2 SIMD_CFUNC simd_mul( simd_float3x2 __x, simd_float4x3 __y) { simd_float4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double3x2 __x, simd_double4x3 __y) { simd_double4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x3 SIMD_CFUNC simd_mul( simd_float3x3 __x, simd_float4x3 __y) { simd_float4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double3x3 __x, simd_double4x3 __y) { simd_double4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x4 SIMD_CFUNC simd_mul( simd_float3x4 __x, simd_float4x3 __y) { simd_float4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double3x4 __x, simd_double4x3 __y) { simd_double4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x2 SIMD_CFUNC simd_mul( simd_float4x2 __x, simd_float4x4 __y) { simd_float4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x2 SIMD_CFUNC simd_mul(simd_double4x2 __x, simd_double4x4 __y) { simd_double4x2 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x3 SIMD_CFUNC simd_mul( simd_float4x3 __x, simd_float4x4 __y) { simd_float4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x3 SIMD_CFUNC simd_mul(simd_double4x3 __x, simd_double4x4 __y) { simd_double4x3 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_float4x4 SIMD_CFUNC simd_mul( simd_float4x4 __x, simd_float4x4 __y) { simd_float4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } +static simd_double4x4 SIMD_CFUNC simd_mul(simd_double4x4 __x, simd_double4x4 __y) { simd_double4x4 __r; for (int i=0; i<4; ++i) __r.columns[i] = simd_mul(__x, __y.columns[i]); return __r; } + +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float2x2 __x, simd_float2 __y) { return simd_mul(__x, __y); } +static simd_float3 SIMD_CFUNC matrix_multiply( simd_float2x3 __x, simd_float2 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float2x4 __x, simd_float2 __y) { return simd_mul(__x, __y); } +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float3x2 __x, simd_float3 __y) { return simd_mul(__x, __y); } +static simd_float3 
SIMD_CFUNC matrix_multiply( simd_float3x3 __x, simd_float3 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float3x4 __x, simd_float3 __y) { return simd_mul(__x, __y); } +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float4x2 __x, simd_float4 __y) { return simd_mul(__x, __y); } +static simd_float3 SIMD_CFUNC matrix_multiply( simd_float4x3 __x, simd_float4 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float4x4 __x, simd_float4 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double2x2 __x, simd_double2 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double2x3 __x, simd_double2 __y) { return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double2x4 __x, simd_double2 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double3x2 __x, simd_double3 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double3x3 __x, simd_double3 __y) { return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double3x4 __x, simd_double3 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double4x2 __x, simd_double4 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double4x3 __x, simd_double4 __y) { return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double4x4 __x, simd_double4 __y) { return simd_mul(__x, __y); } + +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float2 __x, simd_float2x2 __y) { return simd_mul(__x, __y); } +static simd_float3 SIMD_CFUNC matrix_multiply( simd_float2 __x, simd_float3x2 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float2 __x, simd_float4x2 __y) { return simd_mul(__x, __y); } +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float3 __x, simd_float2x3 __y) { return simd_mul(__x, __y); } +static simd_float3 SIMD_CFUNC matrix_multiply( simd_float3 __x, simd_float3x3 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float3 __x, simd_float4x3 __y) { return simd_mul(__x, __y); } +static simd_float2 SIMD_CFUNC matrix_multiply( simd_float4 __x, simd_float2x4 __y) { return simd_mul(__x, __y); } +static simd_float3 SIMD_CFUNC matrix_multiply( simd_float4 __x, simd_float3x4 __y) { return simd_mul(__x, __y); } +static simd_float4 SIMD_CFUNC matrix_multiply( simd_float4 __x, simd_float4x4 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double2 __x, simd_double2x2 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double2 __x, simd_double3x2 __y) { return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double2 __x, simd_double4x2 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double3 __x, simd_double2x3 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double3 __x, simd_double3x3 __y) { return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double3 __x, simd_double4x3 __y) { return simd_mul(__x, __y); } +static simd_double2 SIMD_CFUNC matrix_multiply(simd_double4 __x, simd_double2x4 __y) { return simd_mul(__x, __y); } +static simd_double3 SIMD_CFUNC matrix_multiply(simd_double4 __x, simd_double3x4 __y) { 
return simd_mul(__x, __y); } +static simd_double4 SIMD_CFUNC matrix_multiply(simd_double4 __x, simd_double4x4 __y) { return simd_mul(__x, __y); } + +static simd_float2x2 SIMD_CFUNC matrix_multiply( simd_float2x2 __x, simd_float2x2 __y) { return simd_mul(__x, __y); } +static simd_double2x2 SIMD_CFUNC matrix_multiply(simd_double2x2 __x, simd_double2x2 __y) { return simd_mul(__x, __y); } +static simd_float2x3 SIMD_CFUNC matrix_multiply( simd_float2x3 __x, simd_float2x2 __y) { return simd_mul(__x, __y); } +static simd_double2x3 SIMD_CFUNC matrix_multiply(simd_double2x3 __x, simd_double2x2 __y) { return simd_mul(__x, __y); } +static simd_float2x4 SIMD_CFUNC matrix_multiply( simd_float2x4 __x, simd_float2x2 __y) { return simd_mul(__x, __y); } +static simd_double2x4 SIMD_CFUNC matrix_multiply(simd_double2x4 __x, simd_double2x2 __y) { return simd_mul(__x, __y); } +static simd_float2x2 SIMD_CFUNC matrix_multiply( simd_float3x2 __x, simd_float2x3 __y) { return simd_mul(__x, __y); } +static simd_double2x2 SIMD_CFUNC matrix_multiply(simd_double3x2 __x, simd_double2x3 __y) { return simd_mul(__x, __y); } +static simd_float2x3 SIMD_CFUNC matrix_multiply( simd_float3x3 __x, simd_float2x3 __y) { return simd_mul(__x, __y); } +static simd_double2x3 SIMD_CFUNC matrix_multiply(simd_double3x3 __x, simd_double2x3 __y) { return simd_mul(__x, __y); } +static simd_float2x4 SIMD_CFUNC matrix_multiply( simd_float3x4 __x, simd_float2x3 __y) { return simd_mul(__x, __y); } +static simd_double2x4 SIMD_CFUNC matrix_multiply(simd_double3x4 __x, simd_double2x3 __y) { return simd_mul(__x, __y); } +static simd_float2x2 SIMD_CFUNC matrix_multiply( simd_float4x2 __x, simd_float2x4 __y) { return simd_mul(__x, __y); } +static simd_double2x2 SIMD_CFUNC matrix_multiply(simd_double4x2 __x, simd_double2x4 __y) { return simd_mul(__x, __y); } +static simd_float2x3 SIMD_CFUNC matrix_multiply( simd_float4x3 __x, simd_float2x4 __y) { return simd_mul(__x, __y); } +static simd_double2x3 SIMD_CFUNC matrix_multiply(simd_double4x3 __x, simd_double2x4 __y) { return simd_mul(__x, __y); } +static simd_float2x4 SIMD_CFUNC matrix_multiply( simd_float4x4 __x, simd_float2x4 __y) { return simd_mul(__x, __y); } +static simd_double2x4 SIMD_CFUNC matrix_multiply(simd_double4x4 __x, simd_double2x4 __y) { return simd_mul(__x, __y); } + +static simd_float3x2 SIMD_CFUNC matrix_multiply( simd_float2x2 __x, simd_float3x2 __y) { return simd_mul(__x, __y); } +static simd_double3x2 SIMD_CFUNC matrix_multiply(simd_double2x2 __x, simd_double3x2 __y) { return simd_mul(__x, __y); } +static simd_float3x3 SIMD_CFUNC matrix_multiply( simd_float2x3 __x, simd_float3x2 __y) { return simd_mul(__x, __y); } +static simd_double3x3 SIMD_CFUNC matrix_multiply(simd_double2x3 __x, simd_double3x2 __y) { return simd_mul(__x, __y); } +static simd_float3x4 SIMD_CFUNC matrix_multiply( simd_float2x4 __x, simd_float3x2 __y) { return simd_mul(__x, __y); } +static simd_double3x4 SIMD_CFUNC matrix_multiply(simd_double2x4 __x, simd_double3x2 __y) { return simd_mul(__x, __y); } +static simd_float3x2 SIMD_CFUNC matrix_multiply( simd_float3x2 __x, simd_float3x3 __y) { return simd_mul(__x, __y); } +static simd_double3x2 SIMD_CFUNC matrix_multiply(simd_double3x2 __x, simd_double3x3 __y) { return simd_mul(__x, __y); } +static simd_float3x3 SIMD_CFUNC matrix_multiply( simd_float3x3 __x, simd_float3x3 __y) { return simd_mul(__x, __y); } +static simd_double3x3 SIMD_CFUNC matrix_multiply(simd_double3x3 __x, simd_double3x3 __y) { return simd_mul(__x, __y); } +static simd_float3x4 SIMD_CFUNC 
matrix_multiply( simd_float3x4 __x, simd_float3x3 __y) { return simd_mul(__x, __y); } +static simd_double3x4 SIMD_CFUNC matrix_multiply(simd_double3x4 __x, simd_double3x3 __y) { return simd_mul(__x, __y); } +static simd_float3x2 SIMD_CFUNC matrix_multiply( simd_float4x2 __x, simd_float3x4 __y) { return simd_mul(__x, __y); } +static simd_double3x2 SIMD_CFUNC matrix_multiply(simd_double4x2 __x, simd_double3x4 __y) { return simd_mul(__x, __y); } +static simd_float3x3 SIMD_CFUNC matrix_multiply( simd_float4x3 __x, simd_float3x4 __y) { return simd_mul(__x, __y); } +static simd_double3x3 SIMD_CFUNC matrix_multiply(simd_double4x3 __x, simd_double3x4 __y) { return simd_mul(__x, __y); } +static simd_float3x4 SIMD_CFUNC matrix_multiply( simd_float4x4 __x, simd_float3x4 __y) { return simd_mul(__x, __y); } +static simd_double3x4 SIMD_CFUNC matrix_multiply(simd_double4x4 __x, simd_double3x4 __y) { return simd_mul(__x, __y); } + +static simd_float4x2 SIMD_CFUNC matrix_multiply( simd_float2x2 __x, simd_float4x2 __y) { return simd_mul(__x, __y); } +static simd_double4x2 SIMD_CFUNC matrix_multiply(simd_double2x2 __x, simd_double4x2 __y) { return simd_mul(__x, __y); } +static simd_float4x3 SIMD_CFUNC matrix_multiply( simd_float2x3 __x, simd_float4x2 __y) { return simd_mul(__x, __y); } +static simd_double4x3 SIMD_CFUNC matrix_multiply(simd_double2x3 __x, simd_double4x2 __y) { return simd_mul(__x, __y); } +static simd_float4x4 SIMD_CFUNC matrix_multiply( simd_float2x4 __x, simd_float4x2 __y) { return simd_mul(__x, __y); } +static simd_double4x4 SIMD_CFUNC matrix_multiply(simd_double2x4 __x, simd_double4x2 __y) { return simd_mul(__x, __y); } +static simd_float4x2 SIMD_CFUNC matrix_multiply( simd_float3x2 __x, simd_float4x3 __y) { return simd_mul(__x, __y); } +static simd_double4x2 SIMD_CFUNC matrix_multiply(simd_double3x2 __x, simd_double4x3 __y) { return simd_mul(__x, __y); } +static simd_float4x3 SIMD_CFUNC matrix_multiply( simd_float3x3 __x, simd_float4x3 __y) { return simd_mul(__x, __y); } +static simd_double4x3 SIMD_CFUNC matrix_multiply(simd_double3x3 __x, simd_double4x3 __y) { return simd_mul(__x, __y); } +static simd_float4x4 SIMD_CFUNC matrix_multiply( simd_float3x4 __x, simd_float4x3 __y) { return simd_mul(__x, __y); } +static simd_double4x4 SIMD_CFUNC matrix_multiply(simd_double3x4 __x, simd_double4x3 __y) { return simd_mul(__x, __y); } +static simd_float4x2 SIMD_CFUNC matrix_multiply( simd_float4x2 __x, simd_float4x4 __y) { return simd_mul(__x, __y); } +static simd_double4x2 SIMD_CFUNC matrix_multiply(simd_double4x2 __x, simd_double4x4 __y) { return simd_mul(__x, __y); } +static simd_float4x3 SIMD_CFUNC matrix_multiply( simd_float4x3 __x, simd_float4x4 __y) { return simd_mul(__x, __y); } +static simd_double4x3 SIMD_CFUNC matrix_multiply(simd_double4x3 __x, simd_double4x4 __y) { return simd_mul(__x, __y); } +static simd_float4x4 SIMD_CFUNC matrix_multiply( simd_float4x4 __x, simd_float4x4 __y) { return simd_mul(__x, __y); } +static simd_double4x4 SIMD_CFUNC matrix_multiply(simd_double4x4 __x, simd_double4x4 __y) { return simd_mul(__x, __y); } + +static simd_bool SIMD_CFUNC simd_equal(simd_float2x2 __x, simd_float2x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float2x3 __x, simd_float2x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float2x4 __x, simd_float2x4 __y) { + return 
simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float3x2 __x, simd_float3x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float3x3 __x, simd_float3x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float3x4 __x, simd_float3x4 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float4x2 __x, simd_float4x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float4x3 __x, simd_float4x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_float4x4 __x, simd_float4x4 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double2x2 __x, simd_double2x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double2x3 __x, simd_double2x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double2x4 __x, simd_double2x4 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double3x2 __x, simd_double3x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double3x3 __x, simd_double3x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double3x4 __x, simd_double3x4 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double4x2 __x, simd_double4x2 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double4x3 __x, simd_double4x3 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} +static simd_bool SIMD_CFUNC simd_equal(simd_double4x4 __x, simd_double4x4 __y) { + return simd_all((__x.columns[0] == __y.columns[0]) & + (__x.columns[1] == __y.columns[1]) & + (__x.columns[2] == __y.columns[2]) & + (__x.columns[3] == __y.columns[3])); +} + +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x2 __x, simd_float2x2 
__y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x3 __x, simd_float2x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float2x4 __x, simd_float2x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x2 __x, simd_float3x2 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x3 __x, simd_float3x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float3x4 __x, simd_float3x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float4x2 __x, simd_float4x2 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float4x3 __x, simd_float4x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_float4x4 __x, simd_float4x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x2 __x, simd_double2x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x3 __x, simd_double2x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double2x4 __x, simd_double2x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x2 __x, simd_double3x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) 
& + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x3 __x, simd_double3x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double3x4 __x, simd_double3x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x2 __x, simd_double4x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x3 __x, simd_double4x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements(simd_double4x4 __x, simd_double4x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol)); +} + +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x2 __x, simd_float2x2 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x3 __x, simd_float2x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float2x4 __x, simd_float2x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float3x2 __x, simd_float3x2 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float3x3 __x, simd_float3x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC 
simd_almost_equal_elements_relative(simd_float3x4 __x, simd_float3x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x2 __x, simd_float4x2 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x3 __x, simd_float4x3 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_float4x4 __x, simd_float4x4 __y, float __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x2 __x, simd_double2x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x3 __x, simd_double2x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double2x4 __x, simd_double2x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double3x2 __x, simd_double3x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double3x3 __x, simd_double3x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC 
simd_almost_equal_elements_relative(simd_double3x4 __x, simd_double3x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x2 __x, simd_double4x2 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x3 __x, simd_double4x3 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} +static simd_bool SIMD_CFUNC simd_almost_equal_elements_relative(simd_double4x4 __x, simd_double4x4 __y, double __tol) { + return simd_all((__tg_fabs(__x.columns[0] - __y.columns[0]) <= __tol*__tg_fabs(__x.columns[0])) & + (__tg_fabs(__x.columns[1] - __y.columns[1]) <= __tol*__tg_fabs(__x.columns[1])) & + (__tg_fabs(__x.columns[2] - __y.columns[2]) <= __tol*__tg_fabs(__x.columns[2])) & + (__tg_fabs(__x.columns[3] - __y.columns[3]) <= __tol*__tg_fabs(__x.columns[3]))); +} + +#ifdef __cplusplus +} +#endif +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* __SIMD_HEADER__ */ diff --git a/vfsoverlay/matrix_types.h b/vfsoverlay/matrix_types.h new file mode 100644 index 00000000..69f821ee --- /dev/null +++ b/vfsoverlay/matrix_types.h @@ -0,0 +1,525 @@ +/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved. + * + * This header defines nine matrix types for each of float and double, which + * are intended for use together with the vector types defined in + * . + * + * For compatibility with common graphics libraries, these matrices are stored + * in column-major order, and implemented as arrays of column vectors. + * Column-major storage order may seem a little strange if you aren't used to + * it, but for most usage the memory layout of the matrices shouldn't matter + * at all; instead you should think of matrices as abstract mathematical + * objects that you use to perform arithmetic without worrying about the + * details of the underlying representation. + * + * WARNING: vectors of length three are internally represented as length four + * vectors with one element of padding (for alignment purposes). This means + * that when a floatNx3 or doubleNx3 is viewed as a vector, it appears to + * have 4*N elements instead of the expected 3*N (with one padding element + * at the end of each column). The matrix elements are laid out in memory + * as follows: + * + * { 0, 1, 2, x, 3, 4, 5, x, ... } + * + * (where the scalar indices used above indicate the conceptual column- + * major storage order). If you aren't monkeying around with the internal + * storage details of matrices, you don't need to worry about this at all. + * Consider this yet another good reason to avoid doing so. 
+
+#ifndef SIMD_MATRIX_TYPES_HEADER
+#define SIMD_MATRIX_TYPES_HEADER
+
+#include <simd/base.h>
+#include <simd/vector_types.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+
+/* Matrix types available in C, Objective-C, and C++ */
+typedef simd_float2x2 matrix_float2x2;
+typedef simd_float3x2 matrix_float3x2;
+typedef simd_float4x2 matrix_float4x2;
+
+typedef simd_float2x3 matrix_float2x3;
+typedef simd_float3x3 matrix_float3x3;
+typedef simd_float4x3 matrix_float4x3;
+
+typedef simd_float2x4 matrix_float2x4;
+typedef simd_float3x4 matrix_float3x4;
+typedef simd_float4x4 matrix_float4x4;
+
+typedef simd_double2x2 matrix_double2x2;
+typedef simd_double3x2 matrix_double3x2;
+typedef simd_double4x2 matrix_double4x2;
+
+typedef simd_double2x3 matrix_double2x3;
+typedef simd_double3x3 matrix_double3x3;
+typedef simd_double4x3 matrix_double4x3;
+
+typedef simd_double2x4 matrix_double2x4;
+typedef simd_double3x4 matrix_double3x4;
+typedef simd_double4x4 matrix_double4x4;
+
+#ifdef __cplusplus
+#if defined SIMD_MATRIX_HEADER
+static simd_float3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatf q);
+static simd_float4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatf q);
+static simd_double3x3 SIMD_NOINLINE simd_matrix3x3(simd_quatd q);
+static simd_double4x4 SIMD_NOINLINE simd_matrix4x4(simd_quatd q);
+#endif
+
+namespace simd {
+
+  struct float2x2 : ::simd_float2x2 {
+    SIMD_CONSTEXPR float2x2() SIMD_NOEXCEPT : ::simd_float2x2((simd_float2x2){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR float2x2(float diagonal) SIMD_NOEXCEPT : float2x2((float2)diagonal) { }
+#endif
+    SIMD_CONSTEXPR float2x2(float2 v) SIMD_NOEXCEPT :
+      ::simd_float2x2((simd_float2x2){(float2){v.x,0}, (float2){0,v.y}}) { }
+    SIMD_CONSTEXPR float2x2(float2 c0, float2 c1) SIMD_NOEXCEPT : simd_float2x2((simd_float2x2){c0, c1}) { }
+    SIMD_CONSTEXPR float2x2(::simd_float2x2 m) SIMD_NOEXCEPT : ::simd_float2x2(m) { }
+  };
+
+  struct float3x2 : ::simd_float3x2 {
+    SIMD_CONSTEXPR float3x2() SIMD_NOEXCEPT : ::simd_float3x2((simd_float3x2){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR float3x2(float diagonal) SIMD_NOEXCEPT : float3x2((float2)diagonal) { }
+#endif
+    SIMD_CONSTEXPR float3x2(float2 v) SIMD_NOEXCEPT :
+      ::simd_float3x2((simd_float3x2){(float2){v.x,0}, (float2){0,v.y}, (float2){0}}) { }
+    SIMD_CONSTEXPR float3x2(float2 c0, float2 c1, float2 c2) SIMD_NOEXCEPT :
+      ::simd_float3x2((simd_float3x2){c0, c1, c2}) { }
+    SIMD_CONSTEXPR float3x2(::simd_float3x2 m) SIMD_NOEXCEPT : ::simd_float3x2(m) { }
+  };
+
+  struct float4x2 : ::simd_float4x2 {
+    SIMD_CONSTEXPR float4x2() SIMD_NOEXCEPT : ::simd_float4x2((simd_float4x2){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR float4x2(float diagonal) SIMD_NOEXCEPT : float4x2((float2)diagonal) { }
+#endif
+    SIMD_CONSTEXPR float4x2(float2 v) SIMD_NOEXCEPT :
+      ::simd_float4x2((simd_float4x2){(float2){v.x,0}, (float2){0,v.y}, (float2){0}, (float2){0}}) { }
+    SIMD_CONSTEXPR float4x2(float2 c0, float2 c1, float2 c2, float2 c3) SIMD_NOEXCEPT :
+      ::simd_float4x2((simd_float4x2){c0, c1, c2, c3}) { }
+    SIMD_CONSTEXPR float4x2(::simd_float4x2 m) SIMD_NOEXCEPT : ::simd_float4x2(m) { }
+  };
+
+  struct float2x3 : ::simd_float2x3 {
+    SIMD_CONSTEXPR float2x3() SIMD_NOEXCEPT : ::simd_float2x3((simd_float2x3){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR float2x3(float diagonal) SIMD_NOEXCEPT : float2x3((float2)diagonal) { }
+#endif
+    SIMD_CONSTEXPR float2x3(float2 v) SIMD_NOEXCEPT :
+      ::simd_float2x3((simd_float2x3){(float3){v.x,0,0}, (float3){0,v.y,0}}) { }
+    
SIMD_CONSTEXPR float2x3(float3 c0, float3 c1) SIMD_NOEXCEPT : ::simd_float2x3((simd_float2x3){c0, c1}) { } + SIMD_CONSTEXPR float2x3(::simd_float2x3 m) SIMD_NOEXCEPT : ::simd_float2x3(m) { } + }; + + struct float3x3 : ::simd_float3x3 { + SIMD_CONSTEXPR float3x3() SIMD_NOEXCEPT : ::simd_float3x3((simd_float3x3){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR float3x3(float diagonal) SIMD_NOEXCEPT : float3x3((float3)diagonal) { } +#endif + SIMD_CONSTEXPR float3x3(float3 v) SIMD_NOEXCEPT : + ::simd_float3x3((simd_float3x3){(float3){v.x,0,0}, (float3){0,v.y,0}, (float3){0,0,v.z}}) { } + SIMD_CONSTEXPR float3x3(float3 c0, float3 c1, float3 c2) SIMD_NOEXCEPT : + ::simd_float3x3((simd_float3x3){c0, c1, c2}) { } + SIMD_CONSTEXPR float3x3(::simd_float3x3 m) SIMD_NOEXCEPT : ::simd_float3x3(m) { } +#if defined SIMD_MATRIX_HEADER + SIMD_CONSTEXPR float3x3(::simd_quatf q) SIMD_NOEXCEPT : ::simd_float3x3(::simd_matrix3x3(q)) { } +#endif + }; + + struct float4x3 : ::simd_float4x3 { + SIMD_CONSTEXPR float4x3() SIMD_NOEXCEPT : ::simd_float4x3((simd_float4x3){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR float4x3(float diagonal) SIMD_NOEXCEPT : float4x3((float3)diagonal) { } +#endif + SIMD_CONSTEXPR float4x3(float3 v) SIMD_NOEXCEPT : + ::simd_float4x3((simd_float4x3){(float3){v.x,0,0}, (float3){0,v.y,0}, (float3){0,0,v.z}, (float3){0}}) { } + SIMD_CONSTEXPR float4x3(float3 c0, float3 c1, float3 c2, float3 c3) SIMD_NOEXCEPT : + ::simd_float4x3((simd_float4x3){c0, c1, c2, c3}) { } + SIMD_CONSTEXPR float4x3(::simd_float4x3 m) SIMD_NOEXCEPT : ::simd_float4x3(m) { } + }; + + struct float2x4 : ::simd_float2x4 { + SIMD_CONSTEXPR float2x4() SIMD_NOEXCEPT : ::simd_float2x4((simd_float2x4){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR float2x4(float diagonal) SIMD_NOEXCEPT : float2x4((float2)diagonal) { } +#endif + SIMD_CONSTEXPR float2x4(float2 v) SIMD_NOEXCEPT : + ::simd_float2x4((simd_float2x4){(float4){v.x,0,0,0}, (float4){0,v.y,0,0}}) { } + SIMD_CONSTEXPR float2x4(float4 c0, float4 c1) SIMD_NOEXCEPT : ::simd_float2x4((simd_float2x4){c0, c1}) { } + SIMD_CONSTEXPR float2x4(::simd_float2x4 m) SIMD_NOEXCEPT : ::simd_float2x4(m) { } + }; + + struct float3x4 : ::simd_float3x4 { + SIMD_CONSTEXPR float3x4() SIMD_NOEXCEPT : ::simd_float3x4((simd_float3x4){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR float3x4(float diagonal) SIMD_NOEXCEPT : float3x4((float3)diagonal) { } +#endif + SIMD_CONSTEXPR float3x4(float3 v) SIMD_NOEXCEPT : + ::simd_float3x4((simd_float3x4){(float4){v.x,0,0,0}, (float4){0,v.y,0,0}, (float4){0,0,v.z,0}}) { } + SIMD_CONSTEXPR float3x4(float4 c0, float4 c1, float4 c2) SIMD_NOEXCEPT : + ::simd_float3x4((simd_float3x4){c0, c1, c2}) { } + SIMD_CONSTEXPR float3x4(::simd_float3x4 m) SIMD_NOEXCEPT : ::simd_float3x4(m) { } + }; + + struct float4x4 : ::simd_float4x4 { + SIMD_CONSTEXPR float4x4() SIMD_NOEXCEPT : ::simd_float4x4((simd_float4x4){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR float4x4(float diagonal) SIMD_NOEXCEPT : float4x4((float4)diagonal) { } +#endif + SIMD_CONSTEXPR float4x4(float4 v) SIMD_NOEXCEPT : + ::simd_float4x4((simd_float4x4){(float4){v.x,0,0,0}, (float4){0,v.y,0,0}, (float4){0,0,v.z,0}, (float4){0,0,0,v.w}}) { } + SIMD_CONSTEXPR float4x4(float4 c0, float4 c1, float4 c2, float4 c3) SIMD_NOEXCEPT : + ::simd_float4x4((simd_float4x4){c0, c1, c2, c3}) { } + SIMD_CONSTEXPR float4x4(::simd_float4x4 m) SIMD_NOEXCEPT : ::simd_float4x4(m) { } +#if defined 
SIMD_MATRIX_HEADER + SIMD_CONSTEXPR float4x4(::simd_quatf q) SIMD_NOEXCEPT : ::simd_float4x4(::simd_matrix4x4(q)) { } +#endif + }; + + struct double2x2 : ::simd_double2x2 { + SIMD_CONSTEXPR double2x2() SIMD_NOEXCEPT : ::simd_double2x2((simd_double2x2){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR double2x2(double diagonal) SIMD_NOEXCEPT : double2x2((double2)diagonal) { } +#endif + SIMD_CONSTEXPR double2x2(double2 v) SIMD_NOEXCEPT : + ::simd_double2x2((simd_double2x2){(double2){v.x,0}, (double2){0,v.y}}) { } + SIMD_CONSTEXPR double2x2(double2 c0, double2 c1) SIMD_NOEXCEPT : + ::simd_double2x2((simd_double2x2){c0, c1}) { } + SIMD_CONSTEXPR double2x2(::simd_double2x2 m) SIMD_NOEXCEPT : ::simd_double2x2(m) { } + }; + + struct double3x2 : ::simd_double3x2 { + SIMD_CONSTEXPR double3x2() SIMD_NOEXCEPT : ::simd_double3x2((simd_double3x2){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR double3x2(double diagonal) SIMD_NOEXCEPT : double3x2((double2)diagonal) { } +#endif + SIMD_CONSTEXPR double3x2(double2 v) SIMD_NOEXCEPT : + ::simd_double3x2((simd_double3x2){(double2){v.x,0}, (double2){0,v.y}, (double2){0}}) { } + SIMD_CONSTEXPR double3x2(double2 c0, double2 c1, double2 c2) SIMD_NOEXCEPT : + ::simd_double3x2((simd_double3x2){c0, c1, c2}) { } + SIMD_CONSTEXPR double3x2(::simd_double3x2 m) SIMD_NOEXCEPT : ::simd_double3x2(m) { } + }; + + struct double4x2 : ::simd_double4x2 { + SIMD_CONSTEXPR double4x2() SIMD_NOEXCEPT : ::simd_double4x2((simd_double4x2){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR double4x2(double diagonal) SIMD_NOEXCEPT : double4x2((double2)diagonal) { } +#endif + SIMD_CONSTEXPR double4x2(double2 v) SIMD_NOEXCEPT : + ::simd_double4x2((simd_double4x2){(double2){v.x,0}, (double2){0,v.y}, (double2){0}, (double2){0}}) { } + SIMD_CONSTEXPR double4x2(double2 c0, double2 c1, double2 c2, double2 c3) SIMD_NOEXCEPT : + ::simd_double4x2((simd_double4x2){c0, c1, c2, c3}) { } + SIMD_CONSTEXPR double4x2(::simd_double4x2 m) SIMD_NOEXCEPT : ::simd_double4x2(m) { } + }; + + struct double2x3 : ::simd_double2x3 { + SIMD_CONSTEXPR double2x3() SIMD_NOEXCEPT : ::simd_double2x3((simd_double2x3){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR double2x3(double diagonal) SIMD_NOEXCEPT : double2x3((double2)diagonal) { } +#endif + SIMD_CONSTEXPR double2x3(double2 v) SIMD_NOEXCEPT : + ::simd_double2x3((simd_double2x3){(double3){v.x,0,0}, (double3){0,v.y,0}}) { } + SIMD_CONSTEXPR double2x3(double3 c0, double3 c1) SIMD_NOEXCEPT : + ::simd_double2x3((simd_double2x3){c0, c1}) { } + SIMD_CONSTEXPR double2x3(::simd_double2x3 m) SIMD_NOEXCEPT : ::simd_double2x3(m) { } + }; + + struct double3x3 : ::simd_double3x3 { + SIMD_CONSTEXPR double3x3() SIMD_NOEXCEPT : ::simd_double3x3((simd_double3x3){0}) { } +#if __has_feature(cxx_delegating_constructors) + SIMD_CONSTEXPR double3x3(double diagonal) SIMD_NOEXCEPT : double3x3((double3)diagonal) { } +#endif + SIMD_CONSTEXPR double3x3(double3 v) SIMD_NOEXCEPT : + ::simd_double3x3((simd_double3x3){(double3){v.x,0,0}, (double3){0,v.y,0}, (double3){0,0,v.z}}) { } + SIMD_CONSTEXPR double3x3(double3 c0, double3 c1, double3 c2) SIMD_NOEXCEPT : + ::simd_double3x3((simd_double3x3){c0, c1, c2}) { } + SIMD_CONSTEXPR double3x3(::simd_double3x3 m) SIMD_NOEXCEPT : ::simd_double3x3(m) { } +#if defined SIMD_MATRIX_HEADER + SIMD_CONSTEXPR double3x3(::simd_quatd q) SIMD_NOEXCEPT : ::simd_double3x3(::simd_matrix3x3(q)) { } +#endif + }; + + struct double4x3 : ::simd_double4x3 { + SIMD_CONSTEXPR 
double4x3() SIMD_NOEXCEPT : ::simd_double4x3((simd_double4x3){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR double4x3(double diagonal) SIMD_NOEXCEPT : double4x3((double3)diagonal) { }
+#endif
+    SIMD_CONSTEXPR double4x3(double3 v) SIMD_NOEXCEPT :
+      ::simd_double4x3((simd_double4x3){(double3){v.x,0,0}, (double3){0,v.y,0}, (double3){0,0,v.z}, (double3){0}}) { }
+    SIMD_CONSTEXPR double4x3(double3 c0, double3 c1, double3 c2, double3 c3) SIMD_NOEXCEPT :
+      ::simd_double4x3((simd_double4x3){c0, c1, c2, c3}) { }
+    SIMD_CONSTEXPR double4x3(::simd_double4x3 m) SIMD_NOEXCEPT : ::simd_double4x3(m) { }
+  };
+
+  struct double2x4 : ::simd_double2x4 {
+    SIMD_CONSTEXPR double2x4() SIMD_NOEXCEPT : ::simd_double2x4((simd_double2x4){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR double2x4(double diagonal) SIMD_NOEXCEPT : double2x4((double2)diagonal) { }
+#endif
+    SIMD_CONSTEXPR double2x4(double2 v) SIMD_NOEXCEPT :
+      ::simd_double2x4((simd_double2x4){(double4){v.x,0,0,0}, (double4){0,v.y,0,0}}) { }
+    SIMD_CONSTEXPR double2x4(double4 c0, double4 c1) SIMD_NOEXCEPT : ::simd_double2x4((simd_double2x4){c0, c1}) { }
+    SIMD_CONSTEXPR double2x4(::simd_double2x4 m) SIMD_NOEXCEPT : ::simd_double2x4(m) { }
+  };
+
+  struct double3x4 : ::simd_double3x4 {
+    SIMD_CONSTEXPR double3x4() SIMD_NOEXCEPT : ::simd_double3x4((simd_double3x4){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR double3x4(double diagonal) SIMD_NOEXCEPT : double3x4((double3)diagonal) { }
+#endif
+    SIMD_CONSTEXPR double3x4(double3 v) SIMD_NOEXCEPT :
+      ::simd_double3x4((simd_double3x4){(double4){v.x,0,0,0}, (double4){0,v.y,0,0}, (double4){0,0,v.z,0}}) { }
+    SIMD_CONSTEXPR double3x4(double4 c0, double4 c1, double4 c2) SIMD_NOEXCEPT :
+      ::simd_double3x4((simd_double3x4){c0, c1, c2}) { }
+    SIMD_CONSTEXPR double3x4(::simd_double3x4 m) SIMD_NOEXCEPT : ::simd_double3x4(m) { }
+  };
+
+  struct double4x4 : ::simd_double4x4 {
+    SIMD_CONSTEXPR double4x4() SIMD_NOEXCEPT : ::simd_double4x4((simd_double4x4){0}) { }
+#if __has_feature(cxx_delegating_constructors)
+    SIMD_CONSTEXPR double4x4(double diagonal) SIMD_NOEXCEPT : double4x4((double4)diagonal) { }
+#endif
+    SIMD_CONSTEXPR double4x4(double4 v) SIMD_NOEXCEPT :
+      ::simd_double4x4((simd_double4x4){(double4){v.x,0,0,0}, (double4){0,v.y,0,0}, (double4){0,0,v.z,0}, (double4){0,0,0,v.w}}) { }
+    SIMD_CONSTEXPR double4x4(double4 c0, double4 c1, double4 c2, double4 c3) SIMD_NOEXCEPT :
+      ::simd_double4x4((simd_double4x4){c0, c1, c2, c3}) { }
+    SIMD_CONSTEXPR double4x4(::simd_double4x4 m) SIMD_NOEXCEPT : ::simd_double4x4(m) { }
+#if defined SIMD_MATRIX_HEADER
+    SIMD_CONSTEXPR double4x4(::simd_quatd q) SIMD_NOEXCEPT : ::simd_double4x4(::simd_matrix4x4(q)) { }
+#endif
+  };
+
+/*! @abstract Templated Matrix struct based on scalar type and number of columns and rows. */
+template <typename ScalarType, size_t col, size_t row> struct Matrix {
+  // static const size_t col
+  // static const size_t row
+  // typedef scalar_t
+  // typedef type
+};
+/*! @abstract Helper type to access the simd type easily. */
+template <typename ScalarType, size_t col, size_t row>
+using Matrix_t = typename Matrix<ScalarType, col, row>::type;
+
+template<> struct Matrix<float, 2, 2> {
+  static const size_t col = 2;
+  static const size_t row = 2;
+  typedef float scalar_t;
+  typedef float2x2 type;
+};
+
+template<> struct Matrix<float, 3, 2> {
+  static const size_t col = 3;
+  static const size_t row = 2;
+  typedef float scalar_t;
+  typedef float3x2 type;
+};
+
+template<> struct Matrix<float, 4, 2> {
+  static const size_t col = 4;
+  static const size_t row = 2;
+  typedef float scalar_t;
+  typedef float4x2 type;
+};
+
+template<> struct Matrix<float, 2, 3> {
+  static const size_t col = 2;
+  static const size_t row = 3;
+  typedef float scalar_t;
+  typedef float2x3 type;
+};
+
+template<> struct Matrix<float, 3, 3> {
+  static const size_t col = 3;
+  static const size_t row = 3;
+  typedef float scalar_t;
+  typedef float3x3 type;
+};
+
+template<> struct Matrix<float, 4, 3> {
+  static const size_t col = 4;
+  static const size_t row = 3;
+  typedef float scalar_t;
+  typedef float4x3 type;
+};
+
+template<> struct Matrix<float, 2, 4> {
+  static const size_t col = 2;
+  static const size_t row = 4;
+  typedef float scalar_t;
+  typedef float2x4 type;
+};
+
+template<> struct Matrix<float, 3, 4> {
+  static const size_t col = 3;
+  static const size_t row = 4;
+  typedef float scalar_t;
+  typedef float3x4 type;
+};
+
+template<> struct Matrix<float, 4, 4> {
+  static const size_t col = 4;
+  static const size_t row = 4;
+  typedef float scalar_t;
+  typedef float4x4 type;
+};
+
+template<> struct Matrix<double, 2, 2> {
+  static const size_t col = 2;
+  static const size_t row = 2;
+  typedef double scalar_t;
+  typedef double2x2 type;
+};
+
+template<> struct Matrix<double, 3, 2> {
+  static const size_t col = 3;
+  static const size_t row = 2;
+  typedef double scalar_t;
+  typedef double3x2 type;
+};
+
+template<> struct Matrix<double, 4, 2> {
+  static const size_t col = 4;
+  static const size_t row = 2;
+  typedef double scalar_t;
+  typedef double4x2 type;
+};
+
+template<> struct Matrix<double, 2, 3> {
+  static const size_t col = 2;
+  static const size_t row = 3;
+  typedef double scalar_t;
+  typedef double2x3 type;
+};
+
+template<> struct Matrix<double, 3, 3> {
+  static const size_t col = 3;
+  static const size_t row = 3;
+  typedef double scalar_t;
+  typedef double3x3 type;
+};
+
+template<> struct Matrix<double, 4, 3> {
+  static const size_t col = 4;
+  static const size_t row = 3;
+  typedef double scalar_t;
+  typedef double4x3 type;
+};
+
+template<> struct Matrix<double, 2, 4> {
+  static const size_t col = 2;
+  static const size_t row = 4;
+  typedef double scalar_t;
+  typedef double2x4 type;
+};
+
+template<> struct Matrix<double, 3, 4> {
+  static const size_t col = 3;
+  static const size_t row = 4;
+  typedef double scalar_t;
+  typedef double3x4 type;
+};
+
+template<> struct Matrix<double, 4, 4> {
+  static const size_t col = 4;
+  static const size_t row = 4;
+  typedef double scalar_t;
+  typedef double4x4 type;
+};
+
+template <> struct get_traits<float2x2>
+{
+  using type = Matrix<float, 2, 2>;
+};
+
+template <> struct get_traits<float3x2>
+{
+  using type = Matrix<float, 3, 2>;
+};
+
+template <> struct get_traits<float4x2>
+{
+  using type = Matrix<float, 4, 2>;
+};
+
+template <> struct get_traits<float2x3>
+{
+  using type = Matrix<float, 2, 3>;
+};
+
+template <> struct get_traits<float3x3>
+{
+  using type = Matrix<float, 3, 3>;
+};
+
+template <> struct get_traits<float4x3>
+{
+  using type = Matrix<float, 4, 3>;
+};
+
+template <> struct get_traits<float2x4>
+{
+  using type = Matrix<float, 2, 4>;
+};
+
+template <> struct get_traits<float3x4>
+{
+  using type = Matrix<float, 3, 4>;
+};
+
+template <> struct get_traits<float4x4>
+{
+  using type = Matrix<float, 4, 4>;
+};
+
+template <> struct get_traits<double2x2>
+{
+  using type = Matrix<double, 2, 2>;
+};
+
+template <> struct get_traits<double3x2>
+{
+  using type = Matrix<double, 3, 2>;
+};
+
+template <> struct get_traits<double4x2>
+{
+  using type = Matrix<double, 4, 2>;
+};
+
+template <> struct get_traits<double2x3>
+{
+  using type = Matrix<double, 2, 3>;
+};
+
+template <> struct get_traits<double3x3>
+{
+  using type = Matrix<double, 3, 3>;
+};
+
+template <> struct get_traits<double4x3>
+{
+  using type = Matrix<double, 4, 3>;
+};
+
+template <> struct get_traits<double2x4>
+{
+  using type = Matrix<double, 2, 4>;
+};
+
+template <> struct get_traits<double3x4>
+{
+  using type = Matrix<double, 3, 4>;
+};
+
+template <> struct get_traits<double4x4>
+{
+  using type = Matrix<double, 4, 4>;
+};
+
+}
+#endif /* __cplusplus */
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_MATRIX_TYPES_HEADER */
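A short sketch of how the C++ interface above is meant to be used (illustrative only, not part of the patch; the angle-bracketed parameters of Matrix, Matrix_t, and get_traits were reconstructed here from the member definitions, so the static_assert encodes that assumption):

// traits_demo.cpp -- illustrative sketch
#include <simd/simd.h>
#include <type_traits>

int main() {
    simd::float3x3 eye(1.0f);                   // diagonal constructor: identity
    simd::float3x3 cols(simd_make_float3(1, 0, 0),
                        simd_make_float3(0, 1, 0),
                        simd_make_float3(0, 0, 1));

    // Matrix_t maps (scalar type, columns, rows) back to a concrete simd type.
    static_assert(std::is_same<simd::Matrix_t<float, 3, 3>, simd::float3x3>::value,
                  "traits round-trip");
    return simd_equal(eye, cols) ? 0 : 1;       // both are the identity
}

The overlay.yaml that follows maps these copied headers over the corresponding paths inside the iPhoneSimulator SDK; a virtual-file-system overlay of this shape is what clang consumes through its -ivfsoverlay option.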
diff --git a/vfsoverlay/overlay.yaml b/vfsoverlay/overlay.yaml
new file mode 100644
index 00000000..a4560827
--- /dev/null
+++ b/vfsoverlay/overlay.yaml
@@ -0,0 +1,28 @@
+{
+  'case-sensitive': 'false',
+  'roots': [
+    {
+      "contents": [
+        { 'external-contents': "../vfsoverlay/packed.h", 'name': "simd/packed.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/types.h", 'name': "simd/types.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/quaternion.h", 'name': "simd/quaternion.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/matrix_types.h", 'name': "simd/matrix_types.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/matrix.h", 'name': "simd/matrix.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/conversion.h", 'name': "simd/conversion.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/vector_make.h", 'name': "simd/vector_make.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/common.h", 'name': "simd/common.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/logic.h", 'name': "simd/logic.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/simd.h", 'name': "simd/simd.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/vector_types.h", 'name': "simd/vector_types.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/math.h", 'name': "simd/math.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/extern.h", 'name': "simd/extern.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/vector.h", 'name': "simd/vector.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/geometry.h", 'name': "simd/geometry.h", 'type': 'file' },
+        { 'external-contents': "../vfsoverlay/base.h", 'name': "simd/base.h", 'type': 'file' }
+      ],
+      'name': "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator18.0.sdk/usr/include",
+      'type': 'directory'
+    },
+  ],
+  'version': 0,
+}
diff --git a/vfsoverlay/packed.h b/vfsoverlay/packed.h
new file mode 100644
index 00000000..284947c3
--- /dev/null
+++ b/vfsoverlay/packed.h
@@ -0,0 +1,1031 @@
+/*! @header
+ *  This header defines fixed size vector types with relaxed alignment. For
+ *  each vector type defined by <simd/vector_types.h> that is not a 1- or 3-
+ *  element vector, there is a corresponding type defined by this header that
+ *  requires only the alignment matching that of the underlying scalar type.
+ *
+ *  These types should be used to access buffers that may not be sufficiently
+ *  aligned to allow them to be accessed using the "normal" simd vector types.
+ *  As an example of this usage, suppose that you want to load a vector of
+ *  four floats from an array of floats. The type simd_float4 has sixteen byte
+ *  alignment, whereas an array of floats has only four byte alignment.
+ *  Thus, naively casting a pointer into the array to (simd_float4 *) would
+ *  invoke undefined behavior, and likely produce an alignment fault at
+ *  runtime. Instead, use the corresponding packed type to load from the array:
+ *
+ *  <pre>
+ *  @textblock
+ *  simd_float4 vector = *(simd_packed_float4 *)&array[i];
+ *  // do something with vector ...
+ *  @/textblock
+ *  </pre>
+ *
+ *  It's important to note that the packed_ types are only needed to work with
+ *  memory; once the data is loaded, we simply operate on it as usual using
+ *  the simd_float4 type, as illustrated above.
+ *
+ *  @copyright 2014-2017 Apple, Inc. All rights reserved.
+ *  @unsorted */
+
+#ifndef SIMD_PACKED_TYPES
+#define SIMD_PACKED_TYPES
+
+# include <simd/base.h>
+# if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+/*! @abstract A vector of two 8-bit signed (twos-complement) integers with
+ *  relaxed alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::char2. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) char simd_packed_char2;
+
+/*! @abstract A vector of four 8-bit signed (twos-complement) integers with
+ *  relaxed alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::char4. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) char simd_packed_char4;
+
+/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with
+ *  relaxed alignment.
+ *  @description In C++ this type is also available as simd::packed::char8.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) char simd_packed_char8;
+
+/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers
+ *  with relaxed alignment.
+ *  @description In C++ this type is also available as simd::packed::char16.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) char simd_packed_char16;
+
+/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers
+ *  with relaxed alignment.
+ *  @description In C++ this type is also available as simd::packed::char32.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) char simd_packed_char32;
+
+/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers
+ *  with relaxed alignment.
+ *  @description In C++ this type is also available as simd::packed::char64.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) char simd_packed_char64;
+
+/*! @abstract A vector of two 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::uchar2. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) unsigned char simd_packed_uchar2;
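A minimal sketch of that load/store pattern as real functions (illustrative only, not part of the header; load_float4 and store_float4 are hypothetical names):

// packed_demo.cpp -- illustrative sketch
#include <simd/simd.h>

// 'p' may be only 4-byte aligned, as any array of float is allowed to be.
simd_float4 load_float4(const float *p) {
    // (const simd_float4 *)p would require 16-byte alignment; the packed
    // type demands only the scalar's 4-byte alignment, so this is safe.
    simd_float4 v = *(const simd_packed_float4 *)p;
    return v + 1.0f;   // once loaded, compute with the ordinary type
}

void store_float4(float *p, simd_float4 v) {
    *(simd_packed_float4 *)p = v;   // the same trick works for stores
}

This mirrors the header's own guidance: the packed types exist purely at the load/store boundary, and all arithmetic stays on the naturally aligned types.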
+
+/*! @abstract A vector of four 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::uchar4. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) unsigned char simd_packed_uchar4;
+
+/*! @abstract A vector of eight 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ this type is also available as simd::packed::uchar8.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) unsigned char simd_packed_uchar8;
+
+/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ this type is also available as
+ *  simd::packed::uchar16. This type is not available in Metal. The
+ *  alignment of this type is only that of the underlying scalar element
+ *  type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) unsigned char simd_packed_uchar16;
+
+/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ this type is also available as
+ *  simd::packed::uchar32. This type is not available in Metal. The
+ *  alignment of this type is only that of the underlying scalar element
+ *  type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) unsigned char simd_packed_uchar32;
+
+/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed
+ *  alignment.
+ *  @description In C++ this type is also available as
+ *  simd::packed::uchar64. This type is not available in Metal. The
+ *  alignment of this type is only that of the underlying scalar element
+ *  type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) unsigned char simd_packed_uchar64;
+
+/*! @abstract A vector of two 16-bit signed (twos-complement) integers with
+ *  relaxed alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::short2. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) short simd_packed_short2;
+
+/*! @abstract A vector of four 16-bit signed (twos-complement) integers with
+ *  relaxed alignment.
+ *  @description In C++ and Metal, this type is also available as
+ *  simd::packed::short4. The alignment of this type is that of the
+ *  underlying scalar element type, so you can use it to load or store from
+ *  an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) short simd_packed_short4;
+
+/*! @abstract A vector of eight 16-bit signed (twos-complement) integers
+ *  with relaxed alignment.
+ *  @description In C++ this type is also available as simd::packed::short8.
+ *  This type is not available in Metal. The alignment of this type is only
+ *  that of the underlying scalar element type, so you can use it to load or
+ *  store from an array of that type.
*/ +typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) short simd_packed_short8; + +/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as + * simd::packed::short16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) short simd_packed_short16; + +/*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description In C++ this type is also available as + * simd::packed::short32. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) short simd_packed_short32; + +/*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ushort2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) unsigned short simd_packed_ushort2; + +/*! @abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ushort4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) unsigned short simd_packed_ushort4; + +/*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort8. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) unsigned short simd_packed_ushort8; + +/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) unsigned short simd_packed_ushort16; + +/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort32. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) unsigned short simd_packed_ushort32; + +/*! @abstract A vector of two 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::int2. 
The alignment of this type is that of the underlying + * scalar element type, so you can use it to load or store from an array of + * that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) int simd_packed_int2; + +/*! @abstract A vector of four 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::int4. The alignment of this type is that of the underlying + * scalar element type, so you can use it to load or store from an array of + * that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) int simd_packed_int4; + +/*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::int8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) int simd_packed_int8; + +/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::int16. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) int simd_packed_int16; + +/*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::uint2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) unsigned int simd_packed_uint2; + +/*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::uint4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) unsigned int simd_packed_uint4; + +/*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::uint8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) unsigned int simd_packed_uint8; + +/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::uint16. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) unsigned int simd_packed_uint16; + +/*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::float2. 
The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) float simd_packed_float2; + +/*! @abstract A vector of four 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::float4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) float simd_packed_float4; + +/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::float8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) float simd_packed_float8; + +/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::float16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) float simd_packed_float16; + +/*! @abstract A vector of two 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::long2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_long1 simd_packed_long2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_long1 simd_packed_long2; +#endif + +/*! @abstract A vector of four 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::long4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_long1 simd_packed_long4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_long1 simd_packed_long4; +#endif + +/*! @abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::long8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_long1 simd_packed_long8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_long1 simd_packed_long8; +#endif + +/*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ulong2. 
The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_ulong1 simd_packed_ulong2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_ulong1 simd_packed_ulong2; +#endif + +/*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ulong4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_ulong1 simd_packed_ulong4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_ulong1 simd_packed_ulong4; +#endif + +/*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::ulong8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_ulong1 simd_packed_ulong8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_ulong1 simd_packed_ulong8; +#endif + +/*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::double2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) double simd_packed_double2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) double simd_packed_double2; +#endif + +/*! @abstract A vector of four 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::double4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) double simd_packed_double4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) double simd_packed_double4; +#endif + +/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::double8. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) double simd_packed_double8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) double simd_packed_double8; +#endif + +/* MARK: C++ vector types */ +#if defined __cplusplus +namespace simd { + namespace packed { + /*! @abstract A vector of two 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_char2. 
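+ * (An illustrative note: in C++ the same two-lane vector can be declared
+ * as simd::packed::char2 v = { 1, 2 }; it carries the alignment of char
+ * rather than that of simd_char2.)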
The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_char2 char2; + + /*! @abstract A vector of four 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_char4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_char4 char4; + + /*! @abstract A vector of eight 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char8 char8; + + /*! @abstract A vector of sixteen 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char16 char16; + + /*! @abstract A vector of thirty-two 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char32 char32; + + /*! @abstract A vector of sixty-four 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char64. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char64 char64; + + /*! @abstract A vector of two 8-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uchar2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uchar2 uchar2; + + /*! @abstract A vector of four 8-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uchar4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uchar4 uchar4; + + /*! @abstract A vector of eight 8-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar8 uchar8; + + /*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. 
In C or + * Objective-C, this type is available as simd_packed_uchar16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar16 uchar16; + + /*! @abstract A vector of thirty-two 8-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar32 uchar32; + + /*! @abstract A vector of sixty-four 8-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar64. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar64 uchar64; + + /*! @abstract A vector of two 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_short2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_short2 short2; + + /*! @abstract A vector of four 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_short4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_short4 short4; + + /*! @abstract A vector of eight 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short8 short8; + + /*! @abstract A vector of sixteen 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short16 short16; + + /*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short32 short32; + + /*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ushort2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ushort2 ushort2; + + /*! 
@abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ushort4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ushort4 ushort4; + + /*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort8 ushort8; + + /*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort16 ushort16; + + /*! @abstract A vector of thirty-two 16-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort32 ushort32; + + /*! @abstract A vector of two 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_int2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_int2 int2; + + /*! @abstract A vector of four 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_int4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_int4 int4; + + /*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_int8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_int8 int8; + + /*! @abstract A vector of sixteen 32-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_int16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_int16 int16; + + /*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uint2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. 
*/ +typedef ::simd_packed_uint2 uint2; + + /*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uint4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uint4 uint4; + + /*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uint8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uint8 uint8; + + /*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uint16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uint16 uint16; + + /*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_float2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_float2 float2; + + /*! @abstract A vector of four 32-bit floating-point numbers with + * relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_float4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_float4 float4; + + /*! @abstract A vector of eight 32-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_float8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_float8 float8; + + /*! @abstract A vector of sixteen 32-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_float16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_float16 float16; + + /*! @abstract A vector of two 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_long2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_long2 long2; + + /*! @abstract A vector of four 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_long4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_long4 long4; + + /*! 
@abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_long8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_long8 long8; + + /*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ulong2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ulong2 ulong2; + + /*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ulong4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ulong4 ulong4; + + /*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ulong8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ulong8 ulong8; + + /*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_double2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_double2 double2; + + /*! @abstract A vector of four 64-bit floating-point numbers with + * relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_double4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_double4 double4; + + /*! @abstract A vector of eight 64-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_double8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_double8 double8; + + } /* namespace simd::packed:: */ +} /* namespace simd:: */ +#endif /* __cplusplus */ + +/* MARK: Deprecated vector types */ +/*! @group Deprecated vector types + * @discussion These are the original types used by earlier versions of the + * simd library; they are provided here for compatibility with existing source + * files. Use the new ("simd_"-prefixed) types for future development. */ +/*! @abstract A vector of two 8-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char2 + * or simd::packed::char2 instead. */ +typedef simd_packed_char2 packed_char2; + +/*! @abstract A vector of four 8-bit signed (twos-complement) integers with + * relaxed alignment. 
+ * @description This type is deprecated; you should use simd_packed_char4 + * or simd::packed::char4 instead. */ +typedef simd_packed_char4 packed_char4; + +/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char8 + * or simd::packed::char8 instead. */ +typedef simd_packed_char8 packed_char8; + +/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char16 + * or simd::packed::char16 instead. */ +typedef simd_packed_char16 packed_char16; + +/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char32 + * or simd::packed::char32 instead. */ +typedef simd_packed_char32 packed_char32; + +/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char64 + * or simd::packed::char64 instead. */ +typedef simd_packed_char64 packed_char64; + +/*! @abstract A vector of two 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar2 + * or simd::packed::uchar2 instead. */ +typedef simd_packed_uchar2 packed_uchar2; + +/*! @abstract A vector of four 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar4 + * or simd::packed::uchar4 instead. */ +typedef simd_packed_uchar4 packed_uchar4; + +/*! @abstract A vector of eight 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar8 + * or simd::packed::uchar8 instead. */ +typedef simd_packed_uchar8 packed_uchar8; + +/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar16 + * or simd::packed::uchar16 instead. */ +typedef simd_packed_uchar16 packed_uchar16; + +/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar32 + * or simd::packed::uchar32 instead. */ +typedef simd_packed_uchar32 packed_uchar32; + +/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar64 + * or simd::packed::uchar64 instead. */ +typedef simd_packed_uchar64 packed_uchar64; + +/*! @abstract A vector of two 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short2 + * or simd::packed::short2 instead. */ +typedef simd_packed_short2 packed_short2; + +/*! @abstract A vector of four 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short4 + * or simd::packed::short4 instead. */ +typedef simd_packed_short4 packed_short4; + +/*! @abstract A vector of eight 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short8 + * or simd::packed::short8 instead. */ +typedef simd_packed_short8 packed_short8; + +/*! 
@abstract A vector of sixteen 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short16 + * or simd::packed::short16 instead. */ +typedef simd_packed_short16 packed_short16; + +/*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short32 + * or simd::packed::short32 instead. */ +typedef simd_packed_short32 packed_short32; + +/*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort2 + * or simd::packed::ushort2 instead. */ +typedef simd_packed_ushort2 packed_ushort2; + +/*! @abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort4 + * or simd::packed::ushort4 instead. */ +typedef simd_packed_ushort4 packed_ushort4; + +/*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort8 + * or simd::packed::ushort8 instead. */ +typedef simd_packed_ushort8 packed_ushort8; + +/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use + * simd_packed_ushort16 or simd::packed::ushort16 instead. */ +typedef simd_packed_ushort16 packed_ushort16; + +/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use + * simd_packed_ushort32 or simd::packed::ushort32 instead. */ +typedef simd_packed_ushort32 packed_ushort32; + +/*! @abstract A vector of two 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int2 or + * simd::packed::int2 instead. */ +typedef simd_packed_int2 packed_int2; + +/*! @abstract A vector of four 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int4 or + * simd::packed::int4 instead. */ +typedef simd_packed_int4 packed_int4; + +/*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int8 or + * simd::packed::int8 instead. */ +typedef simd_packed_int8 packed_int8; + +/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int16 + * or simd::packed::int16 instead. */ +typedef simd_packed_int16 packed_int16; + +/*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint2 + * or simd::packed::uint2 instead. */ +typedef simd_packed_uint2 packed_uint2; + +/*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint4 + * or simd::packed::uint4 instead. */ +typedef simd_packed_uint4 packed_uint4; + +/*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint8 + * or simd::packed::uint8 instead. 
*/ +typedef simd_packed_uint8 packed_uint8; + +/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint16 + * or simd::packed::uint16 instead. */ +typedef simd_packed_uint16 packed_uint16; + +/*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float2 + * or simd::packed::float2 instead. */ +typedef simd_packed_float2 packed_float2; + +/*! @abstract A vector of four 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float4 + * or simd::packed::float4 instead. */ +typedef simd_packed_float4 packed_float4; + +/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float8 + * or simd::packed::float8 instead. */ +typedef simd_packed_float8 packed_float8; + +/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float16 + * or simd::packed::float16 instead. */ +typedef simd_packed_float16 packed_float16; + +/*! @abstract A vector of two 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long2 + * or simd::packed::long2 instead. */ +typedef simd_packed_long2 packed_long2; + +/*! @abstract A vector of four 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long4 + * or simd::packed::long4 instead. */ +typedef simd_packed_long4 packed_long4; + +/*! @abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long8 + * or simd::packed::long8 instead. */ +typedef simd_packed_long8 packed_long8; + +/*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong2 + * or simd::packed::ulong2 instead. */ +typedef simd_packed_ulong2 packed_ulong2; + +/*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong4 + * or simd::packed::ulong4 instead. */ +typedef simd_packed_ulong4 packed_ulong4; + +/*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong8 + * or simd::packed::ulong8 instead. */ +typedef simd_packed_ulong8 packed_ulong8; + +/*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double2 + * or simd::packed::double2 instead. */ +typedef simd_packed_double2 packed_double2; + +/*! @abstract A vector of four 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double4 + * or simd::packed::double4 instead. */ +typedef simd_packed_double4 packed_double4; + +/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double8 + * or simd::packed::double8 instead. 
*/ +typedef simd_packed_double8 packed_double8; + +# endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif diff --git a/vfsoverlay/quaternion.h b/vfsoverlay/quaternion.h new file mode 100644 index 00000000..bb9211b8 --- /dev/null +++ b/vfsoverlay/quaternion.h @@ -0,0 +1,1194 @@ +/*! @header + * This header defines functions for constructing and using quaternions. + * @copyright 2015-2016 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_QUATERNIONS +#define SIMD_QUATERNIONS + +#include <simd/base.h> +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include <simd/vector.h> +#include <simd/matrix_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* MARK: - C and Objective-C float interfaces */ + +/*! @abstract Constructs a quaternion from four scalar values. + * + * @param ix The first component of the imaginary (vector) part. + * @param iy The second component of the imaginary (vector) part. + * @param iz The third component of the imaginary (vector) part. + * + * @param r The real (scalar) part. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(float ix, float iy, float iz, float r) { + return (simd_quatf){ { ix, iy, iz, r } }; +} + +/*! @abstract Constructs a quaternion from an array of four scalars. + * + * @discussion Note that the imaginary part of the quaternion comes from + * array elements 0, 1, and 2, and the real part comes from element 3. */ +static inline SIMD_NONCONST simd_quatf simd_quaternion(const float xyzr[4]) { + return (simd_quatf){ *(const simd_packed_float4 *)xyzr }; +} + +/*! @abstract Constructs a quaternion from a four-element vector. + * + * @discussion Note that the imaginary (vector) part of the quaternion comes + * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from + * lane 3. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(simd_float4 xyzr) { + return (simd_quatf){ xyzr }; +} + +/*! @abstract Constructs a quaternion that rotates by `angle` radians about + * `axis`. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis); + +/*! @abstract Construct a quaternion that rotates from one vector to another. + * + * @param from A normalized three-element vector. + * @param to A normalized three-element vector. + * + * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and + * `to` point in opposite directions (to within machine precision), an + * arbitrary rotation axis is chosen, and the angle is pi radians. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to); + +/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`. + * + * @discussion If `matrix` is not orthogonal with determinant 1, the result + * is undefined. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix); + +/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`. + * + * @discussion The last row and column of the matrix are ignored. This + * function is equivalent to calling simd_quaternion with the upper-left 3x3 + * submatrix. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix); + +/*! @abstract The real (scalar) part of the quaternion `q`. */ +static inline SIMD_CFUNC float simd_real(simd_quatf q) { + return q.vector.w; +} + +/*! @abstract The imaginary (vector) part of the quaternion `q`. */ +static inline SIMD_CFUNC simd_float3 simd_imag(simd_quatf q) { + return q.vector.xyz; +} + +/*! @abstract The angle (in radians) of rotation represented by `q`. */ +static inline SIMD_CFUNC float simd_angle(simd_quatf q); + +/*! 
@abstract The normalized axis (a 3-element vector) around which the + * action of the quaternion `q` rotates. */ +static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q); + +/*! @abstract The sum of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q); + +/*! @abstract The difference of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q); + +/*! @abstract The product of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q); + +/*! @abstract The conjugate of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q); + +/*! @abstract The (multiplicative) inverse of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q); + +/*! @abstract The negation (additive inverse) of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q); + +/*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ +static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q); + +/*! @abstract The length of the quaternion `q`. */ +static inline SIMD_CFUNC float simd_length(simd_quatf q); + +/*! @abstract The unit quaternion obtained by normalizing `q`. */ +static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q); + +/*! @abstract Rotates the vector `v` by the quaternion `q`. */ +static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v); + +/*! @abstract Logarithm of the quaternion `q`. + * @discussion Do not call this function directly; use `log(q)` instead. + * + * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where + * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector. + * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the + * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`. + * + * Note that this function is not robust against poorly-scaled non-unit + * quaternions, because it is primarily used for spline interpolation of + * unit quaternions. If you need to compute a robust logarithm of general + * quaternions, you can use the following approach: + * + * scale = simd_reduce_max(simd_abs(q.vector)); + * logq = log(simd_recip(scale)*q); + * logq.real += log(scale); + * return logq; */ +static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q); + +/*! @abstract Inverse of `log( )`; the exponential map on quaternions. + * @discussion Do not call this function directly; use `exp(q)` instead. */ +static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q); + +/*! @abstract Spherical linear interpolation along the shortest arc between + * quaternions `q0` and `q1`. */ +static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t); + +/*! @abstract Spherical linear interpolation along the longest arc between + * quaternions `q0` and `q1`. */ +static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t); + +/*! @abstract Interpolate between quaternions along a spherical cubic spline. + * + * @discussion The function interpolates between q1 and q2. 
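+ * (An illustrative sketch: for a sequence of keyframe rotations q[0..n],
+ * the segment between q[i] and q[i+1] would be evaluated as
+ *
+ *     simd_quatf r = simd_spline(q[i-1], q[i], q[i+1], q[i+2], t);
+ *
+ * with t running from 0 to 1.)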
q0 is the left + * endpoint of the previous interval, and q3 is the right endpoint of the next + * interval. Use this function to smoothly interpolate between a sequence of + * rotations. */ +static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t); + +/*! @abstract Spherical cubic Bezier interpolation between quaternions. + * + * @discussion The function treats q0 ... q3 as control points and uses slerp + * in place of lerp in the De Casteljau algorithm. The endpoints of + * interpolation are thus q0 and q3, and the curve will not generally pass + * through q1 or q2. Note that the convex hull property of "standard" Bezier + * curves does not hold on the sphere. */ +static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t); + +#ifdef __cplusplus +} /* extern "C" */ +/* MARK: - C++ float interfaces */ + +namespace simd { + struct quatf : ::simd_quatf { + /*! @abstract The identity quaternion. */ + quatf( ) : ::simd_quatf(::simd_quaternion((float4){0,0,0,1})) { } + + /*! @abstract Constructs a C++ quaternion from a C quaternion. */ + quatf(::simd_quatf q) : ::simd_quatf(q) { } + + /*! @abstract Constructs a quaternion from components. */ + quatf(float ix, float iy, float iz, float r) : ::simd_quatf(::simd_quaternion(ix, iy, iz, r)) { } + + /*! @abstract Constructs a quaternion from an array of scalars. */ + quatf(const float xyzr[4]) : ::simd_quatf(::simd_quaternion(xyzr)) { } + + /*! @abstract Constructs a quaternion from a vector. */ + quatf(float4 xyzr) : ::simd_quatf(::simd_quaternion(xyzr)) { } + + /*! @abstract Quaternion representing rotation about `axis` by `angle` + * radians. */ + quatf(float angle, float3 axis) : ::simd_quatf(::simd_quaternion(angle, axis)) { } + + /*! @abstract Quaternion that rotates `from` into `to`. */ + quatf(float3 from, float3 to) : ::simd_quatf(::simd_quaternion(from, to)) { } + + /*! @abstract Constructs a quaternion from a rotation matrix. */ + quatf(::simd_float3x3 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { } + + /*! @abstract Constructs a quaternion from a rotation matrix. */ + quatf(::simd_float4x4 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { } + + /*! @abstract The real (scalar) part of the quaternion. */ + float real(void) const { return ::simd_real(*this); } + + /*! @abstract The imaginary (vector) part of the quaternion. */ + float3 imag(void) const { return ::simd_imag(*this); } + + /*! @abstract The angle the quaternion rotates by. */ + float angle(void) const { return ::simd_angle(*this); } + + /*! @abstract The axis the quaternion rotates about. */ + float3 axis(void) const { return ::simd_axis(*this); } + + /*! @abstract The length of the quaternion. */ + float length(void) const { return ::simd_length(*this); } + + /*! @abstract Act on the vector `v` by rotation. 
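+ * For example (an illustrative sketch): a quarter turn about the z-axis
+ * takes the x-axis to the y-axis,
+ *
+ *     simd::quatf q(1.57079632679f, simd::float3{0, 0, 1});
+ *     simd::float3 y = q(simd::float3{1, 0, 0});  // approximately {0, 1, 0}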
*/ + float3 operator()(const ::simd_float3 v) const { return ::simd_act(*this, v); } + }; + + static SIMD_CPPFUNC quatf operator+(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_add(p, q); } + static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_sub(p, q); } + static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p) { return ::simd_negate(p); } + static SIMD_CPPFUNC quatf operator*(const float r, const ::simd_quatf p) { return ::simd_mul(r, p); } + static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const float r) { return ::simd_mul(p, r); } + static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, q); } + static SIMD_CPPFUNC quatf operator/(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, ::simd_inverse(q)); } + static SIMD_INLINE SIMD_NODEBUG quatf operator+=(quatf &p, const ::simd_quatf q) { return p = p+q; } + static SIMD_INLINE SIMD_NODEBUG quatf operator-=(quatf &p, const ::simd_quatf q) { return p = p-q; } + static SIMD_INLINE SIMD_NODEBUG quatf operator*=(quatf &p, const float r) { return p = p*r; } + static SIMD_INLINE SIMD_NODEBUG quatf operator*=(quatf &p, const ::simd_quatf q) { return p = p*q; } + static SIMD_INLINE SIMD_NODEBUG quatf operator/=(quatf &p, const ::simd_quatf q) { return p = p/q; } + + /*! @abstract The conjugate of the quaternion `q`. */ + static SIMD_CPPFUNC quatf conjugate(const ::simd_quatf p) { return ::simd_conjugate(p); } + + /*! @abstract The (multiplicative) inverse of the quaternion `q`. */ + static SIMD_CPPFUNC quatf inverse(const ::simd_quatf p) { return ::simd_inverse(p); } + + /*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ + static SIMD_CPPFUNC float dot(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_dot(p, q); } + + /*! @abstract The unit quaternion obtained by normalizing `q`. */ + static SIMD_CPPFUNC quatf normalize(const ::simd_quatf p) { return ::simd_normalize(p); } + + /*! @abstract logarithm of the quaternion `q`. */ + static SIMD_CPPFUNC quatf log(const ::simd_quatf q) { return ::__tg_log(q); } + + /*! @abstract exponential map of quaternion `q`. */ + static SIMD_CPPFUNC quatf exp(const ::simd_quatf q) { return ::__tg_exp(q); } + + /*! @abstract Spherical linear interpolation along the shortest arc between + * quaternions `q0` and `q1`. */ + static SIMD_CPPFUNC quatf slerp(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp(p0, p1, t); } + + /*! @abstract Spherical linear interpolation along the longest arc between + * quaternions `q0` and `q1`. */ + static SIMD_CPPFUNC quatf slerp_longest(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp_longest(p0, p1, t); } + + /*! @abstract Interpolate between quaternions along a spherical cubic spline. + * + * @discussion The function interpolates between q1 and q2. q0 is the left + * endpoint of the previous interval, and q3 is the right endpoint of the next + * interval. Use this function to smoothly interpolate between a sequence of + * rotations. */ + static SIMD_CPPFUNC quatf spline(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_spline(p0, p1, p2, p3, t); } + + /*! @abstract Spherical cubic Bezier interpolation between quaternions. + * + * @discussion The function treats q0 ... q3 as control points and uses slerp + * in place of lerp in the De Casteljau algorithm. 
The endpoints of + * interpolation are thus q0 and q3, and the curve will not generally pass + * through q1 or q2. Note that the convex hull property of "standard" Bezier + * curves does not hold on the sphere. */ + static SIMD_CPPFUNC quatf bezier(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_bezier(p0, p1, p2, p3, t); } +} + +extern "C" { +#endif /* __cplusplus */ + +/* MARK: - float implementations */ + +#include <simd/math.h> +#include <simd/geometry.h> + +/* tg_promote is implementation gobbledygook that enables the compile-time + * dispatching in tgmath.h to work its magic. */ +static simd_quatf __attribute__((__overloadable__)) __tg_promote(simd_quatf); + +/*! @abstract Constructs a quaternion from imaginary and real parts. + * @discussion This function is hidden behind an underscore to avoid confusion + * with the angle-axis constructor. */ +static inline SIMD_CFUNC simd_quatf _simd_quaternion(simd_float3 imag, float real) { + return simd_quaternion(simd_make_float4(imag, real)); +} + +static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis) { + return _simd_quaternion(sin(angle/2) * axis, cos(angle/2)); +} + +static inline SIMD_CFUNC float simd_angle(simd_quatf q) { + return 2*atan2(simd_length(q.vector.xyz), q.vector.w); +} + +static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q) { + return simd_normalize(q.vector.xyz); +} + +static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q) { + return simd_quaternion(p.vector + q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q) { + return simd_quaternion(p.vector - q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q) { + #pragma STDC FP_CONTRACT ON + return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) + + p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) + + (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) + + p.vector.w * q.vector)); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a) { + return simd_quaternion(a * q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q) { + return simd_mul(q,a); +} + +static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q) { + return simd_quaternion(q.vector * (simd_float4){-1,-1,-1, 1}); +} + +static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q) { + return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector))); +} + +static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q) { + return simd_quaternion(-q.vector); +} + +static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q) { + return simd_dot(p.vector, q.vector); +} + +static inline SIMD_CFUNC float simd_length(simd_quatf q) { + return simd_length(q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q) { + float length_squared = simd_length_squared(q.vector); + if (length_squared == 0) { + return simd_quaternion((simd_float4){0,0,0,1}); + } + return simd_quaternion(q.vector * simd_rsqrt(length_squared)); +} + +#if defined __arm__ || defined __arm64__ +/*! @abstract Multiplies the vector `v` by the quaternion `q`. + * + * @discussion This IS NOT the action of `q` on `v` (i.e. this is not rotation + * by `q`); that operation is provided by `simd_act(q, v)`. This function is an + * implementation detail and you should not call it directly. 
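+ * (It exists so that, on arm targets, simd_act(q, v) below can be
+ * evaluated as q * (v * conjugate(q)) using two quaternion products.)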
It may be + * removed or modified in future versions of the simd module. */ +static inline SIMD_CFUNC simd_quatf _simd_mul_vq(simd_float3 v, simd_quatf q) { + #pragma STDC FP_CONTRACT ON + return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) + + v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) + + v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6)); +} +#endif + +static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v) { +#if defined __arm__ || defined __arm64__ + return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz; +#else + #pragma STDC FP_CONTRACT ON + simd_float3 t = 2*simd_cross(simd_imag(q),v); + return v + simd_real(q)*t + simd_cross(simd_imag(q), t); +#endif +} + +static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q) { + float real = __tg_log(simd_length_squared(q.vector))/2; + if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real); + simd_float3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q)); + return _simd_quaternion(imag, real); +} + +static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q) { + // angle is actually *twice* the angle of the rotation corresponding to + // the resulting quaternion, which is why we don't simply use the (angle, + // axis) constructor to generate `unit`. + float angle = simd_length(simd_imag(q)); + if (angle == 0) return _simd_quaternion(0, exp(simd_real(q))); + simd_float3 axis = simd_normalize(simd_imag(q)); + simd_quatf unit = _simd_quaternion(sin(angle)*axis, cosf(angle)); + return simd_mul(exp(simd_real(q)), unit); +} + +/*! @abstract Implementation detail of the `simd_quaternion(from, to)` + * initializer. + * + * @discussion Computes the quaternion rotation `from` to `to` if they are + * separated by less than 90 degrees. Not numerically stable for larger + * angles. This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static inline SIMD_CFUNC simd_quatf _simd_quaternion_reduced(simd_float3 from, simd_float3 to) { + simd_float3 half = simd_normalize(from + to); + return _simd_quaternion(simd_cross(from, half), simd_dot(from, half)); +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to) { + + // If the angle between from and to is not too big, we can compute the + // rotation accurately using a simple implementation. + if (simd_dot(from, to) >= 0) { + return _simd_quaternion_reduced(from, to); + } + + // Because from and to are more than 90 degrees apart, we compute the + // rotation in two stages (from -> half), (half -> to) to preserve numerical + // accuracy. + simd_float3 half = simd_normalize(from) + simd_normalize(to); + + if (simd_length_squared(half) <= 0x1p-46f) { + // half is nearly zero, so from and to point in nearly opposite directions + // and the rotation is numerically underspecified. Pick an axis orthogonal + // to the vectors, and use an angle of pi radians. + simd_float3 abs_from = simd_abs(from); + if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z) + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){1,0,0})), 0.f); + else if (abs_from.y <= abs_from.z) + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,1,0})), 0.f); + else + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,0,1})), 0.f); + } + + // Compute the two-step rotation. 
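+  // (Each factor now spans at most a quarter turn, so the reduced
+  // constructor is accurate for both halves.)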
+ half = simd_normalize(half); + return simd_mul(_simd_quaternion_reduced(from, half), + _simd_quaternion_reduced(half, to)); +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix) { + const simd_float3 *mat = matrix.columns; + float trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + float r = 2*sqrt(1 + trace); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + float rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix) { + const simd_float4 *mat = matrix.columns; + float trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + float r = 2*sqrt(1 + trace); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + float rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +/*! @abstract The angle between p and q interpreted as 4-dimensional vectors. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE float _simd_angle(simd_quatf p, simd_quatf q) { + return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector)); +} + +/*! @abstract sin(x)/x. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_CFUNC float _simd_sinc(float x) { + if (x == 0) return 1; + return sin(x)/x; +} + +/*! @abstract Spherical lerp between q0 and q1. + * + * @discussion This function may interpolate along either the longer or + * shorter path between q0 and q1; it is used as an implementation detail + * in `simd_slerp` and `simd_slerp_longest`; you should use those functions + * instead of calling this directly. 
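+ *
+ * The computation below follows the standard formula
+ *
+ *     slerp(q0, q1, t) = sin((1-t)a)/sin(a) q0 + sin(t a)/sin(a) q1,
+ *
+ * where a is the angle between q0 and q1, rewritten in terms of
+ * sinc(x) = sin(x)/x so that it remains well-defined as a -> 0, using
+ * sin(s a)/sin(a) = s sinc(s a)/sinc(a).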
*/ +static SIMD_NOINLINE simd_quatf _simd_slerp_internal(simd_quatf q0, simd_quatf q1, float t) { + float s = 1 - t; + float a = _simd_angle(q0, q1); + float r = simd_recip(_simd_sinc(a)); + return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector)); +} + +static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, q1, t); + return _simd_slerp_internal(q0, simd_negate(q1), t); +} + +static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, simd_negate(q1), t); + return _simd_slerp_internal(q0, q1, t); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatf _simd_intermediate(simd_quatf q0, simd_quatf q1, simd_quatf q2) { + simd_quatf p0 = __tg_log(simd_mul(q0, simd_inverse(q1))); + simd_quatf p2 = __tg_log(simd_mul(q2, simd_inverse(q1))); + return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2))))); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatf _simd_squad(simd_quatf q0, simd_quatf qa, simd_quatf qb, simd_quatf q1, float t) { + simd_quatf r0 = _simd_slerp_internal(q0, q1, t); + simd_quatf r1 = _simd_slerp_internal(qa, qb, t); + return _simd_slerp_internal(r0, r1, 2*t*(1 - t)); +} + +static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) { + simd_quatf qa = _simd_intermediate(q0, q1, q2); + simd_quatf qb = _simd_intermediate(q1, q2, q3); + return _simd_squad(q1, qa, qb, q2, t); +} + +static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) { + simd_quatf q01 = _simd_slerp_internal(q0, q1, t); + simd_quatf q12 = _simd_slerp_internal(q1, q2, t); + simd_quatf q23 = _simd_slerp_internal(q2, q3, t); + simd_quatf q012 = _simd_slerp_internal(q01, q12, t); + simd_quatf q123 = _simd_slerp_internal(q12, q23, t); + return _simd_slerp_internal(q012, q123, t); +} + +/* MARK: - C and Objective-C double interfaces */ + +/*! @abstract Constructs a quaternion from four scalar values. + * + * @param ix The first component of the imaginary (vector) part. + * @param iy The second component of the imaginary (vector) part. + * @param iz The third component of the imaginary (vector) part. + * + * @param r The real (scalar) part. */ +static inline SIMD_CFUNC simd_quatd simd_quaternion(double ix, double iy, double iz, double r) { + return (simd_quatd){ { ix, iy, iz, r } }; +} + +/*! @abstract Constructs a quaternion from an array of four scalars. + * + * @discussion Note that the imaginary part of the quaternion comes from + * array elements 0, 1, and 2, and the real part comes from element 3. */ +static inline SIMD_NONCONST simd_quatd simd_quaternion(const double xyzr[4]) { + return (simd_quatd){ *(const simd_packed_double4 *)xyzr }; +} + +/*! @abstract Constructs a quaternion from a four-element vector. + * + * @discussion Note that the imaginary (vector) part of the quaternion comes + * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from + * lane 3. 
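`simd_slerp` above negates `q1` whenever `simd_dot(q0, q1) < 0`; since `q` and `-q` encode the same rotation, the flip costs nothing and keeps the interpolation on the shorter of the two great-circle arcs. A small usage sketch (hypothetical driver code, assuming the headers import as `<simd/simd.h>`):

~~~
#include <simd/simd.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  simd_float3 axis = { 0, 1, 0 };
  simd_quatf q0 = simd_quaternion(0.0f, axis);           // identity
  simd_quatf q1 = simd_quaternion((float)M_PI_2, axis);  // 90 degrees
  for (float t = 0; t <= 1.0f; t += 0.25f) {
    // Expect the angle to grow linearly: 0, pi/8, pi/4, 3pi/8, pi/2.
    printf("t=%.2f angle=%f\n", t, simd_angle(simd_slerp(q0, q1, t)));
  }
  return 0;
}
~~~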
*/ +static inline SIMD_CFUNC simd_quatd simd_quaternion(simd_double4 xyzr) { + return (simd_quatd){ xyzr }; +} + +/*! @abstract Constructs a quaternion that rotates by `angle` radians about + * `axis`. */ +static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis); + +/*! @abstract Construct a quaternion that rotates from one vector to another. + * + * @param from A normalized three-element vector. + * @param to A normalized three-element vector. + * + * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and + * `to` point in opposite directions (to within machine precision), an + * arbitrary rotation axis is chosen, and the angle is pi radians. */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to); + +/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`. + * + * @discussion If `matrix` is not orthogonal with determinant 1, the result + * is undefined. */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix); + +/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`. + * + * @discussion The last row and column of the matrix are ignored. This + * function is equivalent to calling simd_quaternion with the upper-left 3x3 + * submatrix . */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix); + +/*! @abstract The real (scalar) part of the quaternion `q`. */ +static inline SIMD_CFUNC double simd_real(simd_quatd q) { + return q.vector.w; +} + +/*! @abstract The imaginary (vector) part of the quaternion `q`. */ +static inline SIMD_CFUNC simd_double3 simd_imag(simd_quatd q) { + return q.vector.xyz; +} + +/*! @abstract The angle (in radians) of rotation represented by `q`. */ +static inline SIMD_CFUNC double simd_angle(simd_quatd q); + +/*! @abstract The normalized axis (a 3-element vector) around which the + * action of the quaternion `q` rotates. */ +static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q); + +/*! @abstract The sum of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q); + +/*! @abstract The difference of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q); + +/*! @abstract The product of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q); + +/*! @abstract The conjugate of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q); + +/*! @abstract The (multiplicative) inverse of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q); + +/*! @abstract The negation (additive inverse) of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q); + +/*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ +static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q); + +/*! @abstract The length of the quaternion `q`. */ +static inline SIMD_CFUNC double simd_length(simd_quatd q); + +/*! @abstract The unit quaternion obtained by normalizing `q`. */ +static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q); + +/*! 
@abstract Rotates the vector `v` by the quaternion `q`. */
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v);
+
+/*! @abstract Logarithm of the quaternion `q`.
+ * @discussion Do not call this function directly; use `log(q)` instead.
+ *
+ * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where
+ * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector.
+ * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the
+ * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`.
+ *
+ * Note that this function is not robust against poorly-scaled non-unit
+ * quaternions, because it is primarily used for spline interpolation of
+ * unit quaternions. If you need to compute a robust logarithm of general
+ * quaternions, you can use the following approach:
+ *
+ *   scale = simd_reduce_max(simd_abs(q.vector));
+ *   logq = log(simd_recip(scale)*q);
+ *   logq.real += log(scale);
+ *   return logq; */
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q);
+
+/*! @abstract Inverse of `log( )`; the exponential map on quaternions.
+ * @discussion Do not call this function directly; use `exp(q)` instead. */
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q);
+
+/*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+/*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+#ifdef __cplusplus
+} /* extern "C" */
+/* MARK: - C++ double interfaces */
+
+namespace simd {
+  struct quatd : ::simd_quatd {
+    /*! @abstract The identity quaternion. */
+    quatd( ) : ::simd_quatd(::simd_quaternion((double4){0,0,0,1})) { }
+
+    /*! @abstract Constructs a C++ quaternion from a C quaternion. */
+    quatd(::simd_quatd q) : ::simd_quatd(q) { }
+
+    /*! @abstract Constructs a quaternion from components. */
+    quatd(double ix, double iy, double iz, double r) : ::simd_quatd(::simd_quaternion(ix, iy, iz, r)) { }
+
+    /*! @abstract Constructs a quaternion from an array of scalars. */
+    quatd(const double xyzr[4]) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
+    /*! @abstract Constructs a quaternion from a vector. */
+    quatd(double4 xyzr) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
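The C constructors documented above are clang-overloadable functions, so one name covers scalar, array, vector, and angle-axis input. A sketch showing three equivalent spellings of the same rotation (assumes clang and `<simd/simd.h>`; the expected-output comments are mine, not the header's):

~~~
#include <simd/simd.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  // 90 degrees about +z, spelled three ways.
  simd_quatd a = simd_quaternion(M_PI_2, (simd_double3){0, 0, 1});
  simd_quatd b = simd_quaternion(0.0, 0.0, sin(M_PI_4), cos(M_PI_4));
  double xyzr[4] = { 0.0, 0.0, sin(M_PI_4), cos(M_PI_4) };
  simd_quatd c = simd_quaternion(xyzr);

  // Each rotates +x to (approximately) +y.
  simd_double3 v = simd_act(a, (simd_double3){1, 0, 0});
  printf("%f %f %f\n", v.x, v.y, v.z);                               // ~0 1 0
  printf("dot(a,b)=%f dot(a,c)=%f\n", simd_dot(a, b), simd_dot(a, c)); // ~1 ~1
  return 0;
}
~~~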
+    /*! @abstract Quaternion representing rotation about `axis` by `angle`
+     * radians. */
+    quatd(double angle, double3 axis) : ::simd_quatd(::simd_quaternion(angle, axis)) { }
+
+    /*! @abstract Quaternion that rotates `from` into `to`. */
+    quatd(double3 from, double3 to) : ::simd_quatd(::simd_quaternion(from, to)) { }
+
+    /*! @abstract Constructs a quaternion from a rotation matrix. */
+    quatd(::simd_double3x3 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+    /*! @abstract Constructs a quaternion from a rotation matrix. */
+    quatd(::simd_double4x4 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+    /*! @abstract The real (scalar) part of the quaternion. */
+    double real(void) const { return ::simd_real(*this); }
+
+    /*! @abstract The imaginary (vector) part of the quaternion. */
+    double3 imag(void) const { return ::simd_imag(*this); }
+
+    /*! @abstract The angle the quaternion rotates by. */
+    double angle(void) const { return ::simd_angle(*this); }
+
+    /*! @abstract The axis the quaternion rotates about. */
+    double3 axis(void) const { return ::simd_axis(*this); }
+
+    /*! @abstract The length of the quaternion. */
+    double length(void) const { return ::simd_length(*this); }
+
+    /*! @abstract Act on the vector `v` by rotation. */
+    double3 operator()(const ::simd_double3 v) const { return ::simd_act(*this, v); }
+  };
+
+  static SIMD_CPPFUNC quatd operator+(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_add(p, q); }
+  static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_sub(p, q); }
+  static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p) { return ::simd_negate(p); }
+  static SIMD_CPPFUNC quatd operator*(const double r, const ::simd_quatd p) { return ::simd_mul(r, p); }
+  static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const double r) { return ::simd_mul(p, r); }
+  static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, q); }
+  static SIMD_CPPFUNC quatd operator/(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, ::simd_inverse(q)); }
+  static SIMD_INLINE SIMD_NODEBUG quatd operator+=(quatd &p, const ::simd_quatd q) { return p = p+q; }
+  static SIMD_INLINE SIMD_NODEBUG quatd operator-=(quatd &p, const ::simd_quatd q) { return p = p-q; }
+  static SIMD_INLINE SIMD_NODEBUG quatd operator*=(quatd &p, const double r) { return p = p*r; }
+  static SIMD_INLINE SIMD_NODEBUG quatd operator*=(quatd &p, const ::simd_quatd q) { return p = p*q; }
+  static SIMD_INLINE SIMD_NODEBUG quatd operator/=(quatd &p, const ::simd_quatd q) { return p = p/q; }
+
+  /*! @abstract The conjugate of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd conjugate(const ::simd_quatd p) { return ::simd_conjugate(p); }
+
+  /*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd inverse(const ::simd_quatd p) { return ::simd_inverse(p); }
+
+  /*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+   * four-dimensional vectors. */
+  static SIMD_CPPFUNC double dot(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_dot(p, q); }
+
+  /*! @abstract The unit quaternion obtained by normalizing `q`. */
+  static SIMD_CPPFUNC quatd normalize(const ::simd_quatd p) { return ::simd_normalize(p); }
+
+  /*! @abstract The logarithm of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd log(const ::simd_quatd q) { return ::__tg_log(q); }
+
+  /*! @abstract The exponential map of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd exp(const ::simd_quatd q) { return ::__tg_exp(q); }
+
+  /*! @abstract Spherical linear interpolation along the shortest arc between
+   * quaternions `q0` and `q1`. */
+  static SIMD_CPPFUNC quatd slerp(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp(p0, p1, t); }
+
+  /*! @abstract Spherical linear interpolation along the longest arc between
+   * quaternions `q0` and `q1`. */
+  static SIMD_CPPFUNC quatd slerp_longest(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp_longest(p0, p1, t); }
+
+  /*! @abstract Interpolate between quaternions along a spherical cubic spline.
+   *
+   * @discussion The function interpolates between q1 and q2. q0 is the left
+   * endpoint of the previous interval, and q3 is the right endpoint of the next
+   * interval. Use this function to smoothly interpolate between a sequence of
+   * rotations. */
+  static SIMD_CPPFUNC quatd spline(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_spline(p0, p1, p2, p3, t); }
+
+  /*! @abstract Spherical cubic Bezier interpolation between quaternions.
+   *
+   * @discussion The function treats q0 ... q3 as control points and uses slerp
+   * in place of lerp in the De Casteljau algorithm. The endpoints of
+   * interpolation are thus q0 and q3, and the curve will not generally pass
+   * through q1 or q2. Note that the convex hull property of "standard" Bezier
+   * curves does not hold on the sphere. */
+  static SIMD_CPPFUNC quatd bezier(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_bezier(p0, p1, p2, p3, t); }
+}
+
+extern "C" {
+#endif /* __cplusplus */
+
+/* MARK: - double implementations */
+
+#include
+#include
+
+/* tg_promote is implementation gobbledygook that enables the compile-time
+ * dispatching in tgmath.h to work its magic. */
+static simd_quatd __attribute__((__overloadable__)) __tg_promote(simd_quatd);
+
+/*! @abstract Constructs a quaternion from imaginary and real parts.
+ * @discussion This function is hidden behind an underscore to avoid confusion
+ * with the angle-axis constructor. */
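The `__tg_log` discussion earlier gives a rescaling recipe for a robust logarithm of arbitrarily scaled quaternions. Transcribed into plain C it might look like the hypothetical helper below; it calls the internal `__tg_log` name only because plain C lacks the `log(q)` spelling, so treat this as an illustration rather than supported API:

~~~
#include <simd/simd.h>
#include <stdio.h>

// Hypothetical helper following the recipe from the __tg_log discussion:
// factor out the largest |component| so the squared length can neither
// overflow nor underflow, then add log(scale) back onto the real part.
static simd_quatd robust_log(simd_quatd q) {
  double scale = simd_reduce_max(simd_abs(q.vector));
  simd_quatd logq = __tg_log(simd_mul(simd_recip(scale), q));
  logq.vector.w += __tg_log(scale);  // the real part lives in lane 3
  return logq;
}

int main(void) {
  // For this tiny quaternion, length_squared underflows to 0, so a naive
  // log would report -inf; the rescaled version gives -600*ln(2).
  simd_quatd tiny = simd_mul(0x1p-600, simd_quaternion(0.0, 0.0, 0.0, 1.0));
  printf("real = %f\n", simd_real(robust_log(tiny)));  // ~ -415.888
  return 0;
}
~~~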
+static inline SIMD_CFUNC simd_quatd _simd_quaternion(simd_double3 imag, double real) {
+  return simd_quaternion(simd_make_double4(imag, real));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis) {
+  return _simd_quaternion(sin(angle/2) * axis, cos(angle/2));
+}
+
+static inline SIMD_CFUNC double simd_angle(simd_quatd q) {
+  return 2*atan2(simd_length(q.vector.xyz), q.vector.w);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q) {
+  return simd_normalize(q.vector.xyz);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q) {
+  return simd_quaternion(p.vector + q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q) {
+  return simd_quaternion(p.vector - q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q) {
+  #pragma STDC FP_CONTRACT ON
+  return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+                          p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) +
+                         (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) +
+                          p.vector.w * q.vector));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a) {
+  return simd_quaternion(a * q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q) {
+  return simd_mul(q,a);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q) {
+  return simd_quaternion(q.vector * (simd_double4){-1,-1,-1, 1});
+}
+
+static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q) {
+  return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector)));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q) {
+  return simd_quaternion(-q.vector);
+}
+
+static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q) {
+  return simd_dot(p.vector, q.vector);
+}
+
+static inline SIMD_CFUNC double simd_length(simd_quatd q) {
+  return simd_length(q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q) {
+  double length_squared = simd_length_squared(q.vector);
+  if (length_squared == 0) {
+    return simd_quaternion((simd_double4){0,0,0,1});
+  }
+  return simd_quaternion(q.vector * simd_rsqrt(length_squared));
+}
+
+#if defined __arm__ || defined __arm64__
+/*! @abstract Multiplies the vector `v` by the quaternion `q`.
+ *
+ * @discussion This IS NOT the action of `q` on `v` (i.e. this is not rotation
+ * by `q`). That operation is provided by `simd_act(q, v)`. This function is an
+ * implementation detail and you should not call it directly. It may be
+ * removed or modified in future versions of the simd module. */
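In `simd_mul` above, each `__builtin_shufflevector(q.vector, -q.vector, ...)` selects lanes from the concatenation of `q` and `-q`, so every term arrives with the sign the Hamilton product requires (indices `3,6,1,4`, for example, yield `(w, -z, y, -x)`). A quick cross-check against the textbook formula, assuming `<simd/simd.h>` is available (illustrative test code only):

~~~
#include <simd/simd.h>
#include <stdio.h>

// Textbook Hamilton product: (pw + pv)(qw + qv) =
//   pw*qw - pv.qv  +  pw*qv + qw*pv + pv x qv
static simd_quatd hamilton(simd_quatd p, simd_quatd q) {
  simd_double3 pv = simd_imag(p), qv = simd_imag(q);
  double       pw = simd_real(p), qw = simd_real(q);
  return simd_quaternion(simd_make_double4(
      pw*qv + qw*pv + simd_cross(pv, qv),   // imaginary part
      pw*qw - simd_dot(pv, qv)));           // real part
}

int main(void) {
  simd_quatd p = simd_quaternion(0.3, simd_normalize((simd_double3){1, 2, 3}));
  simd_quatd q = simd_quaternion(1.1, simd_normalize((simd_double3){-2, 0, 1}));
  simd_quatd d = simd_sub(simd_mul(p, q), hamilton(p, q));
  printf("max |diff| = %e\n", simd_reduce_max(simd_abs(d.vector)));  // ~1e-16
  return 0;
}
~~~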
+static inline SIMD_CFUNC simd_quatd _simd_mul_vq(simd_double3 v, simd_quatd q) {
+  #pragma STDC FP_CONTRACT ON
+  return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+                         v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) +
+                         v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v) {
+#if defined __arm__ || defined __arm64__
+  return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz;
+#else
+  #pragma STDC FP_CONTRACT ON
+  simd_double3 t = 2*simd_cross(simd_imag(q),v);
+  return v + simd_real(q)*t + simd_cross(simd_imag(q), t);
+#endif
+}
+
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q) {
+  double real = __tg_log(simd_length_squared(q.vector))/2;
+  if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real);
+  simd_double3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q));
+  return _simd_quaternion(imag, real);
+}
+
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q) {
+  // angle is actually *twice* the angle of the rotation corresponding to
+  // the resulting quaternion, which is why we don't simply use the (angle,
+  // axis) constructor to generate `unit`.
+  double angle = simd_length(simd_imag(q));
+  if (angle == 0) return _simd_quaternion(0, exp(simd_real(q)));
+  simd_double3 axis = simd_normalize(simd_imag(q));
+  simd_quatd unit = _simd_quaternion(sin(angle)*axis, cos(angle));
+  return simd_mul(exp(simd_real(q)), unit);
+}
+
+/*! @abstract Implementation detail of the `simd_quaternion(from, to)`
+ * initializer.
+ *
+ * @discussion Computes the quaternion rotation `from` to `to` if they are
+ * separated by less than 90 degrees. Not numerically stable for larger
+ * angles. This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static inline SIMD_CFUNC simd_quatd _simd_quaternion_reduced(simd_double3 from, simd_double3 to) {
+  simd_double3 half = simd_normalize(from + to);
+  return _simd_quaternion(simd_cross(from, half), simd_dot(from, half));
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to) {
+
+  // If the angle between from and to is not too big, we can compute the
+  // rotation accurately using a simple implementation.
+  if (simd_dot(from, to) >= 0) {
+    return _simd_quaternion_reduced(from, to);
+  }
+
+  // Because from and to are more than 90 degrees apart, we compute the
+  // rotation in two stages (from -> half), (half -> to) to preserve numerical
+  // accuracy.
+  simd_double3 half = simd_normalize(from) + simd_normalize(to);
+
+  if (simd_length_squared(half) <= 0x1p-104) {
+    // half is nearly zero, so from and to point in nearly opposite directions
+    // and the rotation is numerically underspecified. Pick an axis orthogonal
+    // to the vectors, and use an angle of pi radians.
+    simd_double3 abs_from = simd_abs(from);
+    if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z)
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){1,0,0})), 0.f);
+    else if (abs_from.y <= abs_from.z)
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,1,0})), 0.f);
+    else
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,0,1})), 0.f);
+  }
+
+  // Compute the two-step rotation.
*/ + half = simd_normalize(half); + return simd_mul(_simd_quaternion_reduced(from, half), + _simd_quaternion_reduced(half, to)); +} + +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix) { + const simd_double3 *mat = matrix.columns; + double trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + double r = 2*sqrt(1 + trace); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + double rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix) { + const simd_double4 *mat = matrix.columns; + double trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + double r = 2*sqrt(1 + trace); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + double rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + double rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +/*! @abstract The angle between p and q interpreted as 4-dimensional vectors. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE double _simd_angle(simd_quatd p, simd_quatd q) { + return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector)); +} + +/*! @abstract sin(x)/x. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_CFUNC double _simd_sinc(double x) { + if (x == 0) return 1; + return sin(x)/x; +} + +/*! @abstract Spherical lerp between q0 and q1. + * + * @discussion This function may interpolate along either the longer or + * shorter path between q0 and q1; it is used as an implementation detail + * in `simd_slerp` and `simd_slerp_longest`; you should use those functions + * instead of calling this directly. 
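The reason `_simd_slerp_internal` is phrased in terms of `_simd_sinc` rather than raw sines: with sinc(x) = sin(x)/x, the classical slerp weights can be rewritten so they stay finite as the angle `a` goes to zero. In LaTeX form:

~~~
\frac{\sin\bigl((1-t)\,a\bigr)}{\sin a}
  = \frac{(1-t)\,\operatorname{sinc}\bigl((1-t)\,a\bigr)}{\operatorname{sinc} a},
\qquad
\frac{\sin(t\,a)}{\sin a}
  = \frac{t\,\operatorname{sinc}(t\,a)}{\operatorname{sinc} a}.
~~~

Both right-hand sides tend smoothly to `1 - t` and `t` as `a -> 0`, and they are exactly what the implementation computes: `r = 1/sinc(a)` once, then weights `sinc(s*a)*r*s` and `sinc(t*a)*r*t` with `s = 1 - t`.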
*/ +static SIMD_NOINLINE simd_quatd _simd_slerp_internal(simd_quatd q0, simd_quatd q1, double t) { + double s = 1 - t; + double a = _simd_angle(q0, q1); + double r = simd_recip(_simd_sinc(a)); + return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector)); +} + +static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, q1, t); + return _simd_slerp_internal(q0, simd_negate(q1), t); +} + +static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, simd_negate(q1), t); + return _simd_slerp_internal(q0, q1, t); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatd _simd_intermediate(simd_quatd q0, simd_quatd q1, simd_quatd q2) { + simd_quatd p0 = __tg_log(simd_mul(q0, simd_inverse(q1))); + simd_quatd p2 = __tg_log(simd_mul(q2, simd_inverse(q1))); + return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2))))); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatd _simd_squad(simd_quatd q0, simd_quatd qa, simd_quatd qb, simd_quatd q1, double t) { + simd_quatd r0 = _simd_slerp_internal(q0, q1, t); + simd_quatd r1 = _simd_slerp_internal(qa, qb, t); + return _simd_slerp_internal(r0, r1, 2*t*(1 - t)); +} + +static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) { + simd_quatd qa = _simd_intermediate(q0, q1, q2); + simd_quatd qb = _simd_intermediate(q1, q2, q3); + return _simd_squad(q1, qa, qb, q2, t); +} + +static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) { + simd_quatd q01 = _simd_slerp_internal(q0, q1, t); + simd_quatd q12 = _simd_slerp_internal(q1, q2, t); + simd_quatd q23 = _simd_slerp_internal(q2, q3, t); + simd_quatd q012 = _simd_slerp_internal(q01, q12, t); + simd_quatd q123 = _simd_slerp_internal(q12, q23, t); + return _simd_slerp_internal(q012, q123, t); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_QUATERNIONS */ diff --git a/vfsoverlay/simd.h b/vfsoverlay/simd.h new file mode 100644 index 00000000..fd566bf4 --- /dev/null +++ b/vfsoverlay/simd.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2014 Apple, Inc. All rights reserved. + * + * This header provides small vector (simd) and matrix types, and basic + * arithmetic and mathematical functions for them. The vast majority of these + * operations are implemented as header inlines, as they can be performed + * using just a few instructions on most processors. + * + * These functions are broken into two groups; vector and matrix. This header + * includes all of them, but these may also be included separately. Consult + * these two headers for detailed documentation of what types and operations + * are available. 
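With the double implementations complete, `simd_spline` is the piece most application code touches for keyframe animation: it interpolates the middle pair of keys while the outer keys only shape the tangents. A sketch (float flavor, coaxial keys so the angles are easy to read; assumes `<simd/simd.h>`):

~~~
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
  simd_float3 z = { 0, 0, 1 };
  // The segment interpolated is k1 -> k2; k0 and k3 are the neighboring
  // keys that shape the tangents at the segment's endpoints.
  simd_quatf k0 = simd_quaternion(0.0f, z), k1 = simd_quaternion(0.5f, z);
  simd_quatf k2 = simd_quaternion(1.5f, z), k3 = simd_quaternion(2.0f, z);
  for (float t = 0; t <= 1.0f; t += 0.25f) {
    simd_quatf q = simd_spline(k0, k1, k2, k3, t);
    printf("t=%.2f angle=%f\n", t, simd_angle(q));  // eases from 0.5 to 1.5
  }
  return 0;
}
~~~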
+ */
+
+#ifndef __SIMD_HEADER__
+#define __SIMD_HEADER__
+
+#if __has_include()
+#include
+REALTIME_SAFE_BEGIN
+#endif
+
+#include <simd/vector.h>
+#include <simd/matrix.h>
+#include <simd/quaternion.h>
+
+#if __has_include()
+REALTIME_SAFE_END
+#endif
+
+#endif
diff --git a/vfsoverlay/types.h b/vfsoverlay/types.h
new file mode 100644
index 00000000..e0944670
--- /dev/null
+++ b/vfsoverlay/types.h
@@ -0,0 +1,128 @@
+/*! @header
+ * @copyright 2015-2016 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_TYPES
+#define SIMD_TYPES
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+
+/*! @group Matrices
+ * @discussion
+ * This header defines nine matrix types for each of float and double, which
+ * are intended for use together with the vector types defined in
+ * <simd/vector_types.h>.
+ *
+ * For compatibility with common graphics libraries, these matrices are stored
+ * in column-major order, and implemented as arrays of column vectors.
+ * Column-major storage order may seem a little strange if you aren't used to
+ * it, but for most usage the memory layout of the matrices shouldn't matter
+ * at all; instead you should think of matrices as abstract mathematical
+ * objects that you use to perform arithmetic without worrying about the
+ * details of the underlying representation.
+ *
+ * WARNING: vectors of length three are internally represented as length four
+ * vectors with one element of padding (for alignment purposes). This means
+ * that when a floatNx3 or doubleNx3 is viewed as a vector, it appears to
+ * have 4*N elements instead of the expected 3*N (with one padding element
+ * at the end of each column). The matrix elements are laid out in memory
+ * as follows:
+ *
+ *   { 0, 1, 2, x, 3, 4, 5, x, ... }
+ *
+ * (where the scalar indices used above indicate the conceptual column-
+ * major storage order). If you aren't monkeying around with the internal
+ * storage details of matrices, you don't need to worry about this at all.
+ * Consider this yet another good reason to avoid doing so. */
+
+/*! @abstract A matrix with 2 rows and 2 columns. */
+typedef struct { simd_float2 columns[2]; } simd_float2x2;
+
+/*! @abstract A matrix with 2 rows and 3 columns. */
+typedef struct { simd_float2 columns[3]; } simd_float3x2;
+
+/*! @abstract A matrix with 2 rows and 4 columns. */
+typedef struct { simd_float2 columns[4]; } simd_float4x2;
+
+/*! @abstract A matrix with 3 rows and 2 columns. */
+typedef struct { simd_float3 columns[2]; } simd_float2x3;
+
+/*! @abstract A matrix with 3 rows and 3 columns. */
+typedef struct { simd_float3 columns[3]; } simd_float3x3;
+
+/*! @abstract A matrix with 3 rows and 4 columns. */
+typedef struct { simd_float3 columns[4]; } simd_float4x3;
+
+/*! @abstract A matrix with 4 rows and 2 columns. */
+typedef struct { simd_float4 columns[2]; } simd_float2x4;
+
+/*! @abstract A matrix with 4 rows and 3 columns. */
+typedef struct { simd_float4 columns[3]; } simd_float3x4;
+
+/*! @abstract A matrix with 4 rows and 4 columns. */
+typedef struct { simd_float4 columns[4]; } simd_float4x4;
+
+/*! @abstract A matrix with 2 rows and 2 columns. */
+typedef struct { simd_double2 columns[2]; } simd_double2x2;
+
+/*! @abstract A matrix with 2 rows and 3 columns. */
+typedef struct { simd_double2 columns[3]; } simd_double3x2;
+
+/*! @abstract A matrix with 2 rows and 4 columns. */
+typedef struct { simd_double2 columns[4]; } simd_double4x2;
+
+/*! @abstract A matrix with 3 rows and 2 columns. */
+typedef struct { simd_double3 columns[2]; } simd_double2x3;
+
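The padding warning above is easy to observe directly; each three-lane column is stored in four lanes (a quick check, assuming `<simd/simd.h>` is available):

~~~
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
  printf("sizeof(simd_float3)   = %zu\n", sizeof(simd_float3));    // 16, not 12
  printf("sizeof(simd_float3x3) = %zu\n", sizeof(simd_float3x3));  // 48, not 36
  printf("sizeof(simd_float2x3) = %zu\n", sizeof(simd_float2x3));  // 32: two padded columns
  return 0;
}
~~~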
+/*! @abstract A matrix with 3 rows and 3 columns. */
+typedef struct { simd_double3 columns[3]; } simd_double3x3;
+
+/*! @abstract A matrix with 3 rows and 4 columns. */
+typedef struct { simd_double3 columns[4]; } simd_double4x3;
+
+/*! @abstract A matrix with 4 rows and 2 columns. */
+typedef struct { simd_double4 columns[2]; } simd_double2x4;
+
+/*! @abstract A matrix with 4 rows and 3 columns. */
+typedef struct { simd_double4 columns[3]; } simd_double3x4;
+
+/*! @abstract A matrix with 4 rows and 4 columns. */
+typedef struct { simd_double4 columns[4]; } simd_double4x4;
+
+
+/*! @group Quaternions
+ * @discussion Unlike vectors, quaternions are not raw clang extended-vector
+ * types, because if they were you'd be able to intermix them with vectors
+ * in arithmetic operations freely, but the arithmetic would not do what you
+ * want it to do (it would simply perform the arithmetic operation
+ * componentwise on the quaternion and vector).
+ *
+ * Quaternions aren't unions in C/Obj-C, because then the C++ types couldn't
+ * inherit from the C types, which would make intermixing rather painful (you
+ * can't inherit from a union). This means that we can't provide nice member
+ * access like .real and .imag; you need to use functions to access the pieces
+ * of a quaternion instead.
+ *
+ * This also means that you need to use functions instead of operators to do
+ * arithmetic with quaternions in C and Obj-C. In C++, we are able to provide
+ * operator overloads for arithmetic.
+ *
+ * Internally, a quaternion is represented as a vector of four elements. The
+ * first three elements are the "imaginary" (or "vector") part of the
+ * quaternion, and the last element is the "real" (or "scalar") part. As with
+ * everything simd, you will generally get better performance if you avoid
+ * using the internal storage details of the type, and instead treat these
+ * quaternions as abstract mathematical objects once they are created.
+ *
+ * While the C types are defined here, the operations on quaternions and the
+ * C++ quaternion types are defined in <simd/quaternion.h>. */
+
+/*! @abstract A single-precision quaternion. */
+typedef struct { simd_float4 vector; } simd_quatf;
+
+/*! @abstract A double-precision quaternion. */
+typedef struct { simd_double4 vector; } simd_quatd;
+
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_TYPES */
diff --git a/vfsoverlay/vector.h b/vfsoverlay/vector.h
new file mode 100644
index 00000000..7ab8f2ad
--- /dev/null
+++ b/vfsoverlay/vector.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2014 Apple, Inc. All rights reserved.
+ *
+ * This header provides small vector (simd) types and basic arithmetic and
+ * math functions that operate on them.
+ *
+ * A wide assortment of vector types are provided in <simd/vector_types.h>,
+ * which is included by this header. The most important (as far as the rest
+ * of this library is concerned) are vector_floatN (where N is 2, 3, 4, 8, or
+ * 16), and vector_doubleN (where N is 2, 3, 4, or 8).
+ *
+ * All of the vector types are based on what clang calls "OpenCL vectors",
+ * defined with the __ext_vector_type__ attribute. Many C operators "just
+ * work" with these types, so it is not necessary to make function calls
+ * to do basic arithmetic:
+ *
+ *   simd_float4 x, y;
+ *   x = x + y; // vector sum of x and y.
+ *
+ * Scalar values are implicitly promoted to vectors (with a "splat"), so it
+ * is possible to easily write expressions involving scalars as well:
+ *
+ *   simd_float4 x;
+ *   x = 2*x; // scale x by 2.
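Because the C quaternion types just defined are structs wrapping a four-lane vector, components are reached through `simd_real`/`simd_imag` (or the `.vector` member) rather than `.real`/`.imag`. For instance (illustrative snippet, assuming `<simd/simd.h>`):

~~~
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
  simd_quatf q = simd_quaternion(1.0f, 2.0f, 3.0f, 4.0f);  // (ix, iy, iz, r)
  simd_float3 im = simd_imag(q);  // lanes 0..2
  float re = simd_real(q);        // lane 3; same storage as q.vector.w
  printf("imag = (%f, %f, %f), real = %f\n", im.x, im.y, im.z, re);
  return 0;
}
~~~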
+ *
+ * Besides the basic operations provided by the compiler, this header provides
+ * a set of mathematical and geometric primitives for use with these types.
+ * In C and Objective-C, these functions are prefixed with vector_; in C++,
+ * unprefixed names are available within the simd:: namespace.
+ *
+ *   simd_float3 x, y;
+ *   vector_max(x,y)     // elementwise maximum of x and y
+ *   fabs(x)             // same as vector_abs(x)
+ *   vector_clamp(x,0,1) // x clamped to the range [0,1]. This has no
+ *                       // standard-library analogue, so there is no
+ *                       // alternate name.
+ *
+ * Matrix and matrix-vector operations are also available in <simd/matrix.h>.
+ */
+
+#ifndef __SIMD_VECTOR_HEADER__
+#define __SIMD_VECTOR_HEADER__
+
+#include <simd/vector_types.h>
+#include <simd/packed.h>
+#include <simd/vector_make.h>
+#include <simd/logic.h>
+#include <simd/math.h>
+#include <simd/common.h>
+#include <simd/geometry.h>
+#include <simd/conversion.h>
+
+#endif
diff --git a/vfsoverlay/vector_make.h b/vfsoverlay/vector_make.h
new file mode 100644
index 00000000..73b95fa6
--- /dev/null
+++ b/vfsoverlay/vector_make.h
@@ -0,0 +1,7874 @@
+/*! @header
+ * This header defines functions for constructing, extending, and truncating
+ * simd vector types.
+ *
+ * For each vector type `simd_typeN` supported by <simd/vector_types.h>, the
+ * following constructors are provided:
+ *
+ * ~~~
+ * simd_typeN simd_make_typeN(type other);
+ * simd_typeN simd_make_typeN(simd_typeM other);
+ * ~~~
+ * For the scalar-input version, or if M < N, these functions zero-extend
+ * `other` to produce a wider vector. If M == N, `other` is passed through
+ * unmodified. If `M > N`, `other` is truncated to form the result.
+ *
+ * ~~~
+ * simd_typeN simd_make_typeN_undef(type other);
+ * simd_typeN simd_make_typeN_undef(simd_typeM other);
+ * ~~~
+ * These functions are only available for M < N and for scalar inputs. They
+ * extend `other` to produce a wider vector where the contents of the newly-
+ * formed lanes are undefined.
+ *
+ * In addition, if N is 2, 3, or 4, the following constructors are available:
+ * ~~~
+ * simd_make_typeN(parts ...)
+ * ~~~
+ * where parts is a list of scalars and smaller vectors such that the sum of
+ * the number of lanes in the arguments is equal to N. For example, a
+ * `simd_float3` can be constructed from three `floats`, or a `float` and a
+ * `simd_float2` in any order:
+ * ~~~
+ * simd_float2 ab = { 1, 2 };
+ * simd_float3 vector = simd_make_float3(ab, 3);
+ * ~~~
+ *
+ * In C++ the above functions are templated in the simd:: namespace.
+ *
+ *   C++ Function                    Equivalent C Function
+ *   -------------------------------------------------------------------
+ *   simd::make<typeN>(x ...)        simd_make_typeN(x ...)
+ *   simd::make_undef<typeN>(x ...)  simd_make_typeN_undef(x ...)
+ *
+ * In addition, a templated Vector struct is available for writing
+ * templated code based on the scalar type.
+ *
+ *   template <typename ScalarType, size_t count> struct simd::Vector {
+ *     // static const size_t count
+ *     // typedef scalar_t
+ *     // typedef type
+ *     // typedef packed_t
+ *   };
+ *
+ * Look up the equivalent Vector struct according to typeN:
+ *
+ *   template <typename typeN> struct simd::get_traits
+ *   {
+ *     // using type = Vector<ScalarType, count>;
+ *   };
+ *
+ * This is commonly used to get the type traits of typeN, so a helper type,
+ * namely traits, is available to query the type traits easily.
+ *
+ *   simd::traits<typeN>::count
+ *   simd::traits<typeN>::scalar_t
+ *
+ * @copyright 2014-2016 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_VECTOR_CONSTRUCTORS
+#define SIMD_VECTOR_CONSTRUCTORS
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
@abstract Concatenates `x` and `y` to form a vector of two 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(char x, char y) { + simd_char2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(char other) { + simd_char2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2_undef(char other) { + simd_char2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char16 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char32 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char2 simd_make_char2(simd_char64 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(char x, char y, char z) { + simd_char3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(char x, simd_char2 yz) { + simd_char3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char2 xy, char z) { + simd_char3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(char other) { + simd_char3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. 
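The same constructor family shown here for `char` vectors exists for every element type; in practice the float versions are the ones most code reaches for. A sketch of the concatenate / zero-extend / undef / truncate variants (assumes `<simd/simd.h>`):

~~~
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
  simd_float2 ab = { 1, 2 };
  simd_float3 v = simd_make_float3(ab, 3);    // concatenate: (1, 2, 3)
  simd_float4 z = simd_make_float4(v);        // zero-extend: (1, 2, 3, 0)
  simd_float4 u = simd_make_float4_undef(v);  // lane 3 left unspecified
  simd_float2 t = simd_make_float2(z);        // truncate: (1, 2)
  printf("z = (%f, %f, %f, %f), t = (%f, %f)\n", z.x, z.y, z.z, z.w, t.x, t.y);
  (void)u;  // never read the undefined lane
  return 0;
}
~~~

The `_undef` variants exist purely as an optimization: skipping the zero-fill lets the compiler leave the new lanes as whatever was already in the register.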
*/ +static inline SIMD_CFUNC simd_char3 simd_make_char3_undef(char other) { + simd_char3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char2 other) { + simd_char3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3_undef(simd_char2 other) { + simd_char3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char16 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char32 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char3 simd_make_char3(simd_char64 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 8-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(char x, char y, char z, char w) { + simd_char4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(char x, char y, simd_char2 zw) { + simd_char4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(char x, simd_char2 yz, char w) { + simd_char4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char2 xy, char z, char w) { + simd_char4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(char x, simd_char3 yzw) { + simd_char4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 8-bit + * signed (twos-complement) integers. 
*/ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char2 xy, simd_char2 zw) { + simd_char4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char3 xyz, char w) { + simd_char4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(char other) { + simd_char4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4_undef(char other) { + simd_char4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char2 other) { + simd_char4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4_undef(simd_char2 other) { + simd_char4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char3 other) { + simd_char4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4_undef(simd_char3 other) { + simd_char4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char16 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char32 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_char4 simd_make_char4(simd_char64 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char4 lo, simd_char4 hi) { + simd_char8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. 
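Wider vectors are assembled from, and split back into, halves; the `.lo`/`.hi` lane groups used by these constructors are ordinary clang extended-vector accessors. For example (illustrative snippet, assuming `<simd/simd.h>`):

~~~
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
  simd_char8  lo = { 0, 1, 2, 3, 4, 5, 6, 7 };
  simd_char8  hi = { 8, 9, 10, 11, 12, 13, 14, 15 };
  simd_char16 v = simd_make_char16(lo, hi);  // concatenate the halves
  simd_char8  back = simd_make_char8(v);     // truncation keeps v.lo
  printf("v.x=%d v.hi.x=%d back.y=%d\n", v.x, v.hi.x, back.y);  // 0 8 1
  return 0;
}
~~~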
*/ +static inline SIMD_CFUNC simd_char8 simd_make_char8(char other) { + simd_char8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8_undef(char other) { + simd_char8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char2 other) { + simd_char8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8_undef(simd_char2 other) { + simd_char8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char3 other) { + simd_char8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8_undef(simd_char3 other) { + simd_char8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char4 other) { + simd_char8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8_undef(simd_char4 other) { + simd_char8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char16 other) { + return simd_make_char8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char32 other) { + return simd_make_char8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char8 simd_make_char8(simd_char64 other) { + return simd_make_char8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char8 lo, simd_char8 hi) { + simd_char16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. 
*/ +static inline SIMD_CFUNC simd_char16 simd_make_char16(char other) { + simd_char16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16_undef(char other) { + simd_char16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char2 other) { + simd_char16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16_undef(simd_char2 other) { + simd_char16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char3 other) { + simd_char16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16_undef(simd_char3 other) { + simd_char16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char4 other) { + simd_char16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16_undef(simd_char4 other) { + simd_char16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char8 other) { + simd_char16 result = 0; + result.lo = simd_make_char8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16_undef(simd_char8 other) { + simd_char16 result; + result.lo = simd_make_char8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char16 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char32 other) { + return simd_make_char16(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of sixteen 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char16 simd_make_char16(simd_char64 other) { + return simd_make_char16(other.lo); +} + +/*! 
@abstract Concatenates `lo` and `hi` to form a vector of thirty-two + * 8-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char16 lo, simd_char16 hi) { + simd_char32 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(char other) { + simd_char32 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(char other) { + simd_char32 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char2 other) { + simd_char32 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(simd_char2 other) { + simd_char32 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char3 other) { + simd_char32 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(simd_char3 other) { + simd_char32 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char4 other) { + simd_char32 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(simd_char4 other) { + simd_char32 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char8 other) { + simd_char32 result = 0; + result.lo = simd_make_char16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(simd_char8 other) { + simd_char32 result; + result.lo = simd_make_char16(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char16 other) { + simd_char32 result = 0; + result.lo = simd_make_char16(other); + return result; +} + +/*! 
@abstract Extends `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32_undef(simd_char16 other) { + simd_char32 result; + result.lo = simd_make_char16(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char32 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of thirty-two 8-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char32 simd_make_char32(simd_char64 other) { + return simd_make_char32(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixty-four + * 8-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char32 lo, simd_char32 hi) { + simd_char64 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(char other) { + simd_char64 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(char other) { + simd_char64 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char2 other) { + simd_char64 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char2 other) { + simd_char64 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char3 other) { + simd_char64 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char3 other) { + simd_char64 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char4 other) { + simd_char64 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char4 other) { + simd_char64 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. 
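+ * (A composition example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * the widest vectors are always two halves glued together. */
+static inline simd_char64 example_concat_char32(simd_char32 lo, simd_char32 hi) {
+  return simd_make_char64(lo, hi);  /* lo -> lanes 0..31, hi -> lanes 32..63 */
+}
+
+/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit
+ * signed (twos-complement) integers.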
*/ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char8 other) { + simd_char64 result = 0; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char8 other) { + simd_char64 result; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char16 other) { + simd_char64 result = 0; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char16 other) { + simd_char64 result; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char32 other) { + simd_char64 result = 0; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64_undef(simd_char32 other) { + simd_char64 result; + result.lo = simd_make_char32(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_char64 simd_make_char64(simd_char64 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(unsigned char x, unsigned char y) { + simd_uchar2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(unsigned char other) { + simd_uchar2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2_undef(unsigned char other) { + simd_uchar2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. 
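+ * (A truncation example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * truncation keeps the lowest-indexed lanes of the source vector. */
+static inline simd_uchar2 example_first_two(simd_uchar4 rgba) {
+  return simd_make_uchar2(rgba);  /* equivalent to rgba.xy */
+}
+
+/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned
+ * integers.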
*/ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar16 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar32 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar2 simd_make_uchar2(simd_uchar64 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(unsigned char x, unsigned char y, unsigned char z) { + simd_uchar3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(unsigned char x, simd_uchar2 yz) { + simd_uchar3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar2 xy, unsigned char z) { + simd_uchar3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(unsigned char other) { + simd_uchar3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3_undef(unsigned char other) { + simd_uchar3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar2 other) { + simd_uchar3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3_undef(simd_uchar2 other) { + simd_uchar3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar16 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit unsigned + * integers. 
*/ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar32 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar3 simd_make_uchar3(simd_uchar64 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 8-bit unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { + simd_uchar4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(unsigned char x, unsigned char y, simd_uchar2 zw) { + simd_uchar4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(unsigned char x, simd_uchar2 yz, unsigned char w) { + simd_uchar4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar2 xy, unsigned char z, unsigned char w) { + simd_uchar4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(unsigned char x, simd_uchar3 yzw) { + simd_uchar4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar2 xy, simd_uchar2 zw) { + simd_uchar4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar3 xyz, unsigned char w) { + simd_uchar4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(unsigned char other) { + simd_uchar4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4_undef(unsigned char other) { + simd_uchar4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar2 other) { + simd_uchar4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4_undef(simd_uchar2 other) { + simd_uchar4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 8-bit unsigned + * integers. 
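+ * (An assembly example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * the four-lane constructors accept any mix of scalars and sub-vectors that
+ * totals four lanes, so a pixel can be assembled either way. */
+static inline simd_uchar4 example_rgb_plus_alpha(simd_uchar3 rgb, unsigned char a) {
+  return simd_make_uchar4(rgb, a);   /* rgb -> lanes 0..2, a -> lane 3 */
+}
+static inline simd_uchar4 example_from_pairs(simd_uchar2 rg, simd_uchar2 ba) {
+  return simd_make_uchar4(rg, ba);   /* rg -> lanes 0..1, ba -> lanes 2..3 */
+}
+
+/*! @abstract Zero-extends `other` to form a vector of four 8-bit unsigned
+ * integers.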
*/ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar3 other) { + simd_uchar4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4_undef(simd_uchar3 other) { + simd_uchar4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar16 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar32 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar4 simd_make_uchar4(simd_uchar64 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar4 lo, simd_uchar4 hi) { + simd_uchar8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(unsigned char other) { + simd_uchar8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8_undef(unsigned char other) { + simd_uchar8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar2 other) { + simd_uchar8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8_undef(simd_uchar2 other) { + simd_uchar8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar3 other) { + simd_uchar8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8_undef(simd_uchar3 other) { + simd_uchar8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 8-bit unsigned + * integers. 
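+ * (A zero-extension example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * passing a lone scalar places it in lane 0 and zeroes the rest. */
+static inline simd_uchar8 example_lane0(unsigned char value) {
+  return simd_make_uchar8(value);  /* lane 0 = value, lanes 1..7 = 0 */
+}
+
+/*! @abstract Zero-extends `other` to form a vector of eight 8-bit unsigned
+ * integers.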
*/ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar4 other) { + simd_uchar8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8_undef(simd_uchar4 other) { + simd_uchar8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar16 other) { + return simd_make_uchar8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar32 other) { + return simd_make_uchar8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar8 simd_make_uchar8(simd_uchar64 other) { + return simd_make_uchar8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar8 lo, simd_uchar8 hi) { + simd_uchar16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(unsigned char other) { + simd_uchar16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16_undef(unsigned char other) { + simd_uchar16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar2 other) { + simd_uchar16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16_undef(simd_uchar2 other) { + simd_uchar16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar3 other) { + simd_uchar16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16_undef(simd_uchar3 other) { + simd_uchar16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar4 other) { + simd_uchar16 result = 0; + result.xyzw = other; + return result; +} + +/*! 
@abstract Extends `other` to form a vector of sixteen 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16_undef(simd_uchar4 other) { + simd_uchar16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar8 other) { + simd_uchar16 result = 0; + result.lo = simd_make_uchar8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16_undef(simd_uchar8 other) { + simd_uchar16 result; + result.lo = simd_make_uchar8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar16 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of sixteen 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar32 other) { + return simd_make_uchar16(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of sixteen 8-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uchar16 simd_make_uchar16(simd_uchar64 other) { + return simd_make_uchar16(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two + * 8-bit unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar16 lo, simd_uchar16 hi) { + simd_uchar32 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(unsigned char other) { + simd_uchar32 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(unsigned char other) { + simd_uchar32 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar2 other) { + simd_uchar32 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(simd_uchar2 other) { + simd_uchar32 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar3 other) { + simd_uchar32 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(simd_uchar3 other) { + simd_uchar32 result; + result.xyz = other; + return result; +} + +/*! 
@abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar4 other) { + simd_uchar32 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(simd_uchar4 other) { + simd_uchar32 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar8 other) { + simd_uchar32 result = 0; + result.lo = simd_make_uchar16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(simd_uchar8 other) { + simd_uchar32 result; + result.lo = simd_make_uchar16(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar16 other) { + simd_uchar32 result = 0; + result.lo = simd_make_uchar16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32_undef(simd_uchar16 other) { + simd_uchar32 result; + result.lo = simd_make_uchar16(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar32 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of thirty-two 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar32 simd_make_uchar32(simd_uchar64 other) { + return simd_make_uchar32(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixty-four + * 8-bit unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar32 lo, simd_uchar32 hi) { + simd_uchar64 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(unsigned char other) { + simd_uchar64 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(unsigned char other) { + simd_uchar64 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar2 other) { + simd_uchar64 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. 
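+ * (An example of safe _undef use follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * the _undef constructors skip the zero-fill, so they are only safe when
+ * every remaining lane is written before the vector is read. */
+static inline simd_uchar4 example_reverse(simd_uchar4 v) {
+  simd_uchar4 r = simd_make_uchar4_undef(v.w);  /* lane 0 = v.w; lanes 1..3 unspecified */
+  r.y = v.z;
+  r.z = v.y;
+  r.w = v.x;                                    /* every lane now written */
+  return r;
+}
+
+/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified.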
*/ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar2 other) { + simd_uchar64 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar3 other) { + simd_uchar64 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar3 other) { + simd_uchar64 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar4 other) { + simd_uchar64 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar4 other) { + simd_uchar64 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar8 other) { + simd_uchar64 result = 0; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar8 other) { + simd_uchar64 result; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar16 other) { + simd_uchar64 result = 0; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar16 other) { + simd_uchar64 result; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixty-four 8-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar32 other) { + simd_uchar64 result = 0; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64_undef(simd_uchar32 other) { + simd_uchar64 result; + result.lo = simd_make_uchar32(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uchar64 simd_make_uchar64(simd_uchar64 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit signed + * (twos-complement) integers. 
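+ * (A widening example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * zero-extension works between any two widths of the same element type. */
+static inline simd_uchar64 example_widen(simd_uchar32 v) {
+  return simd_make_uchar64(v);  /* lanes 32..63 are zeroed */
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit signed
+ * (twos-complement) integers.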
*/ +static inline SIMD_CFUNC simd_short2 simd_make_short2(short x, short y) { + simd_short2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(short other) { + simd_short2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2_undef(short other) { + simd_short2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short16 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_short2 simd_make_short2(simd_short32 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(short x, short y, short z) { + simd_short3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(short x, simd_short2 yz) { + simd_short3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short2 xy, short z) { + simd_short3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(short other) { + simd_short3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3_undef(short other) { + simd_short3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 16-bit signed + * (twos-complement) integers. 
*/ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short2 other) { + simd_short3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3_undef(simd_short2 other) { + simd_short3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short16 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short3 simd_make_short3(simd_short32 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 16-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(short x, short y, short z, short w) { + simd_short4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(short x, short y, simd_short2 zw) { + simd_short4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(short x, simd_short2 yz, short w) { + simd_short4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short2 xy, short z, short w) { + simd_short4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(short x, simd_short3 yzw) { + simd_short4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short2 xy, simd_short2 zw) { + simd_short4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 16-bit + * signed (twos-complement) integers. 
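+ * (A pairwise-assembly example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * a four-lane vector can be built from two two-lane halves. */
+static inline simd_short4 example_from_short2s(simd_short2 xy, simd_short2 zw) {
+  return simd_make_short4(xy, zw);  /* xy -> lanes 0..1, zw -> lanes 2..3 */
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 16-bit
+ * signed (twos-complement) integers.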
*/ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short3 xyz, short w) { + simd_short4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(short other) { + simd_short4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4_undef(short other) { + simd_short4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short2 other) { + simd_short4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4_undef(simd_short2 other) { + simd_short4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short3 other) { + simd_short4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4_undef(simd_short3 other) { + simd_short4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short16 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short4 simd_make_short4(simd_short32 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short4 lo, simd_short4 hi) { + simd_short8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(short other) { + simd_short8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. 
*/ +static inline SIMD_CFUNC simd_short8 simd_make_short8_undef(short other) { + simd_short8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short2 other) { + simd_short8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8_undef(simd_short2 other) { + simd_short8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short3 other) { + simd_short8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8_undef(simd_short3 other) { + simd_short8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short4 other) { + simd_short8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8_undef(simd_short4 other) { + simd_short8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short16 other) { + return simd_make_short8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short8 simd_make_short8(simd_short32 other) { + return simd_make_short8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short8 lo, simd_short8 hi) { + simd_short16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(short other) { + simd_short16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16_undef(short other) { + simd_short16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. 
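+ * (A narrowing example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * narrowing a 16-lane vector recurses through its `lo` half. */
+static inline simd_short8 example_low_short8(simd_short16 v) {
+  return simd_make_short8(v);  /* keeps lanes 0..7 */
+}
+
+/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed
+ * (twos-complement) integers.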
*/ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short2 other) { + simd_short16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16_undef(simd_short2 other) { + simd_short16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short3 other) { + simd_short16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16_undef(simd_short3 other) { + simd_short16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short4 other) { + simd_short16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16_undef(simd_short4 other) { + simd_short16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short8 other) { + simd_short16 result = 0; + result.lo = simd_make_short8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16_undef(simd_short8 other) { + simd_short16 result; + result.lo = simd_make_short8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short16 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of sixteen 16-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short16 simd_make_short16(simd_short32 other) { + return simd_make_short16(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two + * 16-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short16 lo, simd_short16 hi) { + simd_short32 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(short other) { + simd_short32 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. 
*/ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(short other) { + simd_short32 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short2 other) { + simd_short32 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(simd_short2 other) { + simd_short32 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short3 other) { + simd_short32 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(simd_short3 other) { + simd_short32 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short4 other) { + simd_short32 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(simd_short4 other) { + simd_short32 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short8 other) { + simd_short32 result = 0; + result.lo = simd_make_short16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(simd_short8 other) { + simd_short32 result; + result.lo = simd_make_short16(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short16 other) { + simd_short32 result = 0; + result.lo = simd_make_short16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32_undef(simd_short16 other) { + simd_short32 result; + result.lo = simd_make_short16(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_short32 simd_make_short32(simd_short32 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit + * unsigned integers. 
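+ * (A final widening example follows.) */
+
+/* Usage sketch (illustrative assumption, not part of the generated header):
+ * the same widen/narrow pattern repeats for every element type, here for
+ * 16-bit lanes. */
+static inline simd_short32 example_widen_short16(simd_short16 v) {
+  return simd_make_short32(v);  /* lanes 16..31 are zeroed */
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit
+ * unsigned integers.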
*/ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(unsigned short x, unsigned short y) { + simd_ushort2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(unsigned short other) { + simd_ushort2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2_undef(unsigned short other) { + simd_ushort2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort16 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort2 simd_make_ushort2(simd_ushort32 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(unsigned short x, unsigned short y, unsigned short z) { + simd_ushort3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(unsigned short x, simd_ushort2 yz) { + simd_ushort3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort2 xy, unsigned short z) { + simd_ushort3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(unsigned short other) { + simd_ushort3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3_undef(unsigned short other) { + simd_ushort3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 16-bit unsigned + * integers. 
*/ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort2 other) { + simd_ushort3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3_undef(simd_ushort2 other) { + simd_ushort3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort16 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort3 simd_make_ushort3(simd_ushort32 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 16-bit unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { + simd_ushort4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(unsigned short x, unsigned short y, simd_ushort2 zw) { + simd_ushort4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(unsigned short x, simd_ushort2 yz, unsigned short w) { + simd_ushort4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort2 xy, unsigned short z, unsigned short w) { + simd_ushort4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(unsigned short x, simd_ushort3 yzw) { + simd_ushort4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort2 xy, simd_ushort2 zw) { + simd_ushort4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 16-bit + * unsigned integers. 
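+ * (Editorial illustration: the smaller constructors compose, so + * simd_make_ushort4(simd_make_ushort3(1, 2, 3), 4) builds {1, 2, 3, 4}.)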
*/ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort3 xyz, unsigned short w) { + simd_ushort4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(unsigned short other) { + simd_ushort4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4_undef(unsigned short other) { + simd_ushort4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort2 other) { + simd_ushort4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4_undef(simd_ushort2 other) { + simd_ushort4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort3 other) { + simd_ushort4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4_undef(simd_ushort3 other) { + simd_ushort4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort16 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort4 simd_make_ushort4(simd_ushort32 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort4 lo, simd_ushort4 hi) { + simd_ushort8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(unsigned short other) { + simd_ushort8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8_undef(unsigned short other) { + simd_ushort8 result; + result.x = other; + return result; +} + +/*! 
@abstract Zero-extends `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort2 other) { + simd_ushort8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8_undef(simd_ushort2 other) { + simd_ushort8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort3 other) { + simd_ushort8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8_undef(simd_ushort3 other) { + simd_ushort8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort4 other) { + simd_ushort8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8_undef(simd_ushort4 other) { + simd_ushort8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort16 other) { + return simd_make_ushort8(other.lo); +} + +/*! @abstract Truncates `other` to form a vector of eight 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort8 simd_make_ushort8(simd_ushort32 other) { + return simd_make_ushort8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort8 lo, simd_ushort8 hi) { + simd_ushort16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(unsigned short other) { + simd_ushort16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16_undef(unsigned short other) { + simd_ushort16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort2 other) { + simd_ushort16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned + * integers. 
The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16_undef(simd_ushort2 other) { + simd_ushort16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort3 other) { + simd_ushort16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16_undef(simd_ushort3 other) { + simd_ushort16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort4 other) { + simd_ushort16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16_undef(simd_ushort4 other) { + simd_ushort16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort8 other) { + simd_ushort16 result = 0; + result.lo = simd_make_ushort8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16_undef(simd_ushort8 other) { + simd_ushort16 result; + result.lo = simd_make_ushort8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort16 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of sixteen 16-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ushort16 simd_make_ushort16(simd_ushort32 other) { + return simd_make_ushort16(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two + * 16-bit unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort16 lo, simd_ushort16 hi) { + simd_ushort32 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(unsigned short other) { + simd_ushort32 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(unsigned short other) { + simd_ushort32 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort2 other) { + simd_ushort32 result = 0; + result.xy = other; + return result; +} + +/*! 
@abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(simd_ushort2 other) { + simd_ushort32 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort3 other) { + simd_ushort32 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(simd_ushort3 other) { + simd_ushort32 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort4 other) { + simd_ushort32 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(simd_ushort4 other) { + simd_ushort32 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort8 other) { + simd_ushort32 result = 0; + result.lo = simd_make_ushort16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(simd_ushort8 other) { + simd_ushort32 result; + result.lo = simd_make_ushort16(other); + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of thirty-two 16-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort16 other) { + simd_ushort32 result = 0; + result.lo = simd_make_ushort16(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32_undef(simd_ushort16 other) { + simd_ushort32 result; + result.lo = simd_make_ushort16(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ushort32 simd_make_ushort32(simd_ushort32 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(int x, int y) { + simd_int2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(int other) { + simd_int2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 32-bit signed (twos- + * complement) integers. 
The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2_undef(int other) { + simd_int2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(simd_int2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(simd_int3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(simd_int4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(simd_int8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_int2 simd_make_int2(simd_int16 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(int x, int y, int z) { + simd_int3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(int x, simd_int2 yz) { + simd_int3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int2 xy, int z) { + simd_int3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(int other) { + simd_int3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3_undef(int other) { + simd_int3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int2 other) { + simd_int3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3_undef(simd_int2 other) { + simd_int3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit signed + * (twos-complement) integers. 
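+ * (Editorial illustration: truncation keeps the low lanes, so an input of + * {1, 2, 3, 4} yields {1, 2, 3} via the `.xyz` swizzle.)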
*/ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int3 simd_make_int3(simd_int16 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 32-bit signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(int x, int y, int z, int w) { + simd_int4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(int x, int y, simd_int2 zw) { + simd_int4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(int x, simd_int2 yz, int w) { + simd_int4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int2 xy, int z, int w) { + simd_int4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(int x, simd_int3 yzw) { + simd_int4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int2 xy, simd_int2 zw) { + simd_int4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int3 xyz, int w) { + simd_int4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(int other) { + simd_int4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4_undef(int other) { + simd_int4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int2 other) { + simd_int4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. 
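+ * (Editorial note: a common pattern is to build with the `_undef` form and + * then assign the remaining lanes explicitly, e.g. + * simd_int4 v = simd_make_int4_undef(xy); v.zw = zw; where `xy` and `zw` + * are hypothetical simd_int2 values.)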
*/ +static inline SIMD_CFUNC simd_int4 simd_make_int4_undef(simd_int2 other) { + simd_int4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int3 other) { + simd_int4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4_undef(simd_int3 other) { + simd_int4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int4 simd_make_int4(simd_int16 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int4 lo, simd_int4 hi) { + simd_int8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(int other) { + simd_int8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8_undef(int other) { + simd_int8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int2 other) { + simd_int8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8_undef(simd_int2 other) { + simd_int8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int3 other) { + simd_int8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8_undef(simd_int3 other) { + simd_int8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int4 other) { + simd_int8 result = 0; + result.xyzw = other; + return result; +} + +/*! 
@abstract Extends `other` to form a vector of eight 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8_undef(simd_int4 other) { + simd_int8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int8 simd_make_int8(simd_int16 other) { + return simd_make_int8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int8 lo, simd_int8 hi) { + simd_int16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(int other) { + simd_int16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16_undef(int other) { + simd_int16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int2 other) { + simd_int16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16_undef(simd_int2 other) { + simd_int16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int3 other) { + simd_int16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16_undef(simd_int3 other) { + simd_int16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int4 other) { + simd_int16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16_undef(simd_int4 other) { + simd_int16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. 
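+ * (Editorial note: for the wide vectors the input lands in the `lo` half, + * and the zero-initialization leaves the `hi` half all zeros.)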
*/ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int8 other) { + simd_int16 result = 0; + result.lo = simd_make_int8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16_undef(simd_int8 other) { + simd_int16 result; + result.lo = simd_make_int8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_int16 simd_make_int16(simd_int16 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(unsigned int x, unsigned int y) { + simd_uint2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(unsigned int other) { + simd_uint2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2_undef(unsigned int other) { + simd_uint2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(simd_uint2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(simd_uint3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(simd_uint4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(simd_uint8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint2 simd_make_uint2(simd_uint16 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(unsigned int x, unsigned int y, unsigned int z) { + simd_uint3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(unsigned int x, simd_uint2 yz) { + simd_uint3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint2 xy, unsigned int z) { + simd_uint3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(unsigned int other) { + simd_uint3 result = 0; + result.x = other; + return result; +} + +/*! 
@abstract Extends `other` to form a vector of three 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3_undef(unsigned int other) { + simd_uint3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint2 other) { + simd_uint3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3_undef(simd_uint2 other) { + simd_uint3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint3 simd_make_uint3(simd_uint16 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 32-bit unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { + simd_uint4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(unsigned int x, unsigned int y, simd_uint2 zw) { + simd_uint4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(unsigned int x, simd_uint2 yz, unsigned int w) { + simd_uint4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint2 xy, unsigned int z, unsigned int w) { + simd_uint4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(unsigned int x, simd_uint3 yzw) { + simd_uint4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint2 xy, simd_uint2 zw) { + simd_uint4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit + * unsigned integers. 
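+ * (Editorial illustration: with `xyz` = {1, 2, 3} and `w` = 4 the result + * is {1, 2, 3, 4}; the overloads above build the same vector from other + * splits, such as two simd_uint2 halves.)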
*/ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint3 xyz, unsigned int w) { + simd_uint4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(unsigned int other) { + simd_uint4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4_undef(unsigned int other) { + simd_uint4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint2 other) { + simd_uint4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4_undef(simd_uint2 other) { + simd_uint4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint3 other) { + simd_uint4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4_undef(simd_uint3 other) { + simd_uint4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint4 simd_make_uint4(simd_uint16 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint4 lo, simd_uint4 hi) { + simd_uint8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(unsigned int other) { + simd_uint8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8_undef(unsigned int other) { + simd_uint8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint2 other) { + simd_uint8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit unsigned + * integers. 
The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8_undef(simd_uint2 other) { + simd_uint8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint3 other) { + simd_uint8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8_undef(simd_uint3 other) { + simd_uint8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint4 other) { + simd_uint8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8_undef(simd_uint4 other) { + simd_uint8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 32-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_uint8 simd_make_uint8(simd_uint16 other) { + return simd_make_uint8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint8 lo, simd_uint8 hi) { + simd_uint16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(unsigned int other) { + simd_uint16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16_undef(unsigned int other) { + simd_uint16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint2 other) { + simd_uint16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16_undef(simd_uint2 other) { + simd_uint16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint3 other) { + simd_uint16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. 
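+ * (Editorial note: the `_undef` variants skip the zero-fill of the lanes + * beyond the input, which may save a little work when every lane will be + * written before it is read.)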
*/ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16_undef(simd_uint3 other) { + simd_uint16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint4 other) { + simd_uint16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16_undef(simd_uint4 other) { + simd_uint16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint8 other) { + simd_uint16 result = 0; + result.lo = simd_make_uint8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16_undef(simd_uint8 other) { + simd_uint16 result; + result.lo = simd_make_uint8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_uint16 simd_make_uint16(simd_uint16 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(float x, float y) { + simd_float2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(float other) { + simd_float2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2_undef(float other) { + simd_float2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(simd_float2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(simd_float3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(simd_float4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(simd_float8 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float2 simd_make_float2(simd_float16 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit + * floating-point numbers. 
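+ * (Editorial illustration: simd_make_float3(1.0f, 2.0f, 3.0f) produces + * {1.0, 2.0, 3.0}; note that simd_float3, like the other three-lane simd + * types, is padded to the size of the corresponding four-lane vector.)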
*/ +static inline SIMD_CFUNC simd_float3 simd_make_float3(float x, float y, float z) { + simd_float3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(float x, simd_float2 yz) { + simd_float3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float2 xy, float z) { + simd_float3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(float other) { + simd_float3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3_undef(float other) { + simd_float3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float2 other) { + simd_float3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3_undef(simd_float2 other) { + simd_float3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float8 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float3 simd_make_float3(simd_float16 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 32-bit floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(float x, float y, float z, float w) { + simd_float4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(float x, float y, simd_float2 zw) { + simd_float4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(float x, simd_float2 yz, float w) { + simd_float4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! 
@abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float2 xy, float z, float w) { + simd_float4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(float x, simd_float3 yzw) { + simd_float4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float2 xy, simd_float2 zw) { + simd_float4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float3 xyz, float w) { + simd_float4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(float other) { + simd_float4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4_undef(float other) { + simd_float4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float2 other) { + simd_float4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4_undef(simd_float2 other) { + simd_float4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float3 other) { + simd_float4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4_undef(simd_float3 other) { + simd_float4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float8 other) { + return other.xyzw; +} + +/*! @abstract Truncates `other` to form a vector of four 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float4 simd_make_float4(simd_float16 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * floating-point numbers. 
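+ * (Editorial illustration: simd_make_float8(simd_make_float4(0.0f), + * simd_make_float4(1.0f)) yields low lanes {0, 0, 0, 0} and high lanes + * {1, 0, 0, 0}, since the single-argument simd_make_float4 zero-extends + * its scalar.)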
*/ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float4 lo, simd_float4 hi) { + simd_float8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(float other) { + simd_float8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8_undef(float other) { + simd_float8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float2 other) { + simd_float8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8_undef(simd_float2 other) { + simd_float8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float3 other) { + simd_float8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8_undef(simd_float3 other) { + simd_float8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float4 other) { + simd_float8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8_undef(simd_float4 other) { + simd_float8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float8 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of eight 32-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_float8 simd_make_float8(simd_float16 other) { + return simd_make_float8(other.lo); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float8 lo, simd_float8 hi) { + simd_float16 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(float other) { + simd_float16 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. 
*/ +static inline SIMD_CFUNC simd_float16 simd_make_float16_undef(float other) { + simd_float16 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float2 other) { + simd_float16 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16_undef(simd_float2 other) { + simd_float16 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float3 other) { + simd_float16 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16_undef(simd_float3 other) { + simd_float16 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float4 other) { + simd_float16 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16_undef(simd_float4 other) { + simd_float16 result; + result.xyzw = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float8 other) { + simd_float16 result = 0; + result.lo = simd_make_float8(other); + return result; +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16_undef(simd_float8 other) { + simd_float16 result; + result.lo = simd_make_float8(other); + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_float16 simd_make_float16(simd_float16 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long1 x, simd_long1 y) { + simd_long2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long1 other) { + simd_long2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2_undef(simd_long1 other) { + simd_long2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. 
This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit signed (twos- + * complement) integers. */ +static inline SIMD_CFUNC simd_long2 simd_make_long2(simd_long8 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long1 x, simd_long1 y, simd_long1 z) { + simd_long3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long1 x, simd_long2 yz) { + simd_long3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long2 xy, simd_long1 z) { + simd_long3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long1 other) { + simd_long3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3_undef(simd_long1 other) { + simd_long3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long2 other) { + simd_long3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3_undef(simd_long2 other) { + simd_long3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long3 simd_make_long3(simd_long8 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit signed (twos-complement) integers. 
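+ *
+ * For example (illustrative only; `v` and `lo` are arbitrary names):
+ *
+ *   simd_long4 v  = simd_make_long4(1, 2, 3, 4);  // {1, 2, 3, 4}
+ *   simd_long2 lo = simd_make_long2(v);           // truncates to {1, 2}
+ *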
*/ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long1 x, simd_long1 y, simd_long1 z, simd_long1 w) { + simd_long4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long1 x, simd_long1 y, simd_long2 zw) { + simd_long4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long1 x, simd_long2 yz, simd_long1 w) { + simd_long4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long2 xy, simd_long1 z, simd_long1 w) { + simd_long4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long1 x, simd_long3 yzw) { + simd_long4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long2 xy, simd_long2 zw) { + simd_long4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long3 xyz, simd_long1 w) { + simd_long4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long1 other) { + simd_long4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4_undef(simd_long1 other) { + simd_long4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long2 other) { + simd_long4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4_undef(simd_long2 other) { + simd_long4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long3 other) { + simd_long4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. 
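+ *
+ * A sketch of the two widening forms (illustrative only):
+ *
+ *   simd_long3 xyz = simd_make_long3(1, 2, 3);
+ *   simd_long4 a = simd_make_long4(xyz);        // {1, 2, 3, 0}
+ *   simd_long4 b = simd_make_long4_undef(xyz);  // {1, 2, 3, ?}, last lane unspecified
+ *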
*/ +static inline SIMD_CFUNC simd_long4 simd_make_long4_undef(simd_long3 other) { + simd_long4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long4 simd_make_long4(simd_long8 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long4 lo, simd_long4 hi) { + simd_long8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long1 other) { + simd_long8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8_undef(simd_long1 other) { + simd_long8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long2 other) { + simd_long8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8_undef(simd_long2 other) { + simd_long8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long3 other) { + simd_long8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8_undef(simd_long3 other) { + simd_long8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long4 other) { + simd_long8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8_undef(simd_long4 other) { + simd_long8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_long8 simd_make_long8(simd_long8 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit + * unsigned integers. 
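+ *
+ * For example (illustrative only):
+ *
+ *   simd_ulong2 v = simd_make_ulong2(1, 2);  // {1, 2}
+ *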
*/ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong1 x, simd_ulong1 y) { + simd_ulong2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong1 other) { + simd_ulong2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2_undef(simd_ulong1 other) { + simd_ulong2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong2 simd_make_ulong2(simd_ulong8 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong1 x, simd_ulong1 y, simd_ulong1 z) { + simd_ulong3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong1 x, simd_ulong2 yz) { + simd_ulong3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong2 xy, simd_ulong1 z) { + simd_ulong3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong1 other) { + simd_ulong3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3_undef(simd_ulong1 other) { + simd_ulong3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong2 other) { + simd_ulong3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3_undef(simd_ulong2 other) { + simd_ulong3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. 
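+ *
+ * The pass-through form lets generic code call the constructor uniformly
+ * regardless of the argument width (illustrative only):
+ *
+ *   simd_ulong3 v = simd_make_ulong3(1, 2, 3);
+ *   simd_ulong3 w = simd_make_ulong3(v);  // returns v unchanged
+ *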
*/ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong3 simd_make_ulong3(simd_ulong8 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong1 x, simd_ulong1 y, simd_ulong1 z, simd_ulong1 w) { + simd_ulong4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong1 x, simd_ulong1 y, simd_ulong2 zw) { + simd_ulong4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong1 x, simd_ulong2 yz, simd_ulong1 w) { + simd_ulong4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong2 xy, simd_ulong1 z, simd_ulong1 w) { + simd_ulong4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong1 x, simd_ulong3 yzw) { + simd_ulong4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong2 xy, simd_ulong2 zw) { + simd_ulong4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong3 xyz, simd_ulong1 w) { + simd_ulong4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong1 other) { + simd_ulong4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4_undef(simd_ulong1 other) { + simd_ulong4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong2 other) { + simd_ulong4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. 
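+ *
+ * A sketch (illustrative only; `xy`, `a`, `b` are arbitrary names):
+ *
+ *   simd_ulong2 xy = simd_make_ulong2(1, 2);
+ *   simd_ulong4 a = simd_make_ulong4(xy);        // {1, 2, 0, 0}
+ *   simd_ulong4 b = simd_make_ulong4_undef(xy);  // upper two lanes unspecified
+ *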
*/ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4_undef(simd_ulong2 other) { + simd_ulong4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong3 other) { + simd_ulong4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4_undef(simd_ulong3 other) { + simd_ulong4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong4 simd_make_ulong4(simd_ulong8 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * unsigned integers. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong4 lo, simd_ulong4 hi) { + simd_ulong8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong1 other) { + simd_ulong8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8_undef(simd_ulong1 other) { + simd_ulong8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong2 other) { + simd_ulong8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8_undef(simd_ulong2 other) { + simd_ulong8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong3 other) { + simd_ulong8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8_undef(simd_ulong3 other) { + simd_ulong8 result; + result.xyz = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit unsigned + * integers. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong4 other) { + simd_ulong8 result = 0; + result.xyzw = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. 
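+ *
+ * A sketch (illustrative only):
+ *
+ *   simd_ulong4 v = simd_make_ulong4(1, 2, 3, 4);
+ *   simd_ulong8 w = simd_make_ulong8(v);  // {1, 2, 3, 4, 0, 0, 0, 0}
+ *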
*/ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8_undef(simd_ulong4 other) { + simd_ulong8 result; + result.xyzw = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_ulong8 simd_make_ulong8(simd_ulong8 other) { + return other; +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(double x, double y) { + simd_double2 result; + result.x = x; + result.y = y; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of two 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(double other) { + simd_double2 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of two 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2_undef(double other) { + simd_double2 result; + result.x = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(simd_double2 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(simd_double3 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(simd_double4 other) { + return other.xy; +} + +/*! @abstract Truncates `other` to form a vector of two 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double2 simd_make_double2(simd_double8 other) { + return other.xy; +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(double x, double y, double z) { + simd_double3 result; + result.x = x; + result.y = y; + result.z = z; + return result; +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(double x, simd_double2 yz) { + simd_double3 result; + result.x = x; + result.yz = yz; + return result; +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(simd_double2 xy, double z) { + simd_double3 result; + result.xy = xy; + result.z = z; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(double other) { + simd_double3 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3_undef(double other) { + simd_double3 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of three 64-bit + * floating-point numbers. 
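+ *
+ * A sketch (illustrative only; `xy`, `v`, `z` are arbitrary names):
+ *
+ *   simd_double2 xy = simd_make_double2(1.0, 2.0);
+ *   simd_double3 v  = simd_make_double3(xy, 3.0);  // {1, 2, 3}
+ *   simd_double3 z  = simd_make_double3(xy);       // {1, 2, 0}
+ *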
*/ +static inline SIMD_CFUNC simd_double3 simd_make_double3(simd_double2 other) { + simd_double3 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of three 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3_undef(simd_double2 other) { + simd_double3 result; + result.xy = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(simd_double3 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(simd_double4 other) { + return other.xyz; +} + +/*! @abstract Truncates `other` to form a vector of three 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double3 simd_make_double3(simd_double8 other) { + return other.xyz; +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(double x, double y, double z, double w) { + simd_double4 result; + result.x = x; + result.y = y; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(double x, double y, simd_double2 zw) { + simd_double4 result; + result.x = x; + result.y = y; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(double x, simd_double2 yz, double w) { + simd_double4 result; + result.x = x; + result.yz = yz; + result.w = w; + return result; +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double2 xy, double z, double w) { + simd_double4 result; + result.xy = xy; + result.z = z; + result.w = w; + return result; +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(double x, simd_double3 yzw) { + simd_double4 result; + result.x = x; + result.yzw = yzw; + return result; +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double2 xy, simd_double2 zw) { + simd_double4 result; + result.xy = xy; + result.zw = zw; + return result; +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double3 xyz, double w) { + simd_double4 result; + result.xyz = xyz; + result.w = w; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(double other) { + simd_double4 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. 
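+ *
+ * A sketch (illustrative only):
+ *
+ *   simd_double4 a = simd_make_double4(1.0);        // {1, 0, 0, 0}
+ *   simd_double4 b = simd_make_double4_undef(1.0);  // lanes 1-3 unspecified
+ *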
*/ +static inline SIMD_CFUNC simd_double4 simd_make_double4_undef(double other) { + simd_double4 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double2 other) { + simd_double4 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4_undef(simd_double2 other) { + simd_double4 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of four 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double3 other) { + simd_double4 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of four 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4_undef(simd_double3 other) { + simd_double4 result; + result.xyz = other; + return result; +} + +/*! @abstract Returns `other` unmodified. This function is a convenience for + * templated and autogenerated code. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double4 other) { + return other; +} + +/*! @abstract Truncates `other` to form a vector of four 64-bit floating- + * point numbers. */ +static inline SIMD_CFUNC simd_double4 simd_make_double4(simd_double8 other) { + return other.xyzw; +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8(simd_double4 lo, simd_double4 hi) { + simd_double8 result; + result.lo = lo; + result.hi = hi; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8(double other) { + simd_double8 result = 0; + result.x = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8_undef(double other) { + simd_double8 result; + result.x = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8(simd_double2 other) { + simd_double8 result = 0; + result.xy = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8_undef(simd_double2 other) { + simd_double8 result; + result.xy = other; + return result; +} + +/*! @abstract Zero-extends `other` to form a vector of eight 64-bit + * floating-point numbers. */ +static inline SIMD_CFUNC simd_double8 simd_make_double8(simd_double3 other) { + simd_double8 result = 0; + result.xyz = other; + return result; +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. 
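+ *
+ * A sketch (illustrative only):
+ *
+ *   simd_double4 v = simd_make_double4(1.0, 2.0, 3.0, 4.0);
+ *   simd_double8 w = simd_make_double8(v);  // {1, 2, 3, 4, 0, 0, 0, 0}
+ *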
 */
+static inline SIMD_CFUNC simd_double8 simd_make_double8_undef(simd_double3 other) {
+  simd_double8 result;
+  result.xyz = other;
+  return result;
+}
+
+/*! @abstract Zero-extends `other` to form a vector of eight 64-bit
+ * floating-point numbers. */
+static inline SIMD_CFUNC simd_double8 simd_make_double8(simd_double4 other) {
+  simd_double8 result = 0;
+  result.xyzw = other;
+  return result;
+}
+
+/*! @abstract Extends `other` to form a vector of eight 64-bit floating-
+ * point numbers. The contents of the newly-created vector lanes are
+ * unspecified. */
+static inline SIMD_CFUNC simd_double8 simd_make_double8_undef(simd_double4 other) {
+  simd_double8 result;
+  result.xyzw = other;
+  return result;
+}
+
+/*! @abstract Returns `other` unmodified. This function is a convenience for
+ * templated and autogenerated code. */
+static inline SIMD_CFUNC simd_double8 simd_make_double8(simd_double8 other) {
+  return other;
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+#include
+#include
+
+namespace simd {
+/*! @abstract Concatenates `x` and `y` to form a vector of two 8-bit signed
+ * (twos-complement) integers. */
+static inline SIMD_CPPFUNC char2 make_char2(char x, char y) {
+  return ::simd_make_char2(x, y);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of two
+ * 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char2 make_char2(typeN other) {
+  return ::simd_make_char2(other);
+}
+
+/*! @abstract Extends `other` to form a vector of two 8-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char2 make_char2_undef(typeN other) {
+  return ::simd_make_char2_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char3 make_char3(char x, char y, char z) {
+  return ::simd_make_char3(x, y, z);
+}
+
+/*! @abstract Concatenates `x` and `yz` to form a vector of three 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char3 make_char3(char x, char2 yz) {
+  return ::simd_make_char3(x, yz);
+}
+
+/*! @abstract Concatenates `xy` and `z` to form a vector of three 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char3 make_char3(char2 xy, char z) {
+  return ::simd_make_char3(xy, z);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of three
+ * 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char3 make_char3(typeN other) {
+  return ::simd_make_char3(other);
+}
+
+/*! @abstract Extends `other` to form a vector of three 8-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char3 make_char3_undef(typeN other) {
+  return ::simd_make_char3_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four
+ * 8-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char x, char y, char z, char w) {
+  return ::simd_make_char4(x, y, z, w);
+}
+
+/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char x, char y, char2 zw) {
+  return ::simd_make_char4(x, y, zw);
+}
+
+/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char x, char2 yz, char w) {
+  return ::simd_make_char4(x, yz, w);
+}
+
+/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char2 xy, char z, char w) {
+  return ::simd_make_char4(xy, z, w);
+}
+
+/*! @abstract Concatenates `x` and `yzw` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char x, char3 yzw) {
+  return ::simd_make_char4(x, yzw);
+}
+
+/*! @abstract Concatenates `xy` and `zw` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char2 xy, char2 zw) {
+  return ::simd_make_char4(xy, zw);
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char4 make_char4(char3 xyz, char w) {
+  return ::simd_make_char4(xyz, w);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of four
+ * 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char4 make_char4(typeN other) {
+  return ::simd_make_char4(other);
+}
+
+/*! @abstract Extends `other` to form a vector of four 8-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char4 make_char4_undef(typeN other) {
+  return ::simd_make_char4_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char8 make_char8(char4 lo, char4 hi) {
+  return ::simd_make_char8(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of eight
+ * 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char8 make_char8(typeN other) {
+  return ::simd_make_char8(other);
+}
+
+/*! @abstract Extends `other` to form a vector of eight 8-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char8 make_char8_undef(typeN other) {
+  return ::simd_make_char8_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 8-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char16 make_char16(char8 lo, char8 hi) {
+  return ::simd_make_char16(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen
+ * 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char16 make_char16(typeN other) {
+  return ::simd_make_char16(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixteen 8-bit signed
+ * (twos-complement) integers. The contents of the newly-created vector
+ * lanes are unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char16 make_char16_undef(typeN other) {
+  return ::simd_make_char16_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two
+ * 8-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char32 make_char32(char16 lo, char16 hi) {
+  return ::simd_make_char32(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of thirty-
+ * two 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char32 make_char32(typeN other) {
+  return ::simd_make_char32(other);
+}
+
+/*! @abstract Extends `other` to form a vector of thirty-two 8-bit signed
+ * (twos-complement) integers. The contents of the newly-created vector
+ * lanes are unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char32 make_char32_undef(typeN other) {
+  return ::simd_make_char32_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixty-four
+ * 8-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC char64 make_char64(char32 lo, char32 hi) {
+  return ::simd_make_char64(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixty-
+ * four 8-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC char64 make_char64(typeN other) {
+  return ::simd_make_char64(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixty-four 8-bit signed
+ * (twos-complement) integers. The contents of the newly-created vector
+ * lanes are unspecified. */
+template <typename typeN> static SIMD_CPPFUNC char64 make_char64_undef(typeN other) {
+  return ::simd_make_char64_undef(other);
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar2 make_uchar2(unsigned char x, unsigned char y) {
+  return ::simd_make_uchar2(x, y);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of two
+ * 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar2 make_uchar2(typeN other) {
+  return ::simd_make_uchar2(other);
+}
+
+/*! @abstract Extends `other` to form a vector of two 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar2 make_uchar2_undef(typeN other) {
+  return ::simd_make_uchar2_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) {
+  return ::simd_make_uchar3(x, y, z);
+}
+
+/*! @abstract Concatenates `x` and `yz` to form a vector of three 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar3 make_uchar3(unsigned char x, uchar2 yz) {
+  return ::simd_make_uchar3(x, yz);
+}
+
+/*! @abstract Concatenates `xy` and `z` to form a vector of three 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar3 make_uchar3(uchar2 xy, unsigned char z) {
+  return ::simd_make_uchar3(xy, z);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of three
+ * 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar3 make_uchar3(typeN other) {
+  return ::simd_make_uchar3(other);
+}
+
+/*! @abstract Extends `other` to form a vector of three 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar3 make_uchar3_undef(typeN other) {
+  return ::simd_make_uchar3_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four
+ * 8-bit unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) {
+  return ::simd_make_uchar4(x, y, z, w);
+}
+
+/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(unsigned char x, unsigned char y, uchar2 zw) {
+  return ::simd_make_uchar4(x, y, zw);
+}
+
+/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(unsigned char x, uchar2 yz, unsigned char w) {
+  return ::simd_make_uchar4(x, yz, w);
+}
+
+/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(uchar2 xy, unsigned char z, unsigned char w) {
+  return ::simd_make_uchar4(xy, z, w);
+}
+
+/*! @abstract Concatenates `x` and `yzw` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(unsigned char x, uchar3 yzw) {
+  return ::simd_make_uchar4(x, yzw);
+}
+
+/*! @abstract Concatenates `xy` and `zw` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(uchar2 xy, uchar2 zw) {
+  return ::simd_make_uchar4(xy, zw);
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar4 make_uchar4(uchar3 xyz, unsigned char w) {
+  return ::simd_make_uchar4(xyz, w);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of four
+ * 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar4 make_uchar4(typeN other) {
+  return ::simd_make_uchar4(other);
+}
+
+/*! @abstract Extends `other` to form a vector of four 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar4 make_uchar4_undef(typeN other) {
+  return ::simd_make_uchar4_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar8 make_uchar8(uchar4 lo, uchar4 hi) {
+  return ::simd_make_uchar8(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of eight
+ * 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar8 make_uchar8(typeN other) {
+  return ::simd_make_uchar8(other);
+}
+
+/*! @abstract Extends `other` to form a vector of eight 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar8 make_uchar8_undef(typeN other) {
+  return ::simd_make_uchar8_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 8-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC uchar16 make_uchar16(uchar8 lo, uchar8 hi) {
+  return ::simd_make_uchar16(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen
+ * 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar16 make_uchar16(typeN other) {
+  return ::simd_make_uchar16(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixteen 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar16 make_uchar16_undef(typeN other) {
+  return ::simd_make_uchar16_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two
+ * 8-bit unsigned integers. */
+static inline SIMD_CPPFUNC uchar32 make_uchar32(uchar16 lo, uchar16 hi) {
+  return ::simd_make_uchar32(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of thirty-
+ * two 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar32 make_uchar32(typeN other) {
+  return ::simd_make_uchar32(other);
+}
+
+/*! @abstract Extends `other` to form a vector of thirty-two 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar32 make_uchar32_undef(typeN other) {
+  return ::simd_make_uchar32_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixty-four
+ * 8-bit unsigned integers. */
+static inline SIMD_CPPFUNC uchar64 make_uchar64(uchar32 lo, uchar32 hi) {
+  return ::simd_make_uchar64(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixty-
+ * four 8-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC uchar64 make_uchar64(typeN other) {
+  return ::simd_make_uchar64(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixty-four 8-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC uchar64 make_uchar64_undef(typeN other) {
+  return ::simd_make_uchar64_undef(other);
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit signed
+ * (twos-complement) integers. */
+static inline SIMD_CPPFUNC short2 make_short2(short x, short y) {
+  return ::simd_make_short2(x, y);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of two
+ * 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short2 make_short2(typeN other) {
+  return ::simd_make_short2(other);
+}
+
+/*! @abstract Extends `other` to form a vector of two 16-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short2 make_short2_undef(typeN other) {
+  return ::simd_make_short2_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short3 make_short3(short x, short y, short z) {
+  return ::simd_make_short3(x, y, z);
+}
+
+/*! @abstract Concatenates `x` and `yz` to form a vector of three 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short3 make_short3(short x, short2 yz) {
+  return ::simd_make_short3(x, yz);
+}
+
+/*! @abstract Concatenates `xy` and `z` to form a vector of three 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short3 make_short3(short2 xy, short z) {
+  return ::simd_make_short3(xy, z);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of three
+ * 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short3 make_short3(typeN other) {
+  return ::simd_make_short3(other);
+}
+
+/*! @abstract Extends `other` to form a vector of three 16-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short3 make_short3_undef(typeN other) {
+  return ::simd_make_short3_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four
+ * 16-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short x, short y, short z, short w) {
+  return ::simd_make_short4(x, y, z, w);
+}
+
+/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short x, short y, short2 zw) {
+  return ::simd_make_short4(x, y, zw);
+}
+
+/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short x, short2 yz, short w) {
+  return ::simd_make_short4(x, yz, w);
+}
+
+/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short2 xy, short z, short w) {
+  return ::simd_make_short4(xy, z, w);
+}
+
+/*! @abstract Concatenates `x` and `yzw` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short x, short3 yzw) {
+  return ::simd_make_short4(x, yzw);
+}
+
+/*! @abstract Concatenates `xy` and `zw` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short2 xy, short2 zw) {
+  return ::simd_make_short4(xy, zw);
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short4 make_short4(short3 xyz, short w) {
+  return ::simd_make_short4(xyz, w);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of four
+ * 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short4 make_short4(typeN other) {
+  return ::simd_make_short4(other);
+}
+
+/*! @abstract Extends `other` to form a vector of four 16-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short4 make_short4_undef(typeN other) {
+  return ::simd_make_short4_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short8 make_short8(short4 lo, short4 hi) {
+  return ::simd_make_short8(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of eight
+ * 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short8 make_short8(typeN other) {
+  return ::simd_make_short8(other);
+}
+
+/*! @abstract Extends `other` to form a vector of eight 16-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short8 make_short8_undef(typeN other) {
+  return ::simd_make_short8_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 16-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short16 make_short16(short8 lo, short8 hi) {
+  return ::simd_make_short16(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen
+ * 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short16 make_short16(typeN other) {
+  return ::simd_make_short16(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixteen 16-bit signed
+ * (twos-complement) integers. The contents of the newly-created vector
+ * lanes are unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short16 make_short16_undef(typeN other) {
+  return ::simd_make_short16_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two
+ * 16-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC short32 make_short32(short16 lo, short16 hi) {
+  return ::simd_make_short32(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of thirty-
+ * two 16-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC short32 make_short32(typeN other) {
+  return ::simd_make_short32(other);
+}
+
+/*! @abstract Extends `other` to form a vector of thirty-two 16-bit signed
+ * (twos-complement) integers. The contents of the newly-created vector
+ * lanes are unspecified. */
+template <typename typeN> static SIMD_CPPFUNC short32 make_short32_undef(typeN other) {
+  return ::simd_make_short32_undef(other);
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort2 make_ushort2(unsigned short x, unsigned short y) {
+  return ::simd_make_ushort2(x, y);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of two
+ * 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort2 make_ushort2(typeN other) {
+  return ::simd_make_ushort2(other);
+}
+
+/*! @abstract Extends `other` to form a vector of two 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort2 make_ushort2_undef(typeN other) {
+  return ::simd_make_ushort2_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) {
+  return ::simd_make_ushort3(x, y, z);
+}
+
+/*! @abstract Concatenates `x` and `yz` to form a vector of three 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort3 make_ushort3(unsigned short x, ushort2 yz) {
+  return ::simd_make_ushort3(x, yz);
+}
+
+/*! @abstract Concatenates `xy` and `z` to form a vector of three 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort3 make_ushort3(ushort2 xy, unsigned short z) {
+  return ::simd_make_ushort3(xy, z);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of three
+ * 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort3 make_ushort3(typeN other) {
+  return ::simd_make_ushort3(other);
+}
+
+/*! @abstract Extends `other` to form a vector of three 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort3 make_ushort3_undef(typeN other) {
+  return ::simd_make_ushort3_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four
+ * 16-bit unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) {
+  return ::simd_make_ushort4(x, y, z, w);
+}
+
+/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(unsigned short x, unsigned short y, ushort2 zw) {
+  return ::simd_make_ushort4(x, y, zw);
+}
+
+/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(unsigned short x, ushort2 yz, unsigned short w) {
+  return ::simd_make_ushort4(x, yz, w);
+}
+
+/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(ushort2 xy, unsigned short z, unsigned short w) {
+  return ::simd_make_ushort4(xy, z, w);
+}
+
+/*! @abstract Concatenates `x` and `yzw` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(unsigned short x, ushort3 yzw) {
+  return ::simd_make_ushort4(x, yzw);
+}
+
+/*! @abstract Concatenates `xy` and `zw` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(ushort2 xy, ushort2 zw) {
+  return ::simd_make_ushort4(xy, zw);
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort4 make_ushort4(ushort3 xyz, unsigned short w) {
+  return ::simd_make_ushort4(xyz, w);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of four
+ * 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort4 make_ushort4(typeN other) {
+  return ::simd_make_ushort4(other);
+}
+
+/*! @abstract Extends `other` to form a vector of four 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort4 make_ushort4_undef(typeN other) {
+  return ::simd_make_ushort4_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort8 make_ushort8(ushort4 lo, ushort4 hi) {
+  return ::simd_make_ushort8(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of eight
+ * 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort8 make_ushort8(typeN other) {
+  return ::simd_make_ushort8(other);
+}
+
+/*! @abstract Extends `other` to form a vector of eight 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort8 make_ushort8_undef(typeN other) {
+  return ::simd_make_ushort8_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 16-bit
+ * unsigned integers. */
+static inline SIMD_CPPFUNC ushort16 make_ushort16(ushort8 lo, ushort8 hi) {
+  return ::simd_make_ushort16(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen
+ * 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort16 make_ushort16(typeN other) {
+  return ::simd_make_ushort16(other);
+}
+
+/*! @abstract Extends `other` to form a vector of sixteen 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort16 make_ushort16_undef(typeN other) {
+  return ::simd_make_ushort16_undef(other);
+}
+
+/*! @abstract Concatenates `lo` and `hi` to form a vector of thirty-two
+ * 16-bit unsigned integers. */
+static inline SIMD_CPPFUNC ushort32 make_ushort32(ushort16 lo, ushort16 hi) {
+  return ::simd_make_ushort32(lo, hi);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of thirty-
+ * two 16-bit unsigned integers. */
+template <typename typeN> static SIMD_CPPFUNC ushort32 make_ushort32(typeN other) {
+  return ::simd_make_ushort32(other);
+}
+
+/*! @abstract Extends `other` to form a vector of thirty-two 16-bit unsigned
+ * integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC ushort32 make_ushort32_undef(typeN other) {
+  return ::simd_make_ushort32_undef(other);
+}
+
+/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit signed
+ * (twos-complement) integers. */
+static inline SIMD_CPPFUNC int2 make_int2(int x, int y) {
+  return ::simd_make_int2(x, y);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of two
+ * 32-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC int2 make_int2(typeN other) {
+  return ::simd_make_int2(other);
+}
+
+/*! @abstract Extends `other` to form a vector of two 32-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC int2 make_int2_undef(typeN other) {
+  return ::simd_make_int2_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int3 make_int3(int x, int y, int z) {
+  return ::simd_make_int3(x, y, z);
+}
+
+/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int3 make_int3(int x, int2 yz) {
+  return ::simd_make_int3(x, yz);
+}
+
+/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int3 make_int3(int2 xy, int z) {
+  return ::simd_make_int3(xy, z);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of three
+ * 32-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC int3 make_int3(typeN other) {
+  return ::simd_make_int3(other);
+}
+
+/*! @abstract Extends `other` to form a vector of three 32-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC int3 make_int3_undef(typeN other) {
+  return ::simd_make_int3_undef(other);
+}
+
+/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four
+ * 32-bit signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int x, int y, int z, int w) {
+  return ::simd_make_int4(x, y, z, w);
+}
+
+/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int x, int y, int2 zw) {
+  return ::simd_make_int4(x, y, zw);
+}
+
+/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int x, int2 yz, int w) {
+  return ::simd_make_int4(x, yz, w);
+}
+
+/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int2 xy, int z, int w) {
+  return ::simd_make_int4(xy, z, w);
+}
+
+/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int x, int3 yzw) {
+  return ::simd_make_int4(x, yzw);
+}
+
+/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int2 xy, int2 zw) {
+  return ::simd_make_int4(xy, zw);
+}
+
+/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit
+ * signed (twos-complement) integers. */
+static inline SIMD_CPPFUNC int4 make_int4(int3 xyz, int w) {
+  return ::simd_make_int4(xyz, w);
+}
+
+/*! @abstract Truncates or zero-extends `other` to form a vector of four
+ * 32-bit signed (twos-complement) integers. */
+template <typename typeN> static SIMD_CPPFUNC int4 make_int4(typeN other) {
+  return ::simd_make_int4(other);
+}
+
+/*! @abstract Extends `other` to form a vector of four 32-bit signed (twos-
+ * complement) integers. The contents of the newly-created vector lanes are
+ * unspecified. */
+template <typename typeN> static SIMD_CPPFUNC int4 make_int4_undef(typeN other) {
+  return ::simd_make_int4_undef(other);
+}
+
+/*!
@abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC int8 make_int8(int4 lo, int4 hi) { + return ::simd_make_int8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 32-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC int8 make_int8(typeN other) { + return ::simd_make_int8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC int8 make_int8_undef(typeN other) { + return ::simd_make_int8_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC int16 make_int16(int8 lo, int8 hi) { + return ::simd_make_int16(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen + * 32-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC int16 make_int16(typeN other) { + return ::simd_make_int16(other); +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit signed + * (twos-complement) integers. The contents of the newly-created vector + * lanes are unspecified. */ +template static SIMD_CPPFUNC int16 make_int16_undef(typeN other) { + return ::simd_make_int16_undef(other); +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint2 make_uint2(unsigned int x, unsigned int y) { + return ::simd_make_uint2(x, y); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of two + * 32-bit unsigned integers. */ +template static SIMD_CPPFUNC uint2 make_uint2(typeN other) { + return ::simd_make_uint2(other); +} + +/*! @abstract Extends `other` to form a vector of two 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC uint2 make_uint2_undef(typeN other) { + return ::simd_make_uint2_undef(other); +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { + return ::simd_make_uint3(x, y, z); +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint3 make_uint3(unsigned int x, uint2 yz) { + return ::simd_make_uint3(x, yz); +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint3 make_uint3(uint2 xy, unsigned int z) { + return ::simd_make_uint3(xy, z); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of three + * 32-bit unsigned integers. */ +template static SIMD_CPPFUNC uint3 make_uint3(typeN other) { + return ::simd_make_uint3(other); +} + +/*! @abstract Extends `other` to form a vector of three 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC uint3 make_uint3_undef(typeN other) { + return ::simd_make_uint3_undef(other); +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 32-bit unsigned integers. 
*/ +static inline SIMD_CPPFUNC uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { + return ::simd_make_uint4(x, y, z, w); +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(unsigned int x, unsigned int y, uint2 zw) { + return ::simd_make_uint4(x, y, zw); +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(unsigned int x, uint2 yz, unsigned int w) { + return ::simd_make_uint4(x, yz, w); +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(uint2 xy, unsigned int z, unsigned int w) { + return ::simd_make_uint4(xy, z, w); +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(unsigned int x, uint3 yzw) { + return ::simd_make_uint4(x, yzw); +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(uint2 xy, uint2 zw) { + return ::simd_make_uint4(xy, zw); +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint4 make_uint4(uint3 xyz, unsigned int w) { + return ::simd_make_uint4(xyz, w); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of four + * 32-bit unsigned integers. */ +template static SIMD_CPPFUNC uint4 make_uint4(typeN other) { + return ::simd_make_uint4(other); +} + +/*! @abstract Extends `other` to form a vector of four 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC uint4 make_uint4_undef(typeN other) { + return ::simd_make_uint4_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint8 make_uint8(uint4 lo, uint4 hi) { + return ::simd_make_uint8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 32-bit unsigned integers. */ +template static SIMD_CPPFUNC uint8 make_uint8(typeN other) { + return ::simd_make_uint8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC uint8 make_uint8_undef(typeN other) { + return ::simd_make_uint8_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC uint16 make_uint16(uint8 lo, uint8 hi) { + return ::simd_make_uint16(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen + * 32-bit unsigned integers. */ +template static SIMD_CPPFUNC uint16 make_uint16(typeN other) { + return ::simd_make_uint16(other); +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC uint16 make_uint16_undef(typeN other) { + return ::simd_make_uint16_undef(other); +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 32-bit + * floating-point numbers. 
*/ +static inline SIMD_CPPFUNC float2 make_float2(float x, float y) { + return ::simd_make_float2(x, y); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of two + * 32-bit floating-point numbers. */ +template static SIMD_CPPFUNC float2 make_float2(typeN other) { + return ::simd_make_float2(other); +} + +/*! @abstract Extends `other` to form a vector of two 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +template static SIMD_CPPFUNC float2 make_float2_undef(typeN other) { + return ::simd_make_float2_undef(other); +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float3 make_float3(float x, float y, float z) { + return ::simd_make_float3(x, y, z); +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float3 make_float3(float x, float2 yz) { + return ::simd_make_float3(x, yz); +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float3 make_float3(float2 xy, float z) { + return ::simd_make_float3(xy, z); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of three + * 32-bit floating-point numbers. */ +template static SIMD_CPPFUNC float3 make_float3(typeN other) { + return ::simd_make_float3(other); +} + +/*! @abstract Extends `other` to form a vector of three 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC float3 make_float3_undef(typeN other) { + return ::simd_make_float3_undef(other); +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 32-bit floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float x, float y, float z, float w) { + return ::simd_make_float4(x, y, z, w); +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float x, float y, float2 zw) { + return ::simd_make_float4(x, y, zw); +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float x, float2 yz, float w) { + return ::simd_make_float4(x, yz, w); +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float2 xy, float z, float w) { + return ::simd_make_float4(xy, z, w); +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float x, float3 yzw) { + return ::simd_make_float4(x, yzw); +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float2 xy, float2 zw) { + return ::simd_make_float4(xy, zw); +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float4 make_float4(float3 xyz, float w) { + return ::simd_make_float4(xyz, w); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of four + * 32-bit floating-point numbers. 
*/ +template static SIMD_CPPFUNC float4 make_float4(typeN other) { + return ::simd_make_float4(other); +} + +/*! @abstract Extends `other` to form a vector of four 32-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +template static SIMD_CPPFUNC float4 make_float4_undef(typeN other) { + return ::simd_make_float4_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float8 make_float8(float4 lo, float4 hi) { + return ::simd_make_float8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 32-bit floating-point numbers. */ +template static SIMD_CPPFUNC float8 make_float8(typeN other) { + return ::simd_make_float8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC float8 make_float8_undef(typeN other) { + return ::simd_make_float8_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of sixteen 32-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC float16 make_float16(float8 lo, float8 hi) { + return ::simd_make_float16(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of sixteen + * 32-bit floating-point numbers. */ +template static SIMD_CPPFUNC float16 make_float16(typeN other) { + return ::simd_make_float16(other); +} + +/*! @abstract Extends `other` to form a vector of sixteen 32-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC float16 make_float16_undef(typeN other) { + return ::simd_make_float16_undef(other); +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit signed + * (twos-complement) integers. */ +static inline SIMD_CPPFUNC long2 make_long2(long1 x, long1 y) { + return ::simd_make_long2(x, y); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of two + * 64-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC long2 make_long2(typeN other) { + return ::simd_make_long2(other); +} + +/*! @abstract Extends `other` to form a vector of two 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC long2 make_long2_undef(typeN other) { + return ::simd_make_long2_undef(other); +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long3 make_long3(long1 x, long1 y, long1 z) { + return ::simd_make_long3(x, y, z); +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long3 make_long3(long1 x, long2 yz) { + return ::simd_make_long3(x, yz); +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long3 make_long3(long2 xy, long1 z) { + return ::simd_make_long3(xy, z); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of three + * 64-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC long3 make_long3(typeN other) { + return ::simd_make_long3(other); +} + +/*! 
@abstract Extends `other` to form a vector of three 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC long3 make_long3_undef(typeN other) { + return ::simd_make_long3_undef(other); +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long1 x, long1 y, long1 z, long1 w) { + return ::simd_make_long4(x, y, z, w); +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long1 x, long1 y, long2 zw) { + return ::simd_make_long4(x, y, zw); +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long1 x, long2 yz, long1 w) { + return ::simd_make_long4(x, yz, w); +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long2 xy, long1 z, long1 w) { + return ::simd_make_long4(xy, z, w); +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long1 x, long3 yzw) { + return ::simd_make_long4(x, yzw); +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long2 xy, long2 zw) { + return ::simd_make_long4(xy, zw); +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long4 make_long4(long3 xyz, long1 w) { + return ::simd_make_long4(xyz, w); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of four + * 64-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC long4 make_long4(typeN other) { + return ::simd_make_long4(other); +} + +/*! @abstract Extends `other` to form a vector of four 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC long4 make_long4_undef(typeN other) { + return ::simd_make_long4_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * signed (twos-complement) integers. */ +static inline SIMD_CPPFUNC long8 make_long8(long4 lo, long4 hi) { + return ::simd_make_long8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 64-bit signed (twos-complement) integers. */ +template static SIMD_CPPFUNC long8 make_long8(typeN other) { + return ::simd_make_long8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit signed (twos- + * complement) integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC long8 make_long8_undef(typeN other) { + return ::simd_make_long8_undef(other); +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong2 make_ulong2(ulong1 x, ulong1 y) { + return ::simd_make_ulong2(x, y); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of two + * 64-bit unsigned integers. 
*/ +template static SIMD_CPPFUNC ulong2 make_ulong2(typeN other) { + return ::simd_make_ulong2(other); +} + +/*! @abstract Extends `other` to form a vector of two 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC ulong2 make_ulong2_undef(typeN other) { + return ::simd_make_ulong2_undef(other); +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong3 make_ulong3(ulong1 x, ulong1 y, ulong1 z) { + return ::simd_make_ulong3(x, y, z); +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong3 make_ulong3(ulong1 x, ulong2 yz) { + return ::simd_make_ulong3(x, yz); +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong3 make_ulong3(ulong2 xy, ulong1 z) { + return ::simd_make_ulong3(xy, z); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of three + * 64-bit unsigned integers. */ +template static SIMD_CPPFUNC ulong3 make_ulong3(typeN other) { + return ::simd_make_ulong3(other); +} + +/*! @abstract Extends `other` to form a vector of three 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC ulong3 make_ulong3_undef(typeN other) { + return ::simd_make_ulong3_undef(other); +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong1 x, ulong1 y, ulong1 z, ulong1 w) { + return ::simd_make_ulong4(x, y, z, w); +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong1 x, ulong1 y, ulong2 zw) { + return ::simd_make_ulong4(x, y, zw); +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong1 x, ulong2 yz, ulong1 w) { + return ::simd_make_ulong4(x, yz, w); +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong2 xy, ulong1 z, ulong1 w) { + return ::simd_make_ulong4(xy, z, w); +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong1 x, ulong3 yzw) { + return ::simd_make_ulong4(x, yzw); +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong2 xy, ulong2 zw) { + return ::simd_make_ulong4(xy, zw); +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong4 make_ulong4(ulong3 xyz, ulong1 w) { + return ::simd_make_ulong4(xyz, w); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of four + * 64-bit unsigned integers. */ +template static SIMD_CPPFUNC ulong4 make_ulong4(typeN other) { + return ::simd_make_ulong4(other); +} + +/*! @abstract Extends `other` to form a vector of four 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. 
*/ +template static SIMD_CPPFUNC ulong4 make_ulong4_undef(typeN other) { + return ::simd_make_ulong4_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * unsigned integers. */ +static inline SIMD_CPPFUNC ulong8 make_ulong8(ulong4 lo, ulong4 hi) { + return ::simd_make_ulong8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 64-bit unsigned integers. */ +template static SIMD_CPPFUNC ulong8 make_ulong8(typeN other) { + return ::simd_make_ulong8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit unsigned + * integers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC ulong8 make_ulong8_undef(typeN other) { + return ::simd_make_ulong8_undef(other); +} + +/*! @abstract Concatenates `x` and `y` to form a vector of two 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double2 make_double2(double x, double y) { + return ::simd_make_double2(x, y); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of two + * 64-bit floating-point numbers. */ +template static SIMD_CPPFUNC double2 make_double2(typeN other) { + return ::simd_make_double2(other); +} + +/*! @abstract Extends `other` to form a vector of two 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +template static SIMD_CPPFUNC double2 make_double2_undef(typeN other) { + return ::simd_make_double2_undef(other); +} + +/*! @abstract Concatenates `x`, `y` and `z` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double3 make_double3(double x, double y, double z) { + return ::simd_make_double3(x, y, z); +} + +/*! @abstract Concatenates `x` and `yz` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double3 make_double3(double x, double2 yz) { + return ::simd_make_double3(x, yz); +} + +/*! @abstract Concatenates `xy` and `z` to form a vector of three 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double3 make_double3(double2 xy, double z) { + return ::simd_make_double3(xy, z); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of three + * 64-bit floating-point numbers. */ +template static SIMD_CPPFUNC double3 make_double3(typeN other) { + return ::simd_make_double3(other); +} + +/*! @abstract Extends `other` to form a vector of three 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC double3 make_double3_undef(typeN other) { + return ::simd_make_double3_undef(other); +} + +/*! @abstract Concatenates `x`, `y`, `z` and `w` to form a vector of four + * 64-bit floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double x, double y, double z, double w) { + return ::simd_make_double4(x, y, z, w); +} + +/*! @abstract Concatenates `x`, `y` and `zw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double x, double y, double2 zw) { + return ::simd_make_double4(x, y, zw); +} + +/*! @abstract Concatenates `x`, `yz` and `w` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double x, double2 yz, double w) { + return ::simd_make_double4(x, yz, w); +} + +/*! @abstract Concatenates `xy`, `z` and `w` to form a vector of four 64-bit + * floating-point numbers. 
*/ +static inline SIMD_CPPFUNC double4 make_double4(double2 xy, double z, double w) { + return ::simd_make_double4(xy, z, w); +} + +/*! @abstract Concatenates `x` and `yzw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double x, double3 yzw) { + return ::simd_make_double4(x, yzw); +} + +/*! @abstract Concatenates `xy` and `zw` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double2 xy, double2 zw) { + return ::simd_make_double4(xy, zw); +} + +/*! @abstract Concatenates `xyz` and `w` to form a vector of four 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double4 make_double4(double3 xyz, double w) { + return ::simd_make_double4(xyz, w); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of four + * 64-bit floating-point numbers. */ +template static SIMD_CPPFUNC double4 make_double4(typeN other) { + return ::simd_make_double4(other); +} + +/*! @abstract Extends `other` to form a vector of four 64-bit floating-point + * numbers. The contents of the newly-created vector lanes are unspecified. */ +template static SIMD_CPPFUNC double4 make_double4_undef(typeN other) { + return ::simd_make_double4_undef(other); +} + +/*! @abstract Concatenates `lo` and `hi` to form a vector of eight 64-bit + * floating-point numbers. */ +static inline SIMD_CPPFUNC double8 make_double8(double4 lo, double4 hi) { + return ::simd_make_double8(lo, hi); +} + +/*! @abstract Truncates or zero-extends `other` to form a vector of eight + * 64-bit floating-point numbers. */ +template static SIMD_CPPFUNC double8 make_double8(typeN other) { + return ::simd_make_double8(other); +} + +/*! @abstract Extends `other` to form a vector of eight 64-bit floating- + * point numbers. The contents of the newly-created vector lanes are + * unspecified. */ +template static SIMD_CPPFUNC double8 make_double8_undef(typeN other) { + return ::simd_make_double8_undef(other); +} + +/*! @struct Vector + * @abstract Templated Vector struct based on scalar type and number of + * elements + * @field count Number of elements in the vector + * @field scalar_t The scalar type of each element + * @field type The inferred simd::typeN type + * @field packed_t The inferred simd::packed::typeN type + * @field mask_t The return type of comparison operations */ +template struct Vector { + // static const size_t count + // typedef scalar_t + // typedef type + // typedef packed_t + // typedef mask_t +}; +/*! @abstract Helper type to access the simd type easily. */ +template +using Vector_t = typename Vector::type; + +/*! @abstract Look up the equivalent Vector struct according to the simd + * type. */ +template struct get_traits +{ +// using type = Vector; +}; +/*! @abstract Helper type to access the Vector struct easily. + * @discussion This is commonly used to query the type traits of a simd + * type. + * For example, simd::traits::count is 4. 
*/ +template +using traits = typename get_traits::type; + +template<> struct Vector { + static const size_t count = 1; + typedef char1 scalar_t; + typedef char1 type; + typedef char1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef char1 scalar_t; + typedef char2 type; + typedef packed::char2 packed_t; + typedef char2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef char1 scalar_t; + typedef char3 type; + typedef char3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef char1 scalar_t; + typedef char4 type; + typedef packed::char4 packed_t; + typedef char4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef char1 scalar_t; + typedef char8 type; + typedef packed::char8 packed_t; + typedef char8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef char1 scalar_t; + typedef char16 type; + typedef packed::char16 packed_t; + typedef char16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 32; + typedef char1 scalar_t; + typedef char32 type; + typedef packed::char32 packed_t; + typedef char32 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 64; + typedef char1 scalar_t; + typedef char64 type; + typedef packed::char64 packed_t; + typedef char64 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef uchar1 scalar_t; + typedef uchar1 type; + typedef char1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef uchar1 scalar_t; + typedef uchar2 type; + typedef packed::uchar2 packed_t; + typedef char2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef uchar1 scalar_t; + typedef uchar3 type; + typedef char3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef uchar1 scalar_t; + typedef uchar4 type; + typedef packed::uchar4 packed_t; + typedef char4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef uchar1 scalar_t; + typedef uchar8 type; + typedef packed::uchar8 packed_t; + typedef char8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef uchar1 scalar_t; + typedef uchar16 type; + typedef packed::uchar16 packed_t; + typedef char16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 32; + typedef uchar1 scalar_t; + typedef uchar32 type; + typedef packed::uchar32 packed_t; + typedef char32 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t 
count = 64; + typedef uchar1 scalar_t; + typedef uchar64 type; + typedef packed::uchar64 packed_t; + typedef char64 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef short1 scalar_t; + typedef short1 type; + typedef short1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef short1 scalar_t; + typedef short2 type; + typedef packed::short2 packed_t; + typedef short2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef short1 scalar_t; + typedef short3 type; + typedef short3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef short1 scalar_t; + typedef short4 type; + typedef packed::short4 packed_t; + typedef short4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef short1 scalar_t; + typedef short8 type; + typedef packed::short8 packed_t; + typedef short8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef short1 scalar_t; + typedef short16 type; + typedef packed::short16 packed_t; + typedef short16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 32; + typedef short1 scalar_t; + typedef short32 type; + typedef packed::short32 packed_t; + typedef short32 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef ushort1 scalar_t; + typedef ushort1 type; + typedef short1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef ushort1 scalar_t; + typedef ushort2 type; + typedef packed::ushort2 packed_t; + typedef short2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef ushort1 scalar_t; + typedef ushort3 type; + typedef short3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef ushort1 scalar_t; + typedef ushort4 type; + typedef packed::ushort4 packed_t; + typedef short4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef ushort1 scalar_t; + typedef ushort8 type; + typedef packed::ushort8 packed_t; + typedef short8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef ushort1 scalar_t; + typedef ushort16 type; + typedef packed::ushort16 packed_t; + typedef short16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 32; + typedef ushort1 scalar_t; + typedef ushort32 type; + typedef packed::ushort32 packed_t; + typedef short32 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef int1 scalar_t; + typedef int1 type; 
+ typedef int1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef int1 scalar_t; + typedef int2 type; + typedef packed::int2 packed_t; + typedef int2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef int1 scalar_t; + typedef int3 type; + typedef int3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef int1 scalar_t; + typedef int4 type; + typedef packed::int4 packed_t; + typedef int4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef int1 scalar_t; + typedef int8 type; + typedef packed::int8 packed_t; + typedef int8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef int1 scalar_t; + typedef int16 type; + typedef packed::int16 packed_t; + typedef int16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef uint1 scalar_t; + typedef uint1 type; + typedef int1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef uint1 scalar_t; + typedef uint2 type; + typedef packed::uint2 packed_t; + typedef int2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef uint1 scalar_t; + typedef uint3 type; + typedef int3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef uint1 scalar_t; + typedef uint4 type; + typedef packed::uint4 packed_t; + typedef int4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef uint1 scalar_t; + typedef uint8 type; + typedef packed::uint8 packed_t; + typedef int8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef uint1 scalar_t; + typedef uint16 type; + typedef packed::uint16 packed_t; + typedef int16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef float1 scalar_t; + typedef float1 type; + typedef int1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef float1 scalar_t; + typedef float2 type; + typedef packed::float2 packed_t; + typedef int2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef float1 scalar_t; + typedef float3 type; + typedef int3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef float1 scalar_t; + typedef float4 type; + typedef packed::float4 packed_t; + typedef int4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef float1 scalar_t; + typedef 
float8 type; + typedef packed::float8 packed_t; + typedef int8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 16; + typedef float1 scalar_t; + typedef float16 type; + typedef packed::float16 packed_t; + typedef int16 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef long1 scalar_t; + typedef long1 type; + typedef long1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef long1 scalar_t; + typedef long2 type; + typedef packed::long2 packed_t; + typedef long2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef long1 scalar_t; + typedef long3 type; + typedef long3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef long1 scalar_t; + typedef long4 type; + typedef packed::long4 packed_t; + typedef long4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef long1 scalar_t; + typedef long8 type; + typedef packed::long8 packed_t; + typedef long8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef ulong1 scalar_t; + typedef ulong1 type; + typedef long1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef ulong1 scalar_t; + typedef ulong2 type; + typedef packed::ulong2 packed_t; + typedef long2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef ulong1 scalar_t; + typedef ulong3 type; + typedef long3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef ulong1 scalar_t; + typedef ulong4 type; + typedef packed::ulong4 packed_t; + typedef long4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 8; + typedef ulong1 scalar_t; + typedef ulong8 type; + typedef packed::ulong8 packed_t; + typedef long8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 1; + typedef double1 scalar_t; + typedef double1 type; + typedef long1 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 2; + typedef double1 scalar_t; + typedef double2 type; + typedef packed::double2 packed_t; + typedef long2 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 3; + typedef double1 scalar_t; + typedef double3 type; + typedef long3 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static const size_t count = 4; + typedef double1 scalar_t; + typedef double4 type; + typedef packed::double4 packed_t; + typedef long4 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +template<> struct Vector { + static 
const size_t count = 8; + typedef double1 scalar_t; + typedef double8 type; + typedef packed::double8 packed_t; + typedef long8 mask_t; +}; + +template <> struct get_traits +{ + using type = Vector; +}; + +#if __has_feature(cxx_constexpr) +/*! @abstract Templated make function based on return type and argument + * type. */ +template +static constexpr typeN make(Args... args) +{ + if constexpr (traits::count == 1) + { + using FirstArgType = typename std::tuple_element<0, std::tuple>::type; + if constexpr (std::is_same::scalar_t>::value) + return typeN(std::get<0>(std::make_tuple(args...))); + else + return typeN(std::get<0>(std::make_tuple(args...))[0]); + } + else if constexpr (std::is_same::value) + return make_char2(args...); + else if constexpr (std::is_same::value) + return make_char3(args...); + else if constexpr (std::is_same::value) + return make_char4(args...); + else if constexpr (std::is_same::value) + return make_char8(args...); + else if constexpr (std::is_same::value) + return make_char16(args...); + else if constexpr (std::is_same::value) + return make_char32(args...); + else if constexpr (std::is_same::value) + return make_char64(args...); + else if constexpr (std::is_same::value) + return make_uchar2(args...); + else if constexpr (std::is_same::value) + return make_uchar3(args...); + else if constexpr (std::is_same::value) + return make_uchar4(args...); + else if constexpr (std::is_same::value) + return make_uchar8(args...); + else if constexpr (std::is_same::value) + return make_uchar16(args...); + else if constexpr (std::is_same::value) + return make_uchar32(args...); + else if constexpr (std::is_same::value) + return make_uchar64(args...); + else if constexpr (std::is_same::value) + return make_short2(args...); + else if constexpr (std::is_same::value) + return make_short3(args...); + else if constexpr (std::is_same::value) + return make_short4(args...); + else if constexpr (std::is_same::value) + return make_short8(args...); + else if constexpr (std::is_same::value) + return make_short16(args...); + else if constexpr (std::is_same::value) + return make_short32(args...); + else if constexpr (std::is_same::value) + return make_ushort2(args...); + else if constexpr (std::is_same::value) + return make_ushort3(args...); + else if constexpr (std::is_same::value) + return make_ushort4(args...); + else if constexpr (std::is_same::value) + return make_ushort8(args...); + else if constexpr (std::is_same::value) + return make_ushort16(args...); + else if constexpr (std::is_same::value) + return make_ushort32(args...); + else if constexpr (std::is_same::value) + return make_int2(args...); + else if constexpr (std::is_same::value) + return make_int3(args...); + else if constexpr (std::is_same::value) + return make_int4(args...); + else if constexpr (std::is_same::value) + return make_int8(args...); + else if constexpr (std::is_same::value) + return make_int16(args...); + else if constexpr (std::is_same::value) + return make_uint2(args...); + else if constexpr (std::is_same::value) + return make_uint3(args...); + else if constexpr (std::is_same::value) + return make_uint4(args...); + else if constexpr (std::is_same::value) + return make_uint8(args...); + else if constexpr (std::is_same::value) + return make_uint16(args...); + else if constexpr (std::is_same::value) + return make_float2(args...); + else if constexpr (std::is_same::value) + return make_float3(args...); + else if constexpr (std::is_same::value) + return make_float4(args...); + else if constexpr (std::is_same::value) + 
return make_float8(args...); + else if constexpr (std::is_same::value) + return make_float16(args...); + else if constexpr (std::is_same::value) + return make_long2(args...); + else if constexpr (std::is_same::value) + return make_long3(args...); + else if constexpr (std::is_same::value) + return make_long4(args...); + else if constexpr (std::is_same::value) + return make_long8(args...); + else if constexpr (std::is_same::value) + return make_ulong2(args...); + else if constexpr (std::is_same::value) + return make_ulong3(args...); + else if constexpr (std::is_same::value) + return make_ulong4(args...); + else if constexpr (std::is_same::value) + return make_ulong8(args...); + else if constexpr (std::is_same::value) + return make_double2(args...); + else if constexpr (std::is_same::value) + return make_double3(args...); + else if constexpr (std::is_same::value) + return make_double4(args...); + else if constexpr (std::is_same::value) + return make_double8(args...); +} + +/*! @abstract Templated make_undef function based on return type and + * argument type. */ +template +static constexpr typeN make_undef(Args... args) +{ + if constexpr (traits::count == 1) + { + using FirstArgType = typename std::tuple_element<0, std::tuple>::type; + if constexpr (std::is_same::scalar_t>::value) + return typeN(std::get<0>(std::make_tuple(args...))); + else + return typeN(std::get<0>(std::make_tuple(args...))[0]); + } + else if constexpr (std::is_same::value) + return make_char2_undef(args...); + else if constexpr (std::is_same::value) + return make_char3_undef(args...); + else if constexpr (std::is_same::value) + return make_char4_undef(args...); + else if constexpr (std::is_same::value) + return make_char8_undef(args...); + else if constexpr (std::is_same::value) + return make_char16_undef(args...); + else if constexpr (std::is_same::value) + return make_char32_undef(args...); + else if constexpr (std::is_same::value) + return make_char64_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar2_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar3_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar4_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar8_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar16_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar32_undef(args...); + else if constexpr (std::is_same::value) + return make_uchar64_undef(args...); + else if constexpr (std::is_same::value) + return make_short2_undef(args...); + else if constexpr (std::is_same::value) + return make_short3_undef(args...); + else if constexpr (std::is_same::value) + return make_short4_undef(args...); + else if constexpr (std::is_same::value) + return make_short8_undef(args...); + else if constexpr (std::is_same::value) + return make_short16_undef(args...); + else if constexpr (std::is_same::value) + return make_short32_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort2_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort3_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort4_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort8_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort16_undef(args...); + else if constexpr (std::is_same::value) + return make_ushort32_undef(args...); + else if constexpr (std::is_same::value) + return 
make_int2_undef(args...); + else if constexpr (std::is_same::value) + return make_int3_undef(args...); + else if constexpr (std::is_same::value) + return make_int4_undef(args...); + else if constexpr (std::is_same::value) + return make_int8_undef(args...); + else if constexpr (std::is_same::value) + return make_int16_undef(args...); + else if constexpr (std::is_same::value) + return make_uint2_undef(args...); + else if constexpr (std::is_same::value) + return make_uint3_undef(args...); + else if constexpr (std::is_same::value) + return make_uint4_undef(args...); + else if constexpr (std::is_same::value) + return make_uint8_undef(args...); + else if constexpr (std::is_same::value) + return make_uint16_undef(args...); + else if constexpr (std::is_same::value) + return make_float2_undef(args...); + else if constexpr (std::is_same::value) + return make_float3_undef(args...); + else if constexpr (std::is_same::value) + return make_float4_undef(args...); + else if constexpr (std::is_same::value) + return make_float8_undef(args...); + else if constexpr (std::is_same::value) + return make_float16_undef(args...); + else if constexpr (std::is_same::value) + return make_long2_undef(args...); + else if constexpr (std::is_same::value) + return make_long3_undef(args...); + else if constexpr (std::is_same::value) + return make_long4_undef(args...); + else if constexpr (std::is_same::value) + return make_long8_undef(args...); + else if constexpr (std::is_same::value) + return make_ulong2_undef(args...); + else if constexpr (std::is_same::value) + return make_ulong3_undef(args...); + else if constexpr (std::is_same::value) + return make_ulong4_undef(args...); + else if constexpr (std::is_same::value) + return make_ulong8_undef(args...); + else if constexpr (std::is_same::value) + return make_double2_undef(args...); + else if constexpr (std::is_same::value) + return make_double3_undef(args...); + else if constexpr (std::is_same::value) + return make_double4_undef(args...); + else if constexpr (std::is_same::value) + return make_double8_undef(args...); +} +#endif /* __has_feature(cxx_constexpr) */ +} /* namespace simd */ +#endif /* __cplusplus */ +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_VECTOR_CONSTRUCTORS */ diff --git a/vfsoverlay/vector_types.h b/vfsoverlay/vector_types.h new file mode 100644 index 00000000..223d696e --- /dev/null +++ b/vfsoverlay/vector_types.h @@ -0,0 +1,1281 @@ +/*! @header + * This header defines fixed size vector types that are useful both for + * graphics and geometry, and for software vectorization without + * architecture-specific intrinsics. + * + * These types are based on a clang feature called "Extended vector types" + * or "OpenCL vector types" (despite the name, these types work just fine + * in C, Objective-C, and C++). There are a few tricks that make these + * types nicer to work with than traditional simd intrinsic types: + * + * - Basic arithmetic operators are overloaded to perform lanewise + * operations with these types, including both vector-vector and + * vector-scalar operations. + * + * - It is possible to access vector components both via array-style + * subscripting and by using the "." operator with component names + * "x", "y", "z", "w", and permutations thereof. + * + * - There are also some named subvectors: .lo and .hi are the first + * and second halves of a vector, and .even and .odd are the even- + * and odd-indexed elements of a vector. 
+ * + * - Clang provides some useful builtins that operate on these vector + * types: __builtin_shufflevector and __builtin_convertvector. + * + * - The <simd/simd.h> headers define a large assortment of vector and + * matrix operations that work on these types. + * + * - You can also use the simd types with the architecture-specific + * intrinsics defined in <immintrin.h> and <arm_neon.h>. + * + * The following vector types are defined by this header: + * + * simd_charN where N is 1, 2, 3, 4, 8, 16, 32, or 64. + * simd_ucharN where N is 1, 2, 3, 4, 8, 16, 32, or 64. + * simd_shortN where N is 1, 2, 3, 4, 8, 16, or 32. + * simd_ushortN where N is 1, 2, 3, 4, 8, 16, or 32. + * simd_intN where N is 1, 2, 3, 4, 8, or 16. + * simd_uintN where N is 1, 2, 3, 4, 8, or 16. + * simd_floatN where N is 1, 2, 3, 4, 8, or 16. + * simd_longN where N is 1, 2, 3, 4, or 8. + * simd_ulongN where N is 1, 2, 3, 4, or 8. + * simd_doubleN where N is 1, 2, 3, 4, or 8. + * + * These types generally have greater alignment than the underlying scalar + * type; they are aligned to either the size of the vector[1] or 16 bytes, + * whichever is smaller. + * + * [1] Note that sizeof a three-element vector is the same as sizeof the + * corresponding four-element vector, because three-element vectors have + * a hidden lane of padding. + * + * In earlier versions of the simd library, the alignment of vectors could + * be larger than 16B, up to the "architectural vector size" of 16, 32, or + * 64B, depending on what options were passed on the command line when + * compiling. This super-alignment does not interact well with malloc, and + * makes it difficult for libraries to provide a stable API, while conferring + * relatively little performance benefit, so it has been relaxed. + * + * For each simd_typeN type where N is not 1 or 3, there is also a + * corresponding simd_packed_typeN type that requires only the alignment + * matching that of the underlying scalar type. Use this if you need to + * work with pointers-to or arrays-of scalar values: + * + * void myFunction(float *pointerToFourFloats) { + * // This is a bug, because `pointerToFourFloats` does not satisfy + * // the alignment requirements of the `simd_float4` type; attempting + * // to dereference (load from) `vecptr` is likely to crash at runtime. + * simd_float4 *vecptr = (simd_float4 *)pointerToFourFloats; + * + * // Instead, convert to `simd_packed_float4`: + * simd_packed_float4 *vecptr = (simd_packed_float4 *)pointerToFourFloats; + * // The `simd_packed_float4` type has the same alignment requirements + * // as `float`, so this conversion is safe, and lets us load a vector. + * // Note that `simd_packed_float4` can be assigned to `simd_float4` + * // without any conversion; the types only behave differently as + * // pointers or arrays. + * simd_float4 vector = vecptr[0]; + * } + * + * All of the simd_-prefixed types are also available in the C++ simd:: + * namespace; simd_char4 can be used as simd::char4, for example. These types + * largely match the Metal shader language vector types, except that there + * are no vector types larger than 4 elements in Metal. + * + * @copyright 2014-2017 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_VECTOR_TYPES +#define SIMD_VECTOR_TYPES + +# include <simd/base.h> +# if SIMD_COMPILER_HAS_REQUIRED_FEATURES + +/* MARK: Basic vector types */ + +/*! @group C and Objective-C vector types + * @discussion These are the basic types that underpin the simd library. */ + +/*! @abstract A scalar 8-bit signed (twos-complement) integer.
*/ +typedef char simd_char1; + +/*! @abstract A vector of two 8-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::char2. The alignment of this type is greater than the alignment of + * char; if you need to operate on data buffers that may not be suitably + * aligned, you should access them using simd_packed_char2 instead. */ +typedef __attribute__((__ext_vector_type__(2))) char simd_char2; + +/*! @abstract A vector of three 8-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::char3. Note that vectors of this type are padded to have the same + * size and alignment as simd_char4. */ +typedef __attribute__((__ext_vector_type__(3))) char simd_char3; + +/*! @abstract A vector of four 8-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::char4. The alignment of this type is greater than the alignment of + * char; if you need to operate on data buffers that may not be suitably + * aligned, you should access them using simd_packed_char4 instead. */ +typedef __attribute__((__ext_vector_type__(4))) char simd_char4; + +/*! @abstract A vector of eight 8-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::char8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of char; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_char8 instead. */ +typedef __attribute__((__ext_vector_type__(8))) char simd_char8; + +/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::char16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of char; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_char16 instead. */ +typedef __attribute__((__ext_vector_type__(16))) char simd_char16; + +/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) + * integers. + * @description In C++ this type is also available as simd::char32. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of char; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_char32 instead. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(16))) char simd_char32; + +/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) + * integers. + * @description In C++ this type is also available as simd::char64. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of char; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_char64 instead. */ +typedef __attribute__((__ext_vector_type__(64),__aligned__(16))) char simd_char64; + +/*! @abstract A scalar 8-bit unsigned integer. */ +typedef unsigned char simd_uchar1; + +/*! @abstract A vector of two 8-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uchar2. 
The alignment of this type is greater than the alignment + * of unsigned char; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_uchar2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) unsigned char simd_uchar2; + +/*! @abstract A vector of three 8-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uchar3. Note that vectors of this type are padded to have the same + * size and alignment as simd_uchar4. */ +typedef __attribute__((__ext_vector_type__(3))) unsigned char simd_uchar3; + +/*! @abstract A vector of four 8-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uchar4. The alignment of this type is greater than the alignment + * of unsigned char; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_uchar4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4))) unsigned char simd_uchar4; + +/*! @abstract A vector of eight 8-bit unsigned integers. + * @description In C++ this type is also available as simd::uchar8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uchar8 instead. */ +typedef __attribute__((__ext_vector_type__(8))) unsigned char simd_uchar8; + +/*! @abstract A vector of sixteen 8-bit unsigned integers. + * @description In C++ this type is also available as simd::uchar16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uchar16 instead. */ +typedef __attribute__((__ext_vector_type__(16))) unsigned char simd_uchar16; + +/*! @abstract A vector of thirty-two 8-bit unsigned integers. + * @description In C++ this type is also available as simd::uchar32. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uchar32 instead. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(16))) unsigned char simd_uchar32; + +/*! @abstract A vector of sixty-four 8-bit unsigned integers. + * @description In C++ this type is also available as simd::uchar64. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uchar64 instead. */ +typedef __attribute__((__ext_vector_type__(64),__aligned__(16))) unsigned char simd_uchar64; + +/*! @abstract A scalar 16-bit signed (twos-complement) integer. */ +typedef short simd_short1; + +/*! @abstract A vector of two 16-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::short2. The alignment of this type is greater than the alignment + * of short; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_short2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) short simd_short2; + +/*! 
@abstract A vector of three 16-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::short3. Note that vectors of this type are padded to have the same + * size and alignment as simd_short4. */ +typedef __attribute__((__ext_vector_type__(3))) short simd_short3; + +/*! @abstract A vector of four 16-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::short4. The alignment of this type is greater than the alignment + * of short; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_short4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4))) short simd_short4; + +/*! @abstract A vector of eight 16-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::short8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of short; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_short8 instead. */ +typedef __attribute__((__ext_vector_type__(8))) short simd_short8; + +/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::short16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of short; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_short16 instead. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) short simd_short16; + +/*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers. + * @description In C++ this type is also available as simd::short32. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of short; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_short32 instead. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(16))) short simd_short32; + +/*! @abstract A scalar 16-bit unsigned integer. */ +typedef unsigned short simd_ushort1; + +/*! @abstract A vector of two 16-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ushort2. The alignment of this type is greater than the alignment + * of unsigned short; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd_packed_ushort2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) unsigned short simd_ushort2; + +/*! @abstract A vector of three 16-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ushort3. Note that vectors of this type are padded to have the + * same size and alignment as simd_ushort4. */ +typedef __attribute__((__ext_vector_type__(3))) unsigned short simd_ushort3; + +/*! @abstract A vector of four 16-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ushort4. The alignment of this type is greater than the alignment + * of unsigned short; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd_packed_ushort4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4))) unsigned short simd_ushort4; + +/*! 
@abstract A vector of eight 16-bit unsigned integers. + * @description In C++ this type is also available as simd::ushort8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_ushort8 instead. */ +typedef __attribute__((__ext_vector_type__(8))) unsigned short simd_ushort8; + +/*! @abstract A vector of sixteen 16-bit unsigned integers. + * @description In C++ this type is also available as simd::ushort16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_ushort16 instead. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) unsigned short simd_ushort16; + +/*! @abstract A vector of thirty-two 16-bit unsigned integers. + * @description In C++ this type is also available as simd::ushort32. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_ushort32 instead. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(16))) unsigned short simd_ushort32; + +/*! @abstract A scalar 32-bit signed (twos-complement) integer. */ +typedef int simd_int1; + +/*! @abstract A vector of two 32-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::int2. The alignment of this type is greater than the alignment of + * int; if you need to operate on data buffers that may not be suitably + * aligned, you should access them using simd_packed_int2 instead. */ +typedef __attribute__((__ext_vector_type__(2))) int simd_int2; + +/*! @abstract A vector of three 32-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::int3. Note that vectors of this type are padded to have the same + * size and alignment as simd_int4. */ +typedef __attribute__((__ext_vector_type__(3))) int simd_int3; + +/*! @abstract A vector of four 32-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::int4. The alignment of this type is greater than the alignment of + * int; if you need to operate on data buffers that may not be suitably + * aligned, you should access them using simd_packed_int4 instead. */ +typedef __attribute__((__ext_vector_type__(4))) int simd_int4; + +/*! @abstract A vector of eight 32-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::int8. This type + * is not available in Metal. The alignment of this type is greater than + * the alignment of int; if you need to operate on data buffers that may + * not be suitably aligned, you should access them using simd_packed_int8 + * instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) int simd_int8; + +/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::int16. This + * type is not available in Metal. 
The alignment of this type is greater + * than the alignment of int; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_int16 instead. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) int simd_int16; + +/*! @abstract A scalar 32-bit unsigned integer. */ +typedef unsigned int simd_uint1; + +/*! @abstract A vector of two 32-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uint2. The alignment of this type is greater than the alignment of + * unsigned int; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_uint2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) unsigned int simd_uint2; + +/*! @abstract A vector of three 32-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uint3. Note that vectors of this type are padded to have the same + * size and alignment as simd_uint4. */ +typedef __attribute__((__ext_vector_type__(3))) unsigned int simd_uint3; + +/*! @abstract A vector of four 32-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::uint4. The alignment of this type is greater than the alignment of + * unsigned int; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_uint4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4))) unsigned int simd_uint4; + +/*! @abstract A vector of eight 32-bit unsigned integers. + * @description In C++ this type is also available as simd::uint8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned int; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uint8 instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) unsigned int simd_uint8; + +/*! @abstract A vector of sixteen 32-bit unsigned integers. + * @description In C++ this type is also available as simd::uint16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of unsigned int; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_uint16 instead. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) unsigned int simd_uint16; + +/*! @abstract A scalar 32-bit floating-point number. */ +typedef float simd_float1; + +/*! @abstract A vector of two 32-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::float2. The alignment of this type is greater than the alignment + * of float; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_float2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) float simd_float2; + +/*! @abstract A vector of three 32-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::float3. Note that vectors of this type are padded to have the same + * size and alignment as simd_float4. */ +typedef __attribute__((__ext_vector_type__(3))) float simd_float3; + +/*! @abstract A vector of four 32-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::float4. 
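+ * (Editor's aside, an illustrative sketch rather than original header + * text; the names a, b and y are arbitrary: + * + * simd_float4 a = { 1.0f, 2.0f, 3.0f, 4.0f }; + * simd_float4 b = a + 1.0f; // lanewise add: { 2, 3, 4, 5 } + * float y = b.y; // named component access, y == 3.0f + * )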
The alignment of this type is greater than the alignment + * of float; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_float4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4))) float simd_float4; + +/*! @abstract A vector of eight 32-bit floating-point numbers. + * @description In C++ this type is also available as simd::float8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of float; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_float8 instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) float simd_float8; + +/*! @abstract A vector of sixteen 32-bit floating-point numbers. + * @description In C++ this type is also available as simd::float16. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of float; if you need to operate on data buffers that + * may not be suitably aligned, you should access them using + * simd_packed_float16 instead. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) float simd_float16; + +/*! @abstract A scalar 64-bit signed (twos-complement) integer. */ +#if defined __LP64__ +typedef long simd_long1; +#else +typedef long long simd_long1; +#endif + +/*! @abstract A vector of two 64-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::long2. The alignment of this type is greater than the alignment of + * simd_long1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_long2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) simd_long1 simd_long2; + +/*! @abstract A vector of three 64-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::long3. Note that vectors of this type are padded to have the same + * size and alignment as simd_long4. */ +typedef __attribute__((__ext_vector_type__(3),__aligned__(16))) simd_long1 simd_long3; + +/*! @abstract A vector of four 64-bit signed (twos-complement) integers. + * @description In C++ and Metal, this type is also available as + * simd::long4. The alignment of this type is greater than the alignment of + * simd_long1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_long4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(16))) simd_long1 simd_long4; + +/*! @abstract A vector of eight 64-bit signed (twos-complement) integers. + * @description In C++ this type is also available as simd::long8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of simd_long1; if you need to operate on data buffers + * that may not be suitably aligned, you should access them using + * simd_packed_long8 instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) simd_long1 simd_long8; + +/*! @abstract A scalar 64-bit unsigned integer. */ +#if defined __LP64__ +typedef unsigned long simd_ulong1; +#else +typedef unsigned long long simd_ulong1; +#endif + +/*! @abstract A vector of two 64-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ulong2.
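+ * (Editor's aside, not original header text: the #if defined __LP64__ + * definitions above give simd_long1 and simd_ulong1 a 64-bit + * representation on every supported platform, which C++ code can check + * with, for example, + * + * static_assert(sizeof(simd_long1) == 8, "simd_long1 is 64-bit"); + * static_assert(sizeof(simd_ulong1) == 8, "simd_ulong1 is 64-bit"); + * )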
The alignment of this type is greater than the alignment + * of simd_ulong1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_ulong2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) simd_ulong1 simd_ulong2; + +/*! @abstract A vector of three 64-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ulong3. Note that vectors of this type are padded to have the same + * size and alignment as simd_ulong4. */ +typedef __attribute__((__ext_vector_type__(3),__aligned__(16))) simd_ulong1 simd_ulong3; + +/*! @abstract A vector of four 64-bit unsigned integers. + * @description In C++ and Metal, this type is also available as + * simd::ulong4. The alignment of this type is greater than the alignment + * of simd_ulong1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_ulong4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(16))) simd_ulong1 simd_ulong4; + +/*! @abstract A vector of eight 64-bit unsigned integers. + * @description In C++ this type is also available as simd::ulong8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of simd_ulong1; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd_packed_ulong8 instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) simd_ulong1 simd_ulong8; + +/*! @abstract A scalar 64-bit floating-point number. */ +typedef double simd_double1; + +/*! @abstract A vector of two 64-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::double2. The alignment of this type is greater than the alignment + * of double; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_double2 + * instead. */ +typedef __attribute__((__ext_vector_type__(2))) double simd_double2; + +/*! @abstract A vector of three 64-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::double3. Note that vectors of this type are padded to have the + * same size and alignment as simd_double4. */ +typedef __attribute__((__ext_vector_type__(3),__aligned__(16))) double simd_double3; + +/*! @abstract A vector of four 64-bit floating-point numbers. + * @description In C++ and Metal, this type is also available as + * simd::double4. The alignment of this type is greater than the alignment + * of double; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd_packed_double4 + * instead. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(16))) double simd_double4; + +/*! @abstract A vector of eight 64-bit floating-point numbers. + * @description In C++ this type is also available as simd::double8. This + * type is not available in Metal. The alignment of this type is greater + * than the alignment of double; if you need to operate on data buffers + * that may not be suitably aligned, you should access them using + * simd_packed_double8 instead. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) double simd_double8; + +/* MARK: C++ vector types */ +#if defined __cplusplus +/*! @group C++ and Metal vector types + * @discussion Shorter type names available within the simd:: namespace.
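+ * (Editor's sketch, not original header text, illustrating the + * interchangeability described below; it assumes C++ and the umbrella + * <simd/simd.h> header, and the names v and w are arbitrary: + * + * #include <simd/simd.h> + * simd::float4 v = { 1.0f, 2.0f, 3.0f, 4.0f }; + * ::simd_float4 w = v; // same type; no conversion involved + * )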
+ * + * Each of these types is interchangeable with the corresponding C type + * with the `simd_` prefix. */ +namespace simd { + /*! @abstract A scalar 8-bit signed (twos-complement) integer. + * @discussion In C and Objective-C, this type is available as + * simd_char1. */ +typedef ::simd_char1 char1; + + /*! @abstract A vector of two 8-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_char2. The alignment of this type is greater than the alignment + * of char; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_char2 + * instead. */ +typedef ::simd_char2 char2; + + /*! @abstract A vector of three 8-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_char3. Vectors of this type are padded to have the same size and + * alignment as simd_char4. */ +typedef ::simd_char3 char3; + + /*! @abstract A vector of four 8-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_char4. The alignment of this type is greater than the alignment + * of char; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_char4 + * instead. */ +typedef ::simd_char4 char4; + + /*! @abstract A vector of eight 8-bit signed (twos-complement) integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_char8. The alignment of this type is + * greater than the alignment of char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_char8 instead. */ +typedef ::simd_char8 char8; + + /*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_char16. The alignment of this type is + * greater than the alignment of char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_char16 instead. */ +typedef ::simd_char16 char16; + + /*! @abstract A vector of thirty-two 8-bit signed (twos-complement) + * integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_char32. The alignment of this type is + * greater than the alignment of char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_char32 instead. */ +typedef ::simd_char32 char32; + + /*! @abstract A vector of sixty-four 8-bit signed (twos-complement) + * integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_char64. The alignment of this type is + * greater than the alignment of char; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_char64 instead. */ +typedef ::simd_char64 char64; + + /*! @abstract A scalar 8-bit unsigned integer. + * @discussion In C and Objective-C, this type is available as + * simd_uchar1. */ +typedef ::simd_uchar1 uchar1; + + /*! @abstract A vector of two 8-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_uchar2.
The alignment of this type is greater than the alignment + * of unsigned char; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_uchar2 + * instead. */ +typedef ::simd_uchar2 uchar2; + + /*! @abstract A vector of three 8-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_uchar3. Vectors of this type are padded to have the same size and + * alignment as simd_uchar4. */ +typedef ::simd_uchar3 uchar3; + + /*! @abstract A vector of four 8-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_uchar4. The alignment of this type is greater than the alignment + * of unsigned char; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_uchar4 + * instead. */ +typedef ::simd_uchar4 uchar4; + + /*! @abstract A vector of eight 8-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uchar8. The alignment of this type is + * greater than the alignment of unsigned char; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uchar8 instead. */ +typedef ::simd_uchar8 uchar8; + + /*! @abstract A vector of sixteen 8-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uchar16. The alignment of this type is + * greater than the alignment of unsigned char; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uchar16 instead. */ +typedef ::simd_uchar16 uchar16; + + /*! @abstract A vector of thirty-two 8-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uchar32. The alignment of this type is + * greater than the alignment of unsigned char; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uchar32 instead. */ +typedef ::simd_uchar32 uchar32; + + /*! @abstract A vector of sixty-four 8-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uchar64. The alignment of this type is + * greater than the alignment of unsigned char; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uchar64 instead. */ +typedef ::simd_uchar64 uchar64; + + /*! @abstract A scalar 16-bit signed (twos-complement) integer. + * @discussion In C and Objective-C, this type is available as + * simd_short1. */ +typedef ::simd_short1 short1; + + /*! @abstract A vector of two 16-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_short2. The alignment of this type is greater than the alignment + * of short; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_short2 + * instead. */ +typedef ::simd_short2 short2; + + /*! @abstract A vector of three 16-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_short3. Vectors of this type are padded to have the same size and + * alignment as simd_short4. */ +typedef ::simd_short3 short3; + + /*!
@abstract A vector of four 16-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_short4. The alignment of this type is greater than the alignment + * of short; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_short4 + * instead. */ +typedef ::simd_short4 short4; + + /*! @abstract A vector of eight 16-bit signed (twos-complement) integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_short8. The alignment of this type is + * greater than the alignment of short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_short8 instead. */ +typedef ::simd_short8 short8; + + /*! @abstract A vector of sixteen 16-bit signed (twos-complement) + * integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_short16. The alignment of this type is + * greater than the alignment of short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_short16 instead. */ +typedef ::simd_short16 short16; + + /*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_short32. The alignment of this type is + * greater than the alignment of short; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_short32 instead. */ +typedef ::simd_short32 short32; + + /*! @abstract A scalar 16-bit unsigned integer. + * @discussion In C and Objective-C, this type is available as + * simd_ushort1. */ +typedef ::simd_ushort1 ushort1; + + /*! @abstract A vector of two 16-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ushort2. The alignment of this type is greater than the alignment + * of unsigned short; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_ushort2 + * instead. */ +typedef ::simd_ushort2 ushort2; + + /*! @abstract A vector of three 16-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ushort3. Vectors of this type are padded to have the same size + * and alignment as simd_ushort4. */ +typedef ::simd_ushort3 ushort3; + + /*! @abstract A vector of four 16-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ushort4. The alignment of this type is greater than the alignment + * of unsigned short; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_ushort4 + * instead. */ +typedef ::simd_ushort4 ushort4; + + /*! @abstract A vector of eight 16-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_ushort8. The alignment of this type is + * greater than the alignment of unsigned short; if you need to operate + * on data buffers that may not be suitably aligned, you should access + * them using simd::packed_ushort8 instead. */ +typedef ::simd_ushort8 ushort8; + + /*! @abstract A vector of sixteen 16-bit unsigned integers. + * @description This type is not available in Metal. 
In C or Objective-C, + * this type is available as simd_ushort16. The alignment of this type is + * greater than the alignment of unsigned short; if you need to operate + * on data buffers that may not be suitably aligned, you should access + * them using simd::packed_ushort16 instead. */ +typedef ::simd_ushort16 ushort16; + + /*! @abstract A vector of thirty-two 16-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_ushort32. The alignment of this type is + * greater than the alignment of unsigned short; if you need to operate + * on data buffers that may not be suitably aligned, you should access + * them using simd::packed_ushort32 instead. */ +typedef ::simd_ushort32 ushort32; + + /*! @abstract A scalar 32-bit signed (twos-complement) integer. + * @discussion In C and Objective-C, this type is available as simd_int1. */ +typedef ::simd_int1 int1; + + /*! @abstract A vector of two 32-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as simd_int2. + * The alignment of this type is greater than the alignment of int; if + * you need to operate on data buffers that may not be suitably aligned, + * you should access them using simd::packed_int2 instead. */ +typedef ::simd_int2 int2; + + /*! @abstract A vector of three 32-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as simd_int3. + * Vectors of this type are padded to have the same size and alignment as + * simd_int4. */ +typedef ::simd_int3 int3; + + /*! @abstract A vector of four 32-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as simd_int4. + * The alignment of this type is greater than the alignment of int; if + * you need to operate on data buffers that may not be suitably aligned, + * you should access them using simd::packed_int4 instead. */ +typedef ::simd_int4 int4; + + /*! @abstract A vector of eight 32-bit signed (twos-complement) integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_int8. The alignment of this type is + * greater than the alignment of int; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_int8 instead. */ +typedef ::simd_int8 int8; + + /*! @abstract A vector of sixteen 32-bit signed (twos-complement) + * integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_int16. The alignment of this type is + * greater than the alignment of int; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_int16 instead. */ +typedef ::simd_int16 int16; + + /*! @abstract A scalar 32-bit unsigned integer. + * @discussion In C and Objective-C, this type is available as + * simd_uint1. */ +typedef ::simd_uint1 uint1; + + /*! @abstract A vector of two 32-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_uint2. The alignment of this type is greater than the alignment + * of unsigned int; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_uint2 + * instead. */ +typedef ::simd_uint2 uint2; + + /*! @abstract A vector of three 32-bit unsigned integers. 
+ * @description In C or Objective-C, this type is available as + * simd_uint3. Vectors of this type are padded to have the same size and + * alignment as simd_uint4. */ +typedef ::simd_uint3 uint3; + + /*! @abstract A vector of four 32-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_uint4. The alignment of this type is greater than the alignment + * of unsigned int; if you need to operate on data buffers that may not + * be suitably aligned, you should access them using simd::packed_uint4 + * instead. */ +typedef ::simd_uint4 uint4; + + /*! @abstract A vector of eight 32-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uint8. The alignment of this type is + * greater than the alignment of unsigned int; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uint8 instead. */ +typedef ::simd_uint8 uint8; + + /*! @abstract A vector of sixteen 32-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_uint16. The alignment of this type is + * greater than the alignment of unsigned int; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_uint16 instead. */ +typedef ::simd_uint16 uint16; + + /*! @abstract A scalar 32-bit floating-point number. + * @discussion In C and Objective-C, this type is available as + * simd_float1. */ +typedef ::simd_float1 float1; + + /*! @abstract A vector of two 32-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_float2. The alignment of this type is greater than the alignment + * of float; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_float2 + * instead. */ +typedef ::simd_float2 float2; + + /*! @abstract A vector of three 32-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_float3. Vectors of this type are padded to have the same size and + * alignment as simd_float4. */ +typedef ::simd_float3 float3; + + /*! @abstract A vector of four 32-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_float4. The alignment of this type is greater than the alignment + * of float; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_float4 + * instead. */ +typedef ::simd_float4 float4; + + /*! @abstract A vector of eight 32-bit floating-point numbers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_float8. The alignment of this type is + * greater than the alignment of float; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_float8 instead. */ +typedef ::simd_float8 float8; + + /*! @abstract A vector of sixteen 32-bit floating-point numbers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_float16. The alignment of this type is + * greater than the alignment of float; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_float16 instead. */ +typedef ::simd_float16 float16; + + /*! 
@abstract A scalar 64-bit signed (twos-complement) integer. + * @discussion In C and Objective-C, this type is available as + * simd_long1. */ +typedef ::simd_long1 long1; + + /*! @abstract A vector of two 64-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_long2. The alignment of this type is greater than the alignment + * of simd_long1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_long2 + * instead. */ +typedef ::simd_long2 long2; + + /*! @abstract A vector of three 64-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_long3. Vectors of this type are padded to have the same size and + * alignment as simd_long4. */ +typedef ::simd_long3 long3; + + /*! @abstract A vector of four 64-bit signed (twos-complement) integers. + * @description In C or Objective-C, this type is available as + * simd_long4. The alignment of this type is greater than the alignment + * of simd_long1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_long4 + * instead. */ +typedef ::simd_long4 long4; + + /*! @abstract A vector of eight 64-bit signed (twos-complement) integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_long8. The alignment of this type is + * greater than the alignment of simd_long1; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_long8 instead. */ +typedef ::simd_long8 long8; + + /*! @abstract A scalar 64-bit unsigned integer. + * @discussion In C and Objective-C, this type is available as + * simd_ulong1. */ +typedef ::simd_ulong1 ulong1; + + /*! @abstract A vector of two 64-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ulong2. The alignment of this type is greater than the alignment + * of simd_ulong1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_ulong2 + * instead. */ +typedef ::simd_ulong2 ulong2; + + /*! @abstract A vector of three 64-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ulong3. Vectors of this type are padded to have the same size and + * alignment as simd_ulong4. */ +typedef ::simd_ulong3 ulong3; + + /*! @abstract A vector of four 64-bit unsigned integers. + * @description In C or Objective-C, this type is available as + * simd_ulong4. The alignment of this type is greater than the alignment + * of simd_ulong1; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_ulong4 + * instead. */ +typedef ::simd_ulong4 ulong4; + + /*! @abstract A vector of eight 64-bit unsigned integers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_ulong8. The alignment of this type is + * greater than the alignment of simd_ulong1; if you need to operate on + * data buffers that may not be suitably aligned, you should access them + * using simd::packed_ulong8 instead. */ +typedef ::simd_ulong8 ulong8; + + /*! @abstract A scalar 64-bit floating-point number. + * @discussion In C and Objective-C, this type is available as + * simd_double1. */ +typedef ::simd_double1 double1; + + /*! 
@abstract A vector of two 64-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_double2. The alignment of this type is greater than the alignment + * of double; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_double2 + * instead. */ +typedef ::simd_double2 double2; + + /*! @abstract A vector of three 64-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_double3. Vectors of this type are padded to have the same size + * and alignment as simd_double4. */ +typedef ::simd_double3 double3; + + /*! @abstract A vector of four 64-bit floating-point numbers. + * @description In C or Objective-C, this type is available as + * simd_double4. The alignment of this type is greater than the alignment + * of double; if you need to operate on data buffers that may not be + * suitably aligned, you should access them using simd::packed_double4 + * instead. */ +typedef ::simd_double4 double4; + + /*! @abstract A vector of eight 64-bit floating-point numbers. + * @description This type is not available in Metal. In C or Objective-C, + * this type is available as simd_double8. The alignment of this type is + * greater than the alignment of double; if you need to operate on data + * buffers that may not be suitably aligned, you should access them using + * simd::packed_double8 instead. */ +typedef ::simd_double8 double8; + +} /* namespace simd:: */ +#endif /* __cplusplus */ + +/* MARK: Deprecated vector types */ +/*! @group Deprecated vector types + * @discussion These are the original types used by earlier versions of the + * simd library; they are provided here for compatibility with existing source + * files. Use the new ("simd_"-prefixed) types for future development. */ + +/*! @abstract A vector of two 8-bit signed (twos-complement) integers. + * @description This type is deprecated; you should use simd_char2 or + * simd::char2 instead. */ +typedef simd_char2 vector_char2; + +/*! @abstract A vector of three 8-bit signed (twos-complement) integers. + * @description This type is deprecated; you should use simd_char3 or + * simd::char3 instead. */ +typedef simd_char3 vector_char3; + +/*! @abstract A vector of four 8-bit signed (twos-complement) integers. + * @description This type is deprecated; you should use simd_char4 or + * simd::char4 instead. */ +typedef simd_char4 vector_char4; + +/*! @abstract A vector of eight 8-bit signed (twos-complement) integers. + * @description This type is deprecated; you should use simd_char8 or + * simd::char8 instead. */ +typedef simd_char8 vector_char8; + +/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers. + * @description This type is deprecated; you should use simd_char16 or + * simd::char16 instead. */ +typedef simd_char16 vector_char16; + +/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) + * integers. + * @description This type is deprecated; you should use simd_char32 or + * simd::char32 instead. */ +typedef simd_char32 vector_char32; + +/*! @abstract A vector of two 8-bit unsigned integers. + * @description This type is deprecated; you should use simd_uchar2 or + * simd::uchar2 instead. */ +typedef simd_uchar2 vector_uchar2; + +/*! @abstract A vector of three 8-bit unsigned integers. + * @description This type is deprecated; you should use simd_uchar3 or + * simd::uchar3 instead. */ +typedef simd_uchar3 vector_uchar3; + +/*!
@abstract A vector of four 8-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uchar4 or
+ * simd::uchar4 instead. */
+typedef simd_uchar4 vector_uchar4;
+
+/*! @abstract A vector of eight 8-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uchar8 or
+ * simd::uchar8 instead. */
+typedef simd_uchar8 vector_uchar8;
+
+/*! @abstract A vector of sixteen 8-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uchar16 or
+ * simd::uchar16 instead. */
+typedef simd_uchar16 vector_uchar16;
+
+/*! @abstract A vector of thirty-two 8-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uchar32 or
+ * simd::uchar32 instead. */
+typedef simd_uchar32 vector_uchar32;
+
+/*! @abstract A vector of two 16-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_short2 or
+ * simd::short2 instead. */
+typedef simd_short2 vector_short2;
+
+/*! @abstract A vector of three 16-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_short3 or
+ * simd::short3 instead. */
+typedef simd_short3 vector_short3;
+
+/*! @abstract A vector of four 16-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_short4 or
+ * simd::short4 instead. */
+typedef simd_short4 vector_short4;
+
+/*! @abstract A vector of eight 16-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_short8 or
+ * simd::short8 instead. */
+typedef simd_short8 vector_short8;
+
+/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_short16 or
+ * simd::short16 instead. */
+typedef simd_short16 vector_short16;
+
+/*! @abstract A vector of thirty-two 16-bit signed (twos-complement)
+ * integers.
+ * @description This type is deprecated; you should use simd_short32 or
+ * simd::short32 instead. */
+typedef simd_short32 vector_short32;
+
+/*! @abstract A vector of two 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort2 or
+ * simd::ushort2 instead. */
+typedef simd_ushort2 vector_ushort2;
+
+/*! @abstract A vector of three 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort3 or
+ * simd::ushort3 instead. */
+typedef simd_ushort3 vector_ushort3;
+
+/*! @abstract A vector of four 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort4 or
+ * simd::ushort4 instead. */
+typedef simd_ushort4 vector_ushort4;
+
+/*! @abstract A vector of eight 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort8 or
+ * simd::ushort8 instead. */
+typedef simd_ushort8 vector_ushort8;
+
+/*! @abstract A vector of sixteen 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort16 or
+ * simd::ushort16 instead. */
+typedef simd_ushort16 vector_ushort16;
+
+/*! @abstract A vector of thirty-two 16-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ushort32 or
+ * simd::ushort32 instead. */
+typedef simd_ushort32 vector_ushort32;
+
+/*! @abstract A vector of two 32-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_int2 or
+ * simd::int2 instead. */
+typedef simd_int2 vector_int2;
+
+/*! @abstract A vector of three 32-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_int3 or
+ * simd::int3 instead. */
+typedef simd_int3 vector_int3;
+
+/*! @abstract A vector of four 32-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_int4 or
+ * simd::int4 instead. */
+typedef simd_int4 vector_int4;
+
+/*! @abstract A vector of eight 32-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_int8 or
+ * simd::int8 instead. */
+typedef simd_int8 vector_int8;
+
+/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_int16 or
+ * simd::int16 instead. */
+typedef simd_int16 vector_int16;
+
+/*! @abstract A vector of two 32-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uint2 or
+ * simd::uint2 instead. */
+typedef simd_uint2 vector_uint2;
+
+/*! @abstract A vector of three 32-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uint3 or
+ * simd::uint3 instead. */
+typedef simd_uint3 vector_uint3;
+
+/*! @abstract A vector of four 32-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uint4 or
+ * simd::uint4 instead. */
+typedef simd_uint4 vector_uint4;
+
+/*! @abstract A vector of eight 32-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uint8 or
+ * simd::uint8 instead. */
+typedef simd_uint8 vector_uint8;
+
+/*! @abstract A vector of sixteen 32-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_uint16 or
+ * simd::uint16 instead. */
+typedef simd_uint16 vector_uint16;
+
+/*! @abstract A vector of two 32-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_float2 or
+ * simd::float2 instead. */
+typedef simd_float2 vector_float2;
+
+/*! @abstract A vector of three 32-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_float3 or
+ * simd::float3 instead. */
+typedef simd_float3 vector_float3;
+
+/*! @abstract A vector of four 32-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_float4 or
+ * simd::float4 instead. */
+typedef simd_float4 vector_float4;
+
+/*! @abstract A vector of eight 32-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_float8 or
+ * simd::float8 instead. */
+typedef simd_float8 vector_float8;
+
+/*! @abstract A vector of sixteen 32-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_float16 or
+ * simd::float16 instead. */
+typedef simd_float16 vector_float16;
+
+/*! @abstract A scalar 64-bit signed (twos-complement) integer.
+ * @description This type is deprecated; you should use simd_long1 or
+ * simd::long1 instead. */
+typedef simd_long1 vector_long1;
+
+/*! @abstract A vector of two 64-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_long2 or
+ * simd::long2 instead. */
+typedef simd_long2 vector_long2;
+
+/*! @abstract A vector of three 64-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_long3 or
+ * simd::long3 instead. */
+typedef simd_long3 vector_long3;
+
+/*! @abstract A vector of four 64-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_long4 or
+ * simd::long4 instead. */
+typedef simd_long4 vector_long4;
+
+/*! @abstract A vector of eight 64-bit signed (twos-complement) integers.
+ * @description This type is deprecated; you should use simd_long8 or
+ * simd::long8 instead. */
+typedef simd_long8 vector_long8;
+
+/*! @abstract A scalar 64-bit unsigned integer.
+ * @description This type is deprecated; you should use simd_ulong1 or
+ * simd::ulong1 instead. */
+typedef simd_ulong1 vector_ulong1;
+
+/*! @abstract A vector of two 64-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ulong2 or
+ * simd::ulong2 instead. */
+typedef simd_ulong2 vector_ulong2;
+
+/*! @abstract A vector of three 64-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ulong3 or
+ * simd::ulong3 instead. */
+typedef simd_ulong3 vector_ulong3;
+
+/*! @abstract A vector of four 64-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ulong4 or
+ * simd::ulong4 instead. */
+typedef simd_ulong4 vector_ulong4;
+
+/*! @abstract A vector of eight 64-bit unsigned integers.
+ * @description This type is deprecated; you should use simd_ulong8 or
+ * simd::ulong8 instead. */
+typedef simd_ulong8 vector_ulong8;
+
+/*! @abstract A vector of two 64-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_double2 or
+ * simd::double2 instead. */
+typedef simd_double2 vector_double2;
+
+/*! @abstract A vector of three 64-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_double3 or
+ * simd::double3 instead. */
+typedef simd_double3 vector_double3;
+
+/*! @abstract A vector of four 64-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_double4 or
+ * simd::double4 instead. */
+typedef simd_double4 vector_double4;
+
+/*! @abstract A vector of eight 64-bit floating-point numbers.
+ * @description This type is deprecated; you should use simd_double8 or
+ * simd::double8 instead. */
+typedef simd_double8 vector_double8;
+
+# endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif
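The vfsoverlay/*.h files added above are vendored copies of Apple's <simd/simd.h> module headers; the overlay.yaml added alongside them lets Clang substitute these patched copies for the SDK's own headers, which is how the _Float16 error on the iosX64 target is avoided. The build change below wires the overlay into the CocoaPods cinterop via extraOpts. For a project that binds WebRTC through a plain cinterop rather than the CocoaPods plugin, the same overlay could plausibly be wired in as sketched here; this is an illustration only, and the cinterop name and .def file path are assumptions, not part of this patch:

    kotlin {
        iosX64 {
            compilations.getByName("main") {
                // Hypothetical cinterop binding; only the -ivfsoverlay option
                // mirrors what this patch does for the CocoaPods integration.
                cinterops.create("WebRTC") {
                    defFile(project.file("src/nativeInterop/cinterop/WebRTC.def"))
                    // Route Clang through the patched simd headers.
                    compilerOpts("-ivfsoverlay", "$rootDir/vfsoverlay/overlay.yaml")
                }
            }
        }
    }
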
diff --git a/webrtc-kmp/build.gradle.kts b/webrtc-kmp/build.gradle.kts
index b1148f53..8fad17da 100644
--- a/webrtc-kmp/build.gradle.kts
+++ b/webrtc-kmp/build.gradle.kts
@@ -1,10 +1,12 @@
 import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi
+import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl
+import org.jetbrains.kotlin.gradle.dsl.JvmTarget
 import org.jetbrains.kotlin.gradle.plugin.KotlinSourceSetTree
-import org.jetbrains.kotlin.gradle.targets.js.dsl.ExperimentalWasmDsl
 import org.jetbrains.kotlin.gradle.targets.js.webpack.KotlinWebpackConfig
 
 plugins {
-    id("webrtc.multiplatform")
+    alias(libs.plugins.kotlinMultiplatform)
+    alias(libs.plugins.androidLibrary)
     kotlin("native.cocoapods")
     id("maven-publish")
     id("signing")
@@ -15,6 +17,11 @@ group = "com.shepeliev"
 version = System.getenv("VERSION") ?: "0.0.0"
 
 kotlin {
+    @OptIn(ExperimentalKotlinGradlePluginApi::class)
+    compilerOptions {
+        freeCompilerArgs.add("-Xexpect-actual-classes")
+    }
+
     cocoapods {
         version = project.version.toString()
         summary = "WebRTC Kotlin Multiplatform SDK"
@@ -27,15 +34,21 @@ kotlin {
             version = libs.versions.webrtc.ios.sdk.get()
             moduleName = "WebRTC"
             packageName = "WebRTC"
+
+            // workaround for https://youtrack.jetbrains.com/issue/KT-69094
+            extraOpts += listOf("-compiler-option", "-ivfsoverlay", "-compiler-option", "../vfsoverlay/overlay.yaml")
         }
     }
 
+    @OptIn(ExperimentalKotlinGradlePluginApi::class)
     androidTarget {
         publishAllLibraryVariants()
 
-        @OptIn(ExperimentalKotlinGradlePluginApi::class)
         instrumentedTestVariant {
             sourceSetTree.set(KotlinSourceSetTree.test)
         }
+
+        compilerOptions {
+            jvmTarget = JvmTarget.JVM_1_8
+        }
     }
 
     iosX64()
@@ -108,11 +121,24 @@ kotlin {
 
 android {
     namespace = "com.shepeliev.webrtckmp"
+    compileSdk = libs.versions.compileSdk.get().toInt()
+
+    sourceSets["main"].manifest.srcFile("src/androidMain/AndroidManifest.xml")
+    sourceSets["main"].res.srcDir("src/androidMain/res")
+
     defaultConfig {
-        targetSdk = libs.versions.targetSdk.get().toInt()
+        minSdk = libs.versions.minSdk.get().toInt()
         testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
     }
 
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+
+    testOptions {
+        targetSdk = libs.versions.targetSdk.get().toInt()
+    }
+
    dependencies {
        androidTestImplementation(libs.androidx.test.core)
        androidTestImplementation(libs.androidx.test.runner)
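
A note on the androidTarget block above: Kotlin 2.x retires the old kotlinOptions DSL in favor of compilerOptions, and the target-level compilerOptions form used here still requires opting in to ExperimentalKotlinGradlePluginApi, which is presumably why the @OptIn moved up from instrumentedTestVariant to the target itself. A minimal before/after sketch of that migration, as a standalone illustration rather than part of the patch:

    import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi
    import org.jetbrains.kotlin.gradle.dsl.JvmTarget

    kotlin {
        @OptIn(ExperimentalKotlinGradlePluginApi::class)
        androidTarget {
            // Deprecated Kotlin 1.x spelling:
            // compilations.all { kotlinOptions.jvmTarget = "1.8" }

            // Kotlin 2.x replacement, as used in this patch:
            compilerOptions {
                jvmTarget.set(JvmTarget.JVM_1_8)
            }
        }
    }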