diff --git a/CHANGELOG.md b/CHANGELOG.md
index 538546e4c17..67d91eef769 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,8 +68,18 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
 
 #### General
 
+- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
+
 #### Naga
 
+- In hlsl-out, allow passing information about the fragment entry point to omit vertex outputs that are not in the fragment inputs. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
+
+  ```diff
+  let writer: naga::back::hlsl::Writer = /* ... */;
+  -writer.write(&module, &module_info);
+  +writer.write(&module, &module_info, None);
+  ```
+
 ### Bug Fixes
 
 ## v0.20.0 (2024-04-28)
diff --git a/deno_webgpu/lib.rs b/deno_webgpu/lib.rs
index 453d4ea7e3d..18c6e932ad1 100644
--- a/deno_webgpu/lib.rs
+++ b/deno_webgpu/lib.rs
@@ -360,9 +360,6 @@ fn deserialize_features(features: &wgpu_types::Features) -> Vec<&'static str> {
     if features.contains(wgpu_types::Features::SHADER_EARLY_DEPTH_TEST) {
         return_features.push("shader-early-depth-test");
     }
-    if features.contains(wgpu_types::Features::SHADER_UNUSED_VERTEX_OUTPUT) {
-        return_features.push("shader-unused-vertex-output");
-    }
 
     return_features
 }
@@ -648,10 +645,6 @@ impl From<GpuRequiredFeatures> for wgpu_types::Features {
             wgpu_types::Features::SHADER_EARLY_DEPTH_TEST,
             required_features.0.contains("shader-early-depth-test"),
         );
-        features.set(
-            wgpu_types::Features::SHADER_UNUSED_VERTEX_OUTPUT,
-            required_features.0.contains("shader-unused-vertex-output"),
-        );
 
         features
     }
diff --git a/naga-cli/src/bin/naga.rs b/naga-cli/src/bin/naga.rs
index 7ff086d3f7a..593630e6389 100644
--- a/naga-cli/src/bin/naga.rs
+++ b/naga-cli/src/bin/naga.rs
@@ -777,7 +777,7 @@ fn write_output(
 
             let mut buffer = String::new();
             let mut writer = hlsl::Writer::new(&mut buffer, &params.hlsl);
-            writer.write(&module, &info).unwrap_pretty();
+            writer.write(&module, &info, None).unwrap_pretty();
             fs::write(output_path, buffer)?;
         }
         "wgsl" => {
diff --git a/naga/benches/criterion.rs b/naga/benches/criterion.rs
index e57c58a8476..b8d794563ec 100644
--- a/naga/benches/criterion.rs
+++ b/naga/benches/criterion.rs
@@ -226,7 +226,7 @@ fn backends(c: &mut Criterion) {
             let mut string = String::new();
             for &(ref module, ref info) in inputs.iter() {
                 let mut writer = naga::back::hlsl::Writer::new(&mut string, &options);
-                let _ = writer.write(module, info); // may fail on unimplemented things
+                let _ = writer.write(module, info, None); // may fail on unimplemented things
                 string.clear();
             }
         });
diff --git a/naga/src/back/hlsl/mod.rs b/naga/src/back/hlsl/mod.rs
index 28edbf70e1d..5cbfffc4745 100644
--- a/naga/src/back/hlsl/mod.rs
+++ b/naga/src/back/hlsl/mod.rs
@@ -287,6 +287,35 @@ impl Wrapped {
     }
 }
 
+/// A fragment entry point to be considered when generating HLSL for the output interface of vertex
+/// entry points.
+///
+/// This is provided as an optional parameter to [`Writer::write`].
+///
+/// If this is provided, vertex outputs will be removed if they are not inputs of this fragment
+/// entry point. This is necessary for generating correct HLSL when some of the vertex shader
+/// outputs are not consumed by the fragment shader.
+pub struct FragmentEntryPoint<'a> {
+    module: &'a crate::Module,
+    func: &'a crate::Function,
+}
+
+impl<'a> FragmentEntryPoint<'a> {
+    /// Returns `None` if `module` has no fragment entry point named `ep_name`. Stage and name
+    /// are matched together: entry points are only unique per (stage, name) pair, so a
+    /// same-named entry point of another stage must not hide the fragment one.
+    pub fn new(module: &'a crate::Module, ep_name: &'a str) -> Option<Self> {
+        module
+            .entry_points
+            .iter()
+            .find(|ep| ep.stage == crate::ShaderStage::Fragment && ep.name == ep_name)
+            .map(|ep| Self {
+                module,
+                func: &ep.function,
+            })
+    }
+}
+
 pub struct Writer<'a, W> {
     out: W,
     names: crate::FastHashMap<proc::NameKey, String>,
diff --git a/naga/src/back/hlsl/writer.rs b/naga/src/back/hlsl/writer.rs
index 86d8f890357..5289002a15c 100644
--- a/naga/src/back/hlsl/writer.rs
+++ b/naga/src/back/hlsl/writer.rs
@@ -4,7 +4,7 @@ use super::{
         WrappedZeroValue,
     },
     storage::StoreValue,
-    BackendResult, Error, Options,
+    BackendResult, Error, FragmentEntryPoint, Options,
 };
 use crate::{
     back,
@@ -28,6 +28,7 @@ pub(crate) const INSERT_BITS_FUNCTION: &str = "naga_insertBits";
 struct EpStructMember {
     name: String,
     ty: Handle<crate::Type>,
+    // TODO: log error if binding is none?
     // technically, this should always be `Some`
     binding: Option<crate::Binding>,
     index: u32,
@@ -196,6 +197,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         &mut self,
         module: &Module,
         module_info: &valid::ModuleInfo,
+        fragment_entry_point: Option<&FragmentEntryPoint<'_>>,
     ) -> Result<super::ReflectionInfo, Error> {
         if !module.overrides.is_empty() {
             return Err(Error::Override);
@@ -296,7 +298,13 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         // Write all entry points wrapped structs
         for (index, ep) in module.entry_points.iter().enumerate() {
             let ep_name = self.names[&NameKey::EntryPoint(index as u16)].clone();
-            let ep_io = self.write_ep_interface(module, &ep.function, ep.stage, &ep_name)?;
+            let ep_io = self.write_ep_interface(
+                module,
+                &ep.function,
+                ep.stage,
+                &ep_name,
+                fragment_entry_point,
+            )?;
             self.entry_point_io.push(ep_io);
         }
 
@@ -504,6 +512,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         writeln!(self.out, "}};")?;
         writeln!(self.out)?;
 
+        // See ordering notes on EntryPointInterface fields
         match shader_stage.1 {
             Io::Input => {
                 // bring back the original order
@@ -537,6 +546,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         for arg in func.arguments.iter() {
             match module.types[arg.ty].inner {
                 TypeInner::Struct { ref members, .. } => {
+                    // TODO: what about nested structs? Is that possible? Maybe try an unwrap on
+                    // the binding?
                     for member in members.iter() {
                         let name = self.namer.call_or(&member.name, "member");
                         let index = fake_members.len() as u32;
@@ -573,10 +584,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         result: &crate::FunctionResult,
         stage: ShaderStage,
         entry_point_name: &str,
+        frag_ep: Option<&FragmentEntryPoint<'_>>,
     ) -> Result<EntryPointBinding, Error> {
         let struct_name = format!("{stage:?}Output_{entry_point_name}");
 
-        let mut fake_members = Vec::new();
         let empty = [];
         let members = match module.types[result.ty].inner {
             TypeInner::Struct { ref members, .. } => members,
@@ -586,14 +597,60 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             }
         };
 
-        for member in members.iter() {
+        // Gather list of fragment input locations. We use this below to remove user-defined
+        // varyings from VS outputs that aren't in the FS inputs. This makes the VS interface match
+        // as long as the FS inputs are a subset of the VS outputs. This is only applied if the
+        // writer is supplied with information about the fragment entry point.
+        let fs_input_locs = if let (Some(frag_ep), ShaderStage::Vertex) = (frag_ep, stage) {
+            let mut fs_input_locs = Vec::new();
+            for arg in frag_ep.func.arguments.iter() {
+                let mut push_if_location = |binding: &Option<crate::Binding>| {
+                    match *binding {
+                        Some(crate::Binding::Location { location, .. }) => {
+                            fs_input_locs.push(location)
+                        }
+                        Some(crate::Binding::BuiltIn(_)) => {}
+                        // Log error?
+                        None => {}
+                    }
+                };
+                match frag_ep.module.types[arg.ty].inner {
+                    TypeInner::Struct { ref members, .. } => {
+                        // TODO: nesting?
+                        for member in members.iter() {
+                            push_if_location(&member.binding);
+                        }
+                    }
+                    _ => push_if_location(&arg.binding),
+                }
+            }
+            fs_input_locs.sort();
+            Some(fs_input_locs)
+        } else {
+            None
+        };
+
+        let mut fake_members = Vec::new();
+        for (index, member) in members.iter().enumerate() {
+            if let Some(ref fs_input_locs) = fs_input_locs {
+                match member.binding {
+                    Some(crate::Binding::Location { location, .. }) => {
+                        if fs_input_locs.binary_search(&location).is_err() {
+                            continue;
+                        }
+                    }
+                    Some(crate::Binding::BuiltIn(_)) => {}
+                    // Log error?
+                    None => {}
+                }
+            }
+
             let member_name = self.namer.call_or(&member.name, "member");
-            let index = fake_members.len() as u32;
             fake_members.push(EpStructMember {
                 name: member_name,
                 ty: member.ty,
                 binding: member.binding.clone(),
-                index,
+                index: index as u32,
             });
         }
 
@@ -609,6 +666,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         func: &crate::Function,
         stage: ShaderStage,
         ep_name: &str,
+        frag_ep: Option<&FragmentEntryPoint<'_>>,
     ) -> Result<EntryPointInterface, Error> {
         Ok(EntryPointInterface {
             input: if !func.arguments.is_empty()
@@ -624,7 +682,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
             },
             output: match func.result {
                 Some(ref fr) if fr.binding.is_none() && stage == ShaderStage::Vertex => {
-                    Some(self.write_ep_output_struct(module, fr, stage, ep_name)?)
+                    Some(self.write_ep_output_struct(module, fr, stage, ep_name, frag_ep)?)
                 }
                 _ => None,
             },
diff --git a/naga/tests/in/unconsumed_vertex_outputs_frag.param.ron b/naga/tests/in/unconsumed_vertex_outputs_frag.param.ron
new file mode 100644
index 00000000000..72873dd6677
--- /dev/null
+++ b/naga/tests/in/unconsumed_vertex_outputs_frag.param.ron
@@ -0,0 +1,2 @@
+(
+)
diff --git a/naga/tests/in/unconsumed_vertex_outputs_frag.wgsl b/naga/tests/in/unconsumed_vertex_outputs_frag.wgsl
new file mode 100644
index 00000000000..3a656c9696b
--- /dev/null
+++ b/naga/tests/in/unconsumed_vertex_outputs_frag.wgsl
@@ -0,0 +1,13 @@
+// Out of order to test sorting.
+struct FragmentIn {
+    @location(1) value: f32,
+    @location(3) value2: f32,
+    @builtin(position) position: vec4<f32>,
+    // @location(0) unused_value: f32,
+    // @location(2) unused_value2: vec4<f32>,
+}
+
+@fragment
+fn fs_main(v_out: FragmentIn) -> @location(0) vec4<f32> {
+    return vec4<f32>(v_out.value, v_out.value, v_out.value2, v_out.value2);
+}
diff --git a/naga/tests/in/unconsumed_vertex_outputs_vert.param.ron b/naga/tests/in/unconsumed_vertex_outputs_vert.param.ron
new file mode 100644
index 00000000000..72873dd6677
--- /dev/null
+++ b/naga/tests/in/unconsumed_vertex_outputs_vert.param.ron
@@ -0,0 +1,2 @@
+(
+)
diff --git a/naga/tests/in/unconsumed_vertex_outputs_vert.wgsl b/naga/tests/in/unconsumed_vertex_outputs_vert.wgsl
new file mode 100644
index 00000000000..46c39ea9300
--- /dev/null
+++ b/naga/tests/in/unconsumed_vertex_outputs_vert.wgsl
@@ -0,0 +1,13 @@
+// Out of order to test sorting.
+struct VertexOut {
+    @builtin(position) position: vec4<f32>,
+    @location(1) value: f32,
+    @location(2) unused_value2: vec4<f32>,
+    @location(0) unused_value: f32,
+    @location(3) value2: f32,
+}
+
+@vertex
+fn vs_main() -> VertexOut {
+    return VertexOut(vec4(1.0), 1.0, vec4(2.0), 1.0, 0.5);
+}
diff --git a/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.hlsl b/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.hlsl
new file mode 100644
index 00000000000..4005e435380
--- /dev/null
+++ b/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.hlsl
@@ -0,0 +1,17 @@
+struct FragmentIn {
+    float value : LOC1;
+    float value2_ : LOC3;
+    float4 position : SV_Position;
+};
+
+struct FragmentInput_fs_main {
+    float value : LOC1;
+    float value2_ : LOC3;
+    float4 position : SV_Position;
+};
+
+float4 fs_main(FragmentInput_fs_main fragmentinput_fs_main) : SV_Target0
+{
+    FragmentIn v_out = { fragmentinput_fs_main.value, fragmentinput_fs_main.value2_, fragmentinput_fs_main.position };
+    return float4(v_out.value, v_out.value, v_out.value2_, v_out.value2_);
+}
diff --git a/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.ron b/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.ron
new file mode 100644
index 00000000000..eac1b945d2b
--- /dev/null
+++ b/naga/tests/out/hlsl/unconsumed_vertex_outputs_frag.ron
@@ -0,0 +1,12 @@
+(
+    vertex:[
+    ],
+    fragment:[
+        (
+            entry_point:"fs_main",
+            target_profile:"ps_5_1",
+        ),
+    ],
+    compute:[
+    ],
+)
diff --git a/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.hlsl b/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.hlsl
new file mode 100644
index 00000000000..ea75d638773
--- /dev/null
+++ b/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.hlsl
@@ -0,0 +1,30 @@
+struct VertexOut {
+    float4 position : SV_Position;
+    float value : LOC1;
+    float4 unused_value2_ : LOC2;
+    float unused_value : LOC0;
+    float value2_ : LOC3;
+};
+
+struct VertexOutput_vs_main {
+    float value : LOC1;
+    float value2_ : LOC3;
+    float4 position : SV_Position;
+};
+
+VertexOut ConstructVertexOut(float4 arg0, float arg1, float4 arg2, float arg3, float arg4) {
+    VertexOut ret = (VertexOut)0;
+    ret.position = arg0;
+    ret.value = arg1;
+    ret.unused_value2_ = arg2;
+    ret.unused_value = arg3;
+    ret.value2_ = arg4;
+    return ret;
+}
+
+VertexOutput_vs_main vs_main()
+{
+    const VertexOut vertexout = ConstructVertexOut((1.0).xxxx, 1.0, (2.0).xxxx, 1.0, 0.5);
+    const VertexOutput_vs_main vertexout_1 = { vertexout.value, vertexout.value2_, vertexout.position };
+    return vertexout_1;
+}
diff --git a/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.ron b/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.ron
new file mode 100644
index 00000000000..a24f8d0eb8b
--- /dev/null
+++ b/naga/tests/out/hlsl/unconsumed_vertex_outputs_vert.ron
@@ -0,0 +1,12 @@
+(
+    vertex:[
+        (
+            entry_point:"vs_main",
+            target_profile:"vs_5_1",
+        ),
+    ],
+    fragment:[
+    ],
+    compute:[
+    ],
+)
diff --git a/naga/tests/snapshots.rs b/naga/tests/snapshots.rs
index ee775a3e63b..e98d368c030 100644
--- a/naga/tests/snapshots.rs
+++ b/naga/tests/snapshots.rs
@@ -265,7 +265,13 @@ fn check_targets(
     module: &mut naga::Module,
     targets: Targets,
     source_code: Option<&str>,
+    // For testing hlsl generation when fragment shader doesn't consume all vertex outputs.
+    frag_ep: Option<naga::back::hlsl::FragmentEntryPoint>,
 ) {
+    if frag_ep.is_some() && !targets.contains(Targets::HLSL) {
+        panic!("Providing FragmentEntryPoint only makes sense when testing hlsl-out");
+    }
+
     let params = input.read_parameters();
     let name = &input.file_name;
 
@@ -409,6 +415,7 @@ fn check_targets(
                 &info,
                 &params.hlsl,
                 &params.pipeline_constants,
+                frag_ep,
             );
         }
     }
@@ -587,6 +594,7 @@ fn write_output_hlsl(
     info: &naga::valid::ModuleInfo,
     options: &naga::back::hlsl::Options,
     pipeline_constants: &naga::back::PipelineConstants,
+    frag_ep: Option<naga::back::hlsl::FragmentEntryPoint>,
 ) {
     use naga::back::hlsl;
     use std::fmt::Write as _;
@@ -599,7 +607,9 @@ fn write_output_hlsl(
 
     let mut buffer = String::new();
     let mut writer = hlsl::Writer::new(&mut buffer, options);
-    let reflection_info = writer.write(&module, &info).expect("HLSL write failed");
+    let reflection_info = writer
+        .write(&module, &info, frag_ep.as_ref())
+        .expect("HLSL write failed");
 
     input.write_output_file("hlsl", "hlsl", buffer);
 
@@ -890,7 +900,7 @@ fn convert_wgsl() {
         let input = Input::new(None, name, "wgsl");
         let source = input.read_source();
         match naga::front::wgsl::parse_str(&source) {
-            Ok(mut module) => check_targets(&input, &mut module, targets, None),
+            Ok(mut module) => check_targets(&input, &mut module, targets, None, None),
             Err(e) => panic!(
                 "{}",
                 e.emit_to_string_with_path(&source, input.input_path())
@@ -912,7 +922,7 @@ fn convert_wgsl() {
             // crlf will make the large split output different on different platform
             let source = source.replace('\r', "");
             match naga::front::wgsl::parse_str(&source) {
-                Ok(mut module) => check_targets(&input, &mut module, targets, Some(&source)),
+                Ok(mut module) => check_targets(&input, &mut module, targets, Some(&source), None),
                 Err(e) => panic!(
                     "{}",
                     e.emit_to_string_with_path(&source, input.input_path())
@@ -922,6 +932,36 @@ fn convert_wgsl() {
     }
 }
 
+#[cfg(feature = "wgsl-in")]
+#[test]
+fn unconsumed_vertex_outputs_hlsl_out() {
+    let load_and_parse = |name| {
+        // WGSL is privileged: its shaders live directly in the root of the input dir.
+        let input = Input::new(None, name, "wgsl");
+        let source = input.read_source();
+        let module = match naga::front::wgsl::parse_str(&source) {
+            Ok(module) => module,
+            Err(e) => panic!(
+                "{}",
+                e.emit_to_string_with_path(&source, input.input_path())
+            ),
+        };
+        (input, module)
+    };
+
+    // Uses separate wgsl files to make sure the tested code doesn't accidentally rely on
+    // the fragment entry point being from the same parsed content (e.g. accidentally using the
+    // wrong `Module` when looking up info). We also don't just create a module from the same file
+    // twice since everything would probably be stored behind the same keys.
+    let (input, mut module) = load_and_parse("unconsumed_vertex_outputs_vert");
+    let (frag_input, mut frag_module) = load_and_parse("unconsumed_vertex_outputs_frag");
+    let frag_ep = naga::back::hlsl::FragmentEntryPoint::new(&frag_module, "fs_main")
+        .expect("fs_main not found");
+
+    check_targets(&input, &mut module, Targets::HLSL, None, Some(frag_ep));
+    check_targets(&frag_input, &mut frag_module, Targets::HLSL, None, None);
+}
+
 #[cfg(feature = "spv-in")]
 fn convert_spv(name: &str, adjust_coordinate_space: bool, targets: Targets) {
     let _ = env_logger::try_init();
@@ -936,7 +976,7 @@ fn convert_spv(name: &str, adjust_coordinate_space: bool, targets: Targets) {
         },
     )
     .unwrap();
-    check_targets(&input, &mut module, targets, None);
+    check_targets(&input, &mut module, targets, None, None);
 }
 
 #[cfg(feature = "spv-in")]
@@ -996,7 +1036,7 @@ fn convert_glsl_variations_check() {
             &source,
         )
         .unwrap();
-    check_targets(&input, &mut module, Targets::GLSL, None);
+    check_targets(&input, &mut module, Targets::GLSL, None, None);
 }
 
 #[cfg(feature = "glsl-in")]
diff --git a/tests/tests/regression/issue_3748.rs b/tests/tests/regression/issue_3748.rs
new file mode 100644
index 00000000000..c38235021ed
--- /dev/null
+++ b/tests/tests/regression/issue_3748.rs
@@ -0,0 +1,52 @@
+use wgpu_test::{gpu_test, GpuTestConfiguration};
+
+use wgpu::*;
+
+/// Previously, for every user-defined vertex output a fragment shader had to have a corresponding
+/// user-defined input. This would generate `StageError::InputNotConsumed`.
+///
+/// This requirement was removed from the WebGPU spec. Now, when generating hlsl, wgpu will
+/// automatically remove any user-defined outputs from the vertex shader that are not present in
+/// the fragment inputs. This is necessary for generating correct hlsl:
+/// https://github.com/gfx-rs/naga/issues/1945
+#[gpu_test]
+static ALLOW_INPUT_NOT_CONSUMED: GpuTestConfiguration =
+    GpuTestConfiguration::new().run_async(|ctx| async move {
+        let module = ctx
+            .device
+            .create_shader_module(include_wgsl!("issue_3748.wgsl"));
+
+        let pipeline_layout = ctx
+            .device
+            .create_pipeline_layout(&PipelineLayoutDescriptor {
+                label: Some("Pipeline Layout"),
+                bind_group_layouts: &[],
+                push_constant_ranges: &[],
+            });
+
+        ctx.device
+            .create_render_pipeline(&RenderPipelineDescriptor {
+                label: Some("Pipeline"),
+                layout: Some(&pipeline_layout),
+                vertex: VertexState {
+                    module: &module,
+                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
+                    buffers: &[],
+                },
+                primitive: PrimitiveState::default(),
+                depth_stencil: None,
+                multisample: MultisampleState::default(),
+                fragment: Some(FragmentState {
+                    module: &module,
+                    entry_point: "fs_main",
+                    compilation_options: Default::default(),
+                    targets: &[Some(ColorTargetState {
+                        format: TextureFormat::Rgba8Unorm,
+                        blend: None,
+                        write_mask: ColorWrites::all(),
+                    })],
+                }),
+                multiview: None,
+            });
+    });
diff --git a/tests/tests/regression/issue_3748.wgsl b/tests/tests/regression/issue_3748.wgsl
new file mode 100644
index 00000000000..78ace6d9dba
--- /dev/null
+++ b/tests/tests/regression/issue_3748.wgsl
@@ -0,0 +1,23 @@
+struct VertexOut {
+    @builtin(position) position: vec4<f32>,
+    @location(0) unused_value: f32,
+    @location(1) value: f32,
+}
+
+struct FragmentIn {
+    @builtin(position) position: vec4<f32>,
+    // @location(0) unused_value: f32,
+    @location(1) value: f32,
+}
+
+@vertex
+fn vs_main() -> VertexOut {
+    return VertexOut(vec4(1.0), 1.0, 1.0);
+}
+
+@fragment
+fn fs_main(v_out: FragmentIn) -> @location(0) vec4<f32> {
+    return vec4<f32>(v_out.value);
+}
+
+
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index 6dc7af56ec2..91a3542c1b9 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -1,6 +1,7 @@
 mod regression {
     mod issue_3349;
     mod issue_3457;
+    mod issue_3748;
     mod issue_4024;
     mod issue_4122;
 }
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index b1aa17ba782..2a2af1ddb59 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -1506,8 +1506,7 @@ impl<A: HalApi> Device<A> {
                 })
             })?;
 
-        let interface =
-            validation::Interface::new(&module, &info, self.limits.clone(), self.features);
+        let interface = validation::Interface::new(&module, &info, self.limits.clone());
         let hal_shader = hal::ShaderInput::Naga(hal::NagaShader {
             module,
             info,
diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs
index d360ee96219..79e023283cd 100644
--- a/wgpu-core/src/validation.rs
+++ b/wgpu-core/src/validation.rs
@@ -132,7 +132,6 @@ struct EntryPoint {
 #[derive(Debug)]
 pub struct Interface {
     limits: wgt::Limits,
-    features: wgt::Features,
     resources: naga::Arena<Resource>,
     entry_points: FastHashMap<(naga::ShaderStage, String), EntryPoint>,
 }
@@ -281,8 +280,6 @@ pub enum StageError {
         #[source]
         error: InputError,
     },
-    #[error("Location[{location}] is provided by the previous stage output but is not consumed as input by this stage.")]
-    InputNotConsumed { location: wgt::ShaderLocation },
     #[error(
         "Unable to select an entry point: no entry point was found in the provided shader module"
     )]
@@ -884,12 +881,7 @@ impl Interface {
         list.push(varying);
     }
 
-    pub fn new(
-        module: &naga::Module,
-        info: &naga::valid::ModuleInfo,
-        limits: wgt::Limits,
-        features: wgt::Features,
-    ) -> Self {
+    pub fn new(module: &naga::Module, info: &naga::valid::ModuleInfo, limits: wgt::Limits) -> Self {
         let mut resources = naga::Arena::new();
         let mut resource_mapping = FastHashMap::default();
         for (var_handle, var) in module.global_variables.iter() {
@@ -976,7 +968,6 @@ impl Interface {
 
         Self {
             limits,
-            features,
             resources,
             entry_points,
         }
@@ -1198,6 +1189,11 @@ impl Interface {
                                             ));
                                         }
                                         (
+                                            // TODO: is a subtype allowed here? This isn't clear
+                                            // from the line in the spec: "For each user-defined
+                                            // input of descriptor.fragment there must be a
+                                            // user-defined output of descriptor.vertex that
+                                            // location, type, and interpolation of the input."
                                             iv.ty.is_subtype_of(&provided.ty),
                                             iv.ty.dim.num_components(),
                                         )
@@ -1223,35 +1219,23 @@ impl Interface {
                         }
                     }
                 }
+                // TODO: front_facing, sample_index, and sample_mask builtins should all increase
+                // components count for fragment input.
                 Varying::BuiltIn(_) => {}
             }
         }
 
-        // Check all vertex outputs and make sure the fragment shader consumes them.
-        // This requirement is removed if the `SHADER_UNUSED_VERTEX_OUTPUT` feature is enabled.
-        if shader_stage == naga::ShaderStage::Fragment
-            && !self
-                .features
-                .contains(wgt::Features::SHADER_UNUSED_VERTEX_OUTPUT)
-        {
-            for &index in inputs.keys() {
-                // This is a linear scan, but the count should be low enough
-                // that this should be fine.
-                let found = entry_point.inputs.iter().any(|v| match *v {
-                    Varying::Local { location, .. } => location == index,
-                    Varying::BuiltIn(_) => false,
-                });
-
-                if !found {
-                    return Err(StageError::InputNotConsumed { location: index });
-                }
-            }
-        }
-
         if shader_stage == naga::ShaderStage::Vertex {
+            // TODO: if topology is point we should add 1 to inter_stage_components?
             for output in entry_point.outputs.iter() {
                 //TODO: count builtins towards the limit?
                 inter_stage_components += match *output {
+                    // TODO: Spec mentions "Each user-defined output of descriptor.vertex consumes
+                    // 4 scalar components". Not that it varies based on the type. So is there an
+                    // inconsistency here? Also are all these "user-defined" or is that unknown at
+                    // this stage?
+                    // https://gpuweb.github.io/gpuweb/#abstract-opdef-validating-inter-stage-interfaces
+                    // (same applies to counting components for fragment inputs)
                     Varying::Local { ref iv, .. } => iv.ty.dim.num_components(),
                     Varying::BuiltIn(_) => 0,
                 };
@@ -1273,6 +1257,9 @@ impl Interface {
             }
         }
 
+        // TODO: spec also has a max_inter_stage_shader_variables
+        // https://gpuweb.github.io/gpuweb/#abstract-opdef-validating-inter-stage-interfaces and
+        // the location of user defined outputs(vertex)/inputs(fragment) must all be less than this
         if inter_stage_components > self.limits.max_inter_stage_shader_components {
             return Err(StageError::TooManyVaryings {
                 used: inter_stage_components,
diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs
index 82075294ee5..c37289f4748 100644
--- a/wgpu-hal/src/dx12/device.rs
+++ b/wgpu-hal/src/dx12/device.rs
@@ -209,14 +209,26 @@ impl super::Device {
         Ok(())
     }
 
+    /// When generating the vertex shader, the fragment stage must be passed if it exists!
+    /// Otherwise, the generated HLSL may be incorrect since the fragment shader inputs are
+    /// allowed to be a subset of the vertex outputs.
     fn load_shader(
         &self,
         stage: &crate::ProgrammableStage<super::Api>,
         layout: &super::PipelineLayout,
         naga_stage: naga::ShaderStage,
+        fragment_stage: Option<&crate::ProgrammableStage<super::Api>>,
     ) -> Result<super::CompiledShader, crate::PipelineError> {
         use naga::back::hlsl;
 
+        let frag_ep = fragment_stage
+            .map(|fs_stage| {
+                hlsl::FragmentEntryPoint::new(&fs_stage.module.naga.module, fs_stage.entry_point).ok_or(
+                    crate::PipelineError::EntryPoint(naga::ShaderStage::Fragment),
+                )
+            })
+            .transpose()?;
+
         let stage_bit = crate::auxil::map_naga_stage(naga_stage);
 
         let (module, info) = naga::back::pipeline_constants::process_overrides(
@@ -243,7 +255,7 @@ impl super::Device {
         let reflection_info = {
             profiling::scope!("naga::back::hlsl::write");
             writer
-                .write(&module, &info)
+                .write(&module, &info, frag_ep.as_ref())
                 .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("HLSL: {e:?}")))?
         };
 
@@ -1300,12 +1312,16 @@ impl crate::Device for super::Device {
         let (topology_class, topology) = conv::map_topology(desc.primitive.topology);
         let mut shader_stages = wgt::ShaderStages::VERTEX;
 
-        let blob_vs =
-            self.load_shader(&desc.vertex_stage, desc.layout, naga::ShaderStage::Vertex)?;
+        let blob_vs = self.load_shader(
+            &desc.vertex_stage,
+            desc.layout,
+            naga::ShaderStage::Vertex,
+            desc.fragment_stage.as_ref(),
+        )?;
         let blob_fs = match desc.fragment_stage {
             Some(ref stage) => {
                 shader_stages |= wgt::ShaderStages::FRAGMENT;
-                Some(self.load_shader(stage, desc.layout, naga::ShaderStage::Fragment)?)
+                Some(self.load_shader(stage, desc.layout, naga::ShaderStage::Fragment, None)?)
             }
             None => None,
         };
@@ -1484,7 +1500,7 @@ impl crate::Device for super::Device {
         &self,
         desc: &crate::ComputePipelineDescriptor<super::Api>,
     ) -> Result<super::ComputePipeline, crate::PipelineError> {
-        let blob_cs = self.load_shader(&desc.stage, desc.layout, naga::ShaderStage::Compute)?;
+        let blob_cs = self.load_shader(&desc.stage, desc.layout, naga::ShaderStage::Compute, None)?;
 
         let pair = {
             profiling::scope!("ID3D12Device::CreateComputePipelineState");
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 052c77006bd..0fff90c5b7e 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -491,7 +491,6 @@ impl super::Adapter {
             wgt::Features::SHADER_EARLY_DEPTH_TEST,
             supported((3, 1), (4, 2)) || extensions.contains("GL_ARB_shader_image_load_store"),
         );
-        features.set(wgt::Features::SHADER_UNUSED_VERTEX_OUTPUT, true);
         if extensions.contains("GL_ARB_timer_query") {
             features.set(wgt::Features::TIMESTAMP_QUERY, true);
             features.set(wgt::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS, true);
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index cddba472bd1..0892f70b536 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -904,7 +904,6 @@ impl super::PrivateCapabilities {
         features.set(F::ADDRESS_MODE_CLAMP_TO_ZERO, true);
 
         features.set(F::RG11B10UFLOAT_RENDERABLE, self.format_rg11b10_all);
-        features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true);
 
         if self.supports_simd_scoped_operations {
             features.insert(F::SUBGROUP | F::SUBGROUP_BARRIER);
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 21219361f4c..90970b3f189 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -740,7 +740,6 @@ impl PhysicalDeviceFeatures {
                 | vk::FormatFeatureFlags::COLOR_ATTACHMENT_BLEND,
         );
         features.set(F::RG11B10UFLOAT_RENDERABLE, rg11b10ufloat_renderable);
-        features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true);
 
         features.set(
             F::BGRA8UNORM_STORAGE,
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index cb3f1add0ee..ec0111c2975 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -799,14 +799,6 @@ bitflags::bitflags! {
         ///
         /// This is a native only feature.
         const VERTEX_ATTRIBUTE_64BIT = 1 << 45;
-        /// Allows vertex shaders to have outputs which are not consumed
-        /// by the fragment shader.
-        ///
-        /// Supported platforms:
-        /// - Vulkan
-        /// - Metal
-        /// - OpenGL
-        const SHADER_UNUSED_VERTEX_OUTPUT = 1 << 46;
         /// Allows for creation of textures of format [`TextureFormat::NV12`]
         ///
         /// Supported platforms: