Fix typos, via a Levenshtein-type corrector (pytorch#31523)
Summary:
Should be non-semantic.

Uses https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines to find likely typos, with https://github.com/bwignall/typochecker to help automate the checking.

Uses an updated version of the tool used in pytorch#30606.
Pull Request resolved: pytorch#31523

Differential Revision: D19216749

Pulled By: mrshenli

fbshipit-source-id: 7fd489cb9a77cd7e4950c1046f925d57524960ea
bwignall authored and facebook-github-bot committed Jan 18, 2020
1 parent c8ca70e commit f326045
Showing 252 changed files with 284 additions and 284 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -270,7 +270,7 @@ if (MSVC)
endif()

# /bigobj increases number of sections in .obj file, which is needed to link
# against libaries in Python 2.7 under Windows
# against libraries in Python 2.7 under Windows
set(${flag_var} "${${flag_var}} /MP /bigobj")
endforeach(flag_var)

2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -10,7 +10,7 @@
/test/test_c10d.py @pietern @mrshenli @zhaojuanmao
/torch/utils/cpp_extension.py @goldsborough @fmassa @soumith @ezyang

# Not there to stricly require the approval, but to be tagged as a reviewer
# Not there to strictly require the approval, but to be tagged as a reviewer
# on the PRs to push them into a high priority inbox.
/torch/csrc/api/data/ @apaszke
/torch/csrc/autograd/ @apaszke
2 changes: 1 addition & 1 deletion aten/src/ATen/CMakeLists.txt
@@ -24,7 +24,7 @@ else()
set(CAFFE2_STATIC_LINK_CUDA_INT 0)
endif()
CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortuantely,
# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortunately,
# this file generates AT_ROCM_ENABLED() which is required by the miopen
# files, which are compiled even if we are doing a vanilla CUDA build.
# Once we properly split CUDA and HIP in ATen, we can remove this code.
2 changes: 1 addition & 1 deletion aten/src/ATen/core/boxing/kernel_lambda.h
@@ -8,7 +8,7 @@ namespace c10 {
namespace detail {
// WrapRuntimeKernelFunctor: Wraps any runtime functor into a functor that
// inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
// This can, for example, be used for lamdas, functors or even function pointers.
// This can, for example, be used for lambdas, functors or even function pointers.
// In the case of function pointers, since it is a runtime function pointer,
// there is an overhead for calling it whenever the kernel is invoked.
template<class FuncType, class ReturnType, class ParameterList> class WrapRuntimeKernelFunctor_ {};
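
The comment fixed in this hunk describes wrapping arbitrary runtime callables (lambdas, functors, function pointers) into a single functor type that derives from a common kernel base. A minimal, self-contained sketch of that pattern (the names below are illustrative stand-ins, not the real c10 types):

#include <utility>

// Illustrative stand-in for c10::OperatorKernel; not the real base class.
struct OperatorKernelBase {};

// Wrap any callable (lambda, functor, function pointer) into a functor type
// that derives from the common kernel base.
template <class Func>
struct RuntimeKernelWrapper final : OperatorKernelBase {
  explicit RuntimeKernelWrapper(Func f) : func_(std::move(f)) {}

  template <class... Args>
  auto operator()(Args&&... args) {
    // When Func is a plain function pointer, this is an indirect call at
    // runtime -- the per-invocation overhead the comment mentions.
    return func_(std::forward<Args>(args)...);
  }

  Func func_;
};

template <class Func>
RuntimeKernelWrapper<Func> wrap_runtime_kernel(Func f) {
  return RuntimeKernelWrapper<Func>(std::move(f));
}

// Usage: works for lambdas, functors, and function pointers alike.
// auto k = wrap_runtime_kernel([](int a, int b) { return a + b; });
// k(2, 3) == 5
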
2 changes: 1 addition & 1 deletion aten/src/ATen/core/function_schema.h
@@ -184,7 +184,7 @@ struct FunctionSchema {
std::vector<Argument> returns_;
// if true then this schema takes an arbitrary number of additional arguments
// after the argument specified in arguments
// currently this is used primarily to represent 'primtive' operators whose
// currently this is used primarily to represent 'primitive' operators whose
// arguments are not checked by schema
bool is_vararg_;
bool is_varret_;
2 changes: 1 addition & 1 deletion aten/src/ATen/core/jit_type.h
@@ -1366,7 +1366,7 @@ struct getTypePtr_<at::optional<T>> final {
} // namespace detail
template <class T>
inline TypePtr getTypePtr() {
// TODO: static_assert that a templated function exists, and throw a friendy
// TODO: static_assert that a templated function exists, and throw a friendly
// error message if not
return detail::getTypePtr_<T>::call();
}
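
The TODO fixed here asks for a static_assert with a friendly message when no getTypePtr_ specialization exists for T. One common way to get such a message, sketched with hypothetical names rather than the ATen trait itself:

#include <type_traits>

// Hypothetical trait standing in for detail::getTypePtr_: specialized only
// for supported types, left undefined otherwise.
template <class T>
struct type_tag_for;

template <> struct type_tag_for<int>    { static const char* call() { return "int"; } };
template <> struct type_tag_for<double> { static const char* call() { return "float64"; } };

// Detection idiom: true only when type_tag_for<T>::call() is well-formed.
template <class T, class = void>
struct has_type_tag : std::false_type {};

template <class T>
struct has_type_tag<T, std::void_t<decltype(type_tag_for<T>::call())>>
    : std::true_type {};

template <class T>
const char* get_type_tag() {
  static_assert(has_type_tag<T>::value,
                "get_type_tag<T>: T is not a supported type; "
                "add a type_tag_for specialization for it.");
  return type_tag_for<T>::call();
}

// get_type_tag<int>() compiles; get_type_tag<char>() now surfaces the
// readable static_assert message above.
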
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_base.h
@@ -84,7 +84,7 @@ struct Vec256 {
// a constexpr variable if we never odr-use it. But it seems that some
// versions GCC/Clang have buggy determinations on whether or not an
// identifier is odr-used or not, and in any case it's hard to tell if
// a variable is odr-used or not. So best to just cut the probem at the root.
// a variable is odr-used or not. So best to just cut the problem at the root.
static constexpr int size() {
return 32 / sizeof(T);
}
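
The comment corrected above is about odr-use: a static constexpr data member can still be odr-used (for example, when bound to a const reference), which before C++17 requires an out-of-line definition, whereas a static constexpr member function never has that problem. A small illustration of the distinction (not the ATen code):

struct WithDataMember {
  static constexpr int size = 8;           // can be odr-used
};

struct WithFunction {
  static constexpr int size() { return 8; }  // never odr-used as an object
};

void take_by_const_ref(const int&) {}

void demo() {
  // take_by_const_ref(WithDataMember::size);  // odr-uses the member; before
  //                                           // C++17 this needs a definition
  //                                           // `constexpr int WithDataMember::size;`
  //                                           // in some .cpp or you get a link error.
  take_by_const_ref(WithFunction::size());     // fine: binds to a temporary
}
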
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDAGenerator.cpp
@@ -94,7 +94,7 @@ uint64_t CUDAGenerator::current_seed() const {
}

/**
* Gets a nondeterminstic random number from /dev/urandom or time,
* Gets a nondeterministic random number from /dev/urandom or time,
* seeds the CPUGenerator with it and then returns that number.
*
* FIXME: You can move this function to Generator.cpp if the algorithm
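
The doc comment describes reading a nondeterministic seed from /dev/urandom and falling back to the time otherwise. A self-contained sketch of that pattern (a sketch of the idea, not the actual ATen helper):

#include <chrono>
#include <cstdint>
#include <fstream>

// Read a nondeterministic 64-bit seed from /dev/urandom, falling back to
// the current time if the device is unavailable.
inline uint64_t nondeterministic_seed() {
  std::ifstream urandom("/dev/urandom", std::ios::in | std::ios::binary);
  uint64_t seed = 0;
  if (urandom && urandom.read(reinterpret_cast<char*>(&seed), sizeof(seed))) {
    return seed;
  }
  return static_cast<uint64_t>(
      std::chrono::high_resolution_clock::now().time_since_epoch().count());
}
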
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
@@ -53,7 +53,7 @@ namespace at { namespace cuda {
// NOTE [ ATen NVRTC Stub and HIP ]
//
// ATen's NVRTC stub library, caffe2_nvrtc, provides dynamic loading of both
// NVRTC and driver APIs. While the former is not yet suppoted for HIP, the
// NVRTC and driver APIs. While the former is not yet supported for HIP, the
// later is supported and needed (e.g., in CUDAHooks::getDeviceWithPrimaryContext()
// used by tensor.pin_memory()).
//
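
The note concerns a stub library that resolves NVRTC and CUDA driver functions at runtime instead of linking them directly, so the main library still loads when those components are absent. On POSIX the usual mechanism looks roughly like this (a generic dlopen/dlsym sketch, not the caffe2_nvrtc implementation; the example symbol and its typedef are illustrative and simplified):

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Resolve a function from a shared library at runtime so the caller has no
// hard link-time dependency on it.
template <class FnPtr>
FnPtr load_symbol(const char* library, const char* name) {
  void* handle = dlopen(library, RTLD_LAZY);
  if (!handle) {
    throw std::runtime_error(std::string("could not load ") + library);
  }
  void* sym = dlsym(handle, name);
  if (!sym) {
    throw std::runtime_error(std::string("missing symbol ") + name);
  }
  return reinterpret_cast<FnPtr>(sym);
}

// e.g. (signature simplified for illustration):
// using nvrtcVersion_t = int (*)(int*, int*);
// auto version = load_symbol<nvrtcVersion_t>("libnvrtc.so", "nvrtcVersion");
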
2 changes: 1 addition & 1 deletion aten/src/ATen/cudnn/Descriptors.h
@@ -76,7 +76,7 @@ class TORCH_CUDA_API Descriptor
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

// Use mut_desc() to access the underlying desciptor pointer
// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// cudnnSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.
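
The fixed comment distinguishes desc(), which may hand back an uninitialized pointer, from mut_desc(), which guarantees initialization before returning. A generic lazy-initialization sketch of that split (illustrative only, not the cuDNN/MIOpen wrapper):

#include <memory>

template <class T>
class LazyDescriptor {
 public:
  // Read-only access: may return nullptr if never initialized.
  T* desc() const { return desc_.get(); }

  // Mutable access: creates the descriptor on first use, so callers can
  // immediately pass it to a Set...Descriptor-style function.
  T* mut_desc() {
    if (!desc_) {
      desc_.reset(new T());
    }
    return desc_.get();
  }

 private:
  std::unique_ptr<T> desc_;
};
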
2 changes: 1 addition & 1 deletion aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
@@ -27,7 +27,7 @@ namespace c10 { namespace hip {
// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
// For example, when you use HIPified PyTorch, you say x.cuda() to
// move a tensor onto ROCm device. We call this situation "HIP
// maquerading as CUDA".
// masquerading as CUDA".
//
// This leads to a very awkward situation when we want to call c10_hip
// code from PyTorch, since c10_hip is expecting things to be called
2 changes: 1 addition & 1 deletion aten/src/ATen/miopen/Descriptors.h
@@ -61,7 +61,7 @@ class Descriptor
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

// Use mut_desc() to access the underlying desciptor pointer
// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// miopenSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -1104,7 +1104,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten
return self_working_copy;
}

// Supports arbitrary batch dimensions for self and LU_data (implicity LU_pivots also)
// Supports arbitrary batch dimensions for self and LU_data (implicitly LU_pivots also)
Tensor lu_solve(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots) {
TORCH_CHECK(self.dim() >= 2,
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
2 changes: 1 addition & 1 deletion aten/src/ATen/native/LossMultiMargin.cpp
@@ -59,7 +59,7 @@ static inline void multi_margin_loss_cpu_kernel(
using accscalar_t = at::acc_type<scalar_t, false>;

// dim() != 0 check is for 1d input which produces a scalar output (that
// cannot be handeld by TensorAccessor)
// cannot be handled by TensorAccessor)
if (reduction == Reduction::None && output.dim() > 0) {
auto output_acc = output.accessor<scalar_t, 1>();
for (int64_t t = 0; t < nframe; t++) {
2 changes: 1 addition & 1 deletion aten/src/ATen/native/RNN.cpp
@@ -295,7 +295,7 @@ static std::vector<QuantizedCellParamsDynamic> gather_quantized_params_dynamic(
}
return result;
#else // USE_FBGEMM
TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN wihtout FBGEMM!")
TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN without FBGEMM!")
#endif // USE_FBGEMM
}

2 changes: 1 addition & 1 deletion aten/src/ATen/native/Sorting.cpp
@@ -276,7 +276,7 @@ std::tuple<Tensor, Tensor> kthvalue(
return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim);
}

// this does not reduce to median with dim beause we don't want to copy twice
// this does not reduce to median with dim because we don't want to copy twice
Tensor median_cpu(const Tensor& self) {
NoNamesGuard guard;
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");
2 changes: 1 addition & 1 deletion aten/src/ATen/native/TensorShape.cpp
@@ -618,7 +618,7 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
self - sparse tensor, its shape is sizes = sparse_shape + dense_shape
indices - 2-D tensor of indices, shape is (sparse_dims, nnz)
values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape
index_select(dim, index) returns a sparse tensor with the follwing data
index_select(dim, index) returns a sparse tensor with the following data
new_sizes = sizes[:dim] + (n,) + sizes[dim+1:]
new_indices - shape is (sparse_dims, new_nnz)
new_values - shape is (new_nnz,) + dense_shape
2 changes: 1 addition & 1 deletion aten/src/ATen/native/Unfold3d.cpp
@@ -85,7 +85,7 @@ static void unfolded3d_copy(
const int64_t input_hw = input_height * input_width;
const int64_t input_dhw = input_hw * input_depth;

// the following variables are updated ouside the most inner loop
// the following variables are updated outside the most inner loop
int64_t d = d_out * dT - pT + i;
int64_t h = h_out * dH - pH + j;
int64_t ofs = nip * input_dhw + d * input_hw + h * input_width;
2 changes: 1 addition & 1 deletion aten/src/ATen/native/UpSample.h
@@ -28,7 +28,7 @@
* are computed from the input and the output size;
*
*
* When the scales are infered from the input and output sizes,
* When the scales are inferred from the input and output sizes,
* we view each pixel as an area, idx + 0.5 as its center index.
* Here is an example formula in 1D case.
* if align_corners: center of two corner pixel areas are preserved,
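
The hunk is cut off right after introducing the 1D formula; for reference, the usual source-index mapping that this area/center-index view leads to is (written here as an illustration, not quoted from the header):

\[
\text{src} =
\begin{cases}
\text{dst} \cdot \dfrac{\text{input\_size} - 1}{\text{output\_size} - 1}, & \text{align\_corners} \\
(\text{dst} + 0.5) \cdot \dfrac{\text{input\_size}}{\text{output\_size}} - 0.5, & \text{otherwise}
\end{cases}
\]

With align_corners the centers of the two corner pixels map exactly (dst = 0 to src = 0, dst = output_size - 1 to src = input_size - 1); otherwise pixel areas rather than centers are aligned, which is where the idx + 0.5 center convention in the comment comes in.
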
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cpu/DistanceOpsKernel.cpp
@@ -26,7 +26,7 @@ struct Dist {
// map : This tells how to modify (a - b) to form the component that
// gets summed.
// red : This tells how to sum the result of map up. This is
// separate because the inf norm actuall uses max instead of
// separate because the inf norm actually uses max instead of
// sum.
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
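
The comment decomposes the distance into map (transform each a - b component), red (reduce, where the inf norm uses max instead of sum), and finish (e.g. raise to 1/p). A scalar sketch of that decomposition for the p-norm between two vectors (illustrative, not the vectorized kernel):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// p-norm distance between two equal-length vectors, written in the
// map / red / finish style described above.
double pnorm_distance(const std::vector<double>& a,
                      const std::vector<double>& b,
                      double p) {
  const bool inf_norm = std::isinf(p);
  double agg = 0.0;
  for (std::size_t i = 0; i < a.size(); ++i) {
    const double diff = std::abs(a[i] - b[i]);
    const double mapped = inf_norm ? diff : std::pow(diff, p);  // map
    agg = inf_norm ? std::max(agg, mapped) : agg + mapped;      // red
  }
  return inf_norm ? agg : std::pow(agg, 1.0 / p);               // finish
}
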
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cpu/GridSamplerKernel.cpp
@@ -158,7 +158,7 @@ namespace at { namespace native { namespace {
* `apply_fn` will be called multiple times, and together cover the entire
* output spatial space.
*
* Now you should be able tp understand everything about the implementaion of
* Now you should be able tp understand everything about the implementation of
* 2D forward kernel shown at the beginning of this note.
*
**/
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Copy.cu
@@ -117,7 +117,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
Device dst_device = iter.device(0);
Device src_device = iter.device(1);

// Enable p2p access between devices. (No-op if it invovles the CPU)
// Enable p2p access between devices. (No-op if it involves the CPU)
bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device);

if (copy_requires_temporaries(iter, p2p_enabled)) {
8 changes: 4 additions & 4 deletions aten/src/ATen/native/cuda/GridSampler.cu
@@ -364,7 +364,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
@@ -383,7 +383,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = static_cast<scalar_t>(0);
@@ -569,7 +569,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = gix_mult * gix;
@@ -591,7 +591,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = static_cast<scalar_t>(0);
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Indexing.cu
@@ -108,7 +108,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size,
}

static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
// computes the stride as if tensor were contigous
// computes the stride as if tensor were contiguous
auto sizes = tensor.sizes();
std::vector<int64_t> stride(tensor.dim());
stride[tensor.dim() - 1] = 1;
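
computeLinearStride treats the tensor as if it were contiguous: the last dimension gets stride 1 and each earlier stride is the product of all later sizes. A standalone sketch of that computation (not the CUDA helper itself):

#include <cstdint>
#include <vector>

// Row-major (contiguous) strides for a given shape.
std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> stride(sizes.size());
  int64_t running = 1;
  for (int64_t d = static_cast<int64_t>(sizes.size()) - 1; d >= 0; --d) {
    stride[d] = running;
    running *= sizes[d];
  }
  return stride;
}

// e.g. sizes {2, 3, 4} -> strides {12, 4, 1}
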
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Loops.cuh
@@ -7,7 +7,7 @@
//
// The gpu_kernel_with_scalars generates specializations that support a
// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar
// is lifted to a kernel paramter instead of copying to device memory.
// is lifted to a kernel parameter instead of copying to device memory.
// This should be used in conjunction with TensorIterator::allow_cpu_scalars_,
// which is the default for TensorIterator::binary_op. Otherwise, all inputs
// and the output must be on the GPU.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/PersistentSoftmax.cuh
@@ -51,7 +51,7 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) {
// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp.
// This is important because it means only __shfl_ instructions are required for reductions.
// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/SoftMax.cu
@@ -200,7 +200,7 @@ __global__ void cunn_SpatialSoftMaxForward(
for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) {
const uint32_t data_offset = outer_offset + inner_index;
////////////////////////////////////////////////////////////
// These two blocks are really eqivalent, but specializing on
// These two blocks are really equivalent, but specializing on
// blockDim.x == 1 makes the kernel faster when it's unused.
// I didn't want to thread an extra template parameter, and nvcc
// seems to be smart enough to hoist the if outside of the loops.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/SortingKthValue.cu
@@ -177,7 +177,7 @@ void kthvalue_cuda_template(
AT_CUDA_CHECK(cudaGetLastError());
}

// this does not reduce to median with dim beause we don't want to copy twice
// this does not reduce to median with dim because we don't want to copy twice
template <typename scalar_t>
Tensor median_cuda_template(const Tensor& self) {
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");
4 changes: 2 additions & 2 deletions aten/src/ATen/native/cuda/TensorFactories.cu
@@ -211,7 +211,7 @@ inline int64_t resolve_root_int(
// (row + 2f - 1)row <= 2x
// row^2 + (2f-1)row - 2x <= 0. [3]
//
// Based on ineuqality [3], we have the following coefficients for formula of
// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = 2f - 1
@@ -254,7 +254,7 @@ inline void get_coordinate_in_tril_trapezoid(
// (-row + 2f + 1)row <= 2x
// row^2 - (2f+1)row + 2x >= 0. [3]
//
// Based on ineuqality [3], we have the following coefficients for formula of
// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = -1 - 2f
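
Both hunks reduce the search for the row index to a quadratic inequality and pass the coefficients to resolve_root_int. Spelling out the first one with the quadratic formula (a = 1, b = 2f - 1, c = -2x), as a reference rather than a quote of the kernel:

\[
\text{row} \le \frac{-(2f - 1) + \sqrt{(2f - 1)^2 + 8x}}{2}
\]

so the row is the floor of the right-hand side (the non-negative root). The second hunk sets up the analogous inequality with b = -(2f + 1) and the sign of the 2x term flipped.
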
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/UpSample.cuh
@@ -213,7 +213,7 @@ __device__ __forceinline__ static void upsample_increment_value_bounded(
accscalar_t value) {
int access_y = max(min(y, height - 1), 0);
int access_x = max(min(x, width - 1), 0);
/* TODO: result here is trucated to scalar_t,
/* TODO: result here is truncated to scalar_t,
check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912
*/
gpuAtomicAdd(
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cudnn/RNN.cpp
@@ -1119,7 +1119,7 @@ std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor&
struct DropoutState {
// Both buffer and event are lazily instantiated when a dropout state is needed
// for the first time. Note that in this case needed != used, as we don't need
// a bufer to e.g. run RNNs in test mode.
// a buffer to e.g. run RNNs in test mode.
at::Tensor buffer;
c10::optional<cuda::CUDAEvent> event;
std::mutex mutex;
4 changes: 2 additions & 2 deletions aten/src/ATen/native/mkl/SpectralOps.cpp
@@ -99,15 +99,15 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
// 1. if this dim idx becomes 1, will need to add (size - 1) * stride
// 2. otherwise, will need to subtract stride
if (from_slice_indices[d] == 0) {
// Substract. Carries over to previous dimension
// Subtract. Carries over to previous dimension
from_slice_data -= output.stride(d);
} else if (from_slice_indices[d] == 1) {
// Dimension index becomes 1
// Doesn't carry over to previous dimension
from_slice_data += (output.size(d) - 1) * output.stride(d);
break;
} else {
// Substract. Doesn't carry over to previous dimension
// Subtract. Doesn't carry over to previous dimension
from_slice_data -= output.stride(d);
break;
}
2 changes: 1 addition & 1 deletion aten/src/ATen/native/mkldnn/Conv.cpp
@@ -43,7 +43,7 @@ using namespace mkldnn;

namespace {
// Helper function for getting an ideep tensor out of an aten Tensor.
// Note in case the aten Tensor is a dense tensor, the retured ideep
// Note in case the aten Tensor is a dense tensor, the returned ideep
// tensor is just a view of the storage of the aten dense tensor, so
// caller needs to make sure the aten dense tensor's lifetime is
// longer than the ideep tensor.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp
@@ -23,7 +23,7 @@ inline int start_index(int out_idx, int out_len, int in_len) {
* in_len: the dimension_size of input matrix
* Basically, in_len / out_len gives the number of
* elements in each average computation.
* This functin computes the start index on input matrix.
* This function computes the start index on input matrix.
*/
return (int)std::floor((float)(out_idx * in_len) / out_len);
}
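
The corrected comment documents start_index, which maps an output position to the first input element of its averaging window using floor(out_idx * in_len / out_len). A tiny self-contained check of the formula shown above (the driver around it is illustrative):

#include <cmath>
#include <cstdio>

int start_index(int out_idx, int out_len, int in_len) {
  return (int)std::floor((float)(out_idx * in_len) / out_len);
}

int main() {
  // in_len = 10 input elements pooled down to out_len = 4 outputs:
  // windows start at 0, 2, 5, 7.
  for (int i = 0; i < 4; ++i) {
    std::printf("%d ", start_index(i, /*out_len=*/4, /*in_len=*/10));
  }
  std::printf("\n");
  return 0;
}
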
2 changes: 1 addition & 1 deletion aten/src/ATen/native/quantized/cpu/qclamp.cpp
@@ -23,7 +23,7 @@ Tensor quantized_clamp_impl(
qclamp_stub(qx.device().type(), qx, *min, *max, qy);
} else {
TORCH_CHECK(
false, "Both min and max should be specifed for quantized clamp!");
false, "Both min and max should be specified for quantized clamp!");
}
return qy;
}
2 changes: 1 addition & 1 deletion aten/src/ATen/native/quantized/cpu/qmul.cpp
@@ -15,7 +15,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine,
"Only per tensor quantization is supported in Mul.");
TORCH_CHECK(qa.qscheme() == qb.qscheme(),
"Both inputs to Mul must have the same quantization shceme.");
"Both inputs to Mul must have the same quantization scheme.");
TORCH_CHECK(qa.numel() == qb.numel(),
"Mul operands must be the same size!");
TORCH_CHECK(qa.scalar_type() == qb.scalar_type(),
@@ -63,7 +63,7 @@ void pytorch_qnnp_requantize_fp32__neon(

#ifdef __aarch64__
/*
* Leverage "Floating-point Convert to Signed integer, rouding to nearest
* Leverage "Floating-point Convert to Signed integer, rounding to nearest
* with ties to even" instruction. This is an ARMv8 instruction (always
* available in AArch64), which saturates result on overflow. We don't need
* to specifically consider saturated results, they will be clamped at the
@@ -46,7 +46,7 @@ void pytorch_qnnp_requantize_fp32__psimd(
* - Large int32_t values can't be exactly represented as FP32. We expect
* that conversion instruction would round it to nearest FP32 value with
* ties to even, but Clang documentation for __builtin_convertvector does
* not guaratee that.
* not guarantee that.
* - Product of two FP32 values is generally not exactly representation as
* an FP32 value, and will be rounded to nearest FP32 value with ties to
* even.